# LAB 6: Text classification with linear models

Objectives:

* Train and evaluate linear text classifiers using SGDClassifier
* Experiment with different feature extraction and training methods
* Log and evaluate experimental results using [mlflow](https://mlflow.org)

In [27]:
import numpy as np
import pandas as pd
from cytoolz import *
from tqdm.auto import tqdm

# Register tqdm's pandas integration (the `progress_*` methods on pandas objects).
tqdm.pandas()

### Load and preprocess data

In [28]:
# Read the training split first, then the test split, from the same bucket.
# Both reads use anonymous (unsigned) S3 access.
train, test = (
    pd.read_parquet(split_path, storage_options={"anon": True})
    for split_path in (
        "s3://ling583/rcv1-topics-train.parquet",
        "s3://ling583/rcv1-topics-test.parquet",
    )
)

In [29]:
# Peek at the first rows of the training split (raw text plus its topic label).
train.head()

Unnamed: 0,text,topics
0,NZ bonds close well bid ahead of key U.S. data...,MCAT
1,Asia Product Swaps - Jet/gas oil regrade at di...,MCAT
2,U.S. public schools get a C report card in qua...,GCAT
3,Thunder Bay vessel clearances - May 12. Daily ...,MCAT
4,"Amoco gains shares in Ula,Gyda N.Sea fields. A...",CCAT


Each document's `topics` value is one of four top-level category codes:

CCAT : CORPORATE/INDUSTRIAL  
ECAT : ECONOMICS  
GCAT : GOVERNMENT/SOCIAL  
MCAT : MARKETS

In [30]:
# Count training documents per topic label to see how balanced the classes are.
train["topics"].value_counts()

CCAT    5896
MCAT    3281
GCAT    3225
ECAT    1073
Name: topics, dtype: int64

In [31]:
import spacy

# Load the English pipeline without the listed components: tokenize() below
# only calls nlp.tokenizer, so none of the excluded components are used.
nlp = spacy.load(
    "en_core_web_sm",
    exclude=["tagger", "parser", "ner", "lemmatizer", "attribute_ruler"],
)


def tokenize(text):
    """Split ``text`` with the spaCy tokenizer and keep only alphabetic tokens.

    Returns a list with the ``norm_`` string of each token whose ``is_alpha``
    flag is set, in document order; all other tokens are dropped.
    """
    kept = []
    for token in nlp.tokenizer(text):
        if token.is_alpha:
            kept.append(token.norm_)
    return kept

In [32]:
import multiprocessing as mp

In [33]:
# Tokenize both splits in parallel worker processes, with a progress bar each.
# Pool.imap yields results in input order, so the token lists are materialized
# into a plain list and assigned positionally. Wrapping them in a fresh
# pd.Series would instead align on index labels (0..n-1) and silently produce
# NaN or misplaced rows if a frame's index were anything else.
with mp.Pool() as p:
    train["tokens"] = list(p.imap(tokenize, tqdm(train["text"]), chunksize=100))
    test["tokens"] = list(p.imap(tokenize, tqdm(test["text"]), chunksize=100))

  0%|          | 0/13475 [00:00<?, ?it/s]

  0%|          | 0/3369 [00:00<?, ?it/s]

In [34]:
# Check that the new `tokens` column holds a token list for each document.
train.head()

Unnamed: 0,text,topics,tokens
0,NZ bonds close well bid ahead of key U.S. data...,MCAT,"[nz, bonds, close, well, bid, ahead, of, key, ..."
1,Asia Product Swaps - Jet/gas oil regrade at di...,MCAT,"[asia, product, swaps, jet, gas, oil, regrade,..."
2,U.S. public schools get a C report card in qua...,GCAT,"[public, schools, get, a, c, report, card, in,..."
3,Thunder Bay vessel clearances - May 12. Daily ...,MCAT,"[thunder, bay, vessel, clearances, may, daily,..."
4,"Amoco gains shares in Ula,Gyda N.Sea fields. A...",CCAT,"[amoco, gains, shares, in, ula, gyda, fields, ..."


---

### SGDClassifier

In [35]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import make_pipeline

In [36]:
# Baseline: token counts fed to a linear classifier trained with SGD.
# The documents are already token lists, so `identity` is passed as the
# analyzer to make CountVectorizer count those tokens as-is.
# A fixed random_state makes SGD's data shuffling, and therefore the reported
# scores, repeatable from run to run.
sgd = make_pipeline(
    CountVectorizer(analyzer=identity),
    SGDClassifier(random_state=42),
)
sgd.fit(train["tokens"], train["topics"])
predicted = sgd.predict(test["tokens"])
print(classification_report(test["topics"], predicted))

              precision    recall  f1-score   support

        CCAT       0.97      0.95      0.96      1475
        ECAT       0.92      0.85      0.88       268
        GCAT       0.95      0.98      0.97       806
        MCAT       0.93      0.96      0.95       820

    accuracy                           0.95      3369
   macro avg       0.94      0.94      0.94      3369
weighted avg       0.95      0.95      0.95      3369



In [37]:
import logger
import mlflow
from logger import log_search, log_test

In [38]:
# Make "lab-6" the active MLflow experiment for the runs logged from here on.
mlflow.set_experiment("lab-6")
# log_test comes from the local `logger` module; presumably it records the
# pipeline's settings and its test-set scores as an MLflow run — confirm there.
log_test(sgd, test["topics"], predicted)

---

In [39]:
from dask.distributed import Client

# Connect to an already-running Dask scheduler on this machine.
# NOTE(review): the address (port 36825) is hard-coded and looks specific to
# one session — confirm a scheduler is listening there before running this cell.
client = Client("tcp://127.0.0.1:36825")
# Display the client summary (scheduler, dashboard link, workers).
client

0,1
Client  Scheduler: tcp://127.0.0.1:36825  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 16.62 GB


In [40]:
from dask_ml.model_selection import RandomizedSearchCV
from scipy.stats.distributions import loguniform, randint, uniform

In [41]:
from warnings import simplefilter

# Silence FutureWarning messages from here on so they don't flood cell output.
simplefilter(action="ignore", category=FutureWarning)

In [47]:
%%time
# I'm not sure what I'm doing wrong, the commented out values match to parameters of 
# SGDClassifier but it doesn't recognize them and throws an error.
search = RandomizedSearchCV(
    sgd,
    {        
        "countvectorizer__min_df": randint(1, 10),
        "countvectorizer__max_df": uniform(0.5, 0.5),
        #'average': [True, False],
        #'l1_ratio': uniform(0, 1),
        #'alpha': loguniform(1e-4, 1e0)
    },
    n_iter=25,
    scoring="f1_macro",
)
search.fit(train["tokens"], train["topics"])
log_search(search)

CPU times: user 6.38 s, sys: 354 ms, total: 6.73 s
Wall time: 1min 12s


----

### Optimized model

In [50]:
# Refit with tuned vectorizer settings and score on the held-out test split.
# min_df=1 and max_df=0.94 are presumably the best values reported by the
# search above — confirm against the logged results.
# A fixed random_state keeps SGD's shuffling repeatable, so differences from
# the baseline report reflect the settings rather than run-to-run noise.
sgd = make_pipeline(
    CountVectorizer(analyzer=identity, min_df=1, max_df=0.94),
    SGDClassifier(random_state=42),
)
sgd.fit(train["tokens"], train["topics"])
predicted = sgd.predict(test["tokens"])
print(classification_report(test["topics"], predicted))

              precision    recall  f1-score   support

        CCAT       0.97      0.96      0.97      1475
        ECAT       0.86      0.90      0.88       268
        GCAT       0.96      0.98      0.97       806
        MCAT       0.96      0.94      0.95       820

    accuracy                           0.96      3369
   macro avg       0.94      0.94      0.94      3369
weighted avg       0.96      0.96      0.96      3369

