# LAB 5: Text classification with Naive Bayes

Objectives:

* Train and evaluate Naive Bayes text classifiers
* Cross-validation
* Hyperparameter search

In [None]:
import numpy as np
import pandas as pd
from cytoolz import *
from tqdm.auto import tqdm

# Enable tqdm's pandas integration so `progress_apply` (used below) is available.
tqdm.pandas()

### Load and preprocess data

In [None]:
# Read the dataset from a parquet file on S3; `anon=True` requests anonymous
# (credential-free) access. Later cells read the "text" and "pol" columns.
df = pd.read_parquet(
    "s3://ling583/rcv1-politics.parquet", storage_options={"anon": True}
)

In [None]:
import spacy

# Load the small English spaCy pipeline with the listed components excluded;
# the only part used in this notebook is `nlp.tokenizer` (see `tokenize`).
nlp = spacy.load(
    "en_core_web_sm",
    exclude=["tagger", "parser", "ner", "lemmatizer", "attribute_ruler"],
)

In [None]:
def tokenize(text):
    """Return the normalized forms of the purely alphabetic tokens in *text*.

    The text is split with the spaCy tokenizer only (no other pipeline
    components are run); tokens for which ``is_alpha`` is false are dropped.
    """
    alpha_tokens = (tok for tok in nlp.tokenizer(text) if tok.is_alpha)
    return [tok.norm_ for tok in alpha_tokens]

In [None]:
# Tokenize every document (with a progress bar) and keep the token lists
# in a new "tokens" column.
df["tokens"] = df["text"].progress_apply(tokenize)

---

### Baseline dummy classifier

In [None]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import StratifiedKFold, cross_val_predict, cross_val_score

Set up five-fold cross-validation. We'll use the same training/test splits for all our experiments so the results will be easier to compare.

In [None]:
# Shuffled, stratified five-fold splitter; the fixed `random_state` makes the
# folds reproducible so every experiment that receives `cv` sees the same splits.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=5432)

In [None]:
# Baseline: a DummyClassifier with default settings, scored once per fold.
# NOTE(review): no `scoring` is passed, so the estimator's default scorer is
# used (presumably accuracy), while the searches below score with F1 — confirm
# this difference is intended.
baseline = DummyClassifier()
cross_val_score(baseline, df["tokens"], df["pol"], cv=cv)

In [None]:
# Collect the baseline's cross-validated predictions on the shared folds,
# then print the classification report against the true labels.
predicted = cross_val_predict(baseline, df["tokens"], df["pol"], cv=cv)
print(classification_report(df["pol"], predicted))

----

### Bernoulli Naive Bayes

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import make_pipeline

Set up a pipeline: first convert the tokenized text into feature vectors, then apply a Naive Bayes classifier.

In [None]:
# The documents are already tokenized, so the vectorizer's analyzer is the
# identity function (presumably provided by the `cytoolz` star import): each
# token list is used as-is as the document's features.
bnb = make_pipeline(CountVectorizer(analyzer=identity), BernoulliNB())

In [None]:
# Cross-validated predictions for the Bernoulli NB pipeline on the shared
# folds (`n_jobs=-1` runs the folds in parallel), followed by the report.
predicted = cross_val_predict(bnb, df["tokens"], df["pol"], cv=cv, n_jobs=-1)
print(classification_report(df["pol"], predicted))

---

### Hyperparameter search

In [None]:
from dask.distributed import Client

# Connect to an already-running Dask scheduler at the given address.
# NOTE(review): the address is hard-coded; the port presumably changes between
# sessions, so update it to match the current scheduler before running.
client = Client("tcp://127.0.0.1:34795")
# Last expression of the cell: displays the client summary in the notebook.
client

In [None]:
from dask_ml.model_selection import RandomizedSearchCV
from scipy.stats.distributions import loguniform, randint, uniform

In [None]:
from warnings import simplefilter
# Suppress FutureWarning messages from here on to keep the search output readable.
simplefilter(action="ignore", category=FutureWarning)

In [None]:
# Fresh, unfitted pipeline for the hyperparameter searches (same definition
# as the pipeline built earlier in the notebook).
bnb = make_pipeline(CountVectorizer(analyzer=identity), BernoulliNB())

In [None]:
%%time

# Randomized search over the smoothing parameter alpha, sampled log-uniformly
# across a wide range and scored with F1. The shared `cv` splitter is passed
# explicitly so the search is evaluated on the same folds as the other
# experiments in this notebook (otherwise the search falls back to its own
# default splitting and the scores are not directly comparable).
search = RandomizedSearchCV(
    bnb,
    {"bernoullinb__alpha": loguniform(1e-10, 10.0)},
    n_iter=25,
    scoring="f1",
    cv=cv,
)
search.fit(df["tokens"], df["pol"])

In [None]:
# Hyperparameter setting with the best mean cross-validated score.
search.best_params_

In [None]:
# Mean cross-validated F1 score achieved by that best setting.
search.best_score_

In [None]:
# Tabulate the per-candidate search results for plotting.
cv_results = pd.DataFrame(search.cv_results_)

In [None]:
# Mean test score as a function of alpha; log-scaled x axis because alpha
# was sampled log-uniformly.
cv_results.plot(
    "param_bernoullinb__alpha", "mean_test_score", kind="scatter", logx=True
)

In [None]:
%%time

# Second search: narrow alpha to the small-value range and additionally sample
# the vectorizer's `min_df` as an integer from randint(1, 5). The shared `cv`
# splitter is passed explicitly so the search is evaluated on the same folds
# as the other experiments in this notebook.
search = RandomizedSearchCV(
    bnb,
    {
        "countvectorizer__min_df": randint(1, 5),
        "bernoullinb__alpha": loguniform(1e-10, 1e-5),
    },
    n_iter=25,
    scoring="f1",
    cv=cv,
)
search.fit(df["tokens"], df["pol"])

In [None]:
# Best setting and its mean cross-validated score for the second search.
search.best_params_, search.best_score_

In [None]:
# Tabulate the second search's per-candidate results for plotting.
cv_results = pd.DataFrame(search.cv_results_)

In [None]:
# Mean test score against the sampled `min_df` values.
cv_results.plot("param_countvectorizer__min_df", "mean_test_score", kind="scatter")

In [None]:
# Mean test score against alpha (log-scaled x axis) for the second search.
cv_results.plot(
    "param_bernoullinb__alpha", "mean_test_score", kind="scatter", logx=True
)

In [None]:
# Same alpha-vs-score scatter, with each point colored by its `min_df` value
# so the interaction between the two hyperparameters is visible.
cv_results.plot(
    "param_bernoullinb__alpha",
    "mean_test_score",
    kind="scatter",
    logx=True,
    c="param_countvectorizer__min_df",
    colormap="Set1",
)

In [None]:
%%time

# Third search: hold `min_df` at 1 and sample `max_df` together with alpha.
# scipy's uniform(loc, scale) draws from [loc, loc + scale], so
# uniform(0.5, 0.5) samples `max_df` between 0.5 and 1.0. The shared `cv`
# splitter is passed explicitly so the search is evaluated on the same folds
# as the other experiments in this notebook.
search = RandomizedSearchCV(
    bnb,
    {
        "countvectorizer__min_df": [1],
        "countvectorizer__max_df": uniform(0.5, 0.5),
        "bernoullinb__alpha": loguniform(1e-10, 1e-5),
    },
    n_iter=25,
    scoring="f1",
    cv=cv,
)
search.fit(df["tokens"], df["pol"])

In [None]:
# Best setting and its mean cross-validated score for the third search.
search.best_params_, search.best_score_

In [None]:
# Tabulate the third search's results and plot mean test score against `max_df`.
cv_results = pd.DataFrame(search.cv_results_)
cv_results.plot("param_countvectorizer__max_df", "mean_test_score", kind="scatter")

---

### Evaluate final model

In [None]:
# Fix the hyperparameters of the final model.
# NOTE(review): these values are hard-coded, presumably chosen from an earlier
# run of the searches above — re-check against `search.best_params_` if the
# searches are re-run.
bnb.set_params(
    bernoullinb__alpha=1e-10, countvectorizer__min_df=1, countvectorizer__max_df=0.73
)

In [None]:
# Evaluate the tuned pipeline with cross-validated predictions on the shared
# folds and print the classification report.
predicted = cross_val_predict(bnb, df["tokens"], df["pol"], cv=cv, n_jobs=-1)
print(classification_report(df["pol"], predicted))