# LAB 7: Error analysis

Objectives
* Construct a  linear text classifier using SGDClassifier
* Evaluate its performance and categorize the errors that it makes
* Eaxmine model's coefficients and decision function values
* Interpret model results using LIME

In [1]:
import numpy as np
import pandas as pd
from cytoolz import *
from tqdm.auto import tqdm

tqdm.pandas()

---

## Load data

In [2]:
train = pd.read_parquet(
    "s3://ling583/lab7-train.parquet", storage_options={"anon": True}
)
test = pd.read_parquet("s3://ling583/lab7-test.parquet", storage_options={"anon": True})

In [3]:
import spacy

nlp = spacy.load(
    "en_core_web_sm",
    exclude=["tagger", "parser", "ner", "lemmatizer", "attribute_ruler"],
)


def tokenize(text):
    doc = nlp.tokenizer(text)
    return [t.norm_ for t in doc if not (t.is_space or t.is_punct or t.like_num)]

In [4]:
import multiprocessing as mp

In [5]:
with mp.Pool() as p:
    train["tokens"] = pd.Series(p.imap(tokenize, tqdm(train["text"]), chunksize=100))
    test["tokens"] = pd.Series(p.imap(tokenize, tqdm(test["text"]), chunksize=100))

  0%|          | 0/19054 [00:00<?, ?it/s]

  0%|          | 0/4764 [00:00<?, ?it/s]

The labels are: GPOL = domestic politics, GSPO = sports, GVIO = war/civil war, GJOB = labor issues

In [6]:
train["topics"].value_counts()
#GPOL Politics    
#GSPO Sports
#GVIO Violence - War/ Civil war
#GJOB labor issues

GPOL    7410
GSPO    5639
GVIO    3712
GJOB    2293
Name: topics, dtype: int64

---

## Baseline classifier

In [7]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, f1_score
from sklearn.pipeline import make_pipeline

In [8]:
baseline = make_pipeline(CountVectorizer(analyzer=identity), SGDClassifier())
baseline.fit(train["tokens"], train["topics"])
base_predicted = baseline.predict(test["tokens"])
print(classification_report(test["topics"], base_predicted))

              precision    recall  f1-score   support

        GJOB       0.94      0.94      0.94       573
        GPOL       0.95      0.93      0.94      1853
        GSPO       0.99      0.99      0.99      1410
        GVIO       0.88      0.91      0.89       928

    accuracy                           0.95      4764
   macro avg       0.94      0.94      0.94      4764
weighted avg       0.95      0.95      0.95      4764



----

## Hyperparameter search

Find an optimal set of hyperparameters for a Tfidf+SGDClassifier model

In [9]:
import mlflow
from dask_ml.model_selection import RandomizedSearchCV
from logger import log_search
from scipy.stats.distributions import loguniform, randint, uniform

In [10]:
from warnings import simplefilter

simplefilter(action="ignore", category=FutureWarning)

In [12]:
from dask.distributed import Client

client = Client("tcp://127.0.0.1:46331")
client

0,1
Client  Scheduler: tcp://127.0.0.1:46331  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 16.62 GB


In [16]:
mlflow.set_experiment("lab-7")
sgd = make_pipeline(
    CountVectorizer(analyzer=identity), TfidfTransformer(), SGDClassifier()
)
# Skeleton classifier

In [17]:
%%time

search = RandomizedSearchCV(
    sgd,
    {
        "countvectorizer__min_df": randint(1, 20),
        "countvectorizer__max_df": uniform(0.5, 0.5),
        "tfidftransformer__use_idf": [True, False],
        "sgdclassifier__alpha": loguniform(1e-6, 1e-2),
    },
    n_iter=50,
    scoring="f1_macro",
)
search.fit(train["tokens"], train["topics"])
log_search(search)

CPU times: user 10 s, sys: 1.39 s, total: 11.4 s
Wall time: 3min 33s


---

## Compare optimized model to baseline

In [18]:
sgd = make_pipeline(
    CountVectorizer(analyzer=identity, min_df=5, max_df=0.75),
    TfidfTransformer(use_idf=True),
    SGDClassifier(alpha=1e-4),
)
sgd.fit(train["tokens"], train["topics"])
predicted = sgd.predict(test["tokens"])
print(classification_report(test["topics"], predicted))

              precision    recall  f1-score   support

        GJOB       0.98      0.93      0.95       573
        GPOL       0.94      0.97      0.95      1853
        GSPO       1.00      1.00      1.00      1410
        GVIO       0.93      0.90      0.91       928

    accuracy                           0.96      4764
   macro avg       0.96      0.95      0.95      4764
weighted avg       0.96      0.96      0.96      4764



In [19]:
base_f1 = f1_score(test["topics"], base_predicted, average="macro")
sgd_f1 = f1_score(test["topics"], predicted, average="macro")

In [27]:
print(f"Base F1 score: {base_f1}")
print(f"SGD F1 score:  {sgd_f1}")
print(f"Difference:    {sgd_f1 - base_f1}") 

Base F1 score: 0.9426472666025926
SGD F1 score:  0.954526192005073
Difference:    0.011878925402480478


In [21]:
(sgd_f1 - base_f1) / (1 - base_f1)
# Percentage error reduction; how much we imroved over the base.

0.20712047532537398

In [28]:
from scipy.stats import binom_test, wilcoxon

In [33]:
# Predicted is the SGD prediction
# test["topics"] is the right answer
# if they are equal, the value is true, if they are not, then it is false
diff = (predicted == test["topics"]).astype(int) - (base_predicted == test["topics"]).astype(int)
# if both base and SGD have the same answer, thehn we get 0
# If baseline was wrong (0) and SGD was right(1) we get 1
# If baseline was right (1) and SGD was wrong (0) we get -1

print(f"SGD and baseline agreed {sum(diff == 0)} times")
print(f"SGD was right, and baseline was wrong {sum(diff == 1)} times")
print(f"Baseline was right, and SGD was wrong {sum(diff == -1)} times")

SGD and baseline agreed 4610 times
SGD was right, and baseline was wrong 105 times
Baseline was right, and SGD was wrong 49 times


In [35]:
# for those that were classified differently by the two classifiers, they theoretically have a 50/50 
# chance to get into either classifier. We run the binomial test to see if the distribution of these
# choices matches with that assumption.

binom_test([sum(diff == 1), sum(diff == -1)], alternative="greater")

# the result, approximately 0.000000375 is much lower that the standard 0.05 alpha for the test
# this just means that in the case of a true 50/50 chance scenario, the probability of achieving the same outcome as above is 
# incredibly small. This would indicate that the SGD classifier actually is better than the baseline.

3.750695876493649e-06

In [36]:
# similar to the binomial test above.
# is only really applicable when you only care about the sign, plus or minus
wilcoxon(diff, alternative="greater")

WilcoxonResult(statistic=8137.5, pvalue=3.2017563999193402e-06)

**TO DO:** Summarize your results: how much better is the optimized model? Is it significantly better than the baseline?

-----

## Save model

In [37]:
import cloudpickle

In [38]:
# In this version we change the preprocessor portion and add a tokenizer
sgd = make_pipeline(
    CountVectorizer(preprocessor=identity, tokenizer=tokenize, min_df=5, max_df=0.75),
    TfidfTransformer(use_idf=True),
    SGDClassifier(alpha=1e-4),
)
sgd.fit(train["text"], train["topics"])
predicted = sgd.predict(test["text"])
print(classification_report(test["topics"], predicted))

              precision    recall  f1-score   support

        GJOB       0.97      0.94      0.96       573
        GPOL       0.94      0.96      0.95      1853
        GSPO       1.00      0.99      1.00      1410
        GVIO       0.92      0.91      0.92       928

    accuracy                           0.96      4764
   macro avg       0.96      0.95      0.96      4764
weighted avg       0.96      0.96      0.96      4764



In [40]:
# The built in pickle function does not work with these complicated structures so we use cloudpickle
cloudpickle.dump(sgd, open("sgd.model", "wb"))