# Sentiment analysis
Construct a text classifier that predicts sentiment labels as if they were topics

In [1]:
import numpy as np
import pandas as pd
from cytoolz import *
from tqdm.auto import tqdm

tqdm.pandas()

In [2]:
# Read the review dataset from S3 as a parquet file. `anon: True` is forwarded as a
# storage option (anonymous access; presumably the bucket is public -- verify).
df = pd.read_parquet('s3://ling583/sentiment.parquet', storage_options={'anon': True})

-----

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
# Hold out 20% of the rows for testing, stratified on the sentiment label so both
# splits keep the same class balance; the seed makes the split repeatable.
train1, test1 = train_test_split(
    df,
    test_size=0.2,
    random_state=619,
    stratify=df["sentiment"],
)

In [5]:
# Work on independent copies of each split, renumbered 0..n-1, so that columns
# built positionally later in the notebook line up with the rows.
train = train1.copy().reset_index(drop=True)
test = test1.copy().reset_index(drop=True)
train

Unnamed: 0,title,text,date_stayed,date,service,cleanliness,overall,value,location,sleep_quality,rooms,locality,name,sentiment
0,“The Place to Stay in Manhattan! Simply Superb!”,I have been to New york before and stayed in t...,December 2008,2009-06-18,4.0,5.0,5.0,5.0,5.0,,5.0,New York City,Wingate by Wyndham Manhattan Midtown,good
1,"“Howard Johnson, Phoenix”",The hotel is located one mile from Sky Harbor ...,January 2012,2012-01-17,4.0,3.0,3.0,3.0,5.0,2.0,3.0,Phoenix,Howard Johnson Phoenix Airport/Downtown Area,bad
2,“Pier 5 hotel was unlike any other hotel we ha...,Most hotels and hotel rooms look the same but ...,September 2009,2009-10-09,4.0,5.0,4.0,4.0,5.0,,5.0,Baltimore,Pier 5 Hotel,good
3,"“New York's best kept secret...well, not so se...","Excellent rooms, wonderful service......value ...",July 2011,2011-09-04,5.0,5.0,5.0,5.0,,5.0,,New York City,On The Ave Hotel,good
4,“Cheerful Location-Best Value.”,We recently stayed for 5 nights in Hotel 140. ...,March 2012,2012-03-23,4.0,4.0,4.0,5.0,5.0,4.0,4.0,Boston,Hotel 140,good
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39995,“WOW!!”,"I was apprehensive to stay on the ""edge"" of do...",August 2011,2011-12-17,5.0,4.0,5.0,5.0,4.0,5.0,5.0,Seattle,Hyatt At Olive 8,good
39996,“basic cosy and comfortable”,We stayed during the hurricaine Sandy and foun...,October 2012,2012-11-09,3.0,4.0,4.0,3.0,5.0,3.0,4.0,New York City,The Hotel at Times Square,good
39997,"“Super place, don't be turned off by how it lo...",Really nice place to stay. Had a room in the c...,June 2012,2012-07-31,5.0,5.0,5.0,5.0,5.0,4.0,5.0,San Francisco,Marina Motel,good
39998,“Well located and friendly”,The MOTEL was very close to downtown which was...,March 2012,2012-03-27,5.0,2.0,3.0,4.0,4.0,4.0,3.0,Phoenix,Americas Best Value Inn - Downtown Phoenix,bad


In [6]:
import spacy

# Load the small English pipeline with the listed components excluded;
# the tokenize() helper below only ever calls `nlp.tokenizer`.
nlp = spacy.load(
    "en_core_web_sm",
    exclude=["tagger", "parser", "ner", "lemmatizer", "attribute_ruler"],
)


def tokenize(text):
    """Split ``text`` into a list of normalized token strings.

    Only the tokenizer of the module-level ``nlp`` pipeline is run. Tokens that
    are whitespace, punctuation, or number-like are dropped; every remaining
    token is represented by its ``norm_`` form.
    """
    kept = []
    for token in nlp.tokenizer(text):
        if token.is_space or token.is_punct or token.like_num:
            continue
        kept.append(token.norm_)
    return kept

In [7]:
import multiprocessing as mp

# Tokenize both splits in a pool of worker processes.
with mp.Pool() as p:
    for frame in (train, test):
        # imap yields results in input order. The progress bar wraps the *results*
        # so it advances as documents finish tokenizing rather than as inputs are
        # handed to the pool; `total` is supplied because the imap iterator has
        # no length of its own.
        token_lists = list(
            tqdm(p.imap(tokenize, frame["text"], chunksize=100), total=len(frame))
        )
        # Attach on the frame's own index so the assignment does not rely on the
        # index happening to be 0..n-1.
        frame["tokens"] = pd.Series(token_lists, index=frame.index)

  0%|          | 0/40000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

---

## Baseline classifier

In [8]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, f1_score
from sklearn.pipeline import make_pipeline

In [9]:
# Baseline: raw token counts fed to a linear SGD classifier with default
# hyperparameters. Passing `identity` as the analyzer makes CountVectorizer use
# the pre-tokenized lists as they are (`identity` presumably comes from the
# cytoolz star import -- verify).
# random_state pins SGD's data shuffling so the reported scores are repeatable;
# 619 matches the seed already used for the train/test split.
baseline = make_pipeline(
    CountVectorizer(analyzer=identity),
    SGDClassifier(random_state=619),
)
baseline.fit(train["tokens"], train["sentiment"])
base_predicted = baseline.predict(test["tokens"])
print(classification_report(test["sentiment"], base_predicted))

              precision    recall  f1-score   support

         bad       0.87      0.65      0.75      2678
        good       0.88      0.96      0.92      7322

    accuracy                           0.88     10000
   macro avg       0.88      0.81      0.83     10000
weighted avg       0.88      0.88      0.88     10000



----

## Hyperparameter search

Find an optimal set of hyperparameters for a Tfidf+SGDClassifier model

In [10]:
import mlflow
from dask_ml.model_selection import RandomizedSearchCV
from logger import log_search
from scipy.stats.distributions import loguniform, randint, uniform

In [11]:
from warnings import simplefilter

# Silence every FutureWarning from here on, keeping the cell outputs below readable.
simplefilter(action="ignore", category=FutureWarning)

In [13]:
from dask.distributed import Client

# Connect to a Dask scheduler that is already running at this address.
# NOTE(review): the host/port is hard-coded; it probably has to be updated
# whenever the scheduler is restarted on a different port -- confirm.
client = Client("tcp://127.0.0.1:46089")
client

0,1
Client  Scheduler: tcp://127.0.0.1:46089  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 16.62 GB


In [14]:
# Group the runs of this notebook under one MLflow experiment.
mlflow.set_experiment("lab-8")

# Untuned pipeline handed to the hyperparameter search:
# token counts -> TF-IDF weighting -> linear SGD classifier.
sgd = make_pipeline(
    CountVectorizer(analyzer=identity),
    TfidfTransformer(),
    SGDClassifier(),
)
# Skeleton classifier

In [15]:
%%time

# Distributions the randomized search samples from; keys follow the
# `<pipeline step>__<parameter>` naming of the `sgd` pipeline.
param_distributions = {
    "countvectorizer__min_df": randint(1, 20),  # integers 1..19 (upper bound exclusive)
    "countvectorizer__max_df": uniform(0.5, 0.5),  # loc=0.5, scale=0.5 -> [0.5, 1.0]
    "tfidftransformer__use_idf": [True, False],
    "sgdclassifier__alpha": loguniform(1e-6, 1e-2),
}

# Evaluate 50 sampled configurations, scored by macro-averaged F1.
# random_state fixes which configurations are drawn, so a re-run explores the
# same candidates; 619 matches the seed used for the train/test split.
search = RandomizedSearchCV(
    sgd,
    param_distributions,
    n_iter=50,
    scoring="f1_macro",
    random_state=619,
)
search.fit(train["tokens"], train["sentiment"])
# log_search is a project helper (from logger.py); presumably it records the
# search results to the active MLflow experiment -- verify.
log_search(search)

CPU times: user 10.7 s, sys: 1.54 s, total: 12.2 s
Wall time: 3min 21s


---

## Compare optimized model to baseline

In [26]:
# Refit the TF-IDF + SGD pipeline with fixed hyperparameters (entered by hand;
# presumably the best configuration found by the search -- verify against the
# logged runs) and evaluate it on the held-out test set.
# random_state pins SGD's data shuffling so the reported scores are repeatable;
# 619 matches the seed used for the train/test split.
sgd = make_pipeline(
    CountVectorizer(analyzer=identity, min_df=12, max_df=0.84),
    TfidfTransformer(use_idf=True),
    SGDClassifier(alpha=5.4e-5, random_state=619),
)
sgd.fit(train["tokens"], train["sentiment"])
predicted = sgd.predict(test["tokens"])
print(classification_report(test["sentiment"], predicted))

              precision    recall  f1-score   support

         bad       0.85      0.77      0.81      2678
        good       0.92      0.95      0.93      7322

    accuracy                           0.90     10000
   macro avg       0.88      0.86      0.87     10000
weighted avg       0.90      0.90      0.90     10000



In [27]:
# Macro-averaged F1 on the test set, first for the baseline and then for the tuned model.
base_f1, sgd_f1 = (
    f1_score(test["sentiment"], y_hat, average="macro")
    for y_hat in (base_predicted, predicted)
)

In [1]:
# Summarize both scores, the absolute gain, and the gain relative to the
# baseline's remaining distance from a perfect score of 1.
f1_gain = sgd_f1 - base_f1
print(f"Base F1 score: {base_f1}")
print(f"SGD F1 score:  {sgd_f1}")
print(f"Difference:    {f1_gain}")
print(f"Improvement:   {f1_gain / (1 - base_f1)}")

NameError: name 'base_f1' is not defined

In [29]:
# Relative error reduction, as a fraction (not a percentage): the share of the
# baseline's remaining gap to a perfect F1 (1 - base_f1) that the tuned model closes.
(sgd_f1 - base_f1) / (1 - base_f1)

0.2213704451268561

In [20]:
# Score each model per test example: 1 when its prediction equals the gold
# label in test["sentiment"], 0 otherwise.
sgd_correct = (predicted == test["sentiment"]).astype(int)
base_correct = (base_predicted == test["sentiment"]).astype(int)

# Per-example difference in correctness:
#    0 -> both models right, or both wrong
#   +1 -> SGD right, baseline wrong
#   -1 -> baseline right, SGD wrong
diff = sgd_correct - base_correct

print(f"SGD and baseline agreed {(diff == 0).sum()} times")
print(f"SGD was right, and baseline was wrong {(diff == 1).sum()} times")
print(f"Baseline was right, and SGD was wrong {(diff == -1).sum()} times")

SGD and baseline agreed 9436 times
SGD was right, and baseline was wrong 384 times
Baseline was right, and SGD was wrong 180 times


In [21]:
from scipy.stats import binomtest, wilcoxon

# One-sided exact sign test on the examples where exactly one model is correct:
# under the null hypothesis each such disagreement is equally likely to favour
# either model (p = 0.5). A small p-value means SGD wins more of them than
# chance would explain.
# `binomtest` (SciPy >= 1.7) replaces `binom_test`, which is deprecated and no
# longer available in recent SciPy releases.
sgd_only = int((diff == 1).sum())
baseline_only = int((diff == -1).sum())
binomtest(sgd_only, n=sgd_only + baseline_only, p=0.5, alternative="greater").pvalue


2.7280047216608303e-18