# Sentiment analysis

In [1]:
import numpy as np
import pandas as pd
from cytoolz import *
from tqdm.auto import tqdm

tqdm.pandas()

In [2]:
import spacy
from spacy.tokens import DocBin
from spacy import displacy

nlp = spacy.load("en_core_web_sm", exclude=["ner"])

In [3]:
df = pd.read_parquet('s3://ling583/sentiment.parquet', storage_options={'anon': True})

In [4]:
docs = DocBin().from_disk('parsed.docbin')
df['doc'] = list(docs.get_docs(nlp.vocab))

In [5]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, f1_score
from sklearn.pipeline import make_pipeline

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
train,test = train_test_split(df,
                             test_size=0.2,
                             stratify=df['sentiment'],
                             random_state=619)

----

In [8]:
displacy.render(nlp("They didn't have any clean towels."))

In [9]:
from spacy.tokens import Token
Token.set_extension('neg', default=False)

In [10]:
for doc in df['doc']:
    for t in doc:
        if t.dep_ == 'neg':
            t.head._.neg = True

In [11]:
def add_neg(token):
    return 'NOT:'+token.norm_ if token._.neg else token.norm_

In [12]:
def tokenize(doc):
    return [add_neg(t) for t in doc]

In [13]:
m1 = make_pipeline(CountVectorizer(preprocessor=identity, tokenizer=tokenize),
                   TfidfTransformer(),
                   SGDClassifier(alpha=1e-5))
m1.fit(train['doc'], train['sentiment'])
m1.score(test['doc'], test['sentiment'])

0.8991

In [14]:
def print_top_feats(M, k=0):
    V = M.named_steps['countvectorizer'].get_feature_names()
    coef = M.named_steps['sgdclassifier'].coef_[0]
    order = coef.argsort()
    for w1, w2 in zip(order[-k:][::-1],order[:k]):
        print(f'{V[w1]:20s} {coef[w1]:7.3f} | {V[w2]:20s} {coef[w2]:7.3f}')

In [15]:
print_top_feats(m1, 25)

great                  5.446 | NOT:stay              -5.221
loved                  4.941 | average               -4.982
perfect                4.900 | ok                    -4.972
excellent              4.496 | dirty                 -4.838
amazing                4.317 | poor                  -4.648
definitely             4.109 | unhelpful             -4.545
wonderful              3.952 | ruined                -4.502
comfortable            3.952 | tiny                  -4.357
appointed              3.872 | not                   -4.246
pleasantly             3.829 | worst                 -4.211
minor                  3.751 | dated                 -4.190
spacious               3.706 | filthy                -4.035
NOT:beat               3.694 | terrible              -3.882
downside               3.607 | dingy                 -3.858
spotless               3.564 | outdated              -3.767
complaint              3.470 | uncomfortable         -3.704
elegant                3.441 | update   

In [16]:
def negify(tok):
    tok._.neg = True
    for child in tok.children:
        negify(child)

In [17]:
for doc in df['doc']:
    for t in doc:
        t._.neg = False
    for t in doc:        
        if t.dep_ == 'neg':
            t.head._.neg = True
            for r in t.head.rights:
                if r.dep_ in ['acomp', 'advmod', 'attr', 'dobj', 'prep', 'xcomp']:
                    negify(r)

In [18]:
m2 = make_pipeline(CountVectorizer(preprocessor=identity, tokenizer=tokenize),
                   TfidfTransformer(),
                   SGDClassifier(alpha=1e-5))
m2.fit(train['doc'], train['sentiment'])
m2.score(test['doc'], test['sentiment'])

0.9023

In [19]:
print_top_feats(m2, 25)

great                  5.590 | dirty                 -5.071
NOT:hesitate           5.416 | average               -4.767
loved                  5.038 | poor                  -4.625
perfect                4.572 | ok                    -4.564
excellent              4.382 | ruined                -4.400
comfortable            4.106 | dated                 -4.248
wonderful              3.998 | disappointed          -4.232
amazing                3.853 | not                   -4.186
pleasantly             3.799 | outdated              -4.182
downside               3.736 | unhelpful             -4.113
NOT:better             3.703 | NOT:again             -4.060
NOT:beat               3.691 | worst                 -4.033
definitely             3.626 | terrible              -3.980
spacious               3.620 | filthy                -3.912
appointed              3.574 | tiny                  -3.809
lovely                 3.559 | horrible              -3.648
complaint              3.548 | uncomfort

In [20]:
def mod_tokenizer(doc):
    return [ add_neg(w.head) + '_' + add_neg(w) for w in doc 
            if w.dep_ in ['amod', 'advmod'] ] + \
            [ add_neg(w) for w in doc]

In [21]:
mod_tokenizer(nlp("The didn't have any clean towels."))

['towels_clean', 'the', 'do', 'not', 'have', 'any', 'clean', 'towels', '.']

In [22]:
m3 = make_pipeline(CountVectorizer(preprocessor=identity, tokenizer=mod_tokenizer),
                   TfidfTransformer(),
                   SGDClassifier(alpha=1e-5))
m3.fit(train['doc'], train['sentiment'])
m3.score(test['doc'], test['sentiment'])

0.9091

In [23]:
print_top_feats(m3, 25)

NOT:hesitate           5.152 | dirty                 -4.827
loved                  4.729 | average               -4.139
great                  4.566 | ok                    -3.967
perfect                4.331 | poor                  -3.950
excellent              3.676 | terrible              -3.871
lovely                 3.448 | worst                 -3.709
quiet                  3.446 | tiny                  -3.705
amazing                3.422 | ruined                -3.698
wonderful              3.206 | filthy                -3.672
immaculate             3.083 | unhelpful             -3.589
NOT:disappointed       3.078 | not                   -3.588
NOT:better             3.064 | dated                 -3.384
thing_bad              3.036 | disappointed          -3.314
NOT:eat                2.997 | outdated              -3.269
NOT:beat               2.993 | horrible              -3.212
spotless               2.939 | thing_best            -3.025
downside               2.927 | rude     

In [24]:
def everything(doc):
    return [ add_neg(w.head) + '_' + add_neg(w) for w in doc ] + \
            [ add_neg(w) for w in doc]

In [25]:
everything(nlp("They didn't have any clean towels."))

['have_they',
 'have_do',
 'have_not',
 'have_have',
 'towels_any',
 'towels_clean',
 'have_towels',
 'have_.',
 'they',
 'do',
 'not',
 'have',
 'any',
 'clean',
 'towels',
 '.']

In [26]:
m4 = make_pipeline(CountVectorizer(preprocessor=identity, tokenizer=everything),
                   TfidfTransformer(),
                   SGDClassifier(alpha=1e-5))
m4.fit(train['doc'], train['sentiment'])
m4.score(test['doc'], test['sentiment'])

0.9116

In [27]:
print_top_feats(m4, 50)

great                  5.348 | average               -4.757
excellent              4.115 | ok                    -4.620
perfect                3.821 | dirty                 -4.309
wonderful              3.489 | not                   -4.021
comfortable            3.343 | poor                  -3.994
amazing                3.231 | terrible              -3.573
lovely                 3.153 | worst                 -3.520
quiet                  3.086 | no                    -3.434
clean_very             2.794 | tiny                  -3.239
minor                  2.755 | rude                  -2.997
definitely             2.748 | nothing               -2.952
awesome                2.730 | disappointed          -2.945
loved                  2.659 | dated                 -2.851
comfortable_very       2.659 | horrible              -2.799
appointed              2.463 | unhelpful             -2.661
spacious               2.423 | bad                   -2.641
fantastic              2.397 | NOT:again

# Failed manual runs
in ascending order of improvement

In [28]:
m5 = make_pipeline(CountVectorizer(preprocessor=identity, 
                                   tokenizer=everything, 
                                   max_df=0.7,
                                   min_df=12),
                   TfidfTransformer(),
                   SGDClassifier(alpha=1e-6))
m5.fit(train['doc'], train['sentiment'])
m5.score(test['doc'], test['sentiment'])

0.8942

In [29]:
m5 = make_pipeline(CountVectorizer(preprocessor=identity, 
                                   tokenizer=everything, 
                                   max_df=0.7,
                                   min_df=10),
                   TfidfTransformer(),
                   SGDClassifier(alpha=1e-5))
m5.fit(train['doc'], train['sentiment'])
m5.score(test['doc'], test['sentiment'])


0.9053

In [None]:
m5 = make_pipeline(CountVectorizer(preprocessor=identity, 
                                   tokenizer=everything, 
                                   max_df=0.7),
                   TfidfTransformer(),
                   SGDClassifier(alpha=1e-6))
m5.fit(train['doc'], train['sentiment'])
m5.score(test['doc'], test['sentiment'])

In [None]:
m5 = make_pipeline(CountVectorizer(preprocessor=identity, 
                                   tokenizer=everything, 
                                   max_df=0.7),
                   TfidfTransformer(),
                   SGDClassifier(alpha=2e-6))
m5.fit(train['doc'], train['sentiment'])
m5.score(test['doc'], test['sentiment'])

In [None]:
m5 = make_pipeline(CountVectorizer(preprocessor=identity, 
                                   tokenizer=everything, 
                                   max_df=0.7,
                                   min_df=8),
                   TfidfTransformer(),
                   SGDClassifier(alpha=2e-5))
m5.fit(train['doc'], train['sentiment'])
m5.score(test['doc'], test['sentiment'])

In [None]:
m5 = make_pipeline(CountVectorizer(preprocessor=identity, 
                                   tokenizer=everything, 
                                   max_df=0.5,
                                   min_df=10),
                   TfidfTransformer(),
                   SGDClassifier(alpha=2e-5))
m5.fit(train['doc'], train['sentiment'])
m5.score(test['doc'], test['sentiment'])

In [None]:
m5 = make_pipeline(CountVectorizer(preprocessor=identity, 
                                   tokenizer=everything, 
                                   max_df=0.7),
                   TfidfTransformer(),
                   SGDClassifier(alpha=1e-5))
m5.fit(train['doc'], train['sentiment'])
m5.score(test['doc'], test['sentiment'])

In [None]:
m5 = make_pipeline(CountVectorizer(preprocessor=identity, 
                                   tokenizer=everything, 
                                   max_df=0.7,
                                   min_df=12),
                   TfidfTransformer(),
                   SGDClassifier(alpha=3e-5))
m5.fit(train['doc'], train['sentiment'])
m5.score(test['doc'], test['sentiment'])

In [None]:
m5 = make_pipeline(CountVectorizer(preprocessor=identity, 
                                   tokenizer=everything, 
                                   max_df=0.7,
                                   min_df=14),
                   TfidfTransformer(),
                   SGDClassifier(alpha=2e-5))
m5.fit(train['doc'], train['sentiment'])
m5.score(test['doc'], test['sentiment'])

In [None]:
m5 = make_pipeline(CountVectorizer(preprocessor=identity, 
                                   tokenizer=everything, 
                                   max_df=0.9,
                                   min_df=10),
                   TfidfTransformer(),
                   SGDClassifier(alpha=2e-5))
m5.fit(train['doc'], train['sentiment'])
m5.score(test['doc'], test['sentiment'])

In [None]:
m5 = make_pipeline(CountVectorizer(preprocessor=identity, 
                                   tokenizer=everything, 
                                   max_df=0.8,
                                   min_df=10),
                   TfidfTransformer(),
                   SGDClassifier(alpha=2e-5))
m5.fit(train['doc'], train['sentiment'])
m5.score(test['doc'], test['sentiment'])

In [35]:
m5 = make_pipeline(CountVectorizer(preprocessor=identity, 
                                   tokenizer=everything, 
                                   max_df=0.9,
                                   min_df=12),
                   TfidfTransformer(),
                   SGDClassifier(alpha=2e-5))
m5.fit(train['doc'], train['sentiment'])
m5.score(test['doc'], test['sentiment'])

0.9128

In [34]:
m5 = make_pipeline(CountVectorizer(preprocessor=identity, 
                                   tokenizer=everything, 
                                   max_df=0.7,
                                   min_df=10),
                   TfidfTransformer(),
                   SGDClassifier(alpha=2e-5))
m5.fit(train['doc'], train['sentiment'])
m5.score(test['doc'], test['sentiment'])

0.9124

In [33]:
m5 = make_pipeline(CountVectorizer(preprocessor=identity, 
                                   tokenizer=everything, 
                                   max_df=0.7,
                                   min_df=12),
                   TfidfTransformer(),
                   SGDClassifier(alpha=2e-5))
m5.fit(train['doc'], train['sentiment'])
m5.score(test['doc'], test['sentiment'])

0.9124

In [31]:
m5 = make_pipeline(CountVectorizer(preprocessor=identity, 
                                   tokenizer=everything, 
                                   max_df=0.8,
                                   min_df=10),
                   TfidfTransformer(),
                   SGDClassifier(alpha=3e-5))
m5.fit(train['doc'], train['sentiment'])
m5.score(test['doc'], test['sentiment'])

0.9126

# Best manual runs

In [36]:
m5 = make_pipeline(CountVectorizer(preprocessor=identity, 
                                   tokenizer=everything, 
                                   max_df=0.8,
                                   min_df=10),
                   TfidfTransformer(),
                   SGDClassifier(alpha=4e-5))
m5.fit(train['doc'], train['sentiment'])
m5.score(test['doc'], test['sentiment'])

0.9131

# Compare to our first classifier

In [37]:
base_predicted = m1.predict(test["doc"])

In [38]:
predicted = m5.predict(test["doc"])

In [39]:
base_f1 = f1_score(test["sentiment"], base_predicted, average="macro")
sgd_f1 = f1_score(test["sentiment"], predicted, average="macro")

In [40]:
print(f"Base F1 score: {base_f1}")
print(f"SGD F1 score:  {sgd_f1}")
print(f"Difference:    {sgd_f1 - base_f1}") 
print(f"Improvement:   {(sgd_f1 - base_f1) / (1 - base_f1)}")

Base F1 score: 0.8667684233550119
SGD F1 score:  0.8847607251396229
Difference:    0.017992301784610976
Improvement:   0.1350453266236853
