# Imports

In [1]:
import string
import pandas as pd
import numpy as np
import scipy.stats as stats
import json
import matplotlib.pyplot as plt

from collections import defaultdict
from preprocessing import Preprocessor
from data_loader import DataLoader, Encoder
from helpers import select_n_components, pos_check
from textblob import TextBlob
from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, cohen_kappa_score
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator
from sklearn.utils.fixes import loguniform
from sklearn.cluster import KMeans
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.base import TransformerMixin
from tempfile import mkdtemp
from shutil import rmtree
from joblib import Memory

[nltk_data] Downloading package stopwords to
[nltk_data]     /afs/inf.ed.ac.uk/user/s21/s2125219/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Load Data

In [2]:
train, validate, test = DataLoader().create_dataframe(preprocess=True, split=True, remove_duplicates=True)

  1%|▏         | 3508/239073 [00:00<00:06, 35079.03it/s]

Preprocessing...


100%|██████████| 239073/239073 [00:05<00:00, 42710.34it/s]


Deduplicating...


In [3]:
train.head()

Unnamed: 0,phrase_id,phrase,phrase_clean,sentiment_val,label_id,label,word_count
147608,42267,"know the ` truth ' about this man , while deco...",know truth man deconstructing format biography...,0.55556,3.0,Neutral,7
143906,33690,it mostly worth the trip,mostly worth trip,0.56944,3.0,Neutral,3
90526,194916,branched out into their own pseudo-witty copyc...,branched pseudowitty copycat interpretations,0.34722,2.0,Negative,4
41158,147570,Murder by Numbers just does n't add up .,Murder Numbers nt add .,0.38889,2.0,Negative,5
164045,234333,of `` Minority Report,Minority Report,0.5,3.0,Neutral,2


In [4]:
train.shape

(126124, 7)

# Train, test, dev split

In [5]:
X_train, y_train = train['phrase_clean'], train['label_id']
X_val, y_val = validate['phrase_clean'], validate['label_id']
X_test, y_test = test['phrase_clean'], test['label_id']

# Feature Engineering & Selection

- Features to include:

    - phrase length
    - punctuation count
    - capital letters count

In [6]:
punct_count = lambda l1, l2: sum([1 for x in l1 if x in l2])
caps_count = lambda l1: sum([1 for x in l1 if x.isupper()])

def get_phrase_length(text):
    return np.array([len(t) for t in tqdm(text)]).reshape(-1, 1)

def get_num_punct(text):
    return np.array([punct_count(t, set(string.punctuation)) for t in tqdm(text)]).reshape(-1, 1)

def get_num_caps(text):
    return np.array([caps_count(t) for t in tqdm(text)]).reshape(-1, 1)

In [7]:
tfidf_vect = TfidfVectorizer(analyzer='word', 
                             max_features=10000, 
                             use_idf=True, 
                             ngram_range=(1,3))

# Prediction

In [15]:
class Result:
    def __init__(self):
        self.accuracy = 0
        self.precision = 0
        self.recall = 0
        self.f1 = 0
        self.kappa = 0
        

class DenseTransformer(TransformerMixin):
    def fit(self, X, y=None, **fit_params):
        return self
    
    def transform(self, X, y=None, **fit_params):
        print('Making dense transformation...\n')
        return X.todense()

class ClassificationPipeline():
    def __init__(self, clf_id, clf, vectorizer, feature_processing, pipe=None):
        self.pipe = pipe 
        self.clf_id = clf_id 
        self.clf = clf
        self.vectorizer = vectorizer
        self.feature_processing = feature_processing
                   
    def create_feature_pipeline(self, memory):
        self.pipe = Pipeline([
            ('feature_pipeline', FeatureUnion([
                ('text', Pipeline([
                    ('vectorizer', self.vectorizer),
                    ('to_dense', DenseTransformer()),
                    ('lda', LinearDiscriminantAnalysis(n_components=4)),
                ])),
                ('feature_processing', self.feature_processing)
            ])),
            (self.clf_id, self.clf)
        ], memory=memory)
    
    def create_pipeline(self, memory):
        self.pipe = Pipeline([
            ('feature_pipeline', FeatureUnion([
                ('text', Pipeline([
                    ('vectorizer', self.vectorizer),
                    ('to_dense', DenseTransformer()),
                    ('lda', LinearDiscriminantAnalysis(n_components=4)),
                ])),
            ])),
            (self.clf_id, self.clf)
        ], memory=memory)
            
    def train_and_evaluate(self, X_train, y_train, X_val, y_val, memory, use_features=True):
        if use_features:
            self.create_feature_pipeline(memory=memory)
        else:
            self.create_pipeline(memory=memory)
            
        self.pipe.fit(X_train, y_train)
        preds = self.pipe.predict(X_val)
        
        accuracy = accuracy_score(y_val, preds)
        precision = precision_score(y_val, preds, average='macro')
        recall = recall_score(y_val, preds, average='macro')
        f1 = f1_score(y_val, preds, average='macro')
        kappa = cohen_kappa_score(y_val, preds)
        
        return accuracy, precision, recall, f1, kappa

In [16]:
# Features
features = FeatureUnion([
    ('phrase_length', Pipeline([
        ('f1', FunctionTransformer(get_phrase_length, validate=False))]
    ))
])

feature_processing = Pipeline([('features', features)])

# Classifiers
dc = ClassificationPipeline(clf_id='dc', 
                            clf=DummyClassifier(strategy='most_frequent'),
                            vectorizer=tfidf_vect,
                            feature_processing=feature_processing)
gnb = ClassificationPipeline(clf_id='gnb',
                            clf=GaussianNB(),
                            vectorizer=tfidf_vect,
                            feature_processing=feature_processing)
lr = ClassificationPipeline(clf_id='lr', 
                           clf=LogisticRegression(max_iter=10000),
                           vectorizer=tfidf_vect,
                           feature_processing=feature_processing)
lin_svm = ClassificationPipeline(clf_id='lin_svm', 
                                 clf=LinearSVC(),
                                 vectorizer=tfidf_vect,
                                 feature_processing=feature_processing)
rbf_svm = ClassificationPipeline(clf_id='rbf_svm', 
                                 clf=SVC(kernel='rbf'), 
                                 vectorizer=tfidf_vect,
                                 feature_processing=feature_processing)
rf = ClassificationPipeline(clf_id='rf', 
                            clf=RandomForestClassifier(max_depth=10, n_estimators=50), 
                            vectorizer=tfidf_vect,
                            feature_processing=feature_processing)
mlp = ClassificationPipeline(clf_id='mlp', 
                             clf=MLPClassifier(max_iter=800),
                             vectorizer=tfidf_vect,
                             feature_processing=feature_processing)

In [14]:
# Train and evaluate classifiers using additional features
clfs = [dc, gnb, lr, lin_svm, rbf_svm, rf, mlp]

# Train classifiers using additional features

In [13]:
cachedir = mkdtemp()
memory = Memory(cachedir=cachedir, verbose=10)

results = defaultdict(Result)

for clf in clfs:
    print(f'Training {clf.clf_id}...\n')
    
    accuracy, precision, recall, f1, kappa = clf.train_and_evaluate(X_train, y_train, X_val, y_val, 
                                                          use_features=True, memory=memory)
    
    results[clf.clf_id].accuracy = accuracy 
    results[clf.clf_id].precision = precision
    results[clf.clf_id].recall = recall
    results[clf.clf_id].f1 = f1
    results[clf.clf_id].kappa = kappa
    
rmtree(cachedir)

You provided "cachedir='/tmp/tmp2msbi13o'", use "location='/tmp/tmp2msbi13o'" instead.
  """


Training dc...

________________________________________________________________________________
[Memory] Calling sklearn.pipeline._fit_transform_one...
_fit_transform_one(FeatureUnion(transformer_list=[('text',
                                Pipeline(steps=[('vectorizer',
                                                 TfidfVectorizer(max_features=10000,
                                                                 ngram_range=(1,
                                                                              3))),
                                                ('to_dense',
                                                 <__main__.DenseTransformer object at 0x7f3be27c4190>),
                                                ('lda',
                                                 LinearDiscriminantAnalysis(n_components=4))])),
                               ('feature_processing',
                                Pipeline(steps=[('features',
                                          

100%|██████████| 126124/126124 [00:00<00:00, 1941015.41it/s]
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])


______________________________________________fit_transform_one - 272.8s, 4.5min
Making dense transformation...



100%|██████████| 47470/47470 [00:00<00:00, 1731847.77it/s]
  _warn_prf(average, modifier, msg_start, len(result))


Training gnb...

[Memory]276.9s, 4.6min  : Loading _fit_transform_one from /tmp/tmp2msbi13o/joblib/sklearn/pipeline/_fit_transform_one/3d6a0d23f84ddf42fe6ce6bacd6ab287
___________________________________fit_transform_one cache loaded - 0.5s, 0.0min
Making dense transformation...



100%|██████████| 47470/47470 [00:00<00:00, 1750838.57it/s]


Training lr...

[Memory]279.7s, 4.7min  : Loading _fit_transform_one from /tmp/tmp2msbi13o/joblib/sklearn/pipeline/_fit_transform_one/3d6a0d23f84ddf42fe6ce6bacd6ab287
___________________________________fit_transform_one cache loaded - 0.5s, 0.0min
Making dense transformation...



100%|██████████| 47470/47470 [00:00<00:00, 1774890.00it/s]


Training lin_svm...

[Memory]300.1s, 5.0min  : Loading _fit_transform_one from /tmp/tmp2msbi13o/joblib/sklearn/pipeline/_fit_transform_one/3d6a0d23f84ddf42fe6ce6bacd6ab287
___________________________________fit_transform_one cache loaded - 0.5s, 0.0min




Making dense transformation...



100%|██████████| 47470/47470 [00:00<00:00, 1782389.58it/s]


Training rbf_svm...

[Memory]349.0s, 5.8min  : Loading _fit_transform_one from /tmp/tmp2msbi13o/joblib/sklearn/pipeline/_fit_transform_one/3d6a0d23f84ddf42fe6ce6bacd6ab287
___________________________________fit_transform_one cache loaded - 0.6s, 0.0min
Making dense transformation...



100%|██████████| 47470/47470 [00:00<00:00, 1777092.00it/s]


Training rf...

[Memory]1054.8s, 17.6min: Loading _fit_transform_one from /tmp/tmp2msbi13o/joblib/sklearn/pipeline/_fit_transform_one/3d6a0d23f84ddf42fe6ce6bacd6ab287
___________________________________fit_transform_one cache loaded - 0.6s, 0.0min
Making dense transformation...



100%|██████████| 47470/47470 [00:00<00:00, 1811910.62it/s]


Training mlp...

[Memory]1065.9s, 17.8min: Loading _fit_transform_one from /tmp/tmp2msbi13o/joblib/sklearn/pipeline/_fit_transform_one/3d6a0d23f84ddf42fe6ce6bacd6ab287
___________________________________fit_transform_one cache loaded - 0.5s, 0.0min
Making dense transformation...



100%|██████████| 47470/47470 [00:00<00:00, 1722886.11it/s]


In [16]:
for k, v in json.loads(Encoder().encode(results)).items():
    accuracy = v['accuracy']
    precision = v['precision']
    recall = v['recall']
    f1 = v['f1']
    kappa = v['kappa']
    
    print(f'{k} --- Accuracy: {accuracy:.3f}, Precision: {precision:.3f}, Recall: {recall:.3f}, F1: {f1:.3f}, Kappa: {kappa:.3f}')
    print()

dc --- Accuracy: 0.498, Precision: 0.100, Recall: 0.200, F1: 0.133, Kappa: 0.000

gnb --- Accuracy: 0.611, Precision: 0.521, Recall: 0.500, F1: 0.509, Kappa: 0.404

lr --- Accuracy: 0.622, Precision: 0.554, Recall: 0.469, F1: 0.498, Kappa: 0.405

lin_svm --- Accuracy: 0.606, Precision: 0.540, Recall: 0.432, F1: 0.442, Kappa: 0.401

rbf_svm --- Accuracy: 0.627, Precision: 0.559, Recall: 0.463, F1: 0.493, Kappa: 0.413

rf --- Accuracy: 0.627, Precision: 0.549, Recall: 0.493, F1: 0.515, Kappa: 0.421

mlp --- Accuracy: 0.624, Precision: 0.540, Recall: 0.498, F1: 0.516, Kappa: 0.422



# Train classifiers without using additional features

In [17]:
# Train and evaulate classifiers without additional features
cachedir = mkdtemp()
memory = Memory(cachedir=cachedir, verbose=10)

results = defaultdict(Result)

for clf in clfs:
    print(f'Training {clf.clf_id}...\n')
    
    accuracy, precision, recall, f1, kappa = clf.train_and_evaluate(X_train, y_train, X_val, y_val, 
                                                          use_features=True, memory=memory)
    
    print(f'{clf.clf_id} --- Accuracy: {accuracy:.3f}, Precision: {precision:.3f}, Recall: {recall:.3f}, F1: {f1}, Kappa: {kappa}')
    
    results[clf.clf_id].accuracy = accuracy 
    results[clf.clf_id].precision = precision
    results[clf.clf_id].recall = recall
    results[clf.clf_id].f1 = f1
    results[clf.clf_id].kappa = kappa
    

rmtree(cachedir)

You provided "cachedir='/tmp/tmpdw6ugtj8'", use "location='/tmp/tmpdw6ugtj8'" instead.
  This is separate from the ipykernel package so we can avoid doing imports until


Training dc...

________________________________________________________________________________
[Memory] Calling sklearn.pipeline._fit_transform_one...
_fit_transform_one(FeatureUnion(transformer_list=[('text',
                                Pipeline(steps=[('vectorizer',
                                                 TfidfVectorizer(max_features=10000,
                                                                 ngram_range=(1,
                                                                              3))),
                                                ('to_dense',
                                                 <__main__.DenseTransformer object at 0x7fef961d1a50>),
                                                ('lda',
                                                 LinearDiscriminantAnalysis(n_components=4))])),
                               ('feature_processing',
                                Pipeline(steps=[('features',
                                          

100%|██████████| 126124/126124 [00:00<00:00, 2297243.74it/s]
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])


______________________________________________fit_transform_one - 273.5s, 4.6min
Making dense transformation...



100%|██████████| 47470/47470 [00:00<00:00, 1988173.14it/s]
  _warn_prf(average, modifier, msg_start, len(result))


dc --- Accuracy: 0.498, Precision: 0.100, Recall: 0.200, F1: 0.13290195526797016, Kappa: 0.0
Training gnb...

[Memory]277.7s, 4.6min  : Loading _fit_transform_one from /tmp/tmpdw6ugtj8/joblib/sklearn/pipeline/_fit_transform_one/d225e5ba4ef299475900aa4c8daee4a8
___________________________________fit_transform_one cache loaded - 0.6s, 0.0min
Making dense transformation...



100%|██████████| 47470/47470 [00:00<00:00, 2001403.38it/s]


gnb --- Accuracy: 0.611, Precision: 0.521, Recall: 0.500, F1: 0.5094656485071292, Kappa: 0.40408449055598505
Training lr...

[Memory]280.4s, 4.7min  : Loading _fit_transform_one from /tmp/tmpdw6ugtj8/joblib/sklearn/pipeline/_fit_transform_one/d225e5ba4ef299475900aa4c8daee4a8
___________________________________fit_transform_one cache loaded - 0.6s, 0.0min
Making dense transformation...



100%|██████████| 47470/47470 [00:00<00:00, 2022629.58it/s]


lr --- Accuracy: 0.622, Precision: 0.553, Recall: 0.469, F1: 0.49805320587337387, Kappa: 0.4048598366105536
Training lin_svm...

[Memory]305.2s, 5.1min  : Loading _fit_transform_one from /tmp/tmpdw6ugtj8/joblib/sklearn/pipeline/_fit_transform_one/d225e5ba4ef299475900aa4c8daee4a8
___________________________________fit_transform_one cache loaded - 0.5s, 0.0min




Making dense transformation...



100%|██████████| 47470/47470 [00:00<00:00, 2037761.99it/s]


lin_svm --- Accuracy: 0.609, Precision: 0.535, Recall: 0.425, F1: 0.44951277498465536, Kappa: 0.37526375110956445
Training rbf_svm...

[Memory]356.3s, 5.9min  : Loading _fit_transform_one from /tmp/tmpdw6ugtj8/joblib/sklearn/pipeline/_fit_transform_one/d225e5ba4ef299475900aa4c8daee4a8
___________________________________fit_transform_one cache loaded - 0.6s, 0.0min
Making dense transformation...



100%|██████████| 47470/47470 [00:00<00:00, 2191708.98it/s]


rbf_svm --- Accuracy: 0.627, Precision: 0.559, Recall: 0.463, F1: 0.49267876349282114, Kappa: 0.4126594116546488
Training rf...

[Memory]1054.3s, 17.6min: Loading _fit_transform_one from /tmp/tmpdw6ugtj8/joblib/sklearn/pipeline/_fit_transform_one/d225e5ba4ef299475900aa4c8daee4a8
___________________________________fit_transform_one cache loaded - 0.6s, 0.0min
Making dense transformation...



100%|██████████| 47470/47470 [00:00<00:00, 2014994.39it/s]


rf --- Accuracy: 0.627, Precision: 0.548, Recall: 0.492, F1: 0.5140489877851053, Kappa: 0.4217551496418803
Training mlp...

[Memory]1065.6s, 17.8min: Loading _fit_transform_one from /tmp/tmpdw6ugtj8/joblib/sklearn/pipeline/_fit_transform_one/d225e5ba4ef299475900aa4c8daee4a8
___________________________________fit_transform_one cache loaded - 0.6s, 0.0min
Making dense transformation...



100%|██████████| 47470/47470 [00:00<00:00, 1984823.61it/s]

mlp --- Accuracy: 0.623, Precision: 0.546, Recall: 0.483, F1: 0.5056438077978067, Kappa: 0.4180293487250639





In [19]:
for k, v in json.loads(Encoder().encode(results)).items():
    accuracy = v['accuracy']
    precision = v['precision']
    recall = v['recall']
    f1 = v['f1']
    kappa = v['kappa']
    
    print(f'{k} --- Accuracy: {accuracy:.3f}, Precision: {precision:.3f}, Recall: {recall:.3f}, F1: {f1:.3f}, Kappa: {kappa:.3f}')
    print()

dc --- Accuracy: 0.498, Precision: 0.100, Recall: 0.200, F1: 0.133, Kappa: 0.000

gnb --- Accuracy: 0.611, Precision: 0.521, Recall: 0.500, F1: 0.509, Kappa: 0.404

lr --- Accuracy: 0.622, Precision: 0.553, Recall: 0.469, F1: 0.498, Kappa: 0.405

lin_svm --- Accuracy: 0.609, Precision: 0.535, Recall: 0.425, F1: 0.450, Kappa: 0.375

rbf_svm --- Accuracy: 0.627, Precision: 0.559, Recall: 0.463, F1: 0.493, Kappa: 0.413

rf --- Accuracy: 0.627, Precision: 0.548, Recall: 0.492, F1: 0.514, Kappa: 0.422

mlp --- Accuracy: 0.623, Precision: 0.546, Recall: 0.483, F1: 0.506, Kappa: 0.418



# Best performing classifier on test set

In [9]:
X = pd.concat([X_train, X_val])
y = pd.concat([y_train, y_val])

In [13]:
X.shape

(173594,)

In [18]:
cachedir = mkdtemp()
memory = Memory(cachedir=cachedir, verbose=10)

accuracy, precision, recall, f1, kappa = mlp.train_and_evaluate(X, y, 
                                                                X_test, y_test, 
                                                                use_features=True, 
                                                                memory=memory)
rmtree(cachedir)

You provided "cachedir='/tmp/tmpbcx6_0_i'", use "location='/tmp/tmpbcx6_0_i'" instead.
  


________________________________________________________________________________
[Memory] Calling sklearn.pipeline._fit_transform_one...
_fit_transform_one(FeatureUnion(transformer_list=[('text',
                                Pipeline(steps=[('vectorizer',
                                                 TfidfVectorizer(max_features=10000,
                                                                 ngram_range=(1,
                                                                              3))),
                                                ('to_dense',
                                                 <__main__.DenseTransformer object at 0x7f69b05c5590>),
                                                ('lda',
                                                 LinearDiscriminantAnalysis(n_components=4))])),
                               ('feature_processing',
                                Pipeline(steps=[('features',
                                                 FeatureUn

100%|██████████| 173594/173594 [00:00<00:00, 1876262.06it/s]
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])


______________________________________________fit_transform_one - 339.8s, 5.7min
Making dense transformation...



100%|██████████| 47470/47470 [00:00<00:00, 1868587.57it/s]
