# Imports

In [10]:
import string
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt

from preprocessing import Preprocessor
from data_loader import DataLoader
from helpers import select_n_components, pos_check
from textblob import TextBlob
from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.metrics import accuracy_score, f1_score, log_loss
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer, FunctionTransformer
from sklearn.base import BaseEstimator
from sklearn.utils.fixes import loguniform
from sklearn.cluster import KMeans
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.base import TransformerMixin
from tempfile import mkdtemp
from shutil import rmtree
from joblib import Memory

# Load Data

In [15]:
# Build the three data frames with the project DataLoader; the keyword flags
# request preprocessing, duplicate removal and splitting of the data.
train, validate, test = DataLoader().create_dataframe(
    preprocess=True,
    split=True,
    remove_duplicates=True,
)

  1%|▏         | 3456/239073 [00:00<00:06, 34550.31it/s]

Preprocessing...


100%|██████████| 239073/239073 [00:05<00:00, 41650.74it/s]


Deduplicating...


In [16]:
# Display the first rows of the training frame as a quick sanity check.
train.head()

Unnamed: 0,phrase_id,phrase,phrase_clean,sentiment_val,label_id,label,word_count
29417,105996,"Gives us a lot to chew on , but not all of it ...",Gives us lot chew properly digested .,0.41667,3.0,Neutral,7
45769,108668,Rocky and Bullwinkle,Rocky Bullwinkle,0.51389,3.0,Neutral,2
129460,82606,if Nakata did it better,Nakata better,0.44444,3.0,Neutral,2
18647,104291,An engrossing Iranian film about two itinerant...,An engrossing Iranian film two itinerant teach...,0.66667,4.0,Positive,19
9574,13622,", provocative , insistently humanizing",provocative insistently humanizing,0.68056,4.0,Positive,3


In [17]:
# (rows, columns) of the training frame.
train.shape

(126124, 7)

# Train, validation, test split

In [19]:
# Inputs are the cleaned phrases; targets are the label ids. The same two
# columns are pulled from each of the three frames.
X_train, X_val, X_test = (frame['phrase_clean'] for frame in (train, validate, test))
y_train, y_val, y_test = (frame['label_id'] for frame in (train, validate, test))

# Feature Engineering & Selection

- Features to include:

    - phrase length
    - punctuation count
    - capital letters count
    - number of adjective POS tags (TODO: not yet implemented in the feature functions below)

In [20]:
def punct_count(l1, l2):
    """Return how many items of ``l1`` are members of ``l2``.

    Used with a phrase (iterated character by character) as ``l1`` and a
    collection of punctuation characters as ``l2``.
    """
    return sum(1 for x in l1 if x in l2)


def caps_count(l1):
    """Return how many items of ``l1`` satisfy ``str.isupper()``."""
    return sum(1 for x in l1 if x.isupper())


def get_phrase_length(text):
    """Return the character length of every phrase as an ``(n, 1)`` array.

    ``text`` is any iterable of strings; iteration is wrapped in ``tqdm`` so
    a progress bar is shown.
    """
    return np.array([len(t) for t in tqdm(text)]).reshape(-1, 1)


def get_num_punct(text):
    """Return the punctuation-character count of every phrase as an ``(n, 1)`` array.

    Punctuation is defined by ``string.punctuation``.
    """
    # Build the lookup set once rather than once per phrase.
    punctuation = set(string.punctuation)
    return np.array([punct_count(t, punctuation) for t in tqdm(text)]).reshape(-1, 1)


def get_num_caps(text):
    """Return the upper-case character count of every phrase as an ``(n, 1)`` array."""
    return np.array([caps_count(t) for t in tqdm(text)]).reshape(-1, 1)

In [21]:
# Word-level TF-IDF over 1- to 3-grams, limited to 10,000 features.
tfidf_vect = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 3),
    max_features=10000,
    use_idf=True,
)

# Prediction

In [39]:
class ClassificationPipeline():
    """Holder for the settings used to assemble a classification pipeline.

    This class is a work-in-progress stub: it only records its constructor
    arguments. An ``__init__`` without a body is a syntax error that would
    prevent every other definition in this cell from being created.
    """

    def __init__(self, memory, vectorizer, use_features):
        """Store the caching backend, the text vectorizer and the feature flag."""
        self.memory = memory
        self.vectorizer = vectorizer
        self.use_features = use_features

class DenseTransformer(TransformerMixin):
    """Pipeline step that turns a sparse matrix into a dense NumPy array.

    It sits between the vectorizer and LinearDiscriminantAnalysis in the
    pipelines built below.
    """

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn; return ``self`` so the step can be chained."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X`` as a dense ``numpy.ndarray``.

        Sparse input is converted with ``toarray()``; anything else is passed
        through ``np.asarray``.
        """
        print('Making dense transformation...')
        # ``todense()`` would return the legacy ``numpy.matrix`` type, which
        # recent scikit-learn estimators refuse; ``toarray()`` gives an ndarray.
        if hasattr(X, 'toarray'):
            return X.toarray()
        return np.asarray(X)


def create_feature_pipeline(feature_name, feature_id, feature):
    """Wrap one transformer in a single-step Pipeline and pair it with a name.

    Returns the tuple ``(feature_name, Pipeline([(feature_id, feature)]))``,
    the shape used for the FeatureUnion entries in ``create_pipeline``.
    """
    single_step = [(feature_id, feature)]
    return feature_name, Pipeline(single_step)

def create_pipeline(my_id, clf, memory, vectorizer=tfidf_vect, use_features=True):
    """Build a feature-extraction + classifier Pipeline.

    The text branch vectorizes the phrases, densifies the result and projects
    it with LinearDiscriminantAnalysis (4 components). When ``use_features``
    is true, three hand-crafted count features (phrase length, punctuation
    count, capital-letter count) are concatenated to it via a FeatureUnion.

    Parameters
    ----------
    my_id : str
        Step name given to the classifier inside the pipeline.
    clf : estimator
        Final classifier step.
    memory : object or None
        Forwarded to ``Pipeline(memory=...)`` to cache fitted transformers.
    vectorizer : transformer
        Text vectorizer used as the first step of the text branch.
    use_features : bool
        Whether to append the hand-crafted features to the text branch.

    Returns
    -------
    Pipeline
        Unfitted pipeline with steps ``'features'`` and ``my_id``.
    """
    branches = [
        ('text', Pipeline([
            ('vectorizer', vectorizer),
            ('to_dense', DenseTransformer()),
            ('lda', LinearDiscriminantAnalysis(n_components=4)),
        ])),
    ]

    if use_features:
        branches.extend(
            create_feature_pipeline(name, step_id, FunctionTransformer(func, validate=False))
            for name, step_id, func in (
                ('phrase_length', 'f1', get_phrase_length),
                ('num_punct', 'f2', get_num_punct),
                ('num_caps', 'f3', get_num_caps),
            )
        )

    # ``memory`` is passed in both configurations; previously the text-only
    # variant dropped it, so its transformers were never cached.
    return Pipeline([
        ('features', FeatureUnion(branches)),
        (my_id, clf),
    ], memory=memory)

# Train classifiers with TF-IDF vectors

In [31]:
# Temporary directory backing the joblib cache shared by all pipelines below.
cachedir = mkdtemp()
# ``location`` is the current name of Memory's cache-directory argument; the
# older ``cachedir`` keyword was deprecated in joblib 0.12 and removed in
# later releases.
memory = Memory(location=cachedir, verbose=10)

# One pipeline per model, all with the hand-crafted features enabled.
# Each entry is (display name, pipeline step id, estimator).
classifiers = {
    label: create_pipeline(my_id=step_id, clf=estimator, memory=memory, use_features=True)
    for label, step_id, estimator in (
        ('Dummy, most frequent', 'dc', DummyClassifier(strategy='most_frequent')),
        ('Gaussian NB', 'gnb', GaussianNB()),
        ('Logistic Regression', 'lr', LogisticRegression(max_iter=8000)),
        ('Linear SVM', 'lin_svm', LinearSVC()),
        ('RBF SVM', 'svm_rbf', SVC(kernel='rbf')),
        ('Random Forest', 'rf', RandomForestClassifier(max_depth=10, n_estimators=50)),
        ('MLP Classifer', 'mlp', MLPClassifier(max_iter=800)),
    )
}

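- Micro-averaged F1 is the preferred score here because the dataset is imbalanced. Note that micro-averaging weights every sample equally (for single-label multiclass data it equals accuracy), whereas macro-averaging weights every class equally; the cell below computes the macro-averaged F1.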

In [34]:
# Macro-averaged F1 per classifier, keyed by display name.
ca_train_score = {}
ca_val_score = {}

try:
    for name, clf in classifiers.items():
        clf.fit(X_train, y_train)
        print(f'\nFinished training classifier: {name}')

        train_preds = clf.predict(X_train)
        val_preds = clf.predict(X_val)

        # f1_score takes (y_true, y_pred). Macro F1 gives the same value in
        # either order, but the documented order keeps warnings and any
        # later change of ``average`` correct.
        ca_train_score[name] = f1_score(y_train, train_preds, average='macro')
        ca_val_score[name] = f1_score(y_val, val_preds, average='macro')

        print(f'{name}: {ca_val_score[name]}')
finally:
    # Remove the temporary joblib cache even if training fails or is
    # interrupted, so long runs do not leave it behind.
    rmtree(cachedir, ignore_errors=True)

[Memory]2795.8s, 46.6min: Loading _fit_transform_one from /tmp/tmpyqku2wwm/joblib/sklearn/pipeline/_fit_transform_one/bdab5df99b107d978989bb6fa60bf478
___________________________________fit_transform_one cache loaded - 0.5s, 0.0min

Finished training classifier: Dummy, most frequent
Making dense transformation...


100%|██████████| 126124/126124 [00:00<00:00, 1882791.92it/s]
100%|██████████| 126124/126124 [00:00<00:00, 403909.58it/s]
100%|██████████| 126124/126124 [00:00<00:00, 505638.85it/s]


Making dense transformation...


100%|██████████| 47470/47470 [00:00<00:00, 1716514.02it/s]
100%|██████████| 47470/47470 [00:00<00:00, 332327.87it/s]
100%|██████████| 47470/47470 [00:00<00:00, 465620.72it/s]


Dummy, most frequent: 0.13290195526797016
[Memory]2803.7s, 46.7min: Loading _fit_transform_one from /tmp/tmpyqku2wwm/joblib/sklearn/pipeline/_fit_transform_one/bdab5df99b107d978989bb6fa60bf478
___________________________________fit_transform_one cache loaded - 0.5s, 0.0min

Finished training classifier: Gaussian NB
Making dense transformation...


100%|██████████| 126124/126124 [00:00<00:00, 1860470.28it/s]
100%|██████████| 126124/126124 [00:00<00:00, 402348.06it/s]
100%|██████████| 126124/126124 [00:00<00:00, 500240.57it/s]


Making dense transformation...


100%|██████████| 47470/47470 [00:00<00:00, 1723154.51it/s]
100%|██████████| 47470/47470 [00:00<00:00, 331281.69it/s]
100%|██████████| 47470/47470 [00:00<00:00, 450995.32it/s]


Gaussian NB: 0.4946757953164787
[Memory]2811.7s, 46.9min: Loading _fit_transform_one from /tmp/tmpyqku2wwm/joblib/sklearn/pipeline/_fit_transform_one/bdab5df99b107d978989bb6fa60bf478
___________________________________fit_transform_one cache loaded - 0.5s, 0.0min

Finished training classifier: Logistic Regression
Making dense transformation...


100%|██████████| 126124/126124 [00:00<00:00, 1899756.51it/s]
100%|██████████| 126124/126124 [00:00<00:00, 406917.72it/s]
100%|██████████| 126124/126124 [00:00<00:00, 510775.38it/s]


Making dense transformation...


100%|██████████| 47470/47470 [00:00<00:00, 1672633.58it/s]
100%|██████████| 47470/47470 [00:00<00:00, 359795.46it/s]
100%|██████████| 47470/47470 [00:00<00:00, 442164.57it/s]


Logistic Regression: 0.4984168813813622
[Memory]2837.6s, 47.3min: Loading _fit_transform_one from /tmp/tmpyqku2wwm/joblib/sklearn/pipeline/_fit_transform_one/bdab5df99b107d978989bb6fa60bf478
___________________________________fit_transform_one cache loaded - 0.6s, 0.0min





Finished training classifier: Linear SVM
Making dense transformation...


100%|██████████| 126124/126124 [00:00<00:00, 1858476.75it/s]
100%|██████████| 126124/126124 [00:00<00:00, 401200.41it/s]
100%|██████████| 126124/126124 [00:00<00:00, 499352.35it/s]


Making dense transformation...


100%|██████████| 47470/47470 [00:00<00:00, 1708838.52it/s]
100%|██████████| 47470/47470 [00:00<00:00, 328767.02it/s]
100%|██████████| 47470/47470 [00:00<00:00, 460685.37it/s]


Linear SVM: 0.41898825408612295
[Memory]2901.1s, 48.4min: Loading _fit_transform_one from /tmp/tmpyqku2wwm/joblib/sklearn/pipeline/_fit_transform_one/bdab5df99b107d978989bb6fa60bf478
___________________________________fit_transform_one cache loaded - 0.6s, 0.0min

Finished training classifier: RBF SVM
Making dense transformation...


100%|██████████| 126124/126124 [00:00<00:00, 1448190.86it/s]
100%|██████████| 126124/126124 [00:00<00:00, 401097.89it/s]
100%|██████████| 126124/126124 [00:00<00:00, 496437.58it/s]


Making dense transformation...


100%|██████████| 47470/47470 [00:00<00:00, 1700083.77it/s]
100%|██████████| 47470/47470 [00:00<00:00, 334083.84it/s]
100%|██████████| 47470/47470 [00:00<00:00, 468282.96it/s]


RBF SVM: 0.49421423105242646
[Memory]4309.6s, 71.8min: Loading _fit_transform_one from /tmp/tmpyqku2wwm/joblib/sklearn/pipeline/_fit_transform_one/bdab5df99b107d978989bb6fa60bf478
___________________________________fit_transform_one cache loaded - 0.6s, 0.0min

Finished training classifier: Random Forest
Making dense transformation...


100%|██████████| 126124/126124 [00:00<00:00, 1897405.69it/s]
100%|██████████| 126124/126124 [00:00<00:00, 389418.18it/s]
100%|██████████| 126124/126124 [00:00<00:00, 423113.57it/s]


Making dense transformation...


100%|██████████| 47470/47470 [00:00<00:00, 1725005.73it/s]
100%|██████████| 47470/47470 [00:00<00:00, 328533.75it/s]
100%|██████████| 47470/47470 [00:00<00:00, 458193.98it/s]


Random Forest: 0.5111567105000308
[Memory]4325.5s, 72.1min: Loading _fit_transform_one from /tmp/tmpyqku2wwm/joblib/sklearn/pipeline/_fit_transform_one/bdab5df99b107d978989bb6fa60bf478
___________________________________fit_transform_one cache loaded - 0.6s, 0.0min

Finished training classifier: MLP Classifer
Making dense transformation...


100%|██████████| 126124/126124 [00:00<00:00, 1879300.43it/s]
100%|██████████| 126124/126124 [00:00<00:00, 405830.74it/s]
100%|██████████| 126124/126124 [00:00<00:00, 500264.22it/s]


Making dense transformation...


100%|██████████| 47470/47470 [00:00<00:00, 1423897.67it/s]
100%|██████████| 47470/47470 [00:00<00:00, 320778.38it/s]
100%|██████████| 47470/47470 [00:00<00:00, 337606.23it/s]


MLP Classifer: 0.5155385014680038


In [35]:
# Header, then one row per classifier: name, validation F1, training F1.
print(
    'Classification performance on validation set: \n',
    'Validation (LHS), Training (RHS)',
    '',
    sep='\n',
)
for name in classifiers:
    print(f"{name:<20s}{ca_val_score[name]:>13.3f}{ca_train_score[name]:>13.3f}")

Classification performance on validation set: 

Validation (LHS), Training (RHS)

Dummy, most frequent        0.133        0.136
Gaussian NB                 0.495        0.566
Logistic Regression         0.498        0.583
Linear SVM                  0.419        0.495
RBF SVM                     0.494        0.576
Random Forest               0.511        0.625
MLP Classifer               0.516        0.607


In [40]:
# Fresh temporary directory for the joblib cache used by the text-only run.
cachedir = mkdtemp()
# ``location`` is the current name of Memory's cache-directory argument; the
# older ``cachedir`` keyword was deprecated in joblib 0.12 and removed in
# later releases.
memory = Memory(location=cachedir, verbose=10)

# Same models as before, this time without the hand-crafted features.
# Each entry is (display name, pipeline step id, estimator).
classifiers = {
    label: create_pipeline(my_id=step_id, clf=estimator, memory=memory, use_features=False)
    for label, step_id, estimator in (
        ('Dummy, most frequent', 'dc', DummyClassifier(strategy='most_frequent')),
        ('Gaussian NB', 'gnb', GaussianNB()),
        ('Logistic Regression', 'lr', LogisticRegression(max_iter=8000)),
        ('Linear SVM', 'lin_svm', LinearSVC()),
        ('RBF SVM', 'svm_rbf', SVC(kernel='rbf')),
        ('Random Forest', 'rf', RandomForestClassifier(max_depth=10, n_estimators=50)),
        ('MLP Classifer', 'mlp', MLPClassifier(max_iter=800)),
    )
}

In [41]:
# Micro-averaged F1 per classifier, keyed by display name.
ca_train_score = {}
ca_val_score = {}

# Macro-averaged F1 per classifier. The ``ce_`` prefix on the validation
# dict is kept because the reporting cell looks it up by that name.
ca_train_macro = {}
ce_val_macro = {}

try:
    for name, clf in classifiers.items():
        clf.fit(X_train, y_train)
        print(f'\nFinished training classifier: {name}')

        train_preds = clf.predict(X_train)
        val_preds = clf.predict(X_val)

        # f1_score takes (y_true, y_pred). Micro and macro F1 give the same
        # value in either order, but the documented order keeps warnings and
        # any later change of ``average`` correct.
        ca_train_score[name] = f1_score(y_train, train_preds, average='micro')
        ca_val_score[name] = f1_score(y_val, val_preds, average='micro')

        ca_train_macro[name] = f1_score(y_train, train_preds, average='macro')
        ce_val_macro[name] = f1_score(y_val, val_preds, average='macro')
finally:
    # Remove the temporary joblib cache even if training fails or is
    # interrupted, so long runs do not leave it behind.
    rmtree(cachedir, ignore_errors=True)

Making dense transformation...

Finished training classifier: Dummy, most frequent
Making dense transformation...
Making dense transformation...
Making dense transformation...

Finished training classifier: Gaussian NB
Making dense transformation...
Making dense transformation...
Making dense transformation...

Finished training classifier: Logistic Regression
Making dense transformation...
Making dense transformation...
Making dense transformation...





Finished training classifier: Linear SVM
Making dense transformation...
Making dense transformation...
Making dense transformation...

Finished training classifier: RBF SVM
Making dense transformation...
Making dense transformation...
Making dense transformation...

Finished training classifier: Random Forest
Making dense transformation...
Making dense transformation...
Making dense transformation...

Finished training classifier: MLP Classifer
Making dense transformation...
Making dense transformation...


In [43]:
# Header, then one row per classifier with four score columns in this order:
# validation micro-F1, validation macro-F1, training micro-F1, training macro-F1.
print(
    'Classification performance on validation set: \n',
    'Validation (LHS), Training (RHS)',
    '',
    sep='\n',
)
for name in classifiers:
    print(f'{name:<20s}' + ''.join(
        f'{scores[name]:>13.3f}'
        for scores in (ca_val_score, ce_val_macro, ca_train_score, ca_train_macro)
    ))

Classification performance on validation set: 

Validation (LHS), Training (RHS)

Dummy, most frequent        0.498        0.133        0.518        0.136
Gaussian NB                 0.607        0.504        0.678        0.585
Logistic Regression         0.618        0.491        0.687        0.575
Linear SVM                  0.611        0.466        0.683        0.555
RBF SVM                     0.619        0.494        0.691        0.583
Random Forest               0.620        0.505        0.709        0.617
MLP Classifer               0.618        0.498        0.693        0.588
