# Imports

In [1]:
import string
import pandas as pd
import numpy as np
import scipy.stats as stats
import json
import matplotlib.pyplot as plt

from collections import defaultdict
from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline, FeatureUnion

from tempfile import mkdtemp
from shutil import rmtree
from joblib import Memory

from classes.metrics import Metrics
from classes.classification_pipeline import ClassificationPipeline
from classes.data_loader import DataLoader, Encoder
from classes.preprocessing import Preprocessor
from utils.helpers import pos_check

[nltk_data] Downloading package stopwords to
[nltk_data]     /afs/inf.ed.ac.uk/user/s21/s2125219/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Load Data

In [2]:
# Build the corpus with preprocessing, splitting and duplicate removal enabled,
# then unpack the three partitions returned by the loader.
loader = DataLoader()
train, validate, test = loader.create_dataframe(
    preprocess=True,
    split=True,
    remove_duplicates=True,
)

  1%|▏         | 3446/239073 [00:00<00:06, 34447.63it/s]

Preprocessing...


100%|██████████| 239073/239073 [00:05<00:00, 42815.43it/s]


Deduplicating...


In [3]:
# Preview the first rows of the training partition.
train.head()

Unnamed: 0,phrase_id,phrase,phrase_clean,sentiment_val,label_id,label,word_count
94640,229789,certainly ranks as the most original in years .,certainly ranks original years .,0.79167,4.0,Positive,5
54712,47301,"The whole is quite entertaining , but despite ...",The whole quite entertaining despite virtues u...,0.47222,3.0,Neutral,10
211918,11902,the story 's pathetic,story pathetic,0.16667,1.0,Very negative,2
56047,70261,"This is pure , exciting moviemaking .",This pure exciting moviemaking .,0.91667,5.0,Very positive,5
81973,154034,artistically ',artistically,0.51389,3.0,Neutral,1


In [4]:
# (rows, columns) of the training partition.
train.shape

(126124, 7)

# Train, validation, test: features and labels

In [5]:
# Separate the model input (cleaned phrase text) from the target (label id)
# for each of the three partitions.
TEXT_COL, LABEL_COL = 'phrase_clean', 'label_id'

X_train, X_val, X_test = train[TEXT_COL], validate[TEXT_COL], test[TEXT_COL]
y_train, y_val, y_test = train[LABEL_COL], validate[LABEL_COL], test[LABEL_COL]

# Feature Engineering & Selection

- Features to include:

    - phrase length
    - punctuation count
    - capital letters count

In [6]:
def punct_count(l1, l2):
    """Count the items of ``l1`` that are members of ``l2``.

    Parameters
    ----------
    l1 : iterable
        Items to inspect (the characters of a phrase when given a string).
    l2 : container
        Collection supporting ``in``; a ``set`` gives constant-time lookups.

    Returns
    -------
    int
        Number of items of ``l1`` found in ``l2``.
    """
    return sum(1 for x in l1 if x in l2)


def caps_count(l1):
    """Count the items of ``l1`` for which ``isupper()`` is true.

    Parameters
    ----------
    l1 : iterable of str
        Items to inspect (the characters of a phrase when given a string).

    Returns
    -------
    int
        Number of upper-case items.
    """
    return sum(1 for x in l1 if x.isupper())

def get_phrase_length(text):
    """Return the character length of every phrase as a single-column array.

    Parameters
    ----------
    text : iterable of str
        Phrases to measure; iterated once under a tqdm progress bar.

    Returns
    -------
    numpy.ndarray
        Array reshaped to ``(-1, 1)``: one row per phrase, one column.
    """
    lengths = [len(phrase) for phrase in tqdm(text)]
    return np.array(lengths).reshape(-1, 1)

def get_num_punct(text):
    """Return the punctuation-character count of every phrase as a column.

    Parameters
    ----------
    text : iterable of str
        Phrases to inspect; iterated once under a tqdm progress bar.

    Returns
    -------
    numpy.ndarray
        Array reshaped to ``(-1, 1)``: one row per phrase, one column.
    """
    # Build the lookup set once instead of once per phrase.
    punctuation = set(string.punctuation)
    counts = [punct_count(phrase, punctuation) for phrase in tqdm(text)]
    return np.array(counts).reshape(-1, 1)

def get_num_caps(text):
    """Return the upper-case character count of every phrase as a column.

    Parameters
    ----------
    text : iterable of str
        Phrases to inspect; iterated once under a tqdm progress bar.

    Returns
    -------
    numpy.ndarray
        Array reshaped to ``(-1, 1)``: one row per phrase, one column.
    """
    per_phrase = []
    for phrase in tqdm(text):
        per_phrase.append(caps_count(phrase))
    return np.array(per_phrase).reshape(-1, 1)

In [7]:
# Word-level TF-IDF over 1- to 3-grams, with the vocabulary capped at 10,000 terms.
tfidf_vect = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 3),
    max_features=10000,
    use_idf=True,
)

# Prediction

In [8]:
# Features
# Only the phrase-length feature is wired in here; get_num_punct and
# get_num_caps are defined above but not part of this union.
phrase_length_step = FunctionTransformer(get_phrase_length, validate=False)

features = FeatureUnion([
    ('phrase_length', Pipeline([('f1', phrase_length_step)])),
])

# Single-step pipeline wrapping the hand-crafted feature union.
feature_processing = Pipeline([('features', features)])

# Classifiers
# Fixed seed for the estimators that draw random numbers, so the reported
# scores are the same on every Restart & Run All.
SEED = 42


def build_classifier(clf_id, clf):
    """Wrap ``clf`` in a ClassificationPipeline using the shared vectorizer
    and hand-crafted feature pipeline defined in the cells above.

    NOTE(review): every pipeline receives the same ``tfidf_vect`` and
    ``feature_processing`` objects; confirm ClassificationPipeline does not
    rely on each classifier owning its own copy.
    """
    return ClassificationPipeline(clf_id=clf_id,
                                  clf=clf,
                                  vectorizer=tfidf_vect,
                                  feature_processing=feature_processing)


# Baseline: always predicts the most frequent class.
dc = build_classifier('dc', DummyClassifier(strategy='most_frequent'))
gnb = build_classifier('gnb', GaussianNB())
lr = build_classifier('lr', LogisticRegression(max_iter=10000))
lin_svm = build_classifier('lin_svm', LinearSVC(random_state=SEED))
rbf_svm = build_classifier('rbf_svm', SVC(kernel='rbf'))
rf = build_classifier('rf', RandomForestClassifier(max_depth=10,
                                                   n_estimators=50,
                                                   random_state=SEED))
mlp = build_classifier('mlp', MLPClassifier(max_iter=800, random_state=SEED))

In [9]:
# Classifiers to compare; the training cells below iterate over this list in order.
clfs = [dc, gnb, lr, lin_svm, rbf_svm, rf, mlp]

# Train classifiers using additional features

In [None]:
# Throw-away directory for joblib's on-disk cache of fitted pipeline steps.
cachedir = mkdtemp()
# joblib deprecated the `cachedir` keyword in favour of `location`
# (newer releases reject `cachedir` outright).
memory = Memory(location=cachedir, verbose=10)

# One Metrics record per classifier id, created on first access.
results = defaultdict(Metrics)

try:
    for clf in clfs:
        print(f'Training {clf.clf_id}...\n')

        accuracy, precision, recall, f1, kappa = clf.train_and_evaluate(X_train, y_train, X_val, y_val,
                                                              use_features=True, memory=memory)

        results[clf.clf_id].accuracy = accuracy
        results[clf.clf_id].precision = precision
        results[clf.clf_id].recall = recall
        results[clf.clf_id].f1 = f1
        results[clf.clf_id].kappa = kappa
finally:
    # Remove the cache even when a classifier fails part-way through,
    # so an aborted run does not leave the temp directory behind.
    rmtree(cachedir)

You provided "cachedir='/tmp/tmpce45votn'", use "location='/tmp/tmpce45votn'" instead.
  """


Training dc...

________________________________________________________________________________
[Memory] Calling sklearn.pipeline._fit_transform_one...
_fit_transform_one(FeatureUnion(transformer_list=[('text',
                                Pipeline(steps=[('vectorizer',
                                                 TfidfVectorizer(max_features=10000,
                                                                 ngram_range=(1,
                                                                              3))),
                                                ('to_dense',
                                                 <classes.dense_transformer.DenseTransformer object at 0x7fcd09577110>),
                                                ('lda',
                                                 LinearDiscriminantAnalysis(n_components=4))])),
                               ('feature_processing',
                                Pipeline(steps=[('features',
                         

In [None]:
# Round-trip the results through the project's JSON encoder to get plain
# dicts, then print one summary line per classifier.
decoded_results = json.loads(Encoder().encode(results))

for clf_name, scores in decoded_results.items():
    accuracy, precision, recall, f1, kappa = (
        scores['accuracy'],
        scores['precision'],
        scores['recall'],
        scores['f1'],
        scores['kappa'],
    )

    print(f'{clf_name} --- Accuracy: {accuracy:.3f}, Precision: {precision:.3f}, Recall: {recall:.3f}, F1: {f1:.3f}, Kappa: {kappa:.3f}')
    print()

# Train classifiers without using additional features

In [None]:
# Train and evaluate classifiers without additional features
# Throw-away directory for joblib's on-disk cache of fitted pipeline steps.
cachedir = mkdtemp()
# joblib deprecated the `cachedir` keyword in favour of `location`
# (newer releases reject `cachedir` outright).
memory = Memory(location=cachedir, verbose=10)

# NOTE: this rebinds `results`, discarding the scores from the run that used
# the additional features.
results = defaultdict(Metrics)

try:
    for clf in clfs:
        print(f'Training {clf.clf_id}...\n')

        # use_features=False is what makes this the text-only experiment;
        # with True this cell repeats the previous run.
        accuracy, precision, recall, f1, kappa = clf.train_and_evaluate(X_train, y_train, X_val, y_val,
                                                              use_features=False, memory=memory)

        print(f'{clf.clf_id} --- Accuracy: {accuracy:.3f}, Precision: {precision:.3f}, Recall: {recall:.3f}, F1: {f1:.3f}, Kappa: {kappa:.3f}')

        results[clf.clf_id].accuracy = accuracy
        results[clf.clf_id].precision = precision
        results[clf.clf_id].recall = recall
        results[clf.clf_id].f1 = f1
        results[clf.clf_id].kappa = kappa
finally:
    # Remove the cache even when a classifier fails part-way through.
    rmtree(cachedir)

In [None]:
# Round-trip the results through the project's JSON encoder to get plain
# dicts, then print one summary line per classifier.
decoded_results = json.loads(Encoder().encode(results))

for clf_name, scores in decoded_results.items():
    accuracy, precision, recall, f1, kappa = (
        scores['accuracy'],
        scores['precision'],
        scores['recall'],
        scores['f1'],
        scores['kappa'],
    )

    print(f'{clf_name} --- Accuracy: {accuracy:.3f}, Precision: {precision:.3f}, Recall: {recall:.3f}, F1: {f1:.3f}, Kappa: {kappa:.3f}')
    print()

# Best performing classifier on test set

In [None]:
# De-duplicate the validation set before it is merged into the training data
# further down, to avoid adding any bias.
# NOTE(review): what counts as a duplicate is decided inside DataLoader.dedup,
# which is not visible here. This rebinds `validate`, so the frame used in the
# earlier validation runs is replaced.
validate = DataLoader().dedup(validate)

In [None]:
# (rows, columns) of the validation partition after de-duplication.
validate.shape

In [None]:
# Re-extract features and labels so they reflect the de-duplicated validation frame.
X_val, y_val = validate['phrase_clean'], validate['label_id']

In [None]:
# Stack training and validation data into one fitting set for the final model.
# The original row labels are kept (no index reset).
X, y = (
    pd.concat([train_part, val_part])
    for train_part, val_part in ((X_train, X_val), (y_train, y_val))
)

In [None]:
# Size of the combined train + validation input.
X.shape

In [None]:
# Throw-away directory for joblib's on-disk cache of fitted pipeline steps.
cachedir = mkdtemp()
# joblib deprecated the `cachedir` keyword in favour of `location`
# (newer releases reject `cachedir` outright).
memory = Memory(location=cachedir, verbose=10)

try:
    # Fit on train + validation and score once on the held-out test set.
    # NOTE(review): `rf` is hard-coded as the best classifier; confirm it
    # against the validation results before trusting this number.
    accuracy, precision, recall, f1, kappa = rf.train_and_evaluate(X, y,
                                                                    X_test, y_test,
                                                                    use_features=True,
                                                                    memory=memory)
finally:
    # Remove the cache even if fitting or scoring raises.
    rmtree(cachedir)

In [None]:
# Test-set metrics returned by the final evaluation cell above.
accuracy, precision, recall, f1, kappa