# Imports

In [1]:
import string
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt

from preprocessing import Preprocessor
from data_loader import DataLoader
from helpers import select_n_components, pos_check
from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.metrics import accuracy_score, f1_score, log_loss
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer, FunctionTransformer
from sklearn.base import BaseEstimator
from sklearn.utils.fixes import loguniform
from sklearn.cluster import KMeans

[nltk_data] Downloading package stopwords to /Users/olive/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


ModuleNotFoundError: No module named 'textblob'

# Load Data

In [2]:
# Build the three data frames used throughout the notebook. The flags ask the
# project DataLoader to preprocess, de-duplicate and split the data.
train, validate, test = DataLoader().create_dataframe(
    remove_duplicates=True,
    split=True,
    preprocess=True,
)

100%|██████████| 239073/239073 [00:05<00:00, 43034.81it/s]


In [3]:
# Peek at the first five rows of the training frame.
train.head(n=5)

Unnamed: 0,phrase_id,phrase,phrase_clean,sentiment_val,label_id,label
64633,3748,a fairy tale that comes from a renowned indian...,fairy tale comes renowned indian film culture ...,0.88889,5.0,Very positive
23483,183087,Chris Cooper 's,Chris Cooper,0.5,3.0,Neutral
93051,229624,call it a work of art,call work art,0.76389,4.0,Positive
15507,103745,A muddled limp biscuit,A muddled limp biscuit,0.19444,1.0,Very negative
84727,115229,ate,ate,0.36111,2.0,Negative


In [4]:
# (rows, columns) of the training frame.
train.shape

(119468, 6)

# Train, test, dev split

In [5]:
# Model inputs are the cleaned phrases; targets are the label ids.
# Both are taken, in the same order, from the train / validate / test frames.
X_train, X_val, X_test = (frame['phrase_clean'] for frame in (train, validate, test))
y_train, y_val, y_test = (frame['label_id'] for frame in (train, validate, test))

# Feature Engineering & Selection

- Features to include:

    - phrase length
    - punctuation count
    - capital letters count
    - number of adjective POS tags

In [6]:
def punct_count(l1, l2):
    """Count the items of ``l1`` that are members of ``l2``.

    Args:
        l1: Iterable to scan (for a string, its characters).
        l2: Container tested with ``in``, e.g. ``set(string.punctuation)``.

    Returns:
        int: Number of items ``x`` in ``l1`` for which ``x in l2`` holds.
    """
    return sum(1 for x in l1 if x in l2)


def caps_count(l1):
    """Count the items of ``l1`` whose ``isupper()`` is true.

    Args:
        l1: Iterable of strings (for a string, its characters).

    Returns:
        int: Number of items ``x`` in ``l1`` for which ``x.isupper()`` holds.
    """
    return sum(1 for x in l1 if x.isupper())

def get_phrase_length(text):
    """Return the length of every phrase as an ``(n, 1)`` column.

    Args:
        text: Iterable of phrases; ``len()`` is applied to each one.

    Returns:
        numpy.ndarray: One row per phrase holding ``len(phrase)``.
    """
    lengths = []
    for phrase in tqdm(text):  # tqdm only adds a progress bar
        lengths.append(len(phrase))
    return np.reshape(np.array(lengths), (-1, 1))

def get_num_punct(text):
    """Return the number of punctuation characters in every phrase as an ``(n, 1)`` column.

    Args:
        text: Iterable of phrases (strings).

    Returns:
        numpy.ndarray: One row per phrase holding the count of characters
        that appear in ``string.punctuation``.
    """
    # Build the lookup set once instead of once per phrase.
    punctuation = set(string.punctuation)
    return np.array(
        [punct_count(phrase, punctuation) for phrase in tqdm(text)]
    ).reshape(-1, 1)

def get_num_caps(text):
    """Return the number of upper-case characters in every phrase as an ``(n, 1)`` column.

    Args:
        text: Iterable of phrases (strings).

    Returns:
        numpy.ndarray: One row per phrase holding ``caps_count(phrase)``.
    """
    counts = list(map(caps_count, tqdm(text)))
    return np.array(counts).reshape(-1, 1)

def get_num_adj_pos(text):
    """Apply ``pos_check`` to every phrase and return the results as an ``(n, 1)`` column.

    Args:
        text: Iterable of phrases, each passed unchanged to ``pos_check``.

    Returns:
        numpy.ndarray: One row per phrase holding ``pos_check(phrase)``.
        NOTE(review): presumably the number of adjective POS tags, going by
        the feature list above; confirm against ``helpers.pos_check``.
    """
    per_phrase = [pos_check(phrase) for phrase in tqdm(text)]
    column = np.array(per_phrase)
    return column.reshape(-1, 1)

In [7]:
# Shared word-level TF-IDF vectorizer: uni-, bi- and tri-grams, with the
# vocabulary capped at 10,000 features.
tfidf_vect = TfidfVectorizer(
    ngram_range=(1, 3),
    max_features=10000,
    analyzer='word',
    use_idf=True,
)

# Prediction

In [8]:
def create_feature_pipeline(feature_name, feature_id, feature):
    """Wrap one transformer in a single-step Pipeline, named for use in a FeatureUnion.

    Args:
        feature_name: Name of the resulting ``(name, transformer)`` entry.
        feature_id: Step name given to ``feature`` inside the Pipeline.
        feature: The transformer to wrap.

    Returns:
        tuple: ``(feature_name, Pipeline([(feature_id, feature)]))``.
    """
    steps = [(feature_id, feature)]
    wrapped = Pipeline(steps)
    return feature_name, wrapped

def create_pipeline(my_id, clf, vectorizer=tfidf_vect, use_features=True):
    """Build a classification pipeline: feature union -> classifier.

    The feature union always contains a text branch (``vectorizer`` followed
    by chi-squared selection of the 1000 best features). When ``use_features``
    is true, four handcrafted single-column features are added alongside it:
    phrase length, punctuation count, capital-letter count and the
    ``pos_check`` feature.

    Args:
        my_id: Step name for the classifier in the returned pipeline.
        clf: The final estimator.
        vectorizer: Text vectorizer for the text branch. It is cloned, so the
            instance passed in (including the shared default) is never fitted
            or mutated by the returned pipeline.
        use_features: Whether to add the handcrafted features.

    Returns:
        sklearn.pipeline.Pipeline: ``[('features', FeatureUnion), (my_id, clf)]``.
    """
    # Local import keeps the cell self-contained; scikit-learn is already a
    # dependency of this notebook.
    from sklearn.base import clone

    text_branch = Pipeline([
        # Every pipeline gets its own vectorizer. Without the clone, all
        # pipelines built with the default argument would share (and refit)
        # one object. safe=False falls back to a deep copy for vectorizers
        # that are not scikit-learn estimators.
        ('vectorizer', clone(vectorizer, safe=False)),
        ('chi2', SelectKBest(chi2, k=1000)),
    ])
    branches = [('text', text_branch)]

    if use_features:
        handcrafted = [
            ('phrase_length', 'f1', get_phrase_length),
            ('num_punct', 'f2', get_num_punct),
            ('num_caps', 'f3', get_num_caps),
            ('num_adj_pos', 'f4', get_num_adj_pos),
        ]
        branches.extend(
            create_feature_pipeline(name, step_id, FunctionTransformer(func, validate=False))
            for name, step_id, func in handcrafted
        )

    return Pipeline([
        ('features', FeatureUnion(branches)),
        (my_id, clf),
    ])

# Train classifiers with TF-IDF vectors

In [9]:
# Candidate models, all trained on TF-IDF text features plus the handcrafted
# features (use_features=True). Keys are the display names used when scoring.
#
# - The SVCs need probability=True: the training loop calls predict_proba on
#   every pipeline, and SVC only provides it when this flag is set (at the
#   cost of slower training).
# - Stochastic estimators are seeded so that a re-run reproduces the scores.
classifiers = {
    'Dummy, most frequent': create_pipeline(my_id='dc',
                                            clf=DummyClassifier(strategy='most_frequent'),
                                            use_features=True),
    'Logistic Regression': create_pipeline(my_id='lr',
                                           clf=LogisticRegression(max_iter=5000),
                                           use_features=True),
    'kNN': create_pipeline(my_id='knn',
                           clf=KNeighborsClassifier(n_neighbors=5),
                           use_features=True),
    'Linear SVM': create_pipeline(my_id='svm',
                                  clf=SVC(kernel='linear', probability=True, random_state=42),
                                  use_features=True),
    'RBF SVM': create_pipeline(my_id='svm_rbf',
                               clf=SVC(kernel='rbf', probability=True, random_state=42),
                               use_features=True),
    'Random Forest': create_pipeline(my_id='rf',
                                     clf=RandomForestClassifier(max_depth=10, n_estimators=50,
                                                                random_state=42),
                                     use_features=True),
    'MLP Classifer': create_pipeline(my_id='mlp',
                                     clf=MLPClassifier(max_iter=800, random_state=42),
                                     use_features=True),
    # "Stronger" = stronger L2 regularisation (alpha=1).
    'MLP Classifer (stronger)': create_pipeline(my_id='mlp',
                                                clf=MLPClassifier(max_iter=800, alpha=1,
                                                                  random_state=42),
                                                use_features=True),
}

In [None]:
# Per-classifier scores, keyed by the display name used in `classifiers`:
#   ca_* : micro-averaged F1 of the hard predictions
#   ce_* : log loss (cross-entropy) of the predicted probabilities
ca_train_score = {}
ca_val_score = {}

ce_train_score = {}
ce_val_score = {}

for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    print(f'\nFinished training classifier: {name}')

    train_preds = clf.predict(X_train)
    val_preds = clf.predict(X_val)

    # scikit-learn metrics take (y_true, y_pred), in that order.
    ca_train_score[name] = f1_score(y_train, train_preds, average='micro')
    ca_val_score[name] = f1_score(y_val, val_preds, average='micro')

    if hasattr(clf, 'predict_proba'):
        ce_train_score[name] = log_loss(y_train, clf.predict_proba(X_train))
        ce_val_score[name] = log_loss(y_val, clf.predict_proba(X_val))
    else:
        # Some estimators (e.g. SVC without probability=True) cannot produce
        # probabilities; record NaN instead of aborting the whole run.
        ce_train_score[name] = np.nan
        ce_val_score[name] = np.nan

100%|██████████| 119468/119468 [00:00<00:00, 2246857.22it/s]
100%|██████████| 119468/119468 [00:00<00:00, 394393.10it/s]
100%|██████████| 119468/119468 [00:00<00:00, 439075.37it/s]
100%|██████████| 119468/119468 [01:09<00:00, 1716.29it/s]



Finished training classifier: Dummy, most frequent


100%|██████████| 119468/119468 [00:00<00:00, 2285774.09it/s]
100%|██████████| 119468/119468 [00:00<00:00, 390454.51it/s]
100%|██████████| 119468/119468 [00:00<00:00, 451212.58it/s]
100%|██████████| 119468/119468 [01:05<00:00, 1829.90it/s]
100%|██████████| 47470/47470 [00:00<00:00, 2251131.89it/s]
100%|██████████| 47470/47470 [00:00<00:00, 399358.17it/s]
100%|██████████| 47470/47470 [00:00<00:00, 433689.06it/s]
100%|██████████| 47470/47470 [00:25<00:00, 1886.81it/s]
100%|██████████| 119468/119468 [00:00<00:00, 2270116.61it/s]
100%|██████████| 119468/119468 [00:00<00:00, 396509.97it/s]
100%|██████████| 119468/119468 [00:00<00:00, 450735.28it/s]
100%|██████████| 119468/119468 [01:02<00:00, 1915.23it/s]
100%|██████████| 47470/47470 [00:00<00:00, 2213984.33it/s]
100%|██████████| 47470/47470 [00:00<00:00, 392553.66it/s]
100%|██████████| 47470/47470 [00:00<00:00, 348209.25it/s]
100%|██████████| 47470/47470 [00:22<00:00, 2148.57it/s]
100%|██████████| 119468/119468 [00:00<00:00, 2282910.29it/s]


Finished training classifier: Logistic Regression


100%|██████████| 119468/119468 [00:00<00:00, 2308850.06it/s]
100%|██████████| 119468/119468 [00:00<00:00, 398093.22it/s]
100%|██████████| 119468/119468 [00:00<00:00, 441640.71it/s]
100%|██████████| 119468/119468 [01:03<00:00, 1874.23it/s]
100%|██████████| 47470/47470 [00:00<00:00, 2175662.86it/s]
100%|██████████| 47470/47470 [00:00<00:00, 395829.08it/s]
100%|██████████| 47470/47470 [00:00<00:00, 435537.11it/s]
100%|██████████| 47470/47470 [00:25<00:00, 1896.21it/s]
100%|██████████| 119468/119468 [00:00<00:00, 2293621.11it/s]
100%|██████████| 119468/119468 [00:00<00:00, 392663.00it/s]
100%|██████████| 119468/119468 [00:00<00:00, 440233.41it/s]
100%|██████████| 119468/119468 [01:03<00:00, 1881.85it/s]
100%|██████████| 47470/47470 [00:00<00:00, 2284713.14it/s]
100%|██████████| 47470/47470 [00:00<00:00, 398155.46it/s]
100%|██████████| 47470/47470 [00:00<00:00, 478119.09it/s]
100%|██████████| 47470/47470 [00:25<00:00, 1888.49it/s]
100%|██████████| 119468/119468 [00:00<00:00, 2292949.40it/s]


Finished training classifier: kNN


100%|██████████| 119468/119468 [00:00<00:00, 2259012.47it/s]
100%|██████████| 119468/119468 [00:00<00:00, 396556.41it/s]
100%|██████████| 119468/119468 [00:00<00:00, 453325.05it/s]
100%|██████████| 119468/119468 [01:03<00:00, 1873.86it/s]
100%|██████████| 47470/47470 [00:00<00:00, 2244280.75it/s]
100%|██████████| 47470/47470 [00:00<00:00, 396364.13it/s]
100%|██████████| 47470/47470 [00:00<00:00, 453884.30it/s]
100%|██████████| 47470/47470 [00:22<00:00, 2149.71it/s]
100%|██████████| 119468/119468 [00:00<00:00, 2288007.63it/s]
100%|██████████| 119468/119468 [00:00<00:00, 395046.99it/s]
100%|██████████| 119468/119468 [00:00<00:00, 442611.29it/s]
100%|██████████| 119468/119468 [01:02<00:00, 1900.81it/s]
100%|██████████| 47470/47470 [00:00<00:00, 2093205.47it/s]
100%|██████████| 47470/47470 [00:00<00:00, 394441.10it/s]
100%|██████████| 47470/47470 [00:00<00:00, 456911.17it/s]
100%|██████████| 47470/47470 [00:25<00:00, 1893.07it/s]
100%|██████████| 119468/119468 [00:00<00:00, 2069316.99it/s]

In [None]:
print('Classification performance on validation set: \n')

print('Validation (LHS), Training (RHS)')
print()

# Columns: name, validation F1, validation log loss, training F1, training log loss.
# Pad the name column to the longest classifier name so the numbers line up.
name_width = max(20, max((len(name) for name in classifiers), default=0) + 2)
not_scored = float('nan')  # shown for classifiers the training loop has not reached

for name in classifiers:
    # The score dicts are keyed by classifier *name*, not by the pipeline object.
    print("{method:<{width}s}{val_f1:>13.3f}{val_logloss:>13.3f}{train_f1:>13.3f}{train_logloss:>13.3f}".format(
        method=name, width=name_width,
        val_f1=ca_val_score.get(name, not_scored),
        val_logloss=ce_val_score.get(name, not_scored),
        train_f1=ca_train_score.get(name, not_scored),
        train_logloss=ce_train_score.get(name, not_scored)))

In [None]:
# Same candidate models as before, but trained on TF-IDF text features only
# (use_features=False), to measure what the handcrafted features contribute.
#
# - The SVCs need probability=True: the training loop calls predict_proba on
#   every pipeline, and SVC only provides it when this flag is set (at the
#   cost of slower training).
# - Stochastic estimators are seeded so that a re-run reproduces the scores.
classifiers = {
    'Dummy, most frequent': create_pipeline(my_id='dc',
                                            clf=DummyClassifier(strategy='most_frequent'),
                                            use_features=False),
    'Logistic Regression': create_pipeline(my_id='lr',
                                           clf=LogisticRegression(max_iter=5000),
                                           use_features=False),
    'kNN': create_pipeline(my_id='knn',
                           clf=KNeighborsClassifier(n_neighbors=5),
                           use_features=False),
    'Linear SVM': create_pipeline(my_id='svm',
                                  clf=SVC(kernel='linear', probability=True, random_state=42),
                                  use_features=False),
    'RBF SVM': create_pipeline(my_id='svm_rbf',
                               clf=SVC(kernel='rbf', probability=True, random_state=42),
                               use_features=False),
    'Random Forest': create_pipeline(my_id='rf',
                                     clf=RandomForestClassifier(max_depth=10, n_estimators=50,
                                                                random_state=42),
                                     use_features=False),
    'MLP Classifer': create_pipeline(my_id='mlp',
                                     clf=MLPClassifier(max_iter=800, random_state=42),
                                     use_features=False),
    # "Stronger" = stronger L2 regularisation (alpha=1).
    'MLP Classifer (stronger)': create_pipeline(my_id='mlp',
                                                clf=MLPClassifier(max_iter=800, alpha=1,
                                                                  random_state=42),
                                                use_features=False),
}

In [None]:
# NOTE(review): no visible cell re-runs the training/scoring loop after
# `classifiers` is redefined with use_features=False, so the score dicts may
# still hold the results of the earlier run. Re-run the training cell before
# this report.
print('Classification performance on validation set: \n')

print('Validation (LHS), Training (RHS)')
print()

# Columns: name, validation F1, validation log loss, training F1, training log loss.
# Pad the name column to the longest classifier name so the numbers line up.
name_width = max(20, max((len(name) for name in classifiers), default=0) + 2)
not_scored = float('nan')  # shown for classifiers that have no recorded score

for name in classifiers:
    # The score dicts are keyed by classifier *name*, not by the pipeline object.
    print("{method:<{width}s}{val_f1:>13.3f}{val_logloss:>13.3f}{train_f1:>13.3f}{train_logloss:>13.3f}".format(
        method=name, width=name_width,
        val_f1=ca_val_score.get(name, not_scored),
        val_logloss=ce_val_score.get(name, not_scored),
        train_f1=ca_train_score.get(name, not_scored),
        train_logloss=ce_train_score.get(name, not_scored)))

# Unsupervised Clustering
- Ignore this for now.

- KMeans does not work well on high-dimensional data; it is better to reduce the dimensionality of the data first and then run KMeans in the reduced space: https://stats.stackexchange.com/questions/199501/user-segmentation-by-clustering-with-sparse-data

In [None]:
from sklearn import metrics

In [None]:
# NOTE(review): `phrase_df` is not defined in any visible cell of this
# notebook; confirm where it is created before running this section.
phrases = phrase_df['phrase_clean']
labels = phrase_df['label_id']

In [None]:
# Number of distinct label ids, i.e. the "true" number of classes.
label_column = phrase_df['label_id']
k_labels = label_column.values.tolist()
true_k = len(np.unique(k_labels))

In [None]:
# Deliberately small unigram TF-IDF space (100 features) for clustering.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 1),
    max_features=100,
    analyzer='word',
    use_idf=True,
)

# Document-term matrix of the cleaned phrases.
X = vectorizer.fit_transform(phrase_df['phrase_clean'])

In [None]:
# svd = TruncatedSVD(n_components=500)
# normalizer = Normalizer(copy=False)
# lsa = make_pipeline(svd, normalizer)
# X = lsa.fit_transform(X_vect)

# print(svd.explained_variance_ratio_.sum())

In [None]:
# Elbow method for finding optimal number of clusters
wcss = []

for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', n_init=1, max_iter=100, random_state=42)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)
    
plt.plot(range(1,11), wcss)
plt.title('Elbow Method')
plt.xlabel('Number of Clusters')
plt.ylabel('wcss')
plt.show()

In [None]:
# Final clustering with 6 clusters. Seeded (like the elbow-method cell) so the
# cluster assignment, and the metrics below, are reproducible.
km = KMeans(n_clusters=6, init='k-means++', max_iter=100, n_init=1, random_state=42)
km.fit(X)

# External metrics compare the cluster assignment with the known labels.
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(labels, km.labels_))
# The silhouette is estimated on a random sample of 1000 points; the sample is
# seeded so the reported value does not change between runs.
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, km.labels_, sample_size=1000, random_state=42))

In [None]:
# Alternative for when clustering is done in an SVD-reduced space (kept for reference):
# original_space_centroids = svd.inverse_transform(km.cluster_centers_)
# order_centroids = original_space_centroids.argsort()[:, ::-1]

# For every centroid, feature indices sorted by descending weight.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

# get_feature_names() is gone in newer scikit-learn releases in favour of
# get_feature_names_out(); use whichever the installed version provides.
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()

# Iterate over the clusters that were actually fitted instead of repeating the
# cluster count as a literal.
for i in range(order_centroids.shape[0]):
    print("Cluster %d: " % i, end='')
    # Up to the 200 highest-weighted terms of this cluster.
    for idx in order_centroids[i, :200]:
        print(' %s' % terms[idx], end='')
    print('\n')