In [93]:
import pickle
import swifter
import pandas as pd
from google_drive_downloader import GoogleDriveDownloader as gdd
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.neural_network import MLPClassifier


import advanced_processor_chain_factory
import simple_processor_chain_factory

In [95]:
gdd.download_file_from_google_drive(file_id='15JJ6ZysFM57tlUjXo2nHVhkGwePbVMVV',dest_path='./dataset.csv')

In [96]:
dataset = pd.read_csv('./dataset.csv')
dataset['sentiment'] = dataset['sentiment'].replace(['negative', 'positive'] , [0, 1])
dataset.head()
DEBUG = True

In [97]:
def analysis(labels, predictions):
    print("Report: Classification\n", classification_report(labels, predictions, target_names=["positive", "negative"]))
    print("Matrix: Confusion\n", confusion_matrix(labels, predictions))
    print("Accuracy:\n", accuracy_score(labels, predictions))

In [98]:
def evaluate_models_with_data(models, X_train, X_test, Y_train, Y_test):
    for name, model in models.items():
        print(f'------Evaluating {name}------')
        model.fit(X_train, Y_train)
        pred = model.predict(X_test)
        analysis(Y_test, pred)

In [99]:
def preprocess_data(processor_chain = None, debug = False, debug_data_size = 4000):
    X , Y = dataset['comment'], dataset['sentiment']
    if debug:
        X , Y = X[:debug_data_size], Y[:debug_data_size]
    if processor_chain:
        X = X.swifter.apply(processor_chain.process)
    return X, Y

def vectorize_data(X, Y, vectorizer):
    X_train, X_test, Y_train, Y_test = train_test_split(X,Y)
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)
    return X_train, X_test, Y_train, Y_test

# Inspection of Pre-Processing Approaches

In [100]:
models = {'logistic regression' : LogisticRegression(class_weight = 'balanced', n_jobs=-1),
          'svm' : svm.LinearSVC(),
          'knn' : KNeighborsClassifier(n_neighbors=8, n_jobs=-1)
         }

## Without Pre-Process

In [101]:
evaluate_models_with_data(models, *vectorize_data(*preprocess_data(debug=DEBUG), CountVectorizer(max_features=2000)))

------Evaluating logistic regression------
Report: Classification
               precision    recall  f1-score   support

    positive       0.84      0.83      0.84       500
    negative       0.83      0.85      0.84       500

    accuracy                           0.84      1000
   macro avg       0.84      0.84      0.84      1000
weighted avg       0.84      0.84      0.84      1000

Matrix: Confusion
 [[416  84]
 [ 77 423]]
Accuracy:
 0.839
------Evaluating svm------




Report: Classification
               precision    recall  f1-score   support

    positive       0.82      0.81      0.81       500
    negative       0.81      0.82      0.81       500

    accuracy                           0.81      1000
   macro avg       0.81      0.81      0.81      1000
weighted avg       0.81      0.81      0.81      1000

Matrix: Confusion
 [[406  94]
 [ 92 408]]
Accuracy:
 0.814
------Evaluating knn------
Report: Classification
               precision    recall  f1-score   support

    positive       0.58      0.68      0.62       500
    negative       0.61      0.50      0.55       500

    accuracy                           0.59      1000
   macro avg       0.59      0.59      0.59      1000
weighted avg       0.59      0.59      0.59      1000

Matrix: Confusion
 [[340 160]
 [250 250]]
Accuracy:
 0.59


## Simple Pre-Process

In [102]:
evaluate_models_with_data(models,
                          *vectorize_data(
                              *preprocess_data(processor_chain=simple_processor_chain_factory.create(), debug=DEBUG),
                              CountVectorizer(max_features=2000)))

Pandas Apply:   0%|          | 0/4000 [00:00<?, ?it/s]

------Evaluating logistic regression------
Report: Classification
               precision    recall  f1-score   support

    positive       0.86      0.82      0.84       501
    negative       0.83      0.87      0.85       499

    accuracy                           0.84      1000
   macro avg       0.85      0.85      0.84      1000
weighted avg       0.85      0.84      0.84      1000

Matrix: Confusion
 [[411  90]
 [ 65 434]]
Accuracy:
 0.845
------Evaluating svm------




Report: Classification
               precision    recall  f1-score   support

    positive       0.83      0.79      0.81       501
    negative       0.80      0.84      0.82       499

    accuracy                           0.81      1000
   macro avg       0.81      0.81      0.81      1000
weighted avg       0.81      0.81      0.81      1000

Matrix: Confusion
 [[396 105]
 [ 81 418]]
Accuracy:
 0.814
------Evaluating knn------
Report: Classification
               precision    recall  f1-score   support

    positive       0.60      0.65      0.63       501
    negative       0.62      0.57      0.59       499

    accuracy                           0.61      1000
   macro avg       0.61      0.61      0.61      1000
weighted avg       0.61      0.61      0.61      1000

Matrix: Confusion
 [[327 174]
 [217 282]]
Accuracy:
 0.609


## Pre-Process with Stemmimg

In [103]:
evaluate_models_with_data(models,
                          *vectorize_data(
                              *preprocess_data(processor_chain=advanced_processor_chain_factory.create('stem'),
                                               debug=DEBUG),
                              CountVectorizer(max_features=2000)))

Pandas Apply:   0%|          | 0/4000 [00:00<?, ?it/s]

------Evaluating logistic regression------
Report: Classification
               precision    recall  f1-score   support

    positive       0.83      0.83      0.83       499
    negative       0.83      0.83      0.83       501

    accuracy                           0.83      1000
   macro avg       0.83      0.83      0.83      1000
weighted avg       0.83      0.83      0.83      1000

Matrix: Confusion
 [[412  87]
 [ 85 416]]
Accuracy:
 0.828
------Evaluating svm------
Report: Classification
               precision    recall  f1-score   support

    positive       0.81      0.81      0.81       499
    negative       0.81      0.81      0.81       501

    accuracy                           0.81      1000
   macro avg       0.81      0.81      0.81      1000
weighted avg       0.81      0.81      0.81      1000

Matrix: Confusion
 [[404  95]
 [ 96 405]]
Accuracy:
 0.809
------Evaluating knn------
Report: Classification
               precision    recall  f1-score   support

    

## Pre-Process with Lemmitization

In [104]:
X, Y = preprocess_data(processor_chain=advanced_processor_chain_factory.create('lem'), debug=DEBUG)
del dataset
X_train_bow, X_test_bow, Y_train_bow, Y_test_bow = vectorize_data(X, Y,CountVectorizer(max_features=2000))
evaluate_models_with_data(models, X_train_bow, X_test_bow, Y_train_bow, Y_test_bow)

Pandas Apply:   0%|          | 0/4000 [00:00<?, ?it/s]

------Evaluating logistic regression------
Report: Classification
               precision    recall  f1-score   support

    positive       0.83      0.83      0.83       506
    negative       0.82      0.83      0.83       494

    accuracy                           0.83      1000
   macro avg       0.83      0.83      0.83      1000
weighted avg       0.83      0.83      0.83      1000

Matrix: Confusion
 [[418  88]
 [ 83 411]]
Accuracy:
 0.829
------Evaluating svm------
Report: Classification
               precision    recall  f1-score   support

    positive       0.81      0.80      0.81       506
    negative       0.80      0.81      0.80       494

    accuracy                           0.81      1000
   macro avg       0.81      0.81      0.81      1000
weighted avg       0.81      0.81      0.81      1000

Matrix: Confusion
 [[407  99]
 [ 95 399]]
Accuracy:
 0.806
------Evaluating knn------
Report: Classification
               precision    recall  f1-score   support

    

In [105]:
del models

# Compare W2V and BoW with Their Best Tuned Hyper-parameters

In [106]:
kfold = StratifiedKFold(n_splits=5)
general_grid_params = {'verbose' : 1, 'cv' : kfold, 'n_jobs' : -1, 'scoring' : 'f1'}

logistic_grid = {
    'penalty':['l2'],
    'C':[1, 300, 500, 700, 900],
    'class_weight':['balanced'],
    'solver':['saga'],
    'n_jobs':[-1],
    'max_iter':[1000],
}

svc_grid = {
    'kernel' : ['linear', 'rbf'],
    'C':[0.1, 1, 300, 500, 700],
}

knn_grid = {
    'n_neighbors' : [i for i in range(1,24,2)],
    'n_jobs' : [-1]
}

## BoW

### Logistic Regression

In [107]:
bow_log = LogisticRegression()
bow_log = GridSearchCV(estimator=bow_log, param_grid=logistic_grid, **general_grid_params)
bow_log.fit(X_train_bow, Y_train_bow)
print(f'Best Score: {bow_log.best_score_}')
print(f'Best Params: {bow_log.best_params_}')

Fitting 2 folds for each of 5 candidates, totalling 10 fits
Best Score: 0.8128492870528445
Best Params: {'C': 1, 'class_weight': 'balanced', 'max_iter': 1000, 'n_jobs': -1, 'penalty': 'l2', 'solver': 'saga'}




In [108]:
analysis(Y_test_bow, bow_log.predict(X_test_bow))

Report: Classification
               precision    recall  f1-score   support

    positive       0.84      0.83      0.83       506
    negative       0.82      0.84      0.83       494

    accuracy                           0.83      1000
   macro avg       0.83      0.83      0.83      1000
weighted avg       0.83      0.83      0.83      1000

Matrix: Confusion
 [[418  88]
 [ 80 414]]
Accuracy:
 0.832


### SVM

In [109]:
bow_svm = svm.SVC()
bow_svm = GridSearchCV(estimator=bow_svm, param_grid=svc_grid, **general_grid_params)
bow_svm.fit(X_train_bow, Y_train_bow)
print(f'Best Score: {bow_svm.best_score_}')
print(f'Best Params: {bow_svm.best_params_}')

Fitting 2 folds for each of 10 candidates, totalling 20 fits
Best Score: 0.8137394686192903
Best Params: {'C': 1, 'kernel': 'rbf'}


In [110]:
analysis(Y_test_bow, bow_svm.predict(X_test_bow))

Report: Classification
               precision    recall  f1-score   support

    positive       0.84      0.78      0.81       506
    negative       0.79      0.85      0.82       494

    accuracy                           0.81      1000
   macro avg       0.82      0.82      0.81      1000
weighted avg       0.82      0.81      0.81      1000

Matrix: Confusion
 [[394 112]
 [ 73 421]]
Accuracy:
 0.815


### KNN

In [111]:
bow_knn = KNeighborsClassifier()
bow_knn = GridSearchCV(estimator=bow_knn, param_grid=knn_grid, **general_grid_params)
bow_knn.fit(X_train_bow, Y_train_bow)
print(f'Best Score: {bow_svm.best_score_}')
print(f'Best Params: {bow_svm.best_params_}')

Fitting 2 folds for each of 12 candidates, totalling 24 fits
Best Score: 0.8137394686192903
Best Params: {'C': 1, 'kernel': 'rbf'}


In [112]:
analysis(Y_test_bow, bow_svm.predict(X_test_bow))


Report: Classification
               precision    recall  f1-score   support

    positive       0.84      0.78      0.81       506
    negative       0.79      0.85      0.82       494

    accuracy                           0.81      1000
   macro avg       0.82      0.82      0.81      1000
weighted avg       0.82      0.81      0.81      1000

Matrix: Confusion
 [[394 112]
 [ 73 421]]
Accuracy:
 0.815


## W2V

In [113]:
from gensim.models import word2vec
import numpy as np
import nltk

class Word2VecAdapter:
  def __init__(self, pre_trained_model=None, num_features=250, min_count=40,workers=4,
               window=10,sample=0.001):
    
    self.num_features=num_features
    self.min_count=min_count
    self.workers=workers
    self.window=window
    self.sample=sample
    self.wv=pre_trained_model
    if pre_trained_model:
        self.word_index = set(self.wv.index_to_key)

  def fit(self,X):
      if not self.wv:
        X = X.swifter.apply(nltk.word_tokenize)
        model = word2vec.Word2Vec(X, workers = self.workers,
                                vector_size = self.num_features, min_count = self.min_count,
                                window = self.window, sample = self.sample)
        self.wv = model.wv
        self.word_index = set(self.wv.index_to_key)

  def predict(self,comment):

    key_words = filter(lambda w: w in self.word_index, nltk.word_tokenize(comment))
    vectors = self.wv[key_words]
    return np.divide(np.sum(vectors, axis=0),len(vectors))

  def fit_transform(self, X):
      if not self.wv:
        self.fit(X)
      return X.swifter.apply(lambda x: pd.Series(self.predict(x)))

  def transform(self, X):
      return X.swifter.apply(lambda x: pd.Series(self.predict(x)))

In [114]:
X_train_w2v, X_test_w2v, Y_train_w2v, Y_test_w2v = vectorize_data(X, Y, Word2VecAdapter(num_features=500))

Pandas Apply:   0%|          | 0/3000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/3000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1000 [00:00<?, ?it/s]

### Logistic Regression

In [115]:
w2v_log = LogisticRegression()
w2v_log = GridSearchCV(estimator=w2v_log, param_grid=logistic_grid, **general_grid_params)
w2v_log.fit(X_train_w2v, Y_train_w2v)
print(f'Best Score: {w2v_log.best_score_}')
print(f'Best Params: {w2v_log.best_params_}')

Fitting 2 folds for each of 5 candidates, totalling 10 fits
Best Score: 0.7600499499526963
Best Params: {'C': 700, 'class_weight': 'balanced', 'max_iter': 1000, 'n_jobs': -1, 'penalty': 'l2', 'solver': 'saga'}




In [116]:
analysis(Y_test_w2v, w2v_log.predict(X_test_w2v))

Report: Classification
               precision    recall  f1-score   support

    positive       0.79      0.76      0.77       486
    negative       0.78      0.80      0.79       514

    accuracy                           0.78      1000
   macro avg       0.78      0.78      0.78      1000
weighted avg       0.78      0.78      0.78      1000

Matrix: Confusion
 [[369 117]
 [101 413]]
Accuracy:
 0.782


### SVM

In [117]:
w2v_svm = svm.SVC()
w2v_svm = GridSearchCV(estimator=w2v_svm, param_grid=svc_grid, **general_grid_params)
w2v_svm.fit(X_train_w2v, Y_train_w2v)
print(f'Best Score: {w2v_svm.best_score_}')
print(f'Best Params: {w2v_svm.best_params_}')

Fitting 2 folds for each of 10 candidates, totalling 20 fits
Best Score: 0.784313544370445
Best Params: {'C': 700, 'kernel': 'linear'}


In [118]:
analysis(Y_test_w2v, w2v_svm.predict(X_test_w2v))

Report: Classification
               precision    recall  f1-score   support

    positive       0.81      0.76      0.78       486
    negative       0.79      0.83      0.81       514

    accuracy                           0.80      1000
   macro avg       0.80      0.80      0.80      1000
weighted avg       0.80      0.80      0.80      1000

Matrix: Confusion
 [[370 116]
 [ 88 426]]
Accuracy:
 0.796


### KNN

In [119]:
w2v_knn = KNeighborsClassifier()
w2v_knn = GridSearchCV(estimator=w2v_knn, param_grid=knn_grid, **general_grid_params)
w2v_knn.fit(X_train_w2v, Y_train_w2v)
print(f'Best Score: {w2v_svm.best_score_}')
print(f'Best Params: {w2v_svm.best_params_}')

Fitting 2 folds for each of 12 candidates, totalling 24 fits
Best Score: 0.784313544370445
Best Params: {'C': 700, 'kernel': 'linear'}


In [120]:
analysis(Y_test_w2v, w2v_svm.predict(X_test_w2v))

Report: Classification
               precision    recall  f1-score   support

    positive       0.81      0.76      0.78       486
    negative       0.79      0.83      0.81       514

    accuracy                           0.80      1000
   macro avg       0.80      0.80      0.80      1000
weighted avg       0.80      0.80      0.80      1000

Matrix: Confusion
 [[370 116]
 [ 88 426]]
Accuracy:
 0.796


## Comparison

In [121]:
summary = {
    'LR': {'BoW': bow_log, 'W2V': w2v_log},
    'SVM' : {'BoW': bow_svm, 'W2V': w2v_svm},
    'KNN': {'BoW': bow_knn, 'W2V': w2v_knn},
  }

for name, values in summary.items():
    print(f'For classifier {name}, best BoW score is {values["BoW"].best_score_}, whereas best W2V score is {values["W2V"].best_score_}')
    best_model = "BoW" if values["BoW"].best_score_ > values["W2V"].best_score_ else "W2V"
    print(f'So {best_model} is better with parameters {values[best_model].best_params_}')
    filename = name + '.pkl'
    with open(filename, 'wb') as f:
        pickle.dump(values[best_model], f)

del summary, bow_log, w2v_log, bow_svm, w2v_svm, bow_knn, w2v_knn

For classifier LR, best BoW score is 0.8128492870528445, whereas best W2V score is 0.7600499499526963
So BoW is better with parameters {'C': 1, 'class_weight': 'balanced', 'max_iter': 1000, 'n_jobs': -1, 'penalty': 'l2', 'solver': 'saga'}
For classifier SVM, best BoW score is 0.8137394686192903, whereas best W2V score is 0.784313544370445
So BoW is better with parameters {'C': 1, 'kernel': 'rbf'}
For classifier KNN, best BoW score is 0.67243206392765, whereas best W2V score is 0.6576011157601116
So BoW is better with parameters {'n_jobs': -1, 'n_neighbors': 21}


# MLP

In [122]:
mlp_grid = {
    'hidden_layer_sizes':[(500, 250), (1000, 250), (500, 250, 250), (1000, 500,250)],
    'activation':['tanh', 'relu'],
    'solver':['sgd'],
    'alpha':[1],
    'learning_rate':['adaptive'],
    'max_iter':[1000]
}

## W2V

In [127]:
mlp_w2v = MLPClassifier()
mlp_w2v = GridSearchCV(estimator=mlp_w2v, param_grid=mlp_grid, **general_grid_params)
mlp_w2v.fit(X_train_w2v, Y_train_w2v)
print(f'Best Score: {mlp_w2v.best_score_}')
print(f'Best Params: {mlp_w2v.best_params_}')



MLPClassifier(activation='tanh', alpha=1, hidden_layer_sizes=(500, 250),
              learning_rate='adaptive', max_iter=1000, solver='sgd')

In [128]:
analysis(Y_test_w2v, mlp_w2v.predict(X_test_w2v))

Report: Classification
               precision    recall  f1-score   support

    positive       0.68      0.69      0.69       486
    negative       0.70      0.70      0.70       514

    accuracy                           0.69      1000
   macro avg       0.69      0.69      0.69      1000
weighted avg       0.69      0.69      0.69      1000

Matrix: Confusion
 [[335 151]
 [156 358]]
Accuracy:
 0.693


In [None]:
del X_train_w2v, X_test_w2v, Y_train_w2v, Y_test_w2v

## BoW

In [None]:
mlp_bow = MLPClassifier()
mlp_bow = GridSearchCV(estimator=mlp_bow, param_grid=mlp_grid, **general_grid_params)
mlp_bow.fit(X_train_bow, Y_train_bow)
print(f'Best Score: {mlp_bow.best_score_}')
print(f'Best Params: {mlp_bow.best_params_}')

In [None]:
analysis(Y_test_bow, mlp_bow.predict(X_test_bow))

In [None]:
del X_train_bow, X_test_bow, Y_train_bow, Y_test_bow

## TD-IDF

In [None]:
mlp_idf = MLPClassifier()
mlp_idf = GridSearchCV(estimator=mlp_idf, param_grid=mlp_grid, **general_grid_params)
X_train_idf, X_test_idf, Y_train_idf, Y_test_idf = vectorize_data(X, Y, TfidfVectorizer(max_features=2000))
mlp_idf.fit(X_train_idf, Y_train_idf)
print(f'Best Score: {mlp_idf.best_score_}')
print(f'Best Params: {mlp_idf.best_params_}')

In [None]:
analysis(Y_train_idf, mlp_bow.predict(X_test_idf))
del X_train_idf, X_test_idf, Y_train_idf, Y_test_idf

## Comparison

In [None]:
print('Best scores:')
print(f'W2V: {mlp_w2v.best_score_} with params: {mlp_w2v.best_params_}')
print(f'BoW: {mlp_bow.best_score_} with params: {mlp_bow.best_params_}')
print(f'MLP: {mlp_idf.best_score_} with params: {mlp_idf.best_params_}')
best_mlp = max(mlp_w2v, mlp_bow, mlp_idf, lambda mlp: mlp.best_score_)
with open('best.pkl', 'wb') as f:
    pickle.dump(best_mlp, f)