In [1]:
import os
import pickle
import pandas as pd
from gensim.models import KeyedVectors
import numpy as np
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split, HalvingGridSearchCV
from sklearn.neural_network import MLPClassifier

from data import preprocess_data, vectorize_data, load_dataset
from evaluation import analysis, evaluate_models_with_data
from w2v_adapter import Word2VecAdapter

import advanced_processor_chain_factory
import simple_processor_chain_factory

In [2]:
dataset = load_dataset()
DEBUG = False

# Inspection of Pre-Processing Approaches

In [3]:
models = {'logistic regression' : LogisticRegression(class_weight = 'balanced', n_jobs=-1),
          'svm' : svm.LinearSVC(),
          'knn' : KNeighborsClassifier(n_neighbors=300, n_jobs=-1)
         }

## Without Pre-Process

In [4]:
evaluate_models_with_data(models, *vectorize_data(*preprocess_data(dataset, debug=DEBUG), CountVectorizer(max_features=2000)))

------Evaluating logistic regression------
Report: Classification
               precision    recall  f1-score   support

    positive       0.87      0.86      0.87      5563
    negative       0.87      0.88      0.87      5687

    accuracy                           0.87     11250
   macro avg       0.87      0.87      0.87     11250
weighted avg       0.87      0.87      0.87     11250

Matrix: Confusion
 [[4796  767]
 [ 696 4991]]
Accuracy:
 0.8699555555555556
------Evaluating svm------




Report: Classification
               precision    recall  f1-score   support

    positive       0.87      0.87      0.87      5563
    negative       0.87      0.87      0.87      5687

    accuracy                           0.87     11250
   macro avg       0.87      0.87      0.87     11250
weighted avg       0.87      0.87      0.87     11250

Matrix: Confusion
 [[4815  748]
 [ 736 4951]]
Accuracy:
 0.8680888888888889
------Evaluating knn------
Report: Classification
               precision    recall  f1-score   support

    positive       0.74      0.50      0.60      5563
    negative       0.63      0.83      0.72      5687

    accuracy                           0.67     11250
   macro avg       0.69      0.67      0.66     11250
weighted avg       0.69      0.67      0.66     11250

Matrix: Confusion
 [[2787 2776]
 [ 969 4718]]
Accuracy:
 0.6671111111111111


## Simple Pre-Process

In [5]:
evaluate_models_with_data(models,
                          *vectorize_data(
                              *preprocess_data(dataset, processor_chain=simple_processor_chain_factory.create(), debug=DEBUG),
                              CountVectorizer(max_features=2000)))

Pandas Apply:   0%|          | 0/45000 [00:00<?, ?it/s]

------Evaluating logistic regression------
Report: Classification
               precision    recall  f1-score   support

    positive       0.87      0.86      0.86      5563
    negative       0.87      0.87      0.87      5687

    accuracy                           0.87     11250
   macro avg       0.87      0.87      0.87     11250
weighted avg       0.87      0.87      0.87     11250

Matrix: Confusion
 [[4793  770]
 [ 727 4960]]
Accuracy:
 0.8669333333333333
------Evaluating svm------




Report: Classification
               precision    recall  f1-score   support

    positive       0.89      0.83      0.86      5563
    negative       0.84      0.90      0.87      5687

    accuracy                           0.86     11250
   macro avg       0.87      0.86      0.86     11250
weighted avg       0.86      0.86      0.86     11250

Matrix: Confusion
 [[4610  953]
 [ 583 5104]]
Accuracy:
 0.8634666666666667
------Evaluating knn------
Report: Classification
               precision    recall  f1-score   support

    positive       0.75      0.47      0.57      5563
    negative       0.62      0.85      0.72      5687

    accuracy                           0.66     11250
   macro avg       0.68      0.66      0.64     11250
weighted avg       0.68      0.66      0.65     11250

Matrix: Confusion
 [[2592 2971]
 [ 868 4819]]
Accuracy:
 0.6587555555555555


## Pre-Process with Stemmimg

In [6]:
evaluate_models_with_data(models,
                          *vectorize_data(
                              *preprocess_data(dataset, processor_chain=advanced_processor_chain_factory.create('stem'),
                                               debug=DEBUG),
                              CountVectorizer(max_features=2000)))

Pandas Apply:   0%|          | 0/45000 [00:00<?, ?it/s]

------Evaluating logistic regression------
Report: Classification
               precision    recall  f1-score   support

    positive       0.86      0.86      0.86      5563
    negative       0.86      0.86      0.86      5687

    accuracy                           0.86     11250
   macro avg       0.86      0.86      0.86     11250
weighted avg       0.86      0.86      0.86     11250

Matrix: Confusion
 [[4777  786]
 [ 773 4914]]
Accuracy:
 0.8614222222222222
------Evaluating svm------




Report: Classification
               precision    recall  f1-score   support

    positive       0.86      0.86      0.86      5563
    negative       0.87      0.86      0.86      5687

    accuracy                           0.86     11250
   macro avg       0.86      0.86      0.86     11250
weighted avg       0.86      0.86      0.86     11250

Matrix: Confusion
 [[4801  762]
 [ 785 4902]]
Accuracy:
 0.8624888888888889
------Evaluating knn------
Report: Classification
               precision    recall  f1-score   support

    positive       0.74      0.70      0.72      5563
    negative       0.72      0.76      0.74      5687

    accuracy                           0.73     11250
   macro avg       0.73      0.73      0.73     11250
weighted avg       0.73      0.73      0.73     11250

Matrix: Confusion
 [[3876 1687]
 [1350 4337]]
Accuracy:
 0.7300444444444445


## Pre-Process with Lemmitization

In [7]:
X, Y = preprocess_data(dataset, processor_chain=advanced_processor_chain_factory.create('lem'), debug=DEBUG)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=11)
del dataset

Pandas Apply:   0%|          | 0/45000 [00:00<?, ?it/s]

In [8]:
vectorizer = CountVectorizer(max_features=2000)
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)
pickle.dump(vectorizer, open('cnt_vect.pkl', 'wb'))

In [9]:
evaluate_models_with_data(models, X_train_bow, X_test_bow, Y_train, Y_test)

------Evaluating logistic regression------
Report: Classification
               precision    recall  f1-score   support

    positive       0.87      0.86      0.86      5563
    negative       0.87      0.87      0.87      5687

    accuracy                           0.87     11250
   macro avg       0.87      0.87      0.87     11250
weighted avg       0.87      0.87      0.87     11250

Matrix: Confusion
 [[4791  772]
 [ 738 4949]]
Accuracy:
 0.8657777777777778
------Evaluating svm------




Report: Classification
               precision    recall  f1-score   support

    positive       0.87      0.85      0.86      5563
    negative       0.85      0.88      0.87      5687

    accuracy                           0.86     11250
   macro avg       0.86      0.86      0.86     11250
weighted avg       0.86      0.86      0.86     11250

Matrix: Confusion
 [[4701  862]
 [ 673 5014]]
Accuracy:
 0.8635555555555555
------Evaluating knn------
Report: Classification
               precision    recall  f1-score   support

    positive       0.74      0.68      0.71      5563
    negative       0.71      0.77      0.74      5687

    accuracy                           0.72     11250
   macro avg       0.72      0.72      0.72     11250
weighted avg       0.72      0.72      0.72     11250

Matrix: Confusion
 [[3781 1782]
 [1332 4355]]
Accuracy:
 0.7232


In [10]:
del models

# Compare W2V and BoW with Their Best Tuned Hyper-parameters

In [11]:
kfold = StratifiedKFold(n_splits=5)
general_grid_params = {'verbose' : 1, 'cv' : kfold, 'n_jobs' : -1, 'scoring' : 'f1'}

logistic_grid = {
    'penalty':['l2'],
    'C':[1, 300, 500, 700, 900, 2000],
    'class_weight':['balanced'],
    'solver':['saga'],
    'n_jobs':[-1],
    'max_iter':[1000],
}

svc_grid = {
    'kernel' : ['linear', 'rbf'],
    'C':[0.1, 1, 500, 1000],
}

knn_grid = {
    'n_neighbors' : [1, 100, 300, 500, 700, 900],
    'n_jobs' : [-1]
}

cols2show = ['mean_test_score','rank_test_score',]

logistic_cols = ['param_C'] + cols2show 
svc_cols = ['param_kernel', 'param_C', 'iter', 'n_resources'] + cols2show 
knn_cols = ['param_n_neighbors', 'iter', 'n_resources'] + cols2show 

## BoW

### Logistic Regression

In [12]:
bow_log = LogisticRegression()
bow_log = GridSearchCV(estimator=bow_log, param_grid=logistic_grid, **general_grid_params)
bow_log.fit(X_train_bow, Y_train)
print(f'Best Score: {bow_log.best_score_}')
print(f'Best Params: {bow_log.best_params_}')
pd.DataFrame(bow_log.cv_results_)[logistic_cols]

Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best Score: 0.8659501024379079
Best Params: {'C': 1, 'class_weight': 'balanced', 'max_iter': 1000, 'n_jobs': -1, 'penalty': 'l2', 'solver': 'saga'}




Unnamed: 0,param_C,mean_test_score,rank_test_score
0,1,0.86595,1
1,300,0.864508,2
2,500,0.864508,2
3,700,0.864508,2
4,900,0.864508,2
5,2000,0.864508,2


In [13]:
bow_log_score = analysis(Y_test, bow_log.predict(X_test_bow))

Report: Classification
               precision    recall  f1-score   support

    positive       0.87      0.86      0.86      5563
    negative       0.86      0.87      0.87      5687

    accuracy                           0.87     11250
   macro avg       0.87      0.87      0.87     11250
weighted avg       0.87      0.87      0.87     11250

Matrix: Confusion
 [[4789  774]
 [ 739 4948]]
Accuracy:
 0.8655111111111111


### SVM

In [14]:
bow_svm = svm.SVC()
bow_svm = HalvingGridSearchCV(estimator=bow_svm, param_grid=svc_grid, cv = 3, n_jobs= -1, scoring='f1', factor=2)
bow_svm.fit(X_train_bow, Y_train)
print(f'Best Score: {bow_svm.best_score_}')
print(f'Best Params: {bow_svm.best_params_}')
pd.DataFrame(bow_svm.cv_results_)[svc_cols]

Best Score: 0.866309996068062
Best Params: {'C': 1, 'kernel': 'rbf'}


Unnamed: 0,param_kernel,param_C,iter,n_resources,mean_test_score,rank_test_score
0,linear,0.1,0,4218,0.798469,11
1,rbf,0.1,0,4218,0.735624,15
2,linear,1.0,0,4218,0.771711,14
3,rbf,1.0,0,4218,0.821592,8
4,linear,500.0,0,4218,0.772941,12
5,rbf,500.0,0,4218,0.809133,9
6,linear,1000.0,0,4218,0.772941,12
7,rbf,1000.0,0,4218,0.809133,9
8,linear,0.1,1,8436,0.834505,7
9,rbf,500.0,1,8436,0.834895,5


In [15]:
bow_svm_score = analysis(Y_test, bow_svm.predict(X_test_bow))

Report: Classification
               precision    recall  f1-score   support

    positive       0.88      0.85      0.87      5563
    negative       0.86      0.89      0.87      5687

    accuracy                           0.87     11250
   macro avg       0.87      0.87      0.87     11250
weighted avg       0.87      0.87      0.87     11250

Matrix: Confusion
 [[4753  810]
 [ 645 5042]]
Accuracy:
 0.8706666666666667


### KNN

In [16]:
bow_knn = KNeighborsClassifier()
bow_knn = HalvingGridSearchCV(estimator=bow_knn, param_grid=knn_grid, cv = 3, scoring='f1', factor=2)
bow_knn.fit(X_train_bow, Y_train)
print(f'Best Score: {bow_knn.best_score_}')
print(f'Best Params: {bow_knn.best_params_}')
pd.DataFrame(bow_knn.cv_results_)[knn_cols]

Best Score: 0.7368442255189845
Best Params: {'n_jobs': -1, 'n_neighbors': 300}


Unnamed: 0,param_n_neighbors,iter,n_resources,mean_test_score,rank_test_score
0,1,0,8437,0.587065,11
1,100,0,8437,0.723721,4
2,300,0,8437,0.714575,7
3,500,0,8437,0.711323,9
4,700,0,8437,0.714279,8
5,900,0,8437,0.718755,5
6,300,1,16874,0.727412,2
7,900,1,16874,0.7077,10
8,100,1,16874,0.72489,3
9,100,2,33748,0.718356,6


In [17]:
bow_knn_score = analysis(Y_test, bow_svm.predict(X_test_bow))

Report: Classification
               precision    recall  f1-score   support

    positive       0.88      0.85      0.87      5563
    negative       0.86      0.89      0.87      5687

    accuracy                           0.87     11250
   macro avg       0.87      0.87      0.87     11250
weighted avg       0.87      0.87      0.87     11250

Matrix: Confusion
 [[4753  810]
 [ 645 5042]]
Accuracy:
 0.8706666666666667


## W2V

In [18]:
if os.path.isfile('w2v.kv'):
    vectorizer = Word2VecAdapter(pre_trained_model=KeyedVectors.load('w2v.kv'))
else:
    vectorizer = Word2VecAdapter()

X_train_w2v = vectorizer.fit_transform(X_train)
X_test_w2v = vectorizer.transform(X_test)

if not os.path.isfile('w2v.kv'):
     vectorizer.wv.save('w2v.kv')

Pandas Apply:   0%|          | 0/33750 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/33750 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/11250 [00:00<?, ?it/s]

### Logistic Regression

In [19]:
w2v_log = LogisticRegression()
w2v_log = GridSearchCV(estimator=w2v_log, param_grid=logistic_grid, **general_grid_params)
w2v_log.fit(X_train_w2v, Y_train)
print(f'Best Score: {w2v_log.best_score_}')
print(f'Best Params: {w2v_log.best_params_}')
pd.DataFrame(w2v_log.cv_results_)[logistic_cols]

Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best Score: 0.8685783640980057
Best Params: {'C': 300, 'class_weight': 'balanced', 'max_iter': 1000, 'n_jobs': -1, 'penalty': 'l2', 'solver': 'saga'}


Unnamed: 0,param_C,mean_test_score,rank_test_score
0,1,0.864422,6
1,300,0.868578,1
2,500,0.868552,4
3,700,0.868553,3
4,900,0.868578,1
5,2000,0.868552,4


In [20]:
w2v_log_score = analysis(Y_test, w2v_log.predict(X_test_w2v))

Report: Classification
               precision    recall  f1-score   support

    positive       0.86      0.86      0.86      5563
    negative       0.86      0.87      0.86      5687

    accuracy                           0.86     11250
   macro avg       0.86      0.86      0.86     11250
weighted avg       0.86      0.86      0.86     11250

Matrix: Confusion
 [[4786  777]
 [ 762 4925]]
Accuracy:
 0.8632


### SVM

In [21]:
w2v_svm = svm.SVC()
w2v_svm = HalvingGridSearchCV(estimator=w2v_svm, param_grid=svc_grid, cv = 3, n_jobs= -1, scoring='f1', factor=2)
w2v_svm.fit(X_train_w2v, Y_train)
print(f'Best Score: {w2v_svm.best_score_}')
print(f'Best Params: {w2v_svm.best_params_}')
pd.DataFrame(w2v_svm.cv_results_)[svc_cols]

Best Score: 0.8695247250442834
Best Params: {'C': 1, 'kernel': 'rbf'}


Unnamed: 0,param_kernel,param_C,iter,n_resources,mean_test_score,rank_test_score
0,linear,0.1,0,4218,0.861348,6
1,rbf,0.1,0,4218,0.847107,13
2,linear,1.0,0,4218,0.859765,8
3,rbf,1.0,0,4218,0.86483,3
4,linear,500.0,0,4218,0.853402,11
5,rbf,500.0,0,4218,0.828638,14
6,linear,1000.0,0,4218,0.849289,12
7,rbf,1000.0,0,4218,0.828638,14
8,linear,500.0,1,8436,0.857591,10
9,linear,1.0,1,8436,0.861521,5


In [22]:
w2v_svm_score = analysis(Y_test, w2v_svm.predict(X_test_w2v))

Report: Classification
               precision    recall  f1-score   support

    positive       0.87      0.86      0.87      5563
    negative       0.87      0.87      0.87      5687

    accuracy                           0.87     11250
   macro avg       0.87      0.87      0.87     11250
weighted avg       0.87      0.87      0.87     11250

Matrix: Confusion
 [[4795  768]
 [ 719 4968]]
Accuracy:
 0.8678222222222223


### KNN

In [23]:
w2v_knn = KNeighborsClassifier()
w2v_knn = HalvingGridSearchCV(estimator=w2v_knn, param_grid=knn_grid, cv = 3, scoring='f1', factor=2)
w2v_knn.fit(X_train_w2v, Y_train)
print(f'Best Score: {w2v_knn.best_score_}')
print(f'Best Params: {w2v_knn.best_params_}')
pd.DataFrame(w2v_knn.cv_results_)[knn_cols]

Best Score: 0.8091766744682971
Best Params: {'n_jobs': -1, 'n_neighbors': 100}


Unnamed: 0,param_n_neighbors,iter,n_resources,mean_test_score,rank_test_score
0,1,0,8437,0.736049,9
1,100,0,8437,0.776512,5
2,300,0,8437,0.757849,7
3,500,0,8437,0.742171,8
4,700,0,8437,0.734681,10
5,900,0,8437,0.723424,11
6,500,1,16874,0.77237,6
7,300,1,16874,0.782319,4
8,100,1,16874,0.800457,2
9,300,2,33748,0.794627,3


In [24]:
w2v_knn_score = analysis(Y_test, w2v_svm.predict(X_test_w2v))

Report: Classification
               precision    recall  f1-score   support

    positive       0.87      0.86      0.87      5563
    negative       0.87      0.87      0.87      5687

    accuracy                           0.87     11250
   macro avg       0.87      0.87      0.87     11250
weighted avg       0.87      0.87      0.87     11250

Matrix: Confusion
 [[4795  768]
 [ 719 4968]]
Accuracy:
 0.8678222222222223


## Comparison

In [25]:
summary = {
    'LR': {'BoW': {'model':bow_log, 'score': bow_log_score}, 'W2V': {'model': w2v_log, 'score': w2v_log_score}},
    'SVM' : {'BoW': {'model':bow_svm, 'score': bow_svm_score}, 'W2V': {'model': w2v_svm, 'score': w2v_svm_score}},
    'KNN': {'BoW': {'model' :bow_knn, 'score':bow_knn_score}, 'W2V': {'model':w2v_knn, 'score':w2v_knn_score}},
  }

for name, values in summary.items():
    print(f'For classifier {name}, best BoW score is {values["BoW"]["score"]}, whereas best W2V score is {values["W2V"]["score"]}')
    best_model = 'BoW' if values['BoW']['score'] > values['W2V']['score'] else 'W2V'
    print(f'So {best_model} is better with parameters {values[best_model]["model"].best_params_}')
    filename = name + '.pkl'
    with open(filename, 'wb') as f:
        pickle.dump(values[best_model]['model'], f)

del summary, bow_log, w2v_log, bow_svm, w2v_svm, bow_knn, w2v_knn

For classifier LR, best BoW score is 0.8673853974932071, whereas best W2V score is 0.864869611028185
So BoW is better with parameters {'C': 1, 'class_weight': 'balanced', 'max_iter': 1000, 'n_jobs': -1, 'penalty': 'l2', 'solver': 'saga'}
For classifier SVM, best BoW score is 0.8739058843920617, whereas best W2V score is 0.8698240392191194
So BoW is better with parameters {'C': 1, 'kernel': 'rbf'}
For classifier KNN, best BoW score is 0.8739058843920617, whereas best W2V score is 0.8698240392191194
So BoW is better with parameters {'n_jobs': -1, 'n_neighbors': 300}


# MLP

In [26]:
mlp_grid = {
    'hidden_layer_sizes': [(500, 250), (1000, 250), (500, 250, 250), (1000, 500, 250), (500, 250, 250, 5),
                           (1000, 500, 250, 5)],
    'activation':['tanh', 'relu']
}
def eval_mlp(X_train, X_test, Y_train, Y_test):
    best_f1 = -1
    best_model = None
    for sizes in mlp_grid['hidden_layer_sizes']:
        for act in mlp_grid['activation']:
            m = MLPClassifier(hidden_layer_sizes=sizes, activation=act, solver='sgd', alpha=1,
                                    learning_rate='adaptive', max_iter=10)
            m.fit(X_train, Y_train)
            print(f'Model config: hidden_layer_sizes={sizes}, activation={act}')
            f1 = analysis(Y_test, m.predict(X_test))
            if f1 > best_f1:
                best_model = m
                best_f1 = f1
    return best_f1, best_model

## W2V

In [27]:
w2v_f1, w2v_mlp = eval_mlp(X_train_w2v, X_test_w2v, Y_train, Y_test)



Model config: hidden_layer_sizes=(500, 250), activation=tanh
Report: Classification
               precision    recall  f1-score   support

    positive       0.85      0.85      0.85      5563
    negative       0.85      0.85      0.85      5687

    accuracy                           0.85     11250
   macro avg       0.85      0.85      0.85     11250
weighted avg       0.85      0.85      0.85     11250

Matrix: Confusion
 [[4726  837]
 [ 840 4847]]
Accuracy:
 0.8509333333333333




Model config: hidden_layer_sizes=(500, 250), activation=relu
Report: Classification
               precision    recall  f1-score   support

    positive       0.85      0.84      0.85      5563
    negative       0.84      0.86      0.85      5687

    accuracy                           0.85     11250
   macro avg       0.85      0.85      0.85     11250
weighted avg       0.85      0.85      0.85     11250

Matrix: Confusion
 [[4664  899]
 [ 806 4881]]
Accuracy:
 0.8484444444444444




Model config: hidden_layer_sizes=(1000, 250), activation=tanh
Report: Classification
               precision    recall  f1-score   support

    positive       0.85      0.85      0.85      5563
    negative       0.85      0.85      0.85      5687

    accuracy                           0.85     11250
   macro avg       0.85      0.85      0.85     11250
weighted avg       0.85      0.85      0.85     11250

Matrix: Confusion
 [[4733  830]
 [ 832 4855]]
Accuracy:
 0.8522666666666666




Model config: hidden_layer_sizes=(1000, 250), activation=relu
Report: Classification
               precision    recall  f1-score   support

    positive       0.84      0.84      0.84      5563
    negative       0.85      0.85      0.85      5687

    accuracy                           0.85     11250
   macro avg       0.85      0.85      0.85     11250
weighted avg       0.85      0.85      0.85     11250

Matrix: Confusion
 [[4689  874]
 [ 863 4824]]
Accuracy:
 0.8456




Model config: hidden_layer_sizes=(500, 250, 250), activation=tanh
Report: Classification
               precision    recall  f1-score   support

    positive       0.85      0.85      0.85      5563
    negative       0.86      0.86      0.86      5687

    accuracy                           0.86     11250
   macro avg       0.86      0.86      0.86     11250
weighted avg       0.86      0.86      0.86     11250

Matrix: Confusion
 [[4753  810]
 [ 807 4880]]
Accuracy:
 0.8562666666666666




Model config: hidden_layer_sizes=(500, 250, 250), activation=relu
Report: Classification
               precision    recall  f1-score   support

    positive       0.85      0.84      0.85      5563
    negative       0.85      0.86      0.85      5687

    accuracy                           0.85     11250
   macro avg       0.85      0.85      0.85     11250
weighted avg       0.85      0.85      0.85     11250

Matrix: Confusion
 [[4686  877]
 [ 804 4883]]
Accuracy:
 0.8505777777777778




Model config: hidden_layer_sizes=(1000, 500, 250), activation=tanh
Report: Classification
               precision    recall  f1-score   support

    positive       0.84      0.87      0.86      5563
    negative       0.87      0.84      0.85      5687

    accuracy                           0.86     11250
   macro avg       0.86      0.86      0.86     11250
weighted avg       0.86      0.86      0.86     11250

Matrix: Confusion
 [[4825  738]
 [ 891 4796]]
Accuracy:
 0.8552




Model config: hidden_layer_sizes=(1000, 500, 250), activation=relu
Report: Classification
               precision    recall  f1-score   support

    positive       0.86      0.84      0.85      5563
    negative       0.85      0.86      0.86      5687

    accuracy                           0.85     11250
   macro avg       0.85      0.85      0.85     11250
weighted avg       0.85      0.85      0.85     11250

Matrix: Confusion
 [[4685  878]
 [ 777 4910]]
Accuracy:
 0.8528888888888889




Model config: hidden_layer_sizes=(500, 250, 250, 5), activation=tanh
Report: Classification
               precision    recall  f1-score   support

    positive       0.87      0.83      0.85      5563
    negative       0.84      0.88      0.86      5687

    accuracy                           0.86     11250
   macro avg       0.86      0.86      0.86     11250
weighted avg       0.86      0.86      0.86     11250

Matrix: Confusion
 [[4645  918]
 [ 702 4985]]
Accuracy:
 0.856




Model config: hidden_layer_sizes=(500, 250, 250, 5), activation=relu
Report: Classification
               precision    recall  f1-score   support

    positive       0.86      0.85      0.86      5563
    negative       0.86      0.86      0.86      5687

    accuracy                           0.86     11250
   macro avg       0.86      0.86      0.86     11250
weighted avg       0.86      0.86      0.86     11250

Matrix: Confusion
 [[4741  822]
 [ 770 4917]]
Accuracy:
 0.8584888888888889




Model config: hidden_layer_sizes=(1000, 500, 250, 5), activation=tanh
Report: Classification
               precision    recall  f1-score   support

    positive       0.87      0.84      0.85      5563
    negative       0.85      0.88      0.86      5687

    accuracy                           0.86     11250
   macro avg       0.86      0.86      0.86     11250
weighted avg       0.86      0.86      0.86     11250

Matrix: Confusion
 [[4648  915]
 [ 675 5012]]
Accuracy:
 0.8586666666666667




Model config: hidden_layer_sizes=(1000, 500, 250, 5), activation=relu
Report: Classification
               precision    recall  f1-score   support

    positive       0.85      0.84      0.85      5563
    negative       0.85      0.86      0.85      5687

    accuracy                           0.85     11250
   macro avg       0.85      0.85      0.85     11250
weighted avg       0.85      0.85      0.85     11250

Matrix: Confusion
 [[4680  883]
 [ 821 4866]]
Accuracy:
 0.8485333333333334


## BoW

In [28]:
bow_f1, bow_mlp = eval_mlp(X_train_bow, X_test_bow, Y_train, Y_test)



Model config: hidden_layer_sizes=(500, 250), activation=tanh
Report: Classification
               precision    recall  f1-score   support

    positive       0.86      0.85      0.86      5563
    negative       0.86      0.86      0.86      5687

    accuracy                           0.86     11250
   macro avg       0.86      0.86      0.86     11250
weighted avg       0.86      0.86      0.86     11250

Matrix: Confusion
 [[4742  821]
 [ 780 4907]]
Accuracy:
 0.8576888888888888




Model config: hidden_layer_sizes=(500, 250), activation=relu
Report: Classification
               precision    recall  f1-score   support

    positive       0.85      0.84      0.85      5563
    negative       0.85      0.85      0.85      5687

    accuracy                           0.85     11250
   macro avg       0.85      0.85      0.85     11250
weighted avg       0.85      0.85      0.85     11250

Matrix: Confusion
 [[4700  863]
 [ 857 4830]]
Accuracy:
 0.8471111111111111




Model config: hidden_layer_sizes=(1000, 250), activation=tanh
Report: Classification
               precision    recall  f1-score   support

    positive       0.86      0.85      0.86      5563
    negative       0.86      0.87      0.86      5687

    accuracy                           0.86     11250
   macro avg       0.86      0.86      0.86     11250
weighted avg       0.86      0.86      0.86     11250

Matrix: Confusion
 [[4738  825]
 [ 748 4939]]
Accuracy:
 0.8601777777777778




Model config: hidden_layer_sizes=(1000, 250), activation=relu
Report: Classification
               precision    recall  f1-score   support

    positive       0.86      0.83      0.85      5563
    negative       0.84      0.87      0.85      5687

    accuracy                           0.85     11250
   macro avg       0.85      0.85      0.85     11250
weighted avg       0.85      0.85      0.85     11250

Matrix: Confusion
 [[4611  952]
 [ 736 4951]]
Accuracy:
 0.8499555555555556




Model config: hidden_layer_sizes=(500, 250, 250), activation=tanh
Report: Classification
               precision    recall  f1-score   support

    positive       0.86      0.87      0.86      5563
    negative       0.87      0.86      0.87      5687

    accuracy                           0.87     11250
   macro avg       0.87      0.87      0.87     11250
weighted avg       0.87      0.87      0.87     11250

Matrix: Confusion
 [[4821  742]
 [ 772 4915]]
Accuracy:
 0.8654222222222222




Model config: hidden_layer_sizes=(500, 250, 250), activation=relu
Report: Classification
               precision    recall  f1-score   support

    positive       0.86      0.85      0.85      5563
    negative       0.85      0.86      0.86      5687

    accuracy                           0.85     11250
   macro avg       0.85      0.85      0.85     11250
weighted avg       0.85      0.85      0.85     11250

Matrix: Confusion
 [[4704  859]
 [ 785 4902]]
Accuracy:
 0.8538666666666667




Model config: hidden_layer_sizes=(1000, 500, 250), activation=tanh
Report: Classification
               precision    recall  f1-score   support

    positive       0.86      0.87      0.86      5563
    negative       0.87      0.86      0.86      5687

    accuracy                           0.86     11250
   macro avg       0.86      0.86      0.86     11250
weighted avg       0.86      0.86      0.86     11250

Matrix: Confusion
 [[4814  749]
 [ 789 4898]]
Accuracy:
 0.8632888888888889




Model config: hidden_layer_sizes=(1000, 500, 250), activation=relu
Report: Classification
               precision    recall  f1-score   support

    positive       0.88      0.82      0.85      5563
    negative       0.83      0.89      0.86      5687

    accuracy                           0.85     11250
   macro avg       0.86      0.85      0.85     11250
weighted avg       0.85      0.85      0.85     11250

Matrix: Confusion
 [[4564  999]
 [ 647 5040]]
Accuracy:
 0.8536888888888889




Model config: hidden_layer_sizes=(500, 250, 250, 5), activation=tanh
Report: Classification
               precision    recall  f1-score   support

    positive       0.86      0.87      0.86      5563
    negative       0.87      0.86      0.86      5687

    accuracy                           0.86     11250
   macro avg       0.86      0.86      0.86     11250
weighted avg       0.86      0.86      0.86     11250

Matrix: Confusion
 [[4818  745]
 [ 789 4898]]
Accuracy:
 0.8636444444444444




Model config: hidden_layer_sizes=(500, 250, 250, 5), activation=relu
Report: Classification
               precision    recall  f1-score   support

    positive       0.86      0.85      0.85      5563
    negative       0.85      0.86      0.86      5687

    accuracy                           0.85     11250
   macro avg       0.85      0.85      0.85     11250
weighted avg       0.85      0.85      0.85     11250

Matrix: Confusion
 [[4721  842]
 [ 800 4887]]
Accuracy:
 0.8540444444444445




Model config: hidden_layer_sizes=(1000, 500, 250, 5), activation=tanh
Report: Classification
               precision    recall  f1-score   support

    positive       0.86      0.87      0.87      5563
    negative       0.87      0.87      0.87      5687

    accuracy                           0.87     11250
   macro avg       0.87      0.87      0.87     11250
weighted avg       0.87      0.87      0.87     11250

Matrix: Confusion
 [[4824  739]
 [ 761 4926]]
Accuracy:
 0.8666666666666667




Model config: hidden_layer_sizes=(1000, 500, 250, 5), activation=relu
Report: Classification
               precision    recall  f1-score   support

    positive       0.86      0.86      0.86      5563
    negative       0.86      0.86      0.86      5687

    accuracy                           0.86     11250
   macro avg       0.86      0.86      0.86     11250
weighted avg       0.86      0.86      0.86     11250

Matrix: Confusion
 [[4799  764]
 [ 803 4884]]
Accuracy:
 0.8607111111111111


## TD-IDF

In [29]:
vectorizer = TfidfVectorizer(max_features=2000)
X_train_idf = vectorizer.fit_transform(X_train)
X_test_idf = vectorizer.transform(X_test)
pickle.dump(vectorizer, open('tf_idf.pkl', 'wb'))

In [30]:
tf_idf_f1, tf_idf_mlp = eval_mlp(X_train_idf, X_test_idf, Y_train, Y_test)



Model config: hidden_layer_sizes=(500, 250), activation=tanh
Report: Classification
               precision    recall  f1-score   support

    positive       0.76      0.77      0.77      5563
    negative       0.77      0.77      0.77      5687

    accuracy                           0.77     11250
   macro avg       0.77      0.77      0.77     11250
weighted avg       0.77      0.77      0.77     11250

Matrix: Confusion
 [[4293 1270]
 [1321 4366]]
Accuracy:
 0.7696888888888889




Model config: hidden_layer_sizes=(500, 250), activation=relu
Report: Classification
               precision    recall  f1-score   support

    positive       0.68      0.76      0.72      5563
    negative       0.74      0.64      0.69      5687

    accuracy                           0.70     11250
   macro avg       0.71      0.70      0.70     11250
weighted avg       0.71      0.70      0.70     11250

Matrix: Confusion
 [[4247 1316]
 [2023 3664]]
Accuracy:
 0.7032




Model config: hidden_layer_sizes=(1000, 250), activation=tanh
Report: Classification
               precision    recall  f1-score   support

    positive       0.79      0.76      0.77      5563
    negative       0.77      0.80      0.79      5687

    accuracy                           0.78     11250
   macro avg       0.78      0.78      0.78     11250
weighted avg       0.78      0.78      0.78     11250

Matrix: Confusion
 [[4228 1335]
 [1140 4547]]
Accuracy:
 0.78




Model config: hidden_layer_sizes=(1000, 250), activation=relu
Report: Classification
               precision    recall  f1-score   support

    positive       0.70      0.78      0.74      5563
    negative       0.76      0.67      0.71      5687

    accuracy                           0.73     11250
   macro avg       0.73      0.73      0.73     11250
weighted avg       0.73      0.73      0.73     11250

Matrix: Confusion
 [[4335 1228]
 [1852 3835]]
Accuracy:
 0.7262222222222222




Model config: hidden_layer_sizes=(500, 250, 250), activation=tanh
Report: Classification
               precision    recall  f1-score   support

    positive       0.80      0.70      0.75      5563
    negative       0.74      0.83      0.78      5687

    accuracy                           0.77     11250
   macro avg       0.77      0.76      0.76     11250
weighted avg       0.77      0.77      0.76     11250

Matrix: Confusion
 [[3871 1692]
 [ 948 4739]]
Accuracy:
 0.7653333333333333




Model config: hidden_layer_sizes=(500, 250, 250), activation=relu
Report: Classification
               precision    recall  f1-score   support

    positive       0.60      0.85      0.70      5563
    negative       0.75      0.45      0.56      5687

    accuracy                           0.65     11250
   macro avg       0.68      0.65      0.63     11250
weighted avg       0.68      0.65      0.63     11250

Matrix: Confusion
 [[4721  842]
 [3131 2556]]
Accuracy:
 0.6468444444444444




Model config: hidden_layer_sizes=(1000, 500, 250), activation=tanh
Report: Classification
               precision    recall  f1-score   support

    positive       0.83      0.73      0.77      5563
    negative       0.76      0.85      0.80      5687

    accuracy                           0.79     11250
   macro avg       0.79      0.79      0.79     11250
weighted avg       0.79      0.79      0.79     11250

Matrix: Confusion
 [[4058 1505]
 [ 853 4834]]
Accuracy:
 0.7904




Model config: hidden_layer_sizes=(1000, 500, 250), activation=relu
Report: Classification
               precision    recall  f1-score   support

    positive       0.65      0.83      0.73      5563
    negative       0.77      0.56      0.65      5687

    accuracy                           0.69     11250
   macro avg       0.71      0.70      0.69     11250
weighted avg       0.71      0.69      0.69     11250

Matrix: Confusion
 [[4636  927]
 [2518 3169]]
Accuracy:
 0.6937777777777778




Model config: hidden_layer_sizes=(500, 250, 250, 5), activation=tanh
Report: Classification
               precision    recall  f1-score   support

    positive       0.81      0.79      0.80      5563
    negative       0.80      0.82      0.81      5687

    accuracy                           0.80     11250
   macro avg       0.80      0.80      0.80     11250
weighted avg       0.80      0.80      0.80     11250

Matrix: Confusion
 [[4399 1164]
 [1046 4641]]
Accuracy:
 0.8035555555555556




Model config: hidden_layer_sizes=(500, 250, 250, 5), activation=relu
Report: Classification
               precision    recall  f1-score   support

    positive       0.50      1.00      0.66      5563
    negative       0.82      0.01      0.02      5687

    accuracy                           0.50     11250
   macro avg       0.66      0.50      0.34     11250
weighted avg       0.66      0.50      0.34     11250

Matrix: Confusion
 [[5553   10]
 [5641   46]]
Accuracy:
 0.4976888888888889




Model config: hidden_layer_sizes=(1000, 500, 250, 5), activation=tanh
Report: Classification
               precision    recall  f1-score   support

    positive       0.83      0.79      0.81      5563
    negative       0.80      0.84      0.82      5687

    accuracy                           0.82     11250
   macro avg       0.82      0.81      0.82     11250
weighted avg       0.82      0.82      0.82     11250

Matrix: Confusion
 [[4380 1183]
 [ 895 4792]]
Accuracy:
 0.8152888888888888




Model config: hidden_layer_sizes=(1000, 500, 250, 5), activation=relu
Report: Classification
               precision    recall  f1-score   support

    positive       0.52      0.98      0.68      5563
    negative       0.87      0.11      0.20      5687

    accuracy                           0.54     11250
   macro avg       0.69      0.55      0.44     11250
weighted avg       0.70      0.54      0.44     11250

Matrix: Confusion
 [[5462  101]
 [5035  652]]
Accuracy:
 0.5434666666666667


## Comparison

In [31]:
print('Best scores:')
print(f'W2V: {w2v_f1} with params: {w2v_mlp.get_params()}')
print(f'BoW: {bow_f1} with params: {bow_mlp.get_params()}')
print(f'Tf-Idf: {tf_idf_f1} with params: {tf_idf_mlp.get_params()}')

idx = np.argmax([w2v_f1, bow_f1, tf_idf_f1])
best_mlp = [w2v_mlp, bow_mlp, tf_idf_mlp][idx]
with open('best.pkl', 'wb') as f:
    pickle.dump(best_mlp, f)




Best scores:
W2V: 0.8630962631307042 with params: {'activation': 'tanh', 'alpha': 1, 'batch_size': 'auto', 'beta_1': 0.9, 'beta_2': 0.999, 'early_stopping': False, 'epsilon': 1e-08, 'hidden_layer_sizes': (1000, 500, 250, 5), 'learning_rate': 'adaptive', 'learning_rate_init': 0.001, 'max_fun': 15000, 'max_iter': 10, 'momentum': 0.9, 'n_iter_no_change': 10, 'nesterovs_momentum': True, 'power_t': 0.5, 'random_state': None, 'shuffle': True, 'solver': 'sgd', 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': False, 'warm_start': True}
BoW: 0.8678646934460887 with params: {'activation': 'tanh', 'alpha': 1, 'batch_size': 'auto', 'beta_1': 0.9, 'beta_2': 0.999, 'early_stopping': False, 'epsilon': 1e-08, 'hidden_layer_sizes': (1000, 500, 250, 5), 'learning_rate': 'adaptive', 'learning_rate_init': 0.001, 'max_fun': 15000, 'max_iter': 10, 'momentum': 0.9, 'n_iter_no_change': 10, 'nesterovs_momentum': True, 'power_t': 0.5, 'random_state': None, 'shuffle': True, 'solver': 'sgd', 'tol': 0.0001, '