In [1]:
import os
import pickle
import pandas as pd
from gensim.models import KeyedVectors
import numpy as np
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split, HalvingGridSearchCV
from sklearn.neural_network import MLPClassifier

from data import preprocess_data, vectorize_data, load_dataset
from evaluation import analysis, evaluate_models_with_data
from w2v_adapter import Word2VecAdapter

import advanced_processor_chain_factory
import simple_processor_chain_factory

In [2]:
dataset = load_dataset()
DEBUG = False

# Inspection of Pre-Processing Approaches

In [3]:
models = {'logistic regression' : LogisticRegression(class_weight = 'balanced', n_jobs=-1),
          'svm' : svm.LinearSVC(),
          'knn' : KNeighborsClassifier(n_neighbors=300, n_jobs=-1)
         }

## Without Pre-Process

In [4]:
evaluate_models_with_data(models, *vectorize_data(*preprocess_data(dataset, debug=DEBUG), CountVectorizer(max_features=2000)))

------Evaluating logistic regression------
Report: Classification
               precision    recall  f1-score   support

    positive       0.88      0.87      0.87      5613
    negative       0.87      0.88      0.87      5637

    accuracy                           0.87     11250
   macro avg       0.87      0.87      0.87     11250
weighted avg       0.87      0.87      0.87     11250

Matrix: Confusion
 [[4865  748]
 [ 692 4945]]
Accuracy:
 0.872
------Evaluating svm------




Report: Classification
               precision    recall  f1-score   support

    positive       0.87      0.87      0.87      5613
    negative       0.87      0.87      0.87      5637

    accuracy                           0.87     11250
   macro avg       0.87      0.87      0.87     11250
weighted avg       0.87      0.87      0.87     11250

Matrix: Confusion
 [[4896  717]
 [ 739 4898]]
Accuracy:
 0.8705777777777778
------Evaluating knn------
Report: Classification
               precision    recall  f1-score   support

    positive       0.74      0.51      0.61      5613
    negative       0.63      0.82      0.71      5637

    accuracy                           0.67     11250
   macro avg       0.69      0.67      0.66     11250
weighted avg       0.69      0.67      0.66     11250

Matrix: Confusion
 [[2881 2732]
 [1001 4636]]
Accuracy:
 0.6681777777777778


## Simple Pre-Process

In [5]:
evaluate_models_with_data(models,
                          *vectorize_data(
                              *preprocess_data(dataset, processor_chain=simple_processor_chain_factory.create(), debug=DEBUG),
                              CountVectorizer(max_features=2000)))

Pandas Apply:   0%|          | 0/45000 [00:00<?, ?it/s]

------Evaluating logistic regression------
Report: Classification
               precision    recall  f1-score   support

    positive       0.87      0.87      0.87      5518
    negative       0.87      0.87      0.87      5732

    accuracy                           0.87     11250
   macro avg       0.87      0.87      0.87     11250
weighted avg       0.87      0.87      0.87     11250

Matrix: Confusion
 [[4801  717]
 [ 731 5001]]
Accuracy:
 0.8712888888888889
------Evaluating svm------




Report: Classification
               precision    recall  f1-score   support

    positive       0.86      0.87      0.87      5518
    negative       0.87      0.86      0.87      5732

    accuracy                           0.87     11250
   macro avg       0.87      0.87      0.87     11250
weighted avg       0.87      0.87      0.87     11250

Matrix: Confusion
 [[4801  717]
 [ 778 4954]]
Accuracy:
 0.8671111111111112
------Evaluating knn------
Report: Classification
               precision    recall  f1-score   support

    positive       0.76      0.48      0.59      5518
    negative       0.63      0.85      0.72      5732

    accuracy                           0.67     11250
   macro avg       0.69      0.67      0.66     11250
weighted avg       0.69      0.67      0.66     11250

Matrix: Confusion
 [[2638 2880]
 [ 846 4886]]
Accuracy:
 0.6688


## Pre-Process with Stemmimg

In [6]:
evaluate_models_with_data(models,
                          *vectorize_data(
                              *preprocess_data(dataset, processor_chain=advanced_processor_chain_factory.create('stem'),
                                               debug=DEBUG),
                              CountVectorizer(max_features=2000)))

Pandas Apply:   0%|          | 0/45000 [00:00<?, ?it/s]

------Evaluating logistic regression------
Report: Classification
               precision    recall  f1-score   support

    positive       0.87      0.86      0.87      5603
    negative       0.87      0.87      0.87      5647

    accuracy                           0.87     11250
   macro avg       0.87      0.87      0.87     11250
weighted avg       0.87      0.87      0.87     11250

Matrix: Confusion
 [[4843  760]
 [ 719 4928]]
Accuracy:
 0.8685333333333334
------Evaluating svm------




Report: Classification
               precision    recall  f1-score   support

    positive       0.86      0.87      0.87      5603
    negative       0.87      0.86      0.87      5647

    accuracy                           0.87     11250
   macro avg       0.87      0.87      0.87     11250
weighted avg       0.87      0.87      0.87     11250

Matrix: Confusion
 [[4897  706]
 [ 767 4880]]
Accuracy:
 0.8690666666666667
------Evaluating knn------
Report: Classification
               precision    recall  f1-score   support

    positive       0.74      0.70      0.72      5603
    negative       0.72      0.76      0.74      5647

    accuracy                           0.73     11250
   macro avg       0.73      0.73      0.73     11250
weighted avg       0.73      0.73      0.73     11250

Matrix: Confusion
 [[3921 1682]
 [1361 4286]]
Accuracy:
 0.7295111111111111


## Pre-Process with Lemmitization

In [7]:
X, Y = preprocess_data(dataset, processor_chain=advanced_processor_chain_factory.create('lem'), debug=DEBUG)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y)
del dataset

Pandas Apply:   0%|          | 0/45000 [00:00<?, ?it/s]

In [8]:
vectorizer = CountVectorizer(max_features=2000)
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

In [9]:
evaluate_models_with_data(models, X_train_bow, X_test_bow, Y_train, Y_test)

------Evaluating logistic regression------
Report: Classification
               precision    recall  f1-score   support

    positive       0.88      0.86      0.87      5672
    negative       0.86      0.88      0.87      5578

    accuracy                           0.87     11250
   macro avg       0.87      0.87      0.87     11250
weighted avg       0.87      0.87      0.87     11250

Matrix: Confusion
 [[4858  814]
 [ 672 4906]]
Accuracy:
 0.8679111111111111
------Evaluating svm------




Report: Classification
               precision    recall  f1-score   support

    positive       0.88      0.86      0.87      5672
    negative       0.86      0.88      0.87      5578

    accuracy                           0.87     11250
   macro avg       0.87      0.87      0.87     11250
weighted avg       0.87      0.87      0.87     11250

Matrix: Confusion
 [[4853  819]
 [ 667 4911]]
Accuracy:
 0.8679111111111111
------Evaluating knn------
Report: Classification
               precision    recall  f1-score   support

    positive       0.75      0.64      0.69      5672
    negative       0.68      0.78      0.73      5578

    accuracy                           0.71     11250
   macro avg       0.72      0.71      0.71     11250
weighted avg       0.72      0.71      0.71     11250

Matrix: Confusion
 [[3642 2030]
 [1207 4371]]
Accuracy:
 0.7122666666666667


In [10]:
del models

# Compare W2V and BoW with Their Best Tuned Hyper-parameters

In [11]:
kfold = StratifiedKFold(n_splits=5)
general_grid_params = {'verbose' : 1, 'cv' : kfold, 'n_jobs' : -1, 'scoring' : 'f1'}

logistic_grid = {
    'penalty':['l2'],
    'C':[1, 300, 500, 700, 900, 2000],
    'class_weight':['balanced'],
    'solver':['saga'],
    'n_jobs':[-1],
    'max_iter':[1000],
}

svc_grid = {
    'kernel' : ['linear', 'rbf'],
    'C':[0.1, 1, 500, 1000],
}

knn_grid = {
    'n_neighbors' : [1, 100, 300, 500, 700, 900],
    'n_jobs' : [-1]
}

cols2show = ['mean_test_score','rank_test_score',]

logistic_cols = ['param_C'] + cols2show 
svc_cols = ['param_kernel', 'param_C', 'iter', 'n_resources'] + cols2show 
knn_cols = ['param_n_neighbors', 'iter', 'n_resources'] + cols2show 

## BoW

### Logistic Regression

In [12]:
bow_log = LogisticRegression()
bow_log = GridSearchCV(estimator=bow_log, param_grid=logistic_grid, **general_grid_params)
bow_log.fit(X_train_bow, Y_train)
print(f'Best Score: {bow_log.best_score_}')
print(f'Best Params: {bow_log.best_params_}')
pd.DataFrame(bow_log.cv_results_)[logistic_cols]

Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best Score: 0.8644932981621395
Best Params: {'C': 1, 'class_weight': 'balanced', 'max_iter': 1000, 'n_jobs': -1, 'penalty': 'l2', 'solver': 'saga'}




Unnamed: 0,param_C,mean_test_score,rank_test_score
0,1,0.864493,1
1,300,0.863449,6
2,500,0.863474,3
3,700,0.863499,2
4,900,0.863474,3
5,2000,0.863474,3


In [13]:
bow_log_score = analysis(Y_test, bow_log.predict(X_test_bow))

Report: Classification
               precision    recall  f1-score   support

    positive       0.88      0.86      0.87      5672
    negative       0.86      0.88      0.87      5578

    accuracy                           0.87     11250
   macro avg       0.87      0.87      0.87     11250
weighted avg       0.87      0.87      0.87     11250

Matrix: Confusion
 [[4856  816]
 [ 666 4912]]
Accuracy:
 0.8682666666666666


### SVM

In [14]:
bow_svm = svm.SVC()
bow_svm = HalvingGridSearchCV(estimator=bow_svm, param_grid=svc_grid, cv = 3, n_jobs= -1, scoring='f1', factor=2)
bow_svm.fit(X_train_bow, Y_train)
print(f'Best Score: {bow_svm.best_score_}')
print(f'Best Params: {bow_svm.best_params_}')
pd.DataFrame(bow_svm.cv_results_)[svc_cols]

Best Score: 0.8643585708532733
Best Params: {'C': 1, 'kernel': 'rbf'}


Unnamed: 0,param_kernel,param_C,iter,n_resources,mean_test_score,rank_test_score
0,linear,0.1,0,4218,0.809657,11
1,rbf,0.1,0,4218,0.739839,15
2,linear,1.0,0,4218,0.79132,12
3,rbf,1.0,0,4218,0.822439,8
4,linear,500.0,0,4218,0.790764,13
5,rbf,500.0,0,4218,0.819586,9
6,linear,1000.0,0,4218,0.790764,13
7,rbf,1000.0,0,4218,0.819586,9
8,linear,0.1,1,8436,0.82598,5
9,rbf,500.0,1,8436,0.825834,6


In [15]:
bow_svm_score = analysis(Y_test, bow_svm.predict(X_test_bow))

Report: Classification
               precision    recall  f1-score   support

    positive       0.89      0.85      0.87      5672
    negative       0.85      0.90      0.87      5578

    accuracy                           0.87     11250
   macro avg       0.87      0.87      0.87     11250
weighted avg       0.87      0.87      0.87     11250

Matrix: Confusion
 [[4801  871]
 [ 579 4999]]
Accuracy:
 0.8711111111111111


### KNN

In [16]:
bow_knn = KNeighborsClassifier()
bow_knn = HalvingGridSearchCV(estimator=bow_knn, param_grid=knn_grid, cv = 3, scoring='f1', factor=2)
bow_knn.fit(X_train_bow, Y_train)
print(f'Best Score: {bow_knn.best_score_}')
print(f'Best Params: {bow_knn.best_params_}')
pd.DataFrame(bow_knn.cv_results_)[knn_cols]

Best Score: 0.7292935417519001
Best Params: {'n_jobs': -1, 'n_neighbors': 300}


Unnamed: 0,param_n_neighbors,iter,n_resources,mean_test_score,rank_test_score
0,1,0,8437,0.56446,11
1,100,0,8437,0.709987,4
2,300,0,8437,0.696429,7
3,500,0,8437,0.692816,9
4,700,0,8437,0.69236,10
5,900,0,8437,0.69977,6
6,300,1,16874,0.709492,5
7,900,1,16874,0.694313,8
8,100,1,16874,0.713327,3
9,300,2,33748,0.729294,1


In [17]:
bow_knn_score = analysis(Y_test, bow_svm.predict(X_test_bow))

Report: Classification
               precision    recall  f1-score   support

    positive       0.89      0.85      0.87      5672
    negative       0.85      0.90      0.87      5578

    accuracy                           0.87     11250
   macro avg       0.87      0.87      0.87     11250
weighted avg       0.87      0.87      0.87     11250

Matrix: Confusion
 [[4801  871]
 [ 579 4999]]
Accuracy:
 0.8711111111111111


## W2V

In [20]:
if os.path.isfile('w2v.kv'):
    vectorizer = Word2VecAdapter(pre_trained_model=KeyedVectors.load('w2v.kv'))
else:
    vectorizer = Word2VecAdapter()

X_train_w2v = vectorizer.fit_transform(X_train)
X_test_w2v = vectorizer.transform(X_test)

if not os.path.isfile('w2v.kv'):
     vectorizer.wv.save('w2v.kv')

Pandas Apply:   0%|          | 0/33750 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/33750 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/11250 [00:00<?, ?it/s]

### Logistic Regression

In [21]:
w2v_log = LogisticRegression()
w2v_log = GridSearchCV(estimator=w2v_log, param_grid=logistic_grid, **general_grid_params)
w2v_log.fit(X_train_w2v, Y_train)
print(f'Best Score: {w2v_log.best_score_}')
print(f'Best Params: {w2v_log.best_params_}')
pd.DataFrame(w2v_log.cv_results_)[logistic_cols]

Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best Score: 0.8668510002816859
Best Params: {'C': 700, 'class_weight': 'balanced', 'max_iter': 1000, 'n_jobs': -1, 'penalty': 'l2', 'solver': 'saga'}


Unnamed: 0,param_C,mean_test_score,rank_test_score
0,1,0.8644,6
1,300,0.866825,2
2,500,0.866817,3
3,700,0.866851,1
4,900,0.866817,3
5,2000,0.866817,3


In [22]:
w2v_log_score = analysis(Y_test, w2v_log.predict(X_test_w2v))

Report: Classification
               precision    recall  f1-score   support

    positive       0.88      0.86      0.87      5672
    negative       0.86      0.88      0.87      5578

    accuracy                           0.87     11250
   macro avg       0.87      0.87      0.87     11250
weighted avg       0.87      0.87      0.87     11250

Matrix: Confusion
 [[4870  802]
 [ 668 4910]]
Accuracy:
 0.8693333333333333


### SVM

In [23]:
w2v_svm = svm.SVC()
w2v_svm = HalvingGridSearchCV(estimator=w2v_svm, param_grid=svc_grid, cv = 3, n_jobs= -1, scoring='f1', factor=2)
w2v_svm.fit(X_train_w2v, Y_train)
print(f'Best Score: {w2v_svm.best_score_}')
print(f'Best Params: {w2v_svm.best_params_}')
pd.DataFrame(w2v_svm.cv_results_)[svc_cols]

Best Score: 0.8697736474102836
Best Params: {'C': 1, 'kernel': 'rbf'}


Unnamed: 0,param_kernel,param_C,iter,n_resources,mean_test_score,rank_test_score
0,linear,0.1,0,4218,0.851328,8
1,rbf,0.1,0,4218,0.843364,11
2,linear,1.0,0,4218,0.847382,9
3,rbf,1.0,0,4218,0.853962,7
4,linear,500.0,0,4218,0.843093,13
5,rbf,500.0,0,4218,0.821001,14
6,linear,1000.0,0,4218,0.843162,12
7,rbf,1000.0,0,4218,0.821001,14
8,rbf,0.1,1,8436,0.845976,10
9,linear,1.0,1,8436,0.860008,4


In [24]:
w2v_svm_score = analysis(Y_test, w2v_svm.predict(X_test_w2v))

Report: Classification
               precision    recall  f1-score   support

    positive       0.88      0.86      0.87      5672
    negative       0.86      0.88      0.87      5578

    accuracy                           0.87     11250
   macro avg       0.87      0.87      0.87     11250
weighted avg       0.87      0.87      0.87     11250

Matrix: Confusion
 [[4861  811]
 [ 651 4927]]
Accuracy:
 0.8700444444444444


### KNN

In [25]:
w2v_knn = KNeighborsClassifier()
w2v_knn = HalvingGridSearchCV(estimator=w2v_knn, param_grid=knn_grid, cv = 3, scoring='f1', factor=2)
w2v_knn.fit(X_train_w2v, Y_train)
print(f'Best Score: {w2v_knn.best_score_}')
print(f'Best Params: {w2v_knn.best_params_}')
pd.DataFrame(w2v_knn.cv_results_)[knn_cols]

Best Score: 0.809571953461928
Best Params: {'n_jobs': -1, 'n_neighbors': 100}


Unnamed: 0,param_n_neighbors,iter,n_resources,mean_test_score,rank_test_score
0,1,0,8437,0.733319,11
1,100,0,8437,0.788877,4
2,300,0,8437,0.768344,7
3,500,0,8437,0.755649,8
4,700,0,8437,0.744273,9
5,900,0,8437,0.734836,10
6,500,1,16874,0.770698,6
7,300,1,16874,0.781184,5
8,100,1,16874,0.795532,3
9,300,2,33748,0.797969,2


In [26]:
w2v_knn_score = analysis(Y_test, w2v_svm.predict(X_test_w2v))

Report: Classification
               precision    recall  f1-score   support

    positive       0.88      0.86      0.87      5672
    negative       0.86      0.88      0.87      5578

    accuracy                           0.87     11250
   macro avg       0.87      0.87      0.87     11250
weighted avg       0.87      0.87      0.87     11250

Matrix: Confusion
 [[4861  811]
 [ 651 4927]]
Accuracy:
 0.8700444444444444


## Comparison

In [27]:
summary = {
    'LR': {'BoW': {'model':bow_log, 'score': bow_log_score}, 'W2V': {'model': w2v_log, 'score': w2v_log_score}},
    'SVM' : {'BoW': {'model':bow_svm, 'score': bow_svm_score}, 'W2V': {'model': w2v_svm, 'score': w2v_svm_score}},
    'KNN': {'BoW': {'model' :bow_knn, 'score':bow_knn_score}, 'W2V': {'model':w2v_knn, 'score':w2v_knn_score}},
  }

for name, values in summary.items():
    print(f'For classifier {name}, best BoW score is {values["BoW"]["score"]}, whereas best W2V score is {values["W2V"]["score"]}')
    best_model = 'BoW' if values['BoW']['score'] > values['W2V']['score'] else 'W2V'
    print(f'So {best_model} is better with parameters {values[best_model]["model"].best_params_}')
    filename = name + '.pkl'
    with open(filename, 'wb') as f:
        pickle.dump(values[best_model]['model'], f)

del summary, bow_log, w2v_log, bow_svm, w2v_svm, bow_knn, w2v_knn

For classifier LR, best BoW score is 0.8689191579692198, whereas best W2V score is 0.8697962798937113
So W2V is better with parameters {'C': 700, 'class_weight': 'balanced', 'max_iter': 1000, 'n_jobs': -1, 'penalty': 'l2', 'solver': 'saga'}
For classifier SVM, best BoW score is 0.8733403214535291, whereas best W2V score is 0.8708024036762106
So BoW is better with parameters {'C': 1, 'kernel': 'rbf'}
For classifier KNN, best BoW score is 0.8733403214535291, whereas best W2V score is 0.8708024036762106
So BoW is better with parameters {'n_jobs': -1, 'n_neighbors': 300}


# MLP

In [28]:
mlp_grid = {
    'hidden_layer_sizes': [(500, 250), (1000, 250), (500, 250, 250), (1000, 500, 250), (500, 250, 250, 5),
                           (1000, 500, 250, 5)],
    'activation':['tanh', 'relu']
}
def eval_mlp(X_train, X_test, Y_train, Y_test):
    best_f1 = -1
    best_model = None
    for sizes in mlp_grid['hidden_layer_sizes']:
        for act in mlp_grid['activation']:
            m = MLPClassifier(hidden_layer_sizes=sizes, activation=act, solver='sgd', alpha=1,
                                    learning_rate='adaptive', max_iter=10)
            m.fit(X_train, Y_train)
            print(f'Model config: hidden_layer_sizes={sizes}, activation={act}')
            f1 = analysis(Y_test, m.predict(X_test))
            if f1 > best_f1:
                best_model = m
                best_f1 = f1
    return best_f1, best_model

## W2V

In [29]:
w2v_f1, w2v_mlp = eval_mlp(X_train_w2v, X_test_w2v, Y_train, Y_test)



Model config: hidden_layer_sizes=(500, 250), activation=tanh
Report: Classification
               precision    recall  f1-score   support

    positive       0.86      0.84      0.85      5672
    negative       0.84      0.86      0.85      5578

    accuracy                           0.85     11250
   macro avg       0.85      0.85      0.85     11250
weighted avg       0.85      0.85      0.85     11250

Matrix: Confusion
 [[4770  902]
 [ 759 4819]]
Accuracy:
 0.8523555555555555




Model config: hidden_layer_sizes=(500, 250), activation=relu
Report: Classification
               precision    recall  f1-score   support

    positive       0.86      0.84      0.85      5672
    negative       0.84      0.86      0.85      5578

    accuracy                           0.85     11250
   macro avg       0.85      0.85      0.85     11250
weighted avg       0.85      0.85      0.85     11250

Matrix: Confusion
 [[4774  898]
 [ 767 4811]]
Accuracy:
 0.852




Model config: hidden_layer_sizes=(1000, 250), activation=tanh
Report: Classification
               precision    recall  f1-score   support

    positive       0.87      0.84      0.85      5672
    negative       0.84      0.87      0.86      5578

    accuracy                           0.85     11250
   macro avg       0.85      0.85      0.85     11250
weighted avg       0.85      0.85      0.85     11250

Matrix: Confusion
 [[4742  930]
 [ 718 4860]]
Accuracy:
 0.8535111111111111




Model config: hidden_layer_sizes=(1000, 250), activation=relu
Report: Classification
               precision    recall  f1-score   support

    positive       0.86      0.85      0.85      5672
    negative       0.85      0.86      0.85      5578

    accuracy                           0.85     11250
   macro avg       0.85      0.85      0.85     11250
weighted avg       0.85      0.85      0.85     11250

Matrix: Confusion
 [[4800  872]
 [ 784 4794]]
Accuracy:
 0.8528




Model config: hidden_layer_sizes=(500, 250, 250), activation=tanh
Report: Classification
               precision    recall  f1-score   support

    positive       0.86      0.85      0.86      5672
    negative       0.85      0.86      0.85      5578

    accuracy                           0.86     11250
   macro avg       0.86      0.86      0.86     11250
weighted avg       0.86      0.86      0.86     11250

Matrix: Confusion
 [[4837  835]
 [ 793 4785]]
Accuracy:
 0.8552888888888889




Model config: hidden_layer_sizes=(500, 250, 250), activation=relu
Report: Classification
               precision    recall  f1-score   support

    positive       0.86      0.85      0.85      5672
    negative       0.85      0.86      0.86      5578

    accuracy                           0.86     11250
   macro avg       0.86      0.86      0.86     11250
weighted avg       0.86      0.86      0.86     11250

Matrix: Confusion
 [[4801  871]
 [ 760 4818]]
Accuracy:
 0.8550222222222222




Model config: hidden_layer_sizes=(1000, 500, 250), activation=tanh
Report: Classification
               precision    recall  f1-score   support

    positive       0.87      0.85      0.86      5672
    negative       0.85      0.87      0.86      5578

    accuracy                           0.86     11250
   macro avg       0.86      0.86      0.86     11250
weighted avg       0.86      0.86      0.86     11250

Matrix: Confusion
 [[4813  859]
 [ 751 4827]]
Accuracy:
 0.8568888888888889




Model config: hidden_layer_sizes=(1000, 500, 250), activation=relu
Report: Classification
               precision    recall  f1-score   support

    positive       0.86      0.86      0.86      5672
    negative       0.85      0.85      0.85      5578

    accuracy                           0.86     11250
   macro avg       0.86      0.86      0.86     11250
weighted avg       0.86      0.86      0.86     11250

Matrix: Confusion
 [[4863  809]
 [ 814 4764]]
Accuracy:
 0.8557333333333333




Model config: hidden_layer_sizes=(500, 250, 250, 5), activation=tanh
Report: Classification
               precision    recall  f1-score   support

    positive       0.86      0.86      0.86      5672
    negative       0.85      0.86      0.86      5578

    accuracy                           0.86     11250
   macro avg       0.86      0.86      0.86     11250
weighted avg       0.86      0.86      0.86     11250

Matrix: Confusion
 [[4850  822]
 [ 780 4798]]
Accuracy:
 0.8576




Model config: hidden_layer_sizes=(500, 250, 250, 5), activation=relu
Report: Classification
               precision    recall  f1-score   support

    positive       0.85      0.84      0.85      5672
    negative       0.84      0.85      0.84      5578

    accuracy                           0.84     11250
   macro avg       0.84      0.84      0.84     11250
weighted avg       0.85      0.84      0.84     11250

Matrix: Confusion
 [[4782  890]
 [ 854 4724]]
Accuracy:
 0.8449777777777778




Model config: hidden_layer_sizes=(1000, 500, 250, 5), activation=tanh
Report: Classification
               precision    recall  f1-score   support

    positive       0.86      0.86      0.86      5672
    negative       0.86      0.86      0.86      5578

    accuracy                           0.86     11250
   macro avg       0.86      0.86      0.86     11250
weighted avg       0.86      0.86      0.86     11250

Matrix: Confusion
 [[4885  787]
 [ 806 4772]]
Accuracy:
 0.8584




Model config: hidden_layer_sizes=(1000, 500, 250, 5), activation=relu
Report: Classification
               precision    recall  f1-score   support

    positive       0.88      0.78      0.83      5672
    negative       0.80      0.89      0.84      5578

    accuracy                           0.83     11250
   macro avg       0.84      0.84      0.83     11250
weighted avg       0.84      0.83      0.83     11250

Matrix: Confusion
 [[4443 1229]
 [ 628 4950]]
Accuracy:
 0.8349333333333333


## BoW

In [None]:
bow_f1, bow_mlp = eval_mlp(X_train_bow, X_test_bow, Y_train, Y_test)



Model config: hidden_layer_sizes=(500, 250), activation=tanh
Report: Classification
               precision    recall  f1-score   support

    positive       0.87      0.85      0.86      5672
    negative       0.85      0.87      0.86      5578

    accuracy                           0.86     11250
   macro avg       0.86      0.86      0.86     11250
weighted avg       0.86      0.86      0.86     11250

Matrix: Confusion
 [[4845  827]
 [ 737 4841]]
Accuracy:
 0.8609777777777777




Model config: hidden_layer_sizes=(500, 250), activation=relu
Report: Classification
               precision    recall  f1-score   support

    positive       0.86      0.85      0.86      5672
    negative       0.85      0.86      0.86      5578

    accuracy                           0.86     11250
   macro avg       0.86      0.86      0.86     11250
weighted avg       0.86      0.86      0.86     11250

Matrix: Confusion
 [[4812  860]
 [ 768 4810]]
Accuracy:
 0.8552888888888889




Model config: hidden_layer_sizes=(1000, 250), activation=tanh
Report: Classification
               precision    recall  f1-score   support

    positive       0.89      0.84      0.86      5672
    negative       0.84      0.89      0.87      5578

    accuracy                           0.87     11250
   macro avg       0.87      0.87      0.87     11250
weighted avg       0.87      0.87      0.87     11250

Matrix: Confusion
 [[4755  917]
 [ 591 4987]]
Accuracy:
 0.8659555555555556




Model config: hidden_layer_sizes=(1000, 250), activation=relu
Report: Classification
               precision    recall  f1-score   support

    positive       0.87      0.85      0.86      5672
    negative       0.85      0.87      0.86      5578

    accuracy                           0.86     11250
   macro avg       0.86      0.86      0.86     11250
weighted avg       0.86      0.86      0.86     11250

Matrix: Confusion
 [[4802  870]
 [ 709 4869]]
Accuracy:
 0.8596444444444444




Model config: hidden_layer_sizes=(500, 250, 250), activation=tanh
Report: Classification
               precision    recall  f1-score   support

    positive       0.88      0.85      0.86      5672
    negative       0.85      0.88      0.87      5578

    accuracy                           0.87     11250
   macro avg       0.87      0.87      0.87     11250
weighted avg       0.87      0.87      0.87     11250

Matrix: Confusion
 [[4826  846]
 [ 662 4916]]
Accuracy:
 0.8659555555555556




Model config: hidden_layer_sizes=(500, 250, 250), activation=relu
Report: Classification
               precision    recall  f1-score   support

    positive       0.86      0.87      0.86      5672
    negative       0.86      0.85      0.86      5578

    accuracy                           0.86     11250
   macro avg       0.86      0.86      0.86     11250
weighted avg       0.86      0.86      0.86     11250

Matrix: Confusion
 [[4909  763]
 [ 826 4752]]
Accuracy:
 0.8587555555555556




Model config: hidden_layer_sizes=(1000, 500, 250), activation=tanh
Report: Classification
               precision    recall  f1-score   support

    positive       0.90      0.83      0.86      5672
    negative       0.84      0.91      0.87      5578

    accuracy                           0.87     11250
   macro avg       0.87      0.87      0.87     11250
weighted avg       0.87      0.87      0.87     11250

Matrix: Confusion
 [[4717  955]
 [ 528 5050]]
Accuracy:
 0.8681777777777778




Model config: hidden_layer_sizes=(1000, 500, 250), activation=relu
Report: Classification
               precision    recall  f1-score   support

    positive       0.89      0.83      0.86      5672
    negative       0.84      0.89      0.86      5578

    accuracy                           0.86     11250
   macro avg       0.86      0.86      0.86     11250
weighted avg       0.86      0.86      0.86     11250

Matrix: Confusion
 [[4714  958]
 [ 607 4971]]
Accuracy:
 0.8608888888888889




Model config: hidden_layer_sizes=(500, 250, 250, 5), activation=tanh
Report: Classification
               precision    recall  f1-score   support

    positive       0.89      0.85      0.87      5672
    negative       0.85      0.90      0.87      5578

    accuracy                           0.87     11250
   macro avg       0.87      0.87      0.87     11250
weighted avg       0.87      0.87      0.87     11250

Matrix: Confusion
 [[4804  868]
 [ 580 4998]]
Accuracy:
 0.8712888888888889




Model config: hidden_layer_sizes=(500, 250, 250, 5), activation=relu
Report: Classification
               precision    recall  f1-score   support

    positive       0.85      0.84      0.85      5672
    negative       0.84      0.85      0.84      5578

    accuracy                           0.85     11250
   macro avg       0.85      0.85      0.85     11250
weighted avg       0.85      0.85      0.85     11250

Matrix: Confusion
 [[4785  887]
 [ 852 4726]]
Accuracy:
 0.8454222222222222


## TD-IDF

In [None]:
vectorizer = TfidfVectorizer(max_features=2000)
X_train_idf = vectorizer.fit_transform(X_train)
X_test_idf = vectorizer.transform(X_test)

In [None]:
tf_idf_f1, tf_idf_mlp = eval_mlp(X_train_idf, X_test_idf, Y_train, Y_test)

## Comparison

In [None]:
print('Best scores:')
print(f'W2V: {w2v_f1} with params: {w2v_mlp.get_params()}')
print(f'BoW: {bow_f1} with params: {bow_mlp.get_params()}')
print(f'Tf-Idf: {tf_idf_f1} with params: {tf_idf_mlp.get_params()}')

idx = np.argmax([w2v_f1, bow_f1, tf_idf_f1])
best_mlp = [w2v_mlp, bow_mlp, tf_idf_mlp][idx]
with open('best.pkl', 'wb') as f:
    pickle.dump(best_mlp, f)
