In [1]:
import os
import pickle

from gensim.models import KeyedVectors
import numpy as np
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split, HalvingGridSearchCV
from sklearn.neural_network import MLPClassifier

from data import preprocess_data, vectorize_data, load_dataset
from evaluation import analysis, evaluate_models_with_data
from w2v_adapter import Word2VecAdapter

import advanced_processor_chain_factory
import simple_processor_chain_factory

In [2]:
# Load the dataset once; each preprocess_data call below reuses it.
dataset = load_dataset()
# Forwarded as the `debug=` argument of preprocess_data in the cells below.
DEBUG = False

# Inspection of Pre-Processing Approaches

In [3]:
# Baseline classifiers compared under each pre-processing variant; the key is
# the display name handed to evaluate_models_with_data.
models = {
    'logistic regression': LogisticRegression(class_weight='balanced', n_jobs=-1),
    'svm': svm.LinearSVC(),
    'knn': KNeighborsClassifier(n_neighbors=8, n_jobs=-1),
}

## Without Pre-Process

In [4]:
# Baseline run: no processor_chain is passed, so preprocess_data uses its default;
# the text is then turned into a 2000-feature bag-of-words and every model is evaluated.
evaluate_models_with_data(models, *vectorize_data(*preprocess_data(dataset, debug=DEBUG), CountVectorizer(max_features=2000)))

------Evaluating logistic regression------
Report: Classification
               precision    recall  f1-score   support

    positive       0.88      0.86      0.87      5679
    negative       0.87      0.88      0.87      5571

    accuracy                           0.87     11250
   macro avg       0.87      0.87      0.87     11250
weighted avg       0.87      0.87      0.87     11250

Matrix: Confusion
 [[4911  768]
 [ 642 4929]]
Accuracy:
 0.8746666666666667
------Evaluating svm------




Report: Classification
               precision    recall  f1-score   support

    positive       0.91      0.81      0.86      5679
    negative       0.83      0.91      0.87      5571

    accuracy                           0.86     11250
   macro avg       0.87      0.86      0.86     11250
weighted avg       0.87      0.86      0.86     11250

Matrix: Confusion
 [[4618 1061]
 [ 474 5097]]
Accuracy:
 0.8635555555555555
------Evaluating knn------
Report: Classification
               precision    recall  f1-score   support

    positive       0.65      0.64      0.65      5679
    negative       0.64      0.65      0.65      5571

    accuracy                           0.65     11250
   macro avg       0.65      0.65      0.65     11250
weighted avg       0.65      0.65      0.65     11250

Matrix: Confusion
 [[3645 2034]
 [1930 3641]]
Accuracy:
 0.6476444444444445


## Simple Pre-Process

In [5]:
# Same evaluation as the baseline, but the text first goes through the
# processor chain built by simple_processor_chain_factory.
evaluate_models_with_data(models,
                          *vectorize_data(
                              *preprocess_data(dataset, processor_chain=simple_processor_chain_factory.create(), debug=DEBUG),
                              CountVectorizer(max_features=2000)))

Pandas Apply:   0%|          | 0/45000 [00:00<?, ?it/s]

------Evaluating logistic regression------
Report: Classification
               precision    recall  f1-score   support

    positive       0.88      0.87      0.87      5672
    negative       0.87      0.88      0.87      5578

    accuracy                           0.87     11250
   macro avg       0.87      0.87      0.87     11250
weighted avg       0.87      0.87      0.87     11250

Matrix: Confusion
 [[4925  747]
 [ 686 4892]]
Accuracy:
 0.8726222222222222
------Evaluating svm------




Report: Classification
               precision    recall  f1-score   support

    positive       0.90      0.82      0.86      5672
    negative       0.83      0.91      0.87      5578

    accuracy                           0.86     11250
   macro avg       0.87      0.86      0.86     11250
weighted avg       0.87      0.86      0.86     11250

Matrix: Confusion
 [[4658 1014]
 [ 509 5069]]
Accuracy:
 0.8646222222222222
------Evaluating knn------
Report: Classification
               precision    recall  f1-score   support

    positive       0.66      0.60      0.63      5672
    negative       0.63      0.68      0.65      5578

    accuracy                           0.64     11250
   macro avg       0.64      0.64      0.64     11250
weighted avg       0.64      0.64      0.64     11250

Matrix: Confusion
 [[3413 2259]
 [1765 3813]]
Accuracy:
 0.6423111111111112


## Pre-Process with Stemming

In [6]:
# Same evaluation again, using the advanced processor chain created with the
# 'stem' option (the stemming variant named in this section's heading).
evaluate_models_with_data(models,
                          *vectorize_data(
                              *preprocess_data(dataset, processor_chain=advanced_processor_chain_factory.create('stem'),
                                               debug=DEBUG),
                              CountVectorizer(max_features=2000)))

Pandas Apply:   0%|          | 0/45000 [00:00<?, ?it/s]

------Evaluating logistic regression------
Report: Classification
               precision    recall  f1-score   support

    positive       0.87      0.86      0.86      5623
    negative       0.86      0.87      0.86      5627

    accuracy                           0.86     11250
   macro avg       0.86      0.86      0.86     11250
weighted avg       0.86      0.86      0.86     11250

Matrix: Confusion
 [[4845  778]
 [ 748 4879]]
Accuracy:
 0.8643555555555555
------Evaluating svm------




Report: Classification
               precision    recall  f1-score   support

    positive       0.87      0.87      0.87      5623
    negative       0.87      0.87      0.87      5627

    accuracy                           0.87     11250
   macro avg       0.87      0.87      0.87     11250
weighted avg       0.87      0.87      0.87     11250

Matrix: Confusion
 [[4870  753]
 [ 745 4882]]
Accuracy:
 0.8668444444444444
------Evaluating knn------
Report: Classification
               precision    recall  f1-score   support

    positive       0.63      0.80      0.70      5623
    negative       0.72      0.54      0.62      5627

    accuracy                           0.67     11250
   macro avg       0.68      0.67      0.66     11250
weighted avg       0.68      0.67      0.66     11250

Matrix: Confusion
 [[4474 1149]
 [2609 3018]]
Accuracy:
 0.6659555555555555


## Pre-Process with Lemmatization

In [7]:
# Pre-process with the 'lem' chain and split once; the resulting train/test
# partitions are reused by every later section of the notebook.
X, Y = preprocess_data(dataset, processor_chain=advanced_processor_chain_factory.create('lem'), debug=DEBUG)
# Seed the split: all downstream scores are computed on it, so it has to be the
# same after a kernel restart for the BoW / W2V / TF-IDF comparisons to be reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42)
# Drop the reference to the raw dataset to free memory (X and Y are kept).
del dataset

Pandas Apply:   0%|          | 0/45000 [00:00<?, ?it/s]

In [8]:
# Bag-of-words features: the vocabulary (capped at 2000 terms) is fitted on the
# training split only and then applied unchanged to the test split.
vectorizer = CountVectorizer(max_features=2000)
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

In [9]:
# Evaluate the baseline models on the lemmatised bag-of-words features.
evaluate_models_with_data(models, X_train_bow, X_test_bow, Y_train, Y_test)

------Evaluating logistic regression------
Report: Classification
               precision    recall  f1-score   support

    positive       0.87      0.86      0.87      5668
    negative       0.86      0.87      0.87      5582

    accuracy                           0.87     11250
   macro avg       0.87      0.87      0.87     11250
weighted avg       0.87      0.87      0.87     11250

Matrix: Confusion
 [[4900  768]
 [ 706 4876]]
Accuracy:
 0.8689777777777777
------Evaluating svm------




Report: Classification
               precision    recall  f1-score   support

    positive       0.88      0.86      0.87      5668
    negative       0.86      0.88      0.87      5582

    accuracy                           0.87     11250
   macro avg       0.87      0.87      0.87     11250
weighted avg       0.87      0.87      0.87     11250

Matrix: Confusion
 [[4858  810]
 [ 669 4913]]
Accuracy:
 0.8685333333333334
------Evaluating knn------
Report: Classification
               precision    recall  f1-score   support

    positive       0.63      0.82      0.71      5668
    negative       0.74      0.51      0.60      5582

    accuracy                           0.67     11250
   macro avg       0.68      0.67      0.66     11250
weighted avg       0.68      0.67      0.66     11250

Matrix: Confusion
 [[4675  993]
 [2760 2822]]
Accuracy:
 0.6664


In [10]:
# Release the baseline estimators; the sections below construct their own models.
del models

# Compare W2V and BoW with Their Best Tuned Hyper-parameters

In [11]:
# Cross-validation settings shared by the GridSearchCV runs below
# (unpacked into the constructor as keyword arguments).
kfold = StratifiedKFold(n_splits=5)
general_grid_params = dict(verbose=1, cv=kfold, n_jobs=-1, scoring='f1')

# Logistic regression: only the regularisation strength C varies; the other
# entries are single-valued and simply pin the estimator's settings.
logistic_grid = dict(
    penalty=['l2'],
    C=[1, 300, 500, 700, 900, 2000],
    class_weight=['balanced'],
    solver=['saga'],
    n_jobs=[-1],
    max_iter=[1000],
)

# SVC: kernel type crossed with C.
svc_grid = dict(
    kernel=['linear', 'rbf'],
    C=[0.1, 1, 500, 1000],
)

# k-NN: odd neighbour counts 1, 3, ..., 23.
knn_grid = dict(
    n_neighbors=list(range(1, 24, 2)),
    n_jobs=[-1],
)

## BoW

### Logistic Regression

In [12]:
# Grid-search logistic regression on the bag-of-words features using the
# shared CV settings, then report the winning score and configuration.
bow_log = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=logistic_grid,
    **general_grid_params,
)
bow_log.fit(X_train_bow, Y_train)
print(f'Best Score: {bow_log.best_score_}')
print(f'Best Params: {bow_log.best_params_}')

Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best Score: 0.8645317545309833
Best Params: {'C': 1, 'class_weight': 'balanced', 'max_iter': 1000, 'n_jobs': -1, 'penalty': 'l2', 'solver': 'saga'}




In [13]:
# Score the tuned BoW logistic regression on the held-out test split.
analysis(Y_test, bow_log.predict(X_test_bow))

Report: Classification
               precision    recall  f1-score   support

    positive       0.87      0.86      0.87      5668
    negative       0.86      0.87      0.87      5582

    accuracy                           0.87     11250
   macro avg       0.87      0.87      0.87     11250
weighted avg       0.87      0.87      0.87     11250

Matrix: Confusion
 [[4896  772]
 [ 710 4872]]
Accuracy:
 0.8682666666666666


0.8679850347407803

### SVM

In [14]:
# Successive-halving search over the SVC grid on the bag-of-words features.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so no
# fitted result is captured in the notebook output.
bow_svm = HalvingGridSearchCV(
    estimator=svm.SVC(),
    param_grid=svc_grid,
    cv=4,
    n_jobs=-1,
    scoring='f1',
    factor=2,
)
bow_svm.fit(X_train_bow, Y_train)
print(f'Best Score: {bow_svm.best_score_}')
print(f'Best Params: {bow_svm.best_params_}')

KeyboardInterrupt: 

In [None]:
# Score the tuned BoW SVM on the held-out test split.
analysis(Y_test, bow_svm.predict(X_test_bow))

### KNN

In [None]:
# Grid-search the number of neighbours for k-NN on the bag-of-words features
# using the shared CV settings.
bow_knn = KNeighborsClassifier()
bow_knn = GridSearchCV(estimator=bow_knn, param_grid=knn_grid, **general_grid_params)
bow_knn.fit(X_train_bow, Y_train)
# Report this search's own results (bow_knn, not the SVM search from the previous section).
print(f'Best Score: {bow_knn.best_score_}')
print(f'Best Params: {bow_knn.best_params_}')

In [None]:
# Score the tuned BoW k-NN search (bow_knn) on the held-out test split.
analysis(Y_test, bow_knn.predict(X_test_bow))


## W2V

In [15]:
# Build the Word2Vec adapter: start from the cached vectors in 'w2v.kv' when
# that file exists, otherwise construct the adapter without a pre-trained model.
if not os.path.isfile('w2v.kv'):
    vectorizer = Word2VecAdapter()
else:
    vectorizer = Word2VecAdapter(pre_trained_model=KeyedVectors.load('w2v.kv'))

X_train_w2v = vectorizer.fit_transform(X_train)
X_test_w2v = vectorizer.transform(X_test)

# Write the vectors to disk only when no cache file is present yet.
if not os.path.isfile('w2v.kv'):
    vectorizer.wv.save('w2v.kv')

Pandas Apply:   0%|          | 0/33750 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/33750 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/11250 [00:00<?, ?it/s]

### Logistic Regression

In [None]:
# Untuned reference point: logistic regression with default hyper-parameters
# on the Word2Vec features, evaluated on the test split.
w2v_log = LogisticRegression(n_jobs=-1)
w2v_log.fit(X_train_w2v, Y_train)
analysis(Y_test, w2v_log.predict(X_test_w2v))

In [None]:
# Grid-search logistic regression on the Word2Vec features using the shared
# CV settings, then report the winning score and configuration.
w2v_log = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=logistic_grid,
    **general_grid_params,
)
w2v_log.fit(X_train_w2v, Y_train)
print(f'Best Score: {w2v_log.best_score_}')
print(f'Best Params: {w2v_log.best_params_}')

In [None]:
# Score the tuned W2V logistic regression on the held-out test split.
analysis(Y_test, w2v_log.predict(X_test_w2v))

### SVM

In [None]:
# Successive-halving search over the SVC grid on the Word2Vec features.
w2v_svm = HalvingGridSearchCV(
    estimator=svm.SVC(),
    param_grid=svc_grid,
    cv=4,
    n_jobs=-1,
    scoring='f1',
    factor=2,
)
w2v_svm.fit(X_train_w2v, Y_train)
print(f'Best Score: {w2v_svm.best_score_}')
print(f'Best Params: {w2v_svm.best_params_}')

In [None]:
# Score the tuned W2V SVM on the held-out test split.
analysis(Y_test, w2v_svm.predict(X_test_w2v))

### KNN

In [None]:
# Grid-search the number of neighbours for k-NN on the Word2Vec features
# using the shared CV settings.
w2v_knn = KNeighborsClassifier()
w2v_knn = GridSearchCV(estimator=w2v_knn, param_grid=knn_grid, **general_grid_params)
w2v_knn.fit(X_train_w2v, Y_train)
# Report this search's own results (w2v_knn, not the SVM search from the previous section).
print(f'Best Score: {w2v_knn.best_score_}')
print(f'Best Params: {w2v_knn.best_params_}')

In [None]:
# Score the tuned W2V k-NN search (w2v_knn) on the held-out test split.
analysis(Y_test, w2v_knn.predict(X_test_w2v))

## Comparison

In [None]:
# Pair the fitted searches per classifier so the BoW and W2V results can be
# compared side by side.
summary = {
    'LR': dict(BoW=bow_log, W2V=w2v_log),
    'SVM': dict(BoW=bow_svm, W2V=w2v_svm),
    'KNN': dict(BoW=bow_knn, W2V=w2v_knn),
}

for name, values in summary.items():
    bow_score = values['BoW'].best_score_
    w2v_score = values['W2V'].best_score_
    print(f'For classifier {name}, best BoW score is {bow_score}, whereas best W2V score is {w2v_score}')
    # Strict '>' comparison: on a tie the W2V search is reported as the winner.
    best_model = 'BoW' if bow_score > w2v_score else 'W2V'
    winner = values[best_model]
    print(f'So {best_model} is better with parameters {winner.best_params_}')
    # Persist the winning search object as '<classifier name>.pkl'.
    filename = name + '.pkl'
    with open(filename, 'wb') as f:
        pickle.dump(winner, f)

# The search objects are no longer needed once the winners are on disk.
del summary, bow_log, w2v_log, bow_svm, w2v_svm, bow_knn, w2v_knn

# MLP

In [16]:
# Candidate MLP configurations: eval_mlp trains one network for every
# (hidden_layer_sizes, activation) combination listed here.
mlp_grid = {
    'hidden_layer_sizes': [(500, 250), (1000, 250), (500, 250, 250), (1000, 500, 250), (500, 250, 250, 5),
                           (1000, 500, 250, 5)],
    'activation':['tanh', 'relu']
}
def eval_mlp(X_train, X_test, Y_train, Y_test):
    """Train one MLP per configuration in ``mlp_grid`` and keep the best one.

    Every (hidden_layer_sizes, activation) combination is fitted on the
    training split. The value returned by ``analysis`` for the predictions on
    ``X_test`` is used as the F1 score to maximise.

    NOTE(review): the winner is selected on the test split itself, so the
    returned score is an optimistic estimate; confirm this is intended.

    Returns:
        (best_f1, best_model): the highest score seen and the fitted
        classifier that achieved it. ``best_model`` stays ``None`` if no
        candidate scores above -1.
    """
    # Flatten the grid up front; layer sizes vary slowest, activations fastest.
    configs = [
        (sizes, act)
        for sizes in mlp_grid['hidden_layer_sizes']
        for act in mlp_grid['activation']
    ]

    best_f1, best_model = -1, None
    for sizes, act in configs:
        candidate = MLPClassifier(
            hidden_layer_sizes=sizes,
            activation=act,
            solver='sgd',
            alpha=1,
            learning_rate='adaptive',
            max_iter=10,
        )
        candidate.fit(X_train, Y_train)
        print(f'Model config: hidden_layer_sizes={sizes}, activation={act}')
        score = analysis(Y_test, candidate.predict(X_test))
        # Strict '>' keeps the earliest configuration when scores tie.
        if score > best_f1:
            best_f1, best_model = score, candidate
    return best_f1, best_model

## W2V

In [17]:
# Run the MLP configuration sweep on the Word2Vec features.
w2v_f1, w2v_mlp = eval_mlp(X_train_w2v, X_test_w2v, Y_train, Y_test)



Model config: hidden_layer_sizes=(500, 250), activation=tanh
Report: Classification
               precision    recall  f1-score   support

    positive       0.86      0.85      0.85      5668
    negative       0.85      0.85      0.85      5582

    accuracy                           0.85     11250
   macro avg       0.85      0.85      0.85     11250
weighted avg       0.85      0.85      0.85     11250

Matrix: Confusion
 [[4825  843]
 [ 816 4766]]
Accuracy:
 0.8525333333333334




Model config: hidden_layer_sizes=(500, 250), activation=relu
Report: Classification
               precision    recall  f1-score   support

    positive       0.85      0.84      0.85      5668
    negative       0.84      0.85      0.85      5582

    accuracy                           0.85     11250
   macro avg       0.85      0.85      0.85     11250
weighted avg       0.85      0.85      0.85     11250

Matrix: Confusion
 [[4783  885]
 [ 816 4766]]
Accuracy:
 0.8488




Model config: hidden_layer_sizes=(1000, 250), activation=tanh
Report: Classification
               precision    recall  f1-score   support

    positive       0.85      0.86      0.86      5668
    negative       0.85      0.85      0.85      5582

    accuracy                           0.85     11250
   macro avg       0.85      0.85      0.85     11250
weighted avg       0.85      0.85      0.85     11250

Matrix: Confusion
 [[4863  805]
 [ 843 4739]]
Accuracy:
 0.8535111111111111




Model config: hidden_layer_sizes=(1000, 250), activation=relu
Report: Classification
               precision    recall  f1-score   support

    positive       0.85      0.84      0.85      5668
    negative       0.84      0.85      0.85      5582

    accuracy                           0.85     11250
   macro avg       0.85      0.85      0.85     11250
weighted avg       0.85      0.85      0.85     11250

Matrix: Confusion
 [[4766  902]
 [ 815 4767]]
Accuracy:
 0.8473777777777778




Model config: hidden_layer_sizes=(500, 250, 250), activation=tanh
Report: Classification
               precision    recall  f1-score   support

    positive       0.86      0.85      0.86      5668
    negative       0.85      0.86      0.86      5582

    accuracy                           0.86     11250
   macro avg       0.86      0.86      0.86     11250
weighted avg       0.86      0.86      0.86     11250

Matrix: Confusion
 [[4843  825]
 [ 794 4788]]
Accuracy:
 0.8560888888888889




Model config: hidden_layer_sizes=(500, 250, 250), activation=relu
Report: Classification
               precision    recall  f1-score   support

    positive       0.86      0.84      0.85      5668
    negative       0.84      0.87      0.85      5582

    accuracy                           0.85     11250
   macro avg       0.85      0.85      0.85     11250
weighted avg       0.85      0.85      0.85     11250

Matrix: Confusion
 [[4750  918]
 [ 749 4833]]
Accuracy:
 0.8518222222222223




Model config: hidden_layer_sizes=(1000, 500, 250), activation=tanh
Report: Classification
               precision    recall  f1-score   support

    positive       0.87      0.84      0.85      5668
    negative       0.84      0.87      0.86      5582

    accuracy                           0.85     11250
   macro avg       0.86      0.85      0.85     11250
weighted avg       0.86      0.85      0.85     11250

Matrix: Confusion
 [[4774  894]
 [ 739 4843]]
Accuracy:
 0.8548444444444444




Model config: hidden_layer_sizes=(1000, 500, 250), activation=relu
Report: Classification
               precision    recall  f1-score   support

    positive       0.86      0.84      0.85      5668
    negative       0.85      0.86      0.86      5582

    accuracy                           0.85     11250
   macro avg       0.85      0.85      0.85     11250
weighted avg       0.85      0.85      0.85     11250

Matrix: Confusion
 [[4789  879]
 [ 756 4826]]
Accuracy:
 0.8546666666666667


## BoW

In [18]:
# Run the MLP configuration sweep on the bag-of-words features.
bow_f1, bow_mlp = eval_mlp(X_train_bow, X_test_bow, Y_train, Y_test)



Model config: hidden_layer_sizes=(500, 250), activation=tanh
Report: Classification
               precision    recall  f1-score   support

    positive       0.87      0.86      0.86      5668
    negative       0.86      0.87      0.86      5582

    accuracy                           0.86     11250
   macro avg       0.86      0.86      0.86     11250
weighted avg       0.86      0.86      0.86     11250

Matrix: Confusion
 [[4853  815]
 [ 753 4829]]
Accuracy:
 0.8606222222222222




Model config: hidden_layer_sizes=(500, 250), activation=relu
Report: Classification
               precision    recall  f1-score   support

    positive       0.87      0.82      0.85      5668
    negative       0.83      0.88      0.85      5582

    accuracy                           0.85     11250
   macro avg       0.85      0.85      0.85     11250
weighted avg       0.85      0.85      0.85     11250

Matrix: Confusion
 [[4665 1003]
 [ 683 4899]]
Accuracy:
 0.8501333333333333




Model config: hidden_layer_sizes=(1000, 250), activation=tanh
Report: Classification
               precision    recall  f1-score   support

    positive       0.88      0.85      0.86      5668
    negative       0.85      0.88      0.86      5582

    accuracy                           0.86     11250
   macro avg       0.86      0.86      0.86     11250
weighted avg       0.86      0.86      0.86     11250

Matrix: Confusion
 [[4804  864]
 [ 682 4900]]
Accuracy:
 0.8625777777777778




Model config: hidden_layer_sizes=(1000, 250), activation=relu
Report: Classification
               precision    recall  f1-score   support

    positive       0.89      0.81      0.85      5668
    negative       0.82      0.89      0.86      5582

    accuracy                           0.85     11250
   macro avg       0.86      0.85      0.85     11250
weighted avg       0.86      0.85      0.85     11250

Matrix: Confusion
 [[4611 1057]
 [ 599 4983]]
Accuracy:
 0.8528




Model config: hidden_layer_sizes=(500, 250, 250), activation=tanh
Report: Classification
               precision    recall  f1-score   support

    positive       0.88      0.84      0.86      5668
    negative       0.85      0.89      0.87      5582

    accuracy                           0.86     11250
   macro avg       0.87      0.86      0.86     11250
weighted avg       0.87      0.86      0.86     11250

Matrix: Confusion
 [[4772  896]
 [ 629 4953]]
Accuracy:
 0.8644444444444445




Model config: hidden_layer_sizes=(500, 250, 250), activation=relu
Report: Classification
               precision    recall  f1-score   support

    positive       0.86      0.85      0.86      5668
    negative       0.85      0.86      0.86      5582

    accuracy                           0.86     11250
   macro avg       0.86      0.86      0.86     11250
weighted avg       0.86      0.86      0.86     11250

Matrix: Confusion
 [[4818  850]
 [ 772 4810]]
Accuracy:
 0.8558222222222223




Model config: hidden_layer_sizes=(1000, 500, 250), activation=tanh
Report: Classification
               precision    recall  f1-score   support

    positive       0.88      0.87      0.87      5668
    negative       0.87      0.88      0.87      5582

    accuracy                           0.87     11250
   macro avg       0.87      0.87      0.87     11250
weighted avg       0.87      0.87      0.87     11250

Matrix: Confusion
 [[4906  762]
 [ 683 4899]]
Accuracy:
 0.8715555555555555




Model config: hidden_layer_sizes=(1000, 500, 250), activation=relu
Report: Classification
               precision    recall  f1-score   support

    positive       0.87      0.85      0.86      5668
    negative       0.85      0.87      0.86      5582

    accuracy                           0.86     11250
   macro avg       0.86      0.86      0.86     11250
weighted avg       0.86      0.86      0.86     11250

Matrix: Confusion
 [[4802  866]
 [ 730 4852]]
Accuracy:
 0.8581333333333333


## TF-IDF

In [19]:
# TF-IDF features: the vocabulary (capped at 2000 terms) and IDF weights are
# fitted on the training split only, then applied to the test split.
vectorizer = TfidfVectorizer(max_features=2000)
X_train_idf = vectorizer.fit_transform(X_train)
X_test_idf = vectorizer.transform(X_test)

In [20]:
# Run the MLP configuration sweep on the TF-IDF features.
tf_idf_f1, tf_idf_mlp = eval_mlp(X_train_idf, X_test_idf, Y_train, Y_test)



Model config: hidden_layer_sizes=(500, 250), activation=tanh
Report: Classification
               precision    recall  f1-score   support

    positive       0.76      0.81      0.78      5668
    negative       0.79      0.73      0.76      5582

    accuracy                           0.77     11250
   macro avg       0.77      0.77      0.77     11250
weighted avg       0.77      0.77      0.77     11250

Matrix: Confusion
 [[4572 1096]
 [1480 4102]]
Accuracy:
 0.7710222222222223




Model config: hidden_layer_sizes=(500, 250), activation=relu
Report: Classification
               precision    recall  f1-score   support

    positive       0.77      0.65      0.70      5668
    negative       0.69      0.80      0.74      5582

    accuracy                           0.72     11250
   macro avg       0.73      0.72      0.72     11250
weighted avg       0.73      0.72      0.72     11250

Matrix: Confusion
 [[3673 1995]
 [1112 4470]]
Accuracy:
 0.7238222222222223




Model config: hidden_layer_sizes=(1000, 250), activation=tanh
Report: Classification
               precision    recall  f1-score   support

    positive       0.77      0.81      0.79      5668
    negative       0.80      0.76      0.78      5582

    accuracy                           0.79     11250
   macro avg       0.79      0.79      0.79     11250
weighted avg       0.79      0.79      0.79     11250

Matrix: Confusion
 [[4615 1053]
 [1347 4235]]
Accuracy:
 0.7866666666666666




Model config: hidden_layer_sizes=(1000, 250), activation=relu
Report: Classification
               precision    recall  f1-score   support

    positive       0.75      0.74      0.75      5668
    negative       0.74      0.75      0.75      5582

    accuracy                           0.75     11250
   macro avg       0.75      0.75      0.75     11250
weighted avg       0.75      0.75      0.75     11250

Matrix: Confusion
 [[4214 1454]
 [1399 4183]]
Accuracy:
 0.7464




Model config: hidden_layer_sizes=(500, 250, 250), activation=tanh
Report: Classification
               precision    recall  f1-score   support

    positive       0.76      0.81      0.79      5668
    negative       0.80      0.75      0.77      5582

    accuracy                           0.78     11250
   macro avg       0.78      0.78      0.78     11250
weighted avg       0.78      0.78      0.78     11250

Matrix: Confusion
 [[4604 1064]
 [1415 4167]]
Accuracy:
 0.7796444444444445




Model config: hidden_layer_sizes=(500, 250, 250), activation=relu
Report: Classification
               precision    recall  f1-score   support

    positive       0.69      0.70      0.69      5668
    negative       0.69      0.68      0.68      5582

    accuracy                           0.69     11250
   macro avg       0.69      0.69      0.69     11250
weighted avg       0.69      0.69      0.69     11250

Matrix: Confusion
 [[3966 1702]
 [1808 3774]]
Accuracy:
 0.688




Model config: hidden_layer_sizes=(1000, 500, 250), activation=tanh
Report: Classification
               precision    recall  f1-score   support

    positive       0.84      0.76      0.80      5668
    negative       0.78      0.85      0.81      5582

    accuracy                           0.81     11250
   macro avg       0.81      0.81      0.81     11250
weighted avg       0.81      0.81      0.81     11250

Matrix: Confusion
 [[4327 1341]
 [ 844 4738]]
Accuracy:
 0.8057777777777778




Model config: hidden_layer_sizes=(1000, 500, 250), activation=relu
Report: Classification
               precision    recall  f1-score   support

    positive       0.75      0.64      0.69      5668
    negative       0.68      0.79      0.73      5582

    accuracy                           0.71     11250
   macro avg       0.72      0.72      0.71     11250
weighted avg       0.72      0.71      0.71     11250

Matrix: Confusion
 [[3640 2028]
 [1184 4398]]
Accuracy:
 0.7144888888888888


## Comparison

In [21]:
# Report the best MLP found for each text representation.
print('Best scores:')
print(f'W2V: {w2v_f1} with params: {w2v_mlp.get_params()}')
print(f'BoW: {bow_f1} with params: {bow_mlp.get_params()}')
print(f'Tf-Idf: {tf_idf_f1} with params: {tf_idf_mlp.get_params()}')

# Pick the representation with the highest score (argmax returns the first
# maximum, so ties resolve in W2V, BoW, TF-IDF order) and persist its model.
candidates = [(w2v_f1, w2v_mlp), (bow_f1, bow_mlp), (tf_idf_f1, tf_idf_mlp)]
idx = np.argmax([score for score, _ in candidates])
best_mlp = candidates[idx][1]
with open('best.pkl', 'wb') as f:
    pickle.dump(best_mlp, f)


Best scores:
W2V: 0.8557293047088965 with params: {'activation': 'tanh', 'alpha': 1, 'batch_size': 'auto', 'beta_1': 0.9, 'beta_2': 0.999, 'early_stopping': False, 'epsilon': 1e-08, 'hidden_layer_sizes': (1000, 500, 250), 'learning_rate': 'adaptive', 'learning_rate_init': 0.001, 'max_fun': 15000, 'max_iter': 10, 'momentum': 0.9, 'n_iter_no_change': 10, 'nesterovs_momentum': True, 'power_t': 0.5, 'random_state': None, 'shuffle': True, 'solver': 'sgd', 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': False, 'warm_start': False}
BoW: 0.8714755848083253 with params: {'activation': 'tanh', 'alpha': 1, 'batch_size': 'auto', 'beta_1': 0.9, 'beta_2': 0.999, 'early_stopping': False, 'epsilon': 1e-08, 'hidden_layer_sizes': (1000, 500, 250), 'learning_rate': 'adaptive', 'learning_rate_init': 0.001, 'max_fun': 15000, 'max_iter': 10, 'momentum': 0.9, 'n_iter_no_change': 10, 'nesterovs_momentum': True, 'power_t': 0.5, 'random_state': None, 'shuffle': True, 'solver': 'sgd', 'tol': 0.0001, 'valid