In [1]:
import os
import pickle
import pandas as pd
from gensim.models import KeyedVectors
import numpy as np
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split, HalvingGridSearchCV
from sklearn.neural_network import MLPClassifier

from data import preprocess_data, vectorize_data, load_dataset
from evaluation import analysis, evaluate_models_with_data
from w2v_adapter import Word2VecAdapter

import advanced_processor_chain_factory
import simple_processor_chain_factory

In [2]:
dataset = load_dataset()
DEBUG = False    

# Inspection of Pre-Processing Approaches

In [3]:
models = {'logistic regression' : LogisticRegression(class_weight = 'balanced', n_jobs=-1),
          'svm' : svm.LinearSVC(),
          'knn' : KNeighborsClassifier(n_neighbors=8, n_jobs=-1)
         }

## Without Pre-Process

In [4]:
evaluate_models_with_data(models, *vectorize_data(*preprocess_data(dataset, debug=DEBUG), CountVectorizer(max_features=2000)))

------Evaluating logistic regression------
Report: Classification
               precision    recall  f1-score   support

    positive       0.88      0.87      0.87      5586
    negative       0.87      0.89      0.88      5664

    accuracy                           0.88     11250
   macro avg       0.88      0.88      0.88     11250
weighted avg       0.88      0.88      0.88     11250

Matrix: Confusion
 [[4842  744]
 [ 640 5024]]
Accuracy:
 0.8769777777777777
------Evaluating svm------




Report: Classification
               precision    recall  f1-score   support

    positive       0.86      0.87      0.87      5586
    negative       0.87      0.86      0.87      5664

    accuracy                           0.87     11250
   macro avg       0.87      0.87      0.87     11250
weighted avg       0.87      0.87      0.87     11250

Matrix: Confusion
 [[4884  702]
 [ 774 4890]]
Accuracy:
 0.8688
------Evaluating knn------
Report: Classification
               precision    recall  f1-score   support

    positive       0.64      0.64      0.64      5586
    negative       0.64      0.64      0.64      5664

    accuracy                           0.64     11250
   macro avg       0.64      0.64      0.64     11250
weighted avg       0.64      0.64      0.64     11250

Matrix: Confusion
 [[3561 2025]
 [2017 3647]]
Accuracy:
 0.6407111111111111


## Simple Pre-Process

In [5]:
evaluate_models_with_data(models,
                          *vectorize_data(
                              *preprocess_data(dataset, processor_chain=simple_processor_chain_factory.create(), debug=DEBUG),
                              CountVectorizer(max_features=2000)))

Pandas Apply:   0%|          | 0/45000 [00:00<?, ?it/s]

------Evaluating logistic regression------
Report: Classification
               precision    recall  f1-score   support

    positive       0.87      0.87      0.87      5673
    negative       0.86      0.87      0.87      5577

    accuracy                           0.87     11250
   macro avg       0.87      0.87      0.87     11250
weighted avg       0.87      0.87      0.87     11250

Matrix: Confusion
 [[4914  759]
 [ 717 4860]]
Accuracy:
 0.8688
------Evaluating svm------




Report: Classification
               precision    recall  f1-score   support

    positive       0.89      0.84      0.86      5673
    negative       0.85      0.89      0.87      5577

    accuracy                           0.87     11250
   macro avg       0.87      0.87      0.87     11250
weighted avg       0.87      0.87      0.87     11250

Matrix: Confusion
 [[4765  908]
 [ 602 4975]]
Accuracy:
 0.8657777777777778
------Evaluating knn------
Report: Classification
               precision    recall  f1-score   support

    positive       0.65      0.62      0.63      5673
    negative       0.63      0.65      0.64      5577

    accuracy                           0.64     11250
   macro avg       0.64      0.64      0.64     11250
weighted avg       0.64      0.64      0.64     11250

Matrix: Confusion
 [[3525 2148]
 [1939 3638]]
Accuracy:
 0.6367111111111111


## Pre-Process with Stemmimg

In [6]:
evaluate_models_with_data(models,
                          *vectorize_data(
                              *preprocess_data(dataset, processor_chain=advanced_processor_chain_factory.create('stem'),
                                               debug=DEBUG),
                              CountVectorizer(max_features=2000)))

Pandas Apply:   0%|          | 0/45000 [00:00<?, ?it/s]

------Evaluating logistic regression------
Report: Classification
               precision    recall  f1-score   support

    positive       0.87      0.87      0.87      5601
    negative       0.87      0.88      0.87      5649

    accuracy                           0.87     11250
   macro avg       0.87      0.87      0.87     11250
weighted avg       0.87      0.87      0.87     11250

Matrix: Confusion
 [[4854  747]
 [ 702 4947]]
Accuracy:
 0.8712
------Evaluating svm------




Report: Classification
               precision    recall  f1-score   support

    positive       0.88      0.86      0.87      5601
    negative       0.86      0.88      0.87      5649

    accuracy                           0.87     11250
   macro avg       0.87      0.87      0.87     11250
weighted avg       0.87      0.87      0.87     11250

Matrix: Confusion
 [[4818  783]
 [ 674 4975]]
Accuracy:
 0.8704888888888889
------Evaluating knn------
Report: Classification
               precision    recall  f1-score   support

    positive       0.63      0.81      0.71      5601
    negative       0.73      0.52      0.61      5649

    accuracy                           0.66     11250
   macro avg       0.68      0.67      0.66     11250
weighted avg       0.68      0.66      0.66     11250

Matrix: Confusion
 [[4523 1078]
 [2697 2952]]
Accuracy:
 0.6644444444444444


## Pre-Process with Lemmitization

In [7]:
X, Y = preprocess_data(dataset, processor_chain=advanced_processor_chain_factory.create('lem'), debug=DEBUG)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y)
del dataset

Pandas Apply:   0%|          | 0/45000 [00:00<?, ?it/s]

In [8]:
vectorizer = CountVectorizer(max_features=2000)
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

In [9]:
evaluate_models_with_data(models, X_train_bow, X_test_bow, Y_train, Y_test)

------Evaluating logistic regression------
Report: Classification
               precision    recall  f1-score   support

    positive       0.88      0.86      0.87      5688
    negative       0.86      0.88      0.87      5562

    accuracy                           0.87     11250
   macro avg       0.87      0.87      0.87     11250
weighted avg       0.87      0.87      0.87     11250

Matrix: Confusion
 [[4865  823]
 [ 660 4902]]
Accuracy:
 0.8681777777777778
------Evaluating svm------




Report: Classification
               precision    recall  f1-score   support

    positive       0.88      0.85      0.86      5688
    negative       0.85      0.88      0.87      5562

    accuracy                           0.87     11250
   macro avg       0.87      0.87      0.87     11250
weighted avg       0.87      0.87      0.87     11250

Matrix: Confusion
 [[4847  841]
 [ 676 4886]]
Accuracy:
 0.8651555555555556
------Evaluating knn------
Report: Classification
               precision    recall  f1-score   support

    positive       0.63      0.81      0.71      5688
    negative       0.73      0.51      0.60      5562

    accuracy                           0.66     11250
   macro avg       0.68      0.66      0.66     11250
weighted avg       0.68      0.66      0.66     11250

Matrix: Confusion
 [[4632 1056]
 [2726 2836]]
Accuracy:
 0.6638222222222222


In [10]:
del models

# Compare W2V and BoW with Their Best Tuned Hyper-parameters

In [23]:
kfold = StratifiedKFold(n_splits=5)
general_grid_params = {'verbose' : 1, 'cv' : kfold, 'n_jobs' : -1, 'scoring' : 'f1'}

logistic_grid = {
    'penalty':['l2'],
    'C':[1, 300, 500, 700, 900, 2000],
    'class_weight':['balanced'],
    'solver':['saga'],
    'n_jobs':[-1],
    'max_iter':[1000],
}

svc_grid = {
    'kernel' : ['linear', 'rbf'],
    'C':[0.1, 1, 500, 1000],
}

knn_grid = {
    'n_neighbors' : [1, 100, 300, 500, 700, 900, 2000],
    'n_jobs' : [-1]
}

## BoW

### Logistic Regression

In [12]:
bow_log = LogisticRegression()
bow_log = GridSearchCV(estimator=bow_log, param_grid=logistic_grid, **general_grid_params)
bow_log.fit(X_train_bow, Y_train)
print(f'Best Score: {bow_log.best_score_}')
print(f'Best Params: {bow_log.best_params_}')
pd.DataFrame(bow_log.cv_results_)

Fitting 5 folds for each of 6 candidates, totalling 30 fits




Best Score: 0.8657995144389071
Best Params: {'C': 1, 'class_weight': 'balanced', 'max_iter': 1000, 'n_jobs': -1, 'penalty': 'l2', 'solver': 'saga'}


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_class_weight,param_max_iter,param_n_jobs,param_penalty,param_solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,86.863498,0.486335,0.019102,0.004084,1,balanced,1000,-1,l2,saga,"{'C': 1, 'class_weight': 'balanced', 'max_iter...",0.86844,0.860537,0.871598,0.859662,0.868761,0.8658,0.00479,1
1,85.363447,1.895883,0.017772,0.006606,300,balanced,1000,-1,l2,saga,"{'C': 300, 'class_weight': 'balanced', 'max_it...",0.866123,0.861042,0.870748,0.858529,0.867264,0.864741,0.004396,4
2,85.48027,1.190058,0.014565,0.004605,500,balanced,1000,-1,l2,saga,"{'C': 500, 'class_weight': 'balanced', 'max_it...",0.866249,0.861042,0.870748,0.858697,0.867264,0.8648,0.004357,2
3,86.60562,1.691871,0.012728,0.00157,700,balanced,1000,-1,l2,saga,"{'C': 700, 'class_weight': 'balanced', 'max_it...",0.866123,0.861042,0.870748,0.858697,0.867264,0.864775,0.004349,3
4,85.209331,3.334262,0.012569,0.002099,900,balanced,1000,-1,l2,saga,"{'C': 900, 'class_weight': 'balanced', 'max_it...",0.866123,0.861042,0.870748,0.858529,0.867264,0.864741,0.004396,4
5,79.50033,0.926723,0.011064,0.00151,2000,balanced,1000,-1,l2,saga,"{'C': 2000, 'class_weight': 'balanced', 'max_i...",0.866123,0.861042,0.870748,0.858529,0.867264,0.864741,0.004396,4


In [13]:
analysis(Y_test, bow_log.predict(X_test_bow))

Report: Classification
               precision    recall  f1-score   support

    positive       0.88      0.86      0.87      5688
    negative       0.86      0.88      0.87      5562

    accuracy                           0.87     11250
   macro avg       0.87      0.87      0.87     11250
weighted avg       0.87      0.87      0.87     11250

Matrix: Confusion
 [[4866  822]
 [ 664 4898]]
Accuracy:
 0.8679111111111111


0.8682857649352952

### SVM

In [14]:
bow_svm = svm.SVC()
bow_svm = HalvingGridSearchCV(estimator=bow_svm, param_grid=svc_grid, cv = 4, n_jobs= -1, scoring='f1', factor=2)
bow_svm.fit(X_train_bow, Y_train)
print(f'Best Score: {bow_svm.best_score_}')
print(f'Best Params: {bow_svm.best_params_}')
pd.DataFrame(bow_svm.cv_results_)

Best Score: 0.8677297881739967
Best Params: {'C': 1, 'kernel': 'rbf'}


Unnamed: 0,iter,n_resources,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,...,split3_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,mean_train_score,std_train_score
0,0,4218,5.788482,0.624874,1.049996,0.088975,0.1,linear,"{'C': 0.1, 'kernel': 'linear'}",0.815672,...,0.82507,0.820762,0.007773,11,0.983212,0.980141,0.978197,0.978833,0.980096,0.001931
1,0,4218,10.294384,1.200383,3.128312,0.352425,0.1,rbf,"{'C': 0.1, 'kernel': 'rbf'}",0.765957,...,0.766387,0.762559,0.007976,15,0.782804,0.785379,0.776485,0.775343,0.780003,0.004208
2,0,4218,5.961231,0.748652,0.980025,0.011628,1.0,linear,"{'C': 1, 'kernel': 'linear'}",0.794326,...,0.801153,0.798872,0.004893,12,1.0,0.999694,0.999674,1.0,0.999842,0.000158
3,0,4218,8.308554,1.224831,2.027,0.126436,1.0,rbf,"{'C': 1, 'kernel': 'rbf'}",0.831633,...,0.83871,0.836093,0.002797,5,0.948428,0.944411,0.947573,0.951623,0.948009,0.002568
4,0,4218,6.61143,0.553976,1.012758,0.088587,500.0,linear,"{'C': 500, 'kernel': 'linear'}",0.791111,...,0.801914,0.797348,0.004679,13,1.0,1.0,1.0,1.0,1.0,0.0
5,0,4218,11.814751,1.351137,2.206957,0.143414,500.0,rbf,"{'C': 500, 'kernel': 'rbf'}",0.828179,...,0.83989,0.829271,0.008183,8,1.0,1.0,1.0,1.0,1.0,0.0
6,0,4218,6.837007,0.813769,0.957066,0.069734,1000.0,linear,"{'C': 1000, 'kernel': 'linear'}",0.791111,...,0.801914,0.797348,0.004679,13,1.0,1.0,1.0,1.0,1.0,0.0
7,0,4218,8.436112,0.80554,1.636469,0.138795,1000.0,rbf,"{'C': 1000, 'kernel': 'rbf'}",0.828179,...,0.83989,0.829271,0.008183,8,1.0,1.0,1.0,1.0,1.0,0.0
8,1,8436,29.782039,4.290875,5.528334,0.435915,0.1,linear,"{'C': 0.1, 'kernel': 'linear'}",0.836153,...,0.825247,0.82827,0.008136,10,0.952321,0.954172,0.94989,0.956169,0.953138,0.002317
9,1,8436,49.428387,5.780353,9.974282,1.94373,500.0,rbf,"{'C': 500, 'kernel': 'rbf'}",0.841856,...,0.817204,0.830358,0.008828,6,1.0,1.0,1.0,1.0,1.0,0.0


In [18]:
analysis(Y_test, bow_svm.predict(X_test_bow))

Report: Classification
               precision    recall  f1-score   support

    positive       0.89      0.84      0.87      5688
    negative       0.85      0.90      0.87      5562

    accuracy                           0.87     11250
   macro avg       0.87      0.87      0.87     11250
weighted avg       0.87      0.87      0.87     11250

Matrix: Confusion
 [[4777  911]
 [ 577 4985]]
Accuracy:
 0.8677333333333334


0.8701344039099319

### KNN

In [24]:
bow_knn = KNeighborsClassifier()
bow_knn = HalvingGridSearchCV(estimator=bow_knn, param_grid=knn_grid, cv = 4, scoring='f1', factor=2)
bow_knn.fit(X_train_bow, Y_train)
print(f'Best Score: {bow_knn.best_score_}')
print(f'Best Params: {bow_knn.best_params_}')
pd.DataFrame(bow_knn.cv_results_)

Best Score: 0.7344377977492681
Best Params: {'n_jobs': -1, 'n_neighbors': 300}


Unnamed: 0,iter,n_resources,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_jobs,param_n_neighbors,params,split0_test_score,...,split3_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,mean_train_score,std_train_score
0,0,8437,0.009809,0.001351,0.753061,0.032235,-1,1,"{'n_jobs': -1, 'n_neighbors': 1}",0.555783,...,0.598049,0.595487,0.029178,13,1.0,1.0,1.0,1.0,1.0,0.0
1,0,8437,0.011755,0.001151,1.033305,0.120803,-1,100,"{'n_jobs': -1, 'n_neighbors': 100}",0.715699,...,0.71547,0.706199,0.011554,5,0.729865,0.710826,0.724062,0.731096,0.723962,0.008036
2,0,8437,0.010428,0.001356,1.004717,0.033191,-1,300,"{'n_jobs': -1, 'n_neighbors': 300}",0.699401,...,0.705882,0.702248,0.003463,8,0.705944,0.70121,0.723704,0.709018,0.709969,0.008404
3,0,8437,0.013292,0.005359,1.17506,0.053505,-1,500,"{'n_jobs': -1, 'n_neighbors': 500}",0.698313,...,0.693164,0.695764,0.004095,11,0.697765,0.697319,0.709746,0.699973,0.701201,0.005035
4,0,8437,0.011536,0.002371,1.23836,0.057155,-1,700,"{'n_jobs': -1, 'n_neighbors': 700}",0.699098,...,0.690174,0.695583,0.00486,12,0.698692,0.698988,0.706236,0.692842,0.69919,0.00475
5,0,8437,0.011441,0.002858,1.322407,0.049513,-1,900,"{'n_jobs': -1, 'n_neighbors': 900}",0.697582,...,0.693878,0.697408,0.004057,10,0.701428,0.700151,0.705635,0.695194,0.700602,0.003724
6,0,8437,0.011688,0.00226,1.856229,0.096936,-1,2000,"{'n_jobs': -1, 'n_neighbors': 2000}",0.701079,...,0.687122,0.698583,0.008963,9,0.699381,0.705416,0.702343,0.683525,0.697666,0.008439
7,1,16874,0.021313,0.003518,4.095783,0.113402,-1,900,"{'n_jobs': -1, 'n_neighbors': 900}",0.710148,...,0.694182,0.703707,0.005973,7,0.705402,0.701485,0.703465,0.709173,0.704881,0.002839
8,1,16874,0.02256,0.003252,4.80389,0.337252,-1,2000,"{'n_jobs': -1, 'n_neighbors': 2000}",0.708627,...,0.695548,0.704829,0.005779,6,0.69998,0.705698,0.702429,0.711396,0.704876,0.004276
9,1,16874,0.022476,0.001762,3.52879,0.058324,-1,300,"{'n_jobs': -1, 'n_neighbors': 300}",0.735464,...,0.715121,0.725307,0.007271,2,0.739397,0.722495,0.72898,0.72926,0.730033,0.006046


In [25]:
analysis(Y_test, bow_svm.predict(X_test_bow))

Report: Classification
               precision    recall  f1-score   support

    positive       0.89      0.84      0.87      5688
    negative       0.85      0.90      0.87      5562

    accuracy                           0.87     11250
   macro avg       0.87      0.87      0.87     11250
weighted avg       0.87      0.87      0.87     11250

Matrix: Confusion
 [[4777  911]
 [ 577 4985]]
Accuracy:
 0.8677333333333334


0.8701344039099319

## W2V

In [26]:
if os.path.isfile('w2v.kv'):
    vectorizer = Word2VecAdapter(pre_trained_model=KeyedVectors.load('w2v.kv'))
else:
    vectorizer = Word2VecAdapter()

X_train_w2v = vectorizer.fit_transform(X_train)
X_test_w2v = vectorizer.transform(X_test)

if not os.path.isfile('w2v.kv'):
     vectorizer.wv.save('w2v.kv')

Pandas Apply:   0%|          | 0/33750 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/33750 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/11250 [00:00<?, ?it/s]

### Logistic Regression

In [27]:
w2v_log = LogisticRegression()
w2v_log = GridSearchCV(estimator=w2v_log, param_grid=logistic_grid, **general_grid_params)
w2v_log.fit(X_train_w2v, Y_train)
print(f'Best Score: {w2v_log.best_score_}')
print(f'Best Params: {w2v_log.best_params_}')
pd.DataFrame(w2v_log.cv_results_)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best Score: 0.8673145284043511
Best Params: {'C': 900, 'class_weight': 'balanced', 'max_iter': 1000, 'n_jobs': -1, 'penalty': 'l2', 'solver': 'saga'}


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_class_weight,param_max_iter,param_n_jobs,param_penalty,param_solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,63.494407,3.185043,0.025942,0.009875,1,balanced,1000,-1,l2,saga,"{'C': 1, 'class_weight': 'balanced', 'max_iter...",0.867902,0.865685,0.866499,0.861005,0.865861,0.865391,0.002327,6
1,166.584122,17.958926,0.020381,0.003884,300,balanced,1000,-1,l2,saga,"{'C': 300, 'class_weight': 'balanced', 'max_it...",0.869095,0.865421,0.869719,0.862861,0.86931,0.867281,0.002696,3
2,156.931494,14.240462,0.023535,0.001406,500,balanced,1000,-1,l2,saga,"{'C': 500, 'class_weight': 'balanced', 'max_it...",0.869095,0.865421,0.869719,0.862861,0.869016,0.867222,0.002654,5
3,158.189548,11.045858,0.018214,0.002789,700,balanced,1000,-1,l2,saga,"{'C': 700, 'class_weight': 'balanced', 'max_it...",0.868968,0.865421,0.869886,0.862861,0.86931,0.867289,0.002711,2
4,159.282837,8.879876,0.02229,0.005948,900,balanced,1000,-1,l2,saga,"{'C': 900, 'class_weight': 'balanced', 'max_it...",0.869095,0.865421,0.869886,0.862861,0.86931,0.867315,0.002727,1
5,158.08115,6.700318,0.015832,0.002311,2000,balanced,1000,-1,l2,saga,"{'C': 2000, 'class_weight': 'balanced', 'max_i...",0.869095,0.865421,0.869719,0.862861,0.86931,0.867281,0.002696,3


In [28]:
analysis(Y_test, w2v_log.predict(X_test_w2v))

Report: Classification
               precision    recall  f1-score   support

    positive       0.88      0.85      0.86      5688
    negative       0.85      0.88      0.86      5562

    accuracy                           0.86     11250
   macro avg       0.86      0.86      0.86     11250
weighted avg       0.86      0.86      0.86     11250

Matrix: Confusion
 [[4821  867]
 [ 685 4877]]
Accuracy:
 0.8620444444444444


0.8627277551742437

### SVM

In [30]:
w2v_svm = svm.SVC()
w2v_svm = HalvingGridSearchCV(estimator=w2v_svm, param_grid=svc_grid, cv = 2, n_jobs= -1, scoring='f1', factor=2)
w2v_svm.fit(X_train_w2v, Y_train)
print(f'Best Score: {w2v_svm.best_score_}')
print(f'Best Params: {w2v_svm.best_params_}')
pd.DataFrame(w2v_svm.cv_results_)

Best Score: 0.8686236371763223
Best Params: {'C': 1, 'kernel': 'rbf'}


Unnamed: 0,iter,n_resources,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,mean_train_score,std_train_score
0,0,4218,3.940405,1.661494,1.103088,0.00822,0.1,linear,"{'C': 0.1, 'kernel': 'linear'}",0.853249,0.861538,0.857393,0.004145,7,0.88172,0.872483,0.877102,0.004619
1,0,4218,5.531537,0.362466,2.94618,1.449964,0.1,rbf,"{'C': 0.1, 'kernel': 'rbf'}",0.841263,0.837341,0.839302,0.001961,11,0.860384,0.846081,0.853233,0.007151
2,0,4218,5.316753,1.148359,0.587173,0.041561,1.0,linear,"{'C': 1, 'kernel': 'linear'}",0.848235,0.864916,0.856576,0.008341,8,0.889689,0.884523,0.887106,0.002583
3,0,4218,4.721053,0.477904,1.993583,0.503912,1.0,rbf,"{'C': 1, 'kernel': 'rbf'}",0.851801,0.861244,0.856523,0.004721,9,0.907465,0.894712,0.901089,0.006377
4,0,4218,72.122401,6.157075,0.252554,0.008855,500.0,linear,"{'C': 500, 'kernel': 'linear'}",0.834888,0.842664,0.838776,0.003888,12,0.917423,0.903818,0.910621,0.006802
5,0,4218,1.456833,0.251809,0.787516,0.015626,500.0,rbf,"{'C': 500, 'kernel': 'rbf'}",0.812645,0.824266,0.818456,0.00581,14,1.0,1.0,1.0,0.0
6,0,4218,154.23206,0.657518,0.295835,0.004763,1000.0,linear,"{'C': 1000, 'kernel': 'linear'}",0.828545,0.843675,0.83611,0.007565,13,0.920909,0.905314,0.913112,0.007798
7,0,4218,1.83501,0.763028,1.276727,0.020805,1000.0,rbf,"{'C': 1000, 'kernel': 'rbf'}",0.812645,0.824266,0.818456,0.00581,14,1.0,1.0,1.0,0.0
8,1,8436,46.107652,1.817349,9.450498,1.612715,0.1,rbf,"{'C': 0.1, 'kernel': 'rbf'}",0.850897,0.845566,0.848232,0.002666,10,0.855588,0.853346,0.854467,0.001121
9,1,8436,38.709924,0.934568,15.067066,1.520572,1.0,rbf,"{'C': 1, 'kernel': 'rbf'}",0.865066,0.855869,0.860467,0.004599,4,0.893539,0.888367,0.890953,0.002586


In [31]:
analysis(Y_test, w2v_svm.predict(X_test_w2v))

Report: Classification
               precision    recall  f1-score   support

    positive       0.88      0.85      0.87      5688
    negative       0.85      0.89      0.87      5562

    accuracy                           0.87     11250
   macro avg       0.87      0.87      0.87     11250
weighted avg       0.87      0.87      0.87     11250

Matrix: Confusion
 [[4819  869]
 [ 630 4932]]
Accuracy:
 0.8667555555555555


0.8680806125143008

### KNN

In [32]:
w2v_knn = KNeighborsClassifier()
w2v_knn = HalvingGridSearchCV(estimator=w2v_knn, param_grid=knn_grid, cv = 2, scoring='f1', factor=2)
w2v_knn.fit(X_train_w2v, Y_train)
print(f'Best Score: {w2v_knn.best_score_}')
print(f'Best Params: {w2v_knn.best_params_}')
pd.DataFrame(w2v_knn.cv_results_)

Best Score: 0.8046219952536993
Best Params: {'n_jobs': -1, 'n_neighbors': 100}


Unnamed: 0,iter,n_resources,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_jobs,param_n_neighbors,params,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,mean_train_score,std_train_score
0,0,8437,0.037862,0.007212,2.239229,0.627276,-1,1,"{'n_jobs': -1, 'n_neighbors': 1}",0.724382,0.738973,0.731678,0.007295,11,1.0,1.0,1.0,0.0
1,0,8437,0.028378,0.000207,1.554471,0.012075,-1,100,"{'n_jobs': -1, 'n_neighbors': 100}",0.793792,0.78191,0.787851,0.005941,4,0.795503,0.7871,0.791302,0.004202
2,0,8437,0.031118,0.0045,1.649769,0.172171,-1,300,"{'n_jobs': -1, 'n_neighbors': 300}",0.774226,0.751298,0.762762,0.011464,6,0.770352,0.763212,0.766782,0.00357
3,0,8437,0.02309,0.000727,1.67994,0.121788,-1,500,"{'n_jobs': -1, 'n_neighbors': 500}",0.761119,0.732558,0.746839,0.014281,9,0.759277,0.741093,0.750185,0.009092
4,0,8437,0.029401,0.003737,1.902784,0.047222,-1,700,"{'n_jobs': -1, 'n_neighbors': 700}",0.750693,0.719035,0.734864,0.015829,10,0.747723,0.728342,0.738033,0.009691
5,0,8437,0.028024,0.004964,1.976717,0.133504,-1,900,"{'n_jobs': -1, 'n_neighbors': 900}",0.741886,0.707284,0.724585,0.017301,12,0.741217,0.716707,0.728962,0.012255
6,0,8437,0.027081,0.003388,2.824902,0.123307,-1,2000,"{'n_jobs': -1, 'n_neighbors': 2000}",0.716846,0.670336,0.693591,0.023255,13,0.712418,0.679835,0.696127,0.016291
7,1,16874,0.037361,0.0004,26.759446,20.460729,-1,700,"{'n_jobs': -1, 'n_neighbors': 700}",0.746385,0.750697,0.748541,0.002156,8,0.7543,0.760113,0.757207,0.002906
8,1,16874,0.033526,0.002737,5.729146,0.210555,-1,500,"{'n_jobs': -1, 'n_neighbors': 500}",0.756903,0.759875,0.758389,0.001486,7,0.761601,0.771483,0.766542,0.004941
9,1,16874,0.03886,0.007362,5.235834,0.080959,-1,300,"{'n_jobs': -1, 'n_neighbors': 300}",0.766355,0.774337,0.770346,0.003991,5,0.776278,0.782061,0.779169,0.002892


In [33]:
analysis(Y_test, w2v_svm.predict(X_test_w2v))

Report: Classification
               precision    recall  f1-score   support

    positive       0.88      0.85      0.87      5688
    negative       0.85      0.89      0.87      5562

    accuracy                           0.87     11250
   macro avg       0.87      0.87      0.87     11250
weighted avg       0.87      0.87      0.87     11250

Matrix: Confusion
 [[4819  869]
 [ 630 4932]]
Accuracy:
 0.8667555555555555


0.8680806125143008

## Comparison

In [34]:
summary = {
    'LR': {'BoW': bow_log, 'W2V': w2v_log},
    'SVM' : {'BoW': bow_svm, 'W2V': w2v_svm},
    'KNN': {'BoW': bow_knn, 'W2V': w2v_knn},
  }

for name, values in summary.items():
    print(f'For classifier {name}, best BoW score is {values["BoW"].best_score_}, whereas best W2V score is {values["W2V"].best_score_}')
    best_model = "BoW" if values["BoW"].best_score_ > values["W2V"].best_score_ else "W2V"
    print(f'So {best_model} is better with parameters {values[best_model].best_params_}')
    filename = name + '.pkl'
    with open(filename, 'wb') as f:
        pickle.dump(values[best_model], f)

del summary, bow_log, w2v_log, bow_svm, w2v_svm, bow_knn, w2v_knn

For classifier LR, best BoW score is 0.8657995144389071, whereas best W2V score is 0.8673145284043511
So W2V is better with parameters {'C': 900, 'class_weight': 'balanced', 'max_iter': 1000, 'n_jobs': -1, 'penalty': 'l2', 'solver': 'saga'}
For classifier SVM, best BoW score is 0.8677297881739967, whereas best W2V score is 0.8686236371763223
So W2V is better with parameters {'C': 1, 'kernel': 'rbf'}
For classifier KNN, best BoW score is 0.7344377977492681, whereas best W2V score is 0.8046219952536993
So W2V is better with parameters {'n_jobs': -1, 'n_neighbors': 100}


# MLP

In [35]:
mlp_grid = {
    'hidden_layer_sizes': [(500, 250), (1000, 250), (500, 250, 250), (1000, 500, 250), (500, 250, 250, 5),
                           (1000, 500, 250, 5)],
    'activation':['tanh', 'relu']
}
def eval_mlp(X_train, X_test, Y_train, Y_test):
    best_f1 = -1
    best_model = None
    for sizes in mlp_grid['hidden_layer_sizes']:
        for act in mlp_grid['activation']:
            m = MLPClassifier(hidden_layer_sizes=sizes, activation=act, solver='sgd', alpha=1,
                                    learning_rate='adaptive', max_iter=10)
            m.fit(X_train, Y_train)
            print(f'Model config: hidden_layer_sizes={sizes}, activation={act}')
            f1 = analysis(Y_test, m.predict(X_test))
            if f1 > best_f1:
                best_model = m
                best_f1 = f1
    return best_f1, best_model

## W2V

In [36]:
w2v_f1, w2v_mlp = eval_mlp(X_train_w2v, X_test_w2v, Y_train, Y_test)



Model config: hidden_layer_sizes=(500, 250), activation=tanh
Report: Classification
               precision    recall  f1-score   support

    positive       0.87      0.83      0.85      5688
    negative       0.83      0.88      0.85      5562

    accuracy                           0.85     11250
   macro avg       0.85      0.85      0.85     11250
weighted avg       0.85      0.85      0.85     11250

Matrix: Confusion
 [[4714  974]
 [ 690 4872]]
Accuracy:
 0.8520888888888889




Model config: hidden_layer_sizes=(500, 250), activation=relu
Report: Classification
               precision    recall  f1-score   support

    positive       0.87      0.83      0.85      5688
    negative       0.83      0.87      0.85      5562

    accuracy                           0.85     11250
   macro avg       0.85      0.85      0.85     11250
weighted avg       0.85      0.85      0.85     11250

Matrix: Confusion
 [[4731  957]
 [ 728 4834]]
Accuracy:
 0.8502222222222222




Model config: hidden_layer_sizes=(1000, 250), activation=tanh
Report: Classification
               precision    recall  f1-score   support

    positive       0.87      0.83      0.85      5688
    negative       0.84      0.88      0.86      5562

    accuracy                           0.85     11250
   macro avg       0.85      0.85      0.85     11250
weighted avg       0.86      0.85      0.85     11250

Matrix: Confusion
 [[4742  946]
 [ 693 4869]]
Accuracy:
 0.8543111111111111




Model config: hidden_layer_sizes=(1000, 250), activation=relu
Report: Classification
               precision    recall  f1-score   support

    positive       0.86      0.84      0.85      5688
    negative       0.84      0.86      0.85      5562

    accuracy                           0.85     11250
   macro avg       0.85      0.85      0.85     11250
weighted avg       0.85      0.85      0.85     11250

Matrix: Confusion
 [[4766  922]
 [ 769 4793]]
Accuracy:
 0.8496888888888889




Model config: hidden_layer_sizes=(500, 250, 250), activation=tanh
Report: Classification
               precision    recall  f1-score   support

    positive       0.88      0.83      0.85      5688
    negative       0.84      0.88      0.86      5562

    accuracy                           0.86     11250
   macro avg       0.86      0.86      0.86     11250
weighted avg       0.86      0.86      0.86     11250

Matrix: Confusion
 [[4734  954]
 [ 669 4893]]
Accuracy:
 0.8557333333333333




Model config: hidden_layer_sizes=(500, 250, 250), activation=relu
Report: Classification
               precision    recall  f1-score   support

    positive       0.88      0.82      0.85      5688
    negative       0.83      0.88      0.86      5562

    accuracy                           0.85     11250
   macro avg       0.85      0.85      0.85     11250
weighted avg       0.85      0.85      0.85     11250

Matrix: Confusion
 [[4673 1015]
 [ 643 4919]]
Accuracy:
 0.8526222222222222




Model config: hidden_layer_sizes=(1000, 500, 250), activation=tanh
Report: Classification
               precision    recall  f1-score   support

    positive       0.87      0.85      0.86      5688
    negative       0.85      0.87      0.86      5562

    accuracy                           0.86     11250
   macro avg       0.86      0.86      0.86     11250
weighted avg       0.86      0.86      0.86     11250

Matrix: Confusion
 [[4813  875]
 [ 717 4845]]
Accuracy:
 0.8584888888888889




Model config: hidden_layer_sizes=(1000, 500, 250), activation=relu
Report: Classification
               precision    recall  f1-score   support

    positive       0.87      0.83      0.85      5688
    negative       0.83      0.88      0.86      5562

    accuracy                           0.85     11250
   macro avg       0.85      0.85      0.85     11250
weighted avg       0.85      0.85      0.85     11250

Matrix: Confusion
 [[4722  966]
 [ 682 4880]]
Accuracy:
 0.8535111111111111




Model config: hidden_layer_sizes=(500, 250, 250, 5), activation=tanh
Report: Classification
               precision    recall  f1-score   support

    positive       0.87      0.86      0.86      5688
    negative       0.85      0.86      0.86      5562

    accuracy                           0.86     11250
   macro avg       0.86      0.86      0.86     11250
weighted avg       0.86      0.86      0.86     11250

Matrix: Confusion
 [[4870  818]
 [ 755 4807]]
Accuracy:
 0.8601777777777778




Model config: hidden_layer_sizes=(500, 250, 250, 5), activation=relu
Report: Classification
               precision    recall  f1-score   support

    positive       0.88      0.83      0.85      5688
    negative       0.83      0.88      0.86      5562

    accuracy                           0.85     11250
   macro avg       0.85      0.85      0.85     11250
weighted avg       0.85      0.85      0.85     11250

Matrix: Confusion
 [[4702  986]
 [ 660 4902]]
Accuracy:
 0.8536888888888889




Model config: hidden_layer_sizes=(1000, 500, 250, 5), activation=tanh
Report: Classification
               precision    recall  f1-score   support

    positive       0.87      0.84      0.86      5688
    negative       0.84      0.87      0.86      5562

    accuracy                           0.86     11250
   macro avg       0.86      0.86      0.86     11250
weighted avg       0.86      0.86      0.86     11250

Matrix: Confusion
 [[4797  891]
 [ 710 4852]]
Accuracy:
 0.8576888888888888




Model config: hidden_layer_sizes=(1000, 500, 250, 5), activation=relu
Report: Classification
               precision    recall  f1-score   support

    positive       0.87      0.84      0.86      5688
    negative       0.84      0.88      0.86      5562

    accuracy                           0.86     11250
   macro avg       0.86      0.86      0.86     11250
weighted avg       0.86      0.86      0.86     11250

Matrix: Confusion
 [[4776  912]
 [ 684 4878]]
Accuracy:
 0.8581333333333333


## BoW

In [None]:
bow_f1, bow_mlp = eval_mlp(X_train_bow, X_test_bow, Y_train, Y_test)



Model config: hidden_layer_sizes=(500, 250), activation=tanh
Report: Classification
               precision    recall  f1-score   support

    positive       0.88      0.84      0.86      5688
    negative       0.85      0.88      0.86      5562

    accuracy                           0.86     11250
   macro avg       0.86      0.86      0.86     11250
weighted avg       0.86      0.86      0.86     11250

Matrix: Confusion
 [[4795  893]
 [ 662 4900]]
Accuracy:
 0.8617777777777778




Model config: hidden_layer_sizes=(500, 250), activation=relu
Report: Classification
               precision    recall  f1-score   support

    positive       0.86      0.83      0.85      5688
    negative       0.83      0.87      0.85      5562

    accuracy                           0.85     11250
   macro avg       0.85      0.85      0.85     11250
weighted avg       0.85      0.85      0.85     11250

Matrix: Confusion
 [[4717  971]
 [ 748 4814]]
Accuracy:
 0.8472




Model config: hidden_layer_sizes=(1000, 250), activation=tanh
Report: Classification
               precision    recall  f1-score   support

    positive       0.87      0.85      0.86      5688
    negative       0.85      0.87      0.86      5562

    accuracy                           0.86     11250
   macro avg       0.86      0.86      0.86     11250
weighted avg       0.86      0.86      0.86     11250

Matrix: Confusion
 [[4832  856]
 [ 697 4865]]
Accuracy:
 0.8619555555555556




Model config: hidden_layer_sizes=(1000, 250), activation=relu
Report: Classification
               precision    recall  f1-score   support

    positive       0.88      0.82      0.85      5688
    negative       0.83      0.88      0.86      5562

    accuracy                           0.85     11250
   macro avg       0.86      0.85      0.85     11250
weighted avg       0.86      0.85      0.85     11250

Matrix: Confusion
 [[4691  997]
 [ 646 4916]]
Accuracy:
 0.8539555555555556




Model config: hidden_layer_sizes=(500, 250, 250), activation=tanh
Report: Classification
               precision    recall  f1-score   support

    positive       0.87      0.86      0.86      5688
    negative       0.85      0.87      0.86      5562

    accuracy                           0.86     11250
   macro avg       0.86      0.86      0.86     11250
weighted avg       0.86      0.86      0.86     11250

Matrix: Confusion
 [[4865  823]
 [ 721 4841]]
Accuracy:
 0.8627555555555556




Model config: hidden_layer_sizes=(500, 250, 250), activation=relu
Report: Classification
               precision    recall  f1-score   support

    positive       0.87      0.83      0.85      5688
    negative       0.84      0.87      0.85      5562

    accuracy                           0.85     11250
   macro avg       0.85      0.85      0.85     11250
weighted avg       0.85      0.85      0.85     11250

Matrix: Confusion
 [[4748  940]
 [ 732 4830]]
Accuracy:
 0.8513777777777778




Model config: hidden_layer_sizes=(1000, 500, 250), activation=tanh
Report: Classification
               precision    recall  f1-score   support

    positive       0.88      0.85      0.86      5688
    negative       0.85      0.88      0.87      5562

    accuracy                           0.87     11250
   macro avg       0.87      0.87      0.87     11250
weighted avg       0.87      0.87      0.87     11250

Matrix: Confusion
 [[4826  862]
 [ 651 4911]]
Accuracy:
 0.8655111111111111




Model config: hidden_layer_sizes=(1000, 500, 250), activation=relu
Report: Classification
               precision    recall  f1-score   support

    positive       0.87      0.85      0.86      5688
    negative       0.85      0.87      0.86      5562

    accuracy                           0.86     11250
   macro avg       0.86      0.86      0.86     11250
weighted avg       0.86      0.86      0.86     11250

Matrix: Confusion
 [[4811  877]
 [ 729 4833]]
Accuracy:
 0.8572444444444445




Model config: hidden_layer_sizes=(500, 250, 250, 5), activation=tanh
Report: Classification
               precision    recall  f1-score   support

    positive       0.88      0.86      0.87      5688
    negative       0.86      0.88      0.87      5562

    accuracy                           0.87     11250
   macro avg       0.87      0.87      0.87     11250
weighted avg       0.87      0.87      0.87     11250

Matrix: Confusion
 [[4877  811]
 [ 694 4868]]
Accuracy:
 0.8662222222222222




Model config: hidden_layer_sizes=(500, 250, 250, 5), activation=relu
Report: Classification
               precision    recall  f1-score   support

    positive       0.87      0.85      0.86      5688
    negative       0.85      0.87      0.86      5562

    accuracy                           0.86     11250
   macro avg       0.86      0.86      0.86     11250
weighted avg       0.86      0.86      0.86     11250

Matrix: Confusion
 [[4823  865]
 [ 747 4815]]
Accuracy:
 0.8567111111111111




Model config: hidden_layer_sizes=(1000, 500, 250, 5), activation=tanh
Report: Classification
               precision    recall  f1-score   support

    positive       0.87      0.87      0.87      5688
    negative       0.86      0.87      0.87      5562

    accuracy                           0.87     11250
   macro avg       0.87      0.87      0.87     11250
weighted avg       0.87      0.87      0.87     11250

Matrix: Confusion
 [[4929  759]
 [ 706 4856]]
Accuracy:
 0.8697777777777778


## TD-IDF

In [None]:
vectorizer = TfidfVectorizer(max_features=2000)
X_train_idf = vectorizer.fit_transform(X_train)
X_test_idf = vectorizer.transform(X_test)

In [None]:
tf_idf_f1, tf_idf_mlp = eval_mlp(X_train_idf, X_test_idf, Y_train, Y_test)

## Comparison

In [None]:
print('Best scores:')
print(f'W2V: {w2v_f1} with params: {w2v_mlp.get_params()}')
print(f'BoW: {bow_f1} with params: {bow_mlp.get_params()}')
print(f'Tf-Idf: {tf_idf_f1} with params: {tf_idf_mlp.get_params()}')

idx = np.argmax([w2v_f1, bow_f1, tf_idf_f1])
best_mlp = [w2v_mlp, bow_mlp, tf_idf_mlp][idx]
with open('best.pkl', 'wb') as f:
    pickle.dump(best_mlp, f)
