In [49]:
import pickle
import swifter
import pandas as pd
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.neural_network import MLPClassifier

from data import preprocess_data, vectorize_data, load_dataset
from evaluation import analysis, evaluate_models_with_data
from w2v_adapter import Word2VecAdapter

import advanced_processor_chain_factory
import simple_processor_chain_factory

ImportError: cannot import name 'analysis'

In [None]:
# Flag forwarded as `debug=` to every preprocess_data call in this notebook.
DEBUG = True
# Raw dataset as returned by the project's `data.load_dataset` helper.
dataset = load_dataset()

# Inspection of Pre-Processing Approaches

In [None]:
# Baseline classifiers used to compare the pre-processing variants below.
# Keys are the display names handed to evaluate_models_with_data.
models = {}
models['logistic regression'] = LogisticRegression(class_weight='balanced', n_jobs=-1)
models['svm'] = svm.LinearSVC()
models['knn'] = KNeighborsClassifier(n_neighbors=8, n_jobs=-1)

## Without Pre-Process

In [None]:
# Baseline run: no processor chain is passed to preprocess_data, and the result
# is vectorized with a 2000-feature bag-of-words before evaluation.
evaluate_models_with_data(
    models,
    *vectorize_data(
        *preprocess_data(dataset, debug=DEBUG),
        CountVectorizer(max_features=2000),
    ),
)

## Simple Pre-Process

In [None]:
# Same evaluation as the baseline, but the text first goes through the chain
# built by simple_processor_chain_factory.
evaluate_models_with_data(
    models,
    *vectorize_data(
        *preprocess_data(
            dataset,
            processor_chain=simple_processor_chain_factory.create(),
            debug=DEBUG,
        ),
        CountVectorizer(max_features=2000),
    ),
)

## Pre-Process with Stemming

In [None]:
# Evaluation with the advanced chain created in 'stem' mode, followed by the
# same 2000-feature bag-of-words vectorizer as the other variants.
evaluate_models_with_data(
    models,
    *vectorize_data(
        *preprocess_data(
            dataset,
            processor_chain=advanced_processor_chain_factory.create('stem'),
            debug=DEBUG,
        ),
        CountVectorizer(max_features=2000),
    ),
)

## Pre-Process with Lemmatization

In [None]:
# Advanced chain in 'lem' mode. X and Y are kept as notebook globals because the
# W2V and TF-IDF sections further down re-vectorize from them.
X, Y = preprocess_data(
    dataset,
    processor_chain=advanced_processor_chain_factory.create('lem'),
    debug=DEBUG,
)
# The raw dataset is not referenced after this point; re-running any earlier
# cell requires reloading it first.
del dataset
(X_train_bow, X_test_bow,
 Y_train_bow, Y_test_bow) = vectorize_data(X, Y, CountVectorizer(max_features=2000))
evaluate_models_with_data(models, X_train_bow, X_test_bow, Y_train_bow, Y_test_bow)

In [None]:
# Drop the baseline classifier dict; the sections below construct their own
# estimators for grid search (presumably done to release memory — confirm).
del models

# Compare W2V and BoW with Their Best Tuned Hyper-parameters

In [None]:
# Shared cross-validation splitter and GridSearchCV keyword arguments; every
# grid search in this notebook unpacks `general_grid_params`.
kfold = StratifiedKFold(n_splits=5)
general_grid_params = dict(verbose=1, cv=kfold, n_jobs=-1, scoring='f1')

# Candidate hyper-parameters for LogisticRegression.
logistic_grid = dict(
    penalty=['l2'],
    C=[1, 300, 500, 700, 900],
    class_weight=['balanced'],
    solver=['saga'],
    n_jobs=[-1],
    max_iter=[1000],
)

# Candidate hyper-parameters for svm.SVC.
svc_grid = dict(
    kernel=['linear', 'rbf'],
    C=[0.1, 1, 300, 500, 700],
)

# Candidate hyper-parameters for KNeighborsClassifier: odd k from 1 to 23.
knn_grid = dict(
    n_neighbors=list(range(1, 24, 2)),
    n_jobs=[-1],
)

## BoW

### Logistic Regression

In [None]:
# Grid-search logistic regression over `logistic_grid` on the bag-of-words
# training split, then report the best cross-validated score and settings.
bow_log = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=logistic_grid,
    **general_grid_params
)
bow_log.fit(X_train_bow, Y_train_bow)
print(f'Best Score: {bow_log.best_score_}')
print(f'Best Params: {bow_log.best_params_}')

In [None]:
# Held-out check: the fitted BoW logistic-regression search predicts the test
# split, and the predictions go to the project's `analysis` helper together
# with the true test labels.
analysis(Y_test_bow, bow_log.predict(X_test_bow))

### SVM

In [None]:
# Grid-search an SVC over `svc_grid` on the bag-of-words training split, then
# report the best cross-validated score and settings.
bow_svm = GridSearchCV(
    estimator=svm.SVC(),
    param_grid=svc_grid,
    **general_grid_params
)
bow_svm.fit(X_train_bow, Y_train_bow)
print(f'Best Score: {bow_svm.best_score_}')
print(f'Best Params: {bow_svm.best_params_}')

In [None]:
# Held-out check of the fitted BoW SVM search: test-split predictions are
# compared against the true test labels by the project's `analysis` helper.
analysis(Y_test_bow, bow_svm.predict(X_test_bow))

### KNN

In [None]:
# Grid-search k-nearest-neighbours over `knn_grid` on the bag-of-words
# training split.
bow_knn = KNeighborsClassifier()
bow_knn = GridSearchCV(estimator=bow_knn, param_grid=knn_grid, **general_grid_params)
bow_knn.fit(X_train_bow, Y_train_bow)
# Report the KNN search itself; reading these attributes from bow_svm would
# silently repeat the SVM results under the KNN heading.
print(f'Best Score: {bow_knn.best_score_}')
print(f'Best Params: {bow_knn.best_params_}')

In [None]:
# Held-out check of the BoW KNN search. The predictions must come from
# bow_knn (not bow_svm) so this section evaluates the model it is named after.
analysis(Y_test_bow, bow_knn.predict(X_test_bow))


## W2V

In [None]:
# Re-vectorize the lemmatized X / Y with the project's Word2VecAdapter
# configured for 500 features.
(X_train_w2v, X_test_w2v,
 Y_train_w2v, Y_test_w2v) = vectorize_data(X, Y, Word2VecAdapter(num_features=500))

### Logistic Regression

In [None]:
# Grid-search logistic regression over `logistic_grid` on the Word2Vec
# training split, then report the best cross-validated score and settings.
w2v_log = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=logistic_grid,
    **general_grid_params
)
w2v_log.fit(X_train_w2v, Y_train_w2v)
print(f'Best Score: {w2v_log.best_score_}')
print(f'Best Params: {w2v_log.best_params_}')

In [None]:
# Held-out check of the fitted W2V logistic-regression search: test-split
# predictions and true test labels go to the project's `analysis` helper.
analysis(Y_test_w2v, w2v_log.predict(X_test_w2v))

### SVM

In [None]:
# Grid-search an SVC over `svc_grid` on the Word2Vec training split, then
# report the best cross-validated score and settings.
w2v_svm = GridSearchCV(
    estimator=svm.SVC(),
    param_grid=svc_grid,
    **general_grid_params
)
w2v_svm.fit(X_train_w2v, Y_train_w2v)
print(f'Best Score: {w2v_svm.best_score_}')
print(f'Best Params: {w2v_svm.best_params_}')

In [None]:
# Held-out check of the fitted W2V SVM search: test-split predictions and
# true test labels go to the project's `analysis` helper.
analysis(Y_test_w2v, w2v_svm.predict(X_test_w2v))

### KNN

In [None]:
# Grid-search k-nearest-neighbours over `knn_grid` on the Word2Vec training
# split.
w2v_knn = KNeighborsClassifier()
w2v_knn = GridSearchCV(estimator=w2v_knn, param_grid=knn_grid, **general_grid_params)
w2v_knn.fit(X_train_w2v, Y_train_w2v)
# Report the KNN search itself; reading these attributes from w2v_svm would
# silently repeat the SVM results under the KNN heading.
print(f'Best Score: {w2v_knn.best_score_}')
print(f'Best Params: {w2v_knn.best_params_}')

In [None]:
# Held-out check of the W2V KNN search. The predictions must come from
# w2v_knn (not w2v_svm) so this section evaluates the model it is named after.
analysis(Y_test_w2v, w2v_knn.predict(X_test_w2v))

## Comparison

In [None]:
# Fitted grid searches grouped per classifier, keyed by feature representation.
summary = {
    'LR': {'BoW': bow_log, 'W2V': w2v_log},
    'SVM': {'BoW': bow_svm, 'W2V': w2v_svm},
    'KNN': {'BoW': bow_knn, 'W2V': w2v_knn},
}

# For each classifier, print both cross-validated scores, pick the
# representation with the strictly higher score (ties go to W2V), and pickle
# the winning search object to '<classifier>.pkl'.
for clf_name, searches in summary.items():
    print(f'For classifier {clf_name}, best BoW score is {searches["BoW"].best_score_}, whereas best W2V score is {searches["W2V"].best_score_}')
    if searches["BoW"].best_score_ > searches["W2V"].best_score_:
        winner = "BoW"
    else:
        winner = "W2V"
    print(f'So {winner} is better with parameters {searches[winner].best_params_}')
    with open(clf_name + '.pkl', 'wb') as out_file:
        pickle.dump(searches[winner], out_file)

# The individual searches are not referenced again below this cell.
del summary, bow_log, w2v_log, bow_svm, w2v_svm, bow_knn, w2v_knn

# MLP

In [None]:
# Candidate hyper-parameters for MLPClassifier; only the layer layout and the
# activation vary, the remaining entries are single-valued.
mlp_grid = dict(
    hidden_layer_sizes=[(500, 250), (1000, 250), (500, 250, 250), (1000, 500, 250)],
    activation=['tanh', 'relu'],
    solver=['sgd'],
    alpha=[1],
    learning_rate=['adaptive'],
    max_iter=[1000],
)

## W2V

In [None]:
# Grid-search an MLP over `mlp_grid` on the Word2Vec training split, then
# report the best cross-validated score and settings.
mlp_w2v = GridSearchCV(
    estimator=MLPClassifier(),
    param_grid=mlp_grid,
    **general_grid_params
)
mlp_w2v.fit(X_train_w2v, Y_train_w2v)
print(f'Best Score: {mlp_w2v.best_score_}')
print(f'Best Params: {mlp_w2v.best_params_}')

In [None]:
# Held-out check of the fitted W2V MLP search: test-split predictions and
# true test labels go to the project's `analysis` helper.
analysis(Y_test_w2v, mlp_w2v.predict(X_test_w2v))

In [None]:
# The Word2Vec splits are not referenced again in the visible notebook; drop
# them (presumably to release memory before the next searches — confirm).
del X_train_w2v, X_test_w2v, Y_train_w2v, Y_test_w2v

## BoW

In [None]:
# Grid-search an MLP over `mlp_grid` on the bag-of-words training split, then
# report the best cross-validated score and settings.
mlp_bow = GridSearchCV(
    estimator=MLPClassifier(),
    param_grid=mlp_grid,
    **general_grid_params
)
mlp_bow.fit(X_train_bow, Y_train_bow)
print(f'Best Score: {mlp_bow.best_score_}')
print(f'Best Params: {mlp_bow.best_params_}')

In [None]:
# Held-out check of the fitted BoW MLP search: test-split predictions and
# true test labels go to the project's `analysis` helper.
analysis(Y_test_bow, mlp_bow.predict(X_test_bow))

In [None]:
# The bag-of-words splits are not referenced again in the visible notebook;
# X and Y are kept because the next section re-vectorizes from them.
del X_train_bow, X_test_bow, Y_train_bow, Y_test_bow

## TF-IDF

In [None]:
# Re-vectorize the lemmatized X / Y with a 2000-feature TfidfVectorizer, then
# grid-search an MLP over `mlp_grid` on the resulting training split.
(X_train_idf, X_test_idf,
 Y_train_idf, Y_test_idf) = vectorize_data(X, Y, TfidfVectorizer(max_features=2000))
mlp_idf = GridSearchCV(
    estimator=MLPClassifier(),
    param_grid=mlp_grid,
    **general_grid_params
)
mlp_idf.fit(X_train_idf, Y_train_idf)
print(f'Best Score: {mlp_idf.best_score_}')
print(f'Best Params: {mlp_idf.best_params_}')

In [None]:
# Held-out check of the TF-IDF MLP search. The predictions are made on
# X_test_idf, so the reference labels must be Y_test_idf (not the training
# labels), and the predicting model must be mlp_idf: mlp_bow was fitted on a
# different feature matrix.
analysis(Y_test_idf, mlp_idf.predict(X_test_idf))
# The TF-IDF splits are not referenced again in the visible notebook.
del X_train_idf, X_test_idf, Y_train_idf, Y_test_idf

## Comparison

In [None]:
# Summarize the three MLP grid searches; each line is labelled with the
# feature representation the search was fitted on.
print('Best scores:')
print(f'W2V: {mlp_w2v.best_score_} with params: {mlp_w2v.best_params_}')
print(f'BoW: {mlp_bow.best_score_} with params: {mlp_bow.best_params_}')
print(f'TF-IDF: {mlp_idf.best_score_} with params: {mlp_idf.best_params_}')
# The selector must be passed as `key=`; given positionally, max() treats the
# lambda as a fourth candidate and tries to order it against the searches.
best_mlp = max(mlp_w2v, mlp_bow, mlp_idf, key=lambda mlp: mlp.best_score_)
# Persist the winning search object (the whole GridSearchCV, not only its
# best estimator).
with open('best.pkl', 'wb') as f:
    pickle.dump(best_mlp, f)