In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
# File manipulation
import os
import shutil
# plotting
import matplotlib.pyplot as plt
# Time
import time
import pickle

In [2]:
# load data
df_train = pd.read_csv('../data/processed/normalized_train.csv')
df_test = pd.read_csv('../data/processed/normalized_test.csv')
df_under_train = pd.read_csv('../data/processed/undersampled_train.csv')
df_under_test = pd.read_csv('../data/processed/undersampled_test.csv')

In [3]:
# Vectorizer imports
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [4]:
# LogisticRegression Pipeline
pipe_lr = Pipeline([
    ("countVectorizer", CountVectorizer()),
    ("LogisticRegression", LogisticRegression(multi_class="multinomial", max_iter=100))])

# RandomForestClassifier Pipeline
pipe_rf = Pipeline([
    ('countVectorizer', CountVectorizer()),
    ('RandomForestClassifier', RandomForestClassifier(random_state=0))])

# Define pipeline parameters for all models
params_lr = {
    "countVectorizer__ngram_range": [(1,1), (1,2)],
    "countVectorizer__binary": [True, False],
    "LogisticRegression__C": [0.5, 1.0]
}

params_rf = {
    "countVectorizer__ngram_range": [(1,1), (1,2)],
    "countVectorizer__binary": [True, False],
    "RandomForestClassifier__criterion": ["gini", "entropy"]
}


grid_param_lr = GridSearchCV(
    estimator=pipe_lr,
    param_grid=params_lr,
    scoring='accuracy',
    cv=3) 

grid_param_rf = GridSearchCV(
    estimator=pipe_rf,
    param_grid=params_rf,
    scoring="accuracy",
    cv=3)


pipelines = [(grid_param_lr, params_lr, pipe_lr), (grid_param_rf, params_rf, pipe_rf)]

In [5]:
from pprint import pprint
from time import time
import warnings


warnings.filterwarnings('ignore')
for pipe in pipelines:
    print("Performing grid search...")
    print("Pipeline:", [name for name, _ in pipe[2].steps])
    print("Parameters:")
    pprint(pipe[1])
    t0 = time()
    pipe[0].fit(df_train["Data"], df_train["Label"])
    print("Done in %0.3fs\n" % (time() - t0))
    print("Best score: %0.3f" % pipe[0].best_score_)
    print("Best parameters set:")
    best_parameters = pipe[0].best_estimator_.get_params()
    
    
    for param_name in sorted(pipe[1].keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
        
    # On training data
    predict_train = pipe[0].best_estimator_.predict(df_train["Data"])
    # On test data
    predict_test = pipe[0].best_estimator_.predict(df_test["Data"])

    # classification report for optimal parameters (training set)
    print("\nClassification report for optimal parameters (training set)\n")
    print(classification_report(df_train["Label"], predict_train))

    # classification report for optimal parameters (test set)
    print("\nClassification report for optimal parameters (test data)\n")
    print(classification_report(df_test["Label"], predict_test))
    
    print("################################################################")

Performing grid search...
Pipeline: ['countVectorizer', 'LogisticRegression']
Parameters:
{'LogisticRegression__C': [0.5, 1.0],
 'countVectorizer__binary': [True, False],
 'countVectorizer__ngram_range': [(1, 1), (1, 2)]}
Done in 552.205s

Best score: 0.869
Best parameters set:
	LogisticRegression__C: 0.5
	countVectorizer__binary: True
	countVectorizer__ngram_range: (1, 2)

Classification report for optimal parameters (training set)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3366
           1       1.00      1.00      1.00      1570
           2       1.00      1.00      1.00      2972
           3       1.00      1.00      1.00      1538

    accuracy                           1.00      9446
   macro avg       1.00      1.00      1.00      9446
weighted avg       1.00      1.00      1.00      9446


Classification report for optimal parameters (test data)

              precision    recall  f1-score   support

           0 

In [6]:
#### UNDERSAMPLED DATA COUNT VECTORIZER####

from pprint import pprint
from time import time
import warnings

warnings.filterwarnings('ignore')
for pipe in pipelines:
    print("Performing grid search...")
    print("Pipeline:", [name for name, _ in pipe[2].steps])
    print("Parameters:")
    pprint(pipe[1])
    t0 = time()
    pipe[0].fit(df_under_train["Data"], df_under_train["Label"])
    print("Done in %0.3fs\n" % (time() - t0))
    print("Best score: %0.3f" % pipe[0].best_score_)
    print("Best parameters set:")
    best_parameters = pipe[0].best_estimator_.get_params()
    for param_name in sorted(pipe[1].keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
        
    # On training data
    predict_train = pipe[0].best_estimator_.predict(df_under_train["Data"])
    # On test data
    predict_test = pipe[0].best_estimator_.predict(df_test["Data"])

    # classification report for optimal parameters (training set)
    print("\nClassification report for optimal parameters (training set)\n")
    print(classification_report(df_under_train["Label"], predict_train))

    # classification report for optimal parameters (test set)
    print("\nClassification report for optimal parameters (test data)\n")
    print(classification_report(df_test["Label"], predict_test))
    
    print("################################################################")

Performing grid search...
Pipeline: ['countVectorizer', 'LogisticRegression']
Parameters:
{'LogisticRegression__C': [0.5, 1.0],
 'countVectorizer__binary': [True, False],
 'countVectorizer__ngram_range': [(1, 1), (1, 2)]}
Done in 322.999s

Best score: 0.847
Best parameters set:
	LogisticRegression__C: 0.5
	countVectorizer__binary: True
	countVectorizer__ngram_range: (1, 2)

Classification report for optimal parameters (training set)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1538
           1       1.00      1.00      1.00      1538
           2       1.00      1.00      1.00      1538
           3       1.00      1.00      1.00      1538

    accuracy                           1.00      6152
   macro avg       1.00      1.00      1.00      6152
weighted avg       1.00      1.00      1.00      6152


Classification report for optimal parameters (test data)

              precision    recall  f1-score   support

           0 

In [7]:
# Tfidf vectorizer

# LogisticRegression Pipeline
pipe_lr = Pipeline([
    ("tfidfVectorizer", TfidfVectorizer()),
    ("LogisticRegression", LogisticRegression(multi_class="multinomial", max_iter=100))])

# RandomForestClassifier Pipeline
pipe_rf = Pipeline([
    ('tfidfVectorizer', TfidfVectorizer()),
    ('RandomForestClassifier', RandomForestClassifier(random_state=0))])

# Define pipeline parameters for all models
params_lr = {
    "tfidfVectorizer__ngram_range": [(1,1), (1,2)],
    "tfidfVectorizer__binary": [True, False],
    "LogisticRegression__C": [0.5, 1.0]
}

params_rf = {
    "tfidfVectorizer__ngram_range": [(1,1), (1,2)],
    "tfidfVectorizer__binary": [True, False],
    "RandomForestClassifier__criterion": ["gini", "entropy"]
}


grid_param_lr = GridSearchCV(
    estimator=pipe_lr,
    param_grid=params_lr,
    scoring='accuracy',
    cv=3) 

grid_param_rf = GridSearchCV(
    estimator=pipe_rf,
    param_grid=params_rf,
    scoring="accuracy",
    cv=3)


pipelines = [(grid_param_lr, params_lr, pipe_lr), (grid_param_rf, params_rf, pipe_rf)]

In [8]:
from pprint import pprint
from time import time
import warnings

warnings.filterwarnings('ignore')
for pipe in pipelines:
    print("Performing grid search...")
    print("Pipeline:", [name for name, _ in pipe[2].steps])
    print("Parameters:")
    pprint(pipe[1])
    t0 = time()
    pipe[0].fit(df_train["Data"], df_train["Label"])
    print("Done in %0.3fs\n" % (time() - t0))
    print("Best score: %0.3f" % pipe[0].best_score_)
    print("Best parameters set:")
    best_parameters = pipe[0].best_estimator_.get_params()
    for param_name in sorted(pipe[1].keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
        
    # On training data
    predict_train = pipe[0].best_estimator_.predict(df_train["Data"])
    # On test data
    predict_test = pipe[0].best_estimator_.predict(df_test["Data"])

    # classification report for optimal parameters (training set)
    print("\nClassification report for optimal parameters (training set)\n")
    print(classification_report(df_train["Label"], predict_train))

    # classification report for optimal parameters (test set)
    print("\nClassification report for optimal parameters (test data)\n")
    print(classification_report(df_test["Label"], predict_test))
    
    print("################################################################")

Performing grid search...
Pipeline: ['tfidfVectorizer', 'LogisticRegression']
Parameters:
{'LogisticRegression__C': [0.5, 1.0],
 'tfidfVectorizer__binary': [True, False],
 'tfidfVectorizer__ngram_range': [(1, 1), (1, 2)]}
Done in 342.218s

Best score: 0.877
Best parameters set:
	LogisticRegression__C: 1.0
	tfidfVectorizer__binary: True
	tfidfVectorizer__ngram_range: (1, 1)

Classification report for optimal parameters (training set)

              precision    recall  f1-score   support

           0       0.90      0.98      0.94      3366
           1       0.97      0.94      0.96      1570
           2       0.99      0.99      0.99      2972
           3       0.94      0.80      0.87      1538

    accuracy                           0.95      9446
   macro avg       0.95      0.93      0.94      9446
weighted avg       0.95      0.95      0.95      9446


Classification report for optimal parameters (test data)

              precision    recall  f1-score   support

           0 

In [9]:
##### UNDERSAMPLED TFIDF VECTORIZER ############


from pprint import pprint
from time import time
import warnings

warnings.filterwarnings('ignore')
for pipe in pipelines:
    print("Performing grid search...")
    print("Pipeline:", [name for name, _ in pipe[2].steps])
    print("Parameters:")
    pprint(pipe[1])
    t0 = time()
    pipe[0].fit(df_under_train["Data"], df_under_train["Label"])
    print("Done in %0.3fs\n" % (time() - t0))
    print("Best score: %0.3f" % pipe[0].best_score_)
    print("Best parameters set:")
    best_parameters = pipe[0].best_estimator_.get_params()
    for param_name in sorted(pipe[1].keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
        
    # On training data
    predict_train = pipe[0].best_estimator_.predict(df_under_train["Data"])
    # On test data
    predict_test = pipe[0].best_estimator_.predict(df_test["Data"])

    # classification report for optimal parameters (training set)
    print("\nClassification report for optimal parameters (training set)\n")
    print(classification_report(df_under_train["Label"], predict_train))

    # classification report for optimal parameters (test set)
    print("\nClassification report for optimal parameters (test data)\n")
    print(classification_report(df_test["Label"], predict_test))
    
    print("################################################################")

Performing grid search...
Pipeline: ['tfidfVectorizer', 'LogisticRegression']
Parameters:
{'LogisticRegression__C': [0.5, 1.0],
 'tfidfVectorizer__binary': [True, False],
 'tfidfVectorizer__ngram_range': [(1, 1), (1, 2)]}
Done in 194.985s

Best score: 0.858
Best parameters set:
	LogisticRegression__C: 1.0
	tfidfVectorizer__binary: True
	tfidfVectorizer__ngram_range: (1, 1)

Classification report for optimal parameters (training set)

              precision    recall  f1-score   support

           0       0.91      0.93      0.92      1538
           1       0.97      0.97      0.97      1538
           2       0.99      0.99      0.99      1538
           3       0.93      0.92      0.93      1538

    accuracy                           0.95      6152
   macro avg       0.95      0.95      0.95      6152
weighted avg       0.95      0.95      0.95      6152


Classification report for optimal parameters (test data)

              precision    recall  f1-score   support

           0 

In [10]:
# import pickle

# # save the model to disk
# filename = 'finalized_model.sav'
# pickle.dump(model, open(filename, 'wb'))

# # loaded_model = pickle.load(open(filename, 'rb'))

In [15]:
# Run model with best parameters

# LogisticRegression Pipeline
pipe_best = Pipeline([
    ("tfidfVectorizer", TfidfVectorizer()),
    ("LogisticRegression", LogisticRegression(multi_class="multinomial", max_iter=100))])

params_best = {
    "tfidfVectorizer__ngram_range": [(1,1)],
    "tfidfVectorizer__binary": [True],
    "LogisticRegression__C": [1.0]
}


grid_param_best = GridSearchCV(
    estimator=pipe_lr,
    param_grid=params_lr,
    scoring='accuracy',
    cv=3)


pipelines = [(grid_param_best, params_best, pipe_best)]
t0 = time()
model_best = pipelines[0][0].fit(df_under_train["Data"], df_under_train["Label"])
print("Done in %0.3fs\n" % (time() - t0))

print("Saving model...")
# save the model to disk
with open('../models/best_model.sav', 'wb') as f:
    pickle.dump(model_best, f)
    
print("Model saved...")

Done in 171.068s

Saving model...
Model saved...
