In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from copy import deepcopy
import time
import random
%matplotlib inline

from preprocessor_class import Preprocessor

import nltk
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier, VotingClassifier, StackingClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, f1_score, log_loss

from xgboost import XGBClassifier

# English stop-word list from NLTK. It is not referenced by any cell visible in
# this notebook section -- presumably consumed elsewhere (e.g. by Preprocessor);
# TODO confirm, otherwise it can be dropped.
stop_words = stopwords.words('english')

In [2]:
# Load the prepared text dataset; the later cells read its 'text' and
# 'sugar_class' columns.
df = pd.read_csv(
    'Data/prepared_text_data_sugar.csv',
    low_memory=False,
)

In [3]:
# Split the data, clean the text with the project Preprocessor, and turn it into
# dense TF-IDF feature frames (X_train / X_test). The whole stage is timed.
start_time_preprocess = time.time()

X = df['text']
y = df['sugar_class']

# 80/20 hold-out split with a fixed seed so the split is repeatable.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 200)

# Fit the text preprocessor on the training split only, then reuse it on test.
processor = Preprocessor()
X_train_transformed = processor.fit_transform(X_train_raw)
X_test_transformed = processor.transform(X_test_raw)

# The TF-IDF vocabulary is likewise learned from the training split only.
vector_pipe = Pipeline([('tfidf', TfidfVectorizer())])
X_train_vector = vector_pipe.fit_transform(X_train_transformed)
X_test_vector = vector_pipe.transform(X_test_transformed)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older installations.
# The names are computed once and shared by both frames.
tfidf_step = vector_pipe['tfidf']
if hasattr(tfidf_step, 'get_feature_names_out'):
    tfidf_feature_names = tfidf_step.get_feature_names_out()
else:
    tfidf_feature_names = tfidf_step.get_feature_names()

# Densify the sparse matrices into DataFrames with one column per vocabulary term.
X_train = pd.DataFrame(X_train_vector.toarray(), columns = tfidf_feature_names)
X_test = pd.DataFrame(X_test_vector.toarray(), columns = tfidf_feature_names)

end_time_preprocess = time.time()
preprocess_time = end_time_preprocess - start_time_preprocess

print(f"Preprocessing Runtime:  {preprocess_time}")
X_train

Preprocessing Runtime:  27.266846656799316


Unnamed: 0,aa,abso,absolut,abv,acai,accompani,ace,acha,acqua,ad,...,zinfidel,zing,zinger,ziti,zombi,zone,zt,zucchini,zuppa,íleaf
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42339,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42340,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42341,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42342,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Naive Bayes baselines: fit Multinomial NB and Complement NB on the TF-IDF
# features, collect hard predictions and class probabilities on both splits,
# then report weighted F1 and log loss for each.
start_time_nb = time.time()

# --- Multinomial NB (the "baseline" model) ---
multinb = Pipeline(steps=[('multinb', MultinomialNB())])
multinb.fit(X_train, y_train)
y_pred_baseline_train, y_pred_baseline_test = (
    multinb.predict(X_train),
    multinb.predict(X_test),
)
y_proba_baseline_train, y_proba_baseline_test = (
    multinb.predict_proba(X_train),
    multinb.predict_proba(X_test),
)

# --- Complement NB ---
compnb = Pipeline(steps=[('compnb', ComplementNB())])
compnb.fit(X_train, y_train)
y_pred_compnb_train, y_pred_compnb_test = (
    compnb.predict(X_train),
    compnb.predict(X_test),
)
y_proba_compnb_train, y_proba_compnb_test = (
    compnb.predict_proba(X_train),
    compnb.predict_proba(X_test),
)

end_time_nb = time.time()
nb_time = end_time_nb - start_time_nb

# One print call, one line per argument; empty strings produce the blank lines.
print(
    f"Naive-Bayes Runtime:  {nb_time:.2f} seconds",
    "",
    f"Weighted F1 Score, Multinomial NB Train: {f1_score(y_train, y_pred_baseline_train, average = 'weighted')}",
    f"Log Loss, Multinomial NB Train:  {log_loss(y_train, y_proba_baseline_train)}",
    f"Weighted F1 Score, Multinomial NB Test: {f1_score(y_test, y_pred_baseline_test, average = 'weighted')}",
    f"Log Loss, Multinomial NB Test:  {log_loss(y_test, y_proba_baseline_test)}",
    "",
    f"Weighted F1 Score, Complement NB Train: {f1_score(y_train, y_pred_compnb_train, average = 'weighted')}",
    f"Log Loss, Complement NB Train:  {log_loss(y_train, y_proba_compnb_train)}",
    f"Weighted F1 Score, Complement NB Test: {f1_score(y_test, y_pred_compnb_test, average = 'weighted')}",
    f"Log Loss, Complement NB Test:  {log_loss(y_test, y_proba_compnb_test)}",
    sep="\n",
)

Naive-Bayes Runtime:  3.89 seconds

Weighted F1 Score, Multinomial NB Train: 0.6296011066073247
Log Loss, Multinomial NB Train:  0.9319198344350808
Weighted F1 Score, Multinomial NB Test: 0.602859353334295
Log Loss, Multinomial NB Test:  0.9698460540526268

Weighted F1 Score, Complement NB Train: 0.631216626696921
Log Loss, Complement NB Train:  1.1299313090432557
Weighted F1 Score, Complement NB Test: 0.594470343461912
Log Loss, Complement NB Test:  1.1593542368287353


In [13]:
# AdaBoost with otherwise-default hyper-parameters: fit on the TF-IDF features,
# predict labels and class probabilities on both splits, and report weighted F1
# and log loss. The whole cell is timed.
start_time_ada = time.time()

# random_state is pinned (same seed as the Random Forest / Extra Trees cells) so
# the reported scores are reproducible across kernel restarts; every other
# stochastic model in this notebook already fixes its seed.
ada = Pipeline([('ada', AdaBoostClassifier(random_state = 200))])
ada.fit(X_train, y_train)
y_pred_ada_train = ada.predict(X_train)
y_pred_ada_test = ada.predict(X_test)
y_proba_ada_train = ada.predict_proba(X_train)
y_proba_ada_test = ada.predict_proba(X_test)

end_time_ada = time.time()
ada_time = end_time_ada - start_time_ada

print(f"AdaBoost Runtime:  {ada_time:.2f} seconds")
print("")
print(f"Weighted F1 Score, AdaBoost Train: {f1_score(y_train, y_pred_ada_train, average = 'weighted')}")
print(f"Log Loss, AdaBoost Train:  {log_loss(y_train, y_proba_ada_train)}")
print(f"Weighted F1 Score, AdaBoost Test: {f1_score(y_test, y_pred_ada_test, average = 'weighted')}")
print(f"Log Loss, AdaBoost Test:  {log_loss(y_test, y_proba_ada_test)}")

AdaBoost Runtime:  214.59 seconds

Weighted F1 Score, AdaBoost Train: 0.5291930608118532
Log Loss, AdaBoost Train:  1.557688314123807
Weighted F1 Score, AdaBoost Test: 0.5303689962703744
Log Loss, AdaBoost Test:  1.5596262938722978


In [14]:
# Random Forest: 200 gini trees with no depth/leaf-count cap and a minimum of 4
# samples to split. Fit, predict labels and probabilities on both splits, then
# report weighted F1 and log loss. The whole cell is timed.
start_time_rfc = time.time()

rfc = Pipeline(steps=[
    ('rfc', RandomForestClassifier(
        n_estimators=200,
        criterion='gini',
        max_depth=None,
        max_leaf_nodes=None,
        max_samples=None,
        min_samples_split=4,
        min_samples_leaf=1,
        random_state=200,
    )),
])
rfc.fit(X_train, y_train)

y_pred_rfc_train, y_pred_rfc_test = rfc.predict(X_train), rfc.predict(X_test)
y_proba_rfc_train, y_proba_rfc_test = rfc.predict_proba(X_train), rfc.predict_proba(X_test)

end_time_rfc = time.time()
rfc_time = end_time_rfc - start_time_rfc

# One print call, one line per argument; the empty string is the blank line.
print(
    f"Random Forest Runtime:  {rfc_time:.2f} seconds",
    "",
    f"Weighted F1 Score, Random Forest Train: {f1_score(y_train, y_pred_rfc_train, average = 'weighted')}",
    f"Log Loss, Random Forest Train:  {log_loss(y_train, y_proba_rfc_train)}",
    f"Weighted F1 Score, Random Forest Test: {f1_score(y_test, y_pred_rfc_test, average = 'weighted')}",
    f"Log Loss, Random Forest Test:  {log_loss(y_test, y_proba_rfc_test)}",
    sep="\n",
)

Random Forest Runtime:  196.12 seconds

Weighted F1 Score, Random Forest Train: 0.9854428498834271
Log Loss, Random Forest Train:  0.2236946141191958
Weighted F1 Score, Random Forest Test: 0.7819321911037243
Log Loss, Random Forest Test:  0.6277176649319787


In [15]:
# XGBoost: 500 boosting rounds, depth-8 trees, learning rate 0.29775 and
# min_child_weight 2. Fit, predict labels and probabilities on both splits, then
# report weighted F1 and log loss. The whole cell is timed.
start_time_xgb = time.time()

xgb = Pipeline(steps=[
    ('xgb', XGBClassifier(
        n_estimators=500,
        max_depth=8,
        min_child_weight=2,
        learning_rate=0.29775,
        random_state=100,
    )),
])
xgb.fit(X_train, y_train)

y_pred_xgb_train, y_pred_xgb_test = xgb.predict(X_train), xgb.predict(X_test)
y_proba_xgb_train, y_proba_xgb_test = xgb.predict_proba(X_train), xgb.predict_proba(X_test)

end_time_xgb = time.time()
xgb_time = end_time_xgb - start_time_xgb

# One print call, one line per argument; the empty string is the blank line.
print(
    f"XGB Runtime:  {xgb_time:.2f} seconds",
    "",
    f"Weighted F1 Score, XGB Train: {f1_score(y_train, y_pred_xgb_train, average = 'weighted')}",
    f"Log Loss, XGB Train:  {log_loss(y_train, y_proba_xgb_train)}",
    f"Weighted F1 Score, XGB Test: {f1_score(y_test, y_pred_xgb_test, average = 'weighted')}",
    f"Log Loss, XGB Test:  {log_loss(y_test, y_proba_xgb_test)}",
    sep="\n",
)

XGB Runtime:  1758.27 seconds

Weighted F1 Score, XGB Train: 0.9477228463965388
Log Loss, XGB Train:  0.27168454377174867
Weighted F1 Score, XGB Test: 0.7795149067683812
Log Loss, XGB Test:  0.5828199685655608


In [16]:
# Extra Trees: 400 trees, sqrt feature sub-sampling, bootstrap enabled with each
# tree drawing half of the training rows (max_samples = 0.5). Fit, predict
# labels and probabilities on both splits, then report weighted F1 and log loss.
start_time_etc = time.time()

etc = Pipeline(steps=[
    ('etc', ExtraTreesClassifier(
        n_estimators=400,
        bootstrap=True,
        max_samples=0.5,
        max_features='sqrt',
        random_state=200,
    )),
])
etc.fit(X_train, y_train)

y_pred_etc_train, y_pred_etc_test = etc.predict(X_train), etc.predict(X_test)
y_proba_etc_train, y_proba_etc_test = etc.predict_proba(X_train), etc.predict_proba(X_test)

end_time_etc = time.time()
etc_time = end_time_etc - start_time_etc

# One print call, one line per argument; the empty string is the blank line.
print(
    f"Extra Trees Runtime:  {etc_time:.2f} seconds",
    "",
    f"Weighted F1 Score, Extra Trees Train: {f1_score(y_train, y_pred_etc_train, average = 'weighted')}",
    f"Log Loss, Extra Trees Train:  {log_loss(y_train, y_proba_etc_train)}",
    f"Weighted F1 Score, Extra Trees Test: {f1_score(y_test, y_pred_etc_test, average = 'weighted')}",
    f"Log Loss, Extra Trees Test:  {log_loss(y_test, y_proba_etc_test)}",
    sep="\n",
)

Extra Trees Runtime:  400.37 seconds

Weighted F1 Score, Extra Trees Train: 0.9814200136427785
Log Loss, Extra Trees Train:  0.3060257050998168
Weighted F1 Score, Extra Trees Test: 0.7804834437494838
Log Loss, Extra Trees Test:  0.6396806810788392


In [17]:
# Soft-voting ensemble over the Random Forest, XGBoost and Extra Trees
# pipelines, weighted 0.25 / 0.25 / 0.5. Fit, predict labels and probabilities
# on both splits, then report weighted F1 and log loss. The whole cell is timed.
start_time_avg = time.time()

avg = VotingClassifier(
    estimators=[
        ('rfc', rfc),
        ('xgb', xgb),
        ('etc', etc),
    ],
    voting='soft',
    weights=[0.25, 0.25, 0.5],
)
avg.fit(X_train, y_train)

y_pred_avg_train, y_pred_avg_test = avg.predict(X_train), avg.predict(X_test)
y_proba_avg_train, y_proba_avg_test = avg.predict_proba(X_train), avg.predict_proba(X_test)

end_time_avg = time.time()
avg_time = end_time_avg - start_time_avg

# One print call, one line per argument; the empty string is the blank line.
print(
    f"Average Voting Classifier Runtime:  {avg_time:.2f} seconds",
    "",
    f"Weighted F1 Score, Average Voting Classifier Train: {f1_score(y_train, y_pred_avg_train, average = 'weighted')}",
    f"Log Loss, Average Voting Classifier Train:  {log_loss(y_train, y_proba_avg_train)}",
    f"Weighted F1 Score, Average Voting Classifier Test: {f1_score(y_test, y_pred_avg_test, average = 'weighted')}",
    f"Log Loss, Average Voting Classifier Test:  {log_loss(y_test, y_proba_avg_test)}",
    sep="\n",
)

Average Voting Classifier Runtime:  4174.52 seconds

Weighted F1 Score, Average Voting Classifier Train: 0.9813963644952933
Log Loss, Average Voting Classifier Train:  0.2716736212586663
Weighted F1 Score, Average Voting Classifier Test: 0.7957843884139046
Log Loss, Average Voting Classifier Test:  0.5959754064663629


In [18]:
# Stacked ensemble: the Random Forest and XGBoost pipelines are the base
# learners, and the Extra Trees pipeline is the final estimator (it is not one
# of the base learners). Fit, predict labels and probabilities on both splits,
# then report weighted F1 and log loss. The whole cell is timed.
start_time_stack = time.time()

estimators = [
    ('rfc', rfc),
    ('xgb', xgb),
]

stack_clf = StackingClassifier(
    final_estimator=etc,
    estimators=estimators,
)
stack_clf.fit(X_train, y_train)

y_pred_stack_train, y_pred_stack_test = stack_clf.predict(X_train), stack_clf.predict(X_test)
y_proba_stack_train, y_proba_stack_test = stack_clf.predict_proba(X_train), stack_clf.predict_proba(X_test)

end_time_stack = time.time()
stack_time = end_time_stack - start_time_stack

# One print call, one line per argument; the empty string is the blank line.
print(
    f"Stacking Classifier Runtime:  {stack_time:.2f} seconds",
    "",
    f"Weighted F1 Score, Stacking Classifier Train: {f1_score(y_train, y_pred_stack_train, average = 'weighted')}",
    f"Log Loss, Stacking Classifier Train:  {log_loss(y_train, y_proba_stack_train)}",
    f"Weighted F1 Score, Stacking Classifier Test: {f1_score(y_test, y_pred_stack_test, average = 'weighted')}",
    f"Log Loss, Stacking Classifier Test:  {log_loss(y_test, y_proba_stack_test)}",
    sep="\n",
)

Stacking Classifier Runtime:  4174.52 seconds

Weighted F1 Score, Stacking Classifier Train: 0.9742016514410242
Log Loss, Stacking Classifier Train:  0.18996482630955172
Weighted F1 Score, Stacking Classifier Test: 0.7969076234123531
Log Loss, Stacking Classifier Test:  0.5742685261226821


In [20]:
# Re-report the stacking fit/predict wall time, converted from seconds to minutes.
print(
    f"Stacking Classifier Runtime:  {stack_time / 60:.2f} minutes"
)

Stacking Classifier Runtime:  321.99 minutes
