In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from copy import deepcopy
import time
import random
%matplotlib inline

from scipy.stats import randint, uniform

import nltk
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, BaggingClassifier, VotingClassifier, ExtraTreesClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, f1_score, log_loss

from xgboost import XGBClassifier

import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Input, Dropout, LeakyReLU
from keras.callbacks import EarlyStopping
from keras.utils import to_categorical
from keras.regularizers import l2
from keras.optimizers import Adam

In [None]:
def sugar_classifier(sugar):
    """Bucket a sugar amount into an ordinal class between 1 and 5.

    Bands: ``0`` -> 1, ``(0, 2]`` -> 2, ``(2, 7)`` -> 3, ``[7, 30)`` -> 4,
    ``>= 30`` -> 5.

    Returns:
        The class as an int, or ``None`` when the value matches no band
        (negative numbers and NaN fail every comparison below).
    """
    # Checked from the highest band down, so each test only needs its lower bound.
    if sugar >= 30:
        return 5
    if sugar >= 7:
        return 4
    if sugar > 2:
        return 3
    if sugar > 0:
        return 2
    if sugar == 0:
        return 1
    return None

In [None]:
stop_words = stopwords.words('english')
# Frozen snapshot of the list above: constant-time membership tests instead of
# scanning the whole list for every token.
_STOP_WORD_SET = frozenset(stop_words)


class Preprocessor(BaseEstimator, TransformerMixin):
    """Scikit-learn compatible transformer that normalises raw text documents.

    Each document is tokenised with ``word_tokenize``; purely alphabetic tokens
    are lower-cased, English stop words are removed, the remainder is stemmed
    with the English Snowball stemmer and re-joined with single spaces.
    """

    # One stemmer shared by every document; the original rebuilt it on each
    # ``stem_doc`` call even though it is always configured identically.
    _stemmer = SnowballStemmer('english')

    def __init__(self):
        pass

    def fit(self, data, y=None):
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    def transform(self, data, y=None):
        """Return a list holding the cleaned string for every document in ``data``."""
        return [self.stem_doc(doc) for doc in data]

    def stem_doc(self, doc):
        """Clean a single document and return it as one space-joined string."""
        lowered = (token.lower() for token in word_tokenize(doc) if token.isalpha())
        kept = (token for token in lowered if token not in _STOP_WORD_SET)
        return " ".join(self._stemmer.stem(token) for token in kept)

In [None]:
start_time_read = time.time()

relevant_columns = ['Restaurant_Item_Name', 'restaurant', 'Item_Name', 'Item_Description', 'Food_Category', 'Sugar']

df_raw = pd.read_csv('Data/menu_items.csv', low_memory = False)

# Keep only the modelling columns and discard rows with any missing value.
# dropna() returns a new frame, so no slice of df_raw is mutated in place
# (the original called dropna(inplace=True) on the .loc selection).
df_relevant = df_raw.loc[:, relevant_columns].dropna()

# Highest-sugar rows first, with a fresh 0..n-1 index.
df = (
    df_relevant
    .sort_values(by = 'Sugar', ascending = False)
    .reset_index(drop = True)
)

# Ordinal target produced by sugar_classifier for each row's 'Sugar' value.
df['sugar_class'] = df['Sugar'].apply(sugar_classifier)

end_time_read = time.time()
read_time = end_time_read - start_time_read

print(f"Data Reading Runtime:  {read_time:.2f} seconds")
df

In [None]:
# Number of rows per restaurant, printed one "name: count" pair per line.
value_counts = df['restaurant'].value_counts()
for value, count in zip(value_counts.index, value_counts):
    print(f'{value}: {count}')

In [None]:
def _strip_restaurant_words(text, restaurant_words):
    """Return ``text`` with every occurrence of each restaurant-name word deleted."""
    for word in restaurant_words:
        text = text.replace(word, '')
    return text


start_time_clean = time.time()

# One free-text field per item, built from the four descriptive columns.
df['text'] = df['Restaurant_Item_Name'] + " " + df['Item_Name'] + " " + df['Item_Description'] + " " + df['Food_Category']

# Remove the whitespace-separated words of the restaurant name from the text.
# A single pass over plain Python values replaces the original
# iterrows() loop with per-cell .at reads and writes; the result is the same.
df['text'] = [
    _strip_restaurant_words(text, restaurant_words)
    for text, restaurant_words in zip(df['text'], df['restaurant'].str.split())
]

# Only 'sugar_class' and 'text' remain after this.
df = df.drop(columns = ['Restaurant_Item_Name', 'Item_Name', 'Item_Description', 'Food_Category', 'Sugar', 'restaurant'])

end_time_clean = time.time()
clean_time = end_time_clean - start_time_clean

print(f"Data Cleaning Runtime:  {clean_time:.2f} seconds")
df

In [None]:
# Write the current frame to CSV in the working directory; index = False leaves the row index out of the file.
df.to_csv('prepared_text_data_sugar.csv', index = False)

In [None]:
# `data` was never defined in this notebook; the prepared frame lives in `df`.
data = df
X = data['text']
y = data['sugar_class']

#One-Hot_Encoding target variable, train/test split
# `sparse` was renamed to `sparse_output` in newer scikit-learn releases and
# later removed, so try the new keyword first and fall back to the old one.
# NOTE(review): drop='first' encodes the lowest class as an all-zero row,
# giving a 4-column target that matches the Dense(4) softmax output below.
# A softmax output cannot represent an all-zero target, so confirm this is intended.
try:
    ohe = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(drop='first', sparse=False)
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(X, y, test_size = 0.2, random_state = 200)
y_train_raw = y_train_raw.values.reshape(-1, 1)
y_test_raw = y_test_raw.values.reshape(-1, 1)
# Labels are shifted down by one before encoding; the encoder is fit on the
# training labels only and reused for the test labels.
y_train = ohe.fit_transform(y_train_raw - 1)
y_test = ohe.transform(y_test_raw - 1)

#Pre-processing and vectorizing text
processor = Preprocessor()
X_train_transformed = processor.fit_transform(X_train_raw)
X_test_transformed = processor.transform(X_test_raw)
vector_pipe = Pipeline([('tfidf', TfidfVectorizer())])
X_train_vector = vector_pipe.fit_transform(X_train_transformed)
X_test_vector = vector_pipe.transform(X_test_transformed)

#Returning independent variables to pd.Dataframe
# get_feature_names() was removed from newer scikit-learn releases in favour of
# get_feature_names_out(); use whichever the installed version provides.
tfidf = vector_pipe['tfidf']
feature_names = tfidf.get_feature_names_out() if hasattr(tfidf, 'get_feature_names_out') else tfidf.get_feature_names()
X_train = pd.DataFrame(X_train_vector.toarray(), columns = feature_names)
X_test = pd.DataFrame(X_test_vector.toarray(), columns = feature_names)

In [None]:
#Ensuring X_train and X_test are proper data types
X_train_array = X_train.values if isinstance(X_train, pd.DataFrame) else X_train
X_test_array = X_test.values if isinstance(X_test, pd.DataFrame) else X_test

# Training target for the network. The original read an undefined
# `y_train_encoded` and took argmax over axis 2, which a 2-D matrix does not have.
# The one-hot `y_train` from the encoding cell is the target whose shape matches
# the categorical_crossentropy loss and softmax output used by the model.
# NOTE(review): confirm this is the intended target.
y_train_reshaped = np.asarray(y_train)

#
def f1_metric(y_true, y_pred):
    """Mean per-column F1 score of rounded predictions against the targets.

    Predictions are rounded to the nearest integer, then true positives,
    predicted positives and actual positives are summed over axis 0 (one total
    per column). Precision, recall and F1 are computed per column, with the
    Keras epsilon added to every denominator to avoid division by zero, and the
    mean of the per-column F1 values is returned.

    Args:
        y_true: target tensor; assumes 0/1 indicator columns (TODO confirm).
        y_pred: prediction tensor of the same shape as ``y_true``.

    Returns:
        A scalar tensor holding the averaged F1 value.
    """
    eps = tf.keras.backend.epsilon()

    # Cast both operands before multiplying so integer targets cannot clash
    # with floating-point predictions (the original multiplied first).
    y_true = tf.cast(y_true, tf.float32)
    y_pred = tf.cast(tf.round(y_pred), tf.float32)

    true_positives = tf.reduce_sum(y_true * y_pred, axis=0)
    predicted_positives = tf.reduce_sum(y_pred, axis=0)
    actual_positives = tf.reduce_sum(y_true, axis=0)

    precision = true_positives / (predicted_positives + eps)
    recall = true_positives / (actual_positives + eps)

    f1 = 2 * precision * recall / (precision + recall + eps)
    return tf.reduce_mean(f1)

#
# L2 weight penalty applied to the hidden layer's kernel.
reg = l2(0.0001)

# Adam optimiser with every hyper-parameter spelled out.
opt = Adam(
    learning_rate=0.001,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-07,
)

# Early stopping driven by the training loss: patience of 5 epochs, min_delta of 1e-6.
trainCallback = EarlyStopping(
    monitor='loss',
    min_delta=1e-6,
    patience=5,
)

In [None]:
# Single-hidden-layer network: 1800 LeakyReLU units with the L2 kernel
# regulariser, followed by a 4-unit softmax output.
n_features = X_train_array.shape[1]
model = Sequential([
    Dense(1800, activation = LeakyReLU(), input_shape = (n_features,), kernel_regularizer = reg),
    Dense(4, activation = 'softmax'),
])
model.compile(
    loss = 'categorical_crossentropy',
    optimizer = opt,
    metrics = [f1_metric],
)
# 20% of the training rows are held out for validation.
model.fit(
    X_train_array,
    y_train_reshaped,
    epochs = 15,
    batch_size = 128,
    validation_split = 0.2,
    callbacks = [trainCallback],
)

In [None]:
start_time_preprocess = time.time()

X = df['text']
y = df['sugar_class']

X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(X, y, test_size = 0.2, random_state = 200)

# The classical models below are fit and scored on `y_train` / `y_test`, which
# this cell never defined. They silently reused the one-hot matrices left over
# from the neural-network section. Bind them to the plain class labels here.
y_train = y_train_raw
y_test = y_test_raw

processor = Preprocessor()
X_train_transformed = processor.fit_transform(X_train_raw)
X_test_transformed = processor.transform(X_test_raw)

# TF-IDF vocabulary is learned from the training documents only.
vector_pipe = Pipeline([('tfidf', TfidfVectorizer())])
X_train_vector = vector_pipe.fit_transform(X_train_transformed)
X_test_vector = vector_pipe.transform(X_test_transformed)

# get_feature_names() was removed from newer scikit-learn releases in favour of
# get_feature_names_out(); use whichever the installed version provides.
tfidf = vector_pipe['tfidf']
feature_names = tfidf.get_feature_names_out() if hasattr(tfidf, 'get_feature_names_out') else tfidf.get_feature_names()
X_train = pd.DataFrame(X_train_vector.toarray(), columns = feature_names)
X_test = pd.DataFrame(X_test_vector.toarray(), columns = feature_names)

end_time_preprocess = time.time()
preprocess_time = end_time_preprocess - start_time_preprocess

print(f"Preprocessing Runtime:  {preprocess_time}")
X_train

In [None]:
start_time_nb = time.time()

# Baseline: multinomial Naive Bayes on the TF-IDF features.
multinb = Pipeline(steps = [('multinb', MultinomialNB())]).fit(X_train, y_train)
y_pred_baseline_train, y_pred_baseline_test = multinb.predict(X_train), multinb.predict(X_test)
y_proba_baseline_train, y_proba_baseline_test = multinb.predict_proba(X_train), multinb.predict_proba(X_test)

# Complement Naive Bayes on the same features, for comparison.
compnb = Pipeline(steps = [('compnb', ComplementNB())]).fit(X_train, y_train)
y_pred_compnb_train, y_pred_compnb_test = compnb.predict(X_train), compnb.predict(X_test)
y_proba_compnb_train, y_proba_compnb_test = compnb.predict_proba(X_train), compnb.predict_proba(X_test)

end_time_nb = time.time()
nb_time = end_time_nb - start_time_nb

# Report: runtime, then weighted F1 and log loss per model and split.
print(f"Naive-Bayes Runtime:  {nb_time:.2f} seconds")
print("")
print(f"Weighted F1 Score, Multinomial NB Train: {f1_score(y_train, y_pred_baseline_train, average = 'weighted')}")
print(f"Log Loss, Multinomial NB Train:  {log_loss(y_train, y_proba_baseline_train)}")
print(f"Weighted F1 Score, Multinomial NB Test: {f1_score(y_test, y_pred_baseline_test, average = 'weighted')}")
print(f"Log Loss, Multinomial NB Test:  {log_loss(y_test, y_proba_baseline_test)}")
print("")
print(f"Weighted F1 Score, Complement NB Train: {f1_score(y_train, y_pred_compnb_train, average = 'weighted')}")
print(f"Log Loss, Complement NB Train:  {log_loss(y_train, y_proba_compnb_train)}")
print(f"Weighted F1 Score, Complement NB Test: {f1_score(y_test, y_pred_compnb_test, average = 'weighted')}")
print(f"Log Loss, Complement NB Test:  {log_loss(y_test, y_proba_compnb_test)}")

In [None]:
start_time_ada = time.time()

# AdaBoost with its default settings, wrapped in a one-step pipeline.
ada = Pipeline(steps = [('ada', AdaBoostClassifier())]).fit(X_train, y_train)
y_pred_ada_train, y_pred_ada_test = ada.predict(X_train), ada.predict(X_test)
y_proba_ada_train, y_proba_ada_test = ada.predict_proba(X_train), ada.predict_proba(X_test)

end_time_ada = time.time()
ada_time = end_time_ada - start_time_ada

# Report: runtime, then weighted F1 and log loss for each split.
print(f"AdaBoost Runtime:  {ada_time:.2f} seconds")
print("")
print(f"Weighted F1 Score, AdaBoost Train: {f1_score(y_train, y_pred_ada_train, average = 'weighted')}")
print(f"Log Loss, AdaBoost Train:  {log_loss(y_train, y_proba_ada_train)}")
print(f"Weighted F1 Score, AdaBoost Test: {f1_score(y_test, y_pred_ada_test, average = 'weighted')}")
print(f"Log Loss, AdaBoost Test:  {log_loss(y_test, y_proba_ada_test)}")

In [None]:
start_time_rfc = time.time()

# Random forest hyper-parameters, gathered in one place.
rfc_params = dict(
    n_estimators = 200,
    criterion = 'gini',
    max_depth = None,
    min_samples_split = 4,
    min_samples_leaf = 1,
    max_leaf_nodes = None,
    max_samples = None,
    random_state = 200,
)
rfc = Pipeline(steps = [('rfc', RandomForestClassifier(**rfc_params))]).fit(X_train, y_train)
y_pred_rfc_train, y_pred_rfc_test = rfc.predict(X_train), rfc.predict(X_test)
y_proba_rfc_train, y_proba_rfc_test = rfc.predict_proba(X_train), rfc.predict_proba(X_test)

end_time_rfc = time.time()
rfc_time = end_time_rfc - start_time_rfc

# Report: runtime, then weighted F1 and log loss for each split.
print(f"Random Forest Runtime:  {rfc_time:.2f} seconds")
print("")
print(f"Weighted F1 Score, Random Forest Train: {f1_score(y_train, y_pred_rfc_train, average = 'weighted')}")
print(f"Log Loss, Random Forest Train:  {log_loss(y_train, y_proba_rfc_train)}")
print(f"Weighted F1 Score, Random Forest Test: {f1_score(y_test, y_pred_rfc_test, average = 'weighted')}")
print(f"Log Loss, Random Forest Test:  {log_loss(y_test, y_proba_rfc_test)}")

In [None]:
start_time_xgb = time.time()

# Gradient-boosted trees hyper-parameters, gathered in one place.
# NOTE(review): some xgboost releases require class labels to be 0..k-1;
# verify that the label range in y_train is accepted by the installed version.
xgb_params = dict(
    n_estimators = 500,
    learning_rate = 0.29775,
    max_depth = 8,
    min_child_weight = 2,
    random_state = 100,
)
xgb = Pipeline(steps = [('xgb', XGBClassifier(**xgb_params))])
xgb.fit(X_train, y_train)
y_pred_xgb_train, y_pred_xgb_test = xgb.predict(X_train), xgb.predict(X_test)
y_proba_xgb_train, y_proba_xgb_test = xgb.predict_proba(X_train), xgb.predict_proba(X_test)

end_time_xgb = time.time()
xgb_time = end_time_xgb - start_time_xgb

# Report: runtime, then weighted F1 and log loss for each split.
print(f"XGB Runtime:  {xgb_time:.2f} seconds")
print("")
print(f"Weighted F1 Score, XGB Train: {f1_score(y_train, y_pred_xgb_train, average = 'weighted')}")
print(f"Log Loss, XGB Train:  {log_loss(y_train, y_proba_xgb_train)}")
print(f"Weighted F1 Score, XGB Test: {f1_score(y_test, y_pred_xgb_test, average = 'weighted')}")
print(f"Log Loss, XGB Test:  {log_loss(y_test, y_proba_xgb_test)}")

In [None]:
start_time_etc = time.time()

# Extra-trees hyper-parameters: bootstrap is on, and each tree is fit with max_samples = 0.5.
etc_params = dict(
    n_estimators = 400,
    max_features = 'sqrt',
    max_samples = 0.5,
    bootstrap = True,
    random_state = 200,
)
etc = Pipeline(steps = [('etc', ExtraTreesClassifier(**etc_params))]).fit(X_train, y_train)
y_pred_etc_train, y_pred_etc_test = etc.predict(X_train), etc.predict(X_test)
y_proba_etc_train, y_proba_etc_test = etc.predict_proba(X_train), etc.predict_proba(X_test)

end_time_etc = time.time()
etc_time = end_time_etc - start_time_etc

# Report: runtime, then weighted F1 and log loss for each split.
print(f"Extra Trees Runtime:  {etc_time:.2f} seconds")
print("")
print(f"Weighted F1 Score, Extra Trees Train: {f1_score(y_train, y_pred_etc_train, average = 'weighted')}")
print(f"Log Loss, Extra Trees Train:  {log_loss(y_train, y_proba_etc_train)}")
print(f"Weighted F1 Score, Extra Trees Test: {f1_score(y_test, y_pred_etc_test, average = 'weighted')}")
print(f"Log Loss, Extra Trees Test:  {log_loss(y_test, y_proba_etc_test)}")

In [None]:
start_time_avg = time.time()

# Soft-voting ensemble of the three tree pipelines; the extra-trees member
# carries half of the total weight.
avg = VotingClassifier(
    estimators = [('rfc', rfc), ('xgb', xgb), ('etc', etc)],
    voting = 'soft',
    weights = [0.25, 0.25, 0.5],
)
avg.fit(X_train, y_train)
y_pred_avg_train, y_pred_avg_test = avg.predict(X_train), avg.predict(X_test)
y_proba_avg_train, y_proba_avg_test = avg.predict_proba(X_train), avg.predict_proba(X_test)

end_time_avg = time.time()
avg_time = end_time_avg - start_time_avg

# Report: runtime, then weighted F1 and log loss for each split.
print(f"Average Voting Classifier Runtime:  {avg_time:.2f} seconds")
print("")
print(f"Weighted F1 Score, Average Voting Classifier Train: {f1_score(y_train, y_pred_avg_train, average = 'weighted')}")
print(f"Log Loss, Average Voting Classifier Train:  {log_loss(y_train, y_proba_avg_train)}")
print(f"Weighted F1 Score, Average Voting Classifier Test: {f1_score(y_test, y_pred_avg_test, average = 'weighted')}")
print(f"Log Loss, Average Voting Classifier Test:  {log_loss(y_test, y_proba_avg_test)}")

In [None]:
start_time_stack = time.time()

# Base learners: the random-forest and XGB pipelines. The extra-trees pipeline
# is not a base learner here; it serves as the final estimator instead.
estimators = [('rfc', rfc), ('xgb', xgb)]

stack_clf = StackingClassifier(
    estimators = estimators,
    final_estimator = etc,
)
stack_clf.fit(X_train, y_train)
y_pred_stack_train, y_pred_stack_test = stack_clf.predict(X_train), stack_clf.predict(X_test)
y_proba_stack_train, y_proba_stack_test = stack_clf.predict_proba(X_train), stack_clf.predict_proba(X_test)

end_time_stack = time.time()
stack_time = end_time_stack - start_time_stack

# Report: runtime, then weighted F1 and log loss for each split.
print(f"Stacking Classifier Runtime:  {stack_time:.2f} seconds")
print("")
print(f"Weighted F1 Score, Stacking Classifier Train: {f1_score(y_train, y_pred_stack_train, average = 'weighted')}")
print(f"Log Loss, Stacking Classifier Train:  {log_loss(y_train, y_proba_stack_train)}")
print(f"Weighted F1 Score, Stacking Classifier Test: {f1_score(y_test, y_pred_stack_test, average = 'weighted')}")
print(f"Log Loss, Stacking Classifier Test:  {log_loss(y_test, y_proba_stack_test)}")