In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from copy import deepcopy
import time
import random
%matplotlib inline

import nltk
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, f1_score, log_loss, accuracy_score

from xgboost import XGBClassifier

# English stop-word list from the NLTK corpus; Preprocessor.stem_doc filters tokens against it.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded beforehand — confirm.
stop_words = stopwords.words('english')

In [2]:
# Load the raw menu-item table from the project's Data directory.
# low_memory = False is passed through to the pandas CSV parser (per the pandas docs this
# disables chunked parsing, which avoids mixed-type column inference).
df_raw = pd.read_csv('Data/menu_items.csv', low_memory = False)

In [3]:
# Keep only the columns used for modelling, discard incomplete rows, and order
# the result from the most to the least sugary item with a fresh 0..n-1 index.
relevant_columns = ['Restaurant_Item_Name', 'restaurant', 'Item_Name', 'Item_Description', 'Food_Category', 'Sugar']
df_relevant = df_raw[relevant_columns].dropna()
df = (
    df_relevant
    .sort_values('Sugar', ascending = False)
    .reset_index(drop = True)
)

In [4]:
def sugar_classifier(sugar):
    """Map a sugar amount onto an ordinal class from 1 (none) to 5 (highest).

    Buckets:
        sugar == 0       -> 1
        0 < sugar <= 2   -> 2
        2 < sugar < 7    -> 3
        7 <= sugar < 30  -> 4
        sugar >= 30      -> 5

    Values that fall in no bucket (negative numbers, NaN) yield None.
    """
    # Test the lower bounds from the top bucket downwards; the first match wins,
    # so each check implicitly carries the upper bound of the bucket above it.
    if sugar >= 30:
        return 5
    if sugar >= 7:
        return 4
    if sugar > 2:
        return 3
    if sugar > 0:
        return 2
    if sugar == 0:
        return 1
    return None

# Label every row with its ordinal sugar class (1-5) derived from the 'Sugar' column,
# then display the frame to inspect the new column.
df['sugar_class'] = df['Sugar'].apply(sugar_classifier)
df

Unnamed: 0,Restaurant_Item_Name,restaurant,Item_Name,Item_Description,Food_Category,Sugar,sugar_class
0,"Dairy Queen Cookie Dough Blizzard Cake, 10 in",Dairy Queen,"Cookie Dough Blizzard Cake, 10 in","Cookie Dough Blizzard Cake, 10 in w/ Vanilla S...",Desserts,783.0,5
1,Dairy Queen Reeses Peanut Butter Cups Blizzard...,Dairy Queen,"Reeses Peanut Butter Cups Blizzard Cake, 10 in","Reeses Peanut Butter Cups Blizzard Cake, 10 in...",Desserts,737.0,5
2,"Dairy Queen Chocolate Xtreme Blizzard Cake, 10 in",Dairy Queen,"Chocolate Xtreme Blizzard Cake, 10 in","Chocolate Xtreme Blizzard Cake, 10 in w/ Brown...",Desserts,735.0,5
3,"Dairy Queen Oreo Blizzard Cake, 10 in",Dairy Queen,"Oreo Blizzard Cake, 10 in","Oreo Blizzard Cake, 10 in w/ Oreo Cookie Piece...",Desserts,720.0,5
4,"Dairy Queen DQ Round Cake, 10 in",Dairy Queen,"DQ Round Cake, 10 in","DQ Round Cake w/ Cake Crunch Filling, Chocolat...",Desserts,569.0,5
...,...,...,...,...,...,...,...
52926,Popeyes 6 Nuggets,Popeyes,6 Nuggets,"6 Nuggets, Tenders",Entrees,0.0,1
52927,"Popeyes Breast, Bonafide Spicy Chicken",Popeyes,"Breast, Bonafide Spicy Chicken","Breast, Bonafide Spicy Chicken",Entrees,0.0,1
52928,"Popeyes Thigh, Bonafide Spicy Chicken",Popeyes,"Thigh, Bonafide Spicy Chicken","Thigh, Bonafide Spicy Chicken, 300 Calories or...",Entrees,0.0,1
52929,"Popeyes Leg, Bonafide Spicy Chicken",Popeyes,"Leg, Bonafide Spicy Chicken","Leg, Bonafide Spicy Chicken, 200 Calories or U...",Entrees,0.0,1


In [5]:
#value_counts = df_relevant['restaurant'].value_counts()
#for value, count in value_counts.items():
    #print(f'{value}: {count}')

In [6]:
# Build one free-text feature by joining the descriptive columns with single
# spaces, then remove the source columns (and the raw 'Sugar' value, which is
# already encoded in 'sugar_class').
text_source_columns = ['Restaurant_Item_Name', 'Item_Name', 'Item_Description', 'Food_Category']
combined_text = df[text_source_columns[0]]
for column in text_source_columns[1:]:
    combined_text = combined_text + " " + df[column]
df['text'] = combined_text
df.drop(columns = text_source_columns + ['Sugar'], inplace = True)

In [7]:
def _strip_restaurant_tokens(text, restaurant_name):
    """Return ``text`` with every whitespace-separated word of ``restaurant_name`` removed.

    Removal is a plain substring replacement applied word by word, in order, so
    the restaurant's branding does not leak into the text feature.
    """
    for token in restaurant_name.split():
        text = text.replace(token, '')
    return text


# One pass over the two columns instead of iterrows() with per-token df.at writes.
# The 'restaurant' column is left as the original string (not overwritten with a
# token list), so re-running this cell is harmless; the column is dropped in the
# next cell anyway.
df['text'] = [
    _strip_restaurant_tokens(text, restaurant_name)
    for text, restaurant_name in zip(df['text'], df['restaurant'])
]

In [8]:
# The restaurant column was only needed to strip restaurant words out of 'text';
# drop it and display the final two-column frame (sugar_class, text).
df.drop(columns = 'restaurant', inplace = True)
df

Unnamed: 0,sugar_class,text
0,5,"Cookie Dough Blizzard Cake, 10 in Cookie Dou..."
1,5,"Reeses Peanut Butter Cups Blizzard Cake, 10 ..."
2,5,"Chocolate Xtreme Blizzard Cake, 10 in Chocol..."
3,5,"Oreo Blizzard Cake, 10 in Oreo Blizzard Cake..."
4,5,"DQ Round Cake, 10 in DQ Round Cake, 10 in DQ..."
...,...,...
52926,1,"6 Nuggets 6 Nuggets 6 Nuggets, Tenders Entrees"
52927,1,"Breast, Bonafide Spicy Chicken Breast, Bonafi..."
52928,1,"Thigh, Bonafide Spicy Chicken Thigh, Bonafide..."
52929,1,"Leg, Bonafide Spicy Chicken Leg, Bonafide Spi..."


In [9]:
#df.to_csv('prepared_text_data_sugar.csv', index = False)

In [10]:
class Preprocessor(BaseEstimator, TransformerMixin):
    """Scikit-learn compatible transformer that normalises raw text documents.

    Each document is tokenised, reduced to purely alphabetic tokens, lower-cased,
    filtered against the module-level ``stop_words`` and Snowball-stemmed. The
    surviving stems are re-joined with single spaces.
    """

    def __init__(self):
        # No hyperparameters to store.
        pass

    def fit(self, data, y=None):
        """Nothing is learned from ``data``; return the transformer itself."""
        return self

    def transform(self, data, y=None):
        """Return a list holding the cleaned form of every document in ``data``."""
        # Build the stemmer and the stop-word set once per call rather than once
        # per document / per token.
        stemmer = SnowballStemmer('english')
        stop_set = set(stop_words)
        preprocessed_data = [self.stem_doc(doc, stemmer, stop_set) for doc in data]
        return preprocessed_data

    def stem_doc(self, doc, stemmer=None, stop_set=None):
        """Clean a single document and return it as one space-joined string.

        Parameters
        ----------
        doc : str
            Raw document text.
        stemmer : optional
            Object with a ``stem(token)`` method. Defaults to a new English
            ``SnowballStemmer`` when omitted.
        stop_set : optional
            Container of lower-case words to discard. Defaults to the
            module-level ``stop_words`` when omitted.
        """
        if stemmer is None:
            stemmer = SnowballStemmer('english')
        if stop_set is None:
            stop_set = set(stop_words)
        # isalpha() is checked on the original token, before lower-casing.
        lower_doc = [token.lower() for token in word_tokenize(doc) if token.isalpha()]
        stemmed_doc = [stemmer.stem(token) for token in lower_doc if token not in stop_set]
        return " ".join(stemmed_doc)

In [11]:
# Features are the combined item text; the target is the ordinal sugar class.
X = df['text']
y = df['sugar_class']

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 100)

# Text cleaning (tokenise / filter / stem) is fitted on train and re-applied to test.
processor = Preprocessor()
X_train_transformed = processor.fit_transform(X_train_raw)
X_test_transformed = processor.transform(X_test_raw)

# TF-IDF vocabulary is learned from the training documents only.
vector_pipe = Pipeline([('tfidf', TfidfVectorizer())])
X_train_vector = vector_pipe.fit_transform(X_train_transformed)
X_test_vector = vector_pipe.transform(X_test_transformed)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older releases.
tfidf_step = vector_pipe['tfidf']
if hasattr(tfidf_step, 'get_feature_names_out'):
    feature_names = tfidf_step.get_feature_names_out()
else:
    feature_names = tfidf_step.get_feature_names()

# Dense, column-labelled frames are reused by every model in the following cells.
X_train = pd.DataFrame(X_train_vector.toarray(), columns = feature_names)
X_test = pd.DataFrame(X_test_vector.toarray(), columns = feature_names)

# Baseline model: multinomial Naive Bayes on the TF-IDF features.
multinb = MultinomialNB()
multinb.fit(X_train, y_train)
y_pred_baseline = multinb.predict(X_test)
y_pred_proba_baseline = multinb.predict_proba(X_test)

print(f"Weighted F1 Score: {f1_score(y_test, y_pred_baseline, average = 'weighted')}")
print(f"Log Loss:  {log_loss(y_test, y_pred_proba_baseline)}")

Weighted F1 Score: 0.6056999897451336
Log Loss:  0.9748097620598305


In [12]:
# Complement Naive Bayes on the same TF-IDF features, scored like the baseline.
compnb = ComplementNB().fit(X_train, y_train)
y_pred_compnb = compnb.predict(X_test)
y_pred_proba_compnb = compnb.predict_proba(X_test)

compnb_f1 = f1_score(y_test, y_pred_compnb, average = 'weighted')
compnb_log_loss = log_loss(y_test, y_pred_proba_compnb)
print(f"Weighted F1 Score: {compnb_f1}")
print(f"Log Loss:  {compnb_log_loss}")

Weighted F1 Score: 0.6021715305623653
Log Loss:  1.1526683451434967


In [14]:
# Draw a reproducible random subset of the prepared text data.
# The subset size and seed are explicit instead of being implied by hard-coded
# line counts, and the sample is seeded so a re-run yields the same rows.
SUBSET_SIZE = 10_000
SUBSET_SEED = 100

prepared_text = pd.read_csv('Data/prepared_text_data_sugar.csv')
subset = (
    prepared_text
    .sample(n = min(SUBSET_SIZE, len(prepared_text)), random_state = SUBSET_SEED)
    .sort_index()                # keep the rows in their original file order
    .reset_index(drop = True)
)

In [None]:
# Default XGBoost model followed by an exhaustive grid search; the timer spans both.
# NOTE(review): the class labels here are 1-5; some XGBoost releases expect labels
# starting at 0 — confirm against the installed version.
start_time_xgb = time.time()

# Untuned model, kept as the reference point for the tuned one.
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
y_pred_xgb_train = xgb.predict(X_train)
y_pred_xgb_test = xgb.predict(X_test)
y_proba_xgb_train = xgb.predict_proba(X_train)
y_proba_xgb_test = xgb.predict_proba(X_test)


# 4 * 8 * 2 * 3 = 192 parameter combinations, each cross-validated by GridSearchCV.
xgb_param_grid = {
    'learning_rate': [0.05, 0.1, 0.15, 0.2],
    'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
    'min_child_weight': [1, 2],
    'subsample': [0.5, 0.7, 1],
    'n_estimators': [100],
}

grid_xgb = GridSearchCV(xgb, xgb_param_grid, scoring = 'f1_weighted', n_jobs = 1)
grid_xgb.fit(X_train, y_train)
best_parameters = grid_xgb.best_params_

print('Grid Search found the following optimal parameters: ')
for param_name in sorted(best_parameters.keys()):
    print('%s: %r' % (param_name, best_parameters[param_name]))
print("")

# Predictions from the grid search object (it predicts with the best estimator found).
y_pred_xgb_param_train = grid_xgb.predict(X_train)
y_pred_xgb_param_test = grid_xgb.predict(X_test)
y_proba_xgb_param_train = grid_xgb.predict_proba(X_train)
y_proba_xgb_param_test = grid_xgb.predict_proba(X_test)

end_time_xgb = time.time()

print(f"Weighted F1 Score (Train): {f1_score(y_train, y_pred_xgb_train, average = 'weighted')}")
print(f"Weighted F1 Score (Test): {f1_score(y_test, y_pred_xgb_test, average = 'weighted')}")
print(f"Log Loss (Train):  {log_loss(y_train, y_proba_xgb_train)}")
print(f"Log Loss (Test):  {log_loss(y_test, y_proba_xgb_test)}")
print("")
print(f"Weighted F1 Score After Hyperparameter Tuning (Train): {f1_score(y_train, y_pred_xgb_param_train, average = 'weighted')}")
# Test predictions must be scored against y_test (the original compared them with y_train).
print(f"Weighted F1 Score After Hyperparameter Tuning (Test): {f1_score(y_test, y_pred_xgb_param_test, average = 'weighted')}")
print(f"Log Loss After Hyperparameter Tuning (Train):  {log_loss(y_train, y_proba_xgb_param_train)}")
print(f"Log Loss After Hyperparameter Tuning (Test):  {log_loss(y_test, y_proba_xgb_param_test)}")
print(f"Run Time: {end_time_xgb - start_time_xgb}")

In [31]:
# AdaBoost with default settings, evaluated on the held-out split.
ada = AdaBoostClassifier().fit(X_train, y_train)
y_pred_ada = ada.predict(X_test)
y_pred_proba_ada = ada.predict_proba(X_test)

ada_f1 = f1_score(y_test, y_pred_ada, average = 'weighted')
ada_log_loss = log_loss(y_test, y_pred_proba_ada)
print(f"Weighted F1 Score: {ada_f1}")
print(f"Log Loss:  {ada_log_loss}")

Weighted F1 Score: 0.5283394948236613
Log Loss:  1.5541493294978923


In [None]:
# Random forest with default settings, evaluated on the held-out split.
rfc = RandomForestClassifier().fit(X_train, y_train)
y_pred_rfc = rfc.predict(X_test)
y_pred_proba_rfc = rfc.predict_proba(X_test)

rfc_f1 = f1_score(y_test, y_pred_rfc, average = 'weighted')
rfc_log_loss = log_loss(y_test, y_pred_proba_rfc)
print(f"Weighted F1 Score: {rfc_f1}")
print(f"Log Loss:  {rfc_log_loss}")

In [None]:
# Hyperparameter grid for tuning the random forest.
# TODO: 'max_leaf_nodes' was left without candidate values, which made this cell a
# syntax error. [None] is the scikit-learn default (no limit on leaf nodes) and
# keeps the grid valid until real candidates are chosen.
rfc_param_grid = {
    'max_depth': [6],
    'max_leaf_nodes': [None],
}