In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from copy import deepcopy
import time
import random
%matplotlib inline

from scipy.stats import randint, uniform

import nltk
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, BaggingClassifier, VotingClassifier, ExtraTreesClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, f1_score, log_loss, accuracy_score

from xgboost import XGBClassifier

stop_words = stopwords.words('english')

In [15]:
df_raw = pd.read_csv('Data/menu_items.csv', low_memory = False)

In [16]:
relevant_columns = ['Restaurant_Item_Name', 'restaurant', 'Item_Name', 'Item_Description', 'Food_Category', 'Sugar']
df_relevant = df_raw.loc[:, relevant_columns]
df_relevant.dropna(inplace = True)
df = df_relevant.sort_values(by = 'Sugar', ascending = False)
df = df.reset_index(drop = True)

In [17]:
def sugar_classifier(sugar):
    if sugar >= 30:
        return 5
    elif (sugar < 30) & (sugar >= 7):
        return 4
    elif (sugar < 7) & (sugar > 2):
        return 3
    elif (sugar <= 2) & (sugar > 0):
        return 2
    elif sugar == 0:
        return 1

df['sugar_class'] = df['Sugar'].apply(sugar_classifier)
df

Unnamed: 0,Restaurant_Item_Name,restaurant,Item_Name,Item_Description,Food_Category,Sugar,sugar_class
0,"Dairy Queen Cookie Dough Blizzard Cake, 10 in",Dairy Queen,"Cookie Dough Blizzard Cake, 10 in","Cookie Dough Blizzard Cake, 10 in w/ Vanilla S...",Desserts,783.0,5
1,Dairy Queen Reeses Peanut Butter Cups Blizzard...,Dairy Queen,"Reeses Peanut Butter Cups Blizzard Cake, 10 in","Reeses Peanut Butter Cups Blizzard Cake, 10 in...",Desserts,737.0,5
2,"Dairy Queen Chocolate Xtreme Blizzard Cake, 10 in",Dairy Queen,"Chocolate Xtreme Blizzard Cake, 10 in","Chocolate Xtreme Blizzard Cake, 10 in w/ Brown...",Desserts,735.0,5
3,"Dairy Queen Oreo Blizzard Cake, 10 in",Dairy Queen,"Oreo Blizzard Cake, 10 in","Oreo Blizzard Cake, 10 in w/ Oreo Cookie Piece...",Desserts,720.0,5
4,"Dairy Queen DQ Round Cake, 10 in",Dairy Queen,"DQ Round Cake, 10 in","DQ Round Cake w/ Cake Crunch Filling, Chocolat...",Desserts,569.0,5
...,...,...,...,...,...,...,...
52926,Popeyes 6 Nuggets,Popeyes,6 Nuggets,"6 Nuggets, Tenders",Entrees,0.0,1
52927,"Popeyes Breast, Bonafide Spicy Chicken",Popeyes,"Breast, Bonafide Spicy Chicken","Breast, Bonafide Spicy Chicken",Entrees,0.0,1
52928,"Popeyes Thigh, Bonafide Spicy Chicken",Popeyes,"Thigh, Bonafide Spicy Chicken","Thigh, Bonafide Spicy Chicken, 300 Calories or...",Entrees,0.0,1
52929,"Popeyes Leg, Bonafide Spicy Chicken",Popeyes,"Leg, Bonafide Spicy Chicken","Leg, Bonafide Spicy Chicken, 200 Calories or U...",Entrees,0.0,1


In [18]:
value_counts = df_relevant['restaurant'].value_counts()
for value, count in value_counts.items():
    print(f'{value}: {count}')

In [19]:
df['text'] = df['Restaurant_Item_Name'] + " " + df['Item_Name'] + " " + df['Item_Description'] + " " + df['Food_Category']
df.drop(columns = ['Restaurant_Item_Name', 'Item_Name', 'Item_Description', 'Food_Category', 'Sugar'], inplace = True)

In [20]:
df['restaurant'] = df['restaurant'].str.split()
for index, row in df.iterrows():
    for string in row['restaurant']:
        df.at[index, 'text'] = df.at[index, 'text'].replace(string, '')

In [21]:
df.drop(columns = 'restaurant', inplace = True)
df

Unnamed: 0,sugar_class,text
0,5,"Cookie Dough Blizzard Cake, 10 in Cookie Dou..."
1,5,"Reeses Peanut Butter Cups Blizzard Cake, 10 ..."
2,5,"Chocolate Xtreme Blizzard Cake, 10 in Chocol..."
3,5,"Oreo Blizzard Cake, 10 in Oreo Blizzard Cake..."
4,5,"DQ Round Cake, 10 in DQ Round Cake, 10 in DQ..."
...,...,...
52926,1,"6 Nuggets 6 Nuggets 6 Nuggets, Tenders Entrees"
52927,1,"Breast, Bonafide Spicy Chicken Breast, Bonafi..."
52928,1,"Thigh, Bonafide Spicy Chicken Thigh, Bonafide..."
52929,1,"Leg, Bonafide Spicy Chicken Leg, Bonafide Spi..."


In [22]:
df.to_csv('prepared_text_data_sugar.csv', index = False)

In [23]:
class Preprocessor(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, data, y=None):
        return self

    def transform(self, data, y=None):
        preprocessed_data = [self.stem_doc(doc) for doc in data]
        return preprocessed_data

    def stem_doc(self, doc):
        stemmer = SnowballStemmer('english')
        lower_doc = [token.lower() for token in word_tokenize(doc) if token.isalpha()]
        filtered_doc = [token for token in lower_doc if token not in stop_words]
        stemmed_doc = [stemmer.stem(token) for token in filtered_doc]
        return " ".join(stemmed_doc)

In [24]:
X = df['text']
y = df['sugar_class']

X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 200)
processor = Preprocessor()
X_train_transformed = processor.fit_transform(X_train_raw)
X_test_transformed = processor.transform(X_test_raw)

vector_pipe = Pipeline([('tfidf', TfidfVectorizer())])
X_train_vector = vector_pipe.fit_transform(X_train_transformed)
X_test_vector = vector_pipe.transform(X_test_transformed)
X_train = pd.DataFrame(X_train_vector.toarray(), columns = vector_pipe['tfidf'].get_feature_names())
X_test = pd.DataFrame(X_test_vector.toarray(), columns = vector_pipe['tfidf'].get_feature_names())

multinb = MultinomialNB()
multinb.fit(X_train, y_train)
y_pred_baseline = multinb.predict(X_test)
y_pred_proba_baseline = multinb.predict_proba(X_test)

print(f"Weighted F1 Score: {f1_score(y_test, y_pred_baseline, average = 'weighted')}")
print(f"Log Loss:  {log_loss(y_test, y_pred_proba_baseline)}")

Weighted F1 Score: 0.5982513713262635
Log Loss:  0.9766313775364627


In [13]:
compnb = ComplementNB()
compnb.fit(X_train, y_train)
y_pred_compnb = compnb.predict(X_test)
y_pred_proba_compnb = compnb.predict_proba(X_test)

print(f"Weighted F1 Score: {f1_score(y_test, y_pred_compnb, average = 'weighted')}")
print(f"Log Loss:  {log_loss(y_test, y_pred_proba_compnb)}")

Weighted F1 Score: 0.6021715305623653
Log Loss:  1.1526683451434967


In [29]:
lines_to_skip = sorted(random.sample(range(1, 52932), 32931))
subset = pd.read_csv('Data/prepared_text_data_sugar.csv', skiprows = lines_to_skip)

X_sub = subset['text']
y_sub = subset['Sugar_Class']
X_train_sub_raw, X_test_sub_raw, y_train_sub, y_test_sub = train_test_split(X_sub, y_sub, test_size = 0.2, random_state = 100)
X_train_transformed_sub = processor.fit_transform(X_train_sub_raw)
X_test_transformed_sub = processor.transform(X_test_sub_raw)

vector_pipe_sub = Pipeline([('tfidf', TfidfVectorizer())])
X_train_vector_sub = vector_pipe_sub.fit_transform(X_train_transformed_sub)
X_test_vector_sub = vector_pipe_sub.transform(X_test_transformed_sub)
X_train_sub = pd.DataFrame(X_train_vector_sub.toarray(), columns = vector_pipe_sub['tfidf'].get_feature_names())
X_test_sub = pd.DataFrame(X_test_vector_sub.toarray(), columns = vector_pipe_sub['tfidf'].get_feature_names())

In [30]:
start_time_xgb = time.time()

xgb = XGBClassifier()
xgb.fit(X_train_sub, y_train_sub)
y_pred_xgb_train = xgb.predict(X_train_sub)
y_pred_xgb_test = xgb.predict(X_test_sub)
y_proba_xgb_train = xgb.predict_proba(X_train_sub)
y_proba_xgb_test = xgb.predict_proba(X_test_sub)

xgb_params = {
    'learning_rate': uniform(0.1, 0.2),
    'max_depth': randint(3, 9),
    'min_child_weight': randint(1, 3),
}

search_xgb = RandomizedSearchCV(xgb, xgb_params, scoring = 'f1_weighted', n_jobs = 1)
search_xgb.fit(X_train_sub, y_train_sub)
best_parameters = search_xgb.best_params_

print('Randomized Search found the following optimal parameters: ')
for param_name in sorted(best_parameters.keys()):
    print('%s: %r' % (param_name, best_parameters[param_name]))
print("")

y_pred_xgb_param_train = search_xgb.predict(X_train_sub)
y_pred_xgb_param_test = search_xgb.predict(X_test_sub)
y_proba_xgb_param_train = search_xgb.predict_proba(X_train_sub)
y_proba_xgb_param_test = search_xgb.predict_proba(X_test_sub)

end_time_xgb = time.time()

print(f"Weighted F1 Score (Train): {f1_score(y_train_sub, y_pred_xgb_train, average = 'weighted')}")
print(f"Weighted F1 Score (Test): {f1_score(y_test_sub, y_pred_xgb_test, average = 'weighted')}")
print(f"Log Loss (Train):  {log_loss(y_train_sub, y_proba_xgb_train)}")
print(f"Log Loss (Test):  {log_loss(y_test_sub, y_proba_xgb_test)}")
print("")
print(f"Weighted F1 Score After Hyperparameter Tuning (Train): {f1_score(y_train_sub, y_pred_xgb_param_train, average = 'weighted')}")
print(f"Weighted F1 Score After Hyperparameter Tuning (Test): {f1_score(y_test_sub, y_pred_xgb_param_test, average = 'weighted')}")
print(f"Log Loss After Hyperparameter Tuning (Train):  {log_loss(y_train_sub, y_proba_xgb_param_train)}")
print(f"Log Loss After Hyperparameter Tuning (Test):  {log_loss(y_test_sub, y_proba_xgb_param_test)}")
print(f"Run Time: {end_time_xgb - start_time_xgb}")

Randomized Search found the following optimal parameters: 
learning_rate: 0.2977513564547608
max_depth: 8
min_child_weight: 2

Weighted F1 Score (Train): 0.8400278977452622
Weighted F1 Score (Test): 0.6952111329477347
Log Loss (Train):  0.5694687150081663
Log Loss (Test):  0.782527227344457

Weighted F1 Score After Hyperparameter Tuning (Train): 0.8729975145964162
Weighted F1 Score After Hyperparameter Tuning (Test): 0.7067383656667726
Log Loss After Hyperparameter Tuning (Train):  0.49105750232531137
Log Loss After Hyperparameter Tuning (Test):  0.7490659579653876
Run Time: 6923.436443805695


In [26]:
ada = AdaBoostClassifier()
ada.fit(X_train, y_train)
y_pred_ada = ada.predict(X_test)
y_pred_proba_ada = ada.predict_proba(X_test)

print(f"Weighted F1 Score: {f1_score(y_test, y_pred_ada, average = 'weighted')}")
print(f"Log Loss:  {log_loss(y_test, y_pred_proba_ada)}")

Weighted F1 Score: 0.5283394948236613
Log Loss:  1.5541493294978928


In [49]:
start_time_rfc = time.time()

rfc = RandomForestClassifier()
rfc.fit(X_train_sub, y_train_sub)
y_pred_rfc_train = rfc.predict(X_train_sub)
y_pred_rfc_test = rfc.predict(X_test_sub)
y_proba_rfc_train = rfc.predict_proba(X_train_sub)
y_proba_rfc_test = rfc.predict_proba(X_test_sub)

rfc_params = {
    'n_estimators':  [10, 50, 100],
    'max_depth':  randint(3, 9),
    'min_samples_split':  randint(2, 4),
    'min_samples_leaf':  randint(1, 3),
}

search_rfc = RandomizedSearchCV(rfc, rfc_params, scoring = 'f1_weighted', n_jobs = 1)
search_rfc.fit(X_train_sub, y_train_sub)
best_parameters = search_rfc.best_params_

print('Randomized Search found the following optimal parameters: ')
for param_name in sorted(best_parameters.keys()):
    print('%s: %r' % (param_name, best_parameters[param_name]))
print("")

y_pred_rfc_param_train = search_rfc.predict(X_train_sub)
y_pred_rfc_param_test = search_rfc.predict(X_test_sub)
y_proba_rfc_param_train = search_rfc.predict_proba(X_train_sub)
y_proba_rfc_param_test = search_rfc.predict_proba(X_test_sub)

end_time_rfc = time.time()

print(f"Weighted F1 Score (Train): {f1_score(y_train_sub, y_pred_rfc_train, average = 'weighted')}")
print(f"Weighted F1 Score (Test): {f1_score(y_test_sub, y_pred_rfc_test, average = 'weighted')}")
print(f"Log Loss (Train):  {log_loss(y_train_sub, y_proba_rfc_train)}")
print(f"Log Loss (Test):  {log_loss(y_test_sub, y_proba_rfc_test)}")
print("")
print(f"Weighted F1 Score After Hyperparameter Tuning (Train): {f1_score(y_train_sub, y_pred_rfc_param_train, average = 'weighted')}")
print(f"Weighted F1 Score After Hyperparameter Tuning (Test): {f1_score(y_test_sub, y_pred_rfc_param_test, average = 'weighted')}")
print(f"Log Loss After Hyperparameter Tuning (Train):  {log_loss(y_train_sub, y_proba_rfc_param_train)}")
print(f"Log Loss After Hyperparameter Tuning (Test):  {log_loss(y_test_sub, y_proba_rfc_param_test)}")
print(f"Run Time: {end_time_rfc - start_time_rfc}")

Randomized Search found the following optimal parameters: 
max_depth: 8
min_samples_leaf: 1
min_samples_split: 2
n_estimators: 100

Weighted F1 Score (Train): 0.9931866485409155
Weighted F1 Score (Test): 0.726367897119568
Log Loss (Train):  0.18638262212047357
Log Loss (Test):  0.7592780225634445

Weighted F1 Score After Hyperparameter Tuning (Train): 0.46278718366778837
Weighted F1 Score After Hyperparameter Tuning (Test): 0.454376778264133
Log Loss After Hyperparameter Tuning (Train):  1.291941409039672
Log Loss After Hyperparameter Tuning (Test):  1.3010797081137913
Run Time: 237.08182501792908


In [50]:
start_time_rfc1 = time.time()

rfc1 = RandomForestClassifier(n_estimators = 200,
                              criterion = 'gini',
                              max_depth = None,
                              min_samples_split = 4,
                              min_samples_leaf = 1,
                              max_leaf_nodes = None)
rfc1.fit(X_train, y_train)
y_pred_rfc1_train = rfc1.predict(X_train)
y_pred_rfc1_test = rfc1.predict(X_test)
y_proba_rfc1_train = rfc1.predict_proba(X_train)
y_proba_rfc1_test = rfc1.predict_proba(X_test)

end_time_rfc1 = time.time()

print(f"Run Time: {end_time_rfc1 - start_time_rfc1}")
print(f"Weighted F1 Score (Train): {f1_score(y_train, y_pred_rfc1_train, average = 'weighted')}")
print(f"Weighted F1 Score (Test): {f1_score(y_test, y_pred_rfc1_test, average = 'weighted')}")
print(f"Log Loss (Train):  {log_loss(y_train, y_proba_rfc1_train)}")
print(f"Log Loss (Test):  {log_loss(y_test, y_proba_rfc1_test)}")

Run Time: 205.5940968990326
Weighted F1 Score (Train): 0.9459045001884633
Weighted F1 Score (Test): 0.7475568522988201
Log Loss (Train):  0.414485587412872
Log Loss (Test):  0.6888136677530962


In [50]:
start_time_xgb = time.time()

xgb = XGBClassifier(n_estimators = 10, learning_rate = 0.29775, max_depth = 8, min_child_weight = 2, random_state = 100)
xgb.fit(X_train, y_train)
y_pred_xgb_train = xgb.predict(X_train)
y_pred_xgb_test = xgb.predict(X_test)
y_proba_xgb_train = xgb.predict_proba(X_train)
y_proba_xgb_test = xgb.predict_proba(X_test)

end_time_xgb = time.time()

print(f"Weighted F1 Score (Train): {f1_score(y_train, y_pred_xgb_train, average = 'weighted')}")
print(f"Weighted F1 Score (Test): {f1_score(y_test, y_pred_xgb_test, average = 'weighted')}")
print(f"Log Loss (Train):  {log_loss(y_train, y_proba_xgb_train)}")
print(f"Log Loss (Test):  {log_loss(y_test, y_proba_xgb_test)}")
print(f"Run Time: {end_time_xgb - start_time_xgb}")

Weighted F1 Score (Train): 0.8363684539787087
Weighted F1 Score (Test): 0.7274706111825796
Log Loss (Train):  0.5402181994621008
Log Loss (Test):  0.7095042726206793
Run Time: 347.4807481765747


In [51]:
start_time_xgb = time.time()

xgb = XGBClassifier(n_estimators = 50, learning_rate = 0.29775, max_depth = 8, min_child_weight = 2, random_state = 100)
xgb.fit(X_train, y_train)
y_pred_xgb_train = xgb.predict(X_train)
y_pred_xgb_test = xgb.predict(X_test)
y_proba_xgb_train = xgb.predict_proba(X_train)
y_proba_xgb_test = xgb.predict_proba(X_test)

end_time_xgb = time.time()

print(f"Weighted F1 Score (Train): {f1_score(y_train, y_pred_xgb_train, average = 'weighted')}")
print(f"Weighted F1 Score (Test): {f1_score(y_test, y_pred_xgb_test, average = 'weighted')}")
print(f"Log Loss (Train):  {log_loss(y_train, y_proba_xgb_train)}")
print(f"Log Loss (Test):  {log_loss(y_test, y_proba_xgb_test)}")
print(f"Run Time: {end_time_xgb - start_time_xgb}")

Weighted F1 Score (Train): 0.7861715069951298
Weighted F1 Score (Test): 0.6996941067106328
Log Loss (Train):  0.6465076149790803
Log Loss (Test):  0.7721565087923342
Run Time: 191.12715482711792


In [52]:
start_time_xgb = time.time()

xgb = XGBClassifier(n_estimators = 100, learning_rate = 0.29775, max_depth = 8, min_child_weight = 2, random_state = 100)
xgb.fit(X_train, y_train)
y_pred_xgb_train = xgb.predict(X_train)
y_pred_xgb_test = xgb.predict(X_test)
y_proba_xgb_train = xgb.predict_proba(X_train)
y_proba_xgb_test = xgb.predict_proba(X_test)

end_time_xgb = time.time()

print(f"Weighted F1 Score (Train): {f1_score(y_train, y_pred_xgb_train, average = 'weighted')}")
print(f"Weighted F1 Score (Test): {f1_score(y_test, y_pred_xgb_test, average = 'weighted')}")
print(f"Log Loss (Train):  {log_loss(y_train, y_proba_xgb_train)}")
print(f"Log Loss (Test):  {log_loss(y_test, y_proba_xgb_test)}")
print(f"Run Time: {end_time_xgb - start_time_xgb}")

Weighted F1 Score (Train): 0.8363684539787087
Weighted F1 Score (Test): 0.7274706111825796
Log Loss (Train):  0.5402181994621008
Log Loss (Test):  0.7095042726206793
Run Time: 388.44012236595154


In [53]:
start_time_xgb = time.time()

xgb = XGBClassifier(n_estimators = 150, learning_rate = 0.29775, max_depth = 8, min_child_weight = 2, random_state = 100)
xgb.fit(X_train, y_train)
y_pred_xgb_train = xgb.predict(X_train)
y_pred_xgb_test = xgb.predict(X_test)
y_proba_xgb_train = xgb.predict_proba(X_train)
y_proba_xgb_test = xgb.predict_proba(X_test)

end_time_xgb = time.time()

print(f"Weighted F1 Score (Train): {f1_score(y_train, y_pred_xgb_train, average = 'weighted')}")
print(f"Weighted F1 Score (Test): {f1_score(y_test, y_pred_xgb_test, average = 'weighted')}")
print(f"Log Loss (Train):  {log_loss(y_train, y_proba_xgb_train)}")
print(f"Log Loss (Test):  {log_loss(y_test, y_proba_xgb_test)}")
print(f"Run Time: {end_time_xgb - start_time_xgb}")

Weighted F1 Score (Train): 0.8666329393072114
Weighted F1 Score (Test): 0.740821573576041
Log Loss (Train):  0.4751014555525105
Log Loss (Test):  0.6757501598639375
Run Time: 528.4391038417816


In [55]:
start_time_etc = time.time()

etc = ExtraTreesClassifier(max_features = 'sqrt',
                         max_samples = 0.5,
                         bootstrap = True,
                         random_state = 100)
etc.fit(X_train, y_train)
y_pred_etc_train = etc.predict(X_train)
y_pred_etc_test = etc.predict(X_test)
y_proba_etc_train = etc.predict_proba(X_train)
y_proba_etc_test = etc.predict_proba(X_test)

end_time_etc = time.time()

print(f"Weighted F1 Score (Train): {f1_score(y_train, y_pred_etc_train, average = 'weighted')}")
print(f"Weighted F1 Score (Test): {f1_score(y_test, y_pred_etc_test, average = 'weighted')}")
print(f"Log Loss (Train):  {log_loss(y_train, y_proba_etc_train)}")
print(f"Log Loss (Test):  {log_loss(y_test, y_proba_etc_test)}")
print(f"Run Time: {end_time_etc - start_time_etc}")

Weighted F1 Score (Train): 0.979248789252413
Weighted F1 Score (Test): 0.7687169973325381
Log Loss (Train):  0.31491471134624954
Log Loss (Test):  0.6881945138963503
Run Time: 97.14397621154785


In [56]:
start_time_xgb = time.time()

xgb = XGBClassifier(n_estimators = 200, learning_rate = 0.29775, max_depth = 8, min_child_weight = 2, random_state = 100)
xgb.fit(X_train, y_train)
y_pred_xgb_train = xgb.predict(X_train)
y_pred_xgb_test = xgb.predict(X_test)
y_proba_xgb_train = xgb.predict_proba(X_train)
y_proba_xgb_test = xgb.predict_proba(X_test)

end_time_xgb = time.time()

print(f"Weighted F1 Score (Train): {f1_score(y_train, y_pred_xgb_train, average = 'weighted')}")
print(f"Weighted F1 Score (Test): {f1_score(y_test, y_pred_xgb_test, average = 'weighted')}")
print(f"Log Loss (Train):  {log_loss(y_train, y_proba_xgb_train)}")
print(f"Log Loss (Test):  {log_loss(y_test, y_proba_xgb_test)}")
print(f"Run Time: {end_time_xgb - start_time_xgb}")

Weighted F1 Score (Train): 0.8878741622932251
Weighted F1 Score (Test): 0.7473048244180484
Log Loss (Train):  0.42613900739648186
Log Loss (Test):  0.6531871223705179
Run Time: 1301.7270367145538


In [57]:
start_time_xgb = time.time()

xgb = XGBClassifier(n_estimators = 250, learning_rate = 0.29775, max_depth = 8, min_child_weight = 2, random_state = 100)
xgb.fit(X_train, y_train)
y_pred_xgb_train = xgb.predict(X_train)
y_pred_xgb_test = xgb.predict(X_test)
y_proba_xgb_train = xgb.predict_proba(X_train)
y_proba_xgb_test = xgb.predict_proba(X_test)

end_time_xgb = time.time()

print(f"Weighted F1 Score (Train): {f1_score(y_train, y_pred_xgb_train, average = 'weighted')}")
print(f"Weighted F1 Score (Test): {f1_score(y_test, y_pred_xgb_test, average = 'weighted')}")
print(f"Log Loss (Train):  {log_loss(y_train, y_proba_xgb_train)}")
print(f"Log Loss (Test):  {log_loss(y_test, y_proba_xgb_test)}")
print(f"Run Time: {end_time_xgb - start_time_xgb}")

Weighted F1 Score (Train): 0.905086228975606
Weighted F1 Score (Test): 0.7540658157886949
Log Loss (Train):  0.3878126523060198
Log Loss (Test):  0.6383257557702969
Run Time: 986.0501399040222


In [58]:
start_time_xgb = time.time()

xgb = XGBClassifier(n_estimators = 300, learning_rate = 0.29775, max_depth = 8, min_child_weight = 2, random_state = 100)
xgb.fit(X_train, y_train)
y_pred_xgb_train = xgb.predict(X_train)
y_pred_xgb_test = xgb.predict(X_test)
y_proba_xgb_train = xgb.predict_proba(X_train)
y_proba_xgb_test = xgb.predict_proba(X_test)

end_time_xgb = time.time()

print(f"Weighted F1 Score (Train): {f1_score(y_train, y_pred_xgb_train, average = 'weighted')}")
print(f"Weighted F1 Score (Test): {f1_score(y_test, y_pred_xgb_test, average = 'weighted')}")
print(f"Log Loss (Train):  {log_loss(y_train, y_proba_xgb_train)}")
print(f"Log Loss (Test):  {log_loss(y_test, y_proba_xgb_test)}")
print(f"Run Time: {end_time_xgb - start_time_xgb}")

Weighted F1 Score (Train): 0.9170080121344002
Weighted F1 Score (Test): 0.7585179756516903
Log Loss (Train):  0.3556067710529828
Log Loss (Test):  0.6256514843201355
Run Time: 1043.0394687652588


In [59]:
start_time_xgb = time.time()

xgb = XGBClassifier(n_estimators = 350, learning_rate = 0.29775, max_depth = 8, min_child_weight = 2, random_state = 100)
xgb.fit(X_train, y_train)
y_pred_xgb_train = xgb.predict(X_train)
y_pred_xgb_test = xgb.predict(X_test)
y_proba_xgb_train = xgb.predict_proba(X_train)
y_proba_xgb_test = xgb.predict_proba(X_test)

end_time_xgb = time.time()

print(f"Weighted F1 Score (Train): {f1_score(y_train, y_pred_xgb_train, average = 'weighted')}")
print(f"Weighted F1 Score (Test): {f1_score(y_test, y_pred_xgb_test, average = 'weighted')}")
print(f"Log Loss (Train):  {log_loss(y_train, y_proba_xgb_train)}")
print(f"Log Loss (Test):  {log_loss(y_test, y_proba_xgb_test)}")
print(f"Run Time: {end_time_xgb - start_time_xgb}")

Weighted F1 Score (Train): 0.928884180773971
Weighted F1 Score (Test): 0.7625226375181446
Log Loss (Train):  0.32737022889591566
Log Loss (Test):  0.6157804221548738
Run Time: 1100.0064754486084


In [60]:
start_time_xgb = time.time()

xgb = XGBClassifier(n_estimators = 400, learning_rate = 0.29775, max_depth = 8, min_child_weight = 2, random_state = 100)
xgb.fit(X_train, y_train)
y_pred_xgb_train = xgb.predict(X_train)
y_pred_xgb_test = xgb.predict(X_test)
y_proba_xgb_train = xgb.predict_proba(X_train)
y_proba_xgb_test = xgb.predict_proba(X_test)

end_time_xgb = time.time()

print(f"Weighted F1 Score (Train): {f1_score(y_train, y_pred_xgb_train, average = 'weighted')}")
print(f"Weighted F1 Score (Test): {f1_score(y_test, y_pred_xgb_test, average = 'weighted')}")
print(f"Log Loss (Train):  {log_loss(y_train, y_proba_xgb_train)}")
print(f"Log Loss (Test):  {log_loss(y_test, y_proba_xgb_test)}")
print(f"Run Time: {end_time_xgb - start_time_xgb}")

Weighted F1 Score (Train): 0.9372284327543249
Weighted F1 Score (Test): 0.7645371487927606
Log Loss (Train):  0.3036385798768231
Log Loss (Test):  0.6093739166162941
Run Time: 1256.4366989135742


In [61]:
start_time_xgb = time.time()

xgb = XGBClassifier(n_estimators = 500, learning_rate = 0.29775, max_depth = 8, min_child_weight = 2, random_state = 100)
xgb.fit(X_train, y_train)
y_pred_xgb_train = xgb.predict(X_train)
y_pred_xgb_test = xgb.predict(X_test)
y_proba_xgb_train = xgb.predict_proba(X_train)
y_proba_xgb_test = xgb.predict_proba(X_test)

end_time_xgb = time.time()

print(f"Weighted F1 Score (Train): {f1_score(y_train, y_pred_xgb_train, average = 'weighted')}")
print(f"Weighted F1 Score (Test): {f1_score(y_test, y_pred_xgb_test, average = 'weighted')}")
print(f"Log Loss (Train):  {log_loss(y_train, y_proba_xgb_train)}")
print(f"Log Loss (Test):  {log_loss(y_test, y_proba_xgb_test)}")
print(f"Run Time: {end_time_xgb - start_time_xgb}")

Weighted F1 Score (Train): 0.9516040046818818
Weighted F1 Score (Test): 0.7666405882305194
Log Loss (Train):  0.26181900007080255
Log Loss (Test):  0.6002008883437594
Run Time: 1566.1028113365173
