# Tuned XGBoost Modeling
Works similarly to the untuned model in xgboost_notuning.ipynb, with the addition of hyperparameter tuning through scikit-learn's GridSearchCV. This code runs for each participant over multiple prediction variables and exports the results. The start and end columns of the model inputs and the column to predict are specified through the "start", "end", and "topredict" variables.

Only those variables, `realdata`, and the name of the exported CSV need to be modified when running different tests :)

In [1]:
import pandas as pd 
import numpy as np
from numpy import loadtxt
import warnings

from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import multiprocessing

from sklearn.metrics import mean_absolute_percentage_error as MAPE
from sklearn.metrics import mean_squared_error as MSE
import matplotlib.pyplot as plt

In [2]:
# Dataset that basemodel() reads through the module-level name `realdata`.
_DEFAULT_DATASET = "C:\\Users\\namil\\Documents\\stmi-lab-namila\\allmetrics1"
realdata = pd.read_csv(_DEFAULT_DATASET)

# Score of the most recent run without snack removal; basemodel() reuses it
# when snack removal would not drop any rows. 0.111 is only the initial value.
savedval = 0.111

#run tuned model
def basemodel(participant, arr, start, end, topredict, removesnacks, nodinners):
    """Tune, fit and score an XGBoost regressor on the module-level ``realdata``.

    The rows of ``realdata`` are optionally filtered, split 75/25 into train
    and test sets (fixed seed), a grid search with 5-fold cross-validation
    picks the hyperparameters on the training set, and the winning model is
    scored on the held-out test set.

    Parameters
    ----------
    participant
        Not used by this function; kept so existing callers keep working.
    arr : list
        The score is appended to this list in addition to being returned.
    start, end
        Column labels delimiting the (inclusive) label slice of input columns.
    topredict
        Label of the column to predict; its values are cast to float.
    removesnacks
        Truthy to drop rows whose ``'snack in 3hrs'`` value is True.
    nodinners
        Truthy to drop rows whose ``'Meal Type'`` is ``"dinner"``.

    Returns
    -------
    float
        Root mean squared *relative* error on the test split. When
        ``removesnacks`` is set but no row is flagged as a snack, nothing is
        refit and the module-level ``savedval`` is appended/returned instead.
    """
    meals = realdata

    #remove specified entries
    if nodinners:
        meals = meals[meals['Meal Type'] != "dinner"]

    if removesnacks:
        has_snack = meals['snack in 3hrs'] == True  # noqa: E712 (element-wise)
        if not has_snack.any():
            #if no snack meals are removed, don't rerun
            arr.append(savedval)
            return savedval
        meals = meals[meals['snack in 3hrs'] == False]  # noqa: E712

    # Drop rows that carry the "error" sentinel string in either AUC column.
    valid = meals[meals['dexcom 3hr auc'] != "error"]
    valid = valid[valid['mets 3 hr auc'] != "error"]

    X = valid.loc[:, start:end].to_numpy()
    Y = valid[topredict].to_numpy().astype(float)

    # NumPy >= 1.25 exposes the warning in np.exceptions and NumPy 2.0 removed
    # the top-level alias, so look it up wherever it is available.
    visible_deprecation = getattr(np, "exceptions", np).VisibleDeprecationWarning
    warnings.filterwarnings('ignore', category=visible_deprecation)

    #split into train and test sets
    seed = 7
    test_size = 0.25
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=seed)

    # HYPERPARAM TUNING: define parameter grid and regressor
    param_grid = {
        'alpha': [0.0, 0.5, 1.0, 2.0],
        'learning_rate': [0.3, 0.1, 0.01, 0.001],  # Learning rate of the model
        'max_depth': [2, 3, 4, 5, 6],  # Maximum depth of each tree
        'n_estimators': [100, 200, 250],  # Number of trees in the ensemble
    }
    #set up grid search cross-validation
    grid_search = GridSearchCV(estimator=XGBRegressor(), param_grid=param_grid,
                               cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train)

    # GridSearchCV refits the best parameter combination on the whole training
    # set by default (refit=True), so a second manual fit is unnecessary.
    model = grid_search.best_estimator_
    prediction = model.predict(X_test)

#     # SHAP EXPLAINER
#     explainer = shap.Explainer(model.predict, X_test)
#     shap_values = explainer(X_train)
#     shap.plots.bar(shap_values)

    #sicong's normalized rmse
    # A zero target makes the relative error infinite; silence that warning
    # for this computation only instead of changing NumPy's global error state.
    with np.errstate(divide='ignore'):
        errors = ((y_test - prediction) / y_test) ** 2
    rmsre = (np.average(errors)) ** .5
    arr.append(rmsre)
    return rmsre

#run all combos of meal removal
def runfour(errors, start, end, topredict):
    """Run basemodel() for the four dinner/snack filter combinations.

    ``errors`` must hold four lists; list ``i`` receives the score of run ``i``:
    0 = no dinners, 1 = no dinners + no snacks, 2 = all meals,
    3 = all meals + no snacks. The score of each run without snack removal is
    stored in the module-level ``savedval`` so the following snack-removal run
    can reuse it. The participant passed to basemodel() is the module-level
    ``x``, which must already be defined when this function is called.
    """
    global savedval
    # (removesnacks, nodinners) in the order the runs must happen.
    combos = ((0, 1), (1, 1), (0, 0), (1, 0))
    for slot, (removesnacks, nodinners) in enumerate(combos):
        score = basemodel(x, errors[slot], start, end, topredict,
                          removesnacks, nodinners)
        if not removesnacks:
            savedval = score

In [3]:
#participants = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,26,27,28,29]
participants = [1]

# One container per prediction target. Each holds four independent lists,
# one per dinner/snack filter combination.
inputerrors = [[] for _ in range(4)]
inputerrors2 = [[] for _ in range(4)]
predpeakh = [[] for _ in range(4)]
predauc = [[] for _ in range(4)]
prediauc = [[] for _ in range(4)]
predpt = [[] for _ in range(4)]
predpd = [[] for _ in range(4)]
sbg = [[] for _ in range(4)]

# Run every prediction target for every participant. `x` is deliberately a
# module-level name: runfour() reads it as the current participant.
for x in participants:
    print(x)

    # No `global realdata` statement here: at module level it does nothing,
    # and it is a SyntaxError once the cells are concatenated into one script,
    # because `realdata` is already assigned earlier in the file.

    # Specify where to read data from
    #realdata = pd.read_csv(f"C:\\Users\\namil\\Documents\\stmi-lab-namila\\allmetricsn{x}")
    realdata = pd.read_csv("C:\\Users\\namil\\Documents\\stmi-lab-namila\\combined_dataset")

    #predicting macros
    start = 'peakheight'
    end = 'mets 3 hr avg'
    for bucket, target in ((inputerrors, 'carbs'), (inputerrors2, 'calories')):
        runfour(bucket, start, end, target)

    #predicting ppgr
    start = 'mets 3 hr auc'
    end = 'fat'
    ppgr_targets = (
        (predpeakh, 'peakheight'),
        (predauc, 'dexcom 3hr auc'),
        (prediauc, 'iauc'),
        (predpt, 'peaktime'),
        (predpd, 'peakduration_40'),
        (sbg, 'startbg'),
    )
    for bucket, target in ppgr_targets:
        runfour(bucket, start, end, target)

#export results
# Column names are "<label> <variant>"; the variant order matches the order in
# which runfour() fills the four lists of each container.
score_tables = (
    ('carbs', inputerrors),
    ('calories', inputerrors2),
    ('ph', predpeakh),
    ('auc', predauc),
    ('iauc', prediauc),
    ('pt', predpt),
    ('pd40', predpd),
    ('sbg', sbg),
)
variants = ('bl raw rmsre', 'bl removed rmsre', 'bld raw rmsre', 'bld removed rmsre')

columns = {'participants': participants}
for label, table in score_tables:
    for slot, variant in enumerate(variants):
        columns[f'{label} {variant}'] = table[slot]

df = pd.DataFrame(columns)
df.to_csv("combinedacthr")
print("done!")

1
done!
