# Modelling Selected Parameters at 0:00 + Day Phase

## HistGradientBoostingRegressor - Hyperparameter Tuning

In [1]:
import pandas as pd
import os

In [2]:
# Read the raw training CSV (its first column becomes the index) and pass it
# through the project pipeline imported from `pipelines`; the pipeline is
# fitted on this same frame. Per the original note, this step preprocesses and
# standardizes the data.
from pipelines import pipeline

# Four directory levels up from the working directory, then data/raw/train.csv.
data_file = os.path.join(*(['..'] * 4), 'data', 'raw', 'train.csv')
train_data = pipeline.fit_transform(
    pd.read_csv(data_file, index_col=0, low_memory=False)
)
train_data.head()

Unnamed: 0_level_0,bg-0:00,insulin-0:00,hr-0:00,cals-0:00,bg+1:00,day_phase_evening,day_phase_morning,day_phase_night,day_phase_noon
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
p01_0,2.275154,-0.152482,0.929993,-0.458394,13.4,False,True,False,False
p01_1,2.041687,-0.152482,0.929993,-0.458394,12.8,False,True,False,False
p01_2,1.874925,-0.152482,0.929993,-0.458394,15.5,False,True,False,False
p01_3,1.841572,-0.152482,0.929993,-0.458394,14.8,False,True,False,False
p01_4,1.708162,-0.152482,0.929993,-0.458394,12.7,False,True,False,False


In [3]:
# Separate the prediction target from the remaining feature columns.
target_column = 'bg+1:00'
y = train_data[target_column]
X = train_data.drop(columns=[target_column])

In [4]:
import numpy as np
from skopt import BayesSearchCV
from skopt.space import Integer, Real
from sklearn.ensemble import HistGradientBoostingRegressor

# Compatibility shim: NumPy 1.24 removed the deprecated `np.int` alias.
# Presumably the installed scikit-optimize still references it during the
# search -- TODO confirm and drop this once skopt is upgraded.
np.int = int

# One seed shared by the optimizer and the estimator so a re-run of this cell
# reproduces the same search.
RANDOM_STATE = 42

# Search space for the Bayesian optimisation; every key is a constructor
# argument of HistGradientBoostingRegressor.
param_space = {
    'learning_rate': Real(0.01, 1, prior='log-uniform'),
    'max_iter': Integer(50, 300),
    'max_leaf_nodes': Integer(31, 127),
    'max_depth': Integer(3, 7),
    'min_samples_leaf': Integer(20, 100),
    'l2_regularization': Real(0.0, 0.5),
    'max_bins': Integer(2, 255)
}

# The estimator is seeded too: its random_state governs the binning subsample
# and the train/validation split used when early stopping is active. Left
# unseeded, CV scores (and hence the selected parameters) can vary between
# runs even though the optimizer itself is seeded.
hgb_regressor = HistGradientBoostingRegressor(random_state=RANDOM_STATE)
opt = BayesSearchCV(
    estimator=hgb_regressor,
    search_spaces=param_space,
    n_iter=30,                         # number of parameter settings sampled
    scoring='neg_mean_squared_error',  # negated MSE: closer to 0 is better
    cv=5,
    n_jobs=-1,                         # use all available cores
    random_state=RANDOM_STATE
)

opt.fit(X=X, y=y)

# Get the best parameters and best score
best_params = opt.best_params_
best_score = opt.best_score_

print("Best parameters found: ", best_params)
print("Best score: ", best_score)

Best parameters found:  OrderedDict({'l2_regularization': 0.1402357937318292, 'learning_rate': 0.025405464905579496, 'max_bins': 255, 'max_depth': 3, 'max_iter': 300, 'max_leaf_nodes': 93, 'min_samples_leaf': 40})
Best score:  -4.548502806648197


In [5]:
# Instantiate a regressor configured with the tuned hyperparameters and pickle
# it to disk with joblib.
# NOTE(review): the object written here is a newly constructed estimator that
# has not been fitted in this cell. If a trained model is expected on disk,
# confirm whether training happens elsewhere or whether `opt.best_estimator_`
# should be saved instead.
import joblib

model_path = 'hgb.model.pkl'
hgb_regressor = HistGradientBoostingRegressor(**best_params)
joblib.dump(hgb_regressor, model_path)

['hgb.model.pkl']