# Modelling Selected Parameters at 0:00 + Day Phase

## Setup

In [1]:
# Importing the libraries
import os
import pandas as pd

## Data Preprocessing

In [2]:
# Read the raw training CSV and pass it straight through the project's
# preprocessing pipeline; only the preprocessed frame is kept.
from pipelines import preprocessing_pipeline

# Location of the raw file, relative to this notebook's working directory.
data_file = os.path.join('..', '..', '..', '..', 'data', 'raw', 'train.csv')

# The first CSV column becomes the index (row ids such as "p01_0" in the output).
train_data = preprocessing_pipeline.fit_transform(
    pd.read_csv(data_file, index_col=0, low_memory=False)
)

# Preview the first rows of the preprocessed data.
train_data.head()

Unnamed: 0_level_0,day_phase,bg-0:00,insulin-0:00,hr-0:00,cals-0:00,bg+1:00
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
p01_0,morning,15.1,0.0417,99.6,4.8,13.4
p01_1,morning,14.4,0.0417,99.6,4.8,12.8
p01_2,morning,13.9,0.0417,99.6,4.8,15.5
p01_3,morning,13.8,0.0417,99.6,4.8,14.8
p01_4,morning,13.4,0.0417,99.6,4.8,12.7


## Data Split

In [3]:
from sklearn.model_selection import train_test_split

# Column holding the value to predict; every other column is a feature.
target_column = 'bg+1:00'

y = train_data[target_column]
X = train_data.drop(columns=[target_column])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
# NOTE(review): rows are split individually, while ids like p01_0, p01_1 look like
# consecutive readings from one patient - confirm a row-level split is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Standardization

In [4]:
from pipelines import standardization_pipeline

# Fit the project's standardization pipeline on the training split only ...
X_train = standardization_pipeline.fit_transform(X_train)
# ... then apply the already-fitted pipeline to the held-out split, so nothing
# is fitted on the test rows.
X_test = standardization_pipeline.transform(X_test)

# NOTE(review): X_train / X_test are rebound in place, so re-running this cell
# without first re-running the split cell would transform already-transformed data.
X_train.head()

Unnamed: 0_level_0,bg-0:00,insulin-0:00,hr-0:00,cals-0:00,day_phase_evening,day_phase_morning,day_phase_night,day_phase_noon
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
p12_24040,-0.826509,-0.069842,-0.532922,-0.201556,False,False,False,True
p12_13267,-0.459549,-0.193942,-0.809441,-0.201556,False,True,False,False
p03_131,1.875652,-0.003623,-0.66094,-0.852188,False,False,False,False
p12_20861,-0.993309,-0.193942,1.489761,0.652399,False,False,False,False
p12_24627,-0.426189,-0.193942,0.870154,0.066043,False,False,False,False


## Model Selection with LazyPredict

In [5]:
from notebooks.helpers.LazyPredict import get_lazy_regressor

# Project helper that builds the regressor-comparison object
# (presumably a configured lazypredict LazyRegressor - TODO confirm).
reg = get_lazy_regressor()
# Fits every candidate model on the training split and scores it on the test split.
# Slow cell: the recorded run took about 14.5 minutes for 39 models.
models, predictions = reg.fit(X_train, X_test, y_train, y_test)
# Display the per-model score table (Adjusted R-Squared, R-Squared, RMSE, Time Taken).
models

 97%|█████████▋| 38/39 [14:29<00:50, 50.26s/it]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000599 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 975
[LightGBM] [Info] Number of data points in the train set: 141619, number of used features: 4
[LightGBM] [Info] Start training from score 8.276012


100%|██████████| 39/39 [14:29<00:00, 22.30s/it]


Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
XGBRegressor,0.54,0.54,2.03,0.35
LGBMRegressor,0.53,0.53,2.06,0.65
HistGradientBoostingRegressor,0.53,0.53,2.07,1.28
GradientBoostingRegressor,0.51,0.51,2.1,7.67
MLPRegressor,0.5,0.5,2.12,21.25
BaggingRegressor,0.49,0.49,2.14,3.52
SVR,0.49,0.49,2.15,474.25
PoissonRegressor,0.48,0.48,2.17,0.03
BayesianRidge,0.48,0.48,2.18,0.02
RidgeCV,0.48,0.48,2.18,0.03


The best models (lowest RMSE and highest R-Squared on the held-out test split) are:

* XGBRegressor
* LGBMRegressor
* HistGradientBoostingRegressor

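For further work I choose the following models (RidgeCV is included even though it ranks below the top three in the table above):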

* XGBRegressor
* HistGradientBoostingRegressor
* RidgeCV