# Modelling II

In this notebook, we will:
1. Load the preprocessed data
2. Preprocess the data
3. Feature engineering (day phase, p_num)
4. Model selection

In [1]:
import os

import numpy as np
import pandas as pd

data_file = os.path.join('..', '..', 'data', 'interim', 'all_train.csv')
test_file = os.path.join('..', '..', 'data', 'raw', 'test.csv')
df = pd.read_csv(data_file)
df.head()

Unnamed: 0.1,Unnamed: 0,p_num,days_since_start,time,initial_resolution,bg,insulin,carbs,hr,steps,cals,activity,bg+1:00
0,2020-01-01 00:15:00,p01,0,00:15:00,15min,,0.0083,,,,,,
1,2020-01-01 00:20:00,p01,0,00:20:00,15min,,0.0083,,,,,,
2,2020-01-01 00:25:00,p01,0,00:25:00,15min,9.6,0.0083,,,,,,
3,2020-01-01 00:30:00,p01,0,00:30:00,15min,,0.0083,,,,,,
4,2020-01-01 00:35:00,p01,0,00:35:00,15min,,0.0083,,,,,,


# Data Preprocessing

## 1. Select relevant columns

* p_num: patient number
* time: time of the day
* bg: blood glucose level at time t
* bg+1:00: blood glucose level after 1 hour

In [2]:
df = df[['p_num', 'time', 'bg', 'bg+1:00']]
df.head()

Unnamed: 0,p_num,time,bg,bg+1:00
0,p01,00:15:00,,
1,p01,00:20:00,,
2,p01,00:25:00,9.6,
3,p01,00:30:00,,
4,p01,00:35:00,,


# Clean Data

## Interpolate missing values in bg column and drop rows with missing values

In [3]:
df['bg'] = df['bg'].interpolate(method='linear').ffill().bfill()
df = df.dropna()
df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 177024 entries, 71 to 235126
Data columns (total 4 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   p_num    177024 non-null  object 
 1   time     177024 non-null  object 
 2   bg       177024 non-null  float64
 3   bg+1:00  177024 non-null  float64
dtypes: float64(2), object(2)
memory usage: 6.8+ MB


# Feature Engineering

## 1. Create Day Phase feature

In [4]:
from src.features.transformers import DayPhaseTransformer

day_phase_transformer = DayPhaseTransformer(time_column='time', time_format='%H:%M:%S', result_column='day_phase',
                                            drop_time_column=True)
df = day_phase_transformer.fit_transform(X=df)
df.head()

Unnamed: 0,p_num,day_phase,bg,bg+1:00
71,p01,morning,15.1,13.4
74,p01,morning,14.4,12.8
77,p01,morning,13.9,15.5
80,p01,morning,13.8,14.8
83,p01,morning,13.4,12.7


# Model selection

## 1. Split the data into train and test sets

In [17]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

X = df.drop(columns=['bg+1:00'])
y = df['bg+1:00']
X = pd.get_dummies(X, columns=['day_phase'], drop_first=True, prefix='day_phase')

one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
transformed = one_hot_encoder.fit_transform(X[['p_num']])
df_transformed = pd.DataFrame(transformed, columns=one_hot_encoder.get_feature_names_out(['p_num']))
df_transformed = df_transformed.set_index(X.index)
X = X.merge(df_transformed, left_index=True, right_index=True)
X = X.drop(columns=['p_num'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train

Unnamed: 0,bg,day_phase_evening,day_phase_morning,day_phase_night,day_phase_noon,p_num_p01,p_num_p02,p_num_p03,p_num_p04,p_num_p05,p_num_p06,p_num_p10,p_num_p11,p_num_p12
233810,5.8,False,False,False,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
222551,6.9,False,True,False,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
52900,13.9,False,False,False,False,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
230443,5.3,False,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
234455,7.0,False,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175647,5.7,False,False,True,False,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
159275,6.1,False,True,False,False,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
187950,8.3,False,False,True,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
203614,6.3,False,True,False,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [19]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
numerical_columns = X_train.select_dtypes(include=np.number).columns
X_test[numerical_columns] = scaler.fit_transform(X_test[numerical_columns])
X_train[numerical_columns] = scaler.transform(X_train[numerical_columns])

X_train

Unnamed: 0,bg,day_phase_evening,day_phase_morning,day_phase_night,day_phase_noon,p_num_p01,p_num_p02,p_num_p03,p_num_p04,p_num_p05,p_num_p06,p_num_p10,p_num_p11,p_num_p12
233810,-0.828627,False,False,False,True,-0.223958,-0.410277,-0.413420,-0.404481,-0.220529,-0.218976,-0.413186,-0.405662,2.454831
222551,-0.462043,False,True,False,False,-0.223958,-0.410277,-0.413420,-0.404481,-0.220529,-0.218976,-0.413186,-0.405662,2.454831
52900,1.870760,False,False,False,False,-0.223958,-0.410277,2.418845,-0.404481,-0.220529,-0.218976,-0.413186,-0.405662,-0.407360
230443,-0.995255,False,False,False,False,-0.223958,-0.410277,-0.413420,-0.404481,-0.220529,-0.218976,-0.413186,-0.405662,2.454831
234455,-0.428717,False,False,False,False,-0.223958,-0.410277,-0.413420,-0.404481,-0.220529,-0.218976,-0.413186,-0.405662,2.454831
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175647,-0.861952,False,False,True,False,-0.223958,-0.410277,-0.413420,-0.404481,-0.220529,-0.218976,2.420216,-0.405662,-0.407360
159275,-0.728649,False,True,False,False,-0.223958,-0.410277,-0.413420,-0.404481,-0.220529,-0.218976,2.420216,-0.405662,-0.407360
187950,0.004517,False,False,True,False,-0.223958,-0.410277,-0.413420,-0.404481,-0.220529,-0.218976,-0.413186,2.465107,-0.407360
203614,-0.661998,False,True,False,False,-0.223958,-0.410277,-0.413420,-0.404481,-0.220529,-0.218976,-0.413186,2.465107,-0.407360


## 2. Use LazyPredict

In [20]:
from notebooks.helpers.LazyPredict import get_lazy_regressor

#reg = get_lazy_regressor()
#models, predictions = reg.fit(X_train, X_test, y_train, y_test)
#models

The best model is **GradientBoostingRegressor** with **R2 score of 0.50**.

## 3. Hyperparameter tuning

In [21]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor

params = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 1]
}

gbr = GradientBoostingRegressor()
grid_search = GridSearchCV(gbr, params, cv=5, verbose=1, n_jobs=-1)
grid_search.fit(X=X_train, y=y_train)
grid_search.best_params_


Fitting 5 folds for each of 27 candidates, totalling 135 fits


{'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 100}

In [22]:
grid_search.best_params_

{'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 100}

In [23]:
# train the model with best hyperparameters
gbr = GradientBoostingRegressor(**grid_search.best_params_)
gbr.fit(X=X_train, y=y_train)
y_pred = gbr.predict(X=X_test)

## 4. Evaluate the model

In [24]:
# Evaluate the model
from sklearn.metrics import r2_score, root_mean_squared_error

print(f'R2 score: {r2_score(y_test, y_pred)}')
print(f'RMSE: {root_mean_squared_error(y_test, y_pred)}')

R2 score: 0.5389216762156354
RMSE: 2.0421612129032347


# Prepare test results

In [29]:
## Load the test data
test_data = pd.read_csv(test_file, index_col=0)
test_data.head()

Unnamed: 0_level_0,p_num,time,bg-5:55,bg-5:50,bg-5:45,bg-5:40,bg-5:35,bg-5:30,bg-5:25,bg-5:20,...,activity-0:45,activity-0:40,activity-0:35,activity-0:30,activity-0:25,activity-0:20,activity-0:15,activity-0:10,activity-0:05,activity-0:00
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
p01_8459,p01,06:45:00,,9.2,,,10.2,,,10.3,...,,,,,,,,,,
p01_8460,p01,11:25:00,,,9.9,,,9.4,,,...,,,,,,,,Walk,Walk,Walk
p01_8461,p01,14:45:00,,5.5,,,5.5,,,5.2,...,,,,,,,,,,
p01_8462,p01,04:30:00,,3.4,,,3.9,,,4.7,...,,,,,,,,,,
p01_8463,p01,04:20:00,,,8.3,,,10.0,,,...,,,,,,,,,,


In [30]:
test_data = test_data[['p_num', 'time', 'bg-0:00']]
test_data = day_phase_transformer.transform(test_data)
test_data.head()

Unnamed: 0_level_0,p_num,day_phase,bg-0:00
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
p01_8459,p01,morning,9.6
p01_8460,p01,noon,4.6
p01_8461,p01,afternoon,8.0
p01_8462,p01,night,9.9
p01_8463,p01,night,5.3


## Interpolate missing values in bg column and fill with mean

In [31]:
test_data.isna().sum()

p_num          0
day_phase      0
bg-0:00      132
dtype: int64

In [32]:
test_data['bg-0:00'] = test_data['bg-0:00'].fillna(test_data['bg-0:00'].median())

In [33]:
# Predict the bg+1:00 values
test_data.rename(columns={'bg-0:00': 'bg'}, inplace=True)

# encode day_phase
test_data = pd.get_dummies(test_data, columns=['day_phase'], drop_first=True)

# encode p_num
transformed = one_hot_encoder.transform(test_data[['p_num']])
df_transformed = pd.DataFrame(transformed, columns=one_hot_encoder.get_feature_names_out(['p_num']))
df_transformed = df_transformed.set_index(test_data.index)
test_data = test_data.merge(df_transformed, left_index=True, right_index=True)
test_data = test_data.drop(columns=['p_num'])

test_data['bg+1:00'] = gbr.predict(test_data)
test_data.head()

Unnamed: 0_level_0,bg,day_phase_evening,day_phase_morning,day_phase_night,day_phase_noon,p_num_p01,p_num_p02,p_num_p03,p_num_p04,p_num_p05,p_num_p06,p_num_p10,p_num_p11,p_num_p12,bg+1:00
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
p01_8459,9.6,False,True,False,False,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.8
p01_8460,4.6,False,False,False,True,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.17
p01_8461,8.0,False,False,False,False,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21.5
p01_8462,9.9,False,False,True,False,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23.69
p01_8463,5.3,False,False,True,False,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.05


## Prepare the submission file

In [34]:
submission = pd.DataFrame(test_data['bg+1:00'])
submission

Unnamed: 0_level_0,bg+1:00
id,Unnamed: 1_level_1
p01_8459,20.80
p01_8460,18.17
p01_8461,21.50
p01_8462,23.69
p01_8463,20.05
...,...
p24_256,22.08
p24_257,21.62
p24_258,23.69
p24_259,20.80


In [35]:
submission.to_csv(os.path.join('..', '..', 'data', 'processed', 'modelling_II_submission.csv'))