# Univariate Modelling

In [1]:
import os
import pandas as pd

# Locations of the interim training data and the raw test set, built relative
# to the current working directory.
data_root = os.path.join('..', '..', 'data')
data_file = os.path.join(data_root, 'interim', 'all_train.csv')
test_file = os.path.join(data_root, 'raw', 'test.csv')

# Load the training data and preview the first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column holding timestamps, which
# suggests the CSV was written with its index — confirm whether index_col=0 is wanted.
df = pd.read_csv(data_file)
df.head()

Unnamed: 0.1,Unnamed: 0,p_num,days_since_start,time,initial_resolution,bg,insulin,carbs,hr,steps,cals,activity,bg+1:00
0,2020-01-01 00:15:00,p01,0,00:15:00,15min,,0.0083,,,,,,
1,2020-01-01 00:20:00,p01,0,00:20:00,15min,,0.0083,,,,,,
2,2020-01-01 00:25:00,p01,0,00:25:00,15min,9.6,0.0083,,,,,,
3,2020-01-01 00:30:00,p01,0,00:30:00,15min,,0.0083,,,,,,
4,2020-01-01 00:35:00,p01,0,00:35:00,15min,,0.0083,,,,,,


# Data Preprocessing

## 1. Select only the bg (feature) and bg+1:00 (target) columns from the dataframe

In [2]:
# Keep only the bg feature and the bg+1:00 target.
# .copy() turns the selection into an independent frame, so assigning to its
# columns in later cells does not raise pandas' SettingWithCopyWarning.
df = df[['bg', 'bg+1:00']].copy()
df.head()

Unnamed: 0,bg,bg+1:00
0,,
1,,
2,9.6,
3,,
4,,


# Clean Data

## Interpolate missing values in the bg column and drop rows that still have missing values (i.e. a missing bg+1:00 target)

In [3]:
# Fill gaps in bg: linear interpolation first, then forward/backward fill for
# any NaNs that interpolation leaves behind.
# NOTE(review): p_num was dropped in the previous step, so the fill runs over the
# whole column and can bridge rows of different patients — confirm this is acceptable.
bg_filled = df['bg'].interpolate(method='linear')
bg_filled = bg_filled.ffill().bfill()
df['bg'] = bg_filled

# Remove rows that still contain a NaN, then report per-column non-null counts.
df = df.dropna()
df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 177024 entries, 71 to 235126
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   bg       177024 non-null  float64
 1   bg+1:00  177024 non-null  float64
dtypes: float64(2)
memory usage: 4.1 MB


# Model selection

## 1. Split the data into train and test sets

In [4]:
from sklearn.model_selection import train_test_split

# Single-feature design matrix (kept two-dimensional) and the target series.
X = df[['bg']]
y = df['bg+1:00']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
# NOTE(review): train_test_split shuffles rows by default, so neighbouring readings
# can land on both sides of the split — confirm this is intended for this data.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

## 2. Use LazyPredict

In [5]:
from notebooks.helpers.LazyPredict import get_lazy_regressor

# Build the regressor object provided by the project helper module.
reg = get_lazy_regressor()
# Fit on the train split and score on the test split; returns a score table and
# predictions. Presumably this trains every candidate model (the progress bar in
# the output counts 39 iterations) — verify against the helper.
models, predictions = reg.fit(X_train, X_test, y_train, y_test)
# Display the score table.
models

 97%|█████████▋| 38/39 [12:57<00:42, 42.76s/it]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000794 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 141619, number of used features: 1
[LightGBM] [Info] Start training from score 8.276012


100%|██████████| 39/39 [12:57<00:00, 19.95s/it]


Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GradientBoostingRegressor,0.5,0.5,2.14,2.93
LGBMRegressor,0.5,0.5,2.14,0.72
MLPRegressor,0.5,0.5,2.14,1.84
XGBRegressor,0.49,0.49,2.14,0.38
HistGradientBoostingRegressor,0.49,0.49,2.14,0.44
BaggingRegressor,0.49,0.49,2.14,0.3
ExtraTreesRegressor,0.49,0.49,2.15,2.09
ExtraTreeRegressor,0.49,0.49,2.15,0.03
DecisionTreeRegressor,0.49,0.49,2.15,0.05
SVR,0.49,0.49,2.16,432.54


The best model is **GradientBoostingRegressor** with **R2 score of 0.50**.

## 3. Hyperparameter tuning

In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor

# Search space: 3 x 3 x 3 = 27 hyperparameter combinations.
params = dict(
    n_estimators=[50, 100, 150],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 1],
)

# Exhaustive 5-fold cross-validated search over the grid; n_jobs=-1 uses all cores.
gbr = GradientBoostingRegressor()
grid_search = GridSearchCV(estimator=gbr, param_grid=params, cv=5, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)
grid_search.best_params_


Fitting 5 folds for each of 27 candidates, totalling 135 fits


{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150}

In [7]:
# Display the best hyperparameter combination found by the grid search
# (same value as the output of the previous cell).
grid_search.best_params_

{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150}

The best hyperparameters are:

- n_estimators = 150
- max_depth = 3
- learning_rate = 0.1

In [8]:
# Train the final model with the best hyperparameters found by the grid search.
# The values are read from grid_search.best_params_ instead of being re-typed by
# hand, so this cell stays in sync if the grid or the data changes.
gbr = GradientBoostingRegressor(**grid_search.best_params_)
gbr.fit(X_train, y_train)

# Predictions on the held-out split, used for evaluation below.
y_pred = gbr.predict(X_test)

## 4. Evaluate the model

In [9]:
# Evaluate the model
from sklearn.metrics import r2_score

# R-squared of the tuned model on the held-out split.
r2_score(y_true=y_test, y_pred=y_pred)

0.49551178582811184

# Prepare test results

In [10]:
# Load the test data, using the first CSV column as the row index.
test_data = pd.read_csv(filepath_or_buffer=test_file, index_col=0)
test_data.head(n=5)

Unnamed: 0_level_0,p_num,time,bg-5:55,bg-5:50,bg-5:45,bg-5:40,bg-5:35,bg-5:30,bg-5:25,bg-5:20,...,activity-0:45,activity-0:40,activity-0:35,activity-0:30,activity-0:25,activity-0:20,activity-0:15,activity-0:10,activity-0:05,activity-0:00
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
p01_8459,p01,06:45:00,,9.2,,,10.2,,,10.3,...,,,,,,,,,,
p01_8460,p01,11:25:00,,,9.9,,,9.4,,,...,,,,,,,,Walk,Walk,Walk
p01_8461,p01,14:45:00,,5.5,,,5.5,,,5.2,...,,,,,,,,,,
p01_8462,p01,04:30:00,,3.4,,,3.9,,,4.7,...,,,,,,,,,,
p01_8463,p01,04:20:00,,,8.3,,,10.0,,,...,,,,,,,,,,


In [11]:
# Keep only the bg-0:00 column as the model input.
# .copy() detaches the selection from the full test frame, so the column
# assignments and in-place rename in the following cells do not raise
# pandas' SettingWithCopyWarning.
test_data = test_data[['bg-0:00']].copy()
# Count the missing values that must be filled before prediction.
test_data.isna().sum()

bg-0:00    132
dtype: int64

## Fill missing values in the bg-0:00 column with the column mean

In [12]:
# Replace missing bg-0:00 values with the mean of the observed values in that column.
bg_mean = test_data['bg-0:00'].mean()
test_data['bg-0:00'] = test_data['bg-0:00'].fillna(bg_mean)

In [13]:
# Rename the input column to 'bg' so it matches the feature name the model was trained on.
test_data.rename(columns={'bg-0:00': 'bg'}, inplace=True)

# Predict bg+1:00 from the single-feature frame and store it next to the input.
test_features = test_data[['bg']]
test_data['bg+1:00'] = gbr.predict(test_features)
test_data.head()

Unnamed: 0_level_0,bg,bg+1:00
id,Unnamed: 1_level_1,Unnamed: 2_level_1
p01_8459,9.6,9.04
p01_8460,4.6,6.17
p01_8461,8.0,7.83
p01_8462,9.9,9.35
p01_8463,5.3,6.34


## Prepare the submission file

In [14]:
# One-column frame holding the predictions, indexed like test_data.
submission = pd.DataFrame({'bg+1:00': test_data['bg+1:00']})
submission

Unnamed: 0_level_0,bg+1:00
id,Unnamed: 1_level_1
p01_8459,9.04
p01_8460,6.17
p01_8461,7.83
p01_8462,9.35
p01_8463,6.34
...,...
p24_256,6.68
p24_257,9.75
p24_258,7.08
p24_259,8.39


In [15]:
# Write the predictions to the processed-data folder. The folder is created first
# so that a checkout without data/processed does not fail when the file is opened.
submission_dir = os.path.join('..', '..', 'data', 'processed')
os.makedirs(submission_dir, exist_ok=True)
submission.to_csv(os.path.join(submission_dir, 'univariate_submission.csv'))