# Preprocessing

## Check which patients have 5-minute and 15-minute blood glucose (bg) measurements

In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# Both CSVs live under the same relative data directory.
data_dir = os.path.join('..', '..', '..', '..', 'data')
train_data_file = os.path.join(data_dir, 'raw', 'train.csv')
extra_train_data_file = os.path.join(data_dir, 'interim', 'all_test_2h.csv')

# The raw training file is read with its first column as the index; the
# interim file is read with a default index.
train_data = pd.read_csv(train_data_file, index_col=0, low_memory=False)
extra_train_data = pd.read_csv(extra_train_data_file, low_memory=False)

# Stack the two frames row-wise and preview the combined result.
all_train_data = pd.concat([train_data, extra_train_data], axis=0)
all_train_data.head()

Unnamed: 0,p_num,time,bg-5:55,bg-5:50,bg-5:45,bg-5:40,bg-5:35,bg-5:30,bg-5:25,bg-5:20,...,activity-0:40,activity-0:35,activity-0:30,activity-0:25,activity-0:20,activity-0:15,activity-0:10,activity-0:05,activity-0:00,bg+1:00
p01_0,p01,06:10:00,,,9.6,,,9.7,,,...,,,,,,,,,,13.4
p01_1,p01,06:25:00,,,9.7,,,9.2,,,...,,,,,,,,,,12.8
p01_2,p01,06:40:00,,,9.2,,,8.7,,,...,,,,,,,,,,15.5
p01_3,p01,06:55:00,,,8.7,,,8.4,,,...,,,,,,,,,,14.8
p01_4,p01,07:10:00,,,8.4,,,8.1,,,...,,,,,,,,,,12.7


## Lazy Predict for all patients

In [3]:
from pipelines import preprocessing_pipeline, standardization_pipeline
from sklearn.model_selection import train_test_split
from notebooks.helpers.LazyPredict import get_lazy_regressor


def lazy_predict(patient_id, all_data):
    """Benchmark the LazyPredict regressors on a single patient's rows.

    The rows of ``all_data`` whose ``'p_num'`` equals ``patient_id`` are run
    through ``preprocessing_pipeline``, the ``'bg+1:00'`` target is
    log1p-transformed, the data is split 80/20 with a fixed seed, the features
    are standardized (fit on the training split only), and the regressors
    returned by ``get_lazy_regressor`` (with ``'SVR'`` excluded) are fitted.
    The resulting comparison table is rendered with ``display``.

    Parameters
    ----------
    patient_id
        Value matched against the ``'p_num'`` column.
    all_data : pandas.DataFrame
        Frame containing a ``'p_num'`` column; after preprocessing it must
        expose a ``'bg+1:00'`` column.

    Returns
    -------
    None
        The model table is displayed, not returned.

    Raises
    ------
    ValueError
        If ``all_data`` contains no rows for ``patient_id``.
    """
    # Build the mask from the frame that was passed in. The previous version
    # used the notebook-level ``all_train_data`` here, which silently tied the
    # function to that global and broke it for any other input frame.
    patient_data = all_data.loc[all_data['p_num'] == patient_id]
    if patient_data.empty:
        raise ValueError(f"No rows found for patient {patient_id!r}")
    patient_data = patient_data.drop(columns=['p_num'])

    # transform data
    # NOTE(review): the preprocessing pipeline is fitted on the patient's full
    # data before the train/test split; if it learns statistics from the data
    # this leaks test information -- confirm against its definition.
    patient_data = preprocessing_pipeline.fit_transform(patient_data)
    X = patient_data.drop(columns=['bg+1:00'])
    # The models are trained on log1p of the target, so the reported metrics
    # are on that scale.
    y = np.log1p(patient_data['bg+1:00'])

    # train test split
    # NOTE(review): this is a shuffled random split; the frame has a 'time'
    # column and lagged features, so confirm a random split is intended.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # standardize data (statistics come from the training split only)
    X_train = standardization_pipeline.fit_transform(X_train)
    X_test = standardization_pipeline.transform(X_test)

    lazy_regressor = get_lazy_regressor(exclude=['SVR'], predictions=True)
    # The per-model predictions are not used here; only the score table is shown.
    models, _ = lazy_regressor.fit(X_train, X_test, y_train, y_test)
    display(models)

In [4]:
# List the distinct patient identifiers present in the combined data.
all_train_data['p_num'].unique()

array(['p01', 'p02', 'p03', 'p04', 'p05', 'p06', 'p10', 'p11', 'p12',
       'p15', 'p16', 'p18', 'p19', 'p21', 'p22', 'p24'], dtype=object)

In [5]:
# Fit and display the regressor comparison for patient p01.
lazy_predict('p01', all_train_data)

 97%|█████████▋| 37/38 [01:20<00:00,  1.44it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011296 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17370
[LightGBM] [Info] Number of data points in the train set: 13913, number of used features: 80
[LightGBM] [Info] Start training from score 2.206515


100%|██████████| 38/38 [01:21<00:00,  2.15s/it]


Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ExtraTreesRegressor,0.62,0.63,0.24,20.36
HistGradientBoostingRegressor,0.57,0.58,0.26,6.14
XGBRegressor,0.56,0.57,0.26,1.63
LGBMRegressor,0.56,0.57,0.26,1.35
BaggingRegressor,0.55,0.56,0.27,4.38
GradientBoostingRegressor,0.5,0.52,0.28,13.71
TransformedTargetRegressor,0.5,0.51,0.28,0.15
Ridge,0.5,0.51,0.28,0.08
LassoLarsIC,0.5,0.51,0.28,0.18
BayesianRidge,0.5,0.51,0.28,0.08


In [6]:
# Fit and display the regressor comparison for patient p02.
lazy_predict('p02', all_train_data)

 97%|█████████▋| 37/38 [03:02<00:01,  1.11s/it]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002490 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 18525
[LightGBM] [Info] Number of data points in the train set: 27295, number of used features: 80
[LightGBM] [Info] Start training from score 2.303255


100%|██████████| 38/38 [03:03<00:00,  4.83s/it]


Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ExtraTreesRegressor,0.68,0.68,0.16,58.02
BaggingRegressor,0.58,0.59,0.18,11.88
XGBRegressor,0.56,0.57,0.19,1.34
HistGradientBoostingRegressor,0.55,0.56,0.19,10.53
LGBMRegressor,0.55,0.55,0.19,0.89
GradientBoostingRegressor,0.49,0.49,0.2,27.92
ElasticNetCV,0.41,0.42,0.22,3.75
LassoCV,0.41,0.42,0.22,2.56
LassoLarsCV,0.41,0.42,0.22,0.5
LassoLarsIC,0.41,0.42,0.22,0.28


In [7]:
# Fit and display the regressor comparison for patient p03.
lazy_predict('p03', all_train_data)

 97%|█████████▋| 37/38 [02:08<00:00,  1.11it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007985 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17652
[LightGBM] [Info] Number of data points in the train set: 20822, number of used features: 80
[LightGBM] [Info] Start training from score 2.210471


100%|██████████| 38/38 [02:09<00:00,  3.42s/it]


Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ExtraTreesRegressor,0.73,0.73,0.16,34.96
BaggingRegressor,0.63,0.64,0.19,10.12
XGBRegressor,0.61,0.62,0.19,1.15
LGBMRegressor,0.6,0.6,0.2,1.34
HistGradientBoostingRegressor,0.6,0.6,0.2,1.89
GradientBoostingRegressor,0.55,0.56,0.21,25.72
LassoCV,0.49,0.5,0.22,2.16
ElasticNetCV,0.49,0.5,0.22,2.74
KNeighborsRegressor,0.49,0.5,0.22,0.38
LassoLarsCV,0.49,0.5,0.22,0.33


In [8]:
# Fit and display the regressor comparison for patient p04.
lazy_predict('p04', all_train_data)

 97%|█████████▋| 37/38 [03:44<00:01,  1.31s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.027746 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17904
[LightGBM] [Info] Number of data points in the train set: 27259, number of used features: 80
[LightGBM] [Info] Start training from score 2.141785


100%|██████████| 38/38 [03:46<00:00,  5.97s/it]


Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ExtraTreesRegressor,0.53,0.54,0.18,60.15
BaggingRegressor,0.38,0.39,0.2,15.61
XGBRegressor,0.36,0.37,0.21,1.6
HistGradientBoostingRegressor,0.35,0.36,0.21,10.05
LGBMRegressor,0.35,0.36,0.21,2.53
GradientBoostingRegressor,0.28,0.29,0.22,35.48
KNeighborsRegressor,0.27,0.28,0.22,0.72
LassoLarsIC,0.19,0.2,0.23,0.4
LassoCV,0.19,0.2,0.23,2.6
LassoLarsCV,0.19,0.2,0.23,0.6


In [9]:
# Fit and display the regressor comparison for patient p05.
lazy_predict('p05', all_train_data)

 97%|█████████▋| 37/38 [01:51<00:00,  1.36it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006785 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17728
[LightGBM] [Info] Number of data points in the train set: 14783, number of used features: 80
[LightGBM] [Info] Start training from score 2.177004


100%|██████████| 38/38 [01:53<00:00,  2.98s/it]


Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ExtraTreesRegressor,0.72,0.72,0.18,30.87
BaggingRegressor,0.66,0.66,0.19,9.16
XGBRegressor,0.65,0.66,0.2,1.34
LGBMRegressor,0.64,0.65,0.2,2.01
HistGradientBoostingRegressor,0.64,0.65,0.2,9.14
GradientBoostingRegressor,0.59,0.59,0.21,20.41
LassoLarsIC,0.56,0.57,0.22,0.17
TransformedTargetRegressor,0.56,0.57,0.22,0.14
LassoLarsCV,0.56,0.57,0.22,0.36
RidgeCV,0.56,0.57,0.22,0.22


In [10]:
# Fit and display the regressor comparison for patient p06.
lazy_predict('p06', all_train_data)

 97%|█████████▋| 37/38 [01:28<00:00,  1.31it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008822 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16132
[LightGBM] [Info] Number of data points in the train set: 13631, number of used features: 80
[LightGBM] [Info] Start training from score 2.250886


100%|██████████| 38/38 [01:30<00:00,  2.39s/it]


Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ExtraTreesRegressor,0.75,0.76,0.18,25.72
BaggingRegressor,0.7,0.71,0.2,6.55
LGBMRegressor,0.69,0.7,0.2,2.11
HistGradientBoostingRegressor,0.69,0.7,0.2,8.29
XGBRegressor,0.69,0.69,0.2,1.94
GradientBoostingRegressor,0.64,0.65,0.22,14.31
LassoCV,0.58,0.59,0.24,2.01
ElasticNetCV,0.58,0.59,0.24,1.76
BayesianRidge,0.58,0.59,0.24,0.2
LassoLarsCV,0.58,0.59,0.24,0.33


In [11]:
# Fit and display the regressor comparison for patient p10.
lazy_predict('p10', all_train_data)

 97%|█████████▋| 37/38 [03:08<00:01,  1.17s/it]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003191 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 16799
[LightGBM] [Info] Number of data points in the train set: 25651, number of used features: 80
[LightGBM] [Info] Start training from score 2.000210


100%|██████████| 38/38 [03:11<00:00,  5.03s/it]


Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ExtraTreesRegressor,0.56,0.57,0.14,55.94
XGBRegressor,0.4,0.41,0.16,1.61
BaggingRegressor,0.39,0.39,0.17,15.82
HistGradientBoostingRegressor,0.36,0.37,0.17,8.73
LGBMRegressor,0.36,0.37,0.17,2.32
KNeighborsRegressor,0.31,0.31,0.18,0.7
GradientBoostingRegressor,0.27,0.28,0.18,29.64
LassoLarsIC,0.21,0.22,0.19,0.25
LassoLarsCV,0.21,0.22,0.19,0.47
LassoCV,0.21,0.22,0.19,1.57


In [12]:
# Fit and display the regressor comparison for patient p11.
lazy_predict('p11', all_train_data)

 97%|█████████▋| 37/38 [02:55<00:01,  1.06s/it]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005415 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 18408
[LightGBM] [Info] Number of data points in the train set: 26113, number of used features: 80
[LightGBM] [Info] Start training from score 2.285110


100%|██████████| 38/38 [02:57<00:00,  4.68s/it]


Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ExtraTreesRegressor,0.73,0.73,0.15,50.36
BaggingRegressor,0.63,0.63,0.17,12.18
XGBRegressor,0.61,0.61,0.18,1.22
LGBMRegressor,0.58,0.59,0.18,2.01
HistGradientBoostingRegressor,0.58,0.58,0.18,11.62
KNeighborsRegressor,0.5,0.5,0.2,0.57
GradientBoostingRegressor,0.5,0.5,0.2,24.06
LassoCV,0.45,0.46,0.21,2.6
LassoLarsCV,0.45,0.46,0.21,0.43
LassoLarsIC,0.45,0.46,0.21,0.26


In [13]:
# Fit and display the regressor comparison for patient p12.
lazy_predict('p12', all_train_data)

 97%|█████████▋| 37/38 [03:03<00:01,  1.10s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010148 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17521
[LightGBM] [Info] Number of data points in the train set: 28722, number of used features: 80
[LightGBM] [Info] Start training from score 2.158505


100%|██████████| 38/38 [03:05<00:00,  4.87s/it]


Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ExtraTreesRegressor,0.78,0.78,0.14,54.95
BaggingRegressor,0.71,0.71,0.16,14.37
XGBRegressor,0.69,0.69,0.17,1.03
HistGradientBoostingRegressor,0.68,0.69,0.17,4.65
LGBMRegressor,0.68,0.69,0.17,2.01
GradientBoostingRegressor,0.64,0.65,0.18,26.93
AdaBoostRegressor,0.6,0.61,0.19,12.99
LassoLarsCV,0.6,0.61,0.19,0.32
LassoLarsIC,0.6,0.61,0.19,0.19
LassoCV,0.6,0.6,0.19,2.29


In [None]:
# Fit and display the regressor comparison for patient p15.
# NOTE(review): the saved output stops at 6/38 models and the cell has no
# execution count -- rerun to obtain the result table.
lazy_predict('p15', all_train_data)

 16%|█▌        | 6/38 [00:07<00:22,  1.40it/s]

In [None]:
# Fit and display the regressor comparison for patient p16.
# NOTE(review): no saved output for this cell -- rerun to populate.
lazy_predict('p16', all_train_data)

In [None]:
# Fit and display the regressor comparison for patient p18.
# NOTE(review): no saved output for this cell -- rerun to populate.
lazy_predict('p18', all_train_data)

In [None]:
# Fit and display the regressor comparison for patient p19.
# NOTE(review): no saved output for this cell -- rerun to populate.
lazy_predict('p19', all_train_data)

In [None]:
# Fit and display the regressor comparison for patient p21.
# NOTE(review): no saved output for this cell -- rerun to populate.
lazy_predict('p21', all_train_data)

In [None]:
# Fit and display the regressor comparison for patient p22.
# NOTE(review): no saved output for this cell -- rerun to populate.
lazy_predict('p22', all_train_data)

In [None]:
# Fit and display the regressor comparison for patient p24.
# NOTE(review): no saved output for this cell -- rerun to populate.
lazy_predict('p24', all_train_data)