# Modelling most important features

## Data Loading

In [1]:
import os
import pandas as pd

# All inputs sit under the shared data directory, reached through four parent
# directories relative to the working directory.
data_dir = os.path.join('..', '..', '..', '..', 'data')


def _load_indexed_csv(csv_path):
    """Read a CSV using its first column as the index, with low_memory disabled."""
    return pd.read_csv(csv_path, index_col=0, low_memory=False)


def _restrict_to_patients(frame, patients):
    """Return only the rows of `frame` whose 'p_num' value is one of `patients`."""
    return frame[frame['p_num'].isin(patients)]


# Locations of the four input files.
train_data_file = os.path.join(data_dir, 'raw', 'train.csv')
validation_data_file = os.path.join(data_dir, 'interim', 'all_test_4_55h.csv')
additional_train_data_file = os.path.join(data_dir, 'interim', 'all_test_3h.csv')
test_data_file = os.path.join(data_dir, 'raw', 'test.csv')

train_data = _load_indexed_csv(train_data_file)
validation_data = _load_indexed_csv(validation_data_file)
additional_train_data = _load_indexed_csv(additional_train_data_file)

# Drop from the additional data every row whose index label also occurs in the
# validation data, so those rows are not merged into the training frame.
in_validation = additional_train_data.index.isin(validation_data.index.unique())
additional_train_data = additional_train_data[~in_validation]

# Append the remaining additional rows below the raw training rows.
train_data = pd.concat([train_data, additional_train_data], axis=0)

test_data = _load_indexed_csv(test_data_file)

# Restrict every frame to the patients that appear in the test set, i.e. the
# only patients that have to be predicted.
unique_patients = test_data['p_num'].unique()
train_data = _restrict_to_patients(train_data, unique_patients)
validation_data = _restrict_to_patients(validation_data, unique_patients)
test_data = _restrict_to_patients(test_data, unique_patients)

## Preprocessing and standardization

In [2]:
from pipelines import pipeline

# Fit the preprocessing pipeline on the training frame and transform it in one step.
train_data_processed = pipeline.fit_transform(train_data)
# Validation data only goes through `transform`; it is not passed to any fit call here.
# NOTE(review): assumes `pipeline` follows scikit-learn fit/transform semantics,
# so this call must stay after the fit above — confirm in the `pipelines` module.
validation_data_processed = pipeline.transform(validation_data)

## Data Splitting

In [3]:
# Name of the column to predict; kept in one place instead of being repeated
# in every drop/select expression.
TARGET_COLUMN = 'bg+1:00'


def split_features_target(frame, target=TARGET_COLUMN):
    """Split a processed frame into features and target.

    Args:
        frame: Processed data containing the `target` column.
            Assumed to be a pandas DataFrame — TODO confirm what `pipeline` returns.
        target: Label of the column to predict. Defaults to TARGET_COLUMN.

    Returns:
        A `(features, target_values)` pair: `frame` without the `target`
        column, and the `target` column itself.

    Raises:
        KeyError: If `target` is not a column of `frame`.
    """
    return frame.drop(columns=[target]), frame[target]


X_train, y_train = split_features_target(train_data_processed)

# NOTE: X_test / y_test are built from the validation frame, not from test.csv.
X_test, y_test = split_features_target(validation_data_processed)

## Model Selection with LazyPredict

In [4]:
from notebooks.helpers.LazyPredict import get_lazy_regressor

# Build the regressor-comparison helper, passing 'SVR' in `exclude`.
# NOTE(review): `get_lazy_regressor` is a project helper; presumably it wraps
# LazyPredict's LazyRegressor — confirm in notebooks/helpers/LazyPredict.
reg = get_lazy_regressor(exclude=['SVR'])
# X_test / y_test hold the validation split built from `validation_data_processed`.
# The recorded run of this call took about 1h15 for 38 estimators.
models, predictions = reg.fit(X_train, X_test, y_train, y_test)
# Last expression of the cell: display the per-model metrics table.
models

 97%|█████████▋| 37/38 [1:14:58<00:28, 28.29s/it]   

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.065935 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 47286
[LightGBM] [Info] Number of data points in the train set: 231694, number of used features: 222
[LightGBM] [Info] Start training from score 8.386158


100%|██████████| 38/38 [1:15:06<00:00, 118.60s/it]


Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ExtraTreesRegressor,0.68,0.69,1.84,844.07
XGBRegressor,0.63,0.64,1.98,7.28
BaggingRegressor,0.61,0.63,2.02,260.84
LGBMRegressor,0.61,0.63,2.02,8.14
HistGradientBoostingRegressor,0.6,0.62,2.04,13.9
MLPRegressor,0.59,0.61,2.07,109.9
GradientBoostingRegressor,0.56,0.58,2.15,486.1
LassoLarsIC,0.55,0.57,2.17,4.6
TransformedTargetRegressor,0.55,0.57,2.17,2.81
RidgeCV,0.55,0.57,2.17,5.22
