# Modelling most important features

## Model Selection

In [1]:
# Importing the libraries
import os
import pandas as pd

## Data Preprocessing

In [2]:
# Read the raw training set and run it through the project's preprocessing pipeline.
from pipelines import preprocessing_pipeline

# The raw CSV is addressed relative to the working directory: four levels up, then data/raw.
path_parts = ['..', '..', '..', '..', 'data', 'raw', 'train.csv']
data_file = os.path.join(*path_parts)

# The first CSV column becomes the index; low_memory=False disables pandas'
# chunked parsing so column dtypes are inferred from the whole file.
train_data = preprocessing_pipeline.fit_transform(
    pd.read_csv(data_file, index_col=0, low_memory=False)
)
train_data.head()

Unnamed: 0_level_0,day_phase,bg-1:00,bg-0:55,bg-0:50,bg-0:45,bg-0:40,bg-0:35,bg-0:30,bg-0:25,bg-0:20,...,hr-0:05,hr-0:00,cals-0:30,cals-0:25,cals-0:20,cals-0:15,cals-0:10,cals-0:05,cals-0:00,bg+1:00
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
p01_0,morning,17.3,17.366667,17.433333,17.5,17.433333,17.366667,17.3,16.933333,16.566667,...,78.2,78.2,6.07,6.07,6.07,6.07,6.07,6.07,6.07,13.4
p01_1,morning,17.5,17.433333,17.366667,17.3,16.933333,16.566667,16.2,15.833333,15.466667,...,78.2,78.2,6.07,6.07,6.07,6.07,6.07,6.07,6.07,12.8
p01_2,morning,17.3,16.933333,16.566667,16.2,15.833333,15.466667,15.1,14.866667,14.633333,...,78.2,78.2,6.07,6.07,6.07,6.07,6.07,6.07,6.07,15.5
p01_3,morning,16.2,15.833333,15.466667,15.1,14.866667,14.633333,14.4,14.233333,14.066667,...,78.2,78.2,6.07,6.07,6.07,6.07,6.07,6.07,6.07,14.8
p01_4,morning,15.1,14.866667,14.633333,14.4,14.233333,14.066667,13.9,13.866667,13.833333,...,78.2,78.2,6.07,6.07,6.07,6.07,6.07,6.07,6.07,12.7


## Data Splitting

In [3]:
from sklearn.model_selection import train_test_split

# 'bg+1:00' is the prediction target; every remaining column is used as a feature.
target_column = 'bg+1:00'
y = train_data[target_column]
X = train_data.drop(columns=[target_column])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
# NOTE(review): the head() output of the preprocessed frame suggests consecutive rows are
# overlapping windows of the same patient series, so a purely random split may place
# near-duplicate samples in both sets - consider a grouped or time-based split; confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Standardization

In [4]:
from pipelines import standardization_pipeline

# Fit the standardization pipeline on the training features only, then apply the
# already-fitted pipeline to the test features, so the test split does not
# take part in fitting.
# NOTE(review): this cell rebinds X_train / X_test to their transformed versions, so
# re-running it without first re-running the split cell feeds already-transformed
# data back into the pipeline.
X_train = standardization_pipeline.fit_transform(X_train)
X_test = standardization_pipeline.transform(X_test)

# Preview the transformed training features.
X_train.head()

Unnamed: 0_level_0,bg-1:00,bg-0:55,bg-0:50,bg-0:45,bg-0:40,bg-0:35,bg-0:30,bg-0:25,bg-0:20,bg-0:15,...,cals-0:20,cals-0:15,cals-0:10,cals-0:05,cals-0:00,day_phase_evening,day_phase_late_evening,day_phase_morning,day_phase_night,day_phase_noon
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
p12_24040,-0.592616,-0.593367,-0.626737,-0.659452,-0.69372,-0.727012,-0.759594,-0.793945,-0.793842,-0.826333,...,-0.22974,-0.229837,-0.230414,-0.230588,-0.229908,0,0,0,0,1
p12_13267,-0.15869,-0.192302,-0.225761,-0.25899,-0.326168,-0.292693,-0.258964,-0.292693,-0.292711,-0.359165,...,-0.22974,-0.229837,-0.230414,-0.0128,-0.229908,0,0,1,0,0
p03_131,2.177836,2.18067,2.079851,2.110407,2.146453,2.146177,2.177435,2.246983,2.212943,2.243625,...,-0.320614,-0.320707,-0.32107,-0.321445,-0.320999,0,0,0,0,0
p12_20861,-0.425721,-0.559945,-0.726981,-0.692824,-0.660306,-0.626784,-0.592717,-0.593444,-0.526572,-0.55938,...,1.94055,1.002261,2.727895,-0.211883,0.642154,0,0,0,0,0
p12_24627,-0.525858,-0.559945,-0.526493,-0.492593,-0.392995,-0.225874,-0.258964,-0.32611,-0.32612,-0.325796,...,-0.13753,0.387549,0.872119,0.639227,0.043365,0,0,0,0,0


## Model Selection with LazyPredict

In [5]:
from notebooks.helpers.LazyPredict import get_lazy_regressor

# Build the project's lazy regressor with 'SVR' left out of the sweep
# (presumably because it is too slow on this much data - TODO confirm).
excluded_models = ['SVR']
reg = get_lazy_regressor(exclude=excluded_models)

# fit() takes both splits and returns a pair, unpacked here into the score table
# (displayed below) and the predictions object.
fit_result = reg.fit(X_train, X_test, y_train, y_test)
models, predictions = fit_result
models

 97%|█████████▋| 37/38 [10:04<00:08,  8.20s/it]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004950 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8641
[LightGBM] [Info] Number of data points in the train set: 141619, number of used features: 39
[LightGBM] [Info] Start training from score 8.276012


100%|██████████| 38/38 [10:05<00:00, 15.94s/it]


Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ExtraTreesRegressor,0.63,0.63,1.83,89.91
XGBRegressor,0.6,0.6,1.9,0.63
BaggingRegressor,0.59,0.59,1.92,29.84
LGBMRegressor,0.58,0.58,1.95,1.11
HistGradientBoostingRegressor,0.58,0.58,1.95,2.12
MLPRegressor,0.57,0.57,1.98,29.59
GradientBoostingRegressor,0.54,0.54,2.03,53.38
KNeighborsRegressor,0.54,0.54,2.04,3.23
LassoLarsCV,0.52,0.52,2.09,0.46
LassoLarsIC,0.52,0.52,2.09,0.21


The three models with the lowest RMSE (and highest R-squared) on the held-out test split are:

* ExtraTreesRegressor
* XGBRegressor
* BaggingRegressor

For further work I choose these same three models:

* ExtraTreesRegressor
* XGBRegressor
* BaggingRegressor