# Modelling Selected Parameters at 0:00 + Day Phase

## Model Selection

In [1]:
# Importing the libraries
import os
import pandas as pd

## Data Preprocessing

In [2]:
# Read the raw training CSV and run it through the project's preprocessing pipeline.
from pipelines import preprocessing_pipeline

# The path is relative to the current working directory: four levels up, then data/raw/train.csv.
data_file = os.path.join(*(['..'] * 4), 'data', 'raw', 'train.csv')

# The first CSV column becomes the index; low_memory=False makes pandas parse the
# file in one pass instead of in chunks.
train_data = preprocessing_pipeline.fit_transform(
    pd.read_csv(data_file, index_col=0, low_memory=False)
)

# Preview the first preprocessed rows.
train_data.head()

Unnamed: 0_level_0,day_phase,bg-3:00,bg-2:45,bg-2:30,bg-2:15,bg-2:00,bg-1:45,bg-1:30,bg-1:15,bg-1:00,...,cals-2:00,cals-1:45,cals-1:30,cals-1:15,cals-1:00,cals-0:45,cals-0:30,cals-0:15,cals-0:00,bg+1:00
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
p01_0,morning,13.9,14.2,14.2,15.4,17.2,18.2,18.4,18.0,17.3,...,0.96,0.96,0.96,0.96,0.96,0.96,0.96,0.96,4.8,13.4
p01_1,morning,14.2,14.2,15.4,17.2,18.2,18.4,18.0,17.3,17.5,...,0.96,0.96,0.96,0.96,0.96,0.96,0.96,0.96,4.8,12.8
p01_2,morning,14.2,15.4,17.2,18.2,18.4,18.0,17.3,17.5,17.3,...,0.96,0.96,0.96,0.96,0.96,0.96,0.96,0.96,4.8,15.5
p01_3,morning,15.4,17.2,18.2,18.4,18.0,17.3,17.5,17.3,16.2,...,0.96,0.96,0.96,0.96,0.96,0.96,0.96,0.96,4.8,14.8
p01_4,morning,17.2,18.2,18.4,18.0,17.3,17.5,17.3,16.2,15.1,...,0.96,0.96,0.96,0.96,0.96,0.96,0.96,0.96,4.8,12.7


## Data Splitting

In [3]:
from sklearn.model_selection import train_test_split

# The target is the 'bg+1:00' column; every other column is a feature.
y = train_data['bg+1:00']
X = train_data.drop(columns=['bg+1:00'])

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
# NOTE(review): the train_data preview shows consecutive rows with the same id prefix
# holding shifted copies of the same readings. A purely random split may therefore put
# near-duplicate windows on both sides of the split -- consider a grouped or
# time-ordered split. TODO confirm.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

## Standardization

In [4]:
from pipelines import standardization_pipeline

# Fit the standardization pipeline on the training features only, then apply the
# already-fitted pipeline to the test features, so the test set is not used for fitting.
# NOTE(review): X_train / X_test are rebound to their transformed versions here, so
# re-running this cell without first re-running the split cell would feed
# already-transformed data back into the pipeline.
X_train = standardization_pipeline.fit_transform(X_train)
X_test = standardization_pipeline.transform(X_test)

# Preview the transformed training features.
X_train.head()

Unnamed: 0_level_0,bg-3:00,bg-2:45,bg-2:30,bg-2:15,bg-2:00,bg-1:45,bg-1:30,bg-1:15,bg-1:00,bg-0:45,...,cals-1:15,cals-1:00,cals-0:45,cals-0:30,cals-0:15,cals-0:00,day_phase_evening,day_phase_morning,day_phase_night,day_phase_noon
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
p12_24040,-0.826049,-0.826581,-0.961215,-0.961402,-0.927655,-0.76028,-0.592867,-0.526568,-0.593126,-0.659985,...,-0.207894,-0.203293,-0.209774,-0.201676,-0.221298,-0.201556,False,False,False,True
p12_13267,-0.158401,-0.258984,-0.192796,-0.259887,-0.359878,-0.326209,-0.259266,-0.159615,-0.159478,-0.259754,...,-0.207894,-0.203293,-0.209774,-0.201676,-0.221298,-0.201556,False,True,False,False
p03_131,1.911308,1.978019,1.845186,2.045092,2.178421,2.244831,2.242744,2.142186,2.175554,2.108275,...,-0.857708,-0.853256,-0.857643,-0.850384,-0.858737,-0.852188,False,False,False,False
p12_20861,0.742924,0.809435,1.243814,1.276765,1.410251,0.508544,-0.426067,-0.159615,-0.426338,-0.693337,...,-0.207894,0.402116,2.159652,-0.201676,0.963619,0.652399,False,False,False,False
p12_24627,-1.360167,-1.460955,-0.994624,-0.894591,-0.627067,-0.359599,-0.159186,-0.359771,-0.526411,-0.493222,...,0.183829,0.54364,-0.033439,0.011508,0.372446,0.066043,False,False,False,False


## Model Selection with LazyPredict

In [5]:
from notebooks.helpers.LazyPredict import get_lazy_regressor

# Build the project's lazy regressor, asking the helper to exclude 'SVR'.
# (presumably a wrapper around LazyPredict's LazyRegressor -- verify against the helper)
reg = get_lazy_regressor(exclude=['SVR'])

# Fit on the training split and evaluate on the held-out split. The call returns two
# objects: `models` (the score table displayed below) and `predictions`.
fit_output = reg.fit(X_train, X_test, y_train, y_test)
models, predictions = fit_output

models

 97%|█████████▋| 37/38 [12:49<00:07,  7.83s/it] 

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.023193 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13215
[LightGBM] [Info] Number of data points in the train set: 141619, number of used features: 52
[LightGBM] [Info] Start training from score 8.276012


100%|██████████| 38/38 [12:52<00:00, 20.33s/it]


Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ExtraTreesRegressor,0.68,0.68,1.71,127.73
BaggingRegressor,0.63,0.63,1.82,46.06
XGBRegressor,0.62,0.62,1.86,2.3
KNeighborsRegressor,0.58,0.58,1.94,3.73
HistGradientBoostingRegressor,0.58,0.58,1.95,2.14
LGBMRegressor,0.58,0.58,1.95,3.08
MLPRegressor,0.55,0.55,2.02,41.73
GradientBoostingRegressor,0.54,0.54,2.04,84.56
LassoLarsCV,0.5,0.5,2.13,0.62
LassoLarsIC,0.5,0.5,2.13,0.33


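The best models by RMSE (lower is better) in the table above are:

* ExtraTreesRegressor (RMSE 1.71)
* BaggingRegressor (RMSE 1.82)
* XGBRegressor (RMSE 1.86)
* KNeighborsRegressor (RMSE 1.94)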

For further work I choose:

* KNeighborsRegressor
* BaggingRegressor
* XGBRegressor