# Modelling Selected Parameters at 0:00 + Day Phase

## Model Selection

In [1]:
# Importing the libraries
import os
import pandas as pd

## Data Preprocessing

In [2]:
# Read the raw training CSV and run it through the project's preprocessing pipeline.
from pipelines import preprocessing_pipeline

# The raw data directory sits four levels above the notebook's working directory.
data_file = os.path.join(*(['..'] * 4), 'data', 'raw', 'train.csv')

# The first CSV column becomes the index; low_memory=False parses the file in
# one pass so column dtypes are inferred consistently.
train_data = preprocessing_pipeline.fit_transform(
    pd.read_csv(data_file, index_col=0, low_memory=False)
)
train_data.head()

Unnamed: 0_level_0,day_phase,bg-2:00,bg-1:55,bg-1:50,bg-1:45,bg-1:40,bg-1:35,bg-1:30,bg-1:25,bg-1:20,...,bg-0:40,bg-0:35,bg-0:30,bg-0:25,bg-0:20,bg-0:15,bg-0:10,bg-0:05,bg-0:00,bg+1:00
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
p01_0,morning,17.2,17.533333,17.866667,18.2,18.266667,18.333333,18.4,18.266667,18.133333,...,17.433333,17.366667,17.3,16.933333,16.566667,16.2,15.833333,15.466667,15.1,13.4
p01_1,morning,18.2,18.266667,18.333333,18.4,18.266667,18.133333,18.0,17.766667,17.533333,...,16.933333,16.566667,16.2,15.833333,15.466667,15.1,14.866667,14.633333,14.4,12.8
p01_2,morning,18.4,18.266667,18.133333,18.0,17.766667,17.533333,17.3,17.366667,17.433333,...,15.833333,15.466667,15.1,14.866667,14.633333,14.4,14.233333,14.066667,13.9,15.5
p01_3,morning,18.0,17.766667,17.533333,17.3,17.366667,17.433333,17.5,17.433333,17.366667,...,14.866667,14.633333,14.4,14.233333,14.066667,13.9,13.866667,13.833333,13.8,14.8
p01_4,morning,17.3,17.366667,17.433333,17.5,17.433333,17.366667,17.3,16.933333,16.566667,...,14.233333,14.066667,13.9,13.866667,13.833333,13.8,13.666667,13.533333,13.4,12.7


## Data Splitting

In [3]:
from sklearn.model_selection import train_test_split

# 'bg+1:00' is the prediction target; every other column is a feature.
target_column = 'bg+1:00'
y = train_data[target_column]
X = train_data.drop(columns=[target_column])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
# NOTE(review): the preview of train_data suggests consecutive rows of one
# participant are overlapping windows (p01_1 repeats p01_0's values shifted by
# 15 minutes). A random row-wise split may therefore put near-duplicate windows
# on both sides and inflate the scores -- consider a grouped or time-ordered
# split; confirm against the data description.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Standardization

In [4]:
# Apply the project's standardization pipeline to the feature splits.
# Judging by the output below, it rescales the bg-* columns and expands
# day_phase into day_phase_* indicator columns.
from pipelines import standardization_pipeline

# Fit on the training split only, then reuse the fitted pipeline on the test
# split so the test rows do not influence the fitted transformation.
# NOTE: this cell overwrites X_train / X_test with their transformed versions;
# re-run the data-splitting cell before re-running this one, otherwise the
# pipeline is applied to its own output.
X_train = standardization_pipeline.fit_transform(X_train)
X_test = standardization_pipeline.transform(X_test)

X_train.head()

Unnamed: 0_level_0,bg-2:00,bg-1:55,bg-1:50,bg-1:45,bg-1:40,bg-1:35,bg-1:30,bg-1:25,bg-1:20,bg-1:15,...,bg-0:25,bg-0:20,bg-0:15,bg-0:10,bg-0:05,bg-0:00,day_phase_evening,day_phase_morning,day_phase_night,day_phase_noon
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
p12_24040,-0.927655,-0.861759,-0.827864,-0.76028,-0.727643,-0.626891,-0.592867,-0.593618,-0.526773,-0.526568,...,-0.794134,-0.793841,-0.826287,-0.860879,-0.86098,-0.826509,False,False,False,True
p12_13267,-0.359878,-0.360222,-0.359971,-0.326209,-0.293123,-0.259468,-0.259266,-0.192837,-0.192847,-0.159615,...,-0.293076,-0.292998,-0.359421,-0.293092,-0.35992,-0.459549,False,True,False,False
p03_131,2.178421,2.147466,2.213443,2.244831,2.280574,2.279087,2.242744,2.312048,2.244819,2.142186,...,2.245619,2.21122,2.241694,2.178451,1.978357,1.875652,False,False,False,False
p12_20861,1.410251,1.345006,1.07713,0.508544,0.208246,-0.12586,-0.426067,-0.460024,-0.293025,-0.159615,...,-0.593711,-0.526725,-0.559506,-0.660483,-0.827576,-0.993309,False,False,False,False
p12_24627,-0.627067,-0.393658,-0.32655,-0.359599,-0.326548,-0.259468,-0.159186,-0.259634,-0.293025,-0.359771,...,-0.32648,-0.326387,-0.326073,-0.293092,-0.35992,-0.426189,False,False,False,False


## Model Selection with LazyPredict

In [5]:
# Benchmark many regressors in a single pass to shortlist candidates for tuning.
from notebooks.helpers.LazyPredict import get_lazy_regressor

# SVR is left out of the run -- presumably because it is too slow on a
# training set of this size (~141k rows per the LightGBM log); confirm.
reg = get_lazy_regressor(exclude=['SVR'])
# fit() takes both feature splits first, then both target splits.
# `models` is the per-model score table displayed below (Adjusted R-Squared,
# R-Squared, RMSE, Time Taken); `predictions` is the second return value and is
# not used in this cell.
models, predictions = reg.fit(X_train, X_test, y_train, y_test)
models

 97%|█████████▋| 37/38 [09:10<00:09,  9.75s/it]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001164 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6330
[LightGBM] [Info] Number of data points in the train set: 141619, number of used features: 25
[LightGBM] [Info] Start training from score 8.276012


100%|██████████| 38/38 [09:10<00:00, 14.50s/it]


Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
KNeighborsRegressor,0.57,0.57,1.97,3.37
ExtraTreesRegressor,0.56,0.56,1.99,67.36
XGBRegressor,0.54,0.54,2.04,0.59
MLPRegressor,0.54,0.54,2.04,37.67
LGBMRegressor,0.54,0.54,2.05,0.84
HistGradientBoostingRegressor,0.54,0.54,2.05,2.18
GradientBoostingRegressor,0.52,0.52,2.08,34.87
BaggingRegressor,0.52,0.52,2.09,18.29
LassoLarsCV,0.49,0.5,2.14,0.42
LassoLarsIC,0.49,0.5,2.14,0.15


The best models by RMSE on the hold-out split are:

* KNeighborsRegressor
* ExtraTreesRegressor
* XGBRegressor (tied with MLPRegressor)

For further work I choose:

* KNeighborsRegressor
* BaggingRegressor
* XGBRegressor