# Testing multiple regressor models with default parameters using LazyPredict

After preprocessing the data using our custom pipeline, we used LazyPredict to evaluate multiple regression models. 

In [None]:
# Import get_lazy_regressor() from the helper script by file path
import importlib.util
import sys

def import_function_from_path(file_path, function_name, module_name="module_name"):
    """Execute a Python source file as a module and return one attribute from it.

    Parameters
    ----------
    file_path : str or os.PathLike
        Location of the Python source file to load.
    function_name : str
        Name of the attribute (typically a function) to retrieve from the
        loaded module.
    module_name : str, optional
        Key under which the module is registered in ``sys.modules``.
        Defaults to ``"module_name"``. Pass a distinct name per file when
        loading several files, otherwise each call replaces the previous entry.

    Returns
    -------
    object
        The attribute called ``function_name`` of the freshly loaded module.

    Raises
    ------
    ImportError
        If no import spec/loader can be built for ``file_path`` (for example
        when the path does not have a recognised Python file suffix).
    AttributeError
        If the loaded module has no attribute called ``function_name``.

    Any exception raised while executing the file is propagated unchanged; in
    that case the ``sys.modules`` entry is restored to its previous state.
    """
    # Build the import spec; this returns None when no loader matches the path
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot build an import spec for {file_path!r}")

    module = importlib.util.module_from_spec(spec)

    # Register the module before executing it, and remember what was there
    # so a failed execution does not leave a half-initialised module behind.
    previous = sys.modules.get(module_name)
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        if previous is None:
            sys.modules.pop(module_name, None)
        else:
            sys.modules[module_name] = previous
        raise

    # Retrieve the requested attribute from the loaded module
    try:
        return getattr(module, function_name)
    except AttributeError:
        raise AttributeError(
            f"{file_path!r} does not define {function_name!r}"
        ) from None

# PATH_TO_PROJECT is the location of the project folder (e.g: /home/sep24_bds_int_medical).
# Set the PATH_TO_PROJECT environment variable to point at your own checkout;
# the fallback below is the location used when this notebook was last run.
import os

PATH_TO_PROJECT = os.environ.get(
    'PATH_TO_PROJECT',
    '/Users/masaver/Desktop/masaver/data_science_projects/sep24_bds_int_medical',
)

# The helper script is resolved relative to the project folder, so only one path needs configuring
PATH_TO_SCRIPT = os.path.join( PATH_TO_PROJECT , 'notebooks' , 'helpers' , 'LazyPredict.py' )
function_name = 'get_lazy_regressor'

get_lazy_regressor = import_function_from_path( PATH_TO_SCRIPT , function_name )

# Make the project folder importable; the guard keeps sys.path free of duplicates when the cell is re-run
PATH_TO_SRC = PATH_TO_PROJECT
if PATH_TO_SRC not in sys.path:
    sys.path.append( PATH_TO_SRC )

# Load other required libraries
import os
import pandas as pd
from sklearn.model_selection import train_test_split

# Mute warnings
import warnings
warnings.filterwarnings("ignore")

# Import the preprocessing pipeline
from pipelines import *

In [None]:
# Read the data from train.csv; the first CSV column (row ids such as 'p01_0') becomes the index.
# Date parsing is not requested: the index holds string ids rather than dates,
# so asking pandas to parse it would only add overhead without changing the result.
data_dir = '../../../../data/'
train_file = os.path.join( data_dir , 'raw' , 'train.csv' )
df_train = pd.read_csv( train_file , index_col = 0 )
display( df_train.head() )

Unnamed: 0_level_0,p_num,time,bg-5:55,bg-5:50,bg-5:45,bg-5:40,bg-5:35,bg-5:30,bg-5:25,bg-5:20,...,activity-0:40,activity-0:35,activity-0:30,activity-0:25,activity-0:20,activity-0:15,activity-0:10,activity-0:05,activity-0:00,bg+1:00
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
p01_0,p01,06:10:00,,,9.6,,,9.7,,,...,,,,,,,,,,13.4
p01_1,p01,06:25:00,,,9.7,,,9.2,,,...,,,,,,,,,,12.8
p01_2,p01,06:40:00,,,9.2,,,8.7,,,...,,,,,,,,,,15.5
p01_3,p01,06:55:00,,,8.7,,,8.4,,,...,,,,,,,,,,14.8
p01_4,p01,07:10:00,,,8.4,,,8.1,,,...,,,,,,,,,,12.7


## Data Preprocessing

### Main steps
* Re-encoding the ``timestamp`` into a ``day-phase``
* Dropping the following columns: ``activity``, ``carbs``, ``steps``, ``p_num`` and ``time``
* Imputing NaNs in the remaining columns with interpolation and medians
* Replacing the two negative values in the ``insulin`` column with ``0``
* The column ``day-phase`` is re-encoded using ``pd.get_dummies()``
* Finally, all columns were transformed using ``StandardScaler``.

In [3]:
# Separate the target column from the features, hold out 20% of the rows for testing,
# then fit the preprocessing pipeline on the training part only and apply it to both parts.
# NOTE(review): consecutive rows of one patient look like overlapping time windows
# (see the df_train preview), so a purely random split may leak information between
# train and test; consider a split grouped by patient. TODO confirm.
TARGET_COL = 'bg+1:00'
y = df_train[TARGET_COL]
X = df_train.drop( columns = TARGET_COL )

# Train Test Split (fixed seed so the partition is reproducible)
x_train,x_test,y_train,y_test = train_test_split( X , y , test_size=0.2 , random_state=17 )

# Standardize the features with the preprocessing pipeline
data_pipe = pipeline_s
x_train_s = data_pipe.fit_transform( x_train )
x_test_s = data_pipe.transform( x_test )

display( x_train_s , x_test_s )


Unnamed: 0_level_0,bg-5:55,bg-5:50,bg-5:45,bg-5:40,bg-5:35,bg-5:30,bg-5:25,bg-5:20,bg-5:15,bg-5:10,...,cals-0:45,cals-0:40,cals-0:35,cals-0:30,cals-0:25,cals-0:20,cals-0:15,cals-0:10,cals-0:05,cals-0:00
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
p11_1931,1.07,1.07,1.04,1.07,1.01,0.91,0.81,0.74,0.81,0.84,...,-0.34,-0.34,-0.34,-0.34,-0.34,-0.34,-0.34,-0.29,-0.34,-0.34
p10_9422,-0.89,-0.89,-0.82,-0.79,-0.69,-0.73,-0.79,-0.79,-0.79,-0.89,...,0.01,-0.24,-0.21,-0.25,-0.24,-0.19,0.16,-0.11,-0.26,0.06
p06_8273,0.07,0.07,0.07,-0.03,-0.13,-0.23,-0.31,-0.40,-0.49,-0.55,...,3.15,3.64,1.51,0.09,-0.01,0.72,3.56,3.88,3.62,1.48
p12_17133,-0.46,-0.49,-0.46,-0.46,-0.46,-0.49,-0.49,-0.49,-0.46,-0.46,...,4.14,4.58,4.89,4.47,4.43,4.66,4.47,3.89,4.03,4.04
p03_2346,-0.66,-0.66,-0.63,-0.66,-0.63,-0.73,-0.76,-0.76,-0.76,-0.79,...,-0.37,-0.34,-0.37,-0.38,-0.37,-0.37,-0.02,0.36,-0.34,-0.35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
p02_17172,-0.16,-0.16,-0.16,-0.16,-0.26,-0.33,-0.39,-0.43,-0.46,-0.49,...,-0.85,-0.85,-0.85,-0.85,-0.85,-0.85,-0.85,-0.85,-0.85,-0.85
p10_23964,0.84,0.90,0.97,0.97,0.91,0.84,0.74,0.54,0.37,0.24,...,0.58,0.72,1.18,-0.22,-0.13,0.23,-0.13,1.42,2.25,-0.03
p03_7966,1.90,1.87,1.80,1.67,1.57,1.41,1.24,1.14,1.01,0.87,...,-0.37,-0.39,-0.38,-0.38,-0.38,-0.39,-0.38,-0.39,-0.39,-0.38
p03_628,-0.43,-0.46,-0.49,-0.49,-0.49,-0.49,-0.46,-0.46,-0.43,-0.46,...,-0.32,-0.32,-0.32,-0.32,-0.32,-0.32,-0.32,-0.32,-0.32,-0.32


Unnamed: 0_level_0,bg-5:55,bg-5:50,bg-5:45,bg-5:40,bg-5:35,bg-5:30,bg-5:25,bg-5:20,bg-5:15,bg-5:10,...,cals-0:45,cals-0:40,cals-0:35,cals-0:30,cals-0:25,cals-0:20,cals-0:15,cals-0:10,cals-0:05,cals-0:00
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
p11_17351,0.80,0.80,0.67,0.71,0.84,1.01,1.17,1.47,1.54,1.54,...,-0.34,-0.34,-0.34,-0.34,-0.34,-0.34,-0.34,-0.34,-0.34,-0.34
p10_15385,-0.82,-0.79,-0.82,-0.79,-0.83,-0.79,-0.79,-0.79,-0.69,-0.66,...,-0.35,-0.35,-0.35,-0.35,-0.35,-0.35,-0.32,-0.35,-0.35,-0.35
p06_2496,-1.09,-1.09,-1.09,-1.07,-1.05,-1.02,-1.03,-1.02,-1.02,-0.97,...,0.81,0.93,1.72,1.41,1.13,1.48,0.71,0.37,0.45,-0.28
p10_8410,-0.63,-0.59,-0.59,-0.56,-0.53,-0.53,-0.56,-0.63,-0.72,-0.83,...,2.69,3.64,3.11,1.09,3.29,3.74,3.64,3.70,4.00,3.92
p02_2562,0.44,0.41,0.34,0.31,0.31,0.27,0.24,0.24,0.21,0.21,...,-0.69,-0.69,-0.69,-0.69,-0.69,-0.54,-0.88,-0.41,-0.27,-0.39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
p02_24761,0.90,0.34,0.01,-0.16,-0.33,-0.59,-0.79,-0.92,-0.96,-0.99,...,-0.32,-0.32,-0.32,-0.32,-0.32,-0.32,-0.32,-0.32,-0.32,-0.32
p10_22597,-0.69,-0.86,-0.99,-1.06,-0.86,-0.82,-0.76,-0.96,-0.96,-0.96,...,-0.25,-0.35,-0.35,-0.35,-0.34,0.34,0.03,0.07,1.04,0.14
p03_2231,0.67,0.41,0.57,0.64,0.61,0.81,1.01,0.91,0.71,0.67,...,-0.38,-0.13,-0.38,-0.38,-0.38,-0.39,-0.38,-0.39,-0.39,-0.38
p12_4088,-0.92,-0.96,-0.99,-0.96,-0.93,-0.92,-0.89,-0.86,-0.86,-0.79,...,2.54,2.98,2.58,3.30,3.52,-0.06,-0.01,2.47,3.84,3.35


In [4]:
# Summary statistics of the preprocessed train and test features, shown one after the other
display( x_train_s.describe() , x_test_s.describe() )

Unnamed: 0,bg-5:55,bg-5:50,bg-5:45,bg-5:40,bg-5:35,bg-5:30,bg-5:25,bg-5:20,bg-5:15,bg-5:10,...,cals-0:45,cals-0:40,cals-0:35,cals-0:30,cals-0:25,cals-0:20,cals-0:15,cals-0:10,cals-0:05,cals-0:00
count,141619.0,141619.0,141619.0,141619.0,141619.0,141619.0,141619.0,141619.0,141619.0,141619.0,...,141619.0,141619.0,141619.0,141619.0,141619.0,141619.0,141619.0,141619.0,141619.0,141619.0
mean,-0.0,0.0,-0.0,-0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,...,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,-0.0,-0.0
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-2.02,-2.02,-2.02,-2.02,-2.02,-2.02,-2.02,-2.02,-2.02,-2.03,...,-1.13,-1.13,-1.13,-1.13,-1.13,-1.13,-1.13,-1.13,-1.13,-1.13
25%,-0.72,-0.72,-0.73,-0.73,-0.73,-0.73,-0.73,-0.73,-0.72,-0.73,...,-0.38,-0.39,-0.38,-0.38,-0.38,-0.39,-0.38,-0.39,-0.39,-0.38
50%,-0.19,-0.19,-0.19,-0.19,-0.19,-0.19,-0.19,-0.19,-0.19,-0.19,...,-0.32,-0.32,-0.32,-0.32,-0.32,-0.32,-0.32,-0.32,-0.32,-0.32
75%,0.54,0.54,0.54,0.54,0.54,0.54,0.54,0.54,0.54,0.54,...,-0.1,-0.1,-0.1,-0.1,-0.1,-0.1,-0.1,-0.1,-0.1,-0.1
max,6.49,6.49,6.5,6.5,6.5,6.5,6.5,6.5,6.5,6.51,...,14.36,14.39,14.39,14.38,14.38,14.41,14.39,14.38,11.85,14.47


Unnamed: 0,bg-5:55,bg-5:50,bg-5:45,bg-5:40,bg-5:35,bg-5:30,bg-5:25,bg-5:20,bg-5:15,bg-5:10,...,cals-0:45,cals-0:40,cals-0:35,cals-0:30,cals-0:25,cals-0:20,cals-0:15,cals-0:10,cals-0:05,cals-0:00
count,35405.0,35405.0,35405.0,35405.0,35405.0,35405.0,35405.0,35405.0,35405.0,35405.0,...,35405.0,35405.0,35405.0,35405.0,35405.0,35405.0,35405.0,35405.0,35405.0,35405.0
mean,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.0,0.0,0.01
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.99,1.0,1.0,1.0,1.0,1.01,1.01,1.01,1.02,1.02
min,-2.02,-2.02,-2.02,-2.02,-2.02,-2.02,-2.02,-2.02,-2.02,-2.03,...,-1.13,-1.13,-1.13,-1.13,-1.13,-1.13,-1.13,-1.13,-1.13,-1.13
25%,-0.72,-0.72,-0.73,-0.73,-0.73,-0.73,-0.73,-0.73,-0.72,-0.73,...,-0.38,-0.39,-0.38,-0.38,-0.38,-0.39,-0.38,-0.39,-0.39,-0.38
50%,-0.19,-0.19,-0.19,-0.19,-0.19,-0.19,-0.19,-0.19,-0.19,-0.19,...,-0.32,-0.32,-0.32,-0.32,-0.32,-0.32,-0.32,-0.32,-0.32,-0.32
75%,0.54,0.54,0.54,0.54,0.54,0.54,0.54,0.54,0.54,0.54,...,-0.1,-0.1,-0.09,-0.1,-0.1,-0.1,-0.1,-0.1,-0.1,-0.09
max,6.49,6.49,6.5,6.5,6.5,6.5,6.5,6.5,6.5,6.51,...,13.31,12.95,13.33,13.77,10.43,13.79,13.78,11.81,14.43,13.41


In [None]:
# Fit and score every LazyRegressor candidate except the ones listed in EXCLUDED_MODELS.
# NOTE: Preliminary tests showed that SVR and Quantile Regressor are not among the best
# performing models and are also very slow, so they are left out of the LazyRegressor task.
# The recorded progress bar for this cell shows a total run time of roughly 2.5 hours.
EXCLUDED_MODELS = ['SVR','QuantileRegressor']
reg = get_lazy_regressor( exclude = EXCLUDED_MODELS )
lazy_results = reg.fit( x_train_s , x_test_s , y_train , y_test )
models, predictions = lazy_results

 97%|█████████▋| 36/37 [2:26:21<00:07,  7.81s/it]     

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.074719 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 73353
[LightGBM] [Info] Number of data points in the train set: 141619, number of used features: 288
[LightGBM] [Info] Start training from score 8.273489


100%|██████████| 37/37 [2:26:29<00:00, 237.56s/it]


In [6]:
# Show the model comparison table: the first element returned by reg.fit()
models

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ExtraTreesRegressor,0.7,0.7,1.63,4902.69
BaggingRegressor,0.65,0.65,1.77,218.93
XGBRegressor,0.63,0.63,1.82,5.8
HistGradientBoostingRegressor,0.6,0.6,1.9,9.92
LGBMRegressor,0.6,0.6,1.9,7.96
GradientBoostingRegressor,0.55,0.55,2.0,2459.81
LassoLarsCV,0.52,0.53,2.06,7.02
LassoLarsIC,0.52,0.53,2.06,2.93
LassoCV,0.52,0.53,2.06,24.09
BayesianRidge,0.52,0.52,2.07,3.28


In [None]:
# Keep only the models whose RMSE is strictly below 2.0
rmse_below_two = models['RMSE'].lt(2.0)
models.loc[rmse_below_two]

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ExtraTreesRegressor,0.7,0.7,1.63,4902.69
BaggingRegressor,0.65,0.65,1.77,218.93
XGBRegressor,0.63,0.63,1.82,5.8
HistGradientBoostingRegressor,0.6,0.6,1.9,9.92
LGBMRegressor,0.6,0.6,1.9,7.96


In [9]:
# Shortlist: RMSE of at most 2.1 combined with a 'Time Taken' value below 10
is_accurate = models['RMSE'].le(2.1)
is_fast = models['Time Taken'].lt(10)
models.loc[is_accurate & is_fast]

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
XGBRegressor,0.63,0.63,1.82,5.8
HistGradientBoostingRegressor,0.6,0.6,1.9,9.92
LGBMRegressor,0.6,0.6,1.9,7.96
LassoLarsCV,0.52,0.53,2.06,7.02
LassoLarsIC,0.52,0.53,2.06,2.93
BayesianRidge,0.52,0.52,2.07,3.28
RidgeCV,0.52,0.52,2.07,3.93
Ridge,0.52,0.52,2.07,1.25
TransformedTargetRegressor,0.52,0.52,2.07,2.89
OrthogonalMatchingPursuitCV,0.51,0.52,2.08,5.3


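Based on the RMSE scores and the computational time required, the following models have been chosen for subsequent analysis. ``ExtraTreesRegressor`` and ``BaggingRegressor`` reached lower RMSE values (1.63 and 1.77), but their ``Time Taken`` values (4902.69 and 218.93, versus less than 10 for each selected model) make them impractical for repeated hyperparameter tuning: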
* ``XGBRegressor``	
* ``HistGradientBoostingRegressor``	
* ``LGBMRegressor``

The selected models are all gradient boosting algorithms recognized for their ability to handle large-scale datasets, capture both linear and nonlinear patterns, and deliver high accuracy in complex predictive tasks. These models will undergo hyperparameter tuning to optimize their performance.