<img title="GitHub Octocat" src='./img/Octocat.jpg' style='height: 60px; padding-right: 15px' alt="Octocat" align="left"> This notebook is part of a GitHub repository: https://github.com/pessini/moby-bikes 
<br>MIT Licensed
<br>Author: Leandro Pessini

# <p style="font-size:100%; text-align:left; color:#444444;">Models</p>

# <p style="font-size:100%; text-align:left; color:#444444;">Table of Contents:</p>
* [1. Datasets](#1)
  * [1.1 Rentals Data - Moby Bikes](#1.1)
  * [1.2 Weather Data - Met Éireann](#1.2)
* [2. Preprocessing & Feature Engineering](#2)
  * [2.1 Target variable distribution](#2.1)
  * [2.2 Missing values](#2.2)
  * [2.3 Exploratory Analysis](#2.3)
  * [2.4 Features Importance](#2.4)

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Models & Evaluation
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import bambi as bmb

# statsmodel
import statsmodels.api as sm
import statsmodels.tsa.api as smt
import statsmodels.formula.api as smf
import statsmodels.stats as stats
import statsmodels.distributions.discrete as distr

# Boost models
import xgboost as xgb
from xgboost import XGBRegressor
import lightgbm as lgb
from lightgbm import LGBMRegressor
import catboost as cat
from catboost import CatBoostRegressor

from sklearn import metrics

# Hyperparameter optimization
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

import time
import warnings
warnings.simplefilter('ignore', FutureWarning)
from statsmodels.tools.sm_exceptions import ConvergenceWarning
warnings.simplefilter('ignore', ConvergenceWarning)

In [2]:
%reload_ext watermark
%watermark -a "Leandro Pessini" -n -u -v -iv -w

Author: Leandro Pessini

Last updated: Fri Apr 29 2022

Python implementation: CPython
Python version       : 3.9.6
IPython version      : 7.25.0

matplotlib : 3.4.2
lightgbm   : 3.2.1
catboost   : 0.26.1
numpy      : 1.21.1
sklearn    : 1.0.2
bambi      : 0.7.1
seaborn    : 0.11.1
pandas     : 1.3.0
xgboost    : 1.4.0
sys        : 3.9.6 | packaged by conda-forge | (default, Jul 11 2021, 03:36:15) 
[Clang 11.1.0 ]
statsmodels: 0.12.2

Watermark: 2.3.0



In [3]:
hourly_data = pd.read_csv('../data/processed/hourly_data.csv')
hourly_rentals = pd.read_csv('../data/processed/hourly_rentals.csv')
hourly_data.shape, hourly_rentals.shape

((8760, 24), (6966, 24))

In [4]:
hourly_data.head(2)

Unnamed: 0,rain,temp,rhum,wdsp,date,hour,day,month,year,holiday,...,peak,timesofday,rainfall_intensity,wind_bft,wind_speed_group,temp_r,temp_kbin,temp_kbin_quantile,temp_kbin_kmeans,count
0,0.0,0.1,98,4,2021-03-01,0,1,3,2021,False,...,False,Night,no rain,2,Calm / Light Breeze,0,1.0,0.0,0.0,0
1,0.0,-1.1,98,3,2021-03-01,1,1,3,2021,False,...,False,Night,no rain,2,Calm / Light Breeze,-1,1.0,0.0,0.0,0


> Example of dynamic table with evalution metrics: https://www.kirenz.com/post/2021-12-06-regression-splines-in-python/regression-splines-in-python/

## Splitting dataset in train and test

In [115]:
from sklearn.preprocessing import KBinsDiscretizer
# transform the temperature with KBinsDiscretizer
enc = KBinsDiscretizer(n_bins=5, encode="ordinal", strategy='kmeans')
hourly_data['humidity_kbin'] = enc.fit_transform(hourly_data['rhum'].array.reshape(-1,1))
hourly_rentals['humidity_kbin'] = enc.fit_transform(hourly_rentals['rhum'].array.reshape(-1,1))

In [305]:
df = hourly_rentals.copy()
# df = hourly_rentals.copy()
df = df.astype({'holiday': 'category',
                'working_day': 'category',
                'peak': 'category',
                'season': 'category',
                'dayofweek': 'category',
                'timesofday': 'category',
                'rainfall_intensity': 'category',
                'wind_bft': 'category',
                'wind_speed_group': 'category'})

df['humidity_norm'] = df['rhum']/100
# predictors = ['temp_r','wind_speed_group','humidity_norm','rainfall_intensity','holiday','season','dayofweek','working_day','peak','timesofday']
predictors = ['temp_kbin_quantile','wind_speed_group','rainfall_intensity','holiday','peak','timesofday']

# OrdinalEnconder
enc_rain = OrdinalEncoder(dtype=np.int64, \
    categories=[['no rain', 'drizzle', 'light rain', 'moderate rain', 'heavy rain']])
df['rainfall_intensity'] = enc_rain.fit_transform(df[['rainfall_intensity']])

enc_wind = OrdinalEncoder(dtype=np.int64, \
    categories=[['Calm / Light Breeze', 'Breeze', 'Moderate Breeze', 'Strong Breeze / Near Gale','Gale / Storm']])
df['wind_speed_group'] = enc_wind.fit_transform(df[['wind_speed_group']])

X = df[[c for c in df.columns if c in predictors]]
y = df.pop('count')

num_vars = [n for n in df.select_dtypes(include=['number']).columns if n in predictors] # list comprehension to select only predictors features
cat_vars = [c for c in df.select_dtypes(include=['category']).columns if c in predictors]

dummies = pd.get_dummies(X[cat_vars], drop_first=False)
X = pd.concat([X[num_vars], dummies],axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train.shape, X_test.shape

((4876, 11), (2090, 11))

In [296]:
X_train.head(2)

Unnamed: 0,rainfall_intensity,wind_speed_group,temp_kbin_quantile,holiday_True,peak_True,timesofday_Evening,timesofday_Morning,timesofday_Night
5768,0,2,3.0,0,0,0,0,0
2995,0,2,7.0,0,0,1,0,0


In [306]:
# Zero-Inflated Negative Binomial model
# X_with_constant = sm.add_constant(X_train)
# model_zinb = sm.NegativeBinomialP(endog=y_train, exog=X_train, exog_infl=X_train, inflation='logit')
# res_zinb = model_zinb.fit(maxiter=150, method='minimize', cov_type='HC3', disp=True)

# method => ‘newton’ for Newton-Raphson, ‘nm’ for Nelder-Mead, ‘bfgs’ for Broyden-Fletcher-Goldfarb-Shanno (BFGS)
# ‘lbfgs’ for limited-memory BFGS with optional box constraints, ‘powell’ for modified Powell’s method
# ‘cg’ for conjugate gradient, ‘ncg’ for Newton-conjugate gradient
# ‘basinhopping’ for global basin-hopping solver, ‘minimize’ for generic wrapper of scipy minimize (BFGS by default)
model_zinb = sm.NegativeBinomialP(endog=y_train, exog=X_train)
res_zinb = model_zinb.fit(maxiter=5000, method='nm', cov_type='HC3', disp=True, retall=True)

Optimization terminated successfully.
         Current function value: 2.338111
         Iterations: 844
         Function evaluations: 1177


In [307]:
print(res_zinb.summary())
# zinb_pred = zip_mod.predict(X_test)
# zinb_rmse = np.sqrt(metrics.mean_squared_error(y_test, zinb_pred))

                     NegativeBinomialP Regression Results                     
Dep. Variable:                  count   No. Observations:                 4876
Model:              NegativeBinomialP   Df Residuals:                     4867
Method:                           MLE   Df Model:                            8
Date:                Fri, 29 Apr 2022   Pseudo R-squ.:                 0.06142
Time:                        22:46:13   Log-Likelihood:                -11401.
converged:                       True   LL-Null:                       -12147.
Covariance Type:                  HC3   LLR p-value:                     0.000
                           coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------
rainfall_intensity      -0.1579      1.625     -0.097      0.923      -3.343       3.027
wind_speed_group        -0.0176      0.044     -0.399      0.690      -0.104       0.069
temp_kbin_qu

In [308]:
zinb_predictions = res_zinb.predict(X_test)
predicted_counts=np.round(zinb_predictions)
actual_counts = y_test
print(f'ZINB RMSE={str(np.sqrt(np.sum(np.power(np.subtract(predicted_counts,actual_counts),2))))}')

ZINB RMSE=140.71247279470288


In [309]:
res_zinb.aic

22821.26074166484

In [310]:
hourly_rentals.groupby(["holiday", "timesofday"])["count"].agg(["mean", "var"])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,var
holiday,timesofday,Unnamed: 2_level_1,Unnamed: 3_level_1
False,Afternoon,6.363636,13.10849
False,Evening,4.330896,8.378856
False,Morning,5.166267,11.81318
False,Night,2.396936,3.573343
True,Afternoon,7.619048,27.607433
True,Evening,4.15625,8.523185
True,Morning,5.258065,18.997849
True,Night,1.678571,1.263228


In [311]:
def round_up(x):
    '''
    Helper function to round away from zero
    '''
    from math import copysign
    return int(x + copysign(0.5, x))

d = {'Actual': y_test, 'Predicted': zinb_predictions}
df_pred = pd.DataFrame(data=d)
df_pred['Predicted'] = df_pred['Predicted'].apply(round_up)

# df_pred = df_pred[df_pred['Actual'] == 0]
df_pred.to_csv('actualvspredicted.csv')

In [312]:
df_pred['Predicted'].describe()

count    2090.000000
mean        4.651675
std         1.646096
min         1.000000
25%         3.000000
50%         5.000000
75%         6.000000
max         8.000000
Name: Predicted, dtype: float64

## Regression with Discrete Dependent Variable¶


Regression models for limited and qualitative dependent variables. Count (Poisson, NegativeBinomial) data.

- Poisson / Zero-Inflated Poisson (ZIP)
- Negative Binomial / Zero-Inflated Negative Binomial (ZINB)

### statsmodels library

- `Poisson()` - Poisson Model
- `NegativeBinomial()` - Negative Binomial Model
- `NegativeBinomialP()` - Generalized Negative Binomial (NB-P) Model
- `GeneralizedPoisson()` - Generalized Poisson Model
- `ZeroInflatedPoisson()` - Poisson Zero Inflated Model
- `ZeroInflatedNegativeBinomialP()` - Zero Inflated Generalized Negative Binomial Model
- `ZeroInflatedGeneralizedPoisson()` - Zero Inflated Generalized Poisson Model


### Abbreviations
- ZI: Zero-inflated; 
- ZIP: Zero-inflated poisson; 
- NB: Negative binomial; 
- ZINB: Zero-inflated negative binomial; 
- HNB: Hurdle negative binomial; 
- PMF: Probability mass function; 
- CDF: Cumulative distributions function; 
- RQR: Randomized quantile residuals; 
- SW: Shaprio-Wilk; 
- AIC: Akaike information criterion



### Goodness of fit statistics
- Pearson Chi-Square test
- Log-Likelihood Ratio test


In [None]:
# print("Model: Zero Inflated Poisson")
# zip_mod = sm.ZeroInflatedPoisson(y_train, X_with_constant, inflation='logit').fit(method="nm", maxiter=50)

# zip_mean_pred = zip_mod.predict(X_test, exog_infl=np.ones((len(X_test), 1)))
# zip_ppf_obs = stats.poisson.ppf(q=0.95, mu=zip_mean_pred)
# zip_rmse = np.sqrt(metrics.mean_squared_error(y_test, zip_ppf_obs))

# print("Model: Zero Inflated Neg. Binomial")
# zinb_mod = sm.ZeroInflatedNegativeBinomialP(y_train, X_with_constant).fit(method="nm", maxiter=50)
# zinb_pred = zip_mod.predict(X_test)
# zinb_rmse = np.sqrt(metrics.mean_squared_error(y_test, zinb_pred))

In [313]:
predictors

['temp_kbin_quantile',
 'wind_speed_group',
 'rainfall_intensity',
 'holiday',
 'peak',
 'timesofday']

In [314]:
fml = "count ~ temp_kbin_quantile + wind_speed_group + rainfall_intensity + holiday + timesofday + holiday:timesofday"

model = bmb.Model(fml, hourly_rentals, family="negativebinomial")
trace = model.fit(draws=1000, tune=1000, cores=2)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [count_alpha, holiday:timesofday, timesofday, holiday, rainfall_intensity, wind_speed_group, temp_kbin_quantile, Intercept]


Sampling 2 chains for 1_000 tune and 1_000 draw iterations (2_000 + 2_000 draws total) took 61 seconds.


In [303]:
import arviz as az

In [315]:
az.summary(trace)

Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
Intercept,1.329,0.044,1.246,1.411,0.001,0.001,2497.0,1395.0,1.0
temp_kbin_quantile,0.045,0.003,0.04,0.051,0.0,0.0,4255.0,1775.0,1.0
wind_speed_group[Calm / Light Breeze],0.018,0.018,-0.015,0.053,0.0,0.0,2933.0,1415.0,1.0
wind_speed_group[Gale / Storm],-0.404,0.564,-1.474,0.609,0.009,0.011,3968.0,1578.0,1.0
wind_speed_group[Moderate Breeze],0.005,0.019,-0.029,0.042,0.0,0.0,3108.0,1689.0,1.0
wind_speed_group[Strong Breeze / Near Gale],-0.171,0.066,-0.297,-0.049,0.001,0.001,4343.0,1728.0,1.0
rainfall_intensity[heavy rain],-0.024,0.235,-0.469,0.398,0.004,0.005,3785.0,1574.0,1.0
rainfall_intensity[light rain],-0.029,0.087,-0.187,0.136,0.002,0.002,3352.0,1456.0,1.0
rainfall_intensity[moderate rain],-0.148,0.061,-0.259,-0.032,0.001,0.001,2748.0,1556.0,1.0
rainfall_intensity[no rain],0.272,0.037,0.206,0.345,0.001,0.001,2300.0,1332.0,1.0


In [None]:
fdssd

## Preprocessing Pipelines

In [None]:
# Define categorical pipeline
cat_pipe = Pipeline([
    #('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

# Define numerical pipeline
num_pipe = Pipeline([
    #('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
    # ('scaler', MinMaxScaler())
])

ord_pipe = Pipeline([
    ('ordinal_enconder', OrdinalEncoder(dtype=np.int64, categories=[['no rain', 
                                                                     'drizzle', 
                                                                     'light rain', 
                                                                     'moderate rain', 
                                                                     'heavy rain']]))
])

# Combine categorical and numerical pipelines
preprocessor = ColumnTransformer([
    ('cat', cat_pipe, cat_vars),
    ('ordinal_enconder', ord_pipe, ord_var),
    ('num', num_pipe, num_vars)
], remainder='passthrough')

## Feature Importance

In [None]:
# LightGBM model
params_lightgbm = {'n_estimators': 5000,
                   'objective': 'l1',
                   'learning_rate': 0.01, 
                   'subsample': 0.7,
                   'verbosity': -1,
                   'feature_fraction': 0.5,
                   'bagging_fraction': 0.5,
                   'bagging_freq': 20,
                   'importance_type': 'gain'
                   }

# Fit a pipeline with transformers and an estimator to the training data
pipe_gbm = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LGBMRegressor(**params_lightgbm))
])
pipe_gbm.fit(X_train, y_train)

#Plotting features importance
feature_imp = pd.DataFrame(sorted(zip(pipe_gbm['model'].feature_importances_,X_train.columns)), 
                           columns=['Value','Feature'])
scaler_ft = MinMaxScaler()
feature_imp['Value'] = scaler_ft.fit_transform(feature_imp['Value'].values.reshape(-1,1));

fig = plt.figure(figsize=(15, 12))
sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value", ascending=False))
plt.title('LightGBM Features Importance')
locs, labels = plt.xticks()
plt.tick_params(axis='both', which='major', labelsize=12)

plt.show()

## Random Forest Regressor

### Extrapolation problem 

When using a Random Forest Regressor, the predicted values are never outside the training set values for the target variable. If it is tasked with the problem of predicting for values not previously seen, it will always predict an average of the values seen previously. Obviously the average of a sample can not fall outside the highest and lowest values in the sample. 

The Random Forest Regressor is unable to discover trends that would enable it in extrapolating values that fall outside the training set. When faced with such a scenario, the regressor assumes that the prediction will fall close to the maximum value in the training set. 


### Potential solutions

Ok, so how can you deal with this extrapolation problem?

There are a couple of options:

- Use a linear model such as SVM regression, Linear Regression, etc
- Build a deep learning model because neural nets are able to extrapolate (they are basically stacked linear regression models on steroids)
- Combine predictors using [stacking](https://scikit-learn.org/stable/auto_examples/ensemble/plot_stack_predictors.html). For example, you can create a stacking regressor using a Linear model and a Random Forest Regressor. 
- Use modified versions of random forest

One of such extensions is [Regression-Enhanced Random Forests](https://arxiv.org/pdf/1904.10416.pdf) (RERFs). The authors of this paper propose a technique borrowed from the strengths of penalized parametric regression to give better results in extrapolation problems.

Specifically, there are two steps to the process:

run Lasso before Random Forest, 
train a Random Forest on the residuals from Lasso. 
Since Random Forest is a fully nonparametric predictive algorithm, it may not efficiently incorporate known relationships between the response and the predictors. The response values are the observed values Y1, . . . , Yn  from the training data. RERFs are able to incorporate known relationships between the responses and the predictors which is another benefit of using Regression-Enhanced Random Forests for regression problems.

Source: https://neptune.ai/blog/random-forest-regression-when-does-it-fail-and-why

In [None]:
# random forest model
params_rf = {'n_estimators': 1000, 
             'max_depth': 20, 
             'random_state': 0, 
             'min_samples_split' : 5,
             'n_jobs': -1}

# Fit a pipeline with transformers and an estimator to the training data
pipe_rf = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(**params_rf, criterion='mae'))
])
pipe_rf.fit(X_train, y_train)

In [None]:
pipe_rf['model'].feature_importances_

In [None]:
#Plotting features importance
feature_imp = pd.DataFrame(sorted(zip(pipe_rf['model'].feature_importances_,X_train.columns)), 
                           columns=['Value','Feature'])
scaler_ft = MinMaxScaler()
feature_imp['Value'] = scaler_ft.fit_transform(feature_imp['Value'].values.reshape(-1,1));

fig = plt.figure(figsize=(15, 12))
sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value", ascending=False))
plt.title('Random Forest Features Importance')
locs, labels = plt.xticks()
plt.tick_params(axis='both', which='major', labelsize=12)

plt.show()

In [None]:
from sklearn.inspection import permutation_importance
import shap

In [None]:
perm_importance = permutation_importance(pipe_rf, X_test, y_test)

In [None]:
perm_importance

In [None]:
sorted_idx = perm_importance.importances_mean.argsort()
sorted_idx
# plt.barh(feature_imp[sorted_idx], perm_importance.importances_mean[sorted_idx])
# plt.xlabel("Permutation Importance")

In [None]:
# random forest model
params_rf = {'n_estimators': 1000, 
             'max_depth': 20, 
             'random_state': 0, 
             'min_samples_split' : 5,
             'n_jobs': -1}

# Fit a pipeline with transformers and an estimator to the training data
pipe_rf = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(**params_rf, criterion='mae'))
])
pipe_rf.fit(X_train, y_train)

#Plotting features importance
feature_imp = pd.DataFrame(sorted(zip(pipe_gbm['model'].feature_importances_,X_train.columns)), 
                           columns=['Value','Feature'])
scaler_ft = MinMaxScaler()
feature_imp['Value'] = scaler_ft.fit_transform(feature_imp['Value'].values.reshape(-1,1));

fig = plt.figure(figsize=(15, 12))
sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value", ascending=False))
plt.title('Random Forest Features Importance')
locs, labels = plt.xticks()
plt.tick_params(axis='both', which='major', labelsize=12)

plt.show()

In [None]:
# random forest model
params_rf = {'n_estimators': 1000, 
             'max_depth': 20, 
             'random_state': 0, 
             'min_samples_split' : 5, 
             'n_jobs': -1}

# Fit a pipeline with transformers and an estimator to the training data
pipe_rf = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(**params_rf))
])
pipe_rf.fit(X_train, y_train)
# y_train_pred = pipe_rf.predict(X_train)
# y_test_pred = pipe_rf.predict(X_test)

# print_evalmetrics(y_test, y_test_pred)

## Support Vector Regression

In [None]:
from sklearn.svm import SVR

# Fit a pipeline with transformers and an estimator to the training data
pipe_svr = Pipeline([
    ('preprocessor', preprocessor),
    ('model', SVR(kernel='poly',gamma='scale',C=100))
])
pipe_svr.fit(X_train, y_train)
y_train_pred = pipe_svr.predict(X_train)
y_test_pred = pipe_svr.predict(X_test)

print_evalmetrics(y_test, y_test_pred)

In [None]:
from numpy import mean
from numpy import std
from numpy import absolute
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

# define the model cross-validation configuration
cv = KFold(n_splits=10, shuffle=True, random_state=1)
# evaluate the pipeline using cross validation and calculate MAE
scores = cross_val_score(pipe_svr, X_train, y_train, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
# convert MAE scores to positive values
scores = absolute(scores)
# summarize the model performance
print('MAE: %.3f (%.3f)' % (mean(scores), std(scores)))
y_test_pred = pipe_svr.predict(X_test)
print_evalmetrics(y_test, y_test_pred)

In [None]:
predicted_values = pd.DataFrame()
predicted_values['real'] = y_test
predicted_values['predicted'] = y_test_pred

predicted_values

## GradientBoost

In [None]:
# GBM model
params_gbm = {'n_estimators': 150, 
              'max_depth': 5, 
              'random_state': 0, 
              'min_samples_leaf' : 10, 
              'learning_rate': 0.01, 
              'subsample': 0.7, 
              'loss': 'ls'}

# Fit a pipeline with transformers and an estimator to the training data
pipe_gbm = Pipeline([
    ('preprocessor', preprocessor),
    ('model', GradientBoostingRegressor(**params_gbm))
])
pipe_gbm.fit(X_train, y_train)
y_train_pred = pipe_gbm.predict(X_train)
y_test_pred = pipe_gbm.predict(X_test)

print_evalmetrics(y_test, y_test_pred)

## LightGBM

In [None]:
# LightGBM model
params_lightgbm = {'n_estimators': 1000, 
                   'max_depth': 15, 
                   'random_state': 0, 
                   'learning_rate': 0.01, 
                   'subsample': 0.7,
                   'num_leaves': 30,
                   'metric': 'rmse',
                   'n_jobs': 2
                   }

# Fit a pipeline with transformers and an estimator to the training data
pipe_gbm = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LGBMRegressor(**params_lightgbm))
])
pipe_gbm.fit(X_train, y_train)
y_train_pred = pipe_gbm.predict(X_train)
y_test_pred = pipe_gbm.predict(X_test)

print_evalmetrics(y_test, y_test_pred)

In [None]:
%%time

cv = KFold(n_splits=3, shuffle=True, random_state=2022)

for n_fold, (train_index, test_index) in enumerate(cv.split(X_train, y_train)):
    print('#'*40, f'Fold {n_fold+1} out of {cv.n_splits}', '#'*40)
    
    # X_train, y_train = X[train_index], y[train_index] # Train data
    # X_val, y_val = X[test_index], y[test_index] # Valid data
    
    # pipe_gbm.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)],
    #           verbose=250, early_stopping_rounds=50)
    
    # preds_lgb[test_index] += pipe_gbm.predict(X_val, raw_score=False)

## Catboost

In [None]:
# Catboost model

# Fit a pipeline with transformers and an estimator to the training data
pipe_catboost = Pipeline([
    ('preprocessor', preprocessor),
    ('model', CatBoostRegressor(verbose=1, n_estimators=100))
])
pipe_catboost.fit(X_train, y_train)
y_train_pred = pipe_catboost.predict(X_train)
y_test_pred = pipe_catboost.predict(X_test)

print('\n')
print_evalmetrics(y_test, y_test_pred)

<img title="GitHub Mark" src="./img/GitHub-Mark-64px.png" style="height: 32px; padding-right: 15px" alt="GitHub Mark" align="left"> [GitHub repository](https://github.com/pessini/moby-bikes) <br>Author: Leandro Pessini