<img title="GitHub Octocat" src='./img/Octocat.jpg' style='height: 60px; padding-right: 15px' alt="Octocat" align="left"> This notebook is part of a GitHub repository: https://github.com/pessini/moby-bikes 
<br>MIT Licensed
<br>Author: Leandro Pessini

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import datetime
import sys
import os
import joblib

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Models & Evaluation
from sklearn.model_selection import KFold

# statsmodel
import statsmodels.api as sm
import statsmodels.tsa.api as smt
import statsmodels.formula.api as smf
import statsmodels.stats as stats

# Boost models
import xgboost as xgb
from sklearn import metrics

# Hyperparameter optimization
from bayes_opt import BayesianOptimization

# Custom objects
sys.path.insert(0, os.path.abspath('../src/'))
import experiment_tracker as et

import time
import warnings
warnings.simplefilter('ignore', FutureWarning)
from statsmodels.tools.sm_exceptions import ConvergenceWarning
warnings.simplefilter('ignore', ConvergenceWarning)

In [2]:
# ideas_df = pd.read_excel('../documentation/experiment_tracker.xlsx', sheet_name='Ideas')
# experiments_df = pd.read_excel('../documentation/experiment_tracker.xlsx', sheet_name='Experiments')

In [3]:
# creates a new object to keep track of the experiments
experiment_tracker = et.ExperimentTracker()

In [4]:
df_train = pd.read_csv('../data/processed/df_train.csv')
df_test = pd.read_csv('../data/processed/df_test.csv')

In [5]:
df = df_train.copy()
X = df.drop(['count'], axis=1)
y = df.pop('count')
all_columns = list(X.columns)
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)
X.shape

(8760, 22)

In [6]:
test_df = df_test.copy()
X_test = test_df.drop(['count'], axis=1)
y_test = test_df.pop('count')
X_test.shape

(1464, 22)

In [7]:
def get_metrics_to_Experiment(dict_scores = None) -> list:
    if dict_scores is None:
        dict_scores = {}
    rsme = et.Score('RSME', '{:.4f}'.format(dict_scores['train_rsme']), '{:.4f}'.format(dict_scores['val_rsme']))
    mae = et.Score('MAE', '{:.4f}'.format(dict_scores['train_mae']), '{:.4f}'.format(dict_scores['val_mae']))
    return [rsme, mae]

In [8]:
import category_encoders as ce

def preprocessor(predictors: list) -> ColumnTransformer:
    # Setting remainder='passthrough' will mean that all columns not specified in the list of “transformers” 
    #   will be passed through without transformation, instead of being dropped

    ##################### Categorical variables #####################
    all_cat_vars = ['timesofday','dayofweek','holiday','peak','hour','working_day','season','month']
    cat_vars = [categorical_var for categorical_var in all_cat_vars if categorical_var in predictors]

    # categorical variables
    cat_pipe = Pipeline([
        ('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))
    ])

    cat_encoder = 'cat', cat_pipe, cat_vars

    ##################### Numerical variables #####################
    all_num_vars = ['rain', 'temp', 'rhum','wdsp','temp_r']
    num_vars = [numerical_var for numerical_var in all_num_vars if numerical_var in predictors]

    num_pipe = Pipeline([
        ('scaler', StandardScaler())
        # ('scaler', MinMaxScaler())
    ])

    num_enconder =  'num', num_pipe, num_vars

    ##################### Ordinal variables #####################
    all_ord_vars = ['wind_speed_group','rainfall_intensity']
    ord_vars = [ordinal_var for ordinal_var in all_ord_vars if ordinal_var in predictors]

    ordinal_cols_mapping = []
    if 'wind_speed_group' in predictors:
        ordinal_cols_mapping.append(
            {"col":"wind_speed_group",    
            "mapping": {
                'Calm / Light Breeze': 0, 
                'Breeze': 1, 
                'Moderate Breeze': 2, 
                'Strong Breeze / Near Gale': 3, 
                'Gale / Storm': 4
            }}
        )

    if 'rainfall_intensity' in predictors:
        ordinal_cols_mapping.append(
            {"col":"rainfall_intensity",    
            "mapping": {
                'no rain': 0, 
                'drizzle': 1, 
                'light rain': 2, 
                'moderate rain': 3, 
                'heavy rain': 4
            }}
        )

    # ordinal variables
    ord_pipe = Pipeline([
        ('ordinal', ce.OrdinalEncoder(mapping=ordinal_cols_mapping))
    ])

    ord_enconder =  'ordinal', ord_pipe, ord_vars
    
    #################################################################################
    
    orig_vars = [var for var in predictors if var not in cat_vars and var not in num_vars and var not in ord_vars]
    orig_enconder = 'pass_vars', 'passthrough', orig_vars
     # ['temp_bin','rhum_bin']
    # ord_pipe = 'passthrough'

    transformers_list = []
    transformers_list.append(cat_encoder) if cat_vars else None
    transformers_list.append(ord_enconder) if ord_vars else None
    transformers_list.append(num_enconder) if num_vars else None
    # transformers_list.append(orig_enconder) if orig_vars else None
    
    return ColumnTransformer(transformers=transformers_list, 
                             remainder='drop')

In [9]:
def summarize_dict(dictionary, function):
    return {k: function(v) for k,v in dictionary.items()}

In [36]:
def kfold_score(params, predictors, X=X, y=y, n_folds=5, verbose=50, early_stopping_rounds=10):
    
    pipe_xgboost = Pipeline([
        ('preprocessor', preprocessor(predictors)),
        ('model', xgb.XGBRegressor(**params))
    ])

    X = X[[c for c in X.columns if c in predictors]]
    cv = KFold(n_splits=n_folds, shuffle=True, random_state=2022)
    scores = {"train_rsme":[],"val_rsme":[],"train_mae":[],"val_mae":[]}

    for n_fold, (train_index, test_index) in enumerate(cv.split(X, y)):
        print('#'*40, f'Fold {n_fold+1} out of {cv.n_splits}', '#'*40)
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Xy = xgb.DMatrix(X_train, y_train, enable_categorical=True)
        X_test_transformed = pipe_xgboost['preprocessor'].fit_transform(X_test)
        pipe_xgboost.fit(X_train, y_train,
                         model__eval_set=[(X_test_transformed, y_test)], 
                         model__early_stopping_rounds=early_stopping_rounds,
                         model__verbose=verbose)
        # pipe_xgboost.fit(X_train, y_train)
        # print(pipe_xgboost['model'].evals_result())

        # Predict on training and validation set
        y_pred_train = pipe_xgboost.predict(X_train)
        y_pred_val = pipe_xgboost.predict(X_test)

        # Calculate the RSME and MAE
        # If squared = True returns MSE value, if False returns RMSE value.
        scores['train_rsme'].append(metrics.mean_squared_error(y_train, y_pred_train, squared=False))
        scores['val_rsme'].append(metrics.mean_squared_error(y_test, y_pred_val, squared=False))
        scores['train_mae'].append(metrics.mean_absolute_error(y_train, y_pred_train))
        scores['val_mae'].append(metrics.mean_absolute_error(y_test, y_pred_val))

        print(f"Fold {n_fold+1} - best iteration: {pipe_xgboost['model'].get_booster().best_iteration}\n")

    return summarize_dict(scores, np.mean)

In [11]:
#predictors = ['temp','rhum','dayofweek', 'holiday','timesofday','wdsp','rainfall_intensity','peak','working_day', 'hour', 'season']

In [12]:
# Baseline model
predictors = ['temp','rhum','dayofweek', 'holiday','timesofday','wdsp','rainfall_intensity','peak','working_day', 'hour', 'season']

params_xgboost = {'max_depth':3,
                   'seed': 42,
                   'eval_metric': 'rmse'
                   }

dict_scores = kfold_score(params_xgboost, predictors, n_folds=3)
exp_xgboost = et.Experiment('XGBoost (Baseline)', predictors=predictors, hyperparameters=params_xgboost,
                               score=get_metrics_to_Experiment(dict_scores), notes='Baseline XGBoost')
experiment_tracker.add_experiment(exp_xgboost)

######################################## Fold 1 out of 3 ########################################
[0]	validation_0-rmse:4.21536
[50]	validation_0-rmse:2.74378
[59]	validation_0-rmse:2.74717
Fold 1 - best iteration: 50

######################################## Fold 2 out of 3 ########################################
[0]	validation_0-rmse:3.78092
[50]	validation_0-rmse:2.50728
[53]	validation_0-rmse:2.51117
Fold 2 - best iteration: 43

######################################## Fold 3 out of 3 ########################################
[0]	validation_0-rmse:3.96063
[50]	validation_0-rmse:2.65082
[60]	validation_0-rmse:2.65222
Fold 3 - best iteration: 50

--- New Experiment added! ---
ID#: 6522915280 
Algorithm: XGBoost (Baseline) 
Predictors: ['temp', 'rhum', 'dayofweek', 'holiday', 'timesofday', 'wdsp', 'rainfall_intensity', 'peak', 'working_day', 'hour', 'season']
Hyperparameters: {'max_depth': 3, 'seed': 42, 'eval_metric': 'rmse'}
Date: 24/06/2022 20:12:50
Metric: [{ 'metric': RSME, 'trai

In [13]:
predictors = ['temp','rhum','dayofweek', 'holiday','timesofday','wdsp','rainfall_intensity','peak','working_day', 'hour', 'season']

params_xgboost = {'max_depth':3,
                   'seed': 42,
                   'eval_metric': 'rmse'
                   }

dict_scores = kfold_score(params_xgboost, predictors, n_folds=5)
exp_xgboost = et.Experiment('XGBoost (kfold=5)', predictors=predictors, hyperparameters=params_xgboost, 
                            score=get_metrics_to_Experiment(dict_scores), 
                            notes='Changed kfold from 3 to 5')
experiment_tracker.add_experiment(exp_xgboost)

######################################## Fold 1 out of 5 ########################################
[0]	validation_0-rmse:4.11705
[50]	validation_0-rmse:2.70987
[53]	validation_0-rmse:2.71205
Fold 1 - best iteration: 44

######################################## Fold 2 out of 5 ########################################
[0]	validation_0-rmse:4.06442
[38]	validation_0-rmse:2.67475
Fold 2 - best iteration: 29

######################################## Fold 3 out of 5 ########################################
[0]	validation_0-rmse:3.83542
[50]	validation_0-rmse:2.49629
[75]	validation_0-rmse:2.48870
Fold 3 - best iteration: 66

######################################## Fold 4 out of 5 ########################################
[0]	validation_0-rmse:3.86631
[44]	validation_0-rmse:2.57703
Fold 4 - best iteration: 34

######################################## Fold 5 out of 5 ########################################
[0]	validation_0-rmse:4.01331
[50]	validation_0-rmse:2.69621
[54]	validation_0-rmse:2.69

In [14]:
experiment_tracker.print_partial_results(filter_metric='rsme')

--- Experiments ---

Model: XGBoost (Baseline)
RSME - Train: 2.4729 - Validation: 2.6320 - Test: None

Model: XGBoost (kfold=5)
RSME - Train: 2.5074 - Validation: 2.6241 - Test: None


In [15]:
predictors = ['temp','rhum','dayofweek', 'holiday','timesofday','wdsp','rainfall_intensity','peak','working_day', 'hour', 'season']

params_xgboost = {'max_depth':4,
                   'seed': 42,
                   'eval_metric': 'rmse'
                   }

dict_scores = kfold_score(params_xgboost, predictors, n_folds=5)
exp_xgboost = et.Experiment('XGBoost (max_depth=4)', predictors=predictors, hyperparameters=params_xgboost, 
                            score=get_metrics_to_Experiment(dict_scores), 
                            notes='Changed max_depth from 3 to 4')
experiment_tracker.add_experiment(exp_xgboost)

######################################## Fold 1 out of 5 ########################################
[0]	validation_0-rmse:4.09116
[50]	validation_0-rmse:2.72225
[51]	validation_0-rmse:2.72316
Fold 1 - best iteration: 41

######################################## Fold 2 out of 5 ########################################
[0]	validation_0-rmse:4.04536
[50]	validation_0-rmse:2.65851
[52]	validation_0-rmse:2.65806
Fold 2 - best iteration: 43

######################################## Fold 3 out of 5 ########################################
[0]	validation_0-rmse:3.80920
[50]	validation_0-rmse:2.48782
Fold 3 - best iteration: 40

######################################## Fold 4 out of 5 ########################################
[0]	validation_0-rmse:3.84692
[32]	validation_0-rmse:2.55594
Fold 4 - best iteration: 22

######################################## Fold 5 out of 5 ########################################
[0]	validation_0-rmse:3.99428
[26]	validation_0-rmse:2.70347
Fold 5 - best iteration: 17

In [16]:
experiment_tracker.print_partial_results(filter_metric='rsme')

--- Experiments ---

Model: XGBoost (Baseline)
RSME - Train: 2.4729 - Validation: 2.6320 - Test: None

Model: XGBoost (kfold=5)
RSME - Train: 2.5074 - Validation: 2.6241 - Test: None

Model: XGBoost (max_depth=4)
RSME - Train: 2.4190 - Validation: 2.6204 - Test: None


In [17]:
predictors = ['temp','rhum','dayofweek', 'holiday','timesofday','wdsp','rainfall_intensity','peak','working_day', 'hour', 'season']

params_xgboost = {'max_depth':5,
                   'seed': 42,
                   'eval_metric': 'rmse'
                   }

dict_scores = kfold_score(params_xgboost, predictors, n_folds=5)
exp_xgboost = et.Experiment('XGBoost (max_depth=5)', predictors=predictors, hyperparameters=params_xgboost, 
                            score=get_metrics_to_Experiment(dict_scores), 
                            notes='Changed max_depth from 4 to 5')
experiment_tracker.add_experiment(exp_xgboost)

######################################## Fold 1 out of 5 ########################################
[0]	validation_0-rmse:4.07402
[29]	validation_0-rmse:2.69766
Fold 1 - best iteration: 20

######################################## Fold 2 out of 5 ########################################
[0]	validation_0-rmse:4.01404
[29]	validation_0-rmse:2.68113
Fold 2 - best iteration: 20

######################################## Fold 3 out of 5 ########################################
[0]	validation_0-rmse:3.79392
[32]	validation_0-rmse:2.48664
Fold 3 - best iteration: 22

######################################## Fold 4 out of 5 ########################################
[0]	validation_0-rmse:3.82639
[39]	validation_0-rmse:2.55573
Fold 4 - best iteration: 30

######################################## Fold 5 out of 5 ########################################
[0]	validation_0-rmse:3.96547
[38]	validation_0-rmse:2.69274
Fold 5 - best iteration: 28

--- New Experiment added! ---
ID#: 6522885024 
Algorithm: XG

In [18]:
predictors = ['temp','rhum','dayofweek', 'holiday','timesofday','wdsp','rainfall_intensity','peak','working_day', 'hour', 'season']

params_xgboost = {'max_depth':6,
                   'seed': 42,
                   'eval_metric': 'rmse'
                   }

dict_scores = kfold_score(params_xgboost, predictors, n_folds=5)
exp_xgboost = et.Experiment('XGBoost (max_depth=6)', predictors=predictors, hyperparameters=params_xgboost, 
                            score=get_metrics_to_Experiment(dict_scores), 
                            notes='Changed max_depth from 5 to 6')
experiment_tracker.add_experiment(exp_xgboost)

######################################## Fold 1 out of 5 ########################################
[0]	validation_0-rmse:4.06753
[22]	validation_0-rmse:2.73384
Fold 1 - best iteration: 12

######################################## Fold 2 out of 5 ########################################
[0]	validation_0-rmse:3.99660
[25]	validation_0-rmse:2.64274
Fold 2 - best iteration: 15

######################################## Fold 3 out of 5 ########################################
[0]	validation_0-rmse:3.77770
[29]	validation_0-rmse:2.48667
Fold 3 - best iteration: 19

######################################## Fold 4 out of 5 ########################################
[0]	validation_0-rmse:3.80956
[23]	validation_0-rmse:2.55278
Fold 4 - best iteration: 13

######################################## Fold 5 out of 5 ########################################
[0]	validation_0-rmse:3.95107
[24]	validation_0-rmse:2.68358
Fold 5 - best iteration: 14

--- New Experiment added! ---
ID#: 6522608560 
Algorithm: XG

In [19]:
experiment_tracker.print_partial_results(filter_metric='rsme')

--- Experiments ---

Model: XGBoost (Baseline)
RSME - Train: 2.4729 - Validation: 2.6320 - Test: None

Model: XGBoost (kfold=5)
RSME - Train: 2.5074 - Validation: 2.6241 - Test: None

Model: XGBoost (max_depth=4)
RSME - Train: 2.4190 - Validation: 2.6204 - Test: None

Model: XGBoost (max_depth=5)
RSME - Train: 2.3208 - Validation: 2.6162 - Test: None

Model: XGBoost (max_depth=6)
RSME - Train: 2.2826 - Validation: 2.6049 - Test: None


In [20]:
predictors = ['temp','rhum','dayofweek', 'holiday','timesofday','wdsp','rainfall_intensity','peak','working_day', 'hour', 'season']

params_xgboost = {'max_depth':7,
                   'seed': 42,
                   'eval_metric': 'rmse'
                   }

dict_scores = kfold_score(params_xgboost, predictors, n_folds=5)
exp_xgboost = et.Experiment('XGBoost (max_depth=7)', predictors=predictors, hyperparameters=params_xgboost, 
                            score=get_metrics_to_Experiment(dict_scores), 
                            notes='Changed max_depth from 6 to 7')
experiment_tracker.add_experiment(exp_xgboost)

######################################## Fold 1 out of 5 ########################################
[0]	validation_0-rmse:4.05646
[21]	validation_0-rmse:2.73397
Fold 1 - best iteration: 12

######################################## Fold 2 out of 5 ########################################
[0]	validation_0-rmse:3.99207
[28]	validation_0-rmse:2.69588
Fold 2 - best iteration: 19

######################################## Fold 3 out of 5 ########################################
[0]	validation_0-rmse:3.76683
[26]	validation_0-rmse:2.47165
Fold 3 - best iteration: 16

######################################## Fold 4 out of 5 ########################################
[0]	validation_0-rmse:3.79529
[16]	validation_0-rmse:2.55774
Fold 4 - best iteration: 7

######################################## Fold 5 out of 5 ########################################
[0]	validation_0-rmse:3.94436
[21]	validation_0-rmse:2.69179
Fold 5 - best iteration: 12

--- New Experiment added! ---
ID#: 4534493872 
Algorithm: XGB

In [21]:
experiment_tracker.print_partial_results(filter_metric='rsme')

--- Experiments ---

Model: XGBoost (Baseline)
RSME - Train: 2.4729 - Validation: 2.6320 - Test: None

Model: XGBoost (kfold=5)
RSME - Train: 2.5074 - Validation: 2.6241 - Test: None

Model: XGBoost (max_depth=4)
RSME - Train: 2.4190 - Validation: 2.6204 - Test: None

Model: XGBoost (max_depth=5)
RSME - Train: 2.3208 - Validation: 2.6162 - Test: None

Model: XGBoost (max_depth=6)
RSME - Train: 2.2826 - Validation: 2.6049 - Test: None

Model: XGBoost (max_depth=7)
RSME - Train: 2.1642 - Validation: 2.6112 - Test: None


In [22]:
predictors = ['temp','rhum','dayofweek', 'holiday','timesofday','wdsp','rainfall_intensity','peak','working_day', 'hour', 'season']

params_xgboost = {'max_depth':6,
                  'eta': 0.2,
                   'seed': 42,
                   'eval_metric': 'rmse'
                   }

dict_scores = kfold_score(params_xgboost, predictors, n_folds=5)
exp_xgboost = et.Experiment('XGBoost (eta=0.2)', predictors=predictors, hyperparameters=params_xgboost, 
                            score=get_metrics_to_Experiment(dict_scores), 
                            notes='Changed eta (learning rate) from default=0.3 to 0.2')
experiment_tracker.add_experiment(exp_xgboost)

######################################## Fold 1 out of 5 ########################################
[0]	validation_0-rmse:4.37038
[32]	validation_0-rmse:2.69679
Fold 1 - best iteration: 22

######################################## Fold 2 out of 5 ########################################
[0]	validation_0-rmse:4.30055
[40]	validation_0-rmse:2.64038
Fold 2 - best iteration: 31

######################################## Fold 3 out of 5 ########################################
[0]	validation_0-rmse:4.07630
[43]	validation_0-rmse:2.45687
Fold 3 - best iteration: 33

######################################## Fold 4 out of 5 ########################################
[0]	validation_0-rmse:4.10665
[39]	validation_0-rmse:2.52149
Fold 4 - best iteration: 30

######################################## Fold 5 out of 5 ########################################
[0]	validation_0-rmse:4.24758
[34]	validation_0-rmse:2.68000
Fold 5 - best iteration: 24

--- New Experiment added! ---
ID#: 6522876448 
Algorithm: XG

In [23]:
experiment_tracker.print_partial_results(filter_metric='rsme')

--- Experiments ---

Model: XGBoost (Baseline)
RSME - Train: 2.4729 - Validation: 2.6320 - Test: None

Model: XGBoost (kfold=5)
RSME - Train: 2.5074 - Validation: 2.6241 - Test: None

Model: XGBoost (max_depth=4)
RSME - Train: 2.4190 - Validation: 2.6204 - Test: None

Model: XGBoost (max_depth=5)
RSME - Train: 2.3208 - Validation: 2.6162 - Test: None

Model: XGBoost (max_depth=6)
RSME - Train: 2.2826 - Validation: 2.6049 - Test: None

Model: XGBoost (max_depth=7)
RSME - Train: 2.1642 - Validation: 2.6112 - Test: None

Model: XGBoost (eta=0.2)
RSME - Train: 2.2344 - Validation: 2.5927 - Test: None


In [24]:
predictors = ['temp','rhum','dayofweek', 'holiday','timesofday','wdsp','rainfall_intensity','peak','working_day', 'hour', 'season']

params_xgboost = {'max_depth':6,
                  'eta': 0.01,
                   'seed': 42,
                   'eval_metric': 'rmse'
                   }

dict_scores = kfold_score(params_xgboost, predictors, n_folds=5)
exp_xgboost = et.Experiment('XGBoost (eta=0.01)', predictors=predictors, hyperparameters=params_xgboost, 
                            score=get_metrics_to_Experiment(dict_scores), 
                            notes='Changed eta from 0.2 to 0.01')
experiment_tracker.add_experiment(exp_xgboost)

######################################## Fold 1 out of 5 ########################################
[0]	validation_0-rmse:4.98728
[50]	validation_0-rmse:3.77322
[99]	validation_0-rmse:3.18691
Fold 1 - best iteration: 99

######################################## Fold 2 out of 5 ########################################
[0]	validation_0-rmse:4.92188
[50]	validation_0-rmse:3.70921
[99]	validation_0-rmse:3.13820
Fold 2 - best iteration: 99

######################################## Fold 3 out of 5 ########################################
[0]	validation_0-rmse:4.68765
[50]	validation_0-rmse:3.47770
[99]	validation_0-rmse:2.91931
Fold 3 - best iteration: 99

######################################## Fold 4 out of 5 ########################################
[0]	validation_0-rmse:4.71550
[50]	validation_0-rmse:3.51716
[99]	validation_0-rmse:2.95104
Fold 4 - best iteration: 99

######################################## Fold 5 out of 5 ########################################
[0]	validation_0-rmse:4.85

In [25]:
predictors = ['temp','rhum','dayofweek', 'holiday','timesofday','wdsp','rainfall_intensity','peak','working_day', 'hour', 'season']

params_xgboost = {'max_depth':6,
                  'eta': 0.2,
                  'n_estimators': 500,
                   'seed': 42,
                   'eval_metric': 'rmse'
                   }

dict_scores = kfold_score(params_xgboost, predictors, n_folds=5)
exp_xgboost = et.Experiment('XGBoost (estimators=500)', predictors=predictors, hyperparameters=params_xgboost, 
                            score=get_metrics_to_Experiment(dict_scores), 
                            notes='Changed estimators to 500')
experiment_tracker.add_experiment(exp_xgboost)

######################################## Fold 1 out of 5 ########################################
[0]	validation_0-rmse:4.37038
[32]	validation_0-rmse:2.69679
Fold 1 - best iteration: 22

######################################## Fold 2 out of 5 ########################################
[0]	validation_0-rmse:4.30055
[41]	validation_0-rmse:2.63982
Fold 2 - best iteration: 31

######################################## Fold 3 out of 5 ########################################
[0]	validation_0-rmse:4.07630
[43]	validation_0-rmse:2.45687
Fold 3 - best iteration: 33

######################################## Fold 4 out of 5 ########################################
[0]	validation_0-rmse:4.10665
[40]	validation_0-rmse:2.52179
Fold 4 - best iteration: 30

######################################## Fold 5 out of 5 ########################################
[0]	validation_0-rmse:4.24758
[34]	validation_0-rmse:2.68000
Fold 5 - best iteration: 24

--- New Experiment added! ---
ID#: 6522915616 
Algorithm: XG

In [26]:
predictors = ['temp','rhum','dayofweek', 'holiday','timesofday','wdsp','rainfall_intensity','peak','working_day', 'hour', 'season']

params_xgboost = {'max_depth':6,
                  'eta': 0.01,
                  'n_estimators': 500,
                   'seed': 42,
                   'eval_metric': 'rmse'
                   }

dict_scores = kfold_score(params_xgboost, predictors, n_folds=5)
exp_xgboost = et.Experiment('XGBoost (eta=0.01)', predictors=predictors, hyperparameters=params_xgboost, 
                            score=get_metrics_to_Experiment(dict_scores), 
                            notes='Changed estimators to 500 and eta = 0.01')
experiment_tracker.add_experiment(exp_xgboost)

######################################## Fold 1 out of 5 ########################################
[0]	validation_0-rmse:4.98728
[50]	validation_0-rmse:3.77322
[100]	validation_0-rmse:3.17917
[150]	validation_0-rmse:2.91723
[200]	validation_0-rmse:2.79758
[250]	validation_0-rmse:2.73205
[300]	validation_0-rmse:2.70833
[350]	validation_0-rmse:2.69354
[400]	validation_0-rmse:2.68430
[450]	validation_0-rmse:2.67978
[492]	validation_0-rmse:2.67822
Fold 1 - best iteration: 483

######################################## Fold 2 out of 5 ########################################
[0]	validation_0-rmse:4.92188
[50]	validation_0-rmse:3.70921
[100]	validation_0-rmse:3.13042
[150]	validation_0-rmse:2.86946
[200]	validation_0-rmse:2.75475
[250]	validation_0-rmse:2.69968
[300]	validation_0-rmse:2.67764
[350]	validation_0-rmse:2.66168
[400]	validation_0-rmse:2.65394
[450]	validation_0-rmse:2.64823
[499]	validation_0-rmse:2.64491
Fold 2 - best iteration: 496

######################################## Fold 

In [27]:
experiment_tracker.print_partial_results()

--- Experiments ---

Model: XGBoost (Baseline)
RSME - Train: 2.4729 - Validation: 2.6320 - Test: None
MAE - Train: 1.8143 - Validation: 1.9269 - Test: None

Model: XGBoost (kfold=5)
RSME - Train: 2.5074 - Validation: 2.6241 - Test: None
MAE - Train: 1.8373 - Validation: 1.9233 - Test: None

Model: XGBoost (max_depth=4)
RSME - Train: 2.4190 - Validation: 2.6204 - Test: None
MAE - Train: 1.7705 - Validation: 1.9116 - Test: None

Model: XGBoost (max_depth=5)
RSME - Train: 2.3208 - Validation: 2.6162 - Test: None
MAE - Train: 1.7007 - Validation: 1.9066 - Test: None

Model: XGBoost (max_depth=6)
RSME - Train: 2.2826 - Validation: 2.6049 - Test: None
MAE - Train: 1.6672 - Validation: 1.8952 - Test: None

Model: XGBoost (max_depth=7)
RSME - Train: 2.1642 - Validation: 2.6112 - Test: None
MAE - Train: 1.5683 - Validation: 1.8919 - Test: None

Model: XGBoost (eta=0.2)
RSME - Train: 2.2344 - Validation: 2.5927 - Test: None
MAE - Train: 1.6327 - Validation: 1.8887 - Test: None

Model: XGBoost (e

In [28]:
predictors = ['temp','rhum','dayofweek', 'holiday','timesofday','wdsp','rainfall_intensity','peak','working_day', 'hour', 'season']

params_xgboost = {'max_depth':6,
                  'eta': 0.01,
                  'n_estimators': 1000,
                   'seed': 42,
                   'eval_metric': 'rmse'
                   }

dict_scores = kfold_score(params_xgboost, predictors, n_folds=5, verbose=250)
exp_xgboost = et.Experiment('XGBoost (n_estimators=1000)', predictors=predictors, hyperparameters=params_xgboost, 
                            score=get_metrics_to_Experiment(dict_scores), 
                            notes='Changed estimators from 500 to 1000')
experiment_tracker.add_experiment(exp_xgboost)

######################################## Fold 1 out of 5 ########################################
[0]	validation_0-rmse:4.98728
[250]	validation_0-rmse:2.73205
[492]	validation_0-rmse:2.67822
Fold 1 - best iteration: 483

######################################## Fold 2 out of 5 ########################################
[0]	validation_0-rmse:4.92188
[250]	validation_0-rmse:2.69968
[500]	validation_0-rmse:2.64488
[520]	validation_0-rmse:2.64454
Fold 2 - best iteration: 511

######################################## Fold 3 out of 5 ########################################
[0]	validation_0-rmse:4.68765
[250]	validation_0-rmse:2.50495
[500]	validation_0-rmse:2.46272
[548]	validation_0-rmse:2.46206
Fold 3 - best iteration: 538

######################################## Fold 4 out of 5 ########################################
[0]	validation_0-rmse:4.71550
[250]	validation_0-rmse:2.55613
[467]	validation_0-rmse:2.52956
Fold 4 - best iteration: 458

######################################## Fold 5 

In [31]:
predictors = ['temp','rhum','dayofweek', 'holiday','timesofday','wdsp','rainfall_intensity','peak','working_day', 'hour', 'season']

params_xgboost = {'max_depth':6,
                  'eta': 0.001,
                  'n_estimators': 1000,
                   'seed': 42,
                   'eval_metric': 'rmse'
                   }

dict_scores = kfold_score(params_xgboost, predictors, n_folds=5, verbose=250)
exp_xgboost = et.Experiment('XGBoost (eta=0.001)', predictors=predictors, hyperparameters=params_xgboost, 
                            score=get_metrics_to_Experiment(dict_scores), 
                            notes='Changed eta from 0.01 to 0.001')
experiment_tracker.add_experiment(exp_xgboost)

######################################## Fold 1 out of 5 ########################################
[0]	validation_0-rmse:5.01759
[250]	validation_0-rmse:4.30190
[500]	validation_0-rmse:3.79253
[750]	validation_0-rmse:3.43328
[999]	validation_0-rmse:3.18996
Fold 1 - best iteration: 999

######################################## Fold 2 out of 5 ########################################
[0]	validation_0-rmse:4.95245
[250]	validation_0-rmse:4.23409
[500]	validation_0-rmse:3.72849
[750]	validation_0-rmse:3.37994
[999]	validation_0-rmse:3.14118
Fold 2 - best iteration: 999

######################################## Fold 3 out of 5 ########################################
[0]	validation_0-rmse:4.71775
[250]	validation_0-rmse:4.00151
[500]	validation_0-rmse:3.49652
[750]	validation_0-rmse:3.15013
[999]	validation_0-rmse:2.92268
Fold 3 - best iteration: 999

######################################## Fold 4 out of 5 ########################################
[0]	validation_0-rmse:4.74549
[250]	validati

In [32]:
predictors = ['temp','rhum','dayofweek', 'holiday','timesofday','wdsp','rainfall_intensity','peak','working_day', 'hour', 'season']

params_xgboost = {'max_depth':6,
                  'eta': 0.001,
                  'n_estimators': 5000,
                   'seed': 42,
                   'eval_metric': 'rmse'
                   }

dict_scores = kfold_score(params_xgboost, predictors, n_folds=5, verbose=250)
exp_xgboost = et.Experiment('XGBoost (n_estimators=5000)', predictors=predictors, hyperparameters=params_xgboost, 
                            score=get_metrics_to_Experiment(dict_scores), 
                            notes='Changed n_estimators from 1000 to 5000')
experiment_tracker.add_experiment(exp_xgboost)

######################################## Fold 1 out of 5 ########################################
[0]	validation_0-rmse:5.01759
[250]	validation_0-rmse:4.30190
[500]	validation_0-rmse:3.79253
[750]	validation_0-rmse:3.43328
[1000]	validation_0-rmse:3.18919
[1250]	validation_0-rmse:3.02957
[1500]	validation_0-rmse:2.92251
[1750]	validation_0-rmse:2.84838
[2000]	validation_0-rmse:2.79885
[2250]	validation_0-rmse:2.75850
[2500]	validation_0-rmse:2.73294
[2750]	validation_0-rmse:2.71592
[3000]	validation_0-rmse:2.70545
[3250]	validation_0-rmse:2.69693
[3500]	validation_0-rmse:2.69320
[3750]	validation_0-rmse:2.68928
[4000]	validation_0-rmse:2.68649
[4250]	validation_0-rmse:2.68398
[4499]	validation_0-rmse:2.68193
Fold 1 - best iteration: 4489

######################################## Fold 2 out of 5 ########################################
[0]	validation_0-rmse:4.95245
[250]	validation_0-rmse:4.23409
[500]	validation_0-rmse:3.72849
[750]	validation_0-rmse:3.37994
[1000]	validation_0-rmse:3

In [34]:
predictors = ['temp','rhum','dayofweek', 'holiday','timesofday','wdsp','rainfall_intensity','peak','working_day', 'hour', 'season']

params_xgboost = {'max_depth':6,
                  'eta': 0.05,
                  'n_estimators': 5000,
                   'seed': 42,
                   'eval_metric': 'rmse'
                   }

dict_scores = kfold_score(params_xgboost, predictors, n_folds=5, verbose=250)
exp_xgboost = et.Experiment('XGBoost (eta=0.05)', predictors=predictors, hyperparameters=params_xgboost, 
                            score=get_metrics_to_Experiment(dict_scores), 
                            notes='Changed eta from 0.001 to 0.05')
experiment_tracker.add_experiment(exp_xgboost)

######################################## Fold 1 out of 5 ########################################
[0]	validation_0-rmse:4.85363
[114]	validation_0-rmse:2.68064
Fold 1 - best iteration: 104

######################################## Fold 2 out of 5 ########################################
[0]	validation_0-rmse:4.78710
[123]	validation_0-rmse:2.64739
Fold 2 - best iteration: 113

######################################## Fold 3 out of 5 ########################################
[0]	validation_0-rmse:4.55499
[172]	validation_0-rmse:2.46062
Fold 3 - best iteration: 163

######################################## Fold 4 out of 5 ########################################
[0]	validation_0-rmse:4.58331
[111]	validation_0-rmse:2.52713
Fold 4 - best iteration: 101

######################################## Fold 5 out of 5 ########################################
[0]	validation_0-rmse:4.72251
[92]	validation_0-rmse:2.66695
Fold 5 - best iteration: 82

--- New Experiment added! ---
ID#: 6522913648 
Algor

In [35]:
predictors = ['temp','rhum','dayofweek', 'holiday','timesofday','wdsp','rainfall_intensity','peak','working_day', 'hour', 'season']

params_xgboost = {'max_depth':6,
                  'eta': 0.01,
                  'n_estimators': 5000,
                   'seed': 42,
                   'eval_metric': 'rmse'
                   }

dict_scores = kfold_score(params_xgboost, predictors, n_folds=5, verbose=250)
exp_xgboost = et.Experiment('XGBoost (eta=0.01)', predictors=predictors, hyperparameters=params_xgboost, 
                            score=get_metrics_to_Experiment(dict_scores), 
                            notes='Changed eta from 0.05 to 0.01')
experiment_tracker.add_experiment(exp_xgboost)

######################################## Fold 1 out of 5 ########################################
[0]	validation_0-rmse:4.98728
[250]	validation_0-rmse:2.73205
[493]	validation_0-rmse:2.67826
Fold 1 - best iteration: 483

######################################## Fold 2 out of 5 ########################################
[0]	validation_0-rmse:4.92188
[250]	validation_0-rmse:2.69968
[500]	validation_0-rmse:2.64488
[520]	validation_0-rmse:2.64454
Fold 2 - best iteration: 511

######################################## Fold 3 out of 5 ########################################
[0]	validation_0-rmse:4.68765
[250]	validation_0-rmse:2.50495
[500]	validation_0-rmse:2.46272
[548]	validation_0-rmse:2.46206
Fold 3 - best iteration: 538

######################################## Fold 4 out of 5 ########################################
[0]	validation_0-rmse:4.71550
[250]	validation_0-rmse:2.55613
[467]	validation_0-rmse:2.52956
Fold 4 - best iteration: 458

######################################## Fold 5 

In [37]:
predictors = ['temp','rhum','dayofweek', 'holiday','timesofday','wdsp','rainfall_intensity','peak','working_day', 'hour', 'season']

params_xgboost = {'max_depth':6,
                  'eta': 0.01,
                  'n_estimators': 5000,
                   'seed': 42,
                   'eval_metric': 'rmse'
                   }

dict_scores = kfold_score(params_xgboost, predictors, n_folds=5, verbose=250, early_stopping_rounds=50)
exp_xgboost = et.Experiment('XGBoost (early_stopping_rounds=50)', predictors=predictors, hyperparameters=params_xgboost, 
                            score=get_metrics_to_Experiment(dict_scores), 
                            notes='Changed early_stopping_rounds from 10 to 50')
experiment_tracker.add_experiment(exp_xgboost)

######################################## Fold 1 out of 5 ########################################
[0]	validation_0-rmse:4.98728
[250]	validation_0-rmse:2.73205
[500]	validation_0-rmse:2.67760
[571]	validation_0-rmse:2.67785
Fold 1 - best iteration: 522

######################################## Fold 2 out of 5 ########################################
[0]	validation_0-rmse:4.92188
[250]	validation_0-rmse:2.69968
[500]	validation_0-rmse:2.64488
[595]	validation_0-rmse:2.64576
Fold 2 - best iteration: 546

######################################## Fold 3 out of 5 ########################################
[0]	validation_0-rmse:4.68765
[250]	validation_0-rmse:2.50495
[500]	validation_0-rmse:2.46272
[588]	validation_0-rmse:2.46238
Fold 3 - best iteration: 538

######################################## Fold 4 out of 5 ########################################
[0]	validation_0-rmse:4.71550
[250]	validation_0-rmse:2.55613
[500]	validation_0-rmse:2.52984
[508]	validation_0-rmse:2.53062
Fold 4 - best 

In [29]:
# experiment_tracker.to_excel('../documentation/experiment_tracker_xgboost.xlsx')
# joblib.dump(pipe_xgboost['model'], '../models/XGBoost.pkl')

In [30]:
%reload_ext watermark
%watermark -a "Leandro Pessini" -n -u -v -iv -w

Author: Leandro Pessini

Last updated: Fri Jun 24 2022

Python implementation: CPython
Python version       : 3.9.6
IPython version      : 7.25.0

xgboost          : 1.4.0
numpy            : 1.21.1
joblib           : 1.0.1
seaborn          : 0.11.1
sklearn          : 1.0.2
pandas           : 1.3.0
matplotlib       : 3.4.2
statsmodels      : 0.12.2
sys              : 3.9.6 | packaged by conda-forge | (default, Jul 11 2021, 03:36:15) 
[Clang 11.1.0 ]
category_encoders: 2.4.0

Watermark: 2.3.0



> https://pierpaolo28.github.io/blog/blog25/#bayesian-optimization

> https://www.kdnuggets.com/2019/07/xgboost-random-forest-bayesian-optimisation.html

> https://www.kaggle.com/code/neerajmohan/randomforest-model-with-bayesian-optimization/notebook

> https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

<img title="GitHub Mark" src="./img/GitHub-Mark-64px.png" style="height: 32px; padding-right: 15px" alt="GitHub Mark" align="left"> [GitHub repository](https://github.com/pessini/moby-bikes) <br>Author: Leandro Pessini