<img title="GitHub Octocat" src='./img/Octocat.jpg' style='height: 60px; padding-right: 15px' alt="Octocat" align="left"> This notebook is part of a GitHub repository: https://github.com/pessini/moby-bikes 
<br>MIT Licensed
<br>Author: Leandro Pessini

In [262]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import datetime
import sys
import os
import joblib

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Models & Evaluation
from sklearn.model_selection import KFold

# statsmodel
import statsmodels.api as sm
import statsmodels.tsa.api as smt
import statsmodels.formula.api as smf
import statsmodels.stats as stats

# Boost models
import xgboost as xgb
from sklearn import metrics

# Hyperparameter optimization
from bayes_opt import BayesianOptimization

# Custom objects
sys.path.insert(0, os.path.abspath('../src/'))
import experiment_tracker as et

import time
import warnings
warnings.simplefilter('ignore', FutureWarning)
from statsmodels.tools.sm_exceptions import ConvergenceWarning
warnings.simplefilter('ignore', ConvergenceWarning)

In [263]:
# ideas_df = pd.read_excel('../documentation/experiment_tracker.xlsx', sheet_name='Ideas')
# experiments_df = pd.read_excel('../documentation/experiment_tracker.xlsx', sheet_name='Experiments')

In [264]:
# Create the tracker object (from the project-local `experiment_tracker` module,
# imported as `et`) that is used to keep track of the experiments in this notebook.
experiment_tracker = et.ExperimentTracker()

In [265]:
# Read the processed train and test sets from disk (paths are relative to the notebook).
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/processed/df_train.csv', '../data/processed/df_test.csv')
)

In [266]:
# Work on a copy so that `df_train` itself keeps its `count` column.
df = df_train.copy()

# Features are every column except the target; `pop` then removes the target from `df`.
X = df.drop(columns=['count'])
y = df.pop('count')
all_columns = X.columns.tolist()

# 70 / 30 train-validation split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_train.shape, X_val.shape

((6132, 22), (2628, 22))

In [267]:
# Same feature / target separation as for the training data, applied to a copy of `df_test`.
test_df = df_test.copy()
X_test = test_df.drop(columns=['count'])
y_test = test_df.pop('count')
X_test.shape

(1464, 22)

In [268]:
def evaluate_model(model, predictors, X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val):
    """Fit ``model`` on the training data and score it on training and validation data.

    Parameters
    ----------
    model
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in place.
    predictors
        Collection of column names to keep. Columns are selected in the order
        they appear in the data frames, not in the order given here.
    X_train, y_train
        Training features and target. The defaults are the notebook-level
        objects that existed when this cell was executed.
    X_val, y_val
        Validation features and target, with the same default behaviour.

    Returns
    -------
    tuple
        ``(train_rmse, val_rmse, train_mae, val_mae)``.
    """
    # Restrict both frames to the requested predictors.
    X_train = X_train[[c for c in X_train.columns if c in predictors]]
    X_val = X_val[[c for c in X_val.columns if c in predictors]]
    model.fit(X_train, y_train)

    # Predict on training and validation set
    y_pred_train = model.predict(X_train)
    y_pred_val = model.predict(X_val)

    # RMSE is computed as sqrt(MSE) instead of `mean_squared_error(..., squared=False)`:
    # the `squared` argument was deprecated in scikit-learn 1.4 and removed in 1.6,
    # while the square root of the MSE gives the same value on every version.
    train_rsme = np.sqrt(metrics.mean_squared_error(y_train, y_pred_train))
    val_rsme = np.sqrt(metrics.mean_squared_error(y_val, y_pred_val))
    train_mae = metrics.mean_absolute_error(y_train, y_pred_train)
    val_mae = metrics.mean_absolute_error(y_val, y_pred_val)

    return train_rsme, val_rsme, train_mae, val_mae

In [269]:
def get_metrics_to_Experiment(train_rsme=None, val_rsme=None, train_mae=None, val_mae=None) -> list:
    """Wrap train/validation RMSE and MAE values into ``et.Score`` objects.

    Parameters
    ----------
    train_rsme, val_rsme, train_mae, val_mae
        Numeric scores, formatted with four decimals before being handed to
        ``et.Score``. Any value left as ``None`` is looked up as a notebook-level
        variable of the same name, which keeps the former zero-argument call
        working when those globals exist.

    Returns
    -------
    list
        ``[rsme_score, mae_score]``.

    Raises
    ------
    NameError
        If a score is neither passed in nor defined at notebook level.
    """
    def _resolve(name, value):
        # Prefer the explicit argument; fall back to the legacy global of the same name.
        if value is not None:
            return value
        try:
            return globals()[name]
        except KeyError:
            raise NameError(
                f"name '{name}' is not defined; pass it to get_metrics_to_Experiment()"
            ) from None

    train_rsme = _resolve('train_rsme', train_rsme)
    val_rsme = _resolve('val_rsme', val_rsme)
    train_mae = _resolve('train_mae', train_mae)
    val_mae = _resolve('val_mae', val_mae)

    rsme = et.Score('RSME', '{:.4f}'.format(train_rsme), '{:.4f}'.format(val_rsme))
    mae = et.Score('MAE', '{:.4f}'.format(train_mae), '{:.4f}'.format(val_mae))
    return [rsme, mae]

In [270]:
import category_encoders as ce

def preprocessor(predictors: list) -> ColumnTransformer:
    """Build the ``ColumnTransformer`` that encodes and scales the chosen predictors.

    Every name in ``predictors`` is routed according to the fixed lists below:

    * categorical columns -> one-hot encoding (dense output, unknown categories ignored)
    * ordinal columns     -> ``category_encoders.OrdinalEncoder`` with explicit mappings
    * numerical columns   -> ``StandardScaler``

    A group is only registered when at least one of its columns was requested.
    The transformer uses ``remainder='drop'``, so any predictor that is not in
    one of the three lists is dropped from the output rather than passed through.

    Parameters
    ----------
    predictors
        Column names the model should use.

    Returns
    -------
    ColumnTransformer
        Unfitted transformer with the groups in the order: 'cat', 'ordinal', 'num'.
    """
    ##################### Categorical variables #####################
    all_cat_vars = ['timesofday', 'dayofweek', 'holiday', 'peak', 'hour', 'working_day', 'season', 'month']
    cat_vars = [var for var in all_cat_vars if var in predictors]

    # `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and the old
    # name was removed in 1.4; try the new keyword first and fall back for
    # older installations so the notebook runs on either.
    try:
        one_hot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        one_hot = OneHotEncoder(handle_unknown='ignore', sparse=False)
    cat_pipe = Pipeline([('encoder', one_hot)])

    ##################### Numerical variables #####################
    all_num_vars = ['rain', 'temp', 'rhum', 'wdsp', 'temp_r']
    num_vars = [var for var in all_num_vars if var in predictors]
    num_pipe = Pipeline([('scaler', StandardScaler())])

    ##################### Ordinal variables #####################
    # Explicit level -> rank mappings, keyed by column name.
    ordinal_levels = {
        'wind_speed_group': {
            'Calm / Light Breeze': 0,
            'Breeze': 1,
            'Moderate Breeze': 2,
            'Strong Breeze / Near Gale': 3,
            'Gale / Storm': 4,
        },
        'rainfall_intensity': {
            'no rain': 0,
            'drizzle': 1,
            'light rain': 2,
            'moderate rain': 3,
            'heavy rain': 4,
        },
    }
    ord_vars = [var for var in ordinal_levels if var in predictors]
    ordinal_cols_mapping = [{"col": var, "mapping": ordinal_levels[var]} for var in ord_vars]

    ##################### Assemble #####################
    transformers_list = []
    if cat_vars:
        transformers_list.append(('cat', cat_pipe, cat_vars))
    if ord_vars:
        ord_pipe = Pipeline([('ordinal', ce.OrdinalEncoder(mapping=ordinal_cols_mapping))])
        transformers_list.append(('ordinal', ord_pipe, ord_vars))
    if num_vars:
        transformers_list.append(('num', num_pipe, num_vars))

    return ColumnTransformer(transformers=transformers_list,
                             remainder='drop')

In [271]:
def summarize_dict(dictionary, function):
    """Return a new dict with the same keys, each value replaced by ``function(value)``."""
    summary = {}
    for key, values in dictionary.items():
        summary[key] = function(values)
    return summary

In [272]:
def kfold_score(model, predictors, X=X_train, y=y_train, n_folds=5):
    """Cross-validate ``model`` with shuffled K-fold splits and return mean scores.

    For every fold the model is refitted through ``evaluate_model`` on the
    in-fold rows and scored on the held-out rows; a progress banner is printed
    per fold. The per-fold values are averaged with ``np.mean``.

    Returns a dict with the keys ``train_rsme``, ``val_rsme``, ``train_mae``
    and ``val_mae``.
    """
    splitter = KFold(n_splits=n_folds, shuffle=True, random_state=2022)
    metric_names = ("train_rsme", "val_rsme", "train_mae", "val_mae")
    scores = {name: [] for name in metric_names}
    banner = '#' * 40

    for fold_number, (fit_idx, holdout_idx) in enumerate(splitter.split(X, y), start=1):
        print(banner, f'Fold {fold_number} out of {splitter.n_splits}', banner)

        fold_scores = evaluate_model(
            model,
            predictors,
            X.iloc[fit_idx],
            y.iloc[fit_idx],
            X.iloc[holdout_idx],
            y.iloc[holdout_idx],
        )
        # evaluate_model returns its values in the same order as metric_names.
        for name, value in zip(metric_names, fold_scores):
            scores[name].append(value)

    return summarize_dict(scores, np.mean)

> https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

In [273]:
%%time

# idea_xgboost = et.Idea(idea='XGBoost', potential_outcome="Baseline XGBoost with all potential features had an excellent train score.")
# experiment_tracker.new_idea(idea_xgboost)

predictors = ['temp','rhum','dayofweek', 'holiday','timesofday','wdsp','rainfall_intensity','peak','working_day', 'hour', 'season']

params_xgboost = {'max_depth':10,
                   'seed': 42,
                   'eval_metric': ['rmse','mae'],
                #    'model__eval_set': [(X_train, y_train), (X_val, y_val)]
                   'verbosity': 0
                   }

pipe_xgboost = Pipeline([
    ('preprocessor', preprocessor(predictors)),
    ('model', xgb.XGBRegressor(**params_xgboost))
])

# pipe_xgboost.set_params(model__eval_set=[(X_train, y_train), (X_val, y_val)])

print(kfold_score(pipe_xgboost, predictors))
# exp_xgboost = et.Experiment('XGBoost', predictors=predictors, hyperparameters=pipe_xgboost['model'].get_params(),
#                                score=get_metrics_to_Experiment(), notes='Baseline XGBoost')
# experiment_tracker.add_experiment(exp_xgboost)

######################################## Fold 1 out of 5 ########################################
######################################## Fold 2 out of 5 ########################################
######################################## Fold 3 out of 5 ########################################
######################################## Fold 4 out of 5 ########################################
######################################## Fold 5 out of 5 ########################################
{'train_rsme': 0.22525266046799786, 'val_rsme': 2.9017208957890417, 'train_mae': 0.150431776293122, 'val_mae': 2.0966413273433795}
CPU times: user 36.9 s, sys: 1.12 s, total: 38.1 s
Wall time: 14.4 s


> https://pierpaolo28.github.io/blog/blog25/#bayesian-optimization

> https://www.kdnuggets.com/2019/07/xgboost-random-forest-bayesian-optimisation.html

> https://www.kaggle.com/code/neerajmohan/randomforest-model-with-bayesian-optimization/notebook

In [274]:
# learnings = \
# """XGBoost with almost all default hyperparameters to use as a baseline."""
# experiment_tracker.update_idea(idea_xgboost, learnings=str.strip(learnings))

In [275]:
# experiment_tracker.to_excel('../documentation/experiment_tracker_xgboost.xlsx')

In [276]:
# joblib.dump(pipe_xgboost['model'], '../models/XGBoost.pkl')

In [277]:
# Print a reproducibility stamp for this run: author, last-updated date,
# Python / IPython versions, versions of the imported packages and the
# watermark version itself (needs the `watermark` IPython extension).
%reload_ext watermark
%watermark -a "Leandro Pessini" -n -u -v -iv -w

Author: Leandro Pessini

Last updated: Wed Jun 15 2022

Python implementation: CPython
Python version       : 3.9.6
IPython version      : 8.3.0

matplotlib       : 3.4.2
seaborn          : 0.11.1
xgboost          : 1.4.0
statsmodels      : 0.13.2
pandas           : 1.3.0
joblib           : 1.0.1
sklearn          : 1.0.2
sys              : 3.9.6 | packaged by conda-forge | (default, Jul 11 2021, 03:36:15) 
[Clang 11.1.0 ]
category_encoders: 2.4.0
numpy            : 1.21.1

Watermark: 2.3.0



<img title="GitHub Mark" src="./img/GitHub-Mark-64px.png" style="height: 32px; padding-right: 15px" alt="GitHub Mark" align="left"> [GitHub repository](https://github.com/pessini/moby-bikes) <br>Author: Leandro Pessini