<img title="GitHub Octocat" src='./img/Octocat.jpg' style='height: 60px; padding-right: 15px' alt="Octocat" align="left"> This notebook is part of a GitHub repository: https://github.com/pessini/moby-bikes 
<br>MIT Licensed
<br>Author: Leandro Pessini

In [32]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import joblib
import pickle

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# metrics
from sklearn import metrics

# Boost models
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor

import time
import warnings
warnings.simplefilter('ignore', FutureWarning)
from statsmodels.tools.sm_exceptions import ConvergenceWarning
warnings.simplefilter('ignore', ConvergenceWarning)

In [33]:
# Load the processed test split from disk and preview its first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a
# row index that was written into the CSV — confirm whether it is wanted.
df_test = pd.read_csv('../data/processed/df_test.csv')
df_test.head()

Unnamed: 0,rain,temp,rhum,wdsp,date,hour,day,month,year,count,...,working_day,season,peak,timesofday,rainfall_intensity,wind_bft,wind_speed_group,temp_r,temp_bin,rhum_bin
0,0.0,-0.2,96,6,2022-03-01,0,1,3,2022,0,...,True,Winter,False,Night,no rain,2,Calm / Light Breeze,0,0.0,4.0
1,0.0,-1.8,93,4,2022-03-01,1,1,3,2022,1,...,True,Winter,False,Night,no rain,2,Calm / Light Breeze,-2,0.0,4.0
2,0.0,-1.5,93,5,2022-03-01,2,1,3,2022,0,...,True,Winter,False,Night,no rain,2,Calm / Light Breeze,-2,0.0,4.0
3,0.0,-2.6,93,5,2022-03-01,3,1,3,2022,3,...,True,Winter,False,Night,no rain,2,Calm / Light Breeze,-3,0.0,4.0
4,0.0,-2.5,93,3,2022-03-01,4,1,3,2022,7,...,True,Winter,False,Night,no rain,2,Calm / Light Breeze,-3,0.0,4.0


In [34]:
# Work on a copy so the frame loaded from disk (df_test) is left untouched.
df = df_test.copy()

# Features: every column except the target.
X = df.drop(columns=['count'])

# Target: pop() also removes 'count' from df itself.
y = df.pop('count')

all_columns = X.columns.tolist()
X.shape

(1464, 22)

In [35]:
import category_encoders as ce

def preprocessor(predictors: list) -> ColumnTransformer:
    """
    Build the ColumnTransformer that encodes/scales the requested predictors.

    Every name in ``predictors`` is routed by membership in one of three
    hard-coded groups:

    * categorical -> ``OneHotEncoder(handle_unknown='ignore', sparse=False)``
    * ordinal     -> ``category_encoders.OrdinalEncoder`` with explicit mappings
    * numerical   -> ``StandardScaler``

    A group is added to the transformer only when at least one of its columns
    was requested. The transformer is created with ``remainder='drop'``, so any
    column that is not in one of the groups (for example 'temp_bin' or
    'rhum_bin') is dropped from the output, even if it is listed in
    ``predictors``.

    Parameters
    ----------
    predictors : list
        Column names the model should use.

    Returns
    -------
    ColumnTransformer
        Unfitted transformer with up to three entries, in this order:
        'cat', 'ordinal', 'num'.
    """
    ##################### Categorical variables #####################
    all_cat_vars = ['timesofday', 'dayofweek', 'holiday', 'peak', 'hour',
                    'working_day', 'season', 'month']
    cat_vars = [name for name in all_cat_vars if name in predictors]

    ##################### Numerical variables #####################
    all_num_vars = ['rain', 'temp', 'rhum', 'wdsp', 'temp_r']
    num_vars = [name for name in all_num_vars if name in predictors]

    ##################### Ordinal variables #####################
    # Explicit label -> rank tables; dict order fixes the column order.
    ordinal_levels = {
        'wind_speed_group': {
            'Calm / Light Breeze': 0,
            'Breeze': 1,
            'Moderate Breeze': 2,
            'Strong Breeze / Near Gale': 3,
            'Gale / Storm': 4
        },
        'rainfall_intensity': {
            'no rain': 0,
            'drizzle': 1,
            'light rain': 2,
            'moderate rain': 3,
            'heavy rain': 4
        },
    }
    ord_vars = [name for name in ordinal_levels if name in predictors]
    ordinal_cols_mapping = [
        {"col": name, "mapping": ordinal_levels[name]} for name in ord_vars
    ]

    #################################################################################
    # Assemble only the groups that actually have columns.
    transformers_list = []

    if cat_vars:
        # NOTE(review): `sparse` was renamed in newer scikit-learn releases —
        # confirm against the installed version before upgrading.
        cat_pipe = Pipeline([
            ('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))
        ])
        transformers_list.append(('cat', cat_pipe, cat_vars))

    if ord_vars:
        ord_pipe = Pipeline([
            ('ordinal', ce.OrdinalEncoder(mapping=ordinal_cols_mapping))
        ])
        transformers_list.append(('ordinal', ord_pipe, ord_vars))

    if num_vars:
        num_pipe = Pipeline([
            ('scaler', StandardScaler())
        ])
        transformers_list.append(('num', num_pipe, num_vars))

    # remainder='drop': columns not claimed by a group above are discarded.
    return ColumnTransformer(transformers=transformers_list,
                             remainder='drop')

## Loading models

In [36]:
# Load the serialized scikit-learn pipeline from disk with joblib.
# NOTE(review): the same file is loaded again further down with pickle into
# `xgb_pipe` — confirm whether both objects are needed.
pipeline_xgboost = joblib.load('../models/xgb_pipeline.pkl')

In [37]:
# Display the loaded pipeline's repr to inspect its steps.
pipeline_xgboost

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat',
                                                  Pipeline(steps=[('encoder',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  ['timesofday', 'dayofweek',
                                                   'hour', 'working_day',
                                                   'season']),
                                                 ('ordinal',
                                                  Pipeline(steps=[('ordinal',
                                                                   OrdinalEncoder(mapping=[{'col': 'rainfall_intensity',
                                                                                            'mapping': {'drizzle': 1,
                                  

In [38]:
# Create an empty XGBRegressor, then restore the saved model from its JSON file.
xgb_model = xgb.XGBRegressor()
xgb_model.load_model("../models/XGBoost.json")

In [39]:
# Display the restored model's parameters.
# NOTE(review): the repr here reports a different learning_rate than the
# model stored inside the pickled pipeline — confirm which one is intended.
xgb_model

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.5, eta=0.01,
             eval_metric='rmse', gamma=1.5, gpu_id=-1, importance_type='gain',
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=7, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=1000, n_jobs=12,
             num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
             scale_pos_weight=1, seed=42, subsample=0.7, tree_method='auto',
             validate_parameters=1, verbosity=None)

In [40]:
# Load the fitted pipeline; the context manager closes the file handle
# as soon as unpickling finishes.
# NOTE: pickle can execute arbitrary code while loading — only open model
# files that come from a trusted source.
# NOTE(review): this is the same file already loaded with joblib into
# `pipeline_xgboost` — confirm whether both objects are needed.
with open("../models/xgb_pipeline.pkl", "rb") as pipeline_file:
    xgb_pipe = pickle.load(pipeline_file)

In [41]:
# predictors = ['temp', 'rhum', 'dayofweek', 'timesofday', 'wdsp', 'rainfall_intensity', 'working_day', 'hour', 'season']
# pipe_xgboost = Pipeline([
#         ('preprocessor', preprocessor(predictors))
#     ])

# transformed_feats = pipe_xgboost.fit_transform(X)

In [48]:
# Look up the pipeline step named 'model' and display it.
xgb_pipe['model']

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.5, eta=0.01,
             eval_metric='rmse', gamma=1.5, gpu_id=-1, importance_type='gain',
             interaction_constraints='', learning_rate=0.00999999978,
             max_delta_step=0, max_depth=7, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=1000, n_jobs=12,
             num_parallel_tree=1, random_state=42, reg_alpha=0, reg_lambda=1,
             scale_pos_weight=1, seed=42, subsample=0.7, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [53]:
# Run the full pipeline on the test features. The predictions carry X's index
# so they stay row-aligned with `y` when both are combined into one DataFrame.
predicted = pd.Series(xgb_pipe.predict(X), index=X.index)

In [60]:
def round_up(x):
    '''
    Helper function to round to the nearest integer, with ties (x.5)
    rounded away from zero.

    The fractional part is compared against 0.5 directly instead of adding
    0.5 to x: the addition can itself be rounded by floating-point
    arithmetic (e.g. 0.49999999999999994 + 0.5 == 1.0), which would push
    values just below a tie, or large odd integers, to the wrong result.

    Parameters:
        x: a real number (Python or NumPy scalar).

    Returns:
        int: x rounded half away from zero.

    Raises:
        ValueError: if x is NaN.
        OverflowError: if x is infinite.
    '''
    from math import floor

    value = float(x)
    magnitude = abs(value)
    whole = floor(magnitude)
    # This subtraction is exact for floats, so the tie test is reliable.
    if magnitude - whole >= 0.5:
        whole += 1
    return -whole if value < 0 else whole

In [57]:
# Put the observed counts and the model output side by side, one row per sample.
actual_predicted = dict(Actual=y, Predicted=predicted)
new_df = pd.DataFrame(data=actual_predicted)

In [64]:
# 'Rounded' uses Python's built-in round() (ties go to the nearest even integer);
# 'Rounded_up' uses round_up() (ties go away from zero), so each column name
# matches the function that produced it.
new_df['Rounded'] = new_df['Predicted'].apply(round)
new_df['Rounded_up'] = new_df['Predicted'].apply(round_up)

In [65]:
# Preview the first rows of the actual-vs-predicted comparison.
new_df.head()

Unnamed: 0,Actual,Predicted,Rounded,Rounded_up
0,0,1.083633,1,1
1,1,0.794334,1,1
2,0,0.674126,1,1
3,3,0.932812,1,1
4,7,2.231707,2,2


In [66]:
# Summary statistics for the actual counts and each prediction column.
new_df.describe()

Unnamed: 0,Actual,Predicted,Rounded,Rounded_up
count,1464.0,1464.0,1464.0,1464.0
mean,3.48224,3.815912,3.79918,3.79918
std,2.886973,2.63414,2.662742,2.662742
min,0.0,-0.032134,0.0,0.0
25%,1.0,1.409816,1.0,1.0
50%,3.0,3.536098,4.0,4.0
75%,5.0,5.924039,6.0,6.0
max,19.0,12.393959,12.0,12.0


In [56]:
# Check the container types of the predictions and the target.
type(predicted), type(y)

(pandas.core.series.Series, pandas.core.series.Series)

In [47]:
# Display the predictions.
# NOTE(review): the saved output of this cell is a NumPy array, while
# `predicted` is assigned as a pandas Series in a cell with a later execution
# count — the output looks stale; re-run the notebook top to bottom.
predicted

array([1.0836332 , 0.79433376, 0.6741258 , ..., 2.3571663 , 1.5654141 ,
       1.6707557 ], dtype=float32)

<img title="GitHub Mark" src="./img/GitHub-Mark-64px.png" style="height: 32px; padding-right: 15px" alt="GitHub Mark" align="left"> [GitHub repository](https://github.com/pessini/moby-bikes) <br>Author: Leandro Pessini