In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
import joblib

In [20]:
# Load the Xset
df = pd.read_csv("../../data/raw/daily-bike-share.csv")

In [21]:
# Define the features and the target
X = df.drop(['rentals', 'dteday'],axis=1)
y = df['rentals']

In [22]:
# Feature engineer
## Convert columns to categorical type
cat_cols = ["season","yr", "mnth", "holiday", "weekday", "workingday", "weathersit"]
for col in cat_cols:
    X[col] = X[col].astype('category')

In [23]:
## Create dummies for categorical features - encode
dummies_cols = ["season","yr", "mnth", "holiday", "weekday", "workingday", "weathersit"]
for col in dummies_cols:
    dummies = pd.get_dummies(X[col], prefix=col, drop_first=False)
    X = pd.concat([X, dummies], axis=1)

In [25]:
pd.set_option('display.max_columns', None)

In [26]:
 X

Unnamed: 0,instant,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,season_1,season_2,season_3,season_4,yr_0,yr_1,mnth_1,mnth_2,mnth_3,mnth_4,mnth_5,mnth_6,mnth_7,mnth_8,mnth_9,mnth_10,mnth_11,mnth_12,holiday_0,holiday_1,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,workingday_0,workingday_1,weathersit_1,weathersit_2,weathersit_3
0,1,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,1,0
1,2,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,1,0
2,3,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,1,0,0
3,4,1,0,1,0,2,1,1,0.200000,0.212122,0.590435,0.160296,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,1,0,0
4,5,1,0,1,0,3,1,1,0.226957,0.229270,0.436957,0.186900,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
726,727,1,1,12,0,4,1,2,0.254167,0.226642,0.652917,0.350133,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,1,0,1,0
727,728,1,1,12,0,5,1,2,0.253333,0.255046,0.590000,0.155471,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,1,0,1,0
728,729,1,1,12,0,6,0,2,0.253333,0.242400,0.752917,0.124383,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,1,0,0,1,0
729,730,1,1,12,0,0,0,1,0.255833,0.231700,0.483333,0.350754,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,1,0,1,0,0


In [7]:
# Split the X into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Use the best parameters from the grid search
params = {'learning_rate': 0.1, 'max_depth': 4, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}

In [9]:
# Create the gradient boosting model with the best parameters
model = GradientBoostingRegressor(**params)

In [10]:
# Fit the model to the training data
model.fit(X_train, y_train)

GradientBoostingRegressor(max_depth=4, max_features='sqrt', min_samples_leaf=2,
                          min_samples_split=5)

In [11]:
joblib.dump(model, "../../models/trained_model2.pkl")

['../../models/trained_model2.pkl']

In [12]:
# Make predictions for the test data
y_pred = model.predict(X_test)

In [27]:
y_pred

array([ 6.66748122e+02,  4.08973049e+01,  2.74904975e+02,  1.63767943e+03,
        9.86872384e+02,  1.07072660e+03,  2.91783627e+01,  3.25291897e+02,
        1.28868498e+03,  1.10731114e+03,  7.80764788e+01,  2.32962420e+02,
        7.33710591e+02,  3.12119795e+02,  2.33300951e+02,  6.94436221e+02,
        2.24646561e+02,  1.38027863e+03,  1.04719109e+03,  3.23109853e+02,
        1.13381324e+03,  8.53897852e+02,  9.09935918e+02,  1.63644358e+02,
        5.39112645e+01,  7.18714922e+02,  2.12404850e+02,  1.02283063e+03,
        1.56131540e+02,  5.07871610e+02,  5.05758379e+02,  7.42301633e+02,
        1.01713441e+03,  1.09588689e+03,  9.82988029e+02,  2.37748720e+03,
        1.37105661e+03,  2.04029375e+03,  2.03263989e+02,  2.49152422e+01,
        6.82476592e+02,  2.24677567e+02,  1.10426531e+03,  1.63085750e+03,
        1.07090622e+03, -3.63002298e+01,  8.18468821e+02,  7.46364113e+02,
        2.02604500e+03,  1.76281039e+03,  1.64032035e+03,  4.86465568e+02,
        5.80258544e+02,  

In [13]:
# Calculate the R-squared and mean squared error scores
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

In [14]:
# Print the scores
print("R-squared score: {:.3f}".format(r2))
print("MSE score: {:.3f}".format(mse))

R-squared score: 0.844
MSE score: 60618.036
