#   <center> Project (bike_rentals) - Amine Dahane + Mostafa Alaywan </center>

In [None]:
## ======================================================================
#            Importing the necessary modules and tools
## ======================================================================

import pandas as pd; import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# Import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingRegressor

# from sklearn.metrics import necessary metrics
from sklearn.metrics import mean_squared_error as MSE

# to use ANOVA test 
import pingouin as pg 


# Set notebook options
# --------------------
# Format displayed floats with a thousands separator and three decimals.
pd.options.display.float_format = '{:,.3f}'.format
# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline


In [None]:
# Load the bike-rental data and take a first look at it
# ------------------------------------------------------
data_br = pd.read_csv(filepath_or_buffer='bike_rentals.csv')

# Structural summary (columns, dtypes, non-null counts) between two rulers.
print(50 * "*")
data_br.info()
print(50 * "*")

# Preview of the first five rows (rendered as the cell output).
data_br.head(5)


In [None]:
# Checking for missing values: number of nulls per column
# --------------------------------------------------------
data_br.isnull().sum()

In [None]:
## ======================================
#     Missing Data Imputation
## ======================================

from sklearn.impute import SimpleImputer

# Forward-fill the calendar columns
# =================================
# A missing 'yr' / 'mnth' takes the value of the closest preceding row.
# `.ffill()` replaces `fillna(method='ffill')`, which recent pandas releases
# deprecate.
# NOTE(review): this presumes the rows are in chronological order -- confirm.
for calendar_col in ['yr', 'mnth']:
    data_br[calendar_col] = data_br[calendar_col].ffill()

# Mean-impute the continuous columns with SimpleImputer from sklearn
# ==================================================================
imp = SimpleImputer(missing_values=np.nan, strategy='mean')

# The imputer is re-fitted for each column, so every column is filled with its
# own mean.  `fit_transform` returns an (n_rows, 1) array; `ravel()` flattens
# it to the 1-D shape a single DataFrame column expects.
for numeric_col in ['windspeed', 'temp', 'atemp', 'hum']:
    data_br[numeric_col] = imp.fit_transform(data_br[[numeric_col]]).ravel()

# Check the missing data again (rendered as the cell output)
# ==========================================================
data_br.isnull().sum()

# Alternative not used here: dropping the incomplete rows with dropna().

In [None]:
# Display the rows at positions 56, 81, 128, 298 and 528 -- a spot check to
# make sure the mean imputation was applied.

data_br.iloc[[56, 81, 128, 298, 528]]


In [None]:
# Convert the 'dteday' column to pandas datetime type
# ===================================================
data_br['dteday']= pd.to_datetime(data_br['dteday'])


In [None]:
# Derive calendar features from 'dteday': each new column is filled from the
# matching attribute of the column's `.dt` accessor.
for new_col, dt_attr in (('year', 'year'), ('month', 'month'), ('DOW', 'dayofweek')):
    data_br[new_col] = getattr(data_br['dteday'].dt, dt_attr)

In [None]:
# Correlation matrix
# ==================
# Authors' observations:
#   * the variables most correlated with 'cnt' are 'temp', 'atemp' and 'casual'
#   * 'temp' and 'atemp' are highly correlated with each other, so one of the
#     two is redundant
corr_data = data_br.loc[:, ['cnt', 'temp', 'atemp', 'casual']]
correlation_matrix = corr_data.corr()

# Correlation heatmap, with the coefficient written in every cell
# ===============================================================
sns.heatmap(data=correlation_matrix, annot=True)

In [None]:
# ANOVA of 'cnt' grouped by 'season' (detailed output table)
# ==========================================================
season = pg.anova(data=data_br, dv='cnt', between='season', detailed=True)
season

In [None]:
# ANOVA of 'cnt' grouped by 'weathersit' (detailed output table)
# ==============================================================
weathersit = pg.anova(data=data_br, dv='cnt', between='weathersit', detailed=True)
weathersit

In [None]:
# Show the column labels currently present in the DataFrame.
data_br.columns

In [None]:
# ANOVA of 'cnt' grouped by 'year' (detailed output table)
# ========================================================
year = pg.anova(data=data_br, dv='cnt', between='year', detailed=True)
year

In [None]:
# ANOVA of 'cnt' grouped by 'month' (detailed output table)
# =========================================================
month = pg.anova(data=data_br, dv='cnt', between='month', detailed=True)
month

In [None]:
# Features (independent variables) used to predict 'cnt'
# ======================================================
# NOTE(review): if 'cnt' is computed from 'casual' (e.g. a total that includes
# the casual rentals), using 'casual' as a predictor leaks the target -- confirm.
ind_variables_selected = [
    'season',
    'weathersit',
    'temp',
    'casual',
    'year',
    'month',
]


In [None]:
# Train / test split
# ==================

# Feature matrix (the selected columns) and target vector ('cnt').
X = data_br.loc[:, ind_variables_selected]
y = data_br['cnt']

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=3
)

In [None]:
## ================================================================
#      Baseline Gradient Boosting Regressor
## ================================================================

from sklearn.metrics import mean_squared_error as MSE

# Baseline model: 100 boosting stages, trees limited to depth 4, fixed seed.
gb = GradientBoostingRegressor(random_state=1, n_estimators=100, max_depth=4)

# Learn from the training split, then predict the held-out test rows.
gb.fit(X_train, y_train)
y_pred = gb.predict(X_test)

# RMSE is the square root of the mean squared error on the test split.
mse_test = MSE(y_test, y_pred)
rmse_test = mse_test ** 0.5

# Report the test RMSE with three decimals.
print('Test set RMSE of gb: {:.3f}'.format(rmse_test))

In [None]:

# List the default hyperparameters of GradientBoostingRegressor.
# A separate, unfitted instance is used so that the model already trained and
# stored in `gb` is not overwritten by this inspection cell.
default_gb = GradientBoostingRegressor()
print("="*40)
for key, value in default_gb.get_params().items():
    # Parameter name padded to 30 characters, followed by its default value.
    print("{:30}{}".format(key, value))
print("="*40)

In [None]:
## =========================================================
#       Gradient Boosting Regressor Hyperparameter Tuning
#       using GridSearchCV
## =========================================================

# Import GridSearchCV
# -------------------
from sklearn.model_selection import GridSearchCV

# Create a dictionary of the grid
# -------------------------------
# Size of the search: 7 * 9 * 4 * 3 * 7 = 5,292 candidates; with cv=6 that is
# 31,752 model fits, so expect a long run.
gbm_grid = {
    "n_estimators": [100, 150, 200, 250, 300, 500, 1000],
    "learning_rate": [0.01, 0.05, 0.1, 0.3, 0.5, 0.7, 0.8, 0.9, 1],
    "max_depth": [3, 5, 8, 10],
    # None (= use all features) replaces "auto": for GradientBoostingRegressor
    # "auto" meant max_features=n_features, and it was deprecated in
    # scikit-learn 1.1 and is rejected by later releases.
    "max_features": ["log2", "sqrt", None],
    "min_samples_leaf": [2, 10, 30, 40, 50, 70, 100],
}

# Instantiate the grid search CV
# ------------------------------
# No `scoring` is passed, so candidates are ranked with the estimator's own
# `score` method (R^2 for regressors), not with RMSE.
grid_gb = GridSearchCV(
    estimator=GradientBoostingRegressor(random_state=1),
    param_grid=gbm_grid,
    cv=6,        # 6-fold cross-validation
    n_jobs=8,    # run up to 8 fits in parallel
    verbose=1,
)

In [None]:
# Fit the grid search object to the training data
# -----------------------------------------------
grid_gb.fit(X_train, y_train)

In [None]:
# Report the winning hyperparameter combination
# ---------------------------------------------
print("*"*40)
for param_name, param_value in grid_gb.best_params_.items():
    # Name padded to 18 characters so that the values line up in one column.
    print("{0:18}: {1}".format(param_name, param_value))
print("*"*40)

# Report the score achieved by that combination
# ---------------------------------------------
print('The best achieved score: {:.5f}'.format(grid_gb.best_score_))
print("*"*40)

In [None]:
# Train and evaluate the tuned regressor
# --------------------------------------
from sklearn.metrics import mean_squared_error as MSE

# NOTE(review): the hyperparameters below are hard-coded, presumably copied
# from the grid-search output -- confirm they match grid_gb.best_params_.
best_reg = GradientBoostingRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=5,
    min_samples_leaf=10,
    max_features='log2',
    random_state=1,
)

# Fit on the training split, then predict the held-out test rows.
best_reg.fit(X_train, y_train)
best_preds = best_reg.predict(X_test)

# RMSE is the square root of the mean squared error on the test split.
mse_best = MSE(y_test, best_preds)
rmse_best = mse_best ** 0.5

# Report the test RMSE of the tuned model with three decimals.
print('Test set RMSE of gb: {:.3f}'.format(rmse_best))
