# 1. Importing dependencies

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

# 2. Loading the Dataset

In [3]:
# The path is relative to the notebook's working directory.
csv_path = 'data/SeoulBikeData.csv'
df = pd.read_csv(csv_path, encoding='unicode_escape')
df.head()

Unnamed: 0,Date,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(°C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day
0,01/12/2017,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
1,01/12/2017,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
2,01/12/2017,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,Winter,No Holiday,Yes
3,01/12/2017,107,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
4,01/12/2017,78,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,Winter,No Holiday,Yes


# 3. Feature Engineering

## 3.1. Splitting the Date column

In [4]:
# Derive 'Month', 'Day' and 'Day of the week' from the 'Date' column,
# whose values are dd/mm/YYYY strings (e.g. '01/12/2017').
import datetime as dt  # no longer used in this cell; kept in case later cells rely on the `dt` alias

# Vectorised parse of the whole column instead of a per-row strptime call.
df['date'] = pd.to_datetime(df['Date'], format="%d/%m/%Y")

df['Month'] = df['date'].dt.month
df['Day'] = df['date'].dt.day
df['Day of the week'] = df['date'].dt.day_name()

df.head()

Unnamed: 0,Date,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(°C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day,date,Month,Day,Day of the week
0,01/12/2017,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes,2017-12-01,12,1,Friday
1,01/12/2017,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes,2017-12-01,12,1,Friday
2,01/12/2017,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,Winter,No Holiday,Yes,2017-12-01,12,1,Friday
3,01/12/2017,107,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes,2017-12-01,12,1,Friday
4,01/12/2017,78,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,Winter,No Holiday,Yes,2017-12-01,12,1,Friday


## 3.2. Dropping the less important features

In [5]:
# Columns that are not carried forward as model inputs.
# Note: this cell reassigns `df`, so running it twice raises a KeyError.
unused_columns = [
    'Date',
    'Dew point temperature(°C)',
    'Solar Radiation (MJ/m2)',
    'Seasons',
    'Holiday',
    'Functioning Day',
    'date',
    'Visibility (10m)',
]
df = df.drop(columns=unused_columns)
df.head()

Unnamed: 0,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Rainfall(mm),Snowfall (cm),Month,Day,Day of the week
0,254,0,-5.2,37,2.2,0.0,0.0,12,1,Friday
1,204,1,-5.5,38,0.8,0.0,0.0,12,1,Friday
2,173,2,-6.0,39,1.0,0.0,0.0,12,1,Friday
3,107,3,-6.2,40,0.9,0.0,0.0,12,1,Friday
4,78,4,-6.0,36,2.3,0.0,0.0,12,1,Friday


## 3.3. Combining Rainfall & Snowfall Columns

In [7]:
# Snowfall is recorded in cm; the factor 10 puts it on the same mm scale
# as rainfall before the two are summed into a single column.
rain_mm = df['Rainfall(mm)']
snow_mm = 10 * df['Snowfall (cm)']
df['Precipitation (mm)'] = rain_mm + snow_mm

# The two source columns are no longer needed once combined.
df = df.drop(columns=['Rainfall(mm)', 'Snowfall (cm)'])
df.head()

Unnamed: 0,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Month,Day,Day of the week,Precipitation (mm)
0,254,0,-5.2,37,2.2,12,1,Friday,0.0
1,204,1,-5.5,38,0.8,12,1,Friday,0.0
2,173,2,-6.0,39,1.0,12,1,Friday,0.0
3,107,3,-6.2,40,0.9,12,1,Friday,0.0
4,78,4,-6.0,36,2.3,12,1,Friday,0.0


## 3.4. Encoding the Day of the week column

In [8]:
def encode(x):
  """Return the 1-based weekday number for an English day name.

  'Monday' maps to 1 and 'Sunday' maps to 7. A value outside the seven
  known names raises ``KeyError``.
  """
  names_in_order = (
      'Monday', 'Tuesday', 'Wednesday', 'Thursday',
      'Friday', 'Saturday', 'Sunday',
  )
  number_by_name = {
      name: position for position, name in enumerate(names_in_order, start=1)
  }
  return number_by_name[x]

In [9]:
# Replace each weekday name with the numeric code returned by `encode`.
df['Day of the week'] = df['Day of the week'].map(encode)
df.head()

Unnamed: 0,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Month,Day,Day of the week,Precipitation (mm)
0,254,0,-5.2,37,2.2,12,1,5,0.0
1,204,1,-5.5,38,0.8,12,1,5,0.0
2,173,2,-6.0,39,1.0,12,1,5,0.0
3,107,3,-6.2,40,0.9,12,1,5,0.0
4,78,4,-6.0,36,2.3,12,1,5,0.0


## 3.5. Splitting the dataset

In [10]:
from sklearn.model_selection import train_test_split

# 'Unnamed: 0' is the row counter that came in with the CSV (0, 1, 2, ...),
# not a measured variable. The first rows are consecutive hours of one date,
# so the counter presumably tracks time; with a random split that would let
# a model interpolate between neighbouring rows. It is therefore removed
# from the features (errors='ignore' keeps the cell working if the column
# is absent).
# NOTE: metric outputs recorded with the counter included need a re-run.
X = (
    df.drop(columns=['Rented Bike Count'])
      .drop(columns=['Unnamed: 0'], errors='ignore')
)
y = df['Rented Bike Count']

# 70/30 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 4. Model Building

## 4.1. Defining the metrics method

In [11]:
from sklearn.metrics import r2_score,mean_squared_error

def metrics(y_pred, y_true=None):
  """Print MSE, RMSE and R2 for a set of predictions.

  Parameters
  ----------
  y_pred : array-like
      Predicted target values.
  y_true : array-like, optional
      Ground-truth values to score against. When omitted, the notebook-level
      ``y_test`` is used, which is what existing ``metrics(y_pred)`` calls
      rely on.

  Returns
  -------
  None
      The three scores are printed, not returned.
  """
  if y_true is None:
    # Fall back to the global test targets for backward compatibility.
    y_true = y_test

  print("The evaluation metrics are as follows:")

  # Mean squared error between the reference targets and the predictions
  MSE  = mean_squared_error(y_true, y_pred)
  print("MSE :" , MSE)

  # Root of the MSE, i.e. the error in the target's own units
  RMSE = np.sqrt(MSE)
  print("RMSE :" ,RMSE)

  # R2 score of the predictions against the reference targets
  r2_test = r2_score(y_true, y_pred)
  print("R2 :" ,r2_test)

## 4.2. Random Forest Regressor

In [12]:
from sklearn.ensemble import RandomForestRegressor

# random_state is fixed (same seed as the train/test split) so the fitted
# forest, and the metrics reported for it, are reproducible across runs.
rf = RandomForestRegressor(
    n_estimators=300,
    max_depth=25,
    min_samples_split=2,
    min_samples_leaf=2,
    random_state=42,
)
rf.fit(X_train, y_train)

In [13]:
# Score the random forest on the held-out test split.
y_test_predict = rf.predict(X_test)
metrics(y_pred=y_test_predict)

The evaluation metrics are as follows:
MSE : 58210.415557408625
RMSE : 241.26834760782157
R2 : 0.8579506565355022


## 4.3. XGBoost Regressor

In [14]:
import xgboost as xgb

# Fixed hyperparameters for the baseline XGBoost model.
params = dict(
    objective='reg:squarederror',
    colsample_bytree=0.8,
    learning_rate=0.1,
    max_depth=5,
    alpha=10,
    n_estimators=300,
)

xgb_reg = xgb.XGBRegressor(**params)
xgb_reg.fit(X_train, y_train)

In [15]:
# Score the baseline XGBoost model on the held-out test split.
y_test_predict = xgb_reg.predict(X_test)
metrics(y_pred=y_test_predict)

The evaluation metrics are as follows:
MSE : 41639.46627451491
RMSE : 204.05750727310894
R2 : 0.8983883074898589


In [16]:
# Hyperparameter tuning
from sklearn.model_selection import GridSearchCV

# Only max_depth and n_estimators are actually searched (3 x 3 = 9
# candidates); the remaining entries are single-value lists that pin the
# same settings used for the baseline model.
xgb_params = {
    'objective': ['reg:squarederror'],
    'colsample_bytree': [0.8],
    'learning_rate': [0.1],
    'max_depth': [3, 5, 7],
    'alpha': [10],
    'n_estimators': [100, 200, 300]
}

# The unfitted template gets its own name so the model already fitted as
# `xgb_reg` in the earlier cell is not overwritten; otherwise re-running
# that cell's predict step after this one would hit an unfitted estimator.
xgb_base = xgb.XGBRegressor()
grid_search = GridSearchCV(estimator=xgb_base, param_grid=xgb_params, scoring='neg_mean_squared_error', cv=5)
grid_search.fit(X_train, y_train)

# Winning parameter combination and the estimator refitted with it.
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

y_test_predict = best_model.predict(X_test)

In [17]:
# Report test-split metrics for the tuned model's predictions.
metrics(y_pred=y_test_predict)

The evaluation metrics are as follows:
MSE : 34157.526277691235
RMSE : 184.8175486194188
R2 : 0.9166462885438061
