In [53]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import BaggingRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import root_mean_squared_error

### Part A: Data Preprocessing and Baseline

1. Data Loading and Feature Engineering

In [None]:
#Loading in dataset
df = pd.read_csv('./bike+sharing+dataset/hour.csv')
df

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0000,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.80,0.0000,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.80,0.0000,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0000,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0000,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17374,17375,2012-12-31,1,1,12,19,0,1,1,2,0.26,0.2576,0.60,0.1642,11,108,119
17375,17376,2012-12-31,1,1,12,20,0,1,1,2,0.26,0.2576,0.60,0.1642,8,81,89
17376,17377,2012-12-31,1,1,12,21,0,1,1,1,0.26,0.2576,0.60,0.1642,7,83,90
17377,17378,2012-12-31,1,1,12,22,0,1,1,1,0.26,0.2727,0.56,0.1343,13,48,61


In [None]:
#Dropping unwanted columns
df = df.drop(columns = ['instant','dteday','casual','registered'])
df

Unnamed: 0,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
0,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0000,16
1,1,0,1,1,0,6,0,1,0.22,0.2727,0.80,0.0000,40
2,1,0,1,2,0,6,0,1,0.22,0.2727,0.80,0.0000,32
3,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0000,13
4,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
17374,1,1,12,19,0,1,1,2,0.26,0.2576,0.60,0.1642,119
17375,1,1,12,20,0,1,1,2,0.26,0.2576,0.60,0.1642,89
17376,1,1,12,21,0,1,1,1,0.26,0.2576,0.60,0.1642,90
17377,1,1,12,22,0,1,1,1,0.26,0.2727,0.56,0.1343,61


In [None]:
#Checking dataset feature types, null checks, etc.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17379 entries, 0 to 17378
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   season      17379 non-null  int64  
 1   yr          17379 non-null  int64  
 2   mnth        17379 non-null  int64  
 3   hr          17379 non-null  int64  
 4   holiday     17379 non-null  int64  
 5   weekday     17379 non-null  int64  
 6   workingday  17379 non-null  int64  
 7   weathersit  17379 non-null  int64  
 8   temp        17379 non-null  float64
 9   atemp       17379 non-null  float64
 10  hum         17379 non-null  float64
 11  windspeed   17379 non-null  float64
 12  cnt         17379 non-null  int64  
dtypes: float64(4), int64(9)
memory usage: 1.7 MB


- The features that have data type int64 are categorical. Thus we will convert them into a one-hot encoded form (except the target variable)

In [None]:
#One-hot encoding the categorical variables
df_ohe = pd.get_dummies(df, columns=['season', 'yr', 'mnth', 'hr', 'holiday', 'weekday', 'workingday', 'weathersit'])

In [None]:
#Displaying the dataset with OHE variables
df_ohe

Unnamed: 0,temp,atemp,hum,windspeed,cnt,season_1,season_2,season_3,season_4,yr_0,...,weekday_3,weekday_4,weekday_5,weekday_6,workingday_0,workingday_1,weathersit_1,weathersit_2,weathersit_3,weathersit_4
0,0.24,0.2879,0.81,0.0000,16,True,False,False,False,True,...,False,False,False,True,True,False,True,False,False,False
1,0.22,0.2727,0.80,0.0000,40,True,False,False,False,True,...,False,False,False,True,True,False,True,False,False,False
2,0.22,0.2727,0.80,0.0000,32,True,False,False,False,True,...,False,False,False,True,True,False,True,False,False,False
3,0.24,0.2879,0.75,0.0000,13,True,False,False,False,True,...,False,False,False,True,True,False,True,False,False,False
4,0.24,0.2879,0.75,0.0000,1,True,False,False,False,True,...,False,False,False,True,True,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17374,0.26,0.2576,0.60,0.1642,119,True,False,False,False,False,...,False,False,False,False,False,True,False,True,False,False
17375,0.26,0.2576,0.60,0.1642,89,True,False,False,False,False,...,False,False,False,False,False,True,False,True,False,False
17376,0.26,0.2576,0.60,0.1642,90,True,False,False,False,False,...,False,False,False,False,False,True,True,False,False,False
17377,0.26,0.2727,0.56,0.1343,61,True,False,False,False,False,...,False,False,False,False,False,True,True,False,False,False


In [None]:
#Splitting the dataset into features (X) and target (Y)
X = df_ohe.drop(columns = 'cnt')
Y = df_ohe['cnt']

2. Train/Test Split

In [None]:
#Splitting the dataset in train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.2)

3. Baseline Model

In [None]:
#Defining and training a Decision Tree Regressor
decision_tree = DecisionTreeRegressor(max_depth=6)
decision_tree.fit(X_train,Y_train)

In [None]:
#Predictions using Decision Tree Regressor
y_pred_decision_tree = decision_tree.predict(X_test)

In [None]:
#Defining and training a Linear Regressor
linear = LinearRegression()
linear.fit(X_train,Y_train)

In [None]:
#Predictions using Linear Regressor
y_pred_linear = linear.predict(X_test)

In [None]:
#RMSE calculation for both baseline models
rmse_decision_tree = root_mean_squared_error(Y_test,y_pred_decision_tree)
rmse_linear = root_mean_squared_error(Y_test,y_pred_linear)

In [25]:
print(f"RMSE with Decision Tree Regressor: {rmse_decision_tree}")
print(f"RMSE with Linear Regressor: {rmse_linear}")

RMSE with Decision Tree Regressor: 122.24735526428948
RMSE with Linear Regressor: 101.20541681355333


- We shall choose the linear regressor as our baseline as it achieves a lower RMSE.

### Part B: Ensemble Techniques for Bias and Variance Reduction

1. Bagging

In [None]:
#Defining and training Bagging Regressor
bagging = BaggingRegressor(estimator=DecisionTreeRegressor(),n_estimators=1000)
bagging.fit(X_train, Y_train)

In [None]:
#Prediction using bagging Regressor
y_pred_bag = bagging.predict(X_test)

In [None]:
#RMSE calculation
rmse_bagging = root_mean_squared_error(Y_test,y_pred_bag)
print(f"RMSE with Bagging: {rmse_bagging}")

RMSE with Bagging: 49.78773409408626


- A single Deicision Tree is a high-variance model. Small changes in the training data can lead to very different learnt trees. This often results in overfitting, which can lead to high RMSE on unseen test data.
- Bagging involves generating multiple samples from the training data by sampling with replacement. In a Bagging Regressor, each Decision Tree is training on one of these bootstrap samples. The final prediction is the average of the predictions of each decision tree.
- Each tree makes slightly different errors as it sees different training data. Averaging the predictions smoothens the random fluctuations in the predictions. 
- Mathematically, assuming the trees are independent/uncorrelated, averaging the predictions decreases the variance by a factor of $\frac{1}{n}$. As a result, the RMSE of the bagging regressor is also less than the baseline Decision Tree Regressor.
- Thus, bagging effectively reduced the variance compared to the single Decision Tree baseline.

2. Boosting

In [None]:
#Defining and training the Gradient Boosting Regressor
gbr = GradientBoostingRegressor(n_estimators=1000)
gbr.fit(X_train,Y_train)

In [None]:
#Predictions using Gradient Boosting Regressor
y_pred_gbr = gbr.predict(X_test)

In [None]:
#RMSE calculation
rmse_gbr = root_mean_squared_error(Y_test,y_pred_gbr)
print(f"RMSE with Gradient Boosting: {rmse_gbr}")

RMSE with Gradient Boosting: 51.35930907631139


- Gradient boosting works on the principle of fitting weak learners to the residuals of previous learners. This would lead to correcting the error made by the previous learner, thus reducing the bias.
- As a result, it achieves a low bias and a low RMSE. This is seen in the RMSE achieved by the Gradient Boosting Regressor, which is much less than both the baseline models.
- However, the RMSE is slightly higher than the Bagging Regressor which works on variance reduction. This could be due to variance still present in boosting, or hyperparameters not tuned well.
- Thus, bias reduction via Gradient Boosting proves to be an effective way to generalise better and achieve a lower RMSE.

### Part C: Stacking for Optimal Performance

1. Stacking Implementation

- Stacking is an ensemble learning technique that combines the predictions of multiple diverse models (base learners) to produce a more accurate final prediction. Instead of averaging (bagging) or sequentially correcting errors (boosting), stacking learns how to combine models optimally.
- Stacking involves two levels:
    1. Base learners: These are individual models that are trained on the original training data.
    2. Meta learner: This model is not trained on the training data but rather, the predictions of each individual base learner. It tries to learn the optimal combination of the base preedictions to minimise prediction error.
- This combines various aspects of diverse models such as the benefits of low bias and low variance, which leads to better generalisation.

In [None]:
#Defining the base learners: KNN, Bagging Regressor, Gradient Boosting Regressor. We use the same number of estimators as used in Part B for Bagging and Gradient Boosting Regressor
knn_base = KNeighborsRegressor(n_neighbors=5)
bagging_base = BaggingRegressor(estimator=DecisionTreeRegressor(),n_estimators=1000)
gbr_base = GradientBoostingRegressor(n_estimators=1000)

In [None]:
#Defining the meta learner: Ridge Regressor
meta_learner = Ridge(alpha=1.0)

In [None]:
#Defining the Stacking Regressor
stacking = StackingRegressor(
    estimators=[
        ('knn', knn_base),
        ('bagging', bagging_base),
        ('gbr', gbr_base)
    ],
    final_estimator=meta_learner,
)

In [None]:
#Training the Stacking Regressor
stacking.fit(X_train, Y_train)

In [None]:
#Predictions using Stacking Regressor
y_pred_stack = stacking.predict(X_test)

2. Final Evaluation

In [None]:
#RMSE calculation
rmse_stack = root_mean_squared_error(Y_test,y_pred_stack)
print(f"RMSE with Stacking Regressor: {rmse_stack}")

RMSE with Stacking Regressor: 46.61888530287946


### Part D: Final Analysis

1. Comparative Table

| Model                          | RMSE           |
|--------------------------------|----------------|
| Linear Regressor               | 101.2054       |
| Bagging Regressor              | 49.7877        |
| Gradient Boosting Regressor    | 51.3593        |
| Stacking Regressor             | 46.6188        |


2. Conclusion

- We see that the Stacking Regressor achieves the lowest RMSE on the test data, hence it is the best performing model.
- Baseline models such as linear regression and decision trees often face issues due to high bias (not able to capture complex patterns and leads to underfitting) or high variance (overfits to the training data, and cannot generalise well). As a result, the model is not able to capture all aspects of the data, leading to a higher RMSE. This is known as the bias-variance tradeoff.
- Stacking on the other hand, combines diverse base learners and a meta-learner to optimally combine them. The base learners are chosen in a way, such that each one captures patterns in a unique way.
- In our case, we can say that the base learners used have different learning patterns. The KNN Regressor captures low level patterns and has low bias, the Bagging Regressor works on variance reduction, and the Gradient Boosting Regressor works on bias reduction. Each base learner provides a different perspective to the data, and produces different types of errors that are uncorrelated. Combining the predictions of each of these base learners would smoothen out these errors.
- The meta learner then learns how to combine the predictions of these base learners optimally, such that the final output minimises the prediction error. It learns to leverage and weight the predictions such that it reduces both bias and variance. It also learns which base learner to weigh more for which region of the feature space.
- As a result, the stacking regressor reduces both bias and variance, which results in the least RMSE among the models that have been tested. The margin of difference in RMSE between the stacking regressor and the baseline model is also high (almost half the RMSE of Linear Regressor), indicating that the stacking regressor generalises much better than the baseline models.