# Setup

In [1]:
import math
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [2]:
import warnings; warnings.simplefilter('ignore')

<br>
<br>
<br>

# Load Final Data Frame

In [3]:
col_names = ['count', 'max(AQI)', 'pre_AQI', 'month', 'year', 'season']
bikeshare = pd.read_csv("Final_BikeshareDF2.csv", header=None, names=col_names, parse_dates=True)
bikeshare.head()

Unnamed: 0,count,max(AQI),pre_AQI,month,year,season
0,14587,29,23,1,2016,winter
1,15499,28,29,1,2016,winter
2,19593,23,28,1,2016,winter
3,18053,48,23,1,2016,winter
4,24569,60,48,1,2016,winter


<br>
<br>
<br>

# Prepare Data for Machine Learning Pipeline

In [4]:
bike_ml = bikeshare.dropna()
bike_ml = pd.get_dummies(bike_ml)
bike_ml.head()

Unnamed: 0,count,max(AQI),pre_AQI,month,year,season_autumn,season_spring,season_summer,season_winter
0,14587,29,23,1,2016,0,0,0,1
1,15499,28,29,1,2016,0,0,0,1
2,19593,23,28,1,2016,0,0,0,1
3,18053,48,23,1,2016,0,0,0,1
4,24569,60,48,1,2016,0,0,0,1


<br>
<br>
<br>

# Split Training (Pre-2018) and Test Sets (2018)

In [5]:
training = bike_ml[bike_ml.year != 2018].drop(['month', 'year'], axis=1)
test = bike_ml[bike_ml.year == 2018].drop(['month', 'year'], axis=1)

# Train-Validation Split

In [6]:
training_features = training.drop('max(AQI)', axis=1)
training_labels = training['max(AQI)']

test_features = test.drop('max(AQI)', axis=1)
test_labels = test['max(AQI)']

In [7]:
train_data, val_data, train_target, val_target = train_test_split(training_features,
                                                                    training_labels,
                                                                    train_size=0.8,
                                                                    test_size=0.2)

<br>
<br>
<br>

# Modeling

## Baseline

In [8]:
train_avg_aqi = train_target.mean()  # best constant prediction
print(f"Average AQI Over Training Dataset: {train_avg_aqi}")

base_val_rmse = math.sqrt(mean_squared_error(val_target,
                                             pd.Series([train_avg_aqi] * val_target.size)))
print(f'Baseline Validation RMSE = {round(base_val_rmse, 4)}')

base_test_rmse = math.sqrt(mean_squared_error(test_labels,
                                              pd.Series([train_avg_aqi] * test_labels.size)))
print(f'Baseline Test RMSE = {round(base_test_rmse, 4)}')

Average AQI Over Training Dataset: 27.824956672443673
Baseline Validation RMSE = 8.6839
Baseline Test RMSE = 16.028


## Linear Regression

In [9]:
# Scale only Non-categorical columns
ct = ColumnTransformer(
    [('scaler', StandardScaler(), [0, 1, 2])],
    remainder='passthrough',
    n_jobs=-1
)

# Build Pipeline
lr_pipe = Pipeline([('transf', ct),
                   ('clf', ElasticNet())])

# Regularization parameter
parameters = dict(clf__l1_ratio=np.arange(0, 1, 0.1))

# Gridsearch
gs = GridSearchCV(estimator=lr_pipe,
                  param_grid=parameters,
                  scoring='neg_mean_squared_error',
                  cv=10,
                  n_jobs=-1)

# Fit
gs.fit(train_data, train_target)

# Validation RMSE
print(f"Validation RMSE: {math.sqrt(-gs.score(val_data, val_target))}")

# Test RMSE
print(f"Test RMSE: {math.sqrt(-gs.score(test_features, test_labels))}")

Validation RMSE: 7.804544887052386
Test RMSE: 11.912447652825255


<br>
<br>
<br>

## Random Forest 

In [10]:
# Regressor
rf_model = RandomForestRegressor(n_jobs=-1)

# Max depth & number of trees
parameters = dict(max_depth=list(range(2, 30, 7)),
                  n_estimators=range(2, 50, 8))

# Gridsearch
gs = GridSearchCV(estimator=rf_model,
                  param_grid=parameters,
                  scoring='neg_mean_squared_error',
                  cv=10,
                  n_jobs=-1)

# Fit
gs.fit(train_data, train_target)

# Validation RMSE
print(f"Validation RMSE: {math.sqrt(-gs.score(val_data, val_target))}")

# Test RMSE
print(f"Test RMSE: {math.sqrt(-gs.score(test_features, test_labels))}")

Validation RMSE: 7.837041964093058
Test RMSE: 12.024241911159445


<br>
<br>
<br>

## Gradient Boosted Trees

In [11]:
# Regressor
rf_model = GradientBoostingRegressor()

# Max depth and number of iterations
parameters = dict(max_depth=list(range(2, 11, 4)),
                  n_estimators=range(2, 11, 4))

# Gridsearch
gs = GridSearchCV(estimator=rf_model,
                  param_grid=parameters,
                  scoring='neg_mean_squared_error',
                  cv=10,
                  n_jobs=-1)

# Fit
gs.fit(train_data, train_target)

# Validation RMSE
print(f"Validation RMSE: {math.sqrt(-gs.score(val_data, val_target))}")

# Test RMSE
print(f"Test RMSE: {math.sqrt(-gs.score(test_features, test_labels))}")

Validation RMSE: 7.9035582946902485
Test RMSE: 13.493144070367315
