In [1]:
%matplotlib inline

# Bikeshare Ridership 

Notebook to predict the number of trips per day for Capital Bikeshare in Washington, DC based on the season of year and the given weather.  Data built from two years of trip data (2011 and 2012) from Capital Bikeshare.

### Notebook Setup

In [3]:
import os 
import sys 

sys.path.append("/Users/benjamin/Repos/ddl/yellowbrick")
import numpy as np
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt 

sns.set_context('notebook')
sns.set_style('whitegrid')

## Data Loading

In [5]:
data = pd.read_csv('data/hour.csv')
data.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
17374,17375,2012-12-31,1,1,12,19,0,1,1,2,0.26,0.2576,0.6,0.1642,11,108,119
17375,17376,2012-12-31,1,1,12,20,0,1,1,2,0.26,0.2576,0.6,0.1642,8,81,89
17376,17377,2012-12-31,1,1,12,21,0,1,1,1,0.26,0.2576,0.6,0.1642,7,83,90
17377,17378,2012-12-31,1,1,12,22,0,1,1,1,0.26,0.2727,0.56,0.1343,13,48,61
17378,17379,2012-12-31,1,1,12,23,0,1,1,1,0.26,0.2727,0.65,0.1343,12,37,49


## Exploratory

In [11]:
# Stats for trips data
trips_cols = ['casual', 'registered', 'cnt']
data[trips_cols].describe()


Unnamed: 0,casual,registered,cnt
count,17379.0,17379.0,17379.0
mean,35.676218,153.786869,189.463088
std,49.30503,151.357286,181.387599
min,0.0,0.0,1.0
25%,4.0,34.0,40.0
50%,17.0,115.0,142.0
75%,48.0,220.0,281.0
max,367.0,886.0,977.0


In [20]:
# Plot of trips over time to show seasonality and need to split training data by time of year
trips_df = data[trips_cols + ['dteday', 'hr']].set_index(['dteday', 'hr'])
trips_df.plot(kind='line')

ValueError: scatter requires and x and y column

In [5]:
from sklearn.model_selection import train_test_split as tts 

features = [
    'season', 'year', 'month', 'hour', 'holiday', 'weekday', 'workingday', 
    'weather', 'temp', 'feelslike', 'humidity', 'windspeed', 
]

target = 'registered' # can be one of 'casual', 'registered', 'cnt'

X = data[features]
y = data[target]

X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2)

## Do Some Regression 

In [6]:
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score 

In [7]:
# OLS 
from sklearn.linear_model import LinearRegression 

model = LinearRegression() 
model.fit(X_train, y_train)

yhat = model.predict(X_test)

r2 = r2_score(y_test, yhat)
me = mse(y_test, yhat)

print("r2={:0.3f} MSE={:0.3f}".format(r2,me))

r2=0.339 MSE=15021.976


In [8]:
# L2 and L1 Regularization 
alphas = np.logspace(-10, 0, 200)

In [9]:
from sklearn.linear_model import RidgeCV 

model = RidgeCV(alphas=alphas) 
model.fit(X_train, y_train)

yhat = model.predict(X_test)

r2 = r2_score(y_test, yhat)
me = mse(y_test, yhat)

print("r2={:0.3f} MSE={:0.3f} alpha={:0.3f}".format(r2,me, model.alpha_))

r2=0.339 MSE=15023.188 alpha=1.000


In [10]:
from sklearn.linear_model import LassoCV 

model = LassoCV(alphas=alphas) 
model.fit(X_train, y_train)

yhat = model.predict(X_test)

r2 = r2_score(y_test, yhat)
me = mse(y_test, yhat)

print("r2={:0.3f} MSE={:0.3f} alpha={:0.3f}".format(r2,me, model.alpha_))

r2=0.339 MSE=15022.186 alpha=0.003


In [11]:
from sklearn.linear_model import ElasticNetCV

model = ElasticNetCV(alphas=alphas) 
model.fit(X_train, y_train)

yhat = model.predict(X_test)

r2 = r2_score(y_test, yhat)
me = mse(y_test, yhat)

print("r2={:0.3f} MSE={:0.3f}".format(r2,me))

r2=0.339 MSE=15026.760


In [None]:
sns.boxplot(y=target, data=data)

In [13]:
from sklearn.preprocessing import PolynomialFeatures 
from sklearn.pipeline import Pipeline 

model = Pipeline([
    ('poly', PolynomialFeatures(2)), 
    ('lasso', LassoCV(alphas=alphas)),
])

model.fit(X_train, y_train)

yhat = model.predict(X_test)

r2 = r2_score(y_test, yhat)
me = mse(y_test, yhat)

print("r2={:0.3f} MSE={:0.3f} alpha={:0.3f}".format(r2,me, model.named_steps['lasso'].alpha_))

r2=0.479 MSE=11837.677 alpha=0.008


In [14]:
model = Pipeline([
    ('poly', PolynomialFeatures(2)), 
    ('ridge', RidgeCV(alphas=alphas)),
])

model.fit(X_train, y_train)

yhat = model.predict(X_test)

r2 = r2_score(y_test, yhat)
me = mse(y_test, yhat)

print("r2={:0.3f} MSE={:0.3f} alpha={:0.3f}".format(r2,me, model.named_steps['ridge'].alpha_))

r2=0.479 MSE=11847.402 alpha=0.070


In [15]:
model = Pipeline([
    ('poly', PolynomialFeatures(3)), 
    ('ridge', RidgeCV(alphas=alphas)),
])

model.fit(X_train, y_train)

yhat = model.predict(X_test)

r2 = r2_score(y_test, yhat)
me = mse(y_test, yhat)

print("r2={:0.3f} MSE={:0.3f} alpha={:0.3f}".format(r2,me, model.named_steps['ridge'].alpha_))

r2=0.557 MSE=10072.626 alpha=0.314


In [16]:
model = Pipeline([
    ('poly', PolynomialFeatures(4)), 
    ('ridge', RidgeCV(alphas=alphas)),
])

model.fit(X_train, y_train)

yhat = model.predict(X_test)

r2 = r2_score(y_test, yhat)
me = mse(y_test, yhat)

print("r2={:0.3f} MSE={:0.3f} alpha={:0.3f}".format(r2,me, model.named_steps['ridge'].alpha_))

r2=0.593 MSE=9255.512 alpha=1.000


In [22]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor() 
model.fit(X_train, y_train)

yhat = model.predict(X_test)

r2 = r2_score(y_test, yhat)
me = mse(y_test, yhat)

print("r2={:0.3f} MSE={:0.3f}".format(r2,me))

r2=0.937 MSE=1420.796


## Save the Forests!

In [23]:
import pickle 

with open('forest-riders.pkl', 'wb') as f:
    pickle.dump(model, f)

In [24]:
with open('forest-riders.pkl', 'rb') as f:
    model = pickle.load(f)

In [25]:
model.predict(X_test)

array([ 161.7,  211.5,  243.6, ...,    2.6,  233.9,  234.9])

In [18]:
from sklearn.ensemble import AdaBoostRegressor

model = AdaBoostRegressor() 
model.fit(X_train, y_train)

yhat = model.predict(X_test)

r2 = r2_score(y_test, yhat)
me = mse(y_test, yhat)

print("r2={:0.3f} MSE={:0.3f}".format(r2,me))

r2=0.681 MSE=7251.873


In [19]:
from sklearn.linear_model import BayesianRidge

model = BayesianRidge() 
model.fit(X_train, y_train)

yhat = model.predict(X_test)

r2 = r2_score(y_test, yhat)
me = mse(y_test, yhat)

print("r2={:0.3f} MSE={:0.3f}".format(r2,me))

r2=0.339 MSE=15025.645


In [21]:
from sklearn.neighbors import KNeighborsRegressor

model = KNeighborsRegressor(5)
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.91569589942537322