In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime 
import seaborn as sns
%matplotlib inline

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import explained_variance_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV
from sklearn.utils import shuffle
import statsmodels.api as sm

  from pandas.core import datetools


In [3]:
def set_data(file):
    bikeshare_machine = pd.read_csv(file, 
                        parse_dates=['Start date', 'End date'])
    bikeshare_machine.drop('Unnamed: 0', 1, inplace=True)
    included_cols = ['start_station','end_station','Member Type','time_diff','season','mnth','holiday',
                     'weekday','workingday','weathersit','temp','hum','windspeed','miles',
                     'rush_hour','metro_dist','landmark_dist_start','landmark_dist_end']
    bikeshare_machine = bikeshare_machine[included_cols]
    bikeshare_machine['season'] = bikeshare_machine['season'].astype('category')
    bikeshare_machine['mnth'] = bikeshare_machine['mnth'].astype('category')
    bikeshare_machine['holiday'] = bikeshare_machine['holiday'].astype('category')
    bikeshare_machine['weekday'] = bikeshare_machine['weekday'].astype('category')
    bikeshare_machine['workingday'] = bikeshare_machine['workingday'].astype('category')
    bikeshare_machine['weathersit'] = bikeshare_machine['weathersit'].astype('category')
    bikeshare_machine['Member Type'] = bikeshare_machine['Member Type'].astype('category')
    bikeshare_machine['start_station'] = bikeshare_machine['start_station'].astype('category')
    bikeshare_machine['end_station'] = bikeshare_machine['end_station'].astype('category')
    bikeshare_machine['rush_hour'] = bikeshare_machine['rush_hour'].astype('category')
    col_names = ['start_station', 'end_station','member_type','time_diff','season','month','holiday',
             'weekday','work_day','weather_cat','temperature','humidity','windspeed','miles','rush_hour',
                'metro_dist','landmark_dist_start','landmark_dist_end']
    bikeshare_machine.columns = col_names
    tmin = -8
    tmax = 39
    hum_max = 100
    wind_max = 67
    bikeshare_machine['temp'] = bikeshare_machine['temperature'] * (tmax - tmin) + tmin
    bikeshare_machine['hum'] = bikeshare_machine['humidity'] * 100
    bikeshare_machine['wind'] = bikeshare_machine['windspeed'] * 67
    bikeshare_machine = pd.get_dummies(bikeshare_machine, 
                                 columns=['rush_hour','member_type','holiday','work_day'], drop_first=True)
    bikeshare_machine = pd.get_dummies(bikeshare_machine, 
                                 columns=['start_station','end_station','season','month','weekday','weather_cat'])
    bikeshare_machine = shuffle(bikeshare_machine)
    return bikeshare_machine

In [10]:
file = '/Users/matthewcassi/Documents/Bike-Sharing-Dataset/Bikeshare_Time_Prediction/Casual_RushMetro/landmark_casual.csv'
bikeshare_machine = set_data(file)

In [11]:
bikeshare_machine = bikeshare_machine.drop(['temperature', 'humidity', 'windspeed'], 1)
bikeshare_machine = bikeshare_machine.rename(columns = {'member_type_Registered':'member_type'})
bikeshare_machine.head()

Unnamed: 0,time_diff,miles,metro_dist,landmark_dist_start,landmark_dist_end,temp,hum,wind,rush_hour_1,holiday_1,...,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weather_cat_1,weather_cat_2,weather_cat_3
145352,21.35,1.39101,0.294425,0.347277,1.079381,8.058349,57.5833,20.459254,0,0,...,0,0,0,0,1,0,0,0,1,0
124938,14.167,0.421029,0.328093,0.306432,0.604853,27.1325,57.8333,12.292557,0,0,...,1,0,0,0,0,0,0,1,0,0
16458,39.133,0.0,0.250981,0.661659,0.661659,24.743349,43.4167,12.415904,1,0,...,0,0,0,0,1,0,0,1,0,0
169589,18.4,2.345214,0.133167,0.328533,0.580586,-2.9475,41.4583,12.3749,1,0,...,0,0,0,1,0,0,0,0,1,0
141049,57.767,2.17671,0.460965,0.455379,0.21843,15.663466,74.3043,9.522174,0,0,...,0,1,0,0,0,0,0,1,0,0


### Model 1 - Remove Some Variables that are correlated

In [12]:
# Leave workday, drop weekdays, leave season, drop month
# Workday overlaps with workday/not workday and months overlap with seasons
remove_cols = ['weekday_0', 'weekday_1','weekday_2','weekday_3','weekday_4','weekday_5','weekday_6', 
              'month_1','month_2','month_3','month_4','month_5','month_6','month_7','month_8','month_9',
              'month_10','month_11','month_12','time_diff']
X1 = np.matrix(bikeshare_machine.drop(remove_cols, 1))
y1 = bikeshare_machine['time_diff']

In [13]:
# Split the data into training and testing sets and check the shape
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size = 0.25)
X1_train.shape, X1_test.shape, y1_train.shape, y1_test.shape

((152719, 275), (50907, 275), (152719,), (50907,))

In [14]:
# Fit model
model1 = LinearRegression()
model1.fit(X1_train, y1_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [15]:
pred1 = model1.predict(X1_test)
cvscores_model1 = cross_val_score(model1, X1_train, y1_train, cv=5)
model1_r = model1.score(X1_test, y1_test)
model1_mse = mean_squared_error(y1_test, pred1)
model1_rmse = np.sqrt(model1_mse)
adjustedr1 = 1 - (1-model1_r)*(len(y1_test)-1)/(len(y1_test)-X1_test.shape[1]-1)

In [16]:
cvscores_model1, model1_r, adjustedr1, model1_mse, model1_rmse

(array([  1.58442358e-01,   1.52326309e-01,  -2.47345409e+14,
          1.51326758e-01,   1.49478048e-01]),
 0.15624521604588226,
 0.15166239987422092,
 283.48177223265003,
 16.836916945588644)

In [17]:
model1_sm = sm.OLS(y1_train, X1_train.astype(float)).fit()
model1_sm.summary()

0,1,2,3
Dep. Variable:,time_diff,R-squared:,0.156
Model:,OLS,Adj. R-squared:,0.155
Method:,Least Squares,F-statistic:,105.5
Date:,"Sat, 16 Dec 2017",Prob (F-statistic):,0.0
Time:,14:08:55,Log-Likelihood:,-646940.0
No. Observations:,152719,AIC:,1294000.0
Df Residuals:,152450,BIC:,1297000.0
Df Model:,268,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,4.0148,0.055,72.824,0.000,3.907,4.123
x2,0.1237,1.041,0.119,0.905,-1.916,2.163
x3,0.8887,0.584,1.521,0.128,-0.256,2.034
x4,0.3577,0.462,0.774,0.439,-0.548,1.264
x5,0.0663,0.009,7.027,0.000,0.048,0.085
x6,-0.0025,0.004,-0.588,0.557,-0.011,0.006
x7,0.0078,0.010,0.798,0.425,-0.011,0.027
x8,-4.1177,0.209,-19.721,0.000,-4.527,-3.708
x9,-0.9988,0.239,-4.174,0.000,-1.468,-0.530

0,1,2,3
Omnibus:,31093.323,Durbin-Watson:,1.992
Prob(Omnibus):,0.0,Jarque-Bera (JB):,55900.282
Skew:,1.307,Prob(JB):,0.0
Kurtosis:,4.397,Cond. No.,1.25e+16


### Model 2 - Try reverse of Model 1

In [18]:
# Drop workday, leave weekdays, drop season, leave month
# Workday overlaps with workday/not workday and months overlap with seasons
remove_cols = ['work_day_1','season_1', 'season_2', 'season_3', 'season_4','time_diff']
X2 = np.matrix(bikeshare_machine.drop(remove_cols, 1))
y2 = bikeshare_machine['time_diff']

In [19]:
# Split the data into training and testing sets and check the shape
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size = 0.25)
X2_train.shape, X2_test.shape, y2_train.shape, y2_test.shape

((152719, 289), (50907, 289), (152719,), (50907,))

In [20]:
# Fit model
model2 = LinearRegression()
model2.fit(X2_train, y2_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [21]:
pred2 = model2.predict(X2_test)
cvscores_model2 = cross_val_score(model2, X2_train, y2_train, cv=5)
model2_r = model2.score(X2_test, y2_test)
model2_mse = mean_squared_error(y2_test, pred2)
model2_rmse = np.sqrt(model2_mse)
adjustedr2 = 1 - (1-model2_r)*(len(y2_test)-1)/(len(y2_test)-X2_test.shape[1]-1)

In [22]:
cvscores_model2, model2_r, adjustedr2, model2_mse, model2_rmse

(array([ -4.71543384e+13,   1.62005545e-01,   1.54618498e-01,
         -1.98880462e+13,   1.61265650e-01]),
 0.15806506791659947,
 0.1532580031879095,
 280.0603381414939,
 16.735003380384896)

In [23]:
# Use statsmodel to check the statistical side of the model
model2_sm = sm.OLS(y2_train, X2_train.astype(float)).fit()
model2_sm.summary()

0,1,2,3
Dep. Variable:,time_diff,R-squared:,0.159
Model:,OLS,Adj. R-squared:,0.157
Method:,Least Squares,F-statistic:,102.5
Date:,"Sat, 16 Dec 2017",Prob (F-statistic):,0.0
Time:,14:09:17,Log-Likelihood:,-646970.0
No. Observations:,152719,AIC:,1294000.0
Df Residuals:,152437,BIC:,1297000.0
Df Model:,281,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,3.9785,0.055,71.969,0.000,3.870,4.087
x2,-0.3880,1.095,-0.354,0.723,-2.534,1.758
x3,0.6701,0.568,1.180,0.238,-0.443,1.783
x4,0.6736,0.454,1.484,0.138,-0.216,1.563
x5,0.1164,0.014,8.396,0.000,0.089,0.144
x6,-0.0134,0.005,-2.790,0.005,-0.023,-0.004
x7,-0.0274,0.010,-2.695,0.007,-0.047,-0.007
x8,-4.0926,0.208,-19.653,0.000,-4.501,-3.684
x9,1.4983,0.265,5.662,0.000,0.980,2.017

0,1,2,3
Omnibus:,30569.982,Durbin-Watson:,1.996
Prob(Omnibus):,0.0,Jarque-Bera (JB):,54367.045
Skew:,1.294,Prob(JB):,0.0
Kurtosis:,4.36,Cond. No.,1.25e+16


### Model 3 - All variables

In [24]:
# Keep all but time_diff
X3 = np.matrix(bikeshare_machine.drop('time_diff', 1))
y3 = bikeshare_machine['time_diff']

In [25]:
# Split the data into training and testing sets and check the shape
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, test_size = 0.25)
X3_train.shape, X3_test.shape, y3_train.shape, y3_test.shape

((152719, 294), (50907, 294), (152719,), (50907,))

In [26]:
# Fit model
model3 = LinearRegression()
model3.fit(X3_train, y3_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [27]:
pred3 = model3.predict(X3_test)
cvscores_model3 = cross_val_score(model3, X3_train, y3_train, cv=5)
model3_r = model3.score(X3_test, y3_test)
model3_mse = mean_squared_error(y3_test, pred3)
model3_rmse = np.sqrt(model3_mse)
adjustedr3 = 1 - (1-model3_r)*(len(y3_test)-1)/(len(y3_test)-X3_test.shape[1]-1)

In [28]:
cvscores_model3, model3_r, adjustedr3, model3_mse, model3_rmse

(array([  1.56268550e-01,  -1.97788069e+13,  -1.26791042e+11,
          1.56200985e-01,   1.54054087e-01]),
 0.15351783611850578,
 0.14860070665946123,
 279.25809201133882,
 16.711017084885611)

In [29]:
# Use statsmodel to check the statistical side of the model
model3_sm = sm.OLS(y3_train, X3_train.astype(float)).fit()
model3_sm.summary()

0,1,2,3
Dep. Variable:,time_diff,R-squared:,0.161
Model:,OLS,Adj. R-squared:,0.159
Method:,Least Squares,F-statistic:,102.6
Date:,"Sat, 16 Dec 2017",Prob (F-statistic):,0.0
Time:,14:09:43,Log-Likelihood:,-647030.0
No. Observations:,152719,AIC:,1295000.0
Df Residuals:,152434,BIC:,1297000.0
Df Model:,284,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,3.9601,0.055,71.622,0.000,3.852,4.068
x2,-0.7234,1.011,-0.716,0.474,-2.704,1.257
x3,1.3772,0.532,2.590,0.010,0.335,2.419
x4,0.1192,0.471,0.253,0.800,-0.804,1.042
x5,0.1115,0.014,8.037,0.000,0.084,0.139
x6,-0.0110,0.005,-2.295,0.022,-0.020,-0.002
x7,-0.0234,0.010,-2.284,0.022,-0.043,-0.003
x8,-4.0633,0.208,-19.528,0.000,-4.471,-3.655
x9,3.4894,0.227,15.393,0.000,3.045,3.934

0,1,2,3
Omnibus:,30610.751,Durbin-Watson:,2.004
Prob(Omnibus):,0.0,Jarque-Bera (JB):,54544.03
Skew:,1.293,Prob(JB):,0.0
Kurtosis:,4.371,Cond. No.,1.25e+16


### Model 4 - Ridge with Round 1

In [30]:
# Fit the model with 5 folds
alpha = np.arange(0.0001, 20, 25)
param_grid = {'alpha': alpha}
ridge1 = Ridge(fit_intercept=True)
ridge1_gs = GridSearchCV(ridge1, param_grid, cv=5)
ridge1_gs.fit(X1_train, y1_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'alpha': array([ 0.0001])}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score=True, scoring=None, verbose=0)

In [31]:
pred4 = ridge1_gs.predict(X1_test)
ridge1_r = ridge1_gs.score(X1_test, y1_test)
ridge1_mse = mean_squared_error(y1_test, pred4)
ridge1_rmse = np.sqrt(ridge1_mse)
adjustedr4 = 1 - (1-ridge1_r)*(len(y1_test)-1)/(len(y1_test)-X1_test.shape[1]-1)

In [32]:
ridge1_r, ridge1_mse, ridge1_rmse, adjustedr4

(0.15623735819469864,
 283.48441228592361,
 16.836995346139513,
 0.15165449934347197)

### Ridge with Round 2

In [33]:
# Fit the model with 5 folds
alpha = np.arange(0.0001, 20, 25)
param_grid = {'alpha': alpha}
ridge2 = Ridge(fit_intercept=True)
ridge2_gs = GridSearchCV(ridge2, param_grid, cv=5)
ridge2_gs.fit(X2_train, y2_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'alpha': array([ 0.0001])}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score=True, scoring=None, verbose=0)

In [34]:
pred5 = ridge2_gs.predict(X2_test)
ridge2_r = ridge2_gs.score(X2_test, y2_test)
ridge2_mse = mean_squared_error(y2_test, pred5)
ridge2_rmse = np.sqrt(ridge2_mse)
adjustedr5 = 1 - (1-ridge2_r)*(len(y2_test)-1)/(len(y2_test)-X2_test.shape[1]-1)

In [35]:
ridge2_r, ridge2_mse, ridge2_rmse, adjustedr5

(0.15806130677476604,
 280.06158924370328,
 16.735040760144663,
 0.15325422057167037)

### Ridge with all data

In [36]:
# Fit the model with 5 folds
alpha = np.arange(0.0001, 20, 25)
param_grid = {'alpha': alpha}
ridge3 = Ridge(fit_intercept=True)
ridge3_gs = GridSearchCV(ridge3, param_grid, cv=5)
ridge3_gs.fit(X3_train, y3_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'alpha': array([ 0.0001])}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score=True, scoring=None, verbose=0)

In [37]:
pred6 = ridge3_gs.predict(X3_test)
ridge3_r = ridge3_gs.score(X3_test, y3_test)
ridge3_mse = mean_squared_error(y3_test, pred6)
ridge3_rmse = np.sqrt(ridge3_mse)
adjustedr6 = 1 - (1-ridge3_r)*(len(y3_test)-1)/(len(y3_test)-X3_test.shape[1]-1)

In [38]:
ridge3_r, ridge3_mse, ridge3_rmse, adjustedr6

(0.15351768867213056,
 279.2581406545296,
 16.711018540308356,
 0.14860055835658492)

In [39]:
#bikeshare_machine.to_csv('machine_full.csv')