In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime 
import seaborn as sns
%matplotlib inline

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import explained_variance_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV
from sklearn.utils import shuffle
import statsmodels.api as sm

  from pandas.core import datetools


In [3]:
def set_data(file):
    bikeshare_machine = pd.read_csv(file, 
                        parse_dates=['Start date', 'End date'])
    bikeshare_machine.drop('Unnamed: 0', 1, inplace=True)
    included_cols = ['start_station','end_station','Member Type','time_diff','season','mnth','holiday',
                     'weekday','workingday','weathersit','temp','hum','windspeed','miles',
                     'rush_hour','metro_dist','landmark_dist_start','landmark_dist_end']
    bikeshare_machine = bikeshare_machine[included_cols]
    bikeshare_machine['season'] = bikeshare_machine['season'].astype('category')
    bikeshare_machine['mnth'] = bikeshare_machine['mnth'].astype('category')
    bikeshare_machine['holiday'] = bikeshare_machine['holiday'].astype('category')
    bikeshare_machine['weekday'] = bikeshare_machine['weekday'].astype('category')
    bikeshare_machine['workingday'] = bikeshare_machine['workingday'].astype('category')
    bikeshare_machine['weathersit'] = bikeshare_machine['weathersit'].astype('category')
    bikeshare_machine['Member Type'] = bikeshare_machine['Member Type'].astype('category')
    bikeshare_machine['start_station'] = bikeshare_machine['start_station'].astype('category')
    bikeshare_machine['end_station'] = bikeshare_machine['end_station'].astype('category')
    bikeshare_machine['rush_hour'] = bikeshare_machine['rush_hour'].astype('category')
    col_names = ['start_station', 'end_station','member_type','time_diff','season','month','holiday',
             'weekday','work_day','weather_cat','temperature','humidity','windspeed','miles','rush_hour',
                'metro_dist','landmark_dist_start','landmark_dist_end']
    bikeshare_machine.columns = col_names
    tmin = -8
    tmax = 39
    hum_max = 100
    wind_max = 67
    bikeshare_machine['temp'] = bikeshare_machine['temperature'] * (tmax - tmin) + tmin
    bikeshare_machine['hum'] = bikeshare_machine['humidity'] * 100
    bikeshare_machine['wind'] = bikeshare_machine['windspeed'] * 67
    bikeshare_machine = pd.get_dummies(bikeshare_machine, 
                                 columns=['rush_hour','member_type','holiday','work_day'], drop_first=True)
    bikeshare_machine = pd.get_dummies(bikeshare_machine, 
                                 columns=['start_station','end_station','season','month','weekday','weather_cat'])
    bikeshare_machine = shuffle(bikeshare_machine)
    return bikeshare_machine

In [11]:
file = '/Users/matthewcassi/Documents/Bike-Sharing-Dataset/Bikeshare_Time_Prediction/Casual_RushMetro/landmarks.csv'
bikeshare_machine = set_data(file)

In [12]:
bikeshare_machine = bikeshare_machine.drop(['temperature', 'humidity', 'windspeed'], 1)
bikeshare_machine = bikeshare_machine.rename(columns = {'member_type_Registered':'member_type'})
bikeshare_machine.head()

Unnamed: 0,time_diff,miles,metro_dist,landmark_dist_start,landmark_dist_end,temp,hum,wind,rush_hour_1,member_type,...,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weather_cat_1,weather_cat_2,weather_cat_3
276777,4.7,0.651763,0.380306,1.555143,0.916964,18.398349,83.7917,18.582718,1,1,...,0,0,1,0,0,0,0,0,1,0
159152,26.833,2.898367,0.300527,0.285993,0.62432,24.860849,66.6667,6.834,0,0,...,1,0,0,0,0,0,0,0,1,0
541890,6.833,0.691932,0.391648,0.469097,0.661659,25.996651,57.5417,9.625689,0,1,...,0,0,0,1,0,0,0,1,0,0
470770,9.917,1.106609,0.450642,0.98739,0.21843,22.55,89.7917,8.333393,0,1,...,0,0,0,0,0,1,0,0,1,0
548361,8.25,0.708816,0.152939,0.430721,0.604853,23.294151,71.2083,13.999918,0,1,...,0,1,0,0,0,0,0,1,0,0


In [13]:
bikeshare_machine = bikeshare_machine[bikeshare_machine['member_type'] == 1]

### Model 1 - Remove Some Variables that are correlated

In [14]:
# Leave workday, drop weekdays, leave season, drop month
# Workday overlaps with workday/not workday and months overlap with seasons
remove_cols = ['weekday_0', 'weekday_1','weekday_2','weekday_3','weekday_4','weekday_5','weekday_6', 
              'month_1','month_2','month_3','month_4','month_5','month_6','month_7','month_8','month_9',
              'month_10','month_11','month_12','time_diff']
X1 = np.matrix(bikeshare_machine.drop(remove_cols, 1))
y1 = bikeshare_machine['time_diff']

In [15]:
# Split the data into training and testing sets and check the shape
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size = 0.25)
X1_train.shape, X1_test.shape, y1_train.shape, y1_test.shape

((762165, 280), (254056, 280), (762165,), (254056,))

In [16]:
# Fit model
model1 = LinearRegression()
model1.fit(X1_train, y1_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [17]:
pred1 = model1.predict(X1_test)
cvscores_model1 = cross_val_score(model1, X1_train, y1_train, cv=5)
model1_r = model1.score(X1_test, y1_test)
model1_mse = mean_squared_error(y1_test, pred1)
model1_rmse = np.sqrt(model1_mse)
adjustedr1 = 1 - (1-model1_r)*(len(y1_test)-1)/(len(y1_test)-X1_test.shape[1]-1)

In [18]:
cvscores_model1, model1_r, adjustedr1, model1_mse, model1_rmse

(array([  4.55456150e-01,  -4.55047940e+10,   4.55888789e-01,
          4.65284324e-01,   4.53909719e-01]),
 0.45489773586750382,
 0.45429630296845103,
 35.195495939430515,
 5.9325791979062963)

In [19]:
model1_sm = sm.OLS(y1_train, X1_train.astype(float)).fit()
model1_sm.summary()

0,1,2,3
Dep. Variable:,time_diff,R-squared:,0.459
Model:,OLS,Adj. R-squared:,0.459
Method:,Least Squares,F-statistic:,2374.0
Date:,"Sat, 16 Dec 2017",Prob (F-statistic):,0.0
Time:,15:04:16,Log-Likelihood:,-2434600.0
No. Observations:,762165,AIC:,4870000.0
Df Residuals:,761892,BIC:,4873000.0
Df Model:,272,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,7.1301,0.010,707.505,0.000,7.110,7.150
x2,-0.2706,0.174,-1.559,0.119,-0.611,0.069
x3,0.1952,0.162,1.203,0.229,-0.123,0.513
x4,-0.0107,0.104,-0.103,0.918,-0.214,0.192
x5,0.0535,0.001,37.832,0.000,0.051,0.056
x6,-0.0006,0.001,-0.959,0.338,-0.002,0.001
x7,-0.0112,0.001,-7.750,0.000,-0.014,-0.008
x8,-1.2951,0.020,-63.477,0.000,-1.335,-1.255
const,2.7221,0.089,30.745,0.000,2.549,2.896

0,1,2,3
Omnibus:,725728.559,Durbin-Watson:,1.999
Prob(Omnibus):,0.0,Jarque-Bera (JB):,38849510.135
Skew:,4.622,Prob(JB):,0.0
Kurtosis:,36.733,Cond. No.,1.21e+16


### Model 2 - Try reverse of Model 1

In [20]:
# Drop workday, leave weekdays, drop season, leave month
# Workday overlaps with workday/not workday and months overlap with seasons
remove_cols = ['work_day_1','season_1', 'season_2', 'season_3', 'season_4','time_diff']
X2 = np.matrix(bikeshare_machine.drop(remove_cols, 1))
y2 = bikeshare_machine['time_diff']

In [21]:
# Split the data into training and testing sets and check the shape
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size = 0.25)
X2_train.shape, X2_test.shape, y2_train.shape, y2_test.shape

((762165, 294), (254056, 294), (762165,), (254056,))

In [22]:
# Fit model
model2 = LinearRegression()
model2.fit(X2_train, y2_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [23]:
pred2 = model2.predict(X2_test)
cvscores_model2 = cross_val_score(model2, X2_train, y2_train, cv=5)
model2_r = model2.score(X2_test, y2_test)
model2_mse = mean_squared_error(y2_test, pred2)
model2_rmse = np.sqrt(model2_mse)
adjustedr2 = 1 - (1-model2_r)*(len(y2_test)-1)/(len(y2_test)-X2_test.shape[1]-1)

In [24]:
cvscores_model2, model2_r, adjustedr2, model2_mse, model2_rmse

(array([  4.61436076e-01,   4.57119519e-01,   4.54775217e-01,
          4.59238168e-01,  -1.20998599e+13]),
 0.45916653274999553,
 0.4585399390678635,
 34.668611472125654,
 5.8880057296274479)

In [25]:
# Use statsmodel to check the statistical side of the model
model2_sm = sm.OLS(y2_train, X2_train.astype(float)).fit()
model2_sm.summary()

0,1,2,3
Dep. Variable:,time_diff,R-squared:,0.458
Model:,OLS,Adj. R-squared:,0.458
Method:,Least Squares,F-statistic:,2262.0
Date:,"Sat, 16 Dec 2017",Prob (F-statistic):,0.0
Time:,15:06:18,Log-Likelihood:,-2435800.0
No. Observations:,762165,AIC:,4872000.0
Df Residuals:,761879,BIC:,4876000.0
Df Model:,285,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,7.1216,0.010,704.803,0.000,7.102,7.141
x2,-0.4108,0.172,-2.395,0.017,-0.747,-0.075
x3,0.3393,0.162,2.095,0.036,0.022,0.657
x4,-0.0087,0.104,-0.084,0.933,-0.212,0.194
x5,0.0467,0.002,22.898,0.000,0.043,0.051
x6,-0.0040,0.001,-5.516,0.000,-0.005,-0.003
x7,-0.0149,0.001,-9.941,0.000,-0.018,-0.012
x8,-1.3107,0.020,-64.228,0.000,-1.351,-1.271
const,2.6833,0.091,29.563,0.000,2.505,2.861

0,1,2,3
Omnibus:,726686.249,Durbin-Watson:,1.999
Prob(Omnibus):,0.0,Jarque-Bera (JB):,39002958.425
Skew:,4.631,Prob(JB):,0.0
Kurtosis:,36.799,Cond. No.,1.21e+16


### Model 3 - All variables

In [26]:
# Keep all but time_diff
X3 = np.matrix(bikeshare_machine.drop('time_diff', 1))
y3 = bikeshare_machine['time_diff']

In [27]:
# Split the data into training and testing sets and check the shape
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, test_size = 0.25)
X3_train.shape, X3_test.shape, y3_train.shape, y3_test.shape

((762165, 299), (254056, 299), (762165,), (254056,))

In [28]:
# Fit model
model3 = LinearRegression()
model3.fit(X3_train, y3_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [29]:
pred3 = model3.predict(X3_test)
cvscores_model3 = cross_val_score(model3, X3_train, y3_train, cv=5)
model3_r = model3.score(X3_test, y3_test)
model3_mse = mean_squared_error(y3_test, pred3)
model3_rmse = np.sqrt(model3_mse)
adjustedr3 = 1 - (1-model3_r)*(len(y3_test)-1)/(len(y3_test)-X3_test.shape[1]-1)

In [30]:
cvscores_model3, model3_r, adjustedr3, model3_mse, model3_rmse

(array([  4.53711113e-01,  -8.68304864e+10,   4.61608190e-01,
          4.57377059e-01,  -2.11638639e+13]),
 0.4585805553488147,
 0.45794260229962291,
 34.969475251081327,
 5.9134994082253298)

In [31]:
# Use statsmodel to check the statistical side of the model
model3_sm = sm.OLS(y3_train, X3_train.astype(float)).fit()
model3_sm.summary()

0,1,2,3
Dep. Variable:,time_diff,R-squared:,0.459
Model:,OLS,Adj. R-squared:,0.458
Method:,Least Squares,F-statistic:,2240.0
Date:,"Sat, 16 Dec 2017",Prob (F-statistic):,0.0
Time:,15:08:25,Log-Likelihood:,-2434700.0
No. Observations:,762165,AIC:,4870000.0
Df Residuals:,761876,BIC:,4873000.0
Df Model:,288,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,7.1146,0.010,705.023,0.000,7.095,7.134
x2,-0.4269,0.171,-2.493,0.013,-0.763,-0.091
x3,0.2999,0.162,1.853,0.064,-0.017,0.617
x4,-0.1585,0.156,-1.019,0.308,-0.463,0.146
x5,0.0444,0.002,21.775,0.000,0.040,0.048
x6,-0.0036,0.001,-5.013,0.000,-0.005,-0.002
x7,-0.0141,0.001,-9.437,0.000,-0.017,-0.011
x8,-1.2853,0.020,-63.041,0.000,-1.325,-1.245
const,2.3701,0.087,27.342,0.000,2.200,2.540

0,1,2,3
Omnibus:,722147.777,Durbin-Watson:,1.999
Prob(Omnibus):,0.0,Jarque-Bera (JB):,37798220.615
Skew:,4.593,Prob(JB):,0.0
Kurtosis:,36.255,Cond. No.,1.21e+16


### Model 4 - Ridge with Round 1

In [32]:
# Fit the model with 5 folds
alpha = np.arange(0.0001, 20, 25)
param_grid = {'alpha': alpha}
ridge1 = Ridge(fit_intercept=True)
ridge1_gs = GridSearchCV(ridge1, param_grid, cv=5)
ridge1_gs.fit(X1_train, y1_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'alpha': array([ 0.0001])}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score=True, scoring=None, verbose=0)

In [33]:
pred4 = ridge1_gs.predict(X1_test)
ridge1_r = ridge1_gs.score(X1_test, y1_test)
ridge1_mse = mean_squared_error(y1_test, pred4)
ridge1_rmse = np.sqrt(ridge1_mse)
adjustedr4 = 1 - (1-ridge1_r)*(len(y1_test)-1)/(len(y1_test)-X1_test.shape[1]-1)

In [34]:
ridge1_r, ridge1_mse, ridge1_rmse, adjustedr4

(0.45489773276065715,
 35.195496140029583,
 5.9325792148128613,
 0.45429629985817654)

### Ridge with Round 2

In [35]:
# Fit the model with 5 folds
alpha = np.arange(0.0001, 20, 25)
param_grid = {'alpha': alpha}
ridge2 = Ridge(fit_intercept=True)
ridge2_gs = GridSearchCV(ridge2, param_grid, cv=5)
ridge2_gs.fit(X2_train, y2_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'alpha': array([ 0.0001])}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score=True, scoring=None, verbose=0)

In [36]:
pred5 = ridge2_gs.predict(X2_test)
ridge2_r = ridge2_gs.score(X2_test, y2_test)
ridge2_mse = mean_squared_error(y2_test, pred5)
ridge2_rmse = np.sqrt(ridge2_mse)
adjustedr5 = 1 - (1-ridge2_r)*(len(y2_test)-1)/(len(y2_test)-X2_test.shape[1]-1)

In [37]:
ridge2_r, ridge2_mse, ridge2_rmse, adjustedr5

(0.45916654622072428,
 34.668610608622394,
 5.8880056563001357,
 0.4585399525541991)

### Ridge with all data

In [38]:
# Fit the model with 5 folds
alpha = np.arange(0.0001, 20, 25)
param_grid = {'alpha': alpha}
ridge3 = Ridge(fit_intercept=True)
ridge3_gs = GridSearchCV(ridge3, param_grid, cv=5)
ridge3_gs.fit(X3_train, y3_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'alpha': array([ 0.0001])}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score=True, scoring=None, verbose=0)

In [39]:
pred6 = ridge3_gs.predict(X3_test)
ridge3_r = ridge3_gs.score(X3_test, y3_test)
ridge3_mse = mean_squared_error(y3_test, pred6)
ridge3_rmse = np.sqrt(ridge3_mse)
adjustedr6 = 1 - (1-ridge3_r)*(len(y3_test)-1)/(len(y3_test)-X3_test.shape[1]-1)

In [40]:
ridge3_r, ridge3_mse, ridge3_rmse, adjustedr6

(0.45858057082352299,
 34.969474251592978,
 5.9134993237162865,
 0.45794261779256495)

In [None]:
#bikeshare_machine.to_csv('machine_full.csv')