In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime 
import seaborn as sns
%matplotlib inline

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import explained_variance_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV
from sklearn.utils import shuffle
import statsmodels.api as sm

  from pandas.core import datetools


In [16]:
def set_data(file):
    bikeshare_machine = pd.read_csv(file, 
                        parse_dates=['Start date', 'End date', 'start_date_short', 'end_date_short'])
    bikeshare_machine.drop('Unnamed: 0', 1, inplace=True)
    included_cols = ['start_station','end_station','Member Type','time_diff','season','mnth','holiday',
                     'weekday','workingday','weathersit','temp','hum','windspeed','miles',
                     'rush_hour','metro_dist']
    bikeshare_machine = bikeshare_machine[included_cols]
    bikeshare_machine['season'] = bikeshare_machine['season'].astype('category')
    bikeshare_machine['mnth'] = bikeshare_machine['mnth'].astype('category')
    bikeshare_machine['holiday'] = bikeshare_machine['holiday'].astype('category')
    bikeshare_machine['weekday'] = bikeshare_machine['weekday'].astype('category')
    bikeshare_machine['workingday'] = bikeshare_machine['workingday'].astype('category')
    bikeshare_machine['weathersit'] = bikeshare_machine['weathersit'].astype('category')
    bikeshare_machine['Member Type'] = bikeshare_machine['Member Type'].astype('category')
    bikeshare_machine['start_station'] = bikeshare_machine['start_station'].astype('category')
    bikeshare_machine['end_station'] = bikeshare_machine['end_station'].astype('category')
    bikeshare_machine['rush_hour'] = bikeshare_machine['rush_hour'].astype('category')
    col_names = ['start_station', 'end_station','member_type','time_diff','season','month','holiday',
             'weekday','work_day','weather_cat','temperature','humidity','windspeed','miles','rush_hour',
                'metro_dist']
    bikeshare_machine.columns = col_names
    tmin = -8
    tmax = 39
    hum_max = 100
    wind_max = 67
    bikeshare_machine['temp'] = bikeshare_machine['temperature'] * (tmax - tmin) + tmin
    bikeshare_machine['hum'] = bikeshare_machine['humidity'] * 100
    bikeshare_machine['wind'] = bikeshare_machine['windspeed'] * 67
    bikeshare_machine = pd.get_dummies(bikeshare_machine, 
                                 columns=['member_type','holiday','work_day'], drop_first=True)
    bikeshare_machine = pd.get_dummies(bikeshare_machine, 
                                 columns=['start_station','end_station','season','month','weekday','weather_cat'])
    bikeshare_machine = shuffle(bikeshare_machine)
    return bikeshare_machine

In [17]:
file = '/Users/matthewcassi/Documents/Bike-Sharing-Dataset/Bikeshare_Time_Prediction/reg_metro_rush.csv'
bikeshare_machine = set_data(file)

In [18]:
bikeshare_machine = bikeshare_machine.drop(['temperature', 'humidity', 'windspeed'], 1)
bikeshare_machine = bikeshare_machine.rename(columns = {'member_type_Registered':'member_type'})
bikeshare_machine.head()

Unnamed: 0,time_diff,miles,rush_hour,metro_dist,temp,hum,wind,holiday_1,work_day_1,start_station_10th & Monroe St NE,...,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weather_cat_1,weather_cat_2,weather_cat_3
153563,8.683,0.684677,0,0.033736,21.845,49.4583,20.45845,0,1,0,...,0,1,0,0,0,0,0,1,0,0
907578,3.8,0.394903,0,0.328093,9.233349,49.0833,17.958211,0,1,0,...,0,1,0,0,0,0,0,1,0,0
861135,3.617,0.46608,0,0.009423,4.455,56.2083,13.000479,0,1,0,...,0,0,0,0,1,0,0,1,0,0
916287,15.35,2.233072,0,0.154365,14.834151,61.5417,15.208129,0,1,0,...,0,0,0,0,1,0,0,1,0,0
321617,23.217,1.852016,0,0.639386,18.946651,42.625,25.833257,0,1,0,...,0,1,0,0,0,0,0,1,0,0


### Model 1 - Remove Some Variables that are correlated

In [19]:
# Remove Lasso Columns while including all of the start/end stations
remove_cols = ['weather_cat_2','time_diff']
X1 = np.matrix(bikeshare_machine.drop(remove_cols, 1))
y1 = bikeshare_machine['time_diff']

In [20]:
# Split the data into training and testing sets and check the shape
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size = 0.25)
X1_train.shape, X1_test.shape, y1_train.shape, y1_test.shape

((762165, 295), (254056, 295), (762165,), (254056,))

In [21]:
# Fit model
model1 = LinearRegression()
model1.fit(X1_train, y1_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [22]:
pred1 = model1.predict(X1_test)
cvscores_model1 = cross_val_score(model1, X1_train, y1_train, cv=5)
model1_r = model1.score(X1_test, y1_test)
model1_mse = mean_squared_error(y1_test, pred1)
model1_rmse = np.sqrt(model1_mse)
adjustedr1 = 1 - (1-model1_r)*(len(y1_test)-1)/(len(y1_test)-X1_test.shape[1]-1)

In [23]:
cvscores_model1, model1_r, adjustedr1, model1_mse, model1_rmse

(array([ -3.14788463e+08,   4.47945514e-01,   4.60274684e-01,
          4.66770620e-01,   4.53205937e-01]),
 0.45824556915088455,
 0.45761577108538765,
 34.824402314314888,
 5.9012204088912732)

In [24]:
model1_sm = sm.OLS(y1_train, X1_train.astype(float)).fit()
model1_sm.summary()

0,1,2,3
Dep. Variable:,time_diff,R-squared:,0.459
Model:,OLS,Adj. R-squared:,0.458
Method:,Least Squares,F-statistic:,2241.0
Date:,"Thu, 14 Dec 2017",Prob (F-statistic):,0.0
Time:,22:29:41,Log-Likelihood:,-2435300.0
No. Observations:,762165,AIC:,4871000.0
Df Residuals:,761876,BIC:,4874000.0
Df Model:,288,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,7.1179,0.010,705.682,0.000,7.098,7.138
x2,-1.2874,0.020,-63.082,0.000,-1.327,-1.247
x3,-0.1034,0.137,-0.756,0.450,-0.372,0.165
x4,0.0467,0.002,22.888,0.000,0.043,0.051
x5,-0.0037,0.001,-5.145,0.000,-0.005,-0.002
x6,-0.0159,0.001,-10.599,0.000,-0.019,-0.013
x7,1.4749,0.051,28.959,0.000,1.375,1.575
x8,1.1120,0.037,29.868,0.000,1.039,1.185
x9,-1.6630,0.155,-10.756,0.000,-1.966,-1.360

0,1,2,3
Omnibus:,727117.135,Durbin-Watson:,1.998
Prob(Omnibus):,0.0,Jarque-Bera (JB):,39183540.227
Skew:,4.634,Prob(JB):,0.0
Kurtosis:,36.882,Cond. No.,1.21e+16


### Model 4 - Ridge with Round 1

In [25]:
# Fit the model with 5 folds
alpha = np.arange(0.0001, 20, 25)
param_grid = {'alpha': alpha}
ridge1 = Ridge(fit_intercept=True)
ridge1_gs = GridSearchCV(ridge1, param_grid, cv=5)
ridge1_gs.fit(X1_train, y1_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'alpha': array([ 0.0001])}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score=True, scoring=None, verbose=0)

In [26]:
pred4 = ridge1_gs.predict(X1_test)
ridge1_r = ridge1_gs.score(X1_test, y1_test)
ridge1_mse = mean_squared_error(y1_test, pred4)
ridge1_rmse = np.sqrt(ridge1_mse)
adjustedr4 = 1 - (1-ridge1_r)*(len(y1_test)-1)/(len(y1_test)-X1_test.shape[1]-1)

In [27]:
ridge1_r, ridge1_mse, ridge1_rmse, adjustedr4

(0.45824562138375524,
 34.824398956744744,
 5.9012201244102682,
 0.45761582337897988)