In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime 
import seaborn as sns
%matplotlib inline

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import explained_variance_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV
from sklearn.utils import shuffle
import statsmodels.api as sm

  from pandas.core import datetools


In [3]:
def set_data(file):
    bikeshare_machine = pd.read_csv(file, 
                        parse_dates=['Start date', 'End date', 'start_date_short', 'end_date_short'])
    bikeshare_machine.drop('Unnamed: 0', 1, inplace=True)
    included_cols = ['start_station','end_station','Member Type','time_diff','season','mnth','holiday',
                     'weekday','workingday','weathersit','temp','hum','windspeed','miles',
                     'rush_hour','metro_dist']
    bikeshare_machine = bikeshare_machine[included_cols]
    bikeshare_machine['season'] = bikeshare_machine['season'].astype('category')
    bikeshare_machine['mnth'] = bikeshare_machine['mnth'].astype('category')
    bikeshare_machine['holiday'] = bikeshare_machine['holiday'].astype('category')
    bikeshare_machine['weekday'] = bikeshare_machine['weekday'].astype('category')
    bikeshare_machine['workingday'] = bikeshare_machine['workingday'].astype('category')
    bikeshare_machine['weathersit'] = bikeshare_machine['weathersit'].astype('category')
    bikeshare_machine['Member Type'] = bikeshare_machine['Member Type'].astype('category')
    bikeshare_machine['start_station'] = bikeshare_machine['start_station'].astype('category')
    bikeshare_machine['end_station'] = bikeshare_machine['end_station'].astype('category')
    bikeshare_machine['rush_hour'] = bikeshare_machine['rush_hour'].astype('category')
    col_names = ['start_station', 'end_station','member_type','time_diff','season','month','holiday',
             'weekday','work_day','weather_cat','temperature','humidity','windspeed','miles','rush_hour',
                'metro_dist']
    bikeshare_machine.columns = col_names
    tmin = -8
    tmax = 39
    hum_max = 100
    wind_max = 67
    bikeshare_machine['temp'] = bikeshare_machine['temperature'] * (tmax - tmin) + tmin
    bikeshare_machine['hum'] = bikeshare_machine['humidity'] * 100
    bikeshare_machine['wind'] = bikeshare_machine['windspeed'] * 67
    bikeshare_machine = pd.get_dummies(bikeshare_machine, 
                                 columns=['member_type','holiday','work_day'], drop_first=True)
    bikeshare_machine = pd.get_dummies(bikeshare_machine, 
                                 columns=['start_station','end_station','season','month','weekday','weather_cat'])
    bikeshare_machine = shuffle(bikeshare_machine)
    return bikeshare_machine

In [4]:
file = '/Users/matthewcassi/Documents/Bike-Sharing-Dataset/Bikeshare_Time_Prediction/metro_rush.csv'
bikeshare_machine = set_data(file)

In [5]:
bikeshare_machine = bikeshare_machine.drop(['temperature', 'humidity', 'windspeed'], 1)
bikeshare_machine = bikeshare_machine.rename(columns = {'member_type_Registered':'member_type'})
bikeshare_machine.head()

Unnamed: 0,time_diff,miles,rush_hour,metro_dist,temp,hum,wind,member_type,holiday_1,work_day_1,...,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weather_cat_1,weather_cat_2,weather_cat_3
358466,1.517,0.0,0,0.197601,19.965,61.4167,16.208975,1,0,1,...,0,0,0,1,0,0,0,1,0,0
877012,6.25,0.515173,0,0.133167,14.364151,63.625,28.292425,1,0,1,...,0,0,0,0,1,0,0,1,0,0
714364,6.7,0.566366,0,0.516612,11.896651,75.75,3.167425,1,0,1,...,0,0,0,0,1,0,0,0,1,0
348767,36.6,1.463341,0,0.328093,19.338349,81.0833,12.875725,0,0,0,...,1,0,0,0,0,0,0,0,1,0
1055345,3.767,0.545279,1,0.294425,6.883349,75.2917,6.125475,1,0,1,...,0,0,0,0,1,0,0,0,1,0


### Model 1 - Remove Some Variables that are correlated

In [6]:
# Leave workday, drop weekdays, leave season, drop month
# Workday overlaps with workday/not workday and months overlap with seasons
remove_cols = ['weekday_0', 'weekday_1','weekday_2','weekday_3','weekday_4','weekday_5','weekday_6', 
              'month_1','month_2','month_3','month_4','month_5','month_6','month_7','month_8','month_9',
              'month_10','month_11','month_12','time_diff']
X1 = np.matrix(bikeshare_machine.drop(remove_cols, 1))
y1 = bikeshare_machine['time_diff']

In [7]:
# Split the data into training and testing sets and check the shape
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size = 0.25, random_state=17)
X1_train.shape, X1_test.shape, y1_train.shape, y1_test.shape

((914885, 278), (304962, 278), (914885,), (304962,))

In [8]:
# Fit model
parameters = {'l1_ratio':np.arange(0.003,1,30),
             'alpha': np.arange(0.01, 1, 30)}
elastic1 = ElasticNet()
ecv1 = GridSearchCV(elastic1, param_grid=parameters, cv=5)
ecv1.fit(X1_train, y1_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'l1_ratio': array([ 0.003]), 'alpha': array([ 0.01])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [9]:
ecv_pred1 = ecv1.predict(X1_test)
ecv_score1 = ecv1.score(X1_test, y1_test)
ecv_mse1 = mean_squared_error(y1_test, ecv_pred1)
ecv_rmse1 = np.sqrt(ecv_mse1)
ecv_evar1 = explained_variance_score(y1_test, ecv_pred1)
adjustedr1 = 1 - (1-ecv_score1)*(len(y1_test)-1)/(len(y1_test)-X1_test.shape[1]-1)

In [10]:
ecv_score1, ecv_mse1, ecv_rmse1, ecv_evar1, adjustedr1

(0.40373128188197693,
 82.320635009832884,
 9.0730719720408306,
 0.40374109692754978,
 0.40318723215279351)

### Model 2 - Try reverse of Model 1

In [11]:
# Drop workday, leave weekdays, drop season, leave month
# Workday overlaps with workday/not workday and months overlap with seasons
remove_cols = ['work_day_1','season_1', 'season_2', 'season_3', 'season_4','time_diff']
X2 = bikeshare_machine.drop(remove_cols, 1)
y2 = bikeshare_machine['time_diff']

In [12]:
# Split the data into training and testing sets and check the shape
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size = 0.25, random_state=17)
X2_train.shape, X2_test.shape, y2_train.shape, y2_test.shape

((914885, 292), (304962, 292), (914885,), (304962,))

In [13]:
# Fit model
parameters = {'l1_ratio':np.arange(0.003,1,30),
             'alpha': np.arange(0.01, 1, 30)}
elastic2 = ElasticNet()
ecv2 = GridSearchCV(elastic1, param_grid=parameters, cv=5)
ecv2.fit(X2_train, y2_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'l1_ratio': array([ 0.003]), 'alpha': array([ 0.01])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [14]:
ecv_pred2 = ecv2.predict(X2_test)
ecv_score2 = ecv2.score(X2_test, y2_test)
ecv_mse2 = mean_squared_error(y2_test, ecv_pred2)
ecv_rmse2 = np.sqrt(ecv_mse2)
ecv_evar2 = explained_variance_score(y2_test, ecv_pred2)
adjustedr2 = 1 - (1-ecv_score2)*(len(y2_test)-1)/(len(y2_test)-X2_test.shape[1]-1)

In [15]:
ecv_score2, ecv_mse2, ecv_rmse2, ecv_evar2, adjustedr2

(0.40431081718471773,
 82.240624583857795,
 9.0686616754545319,
 0.40432061077506676,
 0.40373989844542346)

### Model 3 - All variables

In [16]:
# Keep all but time_diff
X3 = bikeshare_machine.drop('time_diff', 1)
y3 = bikeshare_machine['time_diff']

In [17]:
# Split the data into training and testing sets and check the shape
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, test_size = 0.25, random_state=17)
X3_train.shape, X3_test.shape, y3_train.shape, y3_test.shape

((914885, 297), (304962, 297), (914885,), (304962,))

In [18]:
# Fit model
parameters = {'l1_ratio':np.arange(0.003,1,30),
             'alpha': np.arange(0.01, 1, 30)}
elastic3 = ElasticNet()
ecv3 = GridSearchCV(elastic3, param_grid=parameters, cv=5)
ecv3.fit(X3_train, y3_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'l1_ratio': array([ 0.003]), 'alpha': array([ 0.01])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [19]:
ecv_pred3 = ecv3.predict(X3_test)
ecv_score3 = ecv3.score(X3_test, y3_test)
ecv_mse3 = mean_squared_error(y3_test, ecv_pred3)
ecv_rmse3 = np.sqrt(ecv_mse3)
ecv_evar3 = explained_variance_score(y3_test, ecv_pred3)
adjustedr3 = 1 - (1-ecv_score3)*(len(y3_test)-1)/(len(y3_test)-X3_test.shape[1]-1)

In [20]:
ecv_score3, ecv_mse3, ecv_rmse3, ecv_evar3, adjustedr3

(0.4042841026327133,
 82.244312784862132,
 9.0688650218680689,
 0.40429387177967235,
 0.40370337231499254)