In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime 
import seaborn as sns
%matplotlib inline

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import explained_variance_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV
from sklearn.utils import shuffle
import statsmodels.api as sm

  from pandas.core import datetools


In [3]:
def set_data(file):
    bikeshare_machine = pd.read_csv(file, 
                        parse_dates=['Start date', 'End date', 'start_date_short', 'end_date_short'])
    bikeshare_machine.drop('Unnamed: 0', 1, inplace=True)
    included_cols = ['Member Type','time_diff','season','mnth','holiday',
                     'weekday','workingday','weathersit','temp','hum','windspeed','miles',
                     'rush_hour','metro_dist']
    bikeshare_machine = bikeshare_machine[included_cols]
    bikeshare_machine['season'] = bikeshare_machine['season'].astype('category')
    bikeshare_machine['mnth'] = bikeshare_machine['mnth'].astype('category')
    bikeshare_machine['holiday'] = bikeshare_machine['holiday'].astype('category')
    bikeshare_machine['weekday'] = bikeshare_machine['weekday'].astype('category')
    bikeshare_machine['workingday'] = bikeshare_machine['workingday'].astype('category')
    bikeshare_machine['weathersit'] = bikeshare_machine['weathersit'].astype('category')
    bikeshare_machine['Member Type'] = bikeshare_machine['Member Type'].astype('category')
    bikeshare_machine['rush_hour'] = bikeshare_machine['rush_hour'].astype('category')
    col_names = ['member_type','time_diff','season','month','holiday',
             'weekday','work_day','weather_cat','temperature','humidity','windspeed','miles','rush_hour',
                'metro_dist']
    bikeshare_machine.columns = col_names
    tmin = -8
    tmax = 39
    hum_max = 100
    wind_max = 67
    bikeshare_machine['temp'] = bikeshare_machine['temperature'] * (tmax - tmin) + tmin
    bikeshare_machine['hum'] = bikeshare_machine['humidity'] * 100
    bikeshare_machine['wind'] = bikeshare_machine['windspeed'] * 67
    bikeshare_machine = pd.get_dummies(bikeshare_machine, 
                                 columns=['member_type','holiday','work_day'], drop_first=True)
    bikeshare_machine = pd.get_dummies(bikeshare_machine, 
                                 columns=['season','month','weekday','weather_cat'])
    bikeshare_machine = shuffle(bikeshare_machine)
    return bikeshare_machine

In [4]:
file = '/Users/matthewcassi/Documents/Bike-Sharing-Dataset/Bikeshare_Time_Prediction/metro_rush.csv'
bikeshare_machine = set_data(file)

In [5]:
bikeshare_machine = bikeshare_machine.drop(['temperature', 'humidity', 'windspeed'], 1)
bikeshare_machine = bikeshare_machine.rename(columns = {'member_type_Registered':'member_type'})
bikeshare_machine.head()

Unnamed: 0,time_diff,miles,rush_hour,metro_dist,temp,hum,wind,member_type,holiday_1,work_day_1,...,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weather_cat_1,weather_cat_2,weather_cat_3
375865,3.583,0.269883,0,0.133167,15.6175,73.9167,18.416893,0,0,1,...,0,0,1,0,0,0,0,0,1,0
463725,16.5,0.812067,0,0.514459,22.706651,71.375,7.708618,0,0,0,...,1,0,0,0,0,0,0,1,0,0
4983,13.083,1.422636,0,0.09643,4.424356,30.2174,14.217668,1,0,1,...,0,1,0,0,0,0,0,1,0,0
347591,47.45,0.245477,0,0.149435,19.338349,81.0833,12.875725,0,0,0,...,1,0,0,0,0,0,0,0,1,0
1010807,16.833,0.734059,0,0.294425,4.650849,40.0833,14.458064,1,0,1,...,0,1,0,0,0,0,0,1,0,0


### Lasso for Variables

In [6]:
X = bikeshare_machine.drop('time_diff', 1)
y = bikeshare_machine['time_diff']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=17)

In [7]:
lasso = Lasso(fit_intercept=True)
alpha = np.arange(0.00001, 20, 25)
param_grid = {'alpha': alpha}
lasso_gs = GridSearchCV(lasso, param_grid, cv=5)
lasso_gs.fit(X_train, y_train)



GridSearchCV(cv=5, error_score='raise',
       estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'alpha': array([  1.00000e-05])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [8]:
coef = lasso_gs.best_estimator_.coef_
coef

array([  6.41855476e+00,  -1.45038478e+00,  -4.20016567e-01,
         8.19276515e-02,  -5.01010615e-03,  -2.48087916e-02,
        -1.33368496e+01,  -4.43332162e-01,  -1.04803655e+00,
        -7.82100683e-02,   1.33936810e-02,   1.12589945e-01,
        -1.03319657e-01,   2.97161016e-02,  -1.08743354e-05,
         5.78272877e-01,   1.19317211e+00,   8.16467880e-01,
        -2.62510968e-01,  -6.43097840e-01,  -3.62860616e-01,
        -3.16932278e-01,   1.48389445e-01,   2.19137169e-01,
        -1.79934231e-01,   0.00000000e+00,   7.59883431e-02,
        -1.75521849e-01,  -2.35913651e-01,  -1.22223473e-01,
         1.00094409e-01,   2.27740418e-01,   3.10097440e-01,
        -0.00000000e+00,  -5.27389801e-01])

In [9]:
column = X_train.columns
df = pd.DataFrame({'col': column, 'coef': coef})
small_df = df[df['coef'] != 0]
cols = list(small_df['col'])
cols

['miles',
 'rush_hour',
 'metro_dist',
 'temp',
 'hum',
 'wind',
 'member_type',
 'holiday_1',
 'work_day_1',
 'season_1',
 'season_2',
 'season_3',
 'season_4',
 'month_1',
 'month_2',
 'month_3',
 'month_4',
 'month_5',
 'month_6',
 'month_7',
 'month_8',
 'month_9',
 'month_10',
 'month_11',
 'month_12',
 'weekday_1',
 'weekday_2',
 'weekday_3',
 'weekday_4',
 'weekday_5',
 'weekday_6',
 'weather_cat_1',
 'weather_cat_3']

In [10]:
df

Unnamed: 0,coef,col
0,6.418555,miles
1,-1.450385,rush_hour
2,-0.420017,metro_dist
3,0.081928,temp
4,-0.00501,hum
5,-0.024809,wind
6,-13.33685,member_type
7,-0.443332,holiday_1
8,-1.048037,work_day_1
9,-0.07821,season_1
