In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime 
import seaborn as sns
%matplotlib inline

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import explained_variance_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV
from sklearn.utils import shuffle
import statsmodels.api as sm

  from pandas.core import datetools


In [3]:
def set_data(file):
    bikeshare_machine = pd.read_csv(file, 
                        parse_dates=['Start date', 'End date', 'start_date_short', 'end_date_short'])
    bikeshare_machine.drop('Unnamed: 0', 1, inplace=True)
    included_cols = ['Member Type','time_diff','season','mnth','holiday',
                     'weekday','workingday','weathersit','temp','hum','windspeed','miles',
                     'rush_hour','metro_dist']
    bikeshare_machine = bikeshare_machine[included_cols]
    bikeshare_machine['season'] = bikeshare_machine['season'].astype('category')
    bikeshare_machine['mnth'] = bikeshare_machine['mnth'].astype('category')
    bikeshare_machine['holiday'] = bikeshare_machine['holiday'].astype('category')
    bikeshare_machine['weekday'] = bikeshare_machine['weekday'].astype('category')
    bikeshare_machine['workingday'] = bikeshare_machine['workingday'].astype('category')
    bikeshare_machine['weathersit'] = bikeshare_machine['weathersit'].astype('category')
    bikeshare_machine['Member Type'] = bikeshare_machine['Member Type'].astype('category')
    bikeshare_machine['rush_hour'] = bikeshare_machine['rush_hour'].astype('category')
    col_names = ['member_type','time_diff','season','month','holiday',
             'weekday','work_day','weather_cat','temperature','humidity','windspeed','miles','rush_hour',
                'metro_dist']
    bikeshare_machine.columns = col_names
    tmin = -8
    tmax = 39
    hum_max = 100
    wind_max = 67
    bikeshare_machine['temp'] = bikeshare_machine['temperature'] * (tmax - tmin) + tmin
    bikeshare_machine['hum'] = bikeshare_machine['humidity'] * 100
    bikeshare_machine['wind'] = bikeshare_machine['windspeed'] * 67
    bikeshare_machine = pd.get_dummies(bikeshare_machine, 
                                 columns=['member_type','holiday','work_day'], drop_first=True)
    bikeshare_machine = pd.get_dummies(bikeshare_machine, 
                                 columns=['season','month','weekday','weather_cat'])
    bikeshare_machine = shuffle(bikeshare_machine)
    return bikeshare_machine

In [4]:
file = '/Users/matthewcassi/Documents/Bike-Sharing-Dataset/Bikeshare_Time_Prediction/reg_metro_rush.csv'
bikeshare_machine = set_data(file)

In [5]:
bikeshare_machine = bikeshare_machine.drop(['temperature', 'humidity', 'windspeed'], 1)
bikeshare_machine = bikeshare_machine.rename(columns = {'member_type_Registered':'member_type'})
bikeshare_machine.head()

Unnamed: 0,time_diff,miles,rush_hour,metro_dist,temp,hum,wind,holiday_1,work_day_1,season_1,...,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weather_cat_1,weather_cat_2,weather_cat_3
701641,11.067,1.314794,1,0.09643,7.549151,58.5833,15.375093,0,1,0,...,0,0,0,0,0,1,0,0,1,0
517238,20.183,1.749633,0,0.250981,28.503349,65.0417,8.7502,0,1,0,...,0,0,1,0,0,0,0,1,0,0
100399,6.183,0.726468,0,0.427048,1.229108,43.7273,16.636703,0,1,1,...,0,1,0,0,0,0,0,1,0,0
402058,7.983,0.814219,0,0.380306,22.785,63.9167,9.500332,0,1,0,...,0,0,0,0,1,0,0,1,0,0
695415,8.283,1.078723,0,0.669037,7.98,70.3333,7.12545,0,1,0,...,0,1,0,0,0,0,0,1,0,0


### Lasso for Variables

In [6]:
X = bikeshare_machine.drop('time_diff', 1)
y = bikeshare_machine['time_diff']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

In [7]:
lasso = Lasso(fit_intercept=True)
alpha = np.arange(0.00001, 20, 25)
param_grid = {'alpha': alpha}
lasso_gs = GridSearchCV(lasso, param_grid, cv=5)
lasso_gs.fit(X_train, y_train)



GridSearchCV(cv=5, error_score='raise',
       estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'alpha': array([  1.00000e-05])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [8]:
coef = lasso_gs.best_estimator_.coef_
coef

array([  7.20234036e+00,  -1.46163996e+00,  -3.38217138e-01,
         5.33312773e-02,  -4.07838453e-03,  -1.75288293e-02,
        -3.22846054e-01,  -5.08187217e-01,  -5.70757033e-02,
         4.04471318e-06,   2.76139040e-02,  -6.28207773e-03,
        -1.54163280e-01,  -2.70495636e-01,   5.66402631e-02,
         7.50757661e-01,   9.11981405e-01,   2.58241222e-01,
        -1.68816120e-01,  -4.25283177e-02,  -3.77971287e-02,
         6.30375325e-02,  -1.09539848e-01,  -3.14004225e-01,
        -2.35132377e-02,   1.46128654e-03,  -6.94517687e-02,
        -1.43200384e-01,  -5.84771777e-02,   2.36423924e-02,
         2.29150045e-01,   2.10452209e-01,   0.00000000e+00,
        -3.06331737e-01])

In [9]:
column = X_train.columns
df = pd.DataFrame({'col': column, 'coef': coef})
small_df = df[df['coef'] != 0]
cols = list(small_df['col'])
cols

['miles',
 'rush_hour',
 'metro_dist',
 'temp',
 'hum',
 'wind',
 'holiday_1',
 'work_day_1',
 'season_1',
 'season_2',
 'season_3',
 'season_4',
 'month_1',
 'month_2',
 'month_3',
 'month_4',
 'month_5',
 'month_6',
 'month_7',
 'month_8',
 'month_9',
 'month_10',
 'month_11',
 'month_12',
 'weekday_0',
 'weekday_1',
 'weekday_2',
 'weekday_3',
 'weekday_4',
 'weekday_5',
 'weekday_6',
 'weather_cat_1',
 'weather_cat_3']

In [10]:
df

Unnamed: 0,coef,col
0,7.20234,miles
1,-1.46164,rush_hour
2,-0.338217,metro_dist
3,0.053331,temp
4,-0.004078,hum
5,-0.017529,wind
6,-0.322846,holiday_1
7,-0.508187,work_day_1
8,-0.057076,season_1
9,4e-06,season_2
