In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime 
import seaborn as sns
%matplotlib inline

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import explained_variance_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV
from sklearn.utils import shuffle
import statsmodels.api as sm

  from pandas.core import datetools


In [3]:
def set_data(file):
    bikeshare_machine = pd.read_csv(file, 
                        parse_dates=['Start date', 'End date', 'start_date_short', 'end_date_short'])
    bikeshare_machine.drop('Unnamed: 0', 1, inplace=True)
    included_cols = ['Member Type','time_diff','season','mnth','holiday',
                     'weekday','workingday','weathersit','temp','hum','windspeed','miles',
                     'rush_hour','metro_dist']
    bikeshare_machine = bikeshare_machine[included_cols]
    bikeshare_machine['season'] = bikeshare_machine['season'].astype('category')
    bikeshare_machine['mnth'] = bikeshare_machine['mnth'].astype('category')
    bikeshare_machine['holiday'] = bikeshare_machine['holiday'].astype('category')
    bikeshare_machine['weekday'] = bikeshare_machine['weekday'].astype('category')
    bikeshare_machine['workingday'] = bikeshare_machine['workingday'].astype('category')
    bikeshare_machine['weathersit'] = bikeshare_machine['weathersit'].astype('category')
    bikeshare_machine['Member Type'] = bikeshare_machine['Member Type'].astype('category')
    bikeshare_machine['rush_hour'] = bikeshare_machine['rush_hour'].astype('category')
    col_names = ['member_type','time_diff','season','month','holiday',
             'weekday','work_day','weather_cat','temperature','humidity','windspeed','miles','rush_hour',
                'metro_dist']
    bikeshare_machine.columns = col_names
    tmin = -8
    tmax = 39
    hum_max = 100
    wind_max = 67
    bikeshare_machine['temp'] = bikeshare_machine['temperature'] * (tmax - tmin) + tmin
    bikeshare_machine['hum'] = bikeshare_machine['humidity'] * 100
    bikeshare_machine['wind'] = bikeshare_machine['windspeed'] * 67
    bikeshare_machine = pd.get_dummies(bikeshare_machine, 
                                 columns=['member_type','holiday','work_day'], drop_first=True)
    bikeshare_machine = pd.get_dummies(bikeshare_machine, 
                                 columns=['season','month','weekday','weather_cat'])
    bikeshare_machine = shuffle(bikeshare_machine)
    return bikeshare_machine

In [4]:
file = '/Users/matthewcassi/Documents/Bike-Sharing-Dataset/Bikeshare_Time_Prediction/casual_metro_rush.csv'
bikeshare_machine = set_data(file)

In [5]:
bikeshare_machine = bikeshare_machine.drop(['temperature', 'humidity', 'windspeed'], 1)
bikeshare_machine = bikeshare_machine.rename(columns = {'member_type_Registered':'member_type'})
bikeshare_machine.head()

Unnamed: 0,time_diff,miles,rush_hour,metro_dist,temp,hum,wind,holiday_1,work_day_1,season_1,...,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weather_cat_1,weather_cat_2,weather_cat_3
47373,16.65,1.426679,0,0.047985,17.85,87.0,13.499964,0,1,0,...,0,0,0,1,0,0,0,0,1,0
103088,5.283,0.630874,1,0.294425,25.291651,41.5,8.416607,0,1,0,...,0,0,0,0,0,1,0,1,0,0
94030,8.1,0.914571,0,0.129048,23.646651,60.5,16.958236,0,1,0,...,0,0,0,1,0,0,0,1,0,0
110246,10.633,1.660194,0,0.252381,29.874151,48.0833,11.042471,0,0,0,...,1,0,0,0,0,0,0,1,0,0
54611,16.733,1.175429,0,0.485937,14.520849,59.0,15.292482,0,1,0,...,0,0,0,0,0,1,0,1,0,0


### Lasso for Variables

In [6]:
X = bikeshare_machine.drop('time_diff', 1)
y = bikeshare_machine['time_diff']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

In [7]:
lasso = Lasso(fit_intercept=True)
alpha = np.arange(0.00001, 20, 25)
param_grid = {'alpha': alpha}
lasso_gs = GridSearchCV(lasso, param_grid, cv=5)
lasso_gs.fit(X_train, y_train)



GridSearchCV(cv=5, error_score='raise',
       estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'alpha': array([  1.00000e-05])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [8]:
coef = lasso_gs.best_estimator_.coef_
coef

array([ 3.27240087, -5.22856121, -0.12989261,  0.1950043 , -0.01373751,
       -0.06027718, -1.00361062, -3.02129284,  0.07782866,  0.20259759,
        0.26017543, -0.49852536,  0.05060597,  1.10364553,  3.19861679,
        3.32351329,  0.76888845, -1.79225235, -1.95650483, -0.83649529,
       -0.95463695,  1.0310668 ,  1.91788917, -0.13552017,  0.01375866,
        0.61466087, -1.09499816, -0.65410081, -0.57649437,  0.62376866,
        0.10297077,  0.99096102,  0.07179298, -3.07634101])

In [9]:
column = X_train.columns
df = pd.DataFrame({'col': column, 'coef': coef})
small_df = df[df['coef'] != 0]
cols = list(small_df['col'])
cols

['miles',
 'rush_hour',
 'metro_dist',
 'temp',
 'hum',
 'wind',
 'holiday_1',
 'work_day_1',
 'season_1',
 'season_2',
 'season_3',
 'season_4',
 'month_1',
 'month_2',
 'month_3',
 'month_4',
 'month_5',
 'month_6',
 'month_7',
 'month_8',
 'month_9',
 'month_10',
 'month_11',
 'month_12',
 'weekday_0',
 'weekday_1',
 'weekday_2',
 'weekday_3',
 'weekday_4',
 'weekday_5',
 'weekday_6',
 'weather_cat_1',
 'weather_cat_2',
 'weather_cat_3']

In [10]:
df

Unnamed: 0,coef,col
0,3.272401,miles
1,-5.228561,rush_hour
2,-0.129893,metro_dist
3,0.195004,temp
4,-0.013738,hum
5,-0.060277,wind
6,-1.003611,holiday_1
7,-3.021293,work_day_1
8,0.077829,season_1
9,0.202598,season_2
