In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.utils import shuffle
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV

In [2]:
def set_data(file):
    bikeshare_machine = pd.read_csv(file, 
                        parse_dates=['Start date', 'End date'])
    bikeshare_machine.drop('Unnamed: 0', 1, inplace=True)
    included_cols = ['Member Type','time_diff','season','mnth','holiday',
                     'weekday','workingday','weathersit','temp','hum','windspeed','miles',
                     'rush_hour','metro_dist','landmark_dist_start','landmark_dist_end']
    bikeshare_machine = bikeshare_machine[included_cols]
    bikeshare_machine['season'] = bikeshare_machine['season'].astype('category')
    bikeshare_machine['mnth'] = bikeshare_machine['mnth'].astype('category')
    bikeshare_machine['holiday'] = bikeshare_machine['holiday'].astype('category')
    bikeshare_machine['weekday'] = bikeshare_machine['weekday'].astype('category')
    bikeshare_machine['workingday'] = bikeshare_machine['workingday'].astype('category')
    bikeshare_machine['weathersit'] = bikeshare_machine['weathersit'].astype('category')
    bikeshare_machine['Member Type'] = bikeshare_machine['Member Type'].astype('category')
    bikeshare_machine['rush_hour'] = bikeshare_machine['rush_hour'].astype('category')
    col_names = ['member_type','time_diff','season','month','holiday',
             'weekday','work_day','weather_cat','temperature','humidity','windspeed','miles','rush_hour',
                'metro_dist','landmark_dist_start','landmark_dist_end']
    bikeshare_machine.columns = col_names
    tmin = -8
    tmax = 39
    hum_max = 100
    wind_max = 67
    bikeshare_machine['temp'] = bikeshare_machine['temperature'] * (tmax - tmin) + tmin
    bikeshare_machine['hum'] = bikeshare_machine['humidity'] * 100
    bikeshare_machine['wind'] = bikeshare_machine['windspeed'] * 67
    bikeshare_machine = pd.get_dummies(bikeshare_machine, 
                                 columns=['rush_hour','member_type','holiday','work_day'], drop_first=True)
    bikeshare_machine = pd.get_dummies(bikeshare_machine, 
                                 columns=['season','month','weekday','weather_cat'])
    bikeshare_machine = bikeshare_machine.drop(['temperature', 'humidity', 'windspeed'], 1)        
    bikeshare_machine = shuffle(bikeshare_machine)
    return bikeshare_machine

In [3]:
file = '/Users/matthewcassi/Documents/Bike-Sharing-Dataset/Bikeshare_Time_Prediction/Casual_RushMetro/landmarks.csv'
bs = set_data(file)

In [4]:
bs.columns

Index(['time_diff', 'miles', 'metro_dist', 'landmark_dist_start',
       'landmark_dist_end', 'temp', 'hum', 'wind', 'rush_hour_1',
       'member_type_Registered', 'holiday_1', 'work_day_1', 'season_1',
       'season_2', 'season_3', 'season_4', 'month_1', 'month_2', 'month_3',
       'month_4', 'month_5', 'month_6', 'month_7', 'month_8', 'month_9',
       'month_10', 'month_11', 'month_12', 'weekday_0', 'weekday_1',
       'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6',
       'weather_cat_1', 'weather_cat_2', 'weather_cat_3'],
      dtype='object')

In [5]:
bs=bs.rename(columns = {'member_type_Registered':'member_type'})
bs.head()

Unnamed: 0,time_diff,miles,metro_dist,landmark_dist_start,landmark_dist_end,temp,hum,wind,rush_hour_1,member_type,...,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weather_cat_1,weather_cat_2,weather_cat_3
788677,19.3,0.0,0.065011,0.298604,0.298604,13.0325,91.0,9.249618,0,1,...,0,1,0,0,0,0,0,0,0,1
526864,65.0,1.579787,0.485937,0.485473,0.257095,24.508349,47.0,18.54225,1,0,...,0,1,0,0,0,0,0,1,0,0
750683,29.5,2.437893,0.165341,1.627072,0.424481,4.494151,58.0,16.083886,0,1,...,0,0,0,0,1,0,0,1,0,0
222665,6.117,0.66821,0.485937,0.485473,0.401587,27.915849,67.7083,13.875164,0,1,...,0,0,0,1,0,0,0,0,1,0
1094951,11.183,0.817557,0.191363,0.688921,0.086349,14.834151,61.5417,15.208129,1,1,...,0,0,0,0,1,0,0,1,0,0


In [6]:
X = bs.drop('member_type', 1)
y = bs['member_type']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((914885, 37), (304962, 37), (914885,), (304962,))

### GridSearch Logistic Regression

In [7]:
c_space = np.linspace(0.01, 30, 20)
class_weight = [{0:.675, 1:.325}, {0:.7, 1:.3}, {0:.68, 1:.32}]
param_grid = {'C': c_space, 'class_weight':class_weight}

svc1 = LinearSVC(dual=False)
svc_gs1 = RandomizedSearchCV(svc1, param_grid, cv=3, n_iter=25, scoring='roc_auc')
svc_gs1.fit(X_train, y_train)

RandomizedSearchCV(cv=3, error_score='raise',
          estimator=LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          fit_params={}, iid=True, n_iter=25, n_jobs=1,
          param_distributions={'C': array([  1.00000e-02,   1.58842e+00,   3.16684e+00,   4.74526e+00,
         6.32368e+00,   7.90211e+00,   9.48053e+00,   1.10589e+01,
         1.26374e+01,   1.42158e+01,   1.57942e+01,   1.73726e+01,
         1.89511e+01,   2.05295e+01,   2.21079e+01,   2.36863e+01,
         2.52647e+01,   2.68432e+01,   2.84216e+01,   3.00000e+01]), 'class_weight': [{0: 0.675, 1: 0.325}, {0: 0.7, 1: 0.3}, {0: 0.68, 1: 0.32}]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True, scoring='roc_auc', verbose=0)

In [8]:
pred1 = svc_gs1.predict(X_test)
score1 = svc_gs1.score(X_test, y_test)
train_score1 = svc_gs1.score(X_train, y_train)

In [9]:
score1, train_score1

(0.83567661047134056, 0.8349739844420927)

In [10]:
confusion_matrix(y_test, pred1)

array([[ 22351,  28656],
       [ 10460, 243495]])

In [11]:
print(classification_report(y_test, pred1))

             precision    recall  f1-score   support

          0       0.68      0.44      0.53     51007
          1       0.89      0.96      0.93    253955

avg / total       0.86      0.87      0.86    304962



In [12]:
svc_gs1.best_params_

{'C': 18.95105263157895, 'class_weight': {0: 0.675, 1: 0.325}}

In [13]:
roc_auc_score(y_test, pred1)

0.69850317903134629