In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.utils import shuffle
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV

In [2]:
def set_data(file, random_state=None):
    """Load the bikeshare trip CSV and return a shuffled, model-ready frame.

    Processing steps:
      1. read only the modelling columns from ``file`` (any other column in
         the CSV, such as a saved index or timestamps, is ignored);
      2. cast the discrete columns to ``category`` and rename columns to
         friendlier names;
      3. add rescaled weather columns ``temp``, ``hum`` and ``wind``;
      4. one-hot encode the categorical columns (two-level columns keep a
         single indicator, multi-level columns keep every level);
      5. shuffle the rows, keeping the original index labels.

    Parameters
    ----------
    file : str, path-like or file-like
        CSV source accepted by ``pandas.read_csv``. It must contain every
        column listed in ``included_cols``; ``read_csv`` raises ``ValueError``
        otherwise.
    random_state : int, optional
        Seed for the row shuffle. ``None`` (the default) gives a different
        order on every call; pass an int for a reproducible order.

    Returns
    -------
    pandas.DataFrame
        Numeric columns first, then the ``drop_first`` indicators
        (``rush_hour_*``, ``member_type_*``, ``holiday_*``, ``work_day_*``),
        then the full one-hot groups for stations, season, month, weekday and
        weather category. Indicator columns are stored as ``uint8`` 0/1.
    """
    included_cols = [
        'start_station', 'end_station', 'Member Type', 'time_diff', 'season', 'mnth',
        'holiday', 'weekday', 'workingday', 'weathersit', 'temp', 'hum', 'windspeed',
        'miles', 'rush_hour', 'metro_dist', 'landmark_dist_start', 'landmark_dist_end',
    ]
    categorical_cols = [
        'season', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit',
        'Member Type', 'start_station', 'end_station', 'rush_hour',
    ]
    # Explicit mapping instead of positional renaming, so a change in column
    # order can never silently mislabel a feature.
    renames = {
        'Member Type': 'member_type',
        'mnth': 'month',
        'workingday': 'work_day',
        'weathersit': 'weather_cat',
        'temp': 'temperature',
        'hum': 'humidity',
    }
    # Bounds used to undo the normalisation of the weather columns.
    # NOTE(review): presumably the source dataset's documented scaling
    # (temperature in deg C, humidity in %, wind speed) - confirm.
    tmin, tmax = -8, 39
    hum_max = 100
    wind_max = 67

    # usecols skips unneeded columns at parse time; the trailing selection
    # restores the order of included_cols (read_csv keeps file order).
    trips = pd.read_csv(file, usecols=included_cols)[included_cols]
    trips = (
        trips
        .astype({col: 'category' for col in categorical_cols})
        .rename(columns=renames)
    )
    # assign() builds a new frame, so nothing is written onto a slice.
    trips = trips.assign(
        temp=trips['temperature'] * (tmax - tmin) + tmin,
        hum=trips['humidity'] * hum_max,
        wind=trips['windspeed'] * wind_max,
    )

    # Two-level columns: one indicator is enough, so drop the first level.
    # dtype is pinned because the get_dummies default differs between pandas
    # versions (uint8 in 1.x, bool in 2.x).
    trips = pd.get_dummies(
        trips,
        columns=['rush_hour', 'member_type', 'holiday', 'work_day'],
        drop_first=True,
        dtype='uint8',
    )
    # Multi-level columns: keep every level; callers drop the ones they
    # do not want.
    trips = pd.get_dummies(
        trips,
        columns=['start_station', 'end_station', 'season', 'month', 'weekday', 'weather_cat'],
        dtype='uint8',
    )

    # Row shuffle that preserves index labels; seedable for reproducibility.
    return trips.sample(frac=1, random_state=random_state)

In [3]:
# Location of the prepared trip-level CSV.
# NOTE(review): this is an absolute path on one machine; the notebook will not
# run elsewhere until it points at a local copy of landmarks.csv.
file = (
    '/Users/matthewcassi/Documents/Bike-Sharing-Dataset/'
    'Bikeshare_Time_Prediction/Casual_RushMetro/landmarks.csv'
)
# Build the encoded, shuffled modelling frame.
bs = set_data(file)

In [4]:
# Inspect the column labels of the frame returned by set_data, including the
# one-hot indicator columns it generated.
bs.columns

Index(['time_diff', 'temperature', 'humidity', 'windspeed', 'miles',
       'metro_dist', 'landmark_dist_start', 'landmark_dist_end', 'temp', 'hum',
       ...
       'weekday_0', 'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4',
       'weekday_5', 'weekday_6', 'weather_cat_1', 'weather_cat_2',
       'weather_cat_3'],
      dtype='object', length=303)

In [5]:
# Give the one-hot target indicator a shorter name: after this cell the
# 'member_type' column is 1 for Registered riders and 0 otherwise.
bs = bs.rename({'member_type_Registered': 'member_type'}, axis='columns')
# Preview the first rows of the encoded frame.
bs.head()

Unnamed: 0,time_diff,temperature,humidity,windspeed,miles,metro_dist,landmark_dist_start,landmark_dist_end,temp,hum,...,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weather_cat_1,weather_cat_2,weather_cat_3
890491,13.583,0.511667,0.486667,0.281717,1.905459,0.165573,0.821243,0.257095,16.048349,48.6667,...,1,0,0,0,0,0,0,1,0,0
177058,20.033,0.604167,0.507083,0.269283,1.809965,0.131776,0.255462,0.916964,20.395849,50.7083,...,0,0,1,0,0,0,0,1,0,0
112738,17.883,0.226957,0.436957,0.1869,2.242739,0.023451,0.839915,2.391916,2.666979,43.6957,...,0,0,0,1,0,0,0,1,0,0
83284,9.233,0.211304,0.585217,0.127839,0.959833,0.441819,0.424481,0.285821,1.931288,58.5217,...,0,0,0,0,0,1,0,0,1,0
1126010,11.917,0.2875,0.350417,0.22575,1.228613,0.048442,0.06028,0.411781,5.5125,35.0417,...,0,0,0,0,0,0,1,1,0,0


In [None]:
# Columns left out of the feature matrix: the target itself plus the
# work_day and season indicators.
remove_cols = ['work_day_1','season_1', 'season_2', 'season_3', 'season_4','member_type']
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# pandas 2.x no longer accepts.
X = bs.drop(columns=remove_cols)
y = bs['member_type']

# Hold out 25% of the rows for evaluation; the fixed seed makes the split,
# and every score computed from it, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((914885, 302), (304962, 302), (914885,), (304962,))

### Randomized Search: Support Vector Classifier (SVC)

In [None]:
# Search space: 30 evenly spaced values of the regularisation strength C,
# three kernel families, and the polynomial degree.
# Note: per the scikit-learn SVC documentation, `degree` is only used by the
# 'poly' kernel, so candidates that differ only in degree are equivalent for
# 'linear' and 'rbf'.
c_space = np.linspace(0.001, 30, 30)
degree = [1,2,3,4,5]
kernel = ['linear','poly','rbf']
param_grid = {'C': c_space, 'degree': degree, 'kernel': kernel}

svc1 = SVC()
# RandomizedSearchCV samples a subset of the grid (n_iter defaults to 10) and
# ranks candidates by 3-fold cross-validated ROC AUC. random_state pins which
# candidates are sampled so the search is repeatable.
svc_gs1 = RandomizedSearchCV(svc1, param_grid, cv=3, scoring='roc_auc', random_state=42)
# NOTE(review): kernel-SVC fit time grows faster than linearly with the number
# of rows; consider fitting on a subsample of X_train if this cell does not
# finish in reasonable time.
svc_gs1.fit(X_train, y_train)

In [None]:
# Hard class predictions on the held-out split; reused by the confusion
# matrix and classification report cells below.
pred1 = svc_gs1.predict(X_test)
# The search was built with scoring='roc_auc', so .score() reports ROC AUC
# (not accuracy), first on the test split and then on the training split.
score1, train_score1 = (
    svc_gs1.score(X_test, y_test),
    svc_gs1.score(X_train, y_train),
)

In [None]:
# Display (test score, train score) side by side; a large gap between the two
# would indicate over-fitting.
score1, train_score1

In [None]:
# Confusion matrix for the held-out predictions. By scikit-learn's convention
# rows are the true labels and columns are the predicted labels.
confusion_matrix(y_test, pred1)

In [None]:
# classification_report returns a pre-formatted string, so print() is needed
# for the table to render with its line breaks.
print(classification_report(y_test, pred1))