In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.utils import shuffle
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score

In [3]:
def set_data(file):
    bikeshare_machine = pd.read_csv(file, 
                        parse_dates=['Start date', 'End date'])
    bikeshare_machine.drop('Unnamed: 0', 1, inplace=True)
    included_cols = ['start_station','end_station','Member Type','time_diff','season','mnth','holiday',
                     'weekday','workingday','weathersit','temp','hum','windspeed','miles',
                     'rush_hour','metro_dist','landmark_dist_start','landmark_dist_end']
    bikeshare_machine = bikeshare_machine[included_cols]
    bikeshare_machine['season'] = bikeshare_machine['season'].astype('category')
    bikeshare_machine['mnth'] = bikeshare_machine['mnth'].astype('category')
    bikeshare_machine['holiday'] = bikeshare_machine['holiday'].astype('category')
    bikeshare_machine['weekday'] = bikeshare_machine['weekday'].astype('category')
    bikeshare_machine['workingday'] = bikeshare_machine['workingday'].astype('category')
    bikeshare_machine['weathersit'] = bikeshare_machine['weathersit'].astype('category')
    bikeshare_machine['Member Type'] = bikeshare_machine['Member Type'].astype('category')
    bikeshare_machine['start_station'] = bikeshare_machine['start_station'].astype('category')
    bikeshare_machine['end_station'] = bikeshare_machine['end_station'].astype('category')
    bikeshare_machine['rush_hour'] = bikeshare_machine['rush_hour'].astype('category')
    col_names = ['start_station', 'end_station','member_type','time_diff','season','month','holiday',
             'weekday','work_day','weather_cat','temperature','humidity','windspeed','miles','rush_hour',
                'metro_dist','landmark_dist_start','landmark_dist_end']
    bikeshare_machine.columns = col_names
    tmin = -8
    tmax = 39
    hum_max = 100
    wind_max = 67
    bikeshare_machine['temp'] = bikeshare_machine['temperature'] * (tmax - tmin) + tmin
    bikeshare_machine['hum'] = bikeshare_machine['humidity'] * 100
    bikeshare_machine['wind'] = bikeshare_machine['windspeed'] * 67
    bikeshare_machine = pd.get_dummies(bikeshare_machine, 
                                 columns=['rush_hour','member_type','holiday','work_day'], drop_first=True)
    bikeshare_machine = pd.get_dummies(bikeshare_machine, 
                                 columns=['start_station','end_station','season','month','weekday','weather_cat'])
    bikeshare_machine = shuffle(bikeshare_machine)
    return bikeshare_machine

In [4]:
file = '/Users/matthewcassi/Documents/Bike-Sharing-Dataset/Bikeshare_Time_Prediction/Casual_RushMetro/landmarks.csv'
bs = set_data(file)

In [5]:
bs.columns

Index(['time_diff', 'temperature', 'humidity', 'windspeed', 'miles',
       'metro_dist', 'landmark_dist_start', 'landmark_dist_end', 'temp', 'hum',
       ...
       'weekday_0', 'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4',
       'weekday_5', 'weekday_6', 'weather_cat_1', 'weather_cat_2',
       'weather_cat_3'],
      dtype='object', length=303)

In [6]:
bs=bs.rename(columns = {'member_type_Registered':'member_type'})
bs.head()

Unnamed: 0,time_diff,temperature,humidity,windspeed,miles,metro_dist,landmark_dist_start,landmark_dist_end,temp,hum,...,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weather_cat_1,weather_cat_2,weather_cat_3
385150,17.267,0.4375,0.602917,0.162312,1.148545,0.441819,0.424481,0.075027,12.5625,60.2917,...,0,0,0,0,1,0,0,1,0,0
488295,0.833,0.643333,0.727083,0.139929,0.0,0.09643,0.21843,0.21843,22.236651,72.7083,...,0,0,0,0,0,1,0,0,1,0
674350,6.233,0.709167,0.757917,0.225129,0.63035,0.094222,0.482777,0.257095,25.330849,75.7917,...,0,0,0,0,0,1,0,0,1,0
692854,14.55,0.716667,0.6825,0.228858,2.269004,0.179499,2.602752,1.329181,25.683349,68.25,...,1,0,0,0,0,0,0,0,1,0
215215,20.733,0.62,0.354167,0.253121,1.588125,0.048442,0.06028,1.594631,21.14,35.4167,...,0,0,0,0,0,1,0,1,0,0


### Model 1

In [7]:
X1 = np.matrix(bs[['time_diff', 'miles', 'metro_dist', 'landmark_dist_start', 'landmark_dist_end']])
y1 = bs['member_type']

In [8]:
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size = 0.25)
X1_train.shape, X1_test.shape, y1_train.shape, y1_test.shape

((914885, 5), (304962, 5), (914885,), (304962,))

In [9]:
c_space = np.linspace(0.0001, 50, 20)
param_grid = {'C': c_space}

log3 = LogisticRegression()
log_gs2 = GridSearchCV(log3, param_grid, cv=3)
log_gs2.fit(X1_train, y1_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': array([  1.00000e-04,   2.63167e+00,   5.26325e+00,   7.89482e+00,
         1.05264e+01,   1.31580e+01,   1.57895e+01,   1.84211e+01,
         2.10527e+01,   2.36843e+01,   2.63158e+01,   2.89474e+01,
         3.15790e+01,   3.42106e+01,   3.68421e+01,   3.94737e+01,
         4.21053e+01,   4.47369e+01,   4.73684e+01,   5.00000e+01])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [10]:
pred3 = log_gs2.predict(X1_test)
score3 = log_gs2.score(X1_test, y1_test)
train_score3 = log_gs2.score(X1_train, y1_train)

In [11]:
score3, train_score3

(0.86499957371738123, 0.86578750334741528)

In [12]:
confusion_matrix(y1_test, pred3)

array([[ 14839,  36163],
       [  5007, 248953]])

In [13]:
print(classification_report(y1_test, pred3))

             precision    recall  f1-score   support

          0       0.75      0.29      0.42     51002
          1       0.87      0.98      0.92    253960

avg / total       0.85      0.86      0.84    304962



In [14]:
roc_auc_score(y1_test, pred3)

0.63561683563698801

### Model 2

In [15]:
X1 = np.matrix(bs[['time_diff', 'miles','landmark_dist_start', 'landmark_dist_end']])
y1 = bs['member_type']

In [16]:
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size = 0.25)
X1_train.shape, X1_test.shape, y1_train.shape, y1_test.shape

((914885, 4), (304962, 4), (914885,), (304962,))

In [17]:
c_space = np.linspace(0.0001, 50, 20)
param_grid = {'C': c_space}

log3 = LogisticRegression()
log_gs2 = GridSearchCV(log3, param_grid, cv=3)
log_gs2.fit(X1_train, y1_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': array([  1.00000e-04,   2.63167e+00,   5.26325e+00,   7.89482e+00,
         1.05264e+01,   1.31580e+01,   1.57895e+01,   1.84211e+01,
         2.10527e+01,   2.36843e+01,   2.63158e+01,   2.89474e+01,
         3.15790e+01,   3.42106e+01,   3.68421e+01,   3.94737e+01,
         4.21053e+01,   4.47369e+01,   4.73684e+01,   5.00000e+01])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [18]:
pred3 = log_gs2.predict(X1_test)
score3 = log_gs2.score(X1_test, y1_test)
train_score3 = log_gs2.score(X1_train, y1_train)

In [19]:
score3, train_score3

(0.86539306536552096, 0.8656760139252474)

In [20]:
confusion_matrix(y1_test, pred3)

array([[ 15074,  35952],
       [  5098, 248838]])

In [21]:
print(classification_report(y1_test, pred3))

             precision    recall  f1-score   support

          0       0.75      0.30      0.42     51026
          1       0.87      0.98      0.92    253936

avg / total       0.85      0.87      0.84    304962



In [22]:
roc_auc_score(y1_test, pred3)

0.63767104877116942

In [23]:
X1 = np.matrix(bs[['time_diff','metro_dist']])
y1 = bs['member_type']

In [24]:
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size = 0.25)
X1_train.shape, X1_test.shape, y1_train.shape, y1_test.shape

((914885, 2), (304962, 2), (914885,), (304962,))

In [25]:
c_space = np.linspace(0.0001, 50, 20)
param_grid = {'C': c_space}

log3 = LogisticRegression()
log_gs2 = GridSearchCV(log3, param_grid, cv=3)
log_gs2.fit(X1_train, y1_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': array([  1.00000e-04,   2.63167e+00,   5.26325e+00,   7.89482e+00,
         1.05264e+01,   1.31580e+01,   1.57895e+01,   1.84211e+01,
         2.10527e+01,   2.36843e+01,   2.63158e+01,   2.89474e+01,
         3.15790e+01,   3.42106e+01,   3.68421e+01,   3.94737e+01,
         4.21053e+01,   4.47369e+01,   4.73684e+01,   5.00000e+01])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [26]:
pred3 = log_gs2.predict(X1_test)
score3 = log_gs2.score(X1_test, y1_test)
train_score3 = log_gs2.score(X1_train, y1_train)

In [27]:
score3, train_score3

(0.85954643529357755, 0.85945009482066048)

In [28]:
confusion_matrix(y1_test, pred3)

array([[ 12428,  38450],
       [  4383, 249701]])

In [29]:
print(classification_report(y1_test, pred3))

             precision    recall  f1-score   support

          0       0.74      0.24      0.37     50878
          1       0.87      0.98      0.92    254084

avg / total       0.85      0.86      0.83    304962



In [30]:
roc_auc_score(y1_test, pred3)

0.61351020370018361