In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.utils import shuffle
from sklearn.model_selection import cross_val_score

In [2]:
def set_data(file):
    bikeshare_machine = pd.read_csv(file, 
                        parse_dates=['Start date', 'End date', 'start_date_short', 'end_date_short'])
    bikeshare_machine.drop('Unnamed: 0', 1, inplace=True)
    included_cols = ['start_station','end_station','Member Type','time_diff','season','mnth','holiday',
                     'weekday','workingday','weathersit','temp','hum','windspeed','miles']
    bikeshare_machine = bikeshare_machine[included_cols]
    bikeshare_machine['season'] = bikeshare_machine['season'].astype('category')
    bikeshare_machine['mnth'] = bikeshare_machine['mnth'].astype('category')
    bikeshare_machine['holiday'] = bikeshare_machine['holiday'].astype('category')
    bikeshare_machine['weekday'] = bikeshare_machine['weekday'].astype('category')
    bikeshare_machine['workingday'] = bikeshare_machine['workingday'].astype('category')
    bikeshare_machine['weathersit'] = bikeshare_machine['weathersit'].astype('category')
    bikeshare_machine['Member Type'] = bikeshare_machine['Member Type'].astype('category')
    bikeshare_machine['start_station'] = bikeshare_machine['start_station'].astype('category')
    bikeshare_machine['end_station'] = bikeshare_machine['end_station'].astype('category')
    col_names = ['start_station', 'end_station','member_type','time_diff','season','month','holiday',
             'weekday','work_day','weather_cat','temperature','humidity','windspeed','miles']
    bikeshare_machine.columns = col_names
    tmin = -8
    tmax = 39
    hum_max = 100
    wind_max = 67
    bikeshare_machine['temp'] = bikeshare_machine['temperature'] * (tmax - tmin) + tmin
    bikeshare_machine['hum'] = bikeshare_machine['humidity'] * 100
    bikeshare_machine['wind'] = bikeshare_machine['windspeed'] * 67
    bikeshare_machine.drop(['temperature', 'humidity', 'windspeed'], 1)
    bikeshare_machine = pd.get_dummies(bikeshare_machine, 
                                 columns=['member_type','holiday','work_day'], drop_first=True)
    bikeshare_machine = pd.get_dummies(bikeshare_machine, 
                                 columns=['start_station','end_station','season','month','weekday','weather_cat'])
    bikeshare_machine = bikeshare_machine.drop(['temperature', 'humidity', 'windspeed'], 1)
    bikeshare_machine = shuffle(bikeshare_machine)
    return bikeshare_machine

In [3]:
file = '/Users/matthewcassi/Documents/Bike-Sharing-Dataset/Bikeshare_Time_Prediction/metro_rush.csv'
bs = set_data(file)

In [4]:
bs.columns

Index(['time_diff', 'miles', 'temp', 'hum', 'wind', 'member_type_Registered',
       'holiday_1', 'work_day_1', 'start_station_10th & Monroe St NE',
       'start_station_10th & U St NW',
       ...
       'weekday_0', 'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4',
       'weekday_5', 'weekday_6', 'weather_cat_1', 'weather_cat_2',
       'weather_cat_3'],
      dtype='object', length=296)

In [5]:
bs=bs.rename(columns = {'member_type_Registered':'member_type'})
bs.head()

Unnamed: 0,time_diff,miles,temp,hum,wind,member_type,holiday_1,work_day_1,start_station_10th & Monroe St NE,start_station_10th & U St NW,...,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weather_cat_1,weather_cat_2,weather_cat_3
936826,4.733,0.537182,8.763349,79.1667,14.874871,1,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
719752,7.733,0.825764,10.134151,59.5417,4.125244,1,0,1,0,0,...,0,0,1,0,0,0,0,0,1,0
206628,12.367,1.203865,23.881651,60.0,8.167032,1,0,1,0,0,...,0,1,0,0,0,0,0,1,0,0
529291,13.283,1.232617,25.409151,77.0,16.666518,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
102968,5.433,0.360527,2.966651,49.875,10.583521,1,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0


In [6]:
X = bs.drop('member_type', 1)
y = bs['member_type']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((914885, 295), (304962, 295), (914885,), (304962,))

### Plain Logistic Regression

In [7]:
log = LogisticRegression()
log.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [13]:
pred = log.predict(X_test)
scores = cross_val_score(log, X_train, y_train, cv=5)
score = log.score(X_test, y_test)
train_score = log.score(X_train, y_train)

In [14]:
scores, score, train_score

(array([ 0.87687044,  0.87705079,  0.87770048,  0.87731178,  0.87715329]),
 0.87665676379352186,
 0.87725998349519341)

### GridSearch Logistic Regression

In [10]:
c_space = np.linspace(0.0001, 50, 25)
param_grid = {'C': c_space}

log2 = LogisticRegression()
log_gs = GridSearchCV(log2, param_grid, cv=5)
log_gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': array([  1.00000e-04,   2.08343e+00,   4.16676e+00,   6.25009e+00,
         8.33342e+00,   1.04167e+01,   1.25001e+01,   1.45834e+01,
         1.66667e+01,   1.87501e+01,   2.08334e+01,   2.29167e+01,
         2.50000e+01,   2.70834e+01,   2.91667e+01,   3.12500e+01,
         3.33334e+01,   3.54167e+01,   3.75000e+01,   3.95834e+01,
         4.16667e+01,   4.37500e+01,   4.58333e+01,   4.79167e+01,
         5.00000e+01])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [15]:
pred2 = log_gs.predict(X_test)
score2 = log_gs.score(X_test, y_test)
train_score2 = log_gs.score(X_train, y_train)

In [16]:
score2, train_score2

(0.87668627566713231, 0.87726763473004798)

In [17]:
confusion_matrix(y_test, pred2)

array([[ 19040,  31913],
       [  5693, 248316]])

In [20]:
print(classification_report(y_test, pred2))

             precision    recall  f1-score   support

          0       0.77      0.37      0.50     50953
          1       0.89      0.98      0.93    254009

avg / total       0.87      0.88      0.86    304962



In [23]:
remove_cols = ['weekday_0', 'weekday_1','weekday_2','weekday_3','weekday_4','weekday_5','weekday_6', 
              'month_1','month_2','month_3','month_4','month_5','month_6','month_7','month_8','month_9',
              'month_10','month_11','month_12','member_type']
X1 = np.matrix(bs.drop(remove_cols, 1))
y1 = bs['member_type']

In [24]:
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size = 0.25)
X1_train.shape, X1_test.shape, y1_train.shape, y1_test.shape

((914885, 276), (304962, 276), (914885,), (304962,))

In [25]:
log3 = LogisticRegression()
log_gs2 = GridSearchCV(log3, param_grid, cv=5)
log_gs2.fit(X1_train, y1_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': array([  1.00000e-04,   2.08343e+00,   4.16676e+00,   6.25009e+00,
         8.33342e+00,   1.04167e+01,   1.25001e+01,   1.45834e+01,
         1.66667e+01,   1.87501e+01,   2.08334e+01,   2.29167e+01,
         2.50000e+01,   2.70834e+01,   2.91667e+01,   3.12500e+01,
         3.33334e+01,   3.54167e+01,   3.75000e+01,   3.95834e+01,
         4.16667e+01,   4.37500e+01,   4.58333e+01,   4.79167e+01,
         5.00000e+01])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [26]:
pred3 = log_gs2.predict(X1_test)
score3 = log_gs2.score(X1_test, y1_test)
train_score3 = log_gs2.score(X1_train, y1_train)

In [27]:
score3, train_score3

(0.87686990510293084, 0.87677030446449555)

In [28]:
confusion_matrix(y1_test, pred3)

array([[ 19086,  31923],
       [  5627, 248326]])

In [29]:
print(classification_report(y1_test, pred3))

             precision    recall  f1-score   support

          0       0.77      0.37      0.50     51009
          1       0.89      0.98      0.93    253953

avg / total       0.87      0.88      0.86    304962

