In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.utils import shuffle
from sklearn.model_selection import cross_val_score

In [3]:
def set_data(file, random_state=None):
    """Load the bikeshare trip CSV and return a shuffled, model-ready frame.

    Only the columns needed for modelling are read. They are renamed, the three
    normalised weather readings are rescaled, and every categorical column is
    one-hot encoded.

    Parameters
    ----------
    file : str, path-like or file-like
        Anything ``pandas.read_csv`` accepts. It must contain every column
        named in ``source_cols`` below; other columns are ignored.
    random_state : int, numpy.random.RandomState or None, default None
        Seed for the final row shuffle. ``None`` keeps the shuffle
        non-deterministic, as before.

    Returns
    -------
    pandas.DataFrame
        Columns in this order: ``time_diff``, ``miles``, ``temp``, ``hum``,
        ``wind``, then the drop-first indicators for ``member_type``,
        ``holiday`` and ``work_day``, then the full indicator sets for
        ``start_station``, ``end_station``, ``season``, ``month``,
        ``weekday`` and ``weather_cat``. Rows are in random order and keep
        their original index labels.
    """
    source_cols = ['start_station', 'end_station', 'Member Type', 'time_diff', 'season', 'mnth',
                   'holiday', 'weekday', 'workingday', 'weathersit', 'temp', 'hum', 'windspeed',
                   'miles']
    # Rename by name rather than by position so a reordered column list cannot
    # silently mislabel a feature.
    renames = {'Member Type': 'member_type', 'mnth': 'month', 'workingday': 'work_day',
               'weathersit': 'weather_cat', 'temp': 'temperature', 'hum': 'humidity'}
    categorical = ['season', 'month', 'holiday', 'weekday', 'work_day', 'weather_cat',
                   'member_type', 'start_station', 'end_station']

    # Scaling constants for the normalised weather columns.
    # NOTE(review): presumably the bounds used to normalise the source weather data -- confirm.
    tmin = -8
    tmax = 39
    hum_max = 100
    wind_max = 67

    # usecols skips the index column and the four date columns, which were
    # previously parsed and then immediately discarded. The [source_cols]
    # selection restores the expected order, since read_csv returns file order.
    trips = pd.read_csv(file, usecols=source_cols)[source_cols].rename(columns=renames)
    trips = trips.astype({col: 'category' for col in categorical})

    # Sequential assignments (not a single .assign call) keep the new columns
    # in temp/hum/wind order on every pandas version.
    trips['temp'] = trips['temperature'] * (tmax - tmin) + tmin
    trips['hum'] = trips['humidity'] * hum_max
    trips['wind'] = trips['windspeed'] * wind_max
    trips = trips.drop(['temperature', 'humidity', 'windspeed'], axis=1)

    # Two-level columns keep a single indicator (drop_first); the multi-level
    # columns keep one indicator per level.
    trips = pd.get_dummies(trips, columns=['member_type', 'holiday', 'work_day'], drop_first=True)
    trips = pd.get_dummies(trips, columns=['start_station', 'end_station', 'season', 'month',
                                           'weekday', 'weather_cat'])

    # Random permutation of the rows; seedable so a run can be reproduced.
    return trips.sample(frac=1, random_state=random_state)

In [4]:
# NOTE(review): absolute, machine-specific path -- this cell only runs where that file exists.
file = '/Users/matthewcassi/Documents/Bike-Sharing-Dataset/Bikeshare_Time_Prediction/metro_rush.csv'
# Read the CSV and build the shuffled, one-hot-encoded modelling frame.
bs = set_data(file)

In [5]:
# Inspect the column labels of the frame returned by set_data.
bs.columns

Index(['time_diff', 'miles', 'temp', 'hum', 'wind', 'member_type_Registered',
       'holiday_1', 'work_day_1', 'start_station_10th & Monroe St NE',
       'start_station_10th & U St NW',
       ...
       'weekday_0', 'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4',
       'weekday_5', 'weekday_6', 'weather_cat_1', 'weather_cat_2',
       'weather_cat_3'],
      dtype='object', length=296)

In [6]:
# Give the label indicator a shorter name for the cells that follow, then preview the first rows.
bs=bs.rename(columns = {'member_type_Registered':'member_type'})
bs.head()

Unnamed: 0,time_diff,miles,temp,hum,wind,member_type,holiday_1,work_day_1,start_station_10th & Monroe St NE,start_station_10th & U St NW,...,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weather_cat_1,weather_cat_2,weather_cat_3
181751,28.05,0.0,21.845,49.4583,20.45845,0,0,1,0,0,...,0,1,0,0,0,0,0,1,0,0
321071,15.2,1.373495,20.983349,69.7083,22.958689,1,0,1,0,0,...,0,0,1,0,0,0,0,0,1,0
960734,6.467,1.212422,6.508712,64.6522,12.565984,1,0,1,0,0,...,0,0,1,0,0,0,0,1,0,0
668056,9.817,0.852298,27.1325,57.8333,12.292557,1,0,0,0,1,...,1,0,0,0,0,0,0,1,0,0
516639,10.2,0.0,23.646651,60.5,16.958236,1,0,1,0,0,...,0,0,0,1,0,0,0,1,0,0


In [10]:
# Columns kept out of the feature matrix: every weekday_* and month_* indicator,
# plus the label itself so it cannot leak into X.
remove_cols = (['weekday_%d' % day for day in range(7)]
               + ['month_%d' % month for month in range(1, 13)]
               + ['member_type'])
# axis is passed by keyword: the positional form drop(labels, 1) was removed in pandas 2.0.
X = bs.drop(remove_cols, axis=1)
y = bs['member_type']

In [11]:
# Hold out 25% of the rows for testing. The fixed seed keeps the split, and therefore
# every downstream metric, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((914885, 276), (304962, 276), (914885,), (304962,))

In [26]:
# Create a random, class-balanced training subsample.
def datasets(x, y, n=125000, random_state=None):
    """Draw a class-balanced random subsample of a feature frame and its labels.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature rows. The frame is not modified.
    y : pandas.Series or array-like
        Binary labels (1 / 0) for the rows of ``x``. A Series is aligned to
        ``x`` on its index.
    n : int, default 125000
        Number of rows to draw from each class, without replacement.
    random_state : int, numpy.random.RandomState or None, default None
        Seed forwarded to ``DataFrame.sample`` for both draws.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        ``2 * n`` feature rows (label-1 rows first, then label-0 rows) and
        their labels, sharing the same index.

    Raises
    ------
    ValueError
        Raised by ``DataFrame.sample`` when a class has fewer than ``n`` rows.
    """
    # assign returns a new frame. The original discarded that result, so the
    # label column was never attached and the filters below raised KeyError.
    labelled = x.assign(member_type=y)
    reg = labelled[labelled['member_type'] == 1].sample(n=n, random_state=random_state)
    cas = labelled[labelled['member_type'] == 0].sample(n=n, random_state=random_state)
    # DataFrame.append was removed in pandas 2.0; concat is the supported equivalent.
    balanced = pd.concat([reg, cas])
    train_x = balanced.drop('member_type', axis=1)
    train_y = balanced['member_type']
    return train_x, train_y

In [27]:
# Eleven independent class-balanced draws from the training split, unpacked into
# the numbered names used by the cells below.
balanced_draws = [datasets(X_train, y_train) for _ in range(11)]
((X_train1, y_train1), (X_train2, y_train2), (X_train3, y_train3),
 (X_train4, y_train4), (X_train5, y_train5), (X_train6, y_train6),
 (X_train7, y_train7), (X_train8, y_train8), (X_train9, y_train9),
 (X_train10, y_train10), (X_train11, y_train11)) = balanced_draws

### Ensemble Logistic Regression

In [31]:
# Candidate regularisation strengths: 20 evenly spaced values of C from 0.0001 to 30.
c_space = np.linspace(0.0001, 30, 20)
param_grid = {'C': c_space}

def log_reg(X_train,y_train,X_test):
    """Grid-search a logistic regression over ``param_grid`` and predict ``X_test``.

    A default ``LogisticRegression`` is tuned with 2-fold ``GridSearchCV`` on
    ``(X_train, y_train)``, and the fitted search object's predictions for
    ``X_test`` are returned.
    """
    search = GridSearchCV(LogisticRegression(), param_grid, cv=2)
    search.fit(X_train, y_train)
    return search.predict(X_test)

In [32]:
# Fit one grid-searched model per balanced draw; each entry holds that model's
# predicted labels for X_test.
training_draws = [
    (X_train1, y_train1), (X_train2, y_train2), (X_train3, y_train3),
    (X_train4, y_train4), (X_train5, y_train5), (X_train6, y_train6),
    (X_train7, y_train7), (X_train8, y_train8), (X_train9, y_train9),
    (X_train10, y_train10), (X_train11, y_train11),
]
(log1, log2, log3, log4, log5, log6,
 log7, log8, log9, log10, log11) = [
    log_reg(features, labels, X_test) for features, labels in training_draws
]

In [41]:
# Spot-check the last model's predicted labels for X_test.
np.array(log11)

array([1, 0, 1, ..., 1, 0, 1], dtype=uint8)

In [42]:
# One column per model, keyed '1' through '11', holding that model's labels for X_test.
model_outputs = [log1, log2, log3, log4, log5, log6, log7, log8, log9, log10, log11]
predictions = {str(number): np.array(labels)
               for number, labels in enumerate(model_outputs, start=1)}
preds = pd.DataFrame(predictions)

In [43]:
# Preview the per-model prediction table.
preds.head()

Unnamed: 0,1,10,11,2,3,4,5,6,7,8,9
0,1,1,1,1,1,1,1,1,1,1,1
1,0,0,0,0,0,0,0,0,0,0,0
2,1,1,1,1,1,1,1,1,1,1,1
3,0,0,0,0,0,0,0,0,0,0,0
4,1,1,1,1,1,1,1,1,1,1,1


In [44]:
# Count the models voting 1 on each row. Only the eleven model columns are summed, so
# re-running this cell after 'sum' or 'maj_vote' exist cannot inflate the count, and the
# vectorised sum avoids a Python-level loop over every row.
model_cols = [str(number) for number in range(1, 12)]
preds['sum'] = preds[model_cols].sum(axis=1)

In [59]:
def prediction(row):
    """Majority vote for one row: 1 when its 'sum' entry is at least 6, otherwise 0."""
    return 1 if row['sum'] >= 6 else 0

In [60]:
# Apply the majority-vote rule to every row; prediction reads each row's 'sum' entry.
preds['maj_vote'] = preds.apply(prediction, axis=1)

In [61]:
# Preview the table again, now including the 'sum' and 'maj_vote' columns.
preds.head()

Unnamed: 0,1,10,11,2,3,4,5,6,7,8,9,sum,maj_vote
0,1,1,1,1,1,1,1,1,1,1,1,11,1
1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,1,1,1,1,1,1,1,1,1,1,11,1
3,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1,1,1,1,1,1,1,1,1,1,1,11,1


In [62]:
# Pull the majority-vote labels out of the table as a NumPy array.
final_pred = np.array(preds['maj_vote'])

In [63]:
# Number of majority-vote predictions (one per row of preds).
len(final_pred)

304962

In [64]:
# Accuracy: fraction of held-out rows where the majority vote matches the true label.
# accuracy_score is vectorised, whereas the builtin sum() iterated the comparison
# result element by element in Python.
accuracy_score(y_test, final_pred)

0.81857411743102415

In [65]:
# Confusion matrix for the majority-vote labels; y_test is passed as the true labels.
confusion_matrix(y_test, final_pred)

array([[ 35446,  15412],
       [ 39916, 214188]])

In [66]:
# Text report of per-class metrics for the majority-vote labels against y_test.
print(classification_report(y_test, final_pred))

             precision    recall  f1-score   support

          0       0.47      0.70      0.56     50858
          1       0.93      0.84      0.89    254104

avg / total       0.86      0.82      0.83    304962

