In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import itertools
from sklearn import metrics
%matplotlib inline
import datetime

In [2]:
#path = "../../../Google Drive/Data_science/NYU/Machine Learning/ML Project (Collisions)/" #Joe
path = "../../../../Google Drive/ML Project (Collisions)/" # Joyce
# path = "" # Lucas

In [None]:
fig = plt.figure(figsize=(14, 10))

num_features = 30

y_pos = np.arange(num_features)
importances = np.vstack((rf.feature_importances_, X_train.columns.values)).T
topX = importances[importances[:,0].argsort()][::-1][:num_features]

plt.bar(y_pos, topX[:, 0])
plt.xticks(y_pos, topX[:, 1])
fig.autofmt_xdate()

## Using H2O without 1 hot encoding

In [31]:
with open(path+"collisions_no1hot.pkl", 'rb') as infile:
    df_no1hot = pickle.load(infile)

In [32]:
df_no1hot['year'] = pd.to_datetime(df_no1hot['date_time']).dt.year

In [39]:
df_no1hot = df_no1hot.sort_values('date_time')

In [47]:
view_date = pd.to_datetime(df_no1hot['date_time'])

train_indices = (0, np.sum(view_date < datetime.date(2015,9,12))-1)
val_indices = (train_indices[1]+1,\
               train_indices[1] + \
               np.sum((view_date >= datetime.date(2015,9,12)) & (view_date < datetime.date(2016,7,31))))
test_indices = (val_indices[1]+1,\
               val_indices[1] + np.sum(view_date >= datetime.date(2016,7,31)))

In [54]:
df_no1hot.to_csv(path+"no1hot.csv")

In [5]:
import h2o
from h2o.estimators.random_forest import H2ORandomForestEstimator

In [6]:
h2o.init(ip="127.0.0.1",max_mem_size_GB = 4)

Checking whether there is an H2O instance running at http://127.0.0.1:54321. connected.


0,1
H2O cluster uptime:,4 days 2 hours 31 mins
H2O cluster version:,3.10.4.3
H2O cluster version age:,15 days
H2O cluster name:,H2O_from_python_joycewu_aayuou
H2O cluster total nodes:,1
H2O cluster free memory:,3.099 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4
H2O cluster status:,"locked, healthy"
H2O connection url:,http://127.0.0.1:54321


In [7]:
h2o.connect()

Connecting to H2O server at http://localhost:54321... successful.


0,1
H2O cluster uptime:,4 days 2 hours 31 mins
H2O cluster version:,3.10.4.3
H2O cluster version age:,15 days
H2O cluster name:,H2O_from_python_joycewu_aayuou
H2O cluster total nodes:,1
H2O cluster free memory:,3.099 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4
H2O cluster status:,"locked, healthy"
H2O connection url:,http://localhost:54321


<H2OConnection to http://localhost:54321, session _sid_b87e>

In [77]:
dataFrame = h2o.upload_file(path=path+"no1hot.csv", destination_frame="no1hot")

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [78]:
dataFrame = dataFrame.drop(['vehicle_type_code_1_nan',
 'vehicle_type_code_2_nan',
 'vehicle_type_code_3_nan',
 'vehicle_type_code_4_nan',
 'vehicle_type_code_5_nan',
 'borough_nan', 'injured', 'killed', 'C1'])

In [79]:
train_df = dataFrame[train_indices[0]:train_indices[1], :]
val_df = dataFrame[val_indices[0]:val_indices[1], :]
test_df = dataFrame[test_indices[0]:test_indices[1], :]

In [63]:
def score_confusion(confusion):
    sensitivity = confusion[1][1]/float(np.sum(confusion[1]))
    specificity = confusion[0][0]/float(np.sum(confusion[0]))
    return sensitivity, specificity

In [80]:
grid_search = []

In [81]:
n_trees = [50, 100, 200]
max_depth = [10, 20, 50, 100]
min_rows = [None, 10, 50, 100]

for i in itertools.product(n_trees,max_depth,min_rows):
    
    if i not in [i[0] for i in grid_search]:
    
        rf = H2ORandomForestEstimator(
        model_id = "rf",
        ntrees=i[0],
        max_depth = i[1],
        min_rows = i[2],
        stopping_rounds=2,
        seed=1000000)

        rf.train(train_df.drop('injured_or_killed').columns, 'injured_or_killed',\
                 training_frame=train_df, validation_frame=val_df)

        results = [i,\
                    rf.auc(train=True),\
                    rf.auc(valid=True),\
                    score_confusion(rf.confusion_matrix(train=True).to_list()),\
                    score_confusion(rf.confusion_matrix(valid=True).to_list())]
        print ("%s finished!" % (str(i)))
        print ("Train AUC: %s, Valid AUC: %s, Train S&S: %s, Valid S&S: %s" % (results[1], results[2], results[3], results[4]))
        
        grid_search.append(results)   
        

drf Model Build progress: |███████████████████████████████████████████████| 100%
(50, 10, None) finished!
Train AUC: 0.7964105178756552, Valid AUC: 0.7660665893018703, Train S&S: (0.5537270544643363, 0.896978568997239), Valid S&S: (0.556944010552258, 0.8502693579339206)
drf Model Build progress: |███████████████████████████████████████████████| 100%
(50, 10, 10) finished!
Train AUC: 0.7970170234669708, Valid AUC: 0.766704565876601, Train S&S: (0.555352855549464, 0.8958951164544627), Valid S&S: (0.5517720156895415, 0.8536681232803865)
drf Model Build progress: |███████████████████████████████████████████████| 100%
(50, 10, 50) finished!
Train AUC: 0.7964908410998762, Valid AUC: 0.76384546954093, Train S&S: (0.5513167098323156, 0.8982244281840045), Valid S&S: (0.5580547745496199, 0.8490747805445732)
drf Model Build progress: |███████████████████████████████████████████████| 100%
(50, 10, 100) finished!
Train AUC: 0.7960537976866242, Valid AUC: 0.7638900979738261, Train S&S: (0.5524509896

In [85]:
grid_search[np.argmax([i[2] for i in grid_search])]

[(200, 20, 10),
 0.8063652595253678,
 0.7736229099603015,
 (0.5546628353214738, 0.8979975038321294),
 (0.5584713110486307, 0.850015028554253)]

In [86]:
grid_search

[[(50, 10, None),
  0.7964105178756552,
  0.7660665893018703,
  (0.5537270544643363, 0.896978568997239),
  (0.556944010552258, 0.8502693579339206)],
 [(50, 10, 10),
  0.7970170234669708,
  0.766704565876601,
  (0.555352855549464, 0.8958951164544627),
  (0.5517720156895415, 0.8536681232803865)],
 [(50, 10, 50),
  0.7964908410998762,
  0.76384546954093,
  (0.5513167098323156, 0.8982244281840045),
  (0.5580547745496199, 0.8490747805445732)],
 [(50, 10, 100),
  0.7960537976866242,
  0.7638900979738261,
  (0.552450989659149, 0.8977349835034896),
  (0.5663507931549169, 0.8431250144505329)],
 [(50, 20, None),
  0.7953462248268284,
  0.7675644933543188,
  (0.5345860823865247, 0.9049809895236591),
  (0.5548613280572043, 0.8515332978813592)],
 [(50, 20, 10),
  0.8009010937711374,
  0.7715020612179854,
  (0.5436225116736298, 0.9032479104049265),
  (0.5656565656565656, 0.8448359575501144)],
 [(50, 20, 50),
  0.8032141693827634,
  0.7711867880988179,
  (0.546656710210409, 0.9014058186073519),
  (0.

In [96]:
results = pd.DataFrame(grid_search)
results.to_csv("tmp.csv")

## Sklearn with 1 hot


In [100]:
with open(path+"collisions_1hot.pkl", 'rb') as infile:
    df = pickle.load(infile)

In [102]:
df = df.drop(['injured', 'killed'], axis=1)

In [103]:
X_train = df.drop(['injured_or_killed'], axis=1).iloc[train_indices[0]:train_indices[1]]
y_train = df['injured_or_killed'].iloc[train_indices[0]:train_indices[1]]
X_val = df.drop(['injured_or_killed'], axis=1).iloc[val_indices[0]:val_indices[1]]
y_val = df['injured_or_killed'][val_indices[0]:val_indices[1]]
X_test = df.drop(['injured_or_killed'], axis=1).iloc[test_indices[0]:test_indices[1]]
y_test = df['injured_or_killed'][test_indices[0]:test_indices[1]]

In [None]:
rf = RandomForestClassifier(n_estimators=i[0], max_depth=20, min_samples_leaf=10)
rf.fit(X_train, y_train)

In [98]:
sk_grid_search = []

In [108]:
n_trees = [50, 100, 200]
max_depth = [10, 20, 50, 100]
min_rows = [1, 10, 50, 100]

for i in itertools.product(n_trees,max_depth,min_rows):
    
    rf = RandomForestClassifier(n_estimators=i[0], max_depth=i[1], min_samples_leaf=i[2])
    rf.fit(X_train, y_train)
    
    #Train results
    t_predictions = rf.predict(X_train)
    t_auc = metrics.roc_auc_score(y_train, t_predictions)
    t_sens = y_train.astype(int).dot(t_predictions.astype(int))/np.sum(y_train)
    t_spec = (y_train == 0).astype(int).dot((t_predictions==0).astype(int))/np.sum((y_train == 0))
    
    #Val results
    v_predictions = rf.predict(X_test)
    v_auc = metrics.roc_auc_score(y_test, v_predictions)
    v_sens = y_test.astype(int).dot(v_predictions.astype(int))/np.sum(y_test)
    v_spec = (y_test == 0).astype(int).dot((v_predictions==0).astype(int))/np.sum((y_test == 0))
    
    results = [i,\
                t_auc,\
                v_auc,\
                (t_sens, t_spec),\
                (v_sens, v_spec)]
    
    
    print ("%s finished!" % (str(i)))
    print ("Train AUC: %s, Valid AUC: %s, Train S&S: %s, Valid S&S: %s" %\
           (results[1], results[2], results[3], results[4]))
    
    sk_grid_search.append(results)

(50, 10, 1) finished!
Train AUC: 0.557355989745, Valid AUC: 0.575971714236, Train S&S: (0.12092482187000145, 0.99378715761908132), Valid S&S: (0.1660598179453836, 0.98588361052598483)
(50, 10, 10) finished!
Train AUC: 0.545554210649, Valid AUC: 0.569975602201, Train S&S: (0.097794580970384376, 0.99331384032840253), Valid S&S: (0.15513654096228868, 0.9848146634388657)
(50, 10, 50) finished!
Train AUC: 0.562150011028, Valid AUC: 0.587469687239, Train S&S: (0.13322669768794532, 0.99107332436832052), Valid S&S: (0.1936931079323797, 0.98124626654510017)
(50, 10, 100) finished!
Train AUC: 0.540441178842, Valid AUC: 0.560432125928, Train S&S: (0.085987106781057629, 0.99489525090239928), Valid S&S: (0.13211963589076722, 0.98874461596503915)
(50, 20, 1) finished!
Train AUC: 0.675076714515, Valid AUC: 0.661946506815, Train S&S: (0.36010857447530414, 0.99004485455446245), Valid S&S: (0.36300390117035108, 0.96088911245952147)
(50, 20, 10) finished!
Train AUC: 0.637940961162, Valid AUC: 0.658604423

In [110]:
results = pd.DataFrame(sk_grid_search)
results.to_csv("tmp.csv")

In [None]:
fig = plt.figure(figsize=(14, 10))

num_features = 30

y_pos = np.arange(num_features)

plt.bar(y_pos, rf.varimp(True)['percentage'][:num_features])
plt.xticks(y_pos, rf.varimp(True)['variable'][:num_features])
fig.autofmt_xdate()