In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import itertools
from sklearn import metrics
import datetime
%matplotlib inline
import os

In [2]:
# Root of the shared project folder. Every data / model location in this notebook is
# built as `path + "<sub-path>"`, so the value must end with a slash (or be empty to
# use the current working directory). Un-comment the line for your own machine.
#path = "../../../Google Drive/Data_science/NYU/Machine Learning/ML Project (Collisions)/" #Joe
path = "../../../../Google Drive/ML Project (Collisions)/" # Joyce
# path = "" # Lucas

## Using H2O without one-hot encoding

In [None]:
# Load the collisions table used by the H2O section (presumably the version without
# one-hot encoded columns, judging by the file name).
# NOTE(review): pickle.load can execute arbitrary code - only open files from a trusted source.
with open(path+"data_for_training/v4/collisions_no1hot.pkl", 'rb') as infile:
    df_no1hot = pickle.load(infile)

In [None]:
# Order the rows by 'date_time'. The split cell that follows counts rows per date range
# and uses those counts as row positions, which only works on a date-ordered frame.
df_no1hot = df_no1hot.sort_values('date_time')

In [None]:
# Chronological train / validation / test boundaries for the date-sorted frame.
# Each tuple is (first_row, last_row) with BOTH ends inclusive:
#   train: date_time <  2015-09-12
#   val:   2015-09-12 <= date_time < 2016-07-31
#   test:  date_time >= 2016-07-31
# Python-style slices exclude their end value, so code that slices with these
# tuples has to use last_row + 1 as the slice end.
view_date = pd.to_datetime(df_no1hot['date_time'])

# The cut-offs are pd.Timestamp values (midnight of the given day) rather than
# datetime.date objects: recent pandas releases refuse to order-compare
# datetime64 data with datetime.date.
val_start = pd.Timestamp(2015, 9, 12)
test_start = pd.Timestamp(2016, 7, 31)

n_train = np.sum(view_date < val_start)
n_val = np.sum((view_date >= val_start) & (view_date < test_start))
n_test = np.sum(view_date >= test_start)

train_indices = (0, n_train - 1)
val_indices = (train_indices[1] + 1, train_indices[1] + n_val)
test_indices = (val_indices[1] + 1, val_indices[1] + n_test)

In [None]:
# Write the sorted frame to CSV; this file is what gets uploaded to H2O further down.
# to_csv is called without index=False, so the pandas index is written as an extra first column.
# NOTE(review): the pickle is read from the v4 folder but this CSV goes to v2 - confirm that is intended.
df_no1hot.to_csv(path+"data_for_training/v2/no1hot.csv")

In [None]:
import h2o
from h2o.estimators.random_forest import H2ORandomForestEstimator

In [None]:
# Initialise H2O against the local address 127.0.0.1 with a 4 GB memory setting.
# NOTE(review): newer h2o releases may expect `max_mem_size` instead of `max_mem_size_GB` - verify against the installed version.
h2o.init(ip="127.0.0.1",max_mem_size_GB = 4)

In [None]:
# NOTE(review): h2o.init() in the previous cell presumably already leaves the session
# connected, so this call may be redundant - confirm before removing.
h2o.connect()

In [None]:
# Upload the CSV exported earlier in this notebook to H2O and register it under the key "no1hot".
dataFrame = h2o.upload_file(path=path+"data_for_training/v2/no1hot.csv", destination_frame="no1hot")

In [None]:
# Remove the 'injured' and 'killed' columns and 'C1' before modelling; the response
# used for training is 'injured_or_killed'.
# 'C1' is presumably the name H2O assigns to the unnamed index column of the CSV - TODO confirm.
# NOTE(review): 'date_time' is not dropped, so it remains one of the predictors passed
# to the random forest below - confirm that is intended.
dataFrame = dataFrame.drop(['injured', 'killed', 'C1'])

In [None]:
# Cut the H2O frame into chronological train / validation / test frames.
# Each *_indices tuple holds an inclusive (first_row, last_row) pair, while a slice
# stops just before its end value, so the end is bumped by one. Without the +1 the
# last row of every split was silently left out of all three frames.
train_df = dataFrame[train_indices[0]:train_indices[1] + 1, :]
val_df = dataFrame[val_indices[0]:val_indices[1] + 1, :]
test_df = dataFrame[test_indices[0]:test_indices[1] + 1, :]

In [None]:
def score_confusion(confusion):
    """Return (sensitivity, specificity) from a 2x2 confusion matrix.

    Parameters
    ----------
    confusion : sequence of two sequences of numbers
        Row 0 is read as the negative class and row 1 as the positive class;
        the diagonal holds the correctly classified counts
        (confusion[0][0] true negatives, confusion[1][1] true positives).

    Returns
    -------
    tuple of float
        (sensitivity, specificity). A rate whose row sums to zero is reported
        as NaN instead of raising ZeroDivisionError, so one degenerate model
        does not abort a whole grid search.
    """
    def _diagonal_share(row, hits):
        # Fraction of the row total that was classified correctly.
        total = float(np.sum(row))
        return hits / total if total else float('nan')

    sensitivity = _diagonal_share(confusion[1], confusion[1][1])
    specificity = _diagonal_share(confusion[0], confusion[0][0])
    return sensitivity, specificity

In [None]:
# Collects one entry per fitted H2O model:
# [params, train AUC, valid AUC, train (sens, spec), valid (sens, spec)].
# Re-running this cell empties the list, so the grid-search cell will refit every combination.
grid_search = []

In [None]:
n_trees = [100, 200, 400]
max_depth = [20, 30, 50, 100]
min_rows = [5, 10, 50, 100]

# Parameter tuples already present in grid_search (e.g. from an interrupted run of
# this cell); skipping them lets the search resume instead of starting over.
# The comprehension uses its own variable name: the original reused `i`, which on
# Python 2 overwrites the loop variable, and it rebuilt the list on every iteration.
finished = set(entry[0] for entry in grid_search)

# Every column except the response is used as a predictor (loop-invariant).
predictors = train_df.drop('injured_or_killed').columns

for i in itertools.product(n_trees, max_depth, min_rows):

    if i in finished:
        continue

    # i = (ntrees, max_depth, min_rows)
    rf = H2ORandomForestEstimator(
        model_id = "rf",
        ntrees=i[0],
        max_depth = i[1],
        min_rows = i[2],
        stopping_rounds=2)

    rf.train(predictors, 'injured_or_killed',
             training_frame=train_df, validation_frame=val_df)

    # [params, train AUC, valid AUC, train (sens, spec), valid (sens, spec)]
    results = [i,
               rf.auc(train=True),
               rf.auc(valid=True),
               score_confusion(rf.confusion_matrix(train=True).to_list()),
               score_confusion(rf.confusion_matrix(valid=True).to_list())]
    print ("%s finished!" % (str(i)))
    print ("Train AUC: %s, Valid AUC: %s, Train S&S: %s, Valid S&S: %s" % (results[1], results[2], results[3], results[4]))

    grid_search.append(results)
    finished.add(i)
        

In [None]:
# Turn the collected H2O grid-search entries into a table (one row per entry) and save it.
# NOTE(review): the sklearn section further down also writes to "tmp.csv" and will
# overwrite this file - consider a distinct file name.
# NOTE(review): `results` is also the name of the per-model list inside the grid-search loop.
results = pd.DataFrame(grid_search)
results.to_csv("tmp.csv")

## scikit-learn with one-hot encoding


In [6]:
target_variable = 'injured_or_killed'

# Feature names = every column of the pickled one-hot frame except the target.
# The file is opened in a `with` block so the handle is closed as soon as the
# frame has been read (the original never closed it explicitly).
# NOTE(review): pickle.load can execute arbitrary code - only open trusted files.
with open(path+'data_for_training/v4/collisions_1hot.pkl', 'rb') as infile:
    column_names = [name for name in pickle.load(infile).columns.values if name != target_variable]

In [None]:
# Collects one entry per fitted sklearn model:
# [params, train AUC, valid AUC, (train sens, train spec), (valid sens, valid spec)].
sk_grid_search = []

In [None]:
n_trees = [200] #[50, 100, 200]
max_depth = [50]#[10, 20, 50, 100]
min_rows = [50] #[1, 10, 50, 100]

# NOTE: X_train / y_train / X_val / y_val are created in a cell further down this
# notebook; that cell has to be run before this one.


def _sens_spec(y_true, y_pred):
    """Return (sensitivity, specificity) for binary labels coded 0/1.

    Both arguments are flattened with np.ravel, so a Series, a single-column
    DataFrame or a plain array are all accepted. (The original indexed
    y['injured_or_killed'], which fails when y is already a Series.)
    The denominators are floats so the result is a true ratio on Python 2 as well.
    """
    actual = np.ravel(y_true).astype(int)
    predicted = np.ravel(y_pred).astype(int)
    positives = actual == 1
    negatives = actual == 0
    sensitivity = np.sum(positives & (predicted == 1)) / float(np.sum(positives))
    specificity = np.sum(negatives & (predicted == 0)) / float(np.sum(negatives))
    return sensitivity, specificity


for i in itertools.product(n_trees, max_depth, min_rows):

    # i = (n_estimators, max_depth, min_samples_leaf)
    rf = RandomForestClassifier(n_estimators=i[0], max_depth=i[1], min_samples_leaf=i[2])
    rf.fit(X_train, y_train)

    # Train results
    t_predictions = rf.predict(X_train)
    # Column 1 of predict_proba, i.e. the second entry of rf.classes_
    # (presumably the positive class - TODO confirm). Slicing replaces the
    # original `[i[1] for i in ...]`, which reused the loop variable name.
    t_predictions_prob = rf.predict_proba(X_train)[:, 1]
    t_auc = metrics.roc_auc_score(y_train, t_predictions_prob)
    t_sens, t_spec = _sens_spec(y_train, t_predictions)

    # Val results
    v_predictions = rf.predict(X_val)
    v_predictions_prob = rf.predict_proba(X_val)[:, 1]
    v_auc = metrics.roc_auc_score(y_val, v_predictions_prob)
    v_sens, v_spec = _sens_spec(y_val, v_predictions)

    results = [i,
               t_auc,
               v_auc,
               (t_sens, t_spec),
               (v_sens, v_spec)]

    print ("%s\t%s\t%s\t%s\t%s" % (str(i), results[1], results[2], results[3], results[4]))

    sk_grid_search.append(results)

In [None]:
# Turn the collected sklearn grid-search entries into a table and save it.
# NOTE(review): this writes to the same "tmp.csv" as the H2O section above and
# therefore overwrites those results - consider a distinct file name.
results = pd.DataFrame(sk_grid_search)
results.to_csv("tmp.csv")

In [None]:
# Fit one H2O random forest with ntrees=200, max_depth=50, min_rows=50 on the
# training frame, validated on val_df; the next cell plots its variable importances.
# All columns of train_df except 'injured_or_killed' are passed as predictors.
# NOTE(review): this rebinds `rf`, which the sklearn section above also uses.
rf = H2ORandomForestEstimator(
model_id = "rf",
ntrees=200,
max_depth = 50,
min_rows = 50,
stopping_rounds=2)

rf.train(train_df.drop('injured_or_killed').columns, 'injured_or_killed',\
         training_frame=train_df, validation_frame=val_df)

In [None]:
# Bar chart of the leading variable importances of the H2O forest held in `rf`.
fig = plt.figure(figsize=(14, 10))

# Fetch the importance table once and reuse it for bar heights and tick labels
# (the original called rf.varimp(True) twice).
varimp = rf.varimp(True)

# Show at most 30 variables, but never more than the model actually has;
# a fixed 30 makes plt.bar fail on a length mismatch for smaller models.
num_features = min(30, len(varimp))

y_pos = np.arange(num_features)

plt.bar(y_pos, varimp['percentage'][:num_features])
plt.xticks(y_pos, varimp['variable'][:num_features])
plt.ylabel("percentage")
plt.title("H2O Random Forest - Variable importances")
fig.autofmt_xdate()

In [None]:
# Fit a single sklearn random forest with 200 trees, max_depth=50, min_samples_leaf=50.
# NOTE(review): no random_state is passed, so repeated fits are not reproducible.
# NOTE(review): X_train / y_train are only defined in a cell further down; on a fresh
# kernel this cell fails unless that cell has been run first.
# NOTE(review): this rebinds `rf`, replacing the H2O estimator created above.
rf = RandomForestClassifier(n_estimators=200, max_depth=50, min_samples_leaf=50)
rf.fit(X_train, y_train)

In [3]:
# NOTE(review): pickle.load can execute arbitrary code - only open trusted files.
with open(path+"data_for_training/v4/collisions_1hot.pkl", 'rb') as infile:
    df = pickle.load(infile)

# The boundaries below are row POSITIONS derived from per-date-range row counts,
# so the frame has to be in date order, exactly as is done for df_no1hot in the
# H2O section. A stable sort keeps rows that share a timestamp in their stored
# order and leaves an already-sorted frame unchanged.
df = df.sort_values('date_time', kind='mergesort')

view_date = pd.to_datetime(df['date_time'])

# pd.Timestamp cut-offs (midnight of the given day) instead of datetime.date:
# recent pandas releases refuse to order-compare datetime64 data with datetime.date.
val_start = pd.Timestamp(2015, 9, 12)
test_start = pd.Timestamp(2016, 7, 31)

n_train = np.sum(view_date < val_start)
n_val = np.sum((view_date >= val_start) & (view_date < test_start))
n_test = np.sum(view_date >= test_start)

# Each tuple is (first_row, last_row), both ends inclusive.
train_indices = (0, n_train - 1)
val_indices = (train_indices[1] + 1, train_indices[1] + n_val)
test_indices = (val_indices[1] + 1, val_indices[1] + n_test)

In [4]:
def _features_and_target(frame, bounds):
    """Return (X, y) for the rows bounds[0]..bounds[1] of `frame`, both ends inclusive.

    The *_indices tuples store an inclusive last row, whereas .iloc slicing
    excludes its end, hence the +1. Without it the final row of every split
    was dropped from train, validation and test alike.
    """
    rows = frame.iloc[bounds[0]:bounds[1] + 1]
    return rows.drop(['injured_or_killed'], axis=1), rows['injured_or_killed']


X_train, y_train = _features_and_target(df, train_indices)
X_val, y_val = _features_and_target(df, val_indices)
X_test, y_test = _features_and_target(df, test_indices)

In [None]:
# Saved sklearn models, keyed by the file name they were loaded from.
# NOTE(review): this rebinds `rf` from an estimator to a dict; later cells index it by file name.
rf = {}

title_map = {'all_None.pkl': 'all collisions',
            'bike_None.pkl': 'collisions involving a bicycle',
            'one_None.pkl': 'collisions with one vehicle (no bicycles)',
            'multi_None.pkl': 'collisions with multiple vehicles (no bicycles)'}

# NOTE(review): pickle.load can execute arbitrary code - only load model files you trust.
for file in ['all_None.pkl', 'bike_None.pkl', 'one_None.pkl', 'multi_None.pkl']:
    with open(os.path.join(path+"models/", file), 'rb') as infile:
        rf[file] = pickle.load(infile)

    # Rank the features by importance and keep the top 20 (fewer if the model has fewer).
    # pd.Series raises if column_names and feature_importances_ differ in length.
    importances = pd.Series(rf[file].feature_importances_, index=column_names)
    topX = importances.sort_values(ascending = False)[:20]

    # Bar positions follow what was actually selected rather than a fixed count,
    # so the bar and tick arrays can never disagree in length.
    num_features = len(topX)
    y_pos = np.arange(num_features)

    fig = plt.figure(figsize=(14, 10))
    plt.bar(y_pos, topX)
    plt.xticks(y_pos, topX.index)
    plt.ylabel("Feature importance")
    plt.title("Random Forest - Feature importances for model using {0}".format(title_map[file]))
    fig.autofmt_xdate()

In [13]:
# Save each model's validation-set scores (column 1 of predict_proba) for the ROC plots.
for model in ['all_None.pkl', 'all_balanced.pkl']:
    # 'all_balanced.pkl' is not among the files loaded by the feature-importance
    # cell, so rf[model] would raise KeyError; load any missing model on demand.
    # Assumes it sits in the same "models/" folder as the others - TODO confirm.
    # NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
    if model not in rf:
        with open(os.path.join(path+"models/", model), 'rb') as infile:
            rf[model] = pickle.load(infile)

    val_scores = rf[model].predict_proba(X_val)[:,1]

    # `with` closes the output file, so the data is flushed before anything reads it back.
    with open(path+"ROC curve/{0}".format(model), 'wb') as outfile:
        pickle.dump(val_scores, outfile, pickle.HIGHEST_PROTOCOL)

In [None]:
# treeinterpreter is a third-party package, imported here rather than in the imports cell at the top.
from treeinterpreter import treeinterpreter as ti

# Run treeinterpreter on the 'all_None.pkl' model over the whole validation set.
# The call returns three values, unpacked as prediction, bias and contributions
# (presumably prediction = bias + sum of per-feature contributions - see the treeinterpreter docs).
# NOTE(review): `rf` must be the dict of loaded models at this point, not a single estimator.
prediction, bias, contributions = ti.predict(rf['all_None.pkl'], X_val)