In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import itertools

%matplotlib inline

In [2]:
# Base directory prepended to every data file name read in this notebook.
# Each collaborator keeps their own location here and un-comments the one that applies.
#path = "../../../Google Drive/Data_science/NYU/Machine Learning/ML Project (Collisions)/" #Joe
path = "../../../../Google Drive/ML Project (Collisions)/" # Joyce
# path = "" # Lucas

In [None]:
# Load the pickled collisions table into `df` (presumably the one-hot encoded
# version, going by the file name — confirm).
# NOTE(review): pickle.load can execute arbitrary code stored in the file, so only
# load a pickle that the team produced itself.
with open(path+"collisions_1hot.pkl", 'rb') as infile:
    df = pickle.load(infile)

In [None]:
# Put the rows in 'date_time' order; the train/test split further down slices
# this frame by row position.
df = df.sort_values(by='date_time')

In [None]:
# Replace each timestamp column with "minutes since midnight" and impute gaps.
for column in ['date_time', 'rise_time', 'set_time']:
    timestamps = pd.to_datetime(df[column])
    minutes = timestamps.dt.hour * 60 + timestamps.dt.minute

    missing = minutes.isnull()
    if missing.any():
        # Keep an indicator column so the rows that were imputed stay identifiable.
        df[column + '_nan'] = missing
        # Fill on the Series and assign the result back: calling
        # df[column].fillna(..., inplace=True) operates on a temporary object and
        # does not reliably update df (it warns in pandas 2.x and is a no-op
        # under Copy-on-Write).
        minutes = minutes.fillna(minutes.mean())

    df[column] = minutes

In [None]:
# Remove the 'injured' and 'killed' columns so that only 'injured_or_killed'
# remains as an outcome column (presumably they would leak the target — confirm).
outcome_detail_columns = ['injured', 'killed']
df = df.drop(outcome_detail_columns, axis=1)

In [None]:
# Number of leading rows (80% of the frame, rounded down) used for training.
n_rows = len(df)
train_size = int(np.floor(0.8 * n_rows))

In [None]:
# Positional split: the first `train_size` rows train the model, the rest test it.
target = 'injured_or_killed'
features = df.drop([target], axis=1)

X_train, X_test = features.iloc[:train_size], features.iloc[train_size:]
y_train, y_test = df[target].iloc[:train_size], df[target].iloc[train_size:]

In [None]:
# Random forest with 50 trees; the split/leaf minimums limit how small a node may get.
# random_state pins the bootstrap and feature sampling so the scores and the
# importance plot are reproducible across runs (same seed value as the H2O models
# in this notebook).
rf = RandomForestClassifier(
    n_estimators=50,
    min_samples_split=100,
    min_samples_leaf=10,
    random_state=1000000)

In [None]:
# Fit the forest on the training rows.
rf.fit(X_train, y_train)

In [None]:
# Percentage of test rows whose target is not set, i.e. the accuracy obtained by
# always predicting "no injury or death" (assumes a 0/1 or boolean target — confirm).
positive_share = np.sum(y_test) / len(y_test)
100 - positive_share * 100

In [None]:
# Score of the fitted forest on the held-out rows, expressed as a percentage.
100 * rf.score(X_test, y_test)

In [None]:
# Predicted label for every test row.
predictions = rf.predict(X_test)

In [None]:
# Sum of the predicted labels, i.e. how many test rows were predicted positive
# when labels are 0/1 or boolean.
predictions.sum()

In [None]:
# Rows that are both actually positive and predicted positive, divided by all
# actually positive rows, as a percentage.
true_positives = y_test.astype(int).dot(predictions.astype(int))
actual_positives = np.sum(y_test)
print (true_positives / actual_positives * 100)

In [None]:
# Bar chart of the most important features of the scikit-learn forest.
fig = plt.figure(figsize=(14, 10))

# A labelled Series keeps the scores numeric and tied to their column names,
# instead of stacking floats and strings into one object array.
importances = pd.Series(rf.feature_importances_, index=X_train.columns)

# nlargest returns at most the requested number of entries, sorted descending, so
# the bar positions below always match the number of bars even when the frame
# has fewer than 30 features.
topX = importances.nlargest(30)
num_features = len(topX)

y_pos = np.arange(num_features)

plt.bar(y_pos, topX.values)
plt.xticks(y_pos, topX.index)
plt.ylabel("feature importance")
plt.title("Top %d features (scikit-learn random forest)" % num_features)
fig.autofmt_xdate()

In [3]:
import h2o
from h2o.estimators.random_forest import H2ORandomForestEstimator

In [4]:
# Start (or attach to) an H2O server on this machine, asking for 4 GB of memory.
h2o.init(
    ip="127.0.0.1",
    max_mem_size_GB=4,
)

Checking whether there is an H2O instance running at http://127.0.0.1:54321..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.8.0_121"; Java(TM) SE Runtime Environment (build 1.8.0_121-b13); Java HotSpot(TM) 64-Bit Server VM (build 25.121-b13, mixed mode)
  Starting server from /anaconda/envs/py35/lib/python3.5/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/3t/zdqb2kkj5jd6fkwqk5nxzn1c0000gn/T/tmp0kfplbhf
  JVM stdout: /var/folders/3t/zdqb2kkj5jd6fkwqk5nxzn1c0000gn/T/tmp0kfplbhf/h2o_joycewu_started_from_python.out
  JVM stderr: /var/folders/3t/zdqb2kkj5jd6fkwqk5nxzn1c0000gn/T/tmp0kfplbhf/h2o_joycewu_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.


0,1
H2O cluster uptime:,05 secs
H2O cluster version:,3.10.4.3
H2O cluster version age:,11 days
H2O cluster name:,H2O_from_python_joycewu_aayuou
H2O cluster total nodes:,1
H2O cluster free memory:,3.556 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4
H2O cluster status:,"accepting new members, healthy"
H2O connection url:,http://127.0.0.1:54321


In [5]:
# Connect to the H2O server on the default local address.
# NOTE(review): the recorded output of h2o.init already reports a successful
# connection to the same cluster, so this call is presumably redundant — confirm.
h2o.connect()

Connecting to H2O server at http://localhost:54321... successful.


0,1
H2O cluster uptime:,05 secs
H2O cluster version:,3.10.4.3
H2O cluster version age:,11 days
H2O cluster name:,H2O_from_python_joycewu_aayuou
H2O cluster total nodes:,1
H2O cluster free memory:,3.556 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4
H2O cluster status:,"locked, healthy"
H2O connection url:,http://localhost:54321


<H2OConnection to http://localhost:54321, session _sid_9f3a>

In [10]:
# Push the CSV (presumably the non-one-hot version of the data, going by its
# name — confirm) to the H2O cluster and keep a handle to the parsed frame.
csv_path = path + "no1hot.csv"
dataFrame = h2o.upload_file(path=csv_path)

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [11]:
# Columns removed before modelling: the missing-value indicator columns and 'C1'
# (presumably the unnamed index column written by pandas — confirm).
unused_columns = [
    'vehicle_type_code_1_nan',
    'vehicle_type_code_2_nan',
    'vehicle_type_code_3_nan',
    'vehicle_type_code_4_nan',
    'vehicle_type_code_5_nan',
    'conditions_nan',
    'borough_nan',
    'C1',
]
dataFrame = dataFrame.drop(unused_columns)

In [16]:
# Row count of the H2O frame and the size of its leading 80% training slice.
# NOTE: this rebinds `train_size`, which an earlier cell computed for the pandas frame.
rows = dataFrame.nrows
train_size = int(np.floor(rows * 0.8))

In [17]:
# Positional 80/20 split of the H2O frame: first `train_size` rows train, the rest test.
# Row slicing is used instead of head()/tail(): those are preview helpers whose
# `cols` argument defaults to 200, so a wider frame would silently lose columns.
# NOTE(review): this relies on the CSV rows already being in the intended
# (presumably chronological) order — confirm.
train_df = dataFrame[0:train_size, :]
test_df = dataFrame[train_size:rows, :]

In [None]:
# Settings for the first H2O random forest: up to 200 trees, a fixed seed, and
# stopping_rounds=2 (presumably early stopping after two non-improving scoring
# rounds — confirm against the H2O docs).
rf_v1_params = dict(
    model_id="rf_v1",
    ntrees=200,
    stopping_rounds=2,
    seed=1000000,
)
rf_v1 = H2ORandomForestEstimator(**rf_v1_params)

In [None]:
# Every column except the target is a predictor; the test frame is passed as the
# validation frame.
predictors = train_df.drop('injured_or_killed').columns
rf_v1.train(
    x=predictors,
    y='injured_or_killed',
    training_frame=train_df,
    validation_frame=test_df,
)

In [None]:
# Variable importances of rf_v1. The positional True is presumably `use_pandas`:
# a later cell indexes the result by the 'percentage' and 'variable' columns.
rf_v1.varimp(True)

In [None]:
# Bar chart of the most important variables of the H2O forest.
fig = plt.figure(figsize=(14, 10))

# Fetch the importance table once instead of once per plotted column.
varimp_table = rf_v1.varimp(True)

# Never ask for more bars than there are variables; with a fixed 30 the bar
# positions and bar heights have different lengths and plt.bar raises.
num_features = min(30, len(varimp_table))

y_pos = np.arange(num_features)

plt.bar(y_pos, varimp_table['percentage'][:num_features])
plt.xticks(y_pos, varimp_table['variable'][:num_features])
fig.autofmt_xdate()

In [18]:
def score_confusion(confusion):
    """Return (sensitivity, specificity) for a 2x2 confusion matrix.

    Parameters
    ----------
    confusion : sequence of two sequences
        Counts indexed as confusion[actual][predicted], with index 0 the
        negative class and index 1 the positive class.

    Returns
    -------
    tuple of float
        sensitivity = confusion[1][1] / sum(confusion[1]) and
        specificity = confusion[0][0] / sum(confusion[0]).
        A rate is NaN when its row has no observations, instead of raising
        ZeroDivisionError.
    """
    def _rate(row, hit_index):
        # Share of the row's observations that landed in the correct column.
        total = float(np.sum(row))
        if total == 0:
            return float('nan')
        return row[hit_index] / total

    sensitivity = _rate(confusion[1], 1)
    specificity = _rate(confusion[0], 0)
    return sensitivity, specificity

In [27]:
# Accumulates one result row per trained hyper-parameter combination.
# Re-running this cell clears it; the grid loop skips combinations already stored here.
grid_search = []

In [28]:
# Hyper-parameter grid for the H2O random forest.
n_trees = [50, 100, 200]
max_depth = [10, 20, 50, 100]
min_rows = [None, 10, 50, 100]
# NOTE(review): stopping_metric is neither part of the product below nor passed
# to the estimator, so it currently has no effect on the search.
stopping_metric = ["AUTO", "AUC", "misclassification"]

# The predictor list does not depend on the hyper-parameters, so resolve it once
# rather than once per trained model.
predictors = train_df.drop('injured_or_killed').columns

for combo in itertools.product(n_trees, max_depth, min_rows):

    # Skip combinations that already have a stored result, so the cell can be
    # re-run after an interruption without retraining finished models.
    if combo in [finished[0] for finished in grid_search]:
        continue

    # NOTE(review): this rebinds `rf`, which earlier cells use for the
    # scikit-learn forest; every model also shares the model_id "rf".
    rf = H2ORandomForestEstimator(
        model_id="rf",
        ntrees=combo[0],
        max_depth=combo[1],
        min_rows=combo[2],
        stopping_rounds=2,
        seed=1000000)

    # NOTE(review): the test frame doubles as the validation frame, so the scores
    # used to compare combinations are not from untouched hold-out data.
    rf.train(predictors, 'injured_or_killed', training_frame=train_df, validation_frame=test_df)

    # Row layout: [combo, train AUC, valid AUC, train (sens, spec), valid (sens, spec)]
    results = [combo,
               rf.auc(train=True),
               rf.auc(valid=True),
               score_confusion(rf.confusion_matrix(train=True).to_list()),
               score_confusion(rf.confusion_matrix(valid=True).to_list())]
    print ("%s finished!" % (str(combo)))
    print ("Train AUC: %s, Valid AUC: %s, Train S&S: %s, Valid S&S: %s" % (results[1], results[2], results[3], results[4]))

    grid_search.append(results)

        

drf Model Build progress: |███████████████████████████████████████████████| 100%
(50, 10, None, 'AUTO') finished!
Train AUC: 0.7945923573162722, Valid AUC: 0.681387793239885, Train S&S: (0.5562727242403015, 0.894964003600612), Valid S&S: (0.5838912694161756, 0.7090297365128946)
drf Model Build progress: |███████████████████████████████████████████████| 100%
(50, 10, None, 'AUC') finished!
Train AUC: 0.7945923573162722, Valid AUC: 0.681387793239885, Train S&S: (0.5562727242403015, 0.894964003600612), Valid S&S: (0.5838912694161756, 0.7090297365128946)
drf Model Build progress: |███████████████████████████████████████████████| 100%
(50, 10, None, 'misclassification') finished!
Train AUC: 0.7945923573162722, Valid AUC: 0.681387793239885, Train S&S: (0.5562727242403015, 0.894964003600612), Valid S&S: (0.5838912694161756, 0.7090297365128946)
drf Model Build progress: |███████████████████████████████████████████████| 100%
(50, 10, 10, 'AUTO') finished!
Train AUC: 0.7954983950772798, Valid AU

H2OJobCancelled: Job<$03017f00000132d4ffffffff$_a4484e178aa9dac7dda2d151cfdd4f50> was cancelled by the user.