In [26]:
import numpy as np
import pandas as pd
import pickle
from sklearn import naive_bayes
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import itertools
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
%matplotlib inline
import datetime

In [27]:
# Directory prefix for every pickle loaded below. Each collaborator keeps
# their own location here and leaves the others commented out.
#path = "../../../Google Drive/Data_science/NYU/Machine Learning/ML Project (Collisions)/data_for_training/v2/" #Joe
path = "../../../../Google Drive/ML Project (Collisions)/data_for_training/v4/" # Joyce
# path = "" # Lucas

## Load train, val, test split

In [28]:
target_variable = 'injured_or_killed'


def _load_pickle(relative_path):
    """Unpickle ``path + relative_path`` and close the file handle afterwards.

    NOTE(review): ``pickle.load`` can execute arbitrary code, so only point
    this at files produced by the project's own preprocessing.
    """
    with open(path + relative_path, 'rb') as handle:
        return pickle.load(handle)


# Feature names come from the one-hot frame, minus the target column.
column_names = [i for i in _load_pickle('collisions_1hot.pkl').columns.values if i != target_variable]

# Wrap each stored split in a DataFrame so columns can be addressed by name.
X_train = pd.DataFrame(_load_pickle('normalized_1hot/train_X.pkl'), columns=column_names)
y_train = pd.DataFrame(_load_pickle('normalized_1hot/train_y.pkl'), columns=[target_variable])
X_val = pd.DataFrame(_load_pickle('normalized_1hot/val_X.pkl'), columns=column_names)
y_val = pd.DataFrame(_load_pickle('normalized_1hot/val_y.pkl'), columns=[target_variable])
X_test = pd.DataFrame(_load_pickle('normalized_1hot/test_X.pkl'), columns=column_names)
y_test = pd.DataFrame(_load_pickle('normalized_1hot/test_y.pkl'), columns=[target_variable])

In [29]:
# Number of feature columns (column_names leaves out the target).
len(column_names)

242

## Train a multinomial NB on the categorical variables and a Gaussian/Bernoulli NB on the numeric variables

In [10]:
# Labels of the X_train columns that pandas treats as numeric.
# NOTE(review): `_get_numeric_data` is a private pandas method; `select_dtypes`
# is the public alternative -- confirm it picks the same columns before switching.
numeric = X_train._get_numeric_data().columns.values

In [11]:
# Everything that is neither numeric nor the target is treated as categorical.
categorical = [
    col
    for col in X_train.columns.values
    if not (col in numeric or col == 'injured_or_killed')
]

In [None]:
# One unfitted estimator per Naive Bayes variant used below.
multinomial, gaussian, bernoulli = (
    naive_bayes.MultinomialNB(),
    naive_bayes.GaussianNB(),
    naive_bayes.BernoulliNB(),
)

In [None]:
# Column name -> encoder; starts empty.
label_encodings = dict()

In [None]:
# Integer-encode every categorical column of `df` in place, keeping each fitted
# encoder in `label_encodings` under the column's name.
# NOTE(review): `df` is not assigned in the cells visible here -- confirm it is
# loaded before this cell runs on a fresh kernel.
for column in categorical:
    # Missing values become the literal string 'NaN' so they get a code of their own.
    filled = df[column].fillna('NaN')
    label_encodings[column] = LabelEncoder()
    # fit_transform learns the classes and encodes in one pass over `filled`.
    df[column] = label_encodings[column].fit_transform(filled)

In [None]:
# Rebuild the train / validation / test splits from `df` by positional row range.
# Each *_indices pair is used as (start, stop) for an .iloc slice.
# NOTE(review): `train_indices`, `val_indices` and `test_indices` are not assigned
# in the cells visible here -- confirm where they come from. This cell also
# rebinds the X_*/y_* names.
features = df.drop(['injured_or_killed'], axis=1)  # drop the target once, not per split
labels = df['injured_or_killed']

# .iloc is used for every split so features and labels are sliced the same
# (positional) way regardless of the frame's index.
X_train = features.iloc[train_indices[0]:train_indices[1]]
y_train = labels.iloc[train_indices[0]:train_indices[1]]
X_val = features.iloc[val_indices[0]:val_indices[1]]
y_val = labels.iloc[val_indices[0]:val_indices[1]]
X_test = features.iloc[test_indices[0]:test_indices[1]]
y_test = labels.iloc[test_indices[0]:test_indices[1]]

In [None]:
# Fit the multinomial model on the categorical columns only.
multinomial.fit(X=X_train.loc[:, categorical], y=y_train)

In [None]:
# Fit the Gaussian model on the numeric columns only.
gaussian.fit(X=X_train.loc[:, numeric], y=y_train)

In [None]:
# Fit the Bernoulli model on the same numeric columns as the Gaussian one.
bernoulli.fit(X=X_train.loc[:, numeric], y=y_train)

In [None]:
def generate_pred(X, model1, model2, cat_cols=None, num_cols=None):
    """Score rows by multiplying two models' second-class probabilities.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing both column groups.
    model1 : fitted estimator exposing ``predict_proba``
        Applied to the ``cat_cols`` columns of ``X``.
    model2 : fitted estimator exposing ``predict_proba``
        Applied to the ``num_cols`` columns of ``X``.
    cat_cols, num_cols : sequence of column labels, optional
        Columns passed to ``model1`` / ``model2``. When omitted, the
        notebook-level ``categorical`` / ``numeric`` lists are used, which is
        what the function always did before these arguments existed.

    Returns
    -------
    numpy.ndarray
        Element-wise product of column 1 of each model's ``predict_proba``.
        The product is not renormalised, so treat it as a score rather than
        a calibrated probability.
    """
    if cat_cols is None:
        cat_cols = categorical
    if num_cols is None:
        num_cols = numeric

    # Take column 1 directly instead of looping over rows in Python.
    pred_cat = np.asarray(model1.predict_proba(X.loc[:, cat_cols]))[:, 1]
    pred_num = np.asarray(model2.predict_proba(X.loc[:, num_cols]))[:, 1]

    return np.multiply(pred_cat, pred_num)

In [None]:
# ROC AUC of the multinomial x Gaussian score on train, then validation.
t_auc, v_auc = (
    metrics.roc_auc_score(labels, generate_pred(features, multinomial, gaussian))
    for features, labels in ((X_train, y_train), (X_val, y_val))
)

print(t_auc, v_auc)

In [None]:
# ROC AUC of the multinomial x Bernoulli score on train, then validation.
t_auc, v_auc = (
    metrics.roc_auc_score(labels, generate_pred(features, multinomial, bernoulli))
    for features, labels in ((X_train, y_train), (X_val, y_val))
)

print(t_auc, v_auc)

In [None]:
# Numeric columns of `df` holding exactly two distinct values (NaN counts as a value).
binary = [col for col in numeric if df[col].nunique(dropna=False) == 2]

In [None]:
# The remaining numeric columns, in their original order.
nonbinary = list(filter(lambda col: col not in binary, numeric))

In [None]:
# Separate estimators for the non-binary and the binary numeric columns.
nonbinary_gaussian, binary_bernoulli = (
    naive_bayes.GaussianNB(),
    naive_bayes.BernoulliNB(),
)

In [None]:
# Bernoulli model sees the two-valued columns, Gaussian model the rest.
binary_bernoulli.fit(X=X_train.loc[:, binary], y=y_train)
nonbinary_gaussian.fit(X=X_train.loc[:, nonbinary], y=y_train)

In [None]:
def combine_three_pred(X):
    """Multiply the column-1 probabilities of the three notebook-level models.

    The models and their column groups are read from the enclosing namespace:
    ``multinomial`` on ``categorical``, ``binary_bernoulli`` on ``binary`` and
    ``nonbinary_gaussian`` on ``nonbinary``. Returns the element-wise product
    for the rows of ``X``.
    """
    stages = (
        (multinomial, categorical),
        (binary_bernoulli, binary),
        (nonbinary_gaussian, nonbinary),
    )

    pred_final = None
    for model, cols in stages:
        second_class = [row[1] for row in model.predict_proba(X.loc[:, cols])]
        if pred_final is None:
            pred_final = second_class
        else:
            # Accumulate left to right: (cat * bin) * nonbin.
            pred_final = np.multiply(pred_final, second_class)

    return pred_final

In [None]:
# ROC AUC of the three-model product on train, then validation.
t_auc, v_auc = (
    metrics.roc_auc_score(labels, combine_three_pred(features))
    for features, labels in ((X_train, y_train), (X_val, y_val))
)

print(t_auc, v_auc)

## One-hot encoding version

Gaussian NB

In [30]:
# Unfitted Gaussian NB for the full one-hot feature matrix.
one_hot_gaussian = naive_bayes.GaussianNB()

In [31]:
# y_train is a single-column frame here; flatten it to 1-D so scikit-learn
# does not emit its column-vector conversion warning.
one_hot_gaussian.fit(X_train, np.ravel(y_train))

  y = column_or_1d(y, warn=True)


GaussianNB(priors=None)

In [32]:
# Column index 1 of predict_proba is the score passed to ROC AUC.
one_hot_pred_train = list(one_hot_gaussian.predict_proba(X_train)[:, 1])
one_hot_pred_val = list(one_hot_gaussian.predict_proba(X_val)[:, 1])

t_auc, v_auc = (
    metrics.roc_auc_score(truth, scores)
    for truth, scores in ((y_train, one_hot_pred_train), (y_val, one_hot_pred_val))
)

print(t_auc, v_auc)

0.726189248729 0.702795185309


Bernoulli NB

In [33]:
# Unfitted Bernoulli NB for the full one-hot feature matrix.
one_hot_bernoulli = naive_bayes.BernoulliNB()

In [34]:
# y_train is a single-column frame here; flatten it to 1-D so scikit-learn
# does not emit its column-vector conversion warning.
one_hot_bernoulli.fit(X_train, np.ravel(y_train))

  y = column_or_1d(y, warn=True)


BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [35]:
# Column index 1 of predict_proba is the score passed to ROC AUC.
one_hot_pred_train = list(one_hot_bernoulli.predict_proba(X_train)[:, 1])
one_hot_pred_val = list(one_hot_bernoulli.predict_proba(X_val)[:, 1])

t_auc, v_auc = (
    metrics.roc_auc_score(truth, scores)
    for truth, scores in ((y_train, one_hot_pred_train), (y_val, one_hot_pred_val))
)

print(t_auc, v_auc)

0.735045343965 0.707063078097
