In [1]:
import xgboost as xgb
import numpy as np
import pandas as pd
import pickle as pkl
from sklearn import metrics
import matplotlib.pylab as plt
%matplotlib inline

In [46]:
path = "../../../Google Drive/Data_science/NYU/Machine Learning/ML Project (Collisions)/data_for_training/v4/normalized_1hot" #Joe
#path = "../../../../Google Drive/ML Project (Collisions)/" # Joyce
# path = "" # Lucas

In [3]:
def modelfit(alg, train_X, train_y, val_X=None, val_y=None, early_stopping_rounds=50):
    
    val_check = (not val_X is None)
    
    # Fit the algorithm on the data
    print(alg.get_params)
    alg.fit(train_X, train_y, eval_metric='auc')
        
    # Predict training set:
    dtrain_predictions = alg.predict(train_X)
    dtrain_predprob = alg.predict_proba(train_X)[:,1]

    # Predict val set:
    if val_check:
        dval_predictions = alg.predict(val_X)
        dval_predprob = alg.predict_proba(val_X)[:,1]
    
    #Print model report:
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(train_y, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(train_y, dtrain_predprob))

    if val_check:
        print("\nAccuracy : %.4g" % metrics.accuracy_score(val_y, dval_predictions))
        print("AUC Score (Val): %f" % metrics.roc_auc_score(val_y, dval_predprob))
    
    feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
    
    return alg

In [47]:
train_X = pkl.load(open(path+'/train_X.pkl','rb'))
train_y = pkl.load(open(path+'/train_y.pkl','rb'))

val_X = pkl.load(open(path+'/val_X.pkl','rb'))
val_y = pkl.load(open(path+'/val_y.pkl','rb'))

In [None]:
model = modelfit(xgb.XGBClassifier(learning_rate = 0.1, max_depth = 4, n_estimators=1000, reg_lambda = 1e2), train_X=train_X, train_y=train_y, val_X=val_X, val_y=val_y)

<bound method XGBModel.get_params of XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=4,
       min_child_weight=1, missing=None, n_estimators=1000, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=100.0,
       scale_pos_weight=1, seed=0, silent=False, subsample=1)>


In [None]:
pkl.dump(model, open('./best_xgboost_all_data.pkl','wb'))

bike = 0.583272
one = 0.650619
multi = 0.687697

In [None]:
best_model = pkl.load(open('./best_xgboost_all_data.pkl','rb'))

In [21]:
bike_model = pkl.load(open('./best_xgboost_bike_data.pkl','rb'))

In [22]:
one_model = pkl.load(open('./best_xgboost_one_data.pkl','rb'))

In [33]:
multi_model = pkl.load(open('./best_xgboost_multi_data.pkl','rb'))

In [26]:
target_variable = 'injured_or_killed'
column_names = [i for i in pkl.load(open('../../../Google Drive/Data_science/NYU/Machine Learning/ML Project (Collisions)/data_for_training/v4/'+'collisions_1hot.pkl', 'rb')).columns.values if i != target_variable]

In [27]:
foo = pd.Series(bike_model.feature_importances_, index=column_names)

In [28]:
foo.sort_values(ascending=False)[:30]

zip_code                             0.165333
lat_long                             0.093333
uber_count_0.06                      0.052667
date_time                            0.045333
uber_count_0.03                      0.044667
dist_to_closest_subway               0.040667
num_unknown                          0.038000
drag_racing_0.005                    0.036667
drag_racing_0.06                     0.035333
uber_count_0.3                       0.034000
congestion/gridlock_0.005            0.032667
time_of_day                          0.032667
day_of_week                          0.030667
chronic_stoplight_violation_0.06     0.030667
chronic_stoplight_violation_0.005    0.029333
latitude                             0.023333
set_time                             0.023333
chronic_speeding_0.06                0.022000
chronic_stoplight_violation_0.03     0.019333
longitude                            0.018667
truck_route_violation_0.005          0.018667
congestion/gridlock_0.06          

In [None]:
val_preds = [best_model.predict_proba(val_X)[:,1]]

plt.figure(figsize=(8,8))
lw = 2
for val_pred in val_preds:
    fpr, tpr, _ = metrics.roc_curve(val_y, val_pred)
    plt.plot(fpr, tpr,
         lw=lw, label='AUC = %0.2f' % metrics.auc(fpr,tpr))

plt.plot([0, 1], [0, 1], color='red', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.show()