In [97]:
import pandas as pd
import numpy as np
from requests import get
from pandas import json_normalize
import time
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC

In [98]:
modeling_data = pd.read_csv('/Users/patricknaylor/Desktop/CA_traffic/Data/selected_columns.csv')

In [99]:
print(modeling_data.nunique())

latitude                           281296
longitude                          315729
statewide_vehicle_type_at_fault        15
alcohol_involved                        2
truck_collision                         2
motorcycle_collision                    2
bicycle_collision                       2
pedestrian_collision                    2
lighting                                5
road_surface                            5
hit_and_run                             3
party_count                            14
injured_victims                        27
killed_victims                          7
location_type                           3
county_location                        58
population                              8
county_location_population             58
dtype: int64


In [100]:
def absolute_maximum_norm(column):
    return column / column.abs().max()

norm_columns = ['latitude', 'longitude', 'county_location_population']

for col in norm_columns:
    modeling_data[col] = absolute_maximum_norm(modeling_data[col])


In [101]:
modeling_data = pd.get_dummies(modeling_data, columns = ['alcohol_involved', 'statewide_vehicle_type_at_fault', 'truck_collision', 'motorcycle_collision', 'bicycle_collision', 'lighting',  'population', 'county_location', 'location_type', 'hit_and_run', 'road_surface'])


In [102]:
modeling_data['mortality_rate'] = modeling_data['killed_victims']/modeling_data['party_count']
modeling_data['injury_rate'] = modeling_data['injured_victims']/modeling_data['party_count']
modeling_data['is_killed'] = modeling_data['killed_victims'] != 0
modeling_data['is_killed'] = modeling_data['is_killed'].astype(int)

In [103]:
columns_list = list(modeling_data.columns)
removals = ['injured_victims', 'killed_victims',  'mortality_rate', 'injury_rate', 'is_killed', 'latitude', 'longitude']
for rem in removals:
    columns_list.remove(rem)
model_x = np.array(modeling_data[columns_list])
model_y = np.array(modeling_data['is_killed'])

In [104]:
print(np.shape(model_y[model_y == 0]))

(686203,)


In [105]:
x_oversample, y_oversample = resample(model_x[model_y == 1], model_y[model_y == 1], replace=True, n_samples=model_x[model_y ==0].shape[0])
X = np.vstack((model_x[model_y == 0], x_oversample))
y = np.hstack((model_y[model_y == 0], y_oversample))

In [106]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

In [107]:
rfc = RandomForestClassifier()
parameters = {'n_estimators': [50, 100, 500], 'max_depth': [2, 5, 10, 30], 'min_samples_split': [2, 5, 10]}
clf = GridSearchCV(rfc, parameters, verbose=3)
clf.fit(X_train, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV 1/5] END max_depth=2, min_samples_split=2, n_estimators=50;, score=0.743 total time=   7.5s
[CV 2/5] END max_depth=2, min_samples_split=2, n_estimators=50;, score=0.729 total time=   6.9s
[CV 3/5] END max_depth=2, min_samples_split=2, n_estimators=50;, score=0.744 total time=   6.8s
[CV 4/5] END max_depth=2, min_samples_split=2, n_estimators=50;, score=0.756 total time=   6.8s
[CV 5/5] END max_depth=2, min_samples_split=2, n_estimators=50;, score=0.737 total time=   6.9s
[CV 1/5] END max_depth=2, min_samples_split=2, n_estimators=100;, score=0.747 total time=  13.3s
[CV 2/5] END max_depth=2, min_samples_split=2, n_estimators=100;, score=0.743 total time=  13.5s
[CV 3/5] END max_depth=2, min_samples_split=2, n_estimators=100;, score=0.741 total time=  13.3s
[CV 4/5] END max_depth=2, min_samples_split=2, n_estimators=100;, score=0.753 total time=  13.5s
[CV 5/5] END max_depth=2, min_samples_split=2, n_estimators=100;, scor

[CV 5/5] END max_depth=5, min_samples_split=10, n_estimators=100;, score=0.772 total time=  23.5s
[CV 1/5] END max_depth=5, min_samples_split=10, n_estimators=500;, score=0.775 total time= 2.0min
[CV 2/5] END max_depth=5, min_samples_split=10, n_estimators=500;, score=0.777 total time= 2.0min
[CV 3/5] END max_depth=5, min_samples_split=10, n_estimators=500;, score=0.777 total time= 2.0min
[CV 4/5] END max_depth=5, min_samples_split=10, n_estimators=500;, score=0.776 total time= 2.0min
[CV 5/5] END max_depth=5, min_samples_split=10, n_estimators=500;, score=0.775 total time= 2.0min
[CV 1/5] END max_depth=10, min_samples_split=2, n_estimators=50;, score=0.789 total time=  19.3s
[CV 2/5] END max_depth=10, min_samples_split=2, n_estimators=50;, score=0.792 total time=  19.1s
[CV 3/5] END max_depth=10, min_samples_split=2, n_estimators=50;, score=0.791 total time=  19.4s
[CV 4/5] END max_depth=10, min_samples_split=2, n_estimators=50;, score=0.794 total time=  19.5s
[CV 5/5] END max_depth=1

[CV 4/5] END max_depth=30, min_samples_split=10, n_estimators=50;, score=0.881 total time=  30.9s
[CV 5/5] END max_depth=30, min_samples_split=10, n_estimators=50;, score=0.879 total time=  31.5s
[CV 1/5] END max_depth=30, min_samples_split=10, n_estimators=100;, score=0.878 total time= 1.0min
[CV 2/5] END max_depth=30, min_samples_split=10, n_estimators=100;, score=0.880 total time= 1.0min
[CV 3/5] END max_depth=30, min_samples_split=10, n_estimators=100;, score=0.879 total time= 1.0min
[CV 4/5] END max_depth=30, min_samples_split=10, n_estimators=100;, score=0.881 total time= 1.0min
[CV 5/5] END max_depth=30, min_samples_split=10, n_estimators=100;, score=0.879 total time= 1.0min
[CV 1/5] END max_depth=30, min_samples_split=10, n_estimators=500;, score=0.878 total time= 4.9min
[CV 2/5] END max_depth=30, min_samples_split=10, n_estimators=500;, score=0.880 total time= 4.9min
[CV 3/5] END max_depth=30, min_samples_split=10, n_estimators=500;, score=0.879 total time= 4.9min
[CV 4/5] END

In [108]:
print(clf.best_params_)

{'max_depth': 30, 'min_samples_split': 2, 'n_estimators': 500}


In [109]:

pip = Pipeline([
  ('feature_selection', SelectFromModel(LinearSVC(max_iter = 5000))),
  ('classification', RandomForestClassifier(**clf.best_params_))
])
pip.fit(X_train, y_train)



In [110]:
pred_train = pip.predict(X_train)
pred_test = pip.predict(X_test)
print(classification_report(y_train, pred_train))
print(classification_report(y_test, pred_test))
print(1-np.mean(y_test))

              precision    recall  f1-score   support

           0       0.80      0.77      0.78    460036
           1       0.78      0.80      0.79    459476

    accuracy                           0.79    919512
   macro avg       0.79      0.79      0.79    919512
weighted avg       0.79      0.79      0.79    919512

              precision    recall  f1-score   support

           0       0.80      0.77      0.78    226167
           1       0.78      0.80      0.79    226727

    accuracy                           0.79    452894
   macro avg       0.79      0.79      0.79    452894
weighted avg       0.79      0.79      0.79    452894

0.4993817537878621


In [113]:
reg = RandomForestClassifier(**clf.best_params_)
reg.fit(X_train, y_train)
pred_train = reg.predict(X_train)
pred_test = reg.predict(X_test)


In [112]:
print(classification_report(y_train, pred_train))
print(classification_report(y_test, pred_test))
print(1-np.mean(y_test))

              precision    recall  f1-score   support

           0       0.73      0.83      0.77    460036
           1       0.80      0.69      0.74    459476

    accuracy                           0.76    919512
   macro avg       0.76      0.76      0.76    919512
weighted avg       0.76      0.76      0.76    919512

              precision    recall  f1-score   support

           0       0.73      0.83      0.77    226167
           1       0.80      0.69      0.74    226727

    accuracy                           0.76    452894
   macro avg       0.76      0.76      0.76    452894
weighted avg       0.76      0.76      0.76    452894

0.4993817537878621


# TODO:
# split train and test before oversample for more accurate test