# Prediction of Traffic Accident Risk
Team 7: 어서오십쇼HUMAN  
Editor: 유성민, 김도운

## 1. 데이터 전처리

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import warnings

In [12]:
preprocessing = pd.read_csv('../input/preprocessing.csv')

preprocessing = preprocessing.iloc[np.random.permutation(len(preprocessing))]

y = preprocessing.loc[:, 'TargetOrigin' ]
x = preprocessing.loc[:,'Location_East':]

from sklearn.model_selection import train_test_split

x = x[:1000]
y = y[:1000]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [13]:
count0, count1, count2, count3 = 0, 0, 0, 0
for i in y_test:
    if i == 0: count0 += 1
    elif i == 1: count1 += 1
    elif i == 2: count2 += 1
    else: count3 += 1
print(count0, count1, count2, count3)

1192 135 18 655


## 2. 모델 학습

# Prepare Model training and analysis

> editor: seongminyoo   
> Date of modification: 2020-05-25 04:49pm

`solving problem`
+ Classification Problem

`using model`
+ KNN
+ Decision Tree
+ Bagging
+ Random Forest
+ SVM
+ Neural Network classifier

`algorithm for each step`
1. Make pipeline
    - standard data, model
2. GridSearchCV by 10 fold or 5 fold
    - make parameter set
3. Test
    - best_esitimater is model that construct best hyper parameter set
4. Analysis result
    - use confusion matrix

# Make functions

In [14]:
# 최고 모델 만드는 함수
def make_model(model_name, model, param_grid, cv):
    try: # random_state 가능 모델
        pipe_svc = Pipeline([('scl', StandardScaler()), (model_name,model(random_state = 1))])
    except Exception: # random_state 불가능 모델
        print('error detection!')
        pipe_svc = Pipeline([('scl', StandardScaler()), (model_name,model())])
    finally:
        gs = GridSearchCV(estimator=pipe_svc, param_grid=param_grid, scoring='accuracy', cv=cv)
        gs.fit(x_train,y_train)
        best_params = gs.best_params_
        print(best_params)
        return gs

# test function
def make_test(model):
        best_model=model.best_estimator_
        y_pred = best_model.predict(x_test)
        print(classification_report(y_test,y_pred,target_names=['class 0','class 1','class 2','class 3']))

In [15]:
from concurrent.futures import as_completed, ProcessPoolExecutor
import time
import numpy as np
import winprocess

def parallel_processing(model_name, model, param_grid, cv):
    executor = ProcessPoolExecutor(max_workers=50)
    fs = winprocess.submit(executor, make_model, model_name, model, param_grid, cv)
    return fs.result()

KeyError: 'Time'

# Dummy data for Debugging

# KNN

In [5]:
# hyper parameter sets
param_range = [i for i in range(1,11)]
param_grid = [
    {'knn__n_neighbors': param_range, 'knn__metric': ['euclidean']},
    {'knn__n_neighbors': param_range, 'knn__metric': ['manhattan']}]

In [6]:
best_model = parallel_processing('knn',KNeighborsClassifier,param_grid, 5)

In [7]:
make_test(best_model)

              precision    recall  f1-score   support

     class 0       0.66      0.97      0.78       123
     class 1       0.00      0.00      0.00         4
     class 2       0.00      0.00      0.00         3
     class 3       0.68      0.19      0.29        70

    accuracy                           0.66       200
   macro avg       0.34      0.29      0.27       200
weighted avg       0.64      0.66      0.58       200



  _warn_prf(average, modifier, msg_start, len(result))


# Decision Tree

In [8]:
# hyper parameter sets
param_range = [i for i in range(3,7)]
# 1 아래의 parmeter들과 위의 k가 한번 씩 모두 들어가서 brute-force를 진행함으로써 제일 좋은 파라미터를  결과로 도출해준다.
param_grid = [
    {'dt__max_depth': param_range, 'dt__criterion': ['entropy']},
    {'dt__max_depth': param_range, 'dt__criterion': ['gini']}]

In [9]:
# Decision Tree train and test
best_model = parallel_processing('dt',tree.DecisionTreeClassifier,param_grid,5)

In [10]:
make_test(best_model)

              precision    recall  f1-score   support

     class 0       0.68      0.93      0.79       123
     class 1       0.00      0.00      0.00         4
     class 2       0.00      0.00      0.00         3
     class 3       0.68      0.30      0.42        70

    accuracy                           0.68       200
   macro avg       0.34      0.31      0.30       200
weighted avg       0.66      0.68      0.63       200



  _warn_prf(average, modifier, msg_start, len(result))


# Random Forest

In [11]:
# hyper parameter set for Random Forest
estimators = [100,250]
depth_range = [i for i in range(3,7)]
param_grid = [
    {'rf__max_depth': depth_range, 'rf__criterion': ['entropy'], 'rf__n_estimators':estimators},
    {'rf__max_depth': depth_range, 'rf__criterion': ['gini'], 'rf__n_estimators':estimators}]

In [12]:
# Random Forest train and test
best_model = parallel_processing('rf',RandomForestClassifier,param_grid,5)

In [13]:
make_test(best_model)

              precision    recall  f1-score   support

     class 0       0.67      1.00      0.80       123
     class 1       0.00      0.00      0.00         4
     class 2       0.00      0.00      0.00         3
     class 3       0.94      0.21      0.35        70

    accuracy                           0.69       200
   macro avg       0.40      0.30      0.29       200
weighted avg       0.74      0.69      0.61       200



  _warn_prf(average, modifier, msg_start, len(result))


# Bagging

In [37]:
# hyper parameter set for Bagging
estimators = [100, 250]
depth_range = [i for i in range(3,7)]
param_grid = [
    {'bg__base_estimator__max_depth': depth_range, 'bg__base_estimator__criterion': ['entropy'], 'bg__n_estimators':estimators},
    {'bg__base_estimator__max_depth': depth_range, 'bg__base_estimator__criterion': ['gini'], 'bg__n_estimators':estimators}
 ]

In [38]:
#Bagging train and test
pipe_svc = Pipeline([('scl', StandardScaler()), ('bg',BaggingClassifier(tree.DecisionTreeClassifier(random_state = 1),random_state=1))])
gs = GridSearchCV(estimator=pipe_svc, param_grid=param_grid, scoring='accuracy', cv=5,)
gs.fit(x_train,y_train)
print(gs.best_params_)
best_bg=gs.best_estimator_
y_pred = best_bg.predict(x_test) 
print(classification_report(y_test,y_pred,target_names=['class 0','class 1','class 2','class 3']))

{'bg__base_estimator__criterion': 'entropy', 'bg__base_estimator__max_depth': 3, 'bg__n_estimators': 250}
              precision    recall  f1-score   support

     class 0       0.66      1.00      0.79       131
     class 1       0.00      0.00      0.00         6
     class 2       0.00      0.00      0.00         2
     class 3       0.00      0.00      0.00        61

    accuracy                           0.66       200
   macro avg       0.16      0.25      0.20       200
weighted avg       0.43      0.66      0.52       200



  _warn_prf(average, modifier, msg_start, len(result))


# SVM

In [17]:
# hyper parameter set for SVM
param_range = [10e-6, 10e-4, 0.01, 1 ,10.0]
param_grid = [
    {'clf__C': param_range, 'clf__kernel': ['linear']},
    {'clf__C': param_range, 'clf__kernel': ['rbf']},
    {'clf__C': param_range, 'clf__kernel': ['poly']},
    {'clf__C': param_range, 'clf__kernel': ['sigmid']}]

In [18]:
# SVM Forest train and test
best_model = parallel_processing('clf',SVC,param_grid,5)

In [19]:
make_test(best_model)

              precision    recall  f1-score   support

     class 0       0.61      1.00      0.76       123
     class 1       0.00      0.00      0.00         4
     class 2       0.00      0.00      0.00         3
     class 3       0.00      0.00      0.00        70

    accuracy                           0.61       200
   macro avg       0.15      0.25      0.19       200
weighted avg       0.38      0.61      0.47       200



  _warn_prf(average, modifier, msg_start, len(result))


# Neural Network Classifier

In [20]:
# hyper parameter set for Neural Network
param_learning_rate=['constant', 'adaptive']
param_activation = ['relu', 'identity', 'tanh', 'logistic']
param_solver=['sgd', 'adam']
param_hidden_layer_sizes=[(100,)]
param_grid = [
    {'nnc__learning_rate': param_learning_rate, 'nnc__hidden_layer_sizes':param_hidden_layer_sizes, 
     'nnc__activation':param_activation, 'nnc__solver':param_solver}
]

In [21]:
# Neural Network train and test
best_model = parallel_processing('nnc',MLPClassifier,param_grid, 5)

In [22]:
make_test(best_model)

              precision    recall  f1-score   support

     class 0       0.61      1.00      0.76       123
     class 1       0.00      0.00      0.00         4
     class 2       0.00      0.00      0.00         3
     class 3       0.00      0.00      0.00        70

    accuracy                           0.61       200
   macro avg       0.15      0.25      0.19       200
weighted avg       0.38      0.61      0.47       200



  _warn_prf(average, modifier, msg_start, len(result))
