# Prediction of Traffic Accident Risk
Team 7: 어서오십쇼HUMAN  
Editor: 유성민, 김도운

## 1. 데이터 전처리

In [1]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import warnings

In [2]:
from sklearn.model_selection import train_test_split

# Load the preprocessed table and build a reproducible train/test split.
preprocessing = pd.read_csv('../input/preprocessing.csv')

# Shuffle the rows with a fixed seed so the 1000-row subsample taken below
# (and therefore every score reported later) is the same on each run.
shuffle_rng = np.random.RandomState(1)
preprocessing = preprocessing.iloc[shuffle_rng.permutation(len(preprocessing))]

# Target: the 'TargetOrigin' column. Features: every column from
# 'Location_East' through the last column of the table.
y = preprocessing.loc[:, 'TargetOrigin']
x = preprocessing.loc[:, 'Location_East':]

# Keep only the first 1000 shuffled rows
# (presumably to keep the grid searches below fast - confirm).
x = x[:1000]
y = y[:1000]

# 80/20 split; random_state pins which rows land in the test set.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=1)

print(preprocessing)

        TargetOrigin  Target_0  Target_1  Target_2  Target_3  Location_East  \
168950             3         0         0         0         1              1   
217224             0         1         0         0         0              1   
243619             1         0         1         0         0              0   
217129             0         1         0         0         0              0   
104184             2         0         0         1         0              1   
...              ...       ...       ...       ...       ...            ...   
319717             3         0         0         0         1              1   
211960             0         1         0         0         0              0   
75879              2         0         0         1         0              0   
163007             3         0         0         0         1              0   
218972             0         1         0         0         0              0   

        Location_North  Location_South  Location_We

In [3]:
# Tally the test-set labels per class. Labels are compared against 0, 1 and 2
# in that order; anything else is counted as class 3.
tally = [0, 0, 0, 0]
for label in y_test:
    bucket = next((k for k in (0, 1, 2) if label == k), 3)
    tally[bucket] += 1
count0, count1, count2, count3 = tally
print(count0, count1, count2, count3)

41 52 53 54


## 2. 모델 학습

# Prepare Model training and analysis

> editor: seongminyoo   
> Date of modification: 2020-05-25 04:49pm

`solving problem`
+ Classification Problem

`using model`
+ KNN
+ Decision Tree
+ Bagging
+ Random Forest
+ SVM
+ Neural Network classifier

`algorithm for each step`
1. Make pipeline
    - standardize the data, then fit the model
2. GridSearchCV with 10-fold or 5-fold cross-validation
    - build the hyper-parameter grid
3. Test
    - `best_estimator_` is the model refit with the best hyper-parameter set
4. Analyze results
    - use a confusion matrix

# Make functions

In [4]:
# Build the best model via grid search
def make_model(model, param_grid, cv):
    """Grid-search ``model`` over ``param_grid`` and return the fitted search.

    The search is fitted on the module-level ``x_train`` / ``y_train`` and
    scored by accuracy. The best parameter combination is printed.

    Args:
        model: Estimator class (not an instance), e.g. ``KNeighborsClassifier``.
            It is instantiated here, with ``random_state=1`` when the class
            accepts that argument.
        param_grid: Parameter grid passed straight to ``GridSearchCV``.
        cv: Cross-validation setting passed straight to ``GridSearchCV``.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    try:
        # Models that accept random_state get a fixed seed.
        estimator = model(random_state=1)
    except TypeError:
        # Models whose constructor has no random_state argument.
        print('error detection!')
        estimator = model()

    # Fit outside any try/finally so real failures propagate unchanged.
    gs = GridSearchCV(estimator=estimator, param_grid=param_grid,
                      scoring='accuracy', cv=cv)
    gs.fit(x_train, y_train)
    print(gs.best_params_)
    return gs

# test function
def make_test(model):
    """Print a per-class classification report for a fitted search.

    Args:
        model: Fitted search object exposing ``best_estimator_``
            (as returned by ``make_model``).

    The best estimator predicts on the module-level ``x_test`` and the
    predictions are compared against ``y_test``. Nothing is returned.
    """
    class_names = ['class 0', 'class 1', 'class 2', 'class 3']
    predictions = model.best_estimator_.predict(x_test)
    report = classification_report(y_test, predictions, target_names=class_names)
    print(report)

In [5]:
from concurrent.futures import as_completed, ProcessPoolExecutor
import time
import numpy as np
import winprocess

def parallel_processing(model, param_grid, cv):
    """Run ``make_model(model, param_grid, cv)`` through a process pool.

    Args:
        model: Estimator class forwarded to ``make_model``.
        param_grid: Parameter grid forwarded to ``make_model``.
        cv: Cross-validation setting forwarded to ``make_model``.

    Returns:
        The result of the submitted future, i.e. whatever ``make_model``
        returned in the worker.
    """
    # Only one task is submitted, so a single worker is enough. The ``with``
    # block shuts the pool down on exit instead of leaking it on every call.
    with ProcessPoolExecutor(max_workers=1) as executor:
        # NOTE(review): winprocess.submit presumably wraps executor.submit so
        # notebook-defined functions can be sent to the worker - confirm.
        fs = winprocess.submit(executor, make_model, model, param_grid, cv)
        return fs.result()

# KNN

In [6]:
# hyper parameter sets
# Neighbour counts 1..10, searched once per distance metric.
param_range = list(range(1, 11))
param_grid = [
    {'n_neighbors': param_range, 'metric': [metric]}
    for metric in ('euclidean', 'manhattan')
]

In [7]:
# KNN training: grid search over param_grid with cv=5
best_model = parallel_processing(KNeighborsClassifier,param_grid, 5)

In [8]:
# Print the test-set classification report for the tuned KNN model
make_test(best_model)

              precision    recall  f1-score   support

     class 0       0.21      0.46      0.29        41
     class 1       0.39      0.33      0.35        52
     class 2       0.43      0.30      0.36        53
     class 3       0.30      0.17      0.21        54

    accuracy                           0.30       200
   macro avg       0.33      0.31      0.30       200
weighted avg       0.34      0.30      0.30       200



# Decision Tree

In [9]:
# hyper parameter sets
# Every combination of the parameters below is tried exhaustively by the grid
# search (brute force), which then reports the best-performing combination.
param_range = list(range(3, 7))
param_grid = [
    {'max_depth': param_range, 'criterion': [criterion]}
    for criterion in ('entropy', 'gini')
]

In [10]:
# Decision Tree training: grid search over param_grid with cv=5
# (the test report is printed by the make_test call that follows)
best_model = parallel_processing(tree.DecisionTreeClassifier,param_grid,5)

In [11]:
# Print the test-set classification report for the tuned Decision Tree
make_test(best_model)

              precision    recall  f1-score   support

     class 0       0.21      0.44      0.28        41
     class 1       0.25      0.31      0.28        52
     class 2       0.31      0.28      0.29        53
     class 3       0.00      0.00      0.00        54

    accuracy                           0.24       200
   macro avg       0.19      0.26      0.21       200
weighted avg       0.19      0.24      0.21       200



# Random Forest

In [16]:
# hyper parameter set for Random Forest
# Forest sizes and tree depths, searched once per split criterion.
estimators = [100, 250]
depth_range = list(range(3, 7))
param_grid = [
    {'max_depth': depth_range, 'criterion': [criterion], 'n_estimators': estimators}
    for criterion in ('entropy', 'gini')
]

In [17]:
# Random Forest training: grid search over param_grid with cv=5
# (the test report is printed by the make_test call that follows)
best_model = parallel_processing(RandomForestClassifier,param_grid,5)

In [18]:
# Print the test-set classification report for the tuned Random Forest
make_test(best_model)

              precision    recall  f1-score   support

     class 0       0.24      0.46      0.31        41
     class 1       0.39      0.25      0.31        52
     class 2       0.40      0.60      0.48        53
     class 3       0.83      0.09      0.17        54

    accuracy                           0.34       200
   macro avg       0.46      0.35      0.32       200
weighted avg       0.48      0.34      0.32       200



# Bagging

In [19]:
# hyper parameter set for Bagging
# The 'base_estimator__' prefix routes a setting to the wrapped decision tree;
# 'n_estimators' belongs to the bagging ensemble itself.
# NOTE(review): the prefix depends on the installed scikit-learn version
# (newer releases name the parameter 'estimator') - confirm before upgrading.
estimators = [100, 250]
depth_range = list(range(3, 7))
param_grid = [
    {
        'base_estimator__max_depth': depth_range,
        'base_estimator__criterion': [criterion],
        'n_estimators': estimators,
    }
    for criterion in ('entropy', 'gini')
]

In [20]:
# Bagging train and test
# Bagging wraps an already-constructed tree, so the search is built here
# directly rather than from a bare estimator class.
bagged_trees = BaggingClassifier(tree.DecisionTreeClassifier(random_state=1), random_state=1)
gs = GridSearchCV(estimator=bagged_trees, param_grid=param_grid, scoring='accuracy', cv=5)
gs.fit(x_train, y_train)
print(gs.best_params_)

# Evaluate the best ensemble on the held-out test split.
best_bg = gs.best_estimator_
y_pred = best_bg.predict(x_test)
class_names = ['class 0', 'class 1', 'class 2', 'class 3']
print(classification_report(y_test, y_pred, target_names=class_names))

{'base_estimator__criterion': 'entropy', 'base_estimator__max_depth': 5, 'n_estimators': 250}
              precision    recall  f1-score   support

     class 0       0.17      0.34      0.23        41
     class 1       0.45      0.27      0.34        52
     class 2       0.40      0.49      0.44        53
     class 3       0.12      0.06      0.08        54

    accuracy                           0.28       200
   macro avg       0.29      0.29      0.27       200
weighted avg       0.29      0.28      0.27       200



# SVM

In [21]:
# hyper parameter set for SVM
# Values of C to try (1e-5 == 10e-6 and 1e-3 == 10e-4), searched once per kernel.
param_range = [1e-5, 1e-3, 0.01, 1, 10.0]
param_grid = [
    {'C': param_range, 'kernel': ['linear']},
    {'C': param_range, 'kernel': ['rbf']},
    {'C': param_range, 'kernel': ['poly']},
    # 'sigmoid' is the valid SVC kernel name.
    {'C': param_range, 'kernel': ['sigmoid']}]

In [22]:
# SVM training: grid search over param_grid with cv=5
# (the test report is printed by the make_test call that follows)
best_model = parallel_processing(SVC,param_grid,5)

In [23]:
# Print the test-set classification report for the tuned SVM
make_test(best_model)

              precision    recall  f1-score   support

     class 0       0.24      0.27      0.25        41
     class 1       0.33      0.25      0.28        52
     class 2       0.47      0.53      0.50        53
     class 3       0.35      0.35      0.35        54

    accuracy                           0.36       200
   macro avg       0.35      0.35      0.35       200
weighted avg       0.35      0.35      0.35       200



# Neural Network Classifier

In [24]:
# hyper parameter set for Neural Network
# One grid entry: every combination of learning-rate schedule, activation and
# solver, with a single hidden layer of 100 units.
param_learning_rate = ['constant', 'adaptive']
param_activation = ['relu', 'identity', 'tanh', 'logistic']
param_solver = ['sgd', 'adam']
param_hidden_layer_sizes = [(100,)]
param_grid = [dict(
    learning_rate=param_learning_rate,
    hidden_layer_sizes=param_hidden_layer_sizes,
    activation=param_activation,
    solver=param_solver,
)]

In [None]:
# Neural Network training: grid search over param_grid with cv=5
# (the test report is printed by the make_test call that follows)
best_model = parallel_processing(MLPClassifier,param_grid, 5)

In [None]:
# Print the test-set classification report for the tuned Neural Network
make_test(best_model)