# Prediction of Traffic Accident Risk
Team 7: 어서오십쇼HUMAN  
Editor: 유성민, 김도운

## 1. 데이터 전처리

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import warnings

In [11]:
# Load the preprocessed records from disk.
preprocessing = pd.read_csv('../input/preprocessing.csv')

# Keep the two contiguous column ranges that hold the features and the target.
data1 = preprocessing.loc[:, "Time":"Type"]
data2 = preprocessing.loc[:, "AgeBand":"TargetOrigin"]
data = pd.concat([data1, data2], axis=1)

# Feature matrix and target vector.
x = data[['Time', 'Day', 'Location', 'RoadState', 'Weather', 'RoadShape', 'Type', 'AgeBand']]
y = data['TargetOrigin']

# Only the first 100000 rows are used; .iloc makes the positional slice explicit.
x = x.iloc[:100000]
y = y.iloc[:100000]

# Seed the split so that results are reproducible across runs, matching the
# random_state=1 used for the models below.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

## 2. 모델 학습

# Prepare Model training and analysis

> editor: seongminyoo   
> Date of modification: 2020-05-25 04:49pm

`solving problem`
+ Classification Problem

`using model`
+ KNN
+ Decision Tree
+ Bagging
+ Random Forest
+ SVM
+ Neural Network classifier

`algorithm for each step`
1. Make pipeline
    - standardize the data, then apply the model
2. GridSearchCV by 10 fold or 5 fold
    - make parameter set
3. Test
    - `best_estimator_` is the model built with the best hyper-parameter set
4. Analysis result
    - use confusion matrix

# Make functions

In [12]:
# Build the best model for one estimator class via grid search.
def make_model(model_name, model, param_grid, cv):
    """Grid-search a StandardScaler + ``model`` pipeline and return the fitted search.

    Args:
        model_name: Pipeline step name for the estimator; the keys of
            ``param_grid`` must be prefixed with ``<model_name>__``.
        model: Estimator class (not an instance). It is instantiated with
            ``random_state=1`` when the constructor accepts that keyword and
            with no arguments otherwise.
        param_grid: Parameter grid handed to ``GridSearchCV``.
        cv: Cross-validation setting handed to ``GridSearchCV``.

    Returns:
        The ``GridSearchCV`` object fitted on the module-level
        ``x_train`` / ``y_train``.

    Side effects:
        Prints the best parameter set, and prints 'error detection!' when the
        estimator does not accept ``random_state``.
    """
    try:
        # Models that support random_state.
        estimator = model(random_state=1)
    except TypeError:
        # Models that do not support random_state reject the keyword argument.
        print('error detection!')
        estimator = model()

    # The search runs outside any try/finally so that errors (and Ctrl-C)
    # propagate to the caller instead of being swallowed by a `return` in
    # `finally`.
    pipe_svc = Pipeline([('scl', StandardScaler()), (model_name, estimator)])
    gs = GridSearchCV(estimator=pipe_svc, param_grid=param_grid, scoring='accuracy', cv=cv)
    gs.fit(x_train, y_train)
    print(gs.best_params_)
    return gs

# test function
def make_test(model):
    """Print a classification report for the best estimator of a fitted search.

    Args:
        model: Fitted search object exposing ``best_estimator_`` (as returned
            by ``make_model``).

    The best estimator predicts on the module-level ``x_test`` and the
    predictions are compared with ``y_test``. Class names are generated as
    'class 0', 'class 1', ... for as many labels as occur in the truth and
    the predictions, so the report works for any number of classes rather
    than only for two.
    """
    best_model = model.best_estimator_
    y_pred = best_model.predict(x_test)
    # Sorted union of the labels present in the ground truth and predictions.
    labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(y_pred)]))
    target_names = ['class {}'.format(i) for i in range(len(labels))]
    print(classification_report(y_test, y_pred, labels=labels, target_names=target_names))

In [13]:
from concurrent.futures import as_completed, ProcessPoolExecutor
import time
import numpy as np
import winprocess

def parallel_processing(model_name, model, param_grid, cv):
    """Run ``make_model`` in a worker process and return its result.

    Args:
        model_name: Pipeline step name forwarded to ``make_model``.
        model: Estimator class forwarded to ``make_model``.
        param_grid: Parameter grid forwarded to ``make_model``.
        cv: Cross-validation setting forwarded to ``make_model``.

    Returns:
        Whatever ``make_model`` returns (the fitted grid search).

    Raises:
        Any exception raised by the submitted job, re-raised by
        ``Future.result()``.
    """
    # Exactly one job is submitted, so a single worker suffices. The context
    # manager shuts the pool down so no worker process is left behind.
    with ProcessPoolExecutor(max_workers=1) as executor:
        fs = winprocess.submit(executor, make_model, model_name, model, param_grid, cv)
        # result() blocks until the job finishes; checking done() right after
        # submit would almost never be true.
        result = fs.result()
    print('work done!')
    return result

# Dummy data for Debugging

In [14]:
 # dummy data for debug
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Small synthetic classification set used to debug the pipeline quickly.
X, y = make_classification(n_samples=1000, n_features=8, n_informative=5,
                           n_redundant=3, n_clusters_per_class=1, random_state=4)

# NOTE: this rebinds the module-level y and x_train/x_test/y_train/y_test, so
# every model cell run after this one trains on the dummy data.
# The split is seeded (like the generator above) to keep debug runs reproducible.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)

# KNN

In [15]:
# KNN hyper-parameter grid: every neighbour count is tried with each metric.
param_range = [10, 20, 30]
# Wider alternative kept for reference: param_range = list(range(1, 11))
param_grid = [
    {'knn__n_neighbors': param_range, 'knn__metric': [metric]}
    for metric in ('euclidean', 'manhattan')
]

In [16]:
# Grid-search the KNN pipeline (cv=5) in a worker process.
best_model = parallel_processing(
    model_name='knn',
    model=KNeighborsClassifier,
    param_grid=param_grid,
    cv=5,
)

In [17]:
# Report how the best KNN pipeline performs on the held-out test split.
make_test(model=best_model)

              precision    recall  f1-score   support

     class 0       0.98      1.00      0.99        61
     class 1       1.00      0.97      0.99        39

    accuracy                           0.99       100
   macro avg       0.99      0.99      0.99       100
weighted avg       0.99      0.99      0.99       100



# Decision Tree

In [None]:
# Decision Tree hyper-parameter grid.
param_range = list(range(5, 21))
# Every combination of the depths above and the criteria below is tried once
# (brute force), and the best-scoring parameter set is the result.
param_grid = [
    {'dt__max_depth': param_range, 'dt__criterion': [criterion]}
    for criterion in ('entropy', 'gini')
]

In [None]:
# Grid-search the Decision Tree pipeline (cv=10) in a worker process.
best_model = parallel_processing(
    model_name='dt',
    model=tree.DecisionTreeClassifier,
    param_grid=param_grid,
    cv=10,
)

In [None]:
# Report how the best Decision Tree pipeline performs on the held-out test split.
make_test(model=best_model)

# Random Forest

In [None]:
# Random Forest hyper-parameter grid: depth x criterion x number of trees.
estimators = [10, 100, 500]
depth_range = list(range(5, 11))
param_grid = [
    {'rf__max_depth': depth_range, 'rf__criterion': [criterion], 'rf__n_estimators': estimators}
    for criterion in ('entropy', 'gini')
]

In [None]:
# Grid-search the Random Forest pipeline (cv=5) in a worker process.
best_model = parallel_processing(
    model_name='rf',
    model=RandomForestClassifier,
    param_grid=param_grid,
    cv=5,
)

In [None]:
# Report how the best Random Forest pipeline performs on the held-out test split.
make_test(model=best_model)

# Bagging

In [None]:
# Bagging hyper-parameter grid: base-tree depth x criterion x ensemble size.
# NOTE(review): newer scikit-learn releases renamed BaggingClassifier's
# `base_estimator` parameter to `estimator`; confirm these keys against the
# installed version.
estimators = [10, 100, 500]
depth_range = list(range(5, 11))
param_grid = [
    {
        'bg__base_estimator__max_depth': depth_range,
        'bg__base_estimator__criterion': [criterion],
        'bg__n_estimators': estimators,
    }
    for criterion in ('entropy', 'gini')
]

In [None]:
# Bagging train and test.
# BaggingClassifier needs a base estimator instance, so the pipeline is built
# here directly instead of through make_model.
pipe_svc = Pipeline([
    ('scl', StandardScaler()),
    ('bg', BaggingClassifier(tree.DecisionTreeClassifier(random_state=1), random_state=1)),
])
gs = GridSearchCV(estimator=pipe_svc, param_grid=param_grid, scoring='accuracy', cv=10, n_jobs=-1)
gs.fit(x_train, y_train)
print(gs.best_params_)
best_bg = gs.best_estimator_
y_pred = best_bg.predict(x_test)
# Derive the class names from the labels actually present rather than
# hard-coding four of them; a fixed list fails whenever the data in scope has
# a different number of classes.
labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(y_pred)]))
target_names = ['class {}'.format(i) for i in range(len(labels))]
print(classification_report(y_test, y_pred, labels=labels, target_names=target_names))

# SVM

In [None]:
# hyper parameter set for SVM: every C value is tried with each kernel.
# NOTE(review): 10e-6 equals 1e-5 and 10e-4 equals 1e-3 - confirm these are
# the intended magnitudes.
param_range = [10e-6, 10e-4, 0.01, 1, 10.0]
param_grid = [
    {'clf__C': param_range, 'clf__kernel': ['linear']},
    {'clf__C': param_range, 'clf__kernel': ['rbf']},
    {'clf__C': param_range, 'clf__kernel': ['poly']},
    # 'sigmoid' was misspelled as 'sigmid', which is not a valid SVC kernel.
    {'clf__C': param_range, 'clf__kernel': ['sigmoid']}]

In [None]:
# Grid-search the SVM pipeline (cv=5) in a worker process.
best_model = parallel_processing(
    model_name='clf',
    model=SVC,
    param_grid=param_grid,
    cv=5,
)

In [None]:
# Report how the best SVM pipeline performs on the held-out test split.
make_test(model=best_model)

# Neural Network Classifier

In [None]:
# Neural Network hyper-parameter grid: one dict, so all combinations of the
# four lists below are searched.
param_learning_rate = ['constant', 'adaptive']
param_activation = ['relu', 'identity', 'tanh', 'logistic']
param_solver = ['sgd', 'adam']
param_hidden_layer_sizes = [(100,)]
param_grid = [
    dict(
        nnc__learning_rate=param_learning_rate,
        nnc__hidden_layer_sizes=param_hidden_layer_sizes,
        nnc__activation=param_activation,
        nnc__solver=param_solver,
    )
]

In [None]:
# Grid-search the Neural Network pipeline (cv=5) in a worker process.
best_model = parallel_processing(
    model_name='nnc',
    model=MLPClassifier,
    param_grid=param_grid,
    cv=5,
)

In [None]:
# Report how the best Neural Network pipeline performs on the held-out test split.
make_test(model=best_model)