# Prediction of Traffic Accident Risk
Team 7: 어서오십쇼HUMAN  
Editor: 유성민, 김도운
## 2. 모델 학습

In [None]:
# Setup: imports, plotting defaults, and warning filters
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import warnings

# The 'seaborn' style sheet was renamed to 'seaborn-v0_8' in matplotlib 3.6 and
# the old name was removed in 3.8; fall back so this cell runs on both.
try:
    plt.style.use('seaborn')
except OSError:
    plt.style.use('seaborn-v0_8')
sns.set(font_scale=2.5)

# Suppress all warnings for the rest of the notebook session.
warnings.filterwarnings('ignore')

%matplotlib inline

In [None]:
# Load the raw accident sample and report its (rows, columns) shape.
sample = pd.read_csv(filepath_or_buffer='../input/sample_accident.csv')

print("sample.shape:", sample.shape)

In [None]:
# Show the loaded sample DataFrame as the cell output.
sample

# Read Preprocessing data

In [None]:
# Load the preprocessed accident data and report its (rows, columns) shape.
preprocessing = pd.read_csv('../input/preprocessing.csv')

# Label the output with the frame that is actually being measured.
print("preprocessing.shape:", preprocessing.shape)

# Initial data, shown as the cell output.
preprocessing

# Make train data

In [None]:
# Original note: exclude date and age; change the target to 0, 1, 2, 3.
# Keeps the label-sliced columns "Time" through "Type" (both ends inclusive)
# and appends the "AgeBand" column.
data1 = preprocessing.loc[:, "Time":"Type"]
data2 = preprocessing["AgeBand"]
data = pd.concat([data1, data2], axis="columns")
data

In [None]:
###### TODO: convert the features and target into arrays (X, y) before the train/test split

# Prepare Model training and analysis

> editor: seongminyoo   
> Date of modification: 2020-05-25 04:49pm

`solving problem`
+ Classification Problem

`using model`
+ KNN
+ Decision Tree
+ Bagging
+ Random Forest
+ SVM
+ Neural Network classifier

`algorithm for each step`
1. Make pipeline
    - standard data, model
2. GridSearchCV by 10 fold or 5 fold
    - make parameter set
3. Test
    - `best_estimator_` is the model built with the best hyper-parameter set
4. Analysis result
    - use confusion matrix

In [None]:
# Data split
# TODO: the target still needs to be re-encoded as 0, 1, 2, 3
# TODO: the target (y) still needs to be converted to an array
from sklearn.model_selection import train_test_split

# train_test_split returns the pieces in the order
# (X_train, X_test, y_train, y_test); unpack in that order so the features
# and labels do not get swapped. 10% of the rows are held out for testing.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 1)

# import using package

In [112]:
# package
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

# Make functions

In [113]:
# Build and tune the best model for one estimator class
def make_model(model_name, model, param_grid, cv):
    """Grid-search a StandardScaler + estimator pipeline and return the search.

    Parameters
    ----------
    model_name : str
        Name of the estimator step in the pipeline; keys in ``param_grid``
        must use it as prefix (e.g. ``'knn__n_neighbors'``).
    model : callable
        Estimator class (not an instance). It is instantiated with
        ``random_state=1`` when the constructor accepts that argument.
    param_grid : dict or list of dict
        Hyper-parameter grid handed to ``GridSearchCV``.
    cv : int or cross-validation splitter
        Passed through to ``GridSearchCV`` as ``cv``.

    Returns
    -------
    GridSearchCV
        The search object after fitting on the module-level ``x_train`` /
        ``y_train``. The best parameter set is also printed.
    """
    try:  # estimators that accept random_state
        estimator = model(random_state=1)
    except TypeError:  # estimators without a random_state argument (e.g. KNN)
        print('error detection!')
        estimator = model()

    # The search is deliberately outside try/finally: a ``return`` inside
    # ``finally`` would silently discard any exception still in flight.
    pipe_svc = Pipeline([('scl', StandardScaler()), (model_name, estimator)])
    gs = GridSearchCV(estimator=pipe_svc, param_grid=param_grid, scoring='accuracy', cv=cv, n_jobs=-1)
    gs.fit(x_train, y_train)
    best_params = gs.best_params_
    print(best_params)
    return gs

# test function
def make_test(model):
    """Print a classification report for a fitted search on the test split.

    Parameters
    ----------
    model : fitted search object
        Must expose ``best_estimator_`` (e.g. the ``GridSearchCV`` returned by
        ``make_model``).

    Predicts on the module-level ``x_test`` and compares against ``y_test``.
    Nothing is returned; the report is printed.
    """
    best_model = model.best_estimator_
    y_pred = best_model.predict(x_test)
    # Name the rows after the classes the estimator was trained on instead of
    # hard-coding four names: a fixed list fails when the data has a different
    # number of classes. For labels 0..3 this still yields 'class 0'..'class 3'.
    labels = list(best_model.classes_)
    target_names = ['class {}'.format(label) for label in labels]
    print(classification_report(y_test, y_pred, labels=labels, target_names=target_names))

# Dummy data for Debugging

In [114]:
# # dummy data for debug
# from sklearn.datasets import make_classification
# from sklearn.model_selection import train_test_split

# X, y = make_classification(n_features=8, n_informative=5,
#                            n_redundant=3, n_clusters_per_class=1, random_state=4, n_samples =1000 )

# x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.1)

# KNN

In [115]:
# Hyper-parameter grid for KNN: every neighbour count under each metric.
# Original note: these neighbour counts exceed the dummy data size, so this
# grid could not be tested on the dummy data.
param_range = [1000, 2000, 3000, 4000, 5000]
# param_range = [i for i in range(1,11)]
param_grid = [
    {'knn__n_neighbors': param_range, 'knn__metric': [metric]}
    for metric in ('euclidean', 'manhattan')
]

In [None]:
# Tune KNN with 10-fold grid search, then print its test-set report.
make_test(
    make_model(
        model_name='knn',
        model=KNeighborsClassifier,
        param_grid=param_grid,
        cv=10,
    )
)

error detection!


# Decision Tree

In [44]:
# Hyper-parameter grid for the decision tree: depths 5..20 under each criterion.
param_range = list(range(5, 21))
# The grid search tries every combination of the values below by brute force
# and reports the best-scoring parameter set.
param_grid = [
    {'dt__max_depth': param_range, 'dt__criterion': [criterion]}
    for criterion in ('entropy', 'gini')
]

In [45]:
# Tune the decision tree with 10-fold grid search, then print its test-set report.
make_test(
    make_model(
        model_name='dt',
        model=tree.DecisionTreeClassifier,
        param_grid=param_grid,
        cv=10,
    )
)

{'dt__criterion': 'gini', 'dt__max_depth': 6}
              precision    recall  f1-score   support

     class 0       0.96      0.98      0.97        54
     class 1       0.98      0.96      0.97        46

    accuracy                           0.97       100
   macro avg       0.97      0.97      0.97       100
weighted avg       0.97      0.97      0.97       100



# Random Forest

In [15]:
# Hyper-parameter grid for Random Forest: tree count x depth 5..10 x criterion.
estimators = [10, 100, 500]
depth_range = list(range(5, 11))
param_grid = [
    {'rf__max_depth': depth_range, 'rf__criterion': [criterion], 'rf__n_estimators': estimators}
    for criterion in ('entropy', 'gini')
]

In [16]:
# Tune Random Forest with 10-fold grid search, then print its test-set report.
make_test(
    make_model(
        model_name='rf',
        model=RandomForestClassifier,
        param_grid=param_grid,
        cv=10,
    )
)

{'rf__criterion': 'entropy', 'rf__max_depth': 9, 'rf__n_estimators': 500}
              precision    recall  f1-score   support

     class 0       0.98      0.96      0.97        53
     class 1       0.96      0.98      0.97        47

    accuracy                           0.97       100
   macro avg       0.97      0.97      0.97       100
weighted avg       0.97      0.97      0.97       100



# Bagging

In [52]:
# Hyper-parameter grid for Bagging: ensemble size x base-tree depth 5..10 x criterion.
# NOTE(review): newer scikit-learn releases renamed BaggingClassifier's
# `base_estimator` to `estimator` — confirm these keys against the installed version.
estimators = [10, 100, 500]
depth_range = list(range(5, 11))
param_grid = [
    {
        'bg__base_estimator__max_depth': depth_range,
        'bg__base_estimator__criterion': [criterion],
        'bg__n_estimators': estimators,
    }
    for criterion in ('entropy', 'gini')
]

In [56]:
# Bagging train and test
# The pipeline is built by hand here so the base decision tree can be given
# its own random_state in addition to the bagging ensemble's.
pipe_svc = Pipeline([('scl', StandardScaler()), ('bg',BaggingClassifier(tree.DecisionTreeClassifier(random_state = 1),random_state=1))])
gs = GridSearchCV(estimator=pipe_svc, param_grid=param_grid, scoring='accuracy', cv=10, n_jobs=-1)
gs.fit(x_train,y_train)
print(gs.best_params_)
best_bg=gs.best_estimator_
y_pred = best_bg.predict(x_test)
# Name the report rows after the classes the model was trained on rather than
# hard-coding two names, which fails as soon as the target has four classes.
print(classification_report(
    y_test,
    y_pred,
    labels=list(best_bg.classes_),
    target_names=['class {}'.format(label) for label in best_bg.classes_],
))

{'bg__base_estimator__criterion': 'entropy', 'bg__base_estimator__max_depth': 6, 'bg__n_estimators': 10}
              precision    recall  f1-score   support

     class 0       0.98      1.00      0.99        54
     class 1       1.00      0.98      0.99        46

    accuracy                           0.99       100
   macro avg       0.99      0.99      0.99       100
weighted avg       0.99      0.99      0.99       100



# SVM

In [57]:
# Hyper-parameter grid for SVM: each C value under each kernel.
# Note: 10e-6 is 1e-5 and 10e-4 is 1e-3.
param_range = [10e-6, 10e-4, 0.01, 1 ,10.0]
param_grid = [
    {'clf__C': param_range, 'clf__kernel': ['linear']},
    {'clf__C': param_range, 'clf__kernel': ['rbf']},
    {'clf__C': param_range, 'clf__kernel': ['poly']},
    # 'sigmoid' is the valid SVC kernel name; the previous 'sigmid' typo made
    # every fit of this sub-grid fail.
    {'clf__C': param_range, 'clf__kernel': ['sigmoid']}]

In [58]:
# Tune the SVM with 10-fold grid search, then print its test-set report.
make_test(
    make_model(
        model_name='clf',
        model=SVC,
        param_grid=param_grid,
        cv=10,
    )
)

{'clf__C': 10.0, 'clf__kernel': 'rbf'}
              precision    recall  f1-score   support

     class 0       0.98      1.00      0.99        54
     class 1       1.00      0.98      0.99        46

    accuracy                           0.99       100
   macro avg       0.99      0.99      0.99       100
weighted avg       0.99      0.99      0.99       100



# Neural Network Classifier

In [None]:
# Hyper-parameter grid for the neural network (MLPClassifier).
# An empty grid gives GridSearchCV no candidates to fit, so provide a small
# starting grid; keys use the 'nnc__' prefix of the pipeline step name.
# NOTE(review): these values are a modest starting point — tune as needed.
param_grid = [
    {
        'nnc__hidden_layer_sizes': [(50,), (100,), (50, 50)],
        'nnc__activation': ['relu', 'tanh'],
        'nnc__alpha': [1e-4, 1e-3, 1e-2],
    },
]

In [None]:
# Neural Network train and test
# make_model takes (model_name, model, param_grid, cv); the grid argument was
# missing, which made the call fail before any training started.
make_test(make_model('nnc',MLPClassifier,param_grid,10))