# Prediction of Traffic Accident Risk
Team 7: 어서오십쇼HUMAN  
Editor: 김용현
## 1. Preprocessing

In [1]:
# 사전 준비
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import warnings

plt.style.use('seaborn')
sns.set(font_scale=2.5)

warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
# Load the raw accident records and report their dimensions.
sample = pd.read_csv('../input/sample_accident.csv')
print(f"sample.shape: {sample.shape}")

# Display the untouched sample data.
sample

sample.shape: (362297, 10)


Unnamed: 0,발생일,발생시간,요일,발생지_시군구,사고내용,노면상태,기상상태,도로형태_대분류,가해자차종,가해자연령
0,20100101,00시,금,마포구,경상,젖음/습기,맑음,단일로,승용차,54세
1,20100101,00시,금,동작구,중상,서리/결빙,맑음,단일로,승합차,57세
2,20100101,00시,금,관악구,경상,서리/결빙,흐림,단일로,승용차,51세
3,20100101,00시,금,강서구,경상,건조,맑음,교차로,승용차,56세
4,20100101,00시,금,구로구,경상,건조,맑음,교차로,승용차,33세
...,...,...,...,...,...,...,...,...,...,...
362292,20181231,23시,월,동작구,경상,건조,맑음,단일로,승용차,34세
362293,20181231,23시,월,강서구,부상신고,건조,맑음,교차로,승용차,56세
362294,20181231,23시,월,강동구,경상,건조,맑음,교차로,승용차,57세
362295,20181231,23시,월,강동구,경상,건조,맑음,단일로,승용차,59세


In [3]:
### 전처리 작업 공간 ###

# one-hot encoding 함수
def make_onehot(df, target, bins, labels):
    """Bin ``target['Target']`` and return its one-hot encoding.

    The ``'Target'`` column is bucketed with ``pandas.cut`` using ``bins`` and
    ``labels``. Rows that contain a missing value in any column (which
    includes values that fall outside ``bins``) are dropped, and the binned
    column is then expanded with ``pandas.get_dummies``.

    The caller's ``target`` frame is not modified: all work happens on a copy.

    Args:
        df: Unused. Kept so that existing call sites keep working.
        target: DataFrame with a ``'Target'`` column to be binned.
        bins: Bin edges forwarded to ``pandas.cut``.
        labels: Bin labels forwarded to ``pandas.cut``.

    Returns:
        DataFrame with one indicator column per label, containing only the
        rows that survived the NaN drop (original index preserved).
    """
    # Work on a copy so the caller's frame is neither re-typed nor shortened.
    binned = target.copy()
    binned['Target'] = pd.cut(binned['Target'], bins=bins, labels=labels)

    # DEBUG: show the rows whose binned value is missing.
    print(binned.loc[binned['Target'].isnull(), :])

    # Same drop rule as before: a NaN in any column removes the row.
    binned = binned.dropna(how='any')

    # One-hot encode the categorical bins.
    return pd.get_dummies(binned['Target'])



In [4]:
# Feature: 가해자연령 (offender age)

# Drop the rows whose age is unknown ('불명'). The explicit copy makes the
# column assignment below write to an independent frame instead of a
# filtered slice of the original.
sample = sample[sample['가해자연령'] != '불명'].copy()

# Strip the trailing unit character (e.g. '54세' -> '54'), then convert the
# column from object to int.
sample['가해자연령'] = sample['가해자연령'].str[:-1].astype('int')


In [5]:
# Bin edges and labels: one bin per decade.
bins = [10, 20, 30, 40, 50, 60, 70, 80, 90]
labels = ['10s', '20s', '30s', '40s', '50s', '60s', '70s', '80s']

# Age-group classification. right=False makes every bin [low, high), so an
# age of 20 is labelled '20s'. pd.cut's default (low, high] would label 20 as
# '10s' and turn age 10 into NaN. Ages outside [10, 90) become NaN.
sample = sample.assign(
    Age=pd.cut(sample['가해자연령'], bins=bins, labels=labels, right=False)
)

# Drop rows with a NaN in any column (this includes ages outside the bins).
sample = sample.dropna(how='any')

sample

Unnamed: 0,발생일,발생시간,요일,발생지_시군구,사고내용,노면상태,기상상태,도로형태_대분류,가해자차종,가해자연령,Age
0,20100101,00시,금,마포구,경상,젖음/습기,맑음,단일로,승용차,54,50s
1,20100101,00시,금,동작구,중상,서리/결빙,맑음,단일로,승합차,57,50s
2,20100101,00시,금,관악구,경상,서리/결빙,흐림,단일로,승용차,51,50s
3,20100101,00시,금,강서구,경상,건조,맑음,교차로,승용차,56,50s
4,20100101,00시,금,구로구,경상,건조,맑음,교차로,승용차,33,30s
...,...,...,...,...,...,...,...,...,...,...,...
362292,20181231,23시,월,동작구,경상,건조,맑음,단일로,승용차,34,30s
362293,20181231,23시,월,강서구,부상신고,건조,맑음,교차로,승용차,56,50s
362294,20181231,23시,월,강동구,경상,건조,맑음,교차로,승용차,57,50s
362295,20181231,23시,월,강동구,경상,건조,맑음,단일로,승용차,59,50s


In [6]:
# Data type of each feature (column)
sample.dtypes

발생일            int64
발생시간          object
요일            object
발생지_시군구       object
사고내용          object
노면상태          object
기상상태          object
도로형태_대분류      object
가해자차종         object
가해자연령          int64
Age         category
dtype: object

## 2. Save preprocessed data

In [7]:
# Save the preprocessed data (input/preprocessing.csv).
# encoding='utf-8-sig' writes UTF-8 with a leading BOM.
# index_label=False omits the header field for the index column (see the
# pandas to_csv docs); the index values themselves are still written.
sample.to_csv('../input/preprocessing.csv', encoding='utf-8-sig',index_label=False)

In [8]:
# Reload the file that was just written to verify the round trip.
preprocessing = pd.read_csv('../input/preprocessing.csv')

# The label names the frame whose shape is actually printed.
print("preprocessing.shape:", preprocessing.shape)

# Preprocessed data as read back from disk.
preprocessing

sample.shape: (350777, 11)


Unnamed: 0,발생일,발생시간,요일,발생지_시군구,사고내용,노면상태,기상상태,도로형태_대분류,가해자차종,가해자연령,Age
0,20100101,00시,금,마포구,경상,젖음/습기,맑음,단일로,승용차,54,50s
1,20100101,00시,금,동작구,중상,서리/결빙,맑음,단일로,승합차,57,50s
2,20100101,00시,금,관악구,경상,서리/결빙,흐림,단일로,승용차,51,50s
3,20100101,00시,금,강서구,경상,건조,맑음,교차로,승용차,56,50s
4,20100101,00시,금,구로구,경상,건조,맑음,교차로,승용차,33,30s
...,...,...,...,...,...,...,...,...,...,...,...
362292,20181231,23시,월,동작구,경상,건조,맑음,단일로,승용차,34,30s
362293,20181231,23시,월,강서구,부상신고,건조,맑음,교차로,승용차,56,50s
362294,20181231,23시,월,강동구,경상,건조,맑음,교차로,승용차,57,50s
362295,20181231,23시,월,강동구,경상,건조,맑음,단일로,승용차,59,50s
