# Prediction of Traffic Accident Risk
Team 7: 어서오십쇼HUMAN  
Editor: 김용현
## 1. Preprocess

In [1]:
# 사전 준비
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import warnings

plt.style.use('seaborn')
sns.set(font_scale=2.5)

warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
# Load the raw accident records
sample = pd.read_csv('../input/sample_accident.csv')

# Report the dimensions of the loaded frame
print("sample.shape:", sample.shape)

# Initial (raw) sample data
sample

sample.shape: (362297, 10)


Unnamed: 0,발생일,발생시간,요일,발생지_시군구,사고내용,노면상태,기상상태,도로형태_대분류,가해자차종,가해자연령
0,20100101,00시,금,마포구,경상,젖음/습기,맑음,단일로,승용차,54세
1,20100101,00시,금,동작구,중상,서리/결빙,맑음,단일로,승합차,57세
2,20100101,00시,금,관악구,경상,서리/결빙,흐림,단일로,승용차,51세
3,20100101,00시,금,강서구,경상,건조,맑음,교차로,승용차,56세
4,20100101,00시,금,구로구,경상,건조,맑음,교차로,승용차,33세
...,...,...,...,...,...,...,...,...,...,...
362292,20181231,23시,월,동작구,경상,건조,맑음,단일로,승용차,34세
362293,20181231,23시,월,강서구,부상신고,건조,맑음,교차로,승용차,56세
362294,20181231,23시,월,강동구,경상,건조,맑음,교차로,승용차,57세
362295,20181231,23시,월,강동구,경상,건조,맑음,단일로,승용차,59세


In [3]:
### 전처리 작업 공간 ###

# One-hot encoding helper
def make_onehot(df, target, bins, labels):
    """Bin ``target['Target']`` with ``pd.cut`` and one-hot encode the bins.

    Parameters
    ----------
    df : unused; kept so existing call sites keep working.
    target : DataFrame with a 'Target' column that ``pd.cut`` can bin.
        It is not modified; the function works on a copy.
    bins, labels : forwarded to ``pd.cut``.

    Returns
    -------
    DataFrame produced by ``pd.get_dummies`` on the binned 'Target' column,
    restricted to the rows that have no NaN in any column after binning.
    """
    # Work on a copy so the caller's frame keeps its original column and rows
    binned = target.copy()

    # Convert the Target column to a categorical of bin labels
    binned['Target'] = pd.cut(binned['Target'], bins=bins, labels=labels)

    # DEBUG: show the rows whose value fell outside every bin (NaN after pd.cut)
    print(binned.loc[binned['Target'].isnull(), :])

    # Drop rows containing NaN in any column
    binned = binned.dropna(how='any')

    # One-hot encode with pandas.get_dummies()
    return pd.get_dummies(binned['Target'])



In [4]:
# Rename the Korean column headers to short English names:
# Date, Time, Day, Location, Target, RoadState, Weather, RoadShape, Type, Age
sample.rename(
    columns={
        "발생일": "Date",
        "발생시간": "Time",
        "요일": "Day",
        "발생지_시군구": "Location",
        "사고내용": "Target",
        "노면상태": "RoadState",
        "기상상태": "Weather",
        "도로형태_대분류": "RoadShape",
        "가해자차종": "Type",
        "가해자연령": "Age",
    },
    inplace=True,
)

sample

Unnamed: 0,Date,Time,Day,Location,Target,RoadState,Weather,RoadShape,Type,Age
0,20100101,00시,금,마포구,경상,젖음/습기,맑음,단일로,승용차,54세
1,20100101,00시,금,동작구,중상,서리/결빙,맑음,단일로,승합차,57세
2,20100101,00시,금,관악구,경상,서리/결빙,흐림,단일로,승용차,51세
3,20100101,00시,금,강서구,경상,건조,맑음,교차로,승용차,56세
4,20100101,00시,금,구로구,경상,건조,맑음,교차로,승용차,33세
...,...,...,...,...,...,...,...,...,...,...
362292,20181231,23시,월,동작구,경상,건조,맑음,단일로,승용차,34세
362293,20181231,23시,월,강서구,부상신고,건조,맑음,교차로,승용차,56세
362294,20181231,23시,월,강동구,경상,건조,맑음,교차로,승용차,57세
362295,20181231,23시,월,강동구,경상,건조,맑음,단일로,승용차,59세


In [5]:
# Feature: Age (가해자연령)

# Drop the rows whose age is unknown ('불명'). The explicit copy makes `sample`
# an independent frame, so the column assignments below are not assignments on
# a filtered slice (which triggers pandas' SettingWithCopyWarning).
sample = sample[sample['Age'] != '불명'].copy()

# Strip the trailing unit character, e.g. '54세' -> '54'
sample['Age'] = sample['Age'].str[:-1]

# dtype: object -> int
sample['Age'] = sample['Age'].astype('int')


In [6]:
# Age-band edges and the label for each band
bins = [10, 20, 30, 40, 50, 60, 70, 80, 90]
labels = ['10s', '20s', '30s', '40s', '50s', '60s', '70s', '80s']

# Assign every age to its decade. right=False makes the intervals left-closed:
# [10, 20), [20, 30), ... so that age 20 is labelled '20s'. With pd.cut's default
# right-closed intervals (10, 20] it would be labelled '10s' and age 10 would
# fall outside every bin.
sample['AgeBand'] = pd.cut(sample['Age'], bins=bins, labels=labels, right=False)

# Drop rows with a missing value in any column; this includes ages outside
# [10, 90), for which pd.cut yields NaN
sample.dropna(how='any', inplace=True)

sample

Unnamed: 0,Date,Time,Day,Location,Target,RoadState,Weather,RoadShape,Type,Age,AgeBand
0,20100101,00시,금,마포구,경상,젖음/습기,맑음,단일로,승용차,54,50s
1,20100101,00시,금,동작구,중상,서리/결빙,맑음,단일로,승합차,57,50s
2,20100101,00시,금,관악구,경상,서리/결빙,흐림,단일로,승용차,51,50s
3,20100101,00시,금,강서구,경상,건조,맑음,교차로,승용차,56,50s
4,20100101,00시,금,구로구,경상,건조,맑음,교차로,승용차,33,30s
...,...,...,...,...,...,...,...,...,...,...,...
362292,20181231,23시,월,동작구,경상,건조,맑음,단일로,승용차,34,30s
362293,20181231,23시,월,강서구,부상신고,건조,맑음,교차로,승용차,56,50s
362294,20181231,23시,월,강동구,경상,건조,맑음,교차로,승용차,57,50s
362295,20181231,23시,월,강동구,경상,건조,맑음,단일로,승용차,59,50s


In [7]:
# Encode the weekday as its position in DayArray: '일' -> 0, '월' -> 1, ..., '토' -> 6
DayArray = ['일', '월', '화', '수', '목', '금', '토']
print(DayArray[0])
for day_code, day_name in enumerate(DayArray):
    sample['Day'] = np.where(sample['Day'] == day_name, day_code, sample['Day'])

일


In [8]:
# Feature: Time
# Drop the trailing unit character (e.g. '00시' -> '00') and convert the hour to int
sample['Time'] = sample['Time'].str[:-1].astype('int')

In [9]:
# Feature: Target

# Replace the Korean labels with English names
target_names = {
    '경상': 'Minor',
    '부상신고': 'Wound',
    '사망': 'Death',
    '중상': 'Heavy',
}
for korean_label, english_label in target_names.items():
    sample['Target'] = np.where(sample['Target'] == korean_label, english_label, sample['Target'])

# Keep the un-encoded label in its own column before Target is expanded
temp = sample['Target']
sample['TargetOrgin'] = temp

# One-hot encoding: Target -> Target_<label> indicator columns
sample = pd.get_dummies(sample, prefix=['Target'], columns=['Target'])



In [10]:
# Look at the first column name that get_dummies generates for Location
temp = pd.get_dummies(sample['Location'])
temp.columns[0]

'강남구'

In [11]:
def preprocessing(feature):
    """Replace each distinct value of ``feature`` with an integer code.

    The code of a value is its position in the column index produced by
    ``pd.get_dummies(feature)``. That index is printed so the mapping can be
    read off the cell output.

    Returns the result of the successive ``np.where`` replacements.
    """
    categories = pd.get_dummies(feature).columns
    print(categories)
    for code, category in enumerate(categories):
        feature = np.where(feature == category, code, feature)
    return feature

In [12]:
# Integer-encode the categorical columns; each call prints its category index
for column in ['Location', 'RoadState', 'Weather', 'RoadShape']:
    sample[column] = preprocessing(sample[column])

Index(['강남구', '강동구', '강북구', '강서구', '관악구', '광진구', '구로구', '금천구', '노원구', '도봉구',
       '동대문구', '동작구', '마포구', '서대문구', '서초구', '성동구', '성북구', '송파구', '양천구', '영등포구',
       '용산구', '은평구', '종로구', '중구', '중랑구'],
      dtype='object')
Index(['건조', '기타', '서리/결빙', '적설', '젖음/습기', '침수', '해빙'], dtype='object')
Index(['기타/불명', '눈', '맑음', '비', '안개', '흐림'], dtype='object')
Index(['교차로', '기타', '기타/불명', '단일로', '불명', '철길건널목'], dtype='object')


In [13]:
# Display the frame after the encoding steps above
sample

Unnamed: 0,Date,Time,Day,Location,RoadState,Weather,RoadShape,Type,Age,AgeBand,TargetOrgin,Target_Death,Target_Heavy,Target_Minor,Target_Wound
0,20100101,0,5,12,4,2,3,승용차,54,50s,Minor,0,0,1,0
1,20100101,0,5,11,2,2,3,승합차,57,50s,Heavy,0,1,0,0
2,20100101,0,5,4,2,5,3,승용차,51,50s,Minor,0,0,1,0
3,20100101,0,5,3,0,2,0,승용차,56,50s,Minor,0,0,1,0
4,20100101,0,5,6,0,2,0,승용차,33,30s,Minor,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
362292,20181231,23,1,11,0,2,3,승용차,34,30s,Minor,0,0,1,0
362293,20181231,23,1,3,0,2,0,승용차,56,50s,Wound,0,0,0,1
362294,20181231,23,1,1,0,2,0,승용차,57,50s,Minor,0,0,1,0
362295,20181231,23,1,1,0,2,3,승용차,59,50s,Minor,0,0,1,0


In [14]:
# dtype of each feature column
sample.dtypes

Date               int64
Time               int32
Day               object
Location          object
RoadState         object
Weather           object
RoadShape         object
Type              object
Age                int32
AgeBand         category
TargetOrgin       object
Target_Death       uint8
Target_Heavy       uint8
Target_Minor       uint8
Target_Wound       uint8
dtype: object

## 2. Save preprocessed data

In [15]:
# Save the preprocessed data (input/preprocessing.csv).
# NOTE(review): 'utf-8-sig' writes a BOM, presumably so spreadsheet tools detect
# the encoding of the Korean text - confirm.
# NOTE(review): index_label=False should omit the header field for the index
# column (see the pandas to_csv docs) - confirm that this is intended.
sample.to_csv('../input/preprocessing.csv', encoding='utf-8-sig',index_label=False)

In [16]:
# Reload the file that was just written.
# NOTE(review): this rebinds the name `preprocessing`, which until here referred
# to the encoding helper function; cells that call preprocessing(...) will fail
# if they are re-run after this cell.
preprocessing = pd.read_csv('../input/preprocessing.csv')

# NOTE(review): the label says "sample" but the shape printed is that of the reloaded frame
print("sample.shape:", preprocessing.shape)

# Reloaded, preprocessed data
preprocessing

sample.shape: (350777, 15)


Unnamed: 0,Date,Time,Day,Location,RoadState,Weather,RoadShape,Type,Age,AgeBand,TargetOrgin,Target_Death,Target_Heavy,Target_Minor,Target_Wound
0,20100101,0,5,12,4,2,3,승용차,54,50s,Minor,0,0,1,0
1,20100101,0,5,11,2,2,3,승합차,57,50s,Heavy,0,1,0,0
2,20100101,0,5,4,2,5,3,승용차,51,50s,Minor,0,0,1,0
3,20100101,0,5,3,0,2,0,승용차,56,50s,Minor,0,0,1,0
4,20100101,0,5,6,0,2,0,승용차,33,30s,Minor,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
362292,20181231,23,1,11,0,2,3,승용차,34,30s,Minor,0,0,1,0
362293,20181231,23,1,3,0,2,0,승용차,56,50s,Wound,0,0,0,1
362294,20181231,23,1,1,0,2,0,승용차,57,50s,Minor,0,0,1,0
362295,20181231,23,1,1,0,2,3,승용차,59,50s,Minor,0,0,1,0
