# Prediction of Traffic Accident Risk
Team 7: 어서오십쇼HUMAN  
Editor: 김용현
## 1. Preprocess

In [106]:
# 사전 준비
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import warnings

plt.style.use('seaborn')
sns.set(font_scale=2.5)

warnings.filterwarnings('ignore')

%matplotlib inline

In [107]:
import matplotlib
from matplotlib import font_manager, rc
import platform

if platform.system() == 'Windows':
# 윈도우인 경우
    font_name = font_manager.FontProperties(fname="c:/Windows/Fonts/malgun.ttf").get_name()
    rc('font', family=font_name)
else:    
# Mac 인 경우
    rc('font', family='AppleGothic')
    
matplotlib.rcParams['axes.unicode_minus'] = False   


In [108]:
sample = pd.read_csv('../input/sample_accident.csv')

print("sample.shape:", sample.shape)

# 초기 샘플 데이터
sample

sample.shape: (362297, 10)


Unnamed: 0,발생일,발생시간,요일,발생지_시군구,사고내용,노면상태,기상상태,도로형태_대분류,가해자차종,가해자연령
0,20100101,00시,금,마포구,경상,젖음/습기,맑음,단일로,승용차,54세
1,20100101,00시,금,동작구,중상,서리/결빙,맑음,단일로,승합차,57세
2,20100101,00시,금,관악구,경상,서리/결빙,흐림,단일로,승용차,51세
3,20100101,00시,금,강서구,경상,건조,맑음,교차로,승용차,56세
4,20100101,00시,금,구로구,경상,건조,맑음,교차로,승용차,33세
...,...,...,...,...,...,...,...,...,...,...
362292,20181231,23시,월,동작구,경상,건조,맑음,단일로,승용차,34세
362293,20181231,23시,월,강서구,부상신고,건조,맑음,교차로,승용차,56세
362294,20181231,23시,월,강동구,경상,건조,맑음,교차로,승용차,57세
362295,20181231,23시,월,강동구,경상,건조,맑음,단일로,승용차,59세


In [109]:
#Date,Time,Day,Location,Target,RoadState,Weather,RoadShape,Type,Age

sample.rename(columns={"발생일":"Date", "발생시간":"Time", "요일":"Day", "발생지_시군구":"Location", "사고내용":"Target", "노면상태":"RoadState", "기상상태":"Weather", "도로형태_대분류":"RoadShape","가해자차종":"Type", "가해자연령":"Age"}, inplace=True)

sample

Unnamed: 0,Date,Time,Day,Location,Target,RoadState,Weather,RoadShape,Type,Age
0,20100101,00시,금,마포구,경상,젖음/습기,맑음,단일로,승용차,54세
1,20100101,00시,금,동작구,중상,서리/결빙,맑음,단일로,승합차,57세
2,20100101,00시,금,관악구,경상,서리/결빙,흐림,단일로,승용차,51세
3,20100101,00시,금,강서구,경상,건조,맑음,교차로,승용차,56세
4,20100101,00시,금,구로구,경상,건조,맑음,교차로,승용차,33세
...,...,...,...,...,...,...,...,...,...,...
362292,20181231,23시,월,동작구,경상,건조,맑음,단일로,승용차,34세
362293,20181231,23시,월,강서구,부상신고,건조,맑음,교차로,승용차,56세
362294,20181231,23시,월,강동구,경상,건조,맑음,교차로,승용차,57세
362295,20181231,23시,월,강동구,경상,건조,맑음,단일로,승용차,59세


In [110]:
# Feature: 가해자연령

# 나이 불명인 열 삭제
sample = sample[sample['Age'] != '불명']

# 문자열 제거: ex 54'세' 제거
sample['Age'] = sample['Age'].str[:-1]

# dtype: object -> int
sample['Age'] = sample['Age'].astype('int')


In [111]:
# bins, labels
bins = [10, 20, 30, 40, 50, 60, 70, 80, 90]
labels = ['10', '20', '30', '40', '50', '60', '70', '80']

# 나이대 분류
sample['AgeBand'] = pd.cut(sample['Age'], bins=bins, labels=labels)

# Drop NaN values
sample.dropna(how='any', inplace=True)

sample

Unnamed: 0,Date,Time,Day,Location,Target,RoadState,Weather,RoadShape,Type,Age,AgeBand
0,20100101,00시,금,마포구,경상,젖음/습기,맑음,단일로,승용차,54,50
1,20100101,00시,금,동작구,중상,서리/결빙,맑음,단일로,승합차,57,50
2,20100101,00시,금,관악구,경상,서리/결빙,흐림,단일로,승용차,51,50
3,20100101,00시,금,강서구,경상,건조,맑음,교차로,승용차,56,50
4,20100101,00시,금,구로구,경상,건조,맑음,교차로,승용차,33,30
...,...,...,...,...,...,...,...,...,...,...,...
362292,20181231,23시,월,동작구,경상,건조,맑음,단일로,승용차,34,30
362293,20181231,23시,월,강서구,부상신고,건조,맑음,교차로,승용차,56,50
362294,20181231,23시,월,강동구,경상,건조,맑음,교차로,승용차,57,50
362295,20181231,23시,월,강동구,경상,건조,맑음,단일로,승용차,59,50


In [112]:
sample

Unnamed: 0,Date,Time,Day,Location,Target,RoadState,Weather,RoadShape,Type,Age,AgeBand
0,20100101,00시,금,마포구,경상,젖음/습기,맑음,단일로,승용차,54,50
1,20100101,00시,금,동작구,중상,서리/결빙,맑음,단일로,승합차,57,50
2,20100101,00시,금,관악구,경상,서리/결빙,흐림,단일로,승용차,51,50
3,20100101,00시,금,강서구,경상,건조,맑음,교차로,승용차,56,50
4,20100101,00시,금,구로구,경상,건조,맑음,교차로,승용차,33,30
...,...,...,...,...,...,...,...,...,...,...,...
362292,20181231,23시,월,동작구,경상,건조,맑음,단일로,승용차,34,30
362293,20181231,23시,월,강서구,부상신고,건조,맑음,교차로,승용차,56,50
362294,20181231,23시,월,강동구,경상,건조,맑음,교차로,승용차,57,50
362295,20181231,23시,월,강동구,경상,건조,맑음,단일로,승용차,59,50


In [113]:
sample.dtypes

Date            int64
Time           object
Day            object
Location       object
Target         object
RoadState      object
Weather        object
RoadShape      object
Type           object
Age             int32
AgeBand      category
dtype: object

In [114]:
# Feature: Time

sample["Time"] = sample['Time'].str[:-1]

sample['Time'] = sample['Time'].astype('int')

bins = [0,6,12,18,24]
labels=['dawn','day','afternoon','night']

sample['Time'] = pd.cut(sample['Time'], bins=bins, labels=labels)

# Drop NaN values
sample.dropna(how='any', inplace=True)
sample

Unnamed: 0,Date,Time,Day,Location,Target,RoadState,Weather,RoadShape,Type,Age,AgeBand
9,20100101,dawn,금,종로구,경상,건조,맑음,단일로,승용차,34,30
10,20100101,dawn,금,서대문구,중상,건조,맑음,교차로,승용차,53,50
11,20100101,dawn,금,강북구,경상,젖음/습기,흐림,단일로,승용차,27,20
12,20100101,dawn,금,동대문구,경상,서리/결빙,흐림,단일로,승용차,50,40
13,20100101,dawn,금,마포구,경상,건조,맑음,단일로,승용차,50,40
...,...,...,...,...,...,...,...,...,...,...,...
362292,20181231,night,월,동작구,경상,건조,맑음,단일로,승용차,34,30
362293,20181231,night,월,강서구,부상신고,건조,맑음,교차로,승용차,56,50
362294,20181231,night,월,강동구,경상,건조,맑음,교차로,승용차,57,50
362295,20181231,night,월,강동구,경상,건조,맑음,단일로,승용차,59,50


In [115]:
# Feature: Location


North = ['은평구', '마포구', '서대문구', '종로구', '중구', '용산구']
South = ['서초구', '강남구', '송파구', '강동구']
East = ['성동구', '성북구', '광진구', '중랑구', '강북구', '도봉구', '동대문구', '노원구']
West = ['강서구', '양천구', '구로구', '영등포구', '동작구', '관악구', '금천구']


for label in North:
    sample['Location'] = np.where(sample['Location'] == label, 'North', sample['Location'])
    
for label in South:
    sample['Location'] = np.where(sample['Location'] == label, 'South', sample['Location'])    

for label in East:
    sample['Location'] = np.where(sample['Location'] == label, 'East', sample['Location'])
    
for label in West:
    sample['Location'] = np.where(sample['Location'] == label, 'West', sample['Location'])

    

locationData = pd.crosstab(sample.Location, sample.Target, margins=True)
locationData

Target,경상,부상신고,사망,중상,All
Location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
East,55700,6687,924,31466,94777
North,39388,5013,646,22316,67363
South,51063,5234,654,25518,82469
West,55811,5751,943,28337,90842
All,201962,22685,3167,107637,335451


In [116]:
# Feature: Target
TargetNameArr = ['Minor','Wound','Death','Heavy']

# 데이터명 수정
sample['Target'] = np.where(sample['Target'] == '경상', '0' ,sample['Target'])
sample['Target'] = np.where(sample['Target'] == '부상신고', '1',sample['Target'])
sample['Target'] = np.where(sample['Target'] == '사망', '2',sample['Target'])
sample['Target'] = np.where(sample['Target'] == '중상', '3',sample['Target'])

sample['Target'] = sample['Target'].astype('int')
temp = sample['Target']

#
sample['TargetOrigin'] = temp

# One-hot encoding
sample = pd.get_dummies(sample, columns=['Target'], prefix=['Target'])



In [117]:

LocationNameArr = ['East', 'North', 'South', 'West']
sample = pd.get_dummies(sample,columns=['Location'],prefix=['Location'])
sample

Unnamed: 0,Date,Time,Day,RoadState,Weather,RoadShape,Type,Age,AgeBand,TargetOrigin,Target_0,Target_1,Target_2,Target_3,Location_East,Location_North,Location_South,Location_West
9,20100101,dawn,금,건조,맑음,단일로,승용차,34,30,0,1,0,0,0,0,1,0,0
10,20100101,dawn,금,건조,맑음,교차로,승용차,53,50,3,0,0,0,1,0,1,0,0
11,20100101,dawn,금,젖음/습기,흐림,단일로,승용차,27,20,0,1,0,0,0,1,0,0,0
12,20100101,dawn,금,서리/결빙,흐림,단일로,승용차,50,40,0,1,0,0,0,1,0,0,0
13,20100101,dawn,금,건조,맑음,단일로,승용차,50,40,0,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
362292,20181231,night,월,건조,맑음,단일로,승용차,34,30,0,1,0,0,0,0,0,0,1
362293,20181231,night,월,건조,맑음,교차로,승용차,56,50,1,0,1,0,0,0,0,0,1
362294,20181231,night,월,건조,맑음,교차로,승용차,57,50,0,1,0,0,0,0,0,1,0
362295,20181231,night,월,건조,맑음,단일로,승용차,59,50,0,1,0,0,0,0,0,1,0


In [118]:
RoadStateNameArr = ['건조', '기타', '서리/결빙', '적설', '젖음/습기', '침수', '해빙']

sample = pd.get_dummies(sample,columns=['RoadState'],prefix=['RoadState'])
sample = pd.get_dummies(sample,columns=['Weather'],prefix=['Weather'])

sample = pd.get_dummies(sample,columns=['RoadShape'],prefix=['RoadShape'])

sample = pd.get_dummies(sample,columns=['Type'],prefix=['Type'])
sample = pd.get_dummies(sample,columns=['Day'],prefix=['Type'])
sample = pd.get_dummies(sample,columns=['Time'],prefix=['Time'])
sample = pd.get_dummies(sample,columns=['AgeBand'],prefix=['AgeBand'])
sample = sample.drop(['Age','Date'],axis="columns")
sample


Unnamed: 0,TargetOrigin,Target_0,Target_1,Target_2,Target_3,Location_East,Location_North,Location_South,Location_West,RoadState_건조,...,Time_afternoon,Time_night,AgeBand_10,AgeBand_20,AgeBand_30,AgeBand_40,AgeBand_50,AgeBand_60,AgeBand_70,AgeBand_80
9,0,1,0,0,0,0,1,0,0,1,...,0,0,0,0,1,0,0,0,0,0
10,3,0,0,0,1,0,1,0,0,1,...,0,0,0,0,0,0,1,0,0,0
11,0,1,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
12,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
13,0,1,0,0,0,0,1,0,0,1,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
362292,0,1,0,0,0,0,0,0,1,1,...,0,1,0,0,1,0,0,0,0,0
362293,1,0,1,0,0,0,0,0,1,1,...,0,1,0,0,0,0,1,0,0,0
362294,0,1,0,0,0,0,0,1,0,1,...,0,1,0,0,0,0,1,0,0,0
362295,0,1,0,0,0,0,0,1,0,1,...,0,1,0,0,0,0,1,0,0,0


In [119]:
sample = sample.sort_values(by=['TargetOrigin'], axis=0)
print(np.where(sample['TargetOrigin'] == 3))
data0 = sample[:3000]
data1 =sample[202000:205000]
data2 = sample[224800:227800]
data3=sample[228000:231000]
data = pd.concat([data0,data1,data2,data3])
data

(array([227814, 227815, 227816, ..., 335448, 335449, 335450], dtype=int64),)


Unnamed: 0,TargetOrigin,Target_0,Target_1,Target_2,Target_3,Location_East,Location_North,Location_South,Location_West,RoadState_건조,...,Time_afternoon,Time_night,AgeBand_10,AgeBand_20,AgeBand_30,AgeBand_40,AgeBand_50,AgeBand_60,AgeBand_70,AgeBand_80
9,0,1,0,0,0,0,1,0,0,1,...,0,0,0,0,1,0,0,0,0,0
216728,0,1,0,0,0,1,0,0,0,1,...,0,1,0,0,0,0,0,1,0,0
216730,0,1,0,0,0,0,0,1,0,1,...,0,1,1,0,0,0,0,0,0,0
216732,0,1,0,0,0,1,0,0,0,1,...,0,1,0,0,0,0,1,0,0,0
216733,0,1,0,0,0,0,0,0,1,1,...,0,1,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319861,3,0,0,0,1,0,0,1,0,1,...,1,0,0,0,0,0,0,1,0,0
23198,3,0,0,0,1,0,0,0,1,1,...,0,1,0,0,0,1,0,0,0,0
319862,3,0,0,0,1,0,1,0,0,1,...,0,1,0,0,0,1,0,0,0,0
28230,3,0,0,0,1,0,0,0,1,1,...,1,0,0,0,0,0,1,0,0,0


In [14]:
# 각 타입별 타겟 데이터
typeData = pd.crosstab(sample.Type, sample.TargetOrigin, margins=True)
weatherData = pd.crosstab(sample.Weather, sample.TargetOrigin, margins=True)
roadShapeDate = pd.crosstab(sample.RoadShape, sample.TargetOrigin, margins=True)
roadStateData = pd.crosstab(sample.RoadState, sample.TargetOrigin, margins=True)

AttributeError: 'DataFrame' object has no attribute 'Type'

In [None]:
# Feature: Type.all 500개 미만 type 제거
for data, index in zip(typeData.All, typeData.index):
    if data < 500:
        sample = sample[sample['Type'] != index]
        
# Feature: Weather, RoadShape, RoadState 결측값(불명) 제거
sample = sample[sample['Weather'] != '불명']

sample = sample[sample['RoadShape'] != '불명']



In [None]:
sample

In [None]:
sample.dtypes

In [None]:
sample[['AgeBand', 'TargetOrigin','Target_0','Target_1','Target_2','Target_3']].groupby(['AgeBand'], as_index=True).count()



In [None]:
pd.crosstab(sample['AgeBand'], sample['TargetOrigin'], margins=True)

In [None]:
temp = pd.get_dummies(sample.Location)
temp.columns[0]

In [None]:
sample

In [None]:
# 각 feature들의 type
sample.dtypes

In [None]:
sample

In [None]:
pd.crosstab(sample['Type'], sample['TargetOrigin'], margins=True)

In [None]:
f, ax = plt.subplots(2, 2, figsize=(18, 18))

sample[['Type', 'Target_0']].groupby(['Type'], as_index=True).mean().plot.bar(ax=ax[0][0])
ax[0][0].set_title('Target_0 rate about Type')

sample[['Type', 'Target_1']].groupby(['Type'], as_index=True).mean().plot.bar(ax=ax[0][1])
ax[0][1].set_title('Target_1 rate about Type')

sample[['Type', 'Target_2']].groupby(['Type'], as_index=True).mean().plot.bar(ax=ax[1][0])
ax[1][0].set_title('Target_2 rate about Type')

sample[['Type', 'Target_3']].groupby(['Type'], as_index=True).mean().plot.bar(ax=ax[1][1])
ax[1][1].set_title('Target_3 rate about Type')

plt.tight_layout()
plt.show()

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(9, 9))
sns.countplot('Type', hue='TargetOrigin', data=sample, ax=ax)
ax.set_title('Type list about AgeBand')
plt.xticks(fontsize =15,rotation =45)
plt.show()

In [None]:
f, ax = plt.subplots(2, 2, figsize=(18, 18))

sample[['AgeBand', 'Target_0']].groupby(['AgeBand'], as_index=True).mean().plot.bar(ax=ax[0][0])
ax[0][0].set_title('Target_0 rate about AgeBand')

sample[['AgeBand', 'Target_1']].groupby(['AgeBand'], as_index=True).mean().plot.bar(ax=ax[0][1])
ax[0][1].set_title('Target_1 rate about AgeBand')

sample[['AgeBand', 'Target_2']].groupby(['AgeBand'], as_index=True).mean().plot.bar(ax=ax[1][0])
ax[1][0].set_title('Target_2 rate about AgeBand')

sample[['AgeBand', 'Target_3']].groupby(['AgeBand'], as_index=True).mean().plot.bar(ax=ax[1][1])
ax[1][1].set_title('Target_3 rate about AgeBand')

plt.tight_layout()
plt.show()

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(9, 9))
sns.countplot('AgeBand', hue='TargetOrigin', data=sample, ax=ax)
ax.set_title('AgeBand list about AgeBand')

plt.show()

In [None]:
f, ax = plt.subplots(2, 2, figsize=(18, 18))

sample[['Weather', 'Target_0']].groupby(['Weather'], as_index=True).mean().plot.bar(ax=ax[0][0])
ax[0][0].set_title('Target_0 rate about Weather')

sample[['Weather', 'Target_1']].groupby(['Weather'], as_index=True).mean().plot.bar(ax=ax[0][1])
ax[0][1].set_title('Target_1 rate about Weather')

sample[['Weather', 'Target_2']].groupby(['Weather'], as_index=True).mean().plot.bar(ax=ax[1][0])
ax[1][0].set_title('Target_2 rate about Weather')

sample[['Weather', 'Target_3']].groupby(['Weather'], as_index=True).mean().plot.bar(ax=ax[1][1])
ax[1][1].set_title('Target_3 rate about Weather')

plt.tight_layout()
plt.show()

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(9, 9))
sns.countplot('Weather', hue='TargetOrigin', data=sample, ax=ax)
ax.set_title('Weather list about AgeBand')

plt.show()

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(9, 5))
sns.kdeplot(sample[sample['Target_2'] == 1]['Age'], ax=ax)
sns.kdeplot(sample[sample['Target_2'] == 0]['Age'], ax=ax)
plt.legend(['Death == 1', 'Death == 0'])
plt.show()

In [None]:
def preprocessing(feature):
    temp = pd.get_dummies(feature)
    temp = temp.columns
    print(temp)
    for a in range(0,len(temp)):
        feature = np.where(feature == temp[a],a,feature)
    return feature

In [None]:
DayArray =['일','월','화','수','목','금','토']
print(DayArray[0])
for a in range(0,len(DayArray)):
    sample['Day'] = np.where(sample['Day'] == DayArray[a],a,sample['Day'])

In [None]:
# label 바꾸는 과정

sample.Location=preprocessing(sample.Location)
sample.Location
sample.RoadState = preprocessing(sample.RoadState)
sample.Weather = preprocessing(sample.Weather)
sample.RoadShape = preprocessing(sample.RoadShape)
sample.Type = preprocessing(sample.Type)
sample.Time = preprocessing(sample.Time)

In [None]:

sample['Location'] = sample['Location'].astype('int')

sample['Day'] = sample['Day'].astype('int')
sample['RoadState'] = sample['RoadState'].astype('int')
sample['Weather'] = sample['Weather'].astype('int')
sample['RoadShape'] = sample['RoadShape'].astype('int')
sample['Type'] = sample['Type'].astype('int')
sample['Time']=sample['Time'].astype('int')

In [None]:
# DEBUG: Target: 2, 3 레이블 합침

sample['TargetOrigin'] = np.where(sample['TargetOrigin'] == 3, 2, sample['TargetOrigin'])

In [None]:
sample = sample.iloc[np.random.permutation(len(sample))]

## 2. Save preprocessing data

In [120]:
# 전처리 데이터 저장(input/preprocessing.csv)
#sample.to_csv('../input/preprocessing.csv', encoding='utf-8-sig',index_label=False)
data.to_csv('../input/preprocessing.csv', encoding='utf-8-sig',index_label=False)

In [121]:
preprocessing = pd.read_csv('../input/preprocessing.csv')

print("sample.shape:", preprocessing.shape)

# 초기 샘플 데이터
preprocessing

sample.shape: (12000, 60)


Unnamed: 0,TargetOrigin,Target_0,Target_1,Target_2,Target_3,Location_East,Location_North,Location_South,Location_West,RoadState_건조,...,Time_afternoon,Time_night,AgeBand_10,AgeBand_20,AgeBand_30,AgeBand_40,AgeBand_50,AgeBand_60,AgeBand_70,AgeBand_80
9,0,1,0,0,0,0,1,0,0,1,...,0,0,0,0,1,0,0,0,0,0
216728,0,1,0,0,0,1,0,0,0,1,...,0,1,0,0,0,0,0,1,0,0
216730,0,1,0,0,0,0,0,1,0,1,...,0,1,1,0,0,0,0,0,0,0
216732,0,1,0,0,0,1,0,0,0,1,...,0,1,0,0,0,0,1,0,0,0
216733,0,1,0,0,0,0,0,0,1,1,...,0,1,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319861,3,0,0,0,1,0,0,1,0,1,...,1,0,0,0,0,0,0,1,0,0
23198,3,0,0,0,1,0,0,0,1,1,...,0,1,0,0,0,1,0,0,0,0
319862,3,0,0,0,1,0,1,0,0,1,...,0,1,0,0,0,1,0,0,0,0
28230,3,0,0,0,1,0,0,0,1,1,...,1,0,0,0,0,0,1,0,0,0


## 3.EDA and Correlation Coefficient analysis

In [None]:
sample

In [None]:
from sklearn.preprocessing import StandardScaler
sample2 = sample.drop(['Date','TargetOrigin','AgeBand','Target_0','Target_1','Target_2','Target_3'],axis = "columns")

tsample = sample2[sample['TargetOrigin']==0]
std_scaler = StandardScaler()
std_scaler.fit(tsample)
tsample = std_scaler.transform(tsample)   #normalize ,데이터 정규화 작업
tsample.shape

tsample2 = sample2[sample['TargetOrigin']==1]
std_scaler = StandardScaler()
std_scaler.fit(tsample2)
tsample2 = std_scaler.transform(tsample2)   #normalize ,데이터 정규화 작업
tsample2.shape

tsample3 = sample2[sample['TargetOrigin']==2]
std_scaler = StandardScaler()
std_scaler.fit(tsample3)
tsample3 = std_scaler.transform(tsample3)   #normalize ,데이터 정규화 작업
tsample3.shape

tsample4 = sample2[sample['TargetOrigin']==3]
std_scaler = StandardScaler()
std_scaler.fit(tsample4)
tsample4 = std_scaler.transform(tsample4)   #normalize ,데이터 정규화 작업
tsample4.shape

In [None]:
corr = np.corrcoef(np.transpose(tsample))
corr2 = np.corrcoef(np.transpose(tsample2))
corr3 = np.corrcoef(np.transpose(tsample3))
corr4 = np.corrcoef(np.transpose(tsample4))


In [None]:
corr

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
label = sample.columns[1:9]
heatmap2 = sns.heatmap(corr,cbar = True,annot = True,square = True,fmt = '.2f',annot_kws = {'size':5},
                      yticklabels = label,xticklabels = label)
plt.show()

In [None]:

heatmap2 = sns.heatmap(corr2,cbar = True,annot = True,square = True,fmt = '.2f',annot_kws = {'size':5},
                      yticklabels = label,xticklabels = label)
plt.show()

In [None]:

heatmap2 = sns.heatmap(corr3,cbar = True,annot = True,square = True,fmt = '.2f',annot_kws = {'size':5},
                      yticklabels = label,xticklabels = label)
plt.show()

In [None]:
heatmap2 = sns.heatmap(corr4,cbar = True,annot = True,square = True,fmt = '.2f',annot_kws = {'size':5},
                      yticklabels = label,xticklabels = label)
plt.show()