In [1]:
import pandas as pd
import numpy as np

### 1. 서울 자치구 별 범죄수 데이터 전처리

In [40]:
crime_list = pd.read_csv('./data/seoul_crime_len(2014~2018).txt', sep='\t', encoding='UTF8', header=None)

In [41]:
crime_list.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,122,123,124,125,126,127,128,129,130,131
0,5대범죄,5대범죄현황,2014,2014,2014,2014,2014,2014,2014,2014,...,2018,2018,2018,2018,2018,2018,2018,2018,2018,2018
1,5대범죄,5대범죄현황,합계,종로구,중구,용산구,성동구,광진구,동대문구,중랑구,...,강서구,구로구,금천구,영등포구,동작구,관악구,서초구,강남구,송파구,강동구
2,합계,발생,130674,5021,5231,3799,3582,6268,4363,5353,...,4629,4810,3293,5840,3100,5026,4726,7513,5807,3919
3,합계,검거,79061,4610,3188,2340,2048,3531,2882,3259,...,3469,3338,2567,3922,2090,3653,3183,5196,4051,2789
4,살인,발생,158,3,6,1,1,8,12,11,...,11,8,6,17,3,9,5,10,5,3


In [42]:
# 값이 '검거'인 항목을 제거하고 1번 컬럼 삭제
occur_crime = crime_list[1] != '검거'
crime_list = crime_list[occur_crime]
crime_list.drop([1], axis=1, inplace=True)
crime_list.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8 entries, 0 to 12
Columns: 131 entries, 0 to 131
dtypes: object(131)
memory usage: 8.2+ KB


In [43]:
#전치행렬 변환 및 컬럼 재정의
t_crime_list = crime_list.T
t_crime_list.columns = ['년도', '지역', '범죄합계', '살인', '강도', '강간강제추행', '절도', '폭력']
t_crime_list.drop([0],axis=0, inplace = True)
t_crime_list.reset_index(drop=True, inplace=True)
t_crime_list

Unnamed: 0,년도,지역,범죄합계,살인,강도,강간강제추행,절도,폭력
0,2014,합계,130674,158,343,5462,59393,65318
1,2014,종로구,5021,3,12,226,2272,2508
2,2014,중구,5231,6,13,221,2576,2415
3,2014,용산구,3799,1,7,213,1560,2018
4,2014,성동구,3582,1,5,141,1753,1682
...,...,...,...,...,...,...,...,...
125,2018,관악구,5026,9,10,352,2012,2643
126,2018,서초구,4726,5,5,470,1851,2395
127,2018,강남구,7513,10,14,600,3004,3885
128,2018,송파구,5807,5,6,309,2352,3135


In [44]:
#데이터 형변환
t_crime_list[['범죄합계', '살인', '강도', '강간강제추행', '절도', '폭력']] = t_crime_list[['범죄합계', '살인', '강도', '강간강제추행', '절도', '폭력']].astype(int)
t_crime_list

Unnamed: 0,년도,지역,범죄합계,살인,강도,강간강제추행,절도,폭력
0,2014,합계,130674,158,343,5462,59393,65318
1,2014,종로구,5021,3,12,226,2272,2508
2,2014,중구,5231,6,13,221,2576,2415
3,2014,용산구,3799,1,7,213,1560,2018
4,2014,성동구,3582,1,5,141,1753,1682
...,...,...,...,...,...,...,...,...
125,2018,관악구,5026,9,10,352,2012,2643
126,2018,서초구,4726,5,5,470,1851,2395
127,2018,강남구,7513,10,14,600,3004,3885
128,2018,송파구,5807,5,6,309,2352,3135


In [45]:
#지역 항목중 '합계'인 항목 삭제
sum = t_crime_list['지역'] != '합계'
t_crime_list = t_crime_list[sum]

In [46]:
#피벗 테이블 생성
pivot_t_crime_list = t_crime_list.pivot_table(index=['년도','지역'])
pivot_t_crime_list

Unnamed: 0_level_0,Unnamed: 1_level_0,강간강제추행,강도,범죄합계,살인,절도,폭력
년도,지역,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2014,강남구,512,37,8851,13,3895,4394
2014,강동구,148,10,5392,5,2425,2804
2014,강북구,189,21,4030,6,1494,2320
2014,강서구,214,9,5289,8,2341,2717
2014,관악구,361,25,6781,7,3029,3359
...,...,...,...,...,...,...,...
2018,용산구,331,3,3411,1,1096,1980
2018,은평구,188,1,3590,8,1278,2115
2018,종로구,236,3,3690,6,1483,1962
2018,중구,207,11,4030,2,1855,1955


In [61]:
pivot_t_crime_list.index(s'지역')

TypeError: 'MultiIndex' object is not callable

### 2. CCTV 전처리

In [47]:
cctv_list = pd.read_excel('./data/seoul_CCTV_len(2014~2018.xlsx')
cctv_list

Unnamed: 0,기관명,소계,2014,2015,2016,2017,2018
0,강남구,2766,430,546,765,577,448
1,강동구,1055,59,144,194,273,385
2,강북구,793,74,145,254,1,319
3,강서구,1125,230,187,190,264,254
4,관악구,3080,487,609,619,694,671
5,광진구,986,87,64,21,468,346
6,구로구,1807,187,268,326,540,486
7,금천구,1348,101,382,136,199,530
8,노원구,1105,80,461,298,110,156
9,도봉구,587,185,59,155,117,71


In [10]:
# 불필요 항목 제거('소계')
cctv_list = cctv_list.drop(['소계'], axis=1)

In [11]:
# 인덱스 이름 변경 및 컬럼 재정의
t_cctv_list = cctv_list.T
t_cctv_list.rename(index={'기관명':'년도'}, inplace=True)
t_cctv_list.columns = t_cctv_list.loc['년도']
t_cctv_list.drop(['년도'], axis=0, inplace=True)
t_cctv_list

년도,강남구,강동구,강북구,강서구,관악구,광진구,구로구,금천구,노원구,도봉구,...,성동구,성북구,송파구,양천구,영등포구,용산구,은평구,종로구,중구,중랑구
2014,430,59,74,230,487,87,187,101,80,185,...,101,241,21,169,217,107,343,132,80,770
2015,546,144,145,187,609,64,268,382,461,59,...,258,279,166,172,366,102,180,195,245,102
2016,765,194,254,190,619,21,326,136,298,155,...,201,388,100,349,289,89,296,148,270,121
2017,577,273,1,264,694,468,540,199,110,117,...,933,285,116,137,371,60,229,281,317,66
2018,448,385,319,254,671,346,486,530,156,71,...,294,643,396,830,793,73,367,101,328,9


In [12]:
#연도 및 지역별 CCTV 갯수 합
for i in t_cctv_list.index:
    for j in t_cctv_list.columns:
        pivot_t_crime_list.loc[(str(i),j ),'CCTV'] = int(t_cctv_list.loc[i, j])
    
pivot_t_crime_list

Unnamed: 0_level_0,Unnamed: 1_level_0,강간강제추행,강도,범죄합계,살인,절도,폭력,CCTV
년도,지역,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2014,강남구,512,37,8851,13,3895,4394,430.0
2014,강동구,148,10,5392,5,2425,2804,59.0
2014,강북구,189,21,4030,6,1494,2320,74.0
2014,강서구,214,9,5289,8,2341,2717,230.0
2014,관악구,361,25,6781,7,3029,3359,487.0
...,...,...,...,...,...,...,...,...
2018,용산구,331,3,3411,1,1096,1980,73.0
2018,은평구,188,1,3590,8,1278,2115,367.0
2018,종로구,236,3,3690,6,1483,1962,101.0
2018,중구,207,11,4030,2,1855,1955,328.0


### 3. 인구수 데이터 전처리

In [13]:
#결측치 처리 및 필요없는 컬럼 지우기
popul_df = pd.read_excel('./data/seoul_population_(2014~2018).xls')
popul_df['기간'].fillna(method='ffill', inplace=True)
popul_df = popul_df[popul_df['자치구'] != '합계']
popul_df

Unnamed: 0,기간,자치구,세대,합계,한국인,등록외국인
1,2014.0,종로구,73689,167350,159551,7799
2,2014.0,중구,60481,137466,129940,7526
3,2014.0,용산구,109487,251651,239381,12270
4,2014.0,성동구,126714,306597,299416,7181
5,2014.0,광진구,159650,381017,368325,12692
...,...,...,...,...,...,...
515,2018.0,관악구,262222,520040,501957,18083
516,2018.0,서초구,172918,438163,433951,4212
517,2018.0,강남구,228775,547453,542364,5089
518,2018.0,송파구,270866,673507,666635,6872


In [14]:
popul_df['기간'] = popul_df['기간'].astype(int)
popul_df['기간'] = popul_df['기간'].astype(str)
popul_df

Unnamed: 0,기간,자치구,세대,합계,한국인,등록외국인
1,2014,종로구,73689,167350,159551,7799
2,2014,중구,60481,137466,129940,7526
3,2014,용산구,109487,251651,239381,12270
4,2014,성동구,126714,306597,299416,7181
5,2014,광진구,159650,381017,368325,12692
...,...,...,...,...,...,...
515,2018,관악구,262222,520040,501957,18083
516,2018,서초구,172918,438163,433951,4212
517,2018,강남구,228775,547453,542364,5089
518,2018,송파구,270866,673507,666635,6872


In [15]:
pivot_popul_df = popul_df.pivot_table(index=['기간','자치구']).astype(int)
pivot_popul_df

Unnamed: 0_level_0,Unnamed: 1_level_0,등록외국인,세대,한국인,합계
기간,자치구,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014,강남구,5359,234324,571509,576868
2014,강동구,4605,186239,479586,484191
2014,강북구,3308,141139,336363,339671
2014,강서구,6331,230553,576574,582906
2014,관악구,18120,248703,515688,533809
...,...,...,...,...,...
2018,용산구,15714,108691,229494,245208
2018,은평구,4413,203982,484289,488703
2018,종로구,9809,73741,153697,163507
2018,중구,9450,61182,125913,135364


In [27]:
#기존 테이블과 병합 및 컬럼 형변환
pivot_popul_df.loc

#pivot_t_crime_list
for i in pivot_popul_df.index:
    pivot_t_crime_list.loc[i, '세대'] = pivot_popul_df.loc[i , '세대']
    pivot_t_crime_list.loc[i, '한국인'] = pivot_popul_df.loc[i , '한국인']
    pivot_t_crime_list.loc[i, '등록외국인'] = pivot_popul_df.loc[i , '등록외국인']
    pivot_t_crime_list.loc[i, '인구합계'] = pivot_popul_df.loc[i , '합계']

In [29]:
pivot_t_crime_list[['CCTV', '세대', '한국인', '등록외국인', '인구합계']] = pivot_t_crime_list[['CCTV', '세대', '한국인', '등록외국인', '인구합계']].astype(int)

In [30]:
pivot_t_crime_list.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 125 entries, (2014, 강남구) to (2018, 중랑구)
Data columns (total 11 columns):
강간강제추행    125 non-null int32
강도        125 non-null int32
범죄합계      125 non-null int32
살인        125 non-null int32
절도        125 non-null int32
폭력        125 non-null int32
CCTV      125 non-null int32
세대        125 non-null int32
한국인       125 non-null int32
등록외국인     125 non-null int32
인구합계      125 non-null int32
dtypes: int32(11)
memory usage: 11.8+ KB


In [32]:
pivot_t_crime_list.to_csv('./data/20191227_seoul_crime_sum.csv')

In [34]:
pivot_t_crime_list.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 125 entries, (2014, 강남구) to (2018, 중랑구)
Data columns (total 11 columns):
강간강제추행    125 non-null int32
강도        125 non-null int32
범죄합계      125 non-null int32
살인        125 non-null int32
절도        125 non-null int32
폭력        125 non-null int32
CCTV      125 non-null int32
세대        125 non-null int32
한국인       125 non-null int32
등록외국인     125 non-null int32
인구합계      125 non-null int32
dtypes: int32(11)
memory usage: 11.8+ KB
