In [2]:
import pandas as pd
import numpy as np

### 1. 서울 자치구 별 범죄수 데이터 전처리

In [137]:
crime_list = pd.read_csv('./data/seoul_crime_len(2014~2018).txt', sep='\t', encoding='UTF8', header=None)
crime_list.head(5)
# 값이 '검거'인 항목을 제거하고 1번 컬럼 삭제
occur_crime = crime_list[1] != '검거'
crime_list = crime_list[occur_crime]
crime_list.drop([1], axis=1, inplace=True)
display(crime_list.shape)

#전치행렬 변환 및 컬럼 재정의
t_crime_list = crime_list.T
t_crime_list.columns = ['년도', '지역', '범죄합계', '살인', '강도', '강간강제추행', '절도', '폭력']
t_crime_list.drop([0],axis=0, inplace = True)
t_crime_list.reset_index(drop=True, inplace=True)
print(t_crime_list.info())

#데이터 형변환
t_crime_list[['년도','범죄합계', '살인', '강도', '강간강제추행', '절도', '폭력']] = t_crime_list[['년도','범죄합계', '살인', '강도', '강간강제추행', '절도', '폭력']].astype(int)
t_crime_list
#지역 항목중 '합계'인 항목 삭제
sum = t_crime_list['지역'] != '합계'
t_crime_list = t_crime_list[sum]
#피벗 테이블 생성
pivot_t_crime_list = t_crime_list.pivot_table(index=['년도','지역'])
pivot_t_crime_list

(8, 131)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130 entries, 0 to 129
Data columns (total 8 columns):
년도        130 non-null object
지역        130 non-null object
범죄합계      130 non-null object
살인        130 non-null object
강도        130 non-null object
강간강제추행    130 non-null object
절도        130 non-null object
폭력        130 non-null object
dtypes: object(8)
memory usage: 8.2+ KB
None


Unnamed: 0_level_0,Unnamed: 1_level_0,강간강제추행,강도,범죄합계,살인,절도,폭력
년도,지역,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2014,강남구,512,37,8851,13,3895,4394
2014,강동구,148,10,5392,5,2425,2804
2014,강북구,189,21,4030,6,1494,2320
2014,강서구,214,9,5289,8,2341,2717
2014,관악구,361,25,6781,7,3029,3359
...,...,...,...,...,...,...,...
2018,용산구,331,3,3411,1,1096,1980
2018,은평구,188,1,3590,8,1278,2115
2018,종로구,236,3,3690,6,1483,1962
2018,중구,207,11,4030,2,1855,1955


### 2. CCTV 전처리

In [138]:
cctv_list = pd.read_excel('./data/seoul_CCTV_2014~2018.xlsx')
cctv_list['기관명'] = cctv_list['기관명'].str.replace(' ','')
# 불필요 항목 제거('소계')
cctv_list.drop(['소계'], axis=1,inplace=True)
display(cctv_list.shape)
# 인덱스 이름 변경 및 컬럼 재정의
t_cctv_list = cctv_list.T
t_cctv_list.rename(index={'기관명':'년도'}, inplace=True)
t_cctv_list.columns = t_cctv_list.loc['년도']
t_cctv_list.drop(['년도'], axis=0, inplace=True)
#print(t_cctv_list)
#연도 및 지역별 CCTV 갯수 합
for i in t_cctv_list.index:
    for j in t_cctv_list.columns:
        pivot_t_crime_list.loc[(i,j ), 'CCTV'] = int(t_cctv_list.loc[i, j])
    
print(pivot_t_crime_list)

(25, 6)

          강간강제추행  강도  범죄합계  살인    절도    폭력    CCTV
년도   지역                                           
2014 강남구     512  37  8851  13  3895  4394   430.0
     강동구     148  10  5392   5  2425  2804    59.0
     강북구     189  21  4030   6  1494  2320    74.0
     강서구     214   9  5289   8  2341  2717   230.0
     관악구     361  25  6781   7  3029  3359   487.0
...          ...  ..   ...  ..   ...   ...     ...
2018 용산구     331   3  3411   1  1096  1980   431.0
     은평구     188   1  3590   8  1278  2115  1415.0
     종로구     236   3  3690   6  1483  1962   857.0
     중구      207  11  4030   2  1855  1955  1240.0
     중랑구     174   3  4288   4  1526  2581  1068.0

[125 rows x 7 columns]


### 3. 인구수 데이터 전처리

In [140]:
#결측치 처리 및 필요없는 컬럼 지우기
popul_df = pd.read_excel('./data/seoul_population_(2014~2018).xls')
display(popul_df.shape)
popul_df['기간'].fillna(method='ffill', inplace=True)
popul_df = popul_df[popul_df['지역'] != '합계']


#형변환
popul_df['기간'] = popul_df['기간'].astype(int)
popul_df.info()

#피벗테이블은 기본 평균으로 병합됨
pivot_popul_df = popul_df.pivot_table(index=['기간','지역']).astype(int) 
pivot_popul_df.info()

#기존 테이블과 병합 및 컬럼 형변환
pivot_popul_df.loc


(520, 6)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 500 entries, 1 to 519
Data columns (total 6 columns):
기간       500 non-null int32
지역       500 non-null object
세대       500 non-null int64
합계       500 non-null int64
한국인      500 non-null int64
등록외국인    500 non-null int64
dtypes: int32(1), int64(4), object(1)
memory usage: 25.4+ KB
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 125 entries, (2014, 강남구) to (2018, 중랑구)
Data columns (total 4 columns):
등록외국인    125 non-null int32
세대       125 non-null int32
한국인      125 non-null int32
합계       125 non-null int32
dtypes: int32(4)
memory usage: 2.6+ KB


<pandas.core.indexing._LocIndexer at 0x1e107c7f778>

In [141]:
#pivot_t_crime_list
for i in pivot_popul_df.index:
    pivot_t_crime_list.loc[i, '세대'] = pivot_popul_df.loc[i , '세대']
    pivot_t_crime_list.loc[i, '한국인'] = pivot_popul_df.loc[i , '한국인']
    pivot_t_crime_list.loc[i, '등록외국인'] = pivot_popul_df.loc[i , '등록외국인']
    pivot_t_crime_list.loc[i, '인구합계'] = pivot_popul_df.loc[i , '합계']
pivot_t_crime_list

Unnamed: 0_level_0,Unnamed: 1_level_0,강간강제추행,강도,범죄합계,살인,절도,폭력,CCTV,세대,한국인,등록외국인,인구합계
년도,지역,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2014,강남구,512,37,8851,13,3895,4394,430.0,234324.0,571509.0,5359.0,576868.0
2014,강동구,148,10,5392,5,2425,2804,59.0,186239.0,479586.0,4605.0,484191.0
2014,강북구,189,21,4030,6,1494,2320,74.0,141139.0,336363.0,3308.0,339671.0
2014,강서구,214,9,5289,8,2341,2717,230.0,230553.0,576574.0,6331.0,582906.0
2014,관악구,361,25,6781,7,3029,3359,487.0,248703.0,515688.0,18120.0,533809.0
...,...,...,...,...,...,...,...,...,...,...,...,...
2018,용산구,331,3,3411,1,1096,1980,431.0,108691.0,229494.0,15714.0,245208.0
2018,은평구,188,1,3590,8,1278,2115,1415.0,203982.0,484289.0,4413.0,488703.0
2018,종로구,236,3,3690,6,1483,1962,857.0,73741.0,153697.0,9809.0,163507.0
2018,중구,207,11,4030,2,1855,1955,1240.0,61182.0,125913.0,9450.0,135364.0


In [142]:
#컬럼 형변환 - int
pivot_t_crime_list[['CCTV', '세대', '한국인', '등록외국인', '인구합계']] = pivot_t_crime_list[['CCTV', '세대', '한국인', '등록외국인', '인구합계']].astype(int)

In [143]:
pivot_t_crime_list.to_csv('./data/20191227_seoul_crime_sum.csv')

In [144]:
pivot_t_crime_list.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 125 entries, (2014, 강남구) to (2018, 중랑구)
Data columns (total 11 columns):
강간강제추행    125 non-null int32
강도        125 non-null int32
범죄합계      125 non-null int32
살인        125 non-null int32
절도        125 non-null int32
폭력        125 non-null int32
CCTV      125 non-null int32
세대        125 non-null int32
한국인       125 non-null int32
등록외국인     125 non-null int32
인구합계      125 non-null int32
dtypes: int32(11)
memory usage: 11.8+ KB


## 4. 식품위생업 데이터


In [184]:
nndf_2014 = pd.read_table('./data/서울시 식품위생업 현황 (구별) 통계(2014).txt', header=None)
nndf_2015 = pd.read_table('./data/서울시 식품위생업 현황 (구별) 통계(2015).txt', header=None)
nndf_2015.drop([0,1],axis=1 ,inplace=True)
nndf_2016 = pd.read_table('./data/서울시 식품위생업 현황 (구별) 통계(2016).txt', header=None)
nndf_2016.drop([0,1],axis=1 ,inplace=True)
nndf_2017 = pd.read_table('./data/서울시 식품위생업 현황 (구별) 통계(2017).txt', header=None)
nndf_2017.drop([0,1],axis=1 ,inplace=True)
nndf_2018 = pd.read_table('./data/서울시 식품위생업 현황 (구별) 통계(2018).txt', header=None)
nndf_2018.drop([0,1],axis=1 ,inplace=True)

In [188]:
nndf_2014_to_2018 = pd.concat([nndf_2014,nndf_2015,nndf_2016,nndf_2017,nndf_2018],axis=1)
nndf_sp = (nndf_2014_to_2018[1] == '단란주점') | (nndf_2014_to_2018[1] == '유흥주점') | (nndf_2014_to_2018[1] == '구분(2)')
nndf_2014_to_2018 = nndf_2014_to_2018[nndf_sp]
nndf_2014_to_2018 = nndf_2014_to_2018.T
nndf_2014_to_2018.drop([0,1,2],axis=0 ,inplace=True)
nndf_2014_to_2018.columns = ['년도','지역','단란주점','유흥주점']
nndf_2014_to_2018.reset_index(drop=True,inplace=True)
#nndf_2014_to_2018.set_index('년도',inplace=True)
display(nndf_2014_to_2018.head(3))
nndf_2014_to_2018[['년도','단란주점','유흥주점']] = nndf_2014_to_2018[['년도','단란주점','유흥주점']].astype(int)
nndf_2014_to_2018.info()
pivot_nndf_2014_to_2018 = nndf_2014_to_2018.pivot_table(index=['년도','지역'])
pivot_nndf_2014_to_2018
pivot_nndf_2014_to_2018.to_csv('./data/20191231_seoul_식품위생업_데이터_2014_to_2018.csv')
pivot_nndf_2014_to_2018

Unnamed: 0,년도,지역,단란주점,유흥주점
0,2014,종로구,136,224
1,2014,중구,101,257
2,2014,용산구,121,35


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125 entries, 0 to 124
Data columns (total 4 columns):
년도      125 non-null int32
지역      125 non-null object
단란주점    125 non-null int32
유흥주점    125 non-null int32
dtypes: int32(3), object(1)
memory usage: 2.6+ KB


Unnamed: 0_level_0,Unnamed: 1_level_0,단란주점,유흥주점
년도,지역,Unnamed: 2_level_1,Unnamed: 3_level_1
2014,강남구,328,276
2014,강동구,105,151
2014,강북구,129,74
2014,강서구,169,114
2014,관악구,100,228
...,...,...,...
2018,용산구,108,29
2018,은평구,137,82
2018,종로구,130,210
2018,중구,92,233


In [189]:
conc = pd.concat([pivot_t_crime_list,pivot_nndf_2014_to_2018], axis=1)
conc

Unnamed: 0_level_0,Unnamed: 1_level_0,강간강제추행,강도,범죄합계,살인,절도,폭력,CCTV,세대,한국인,등록외국인,인구합계,단란주점,유흥주점
년도,지역,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2014,강남구,512,37,8851,13,3895,4394,430,234324,571509,5359,576868,328,276
2014,강동구,148,10,5392,5,2425,2804,59,186239,479586,4605,484191,105,151
2014,강북구,189,21,4030,6,1494,2320,74,141139,336363,3308,339671,129,74
2014,강서구,214,9,5289,8,2341,2717,230,230553,576574,6331,582906,169,114
2014,관악구,361,25,6781,7,3029,3359,487,248703,515688,18120,533809,100,228
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018,용산구,331,3,3411,1,1096,1980,431,108691,229494,15714,245208,108,29
2018,은평구,188,1,3590,8,1278,2115,1415,203982,484289,4413,488703,137,82
2018,종로구,236,3,3690,6,1483,1962,857,73741,153697,9809,163507,130,210
2018,중구,207,11,4030,2,1855,1955,1240,61182,125913,9450,135364,92,233
