In [4]:
import os
import glob
import re

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# Load one quarterly population file and return it as a cleaned DataFrame
def dataFrame(addr, exclude_dong=('소계',)):
    """Read one quarterly elderly-population file and return a cleaned frame.

    The file is read with ``pd.read_table``. Rows 0 and 1 are dropped (in the
    raw 2011+ frames shown in this notebook they hold sub-header text, not
    data), the '전체인구.1' / '전체인구.2' columns are removed, and the first
    seven remaining columns are kept and renamed. Keeping only the first seven
    columns lets the same function read both the narrow 2010 layout and the
    wider 2011+ layout that carries extra 65+ breakdown columns.

    Parameters
    ----------
    addr : str
        Path of the tab-separated text file.
    exclude_dong : iterable of str, optional
        Values of the ``dong`` column whose rows are removed. Defaults to
        ``('소계',)``, which reproduces the original behaviour; pass
        ``('소계', '기타')`` to also drop the '기타' rows.

    Returns
    -------
    pandas.DataFrame
        Columns ``period, gu, dong, pop, olds, male_olds, female_olds,
        proportion`` with a fresh 0..n-1 index. The four count columns are
        integers and ``proportion`` is ``olds / pop * 100`` rounded to two
        decimals.

    Raises
    ------
    KeyError
        If the file has no '전체인구.1' / '전체인구.2' columns.
    ValueError
        If fewer than seven columns remain, or a count cannot be converted
        to ``int`` after removing commas.
    """
    column_names = ['period', 'gu', 'dong', 'pop', 'olds', 'male_olds', 'female_olds']
    count_cols = ['pop', 'olds', 'male_olds', 'female_olds']

    raw = pd.read_table(addr)

    # Rows 0 and 1 are not data rows.
    df = raw.drop(raw.index[[0, 1]])
    df = df.drop(['전체인구.1', '전체인구.2'], axis=1)

    # Keep the leading seven columns only; .copy() gives an independent frame
    # so the assignments below never write into a slice of ``raw``.
    df = df.iloc[:, :len(column_names)].copy()
    df.columns = column_names

    # Remove the unwanted rows (subtotals by default) and renumber the index.
    df = df[~df['dong'].isin(exclude_dong)].reset_index(drop=True)

    # Strip thousands separators so the counts can be converted to integers.
    df[count_cols] = df[count_cols].replace(',', '', regex=True)

    df_1 = df.astype({col: int for col in count_cols})
    df_1["proportion"] = (df_1["olds"] / df_1["pop"] * 100).round(2)

    return df_1

In [7]:
# Process the 2010 data: load the first-quarter file through dataFrame()
addr = "data/population/서울시 고령자현황 (동별) 통계 (2010_1).txt"
df_total = dataFrame(addr)
df_total

Unnamed: 0,period,gu,dong,pop,olds,male_olds,female_olds,proportion
0,2010.1/4,합계,합계,10464171,949680,408178,541502,9.08
1,2010.1/4,종로구,사직동,10433,1392,596,796,13.34
2,2010.1/4,종로구,삼청동,3689,557,236,321,15.10
3,2010.1/4,종로구,부암동,11566,1456,677,779,12.59
4,2010.1/4,종로구,평창동,19713,2346,1000,1346,11.90
...,...,...,...,...,...,...,...,...
420,2010.1/4,강동구,둔촌1동,20851,1045,380,665,5.01
421,2010.1/4,강동구,둔촌2동,28261,2142,851,1291,7.58
422,2010.1/4,강동구,암사1동,39990,3218,1392,1826,8.05
423,2010.1/4,강동구,천호2동,39169,3457,1511,1946,8.83


In [8]:
# Load all four 2010 quarters and stack them with a single concat.
# Quarter 1 is re-read here rather than taken from the existing df_total, so
# re-running this cell always yields the same frame instead of appending
# quarters 2-4 again.
quarter_frames = [
    dataFrame("data/population/서울시 고령자현황 (동별) 통계 (2010_" + str(i) + ").txt")
    for i in range(1, 5)
]
df_total = pd.concat(quarter_frames, axis=0)

df_total

Unnamed: 0,period,gu,dong,pop,olds,male_olds,female_olds,proportion
0,2010.1/4,합계,합계,10464171,949680,408178,541502,9.08
1,2010.1/4,종로구,사직동,10433,1392,596,796,13.34
2,2010.1/4,종로구,삼청동,3689,557,236,321,15.10
3,2010.1/4,종로구,부암동,11566,1456,677,779,12.59
4,2010.1/4,종로구,평창동,19713,2346,1000,1346,11.90
...,...,...,...,...,...,...,...,...
420,2010.4/4,강동구,둔촌1동,20759,1107,408,699,5.33
421,2010.4/4,강동구,둔촌2동,30132,2399,983,1416,7.96
422,2010.4/4,강동구,암사1동,40183,3387,1469,1918,8.43
423,2010.4/4,강동구,천호2동,39458,3713,1646,2067,9.41


In [6]:
# Check the type of df_total
type(df_total)

pandas.core.frame.DataFrame

In [22]:
# Load the 2011-2019 files (four quarters per year) and stack them.
# Frames are collected in lists and concatenated once per level, instead of
# growing a DataFrame with pd.concat on every iteration.

yearly_frames = []

for j in range(2011, 2020):
    quarter_frames = []

    for k in range(1, 5):
        addr = "data/population/서울시 고령자현황 (동별) 통계 ("+str(j)+"_"+str(k)+").txt"
        quarter_frames.append(pd.read_table(addr))
        print(addr)

    print("-----")
    # df_tmp holds the current year's four quarters; after the loop it is 2019.
    df_tmp = pd.concat(quarter_frames)
    yearly_frames.append(df_tmp)

df_1119 = pd.concat(yearly_frames, axis=0, sort=False)

data/population/서울시 고령자현황 (동별) 통계 (2011_1).txt
data/population/서울시 고령자현황 (동별) 통계 (2011_2).txt
data/population/서울시 고령자현황 (동별) 통계 (2011_3).txt
data/population/서울시 고령자현황 (동별) 통계 (2011_4).txt
-----
data/population/서울시 고령자현황 (동별) 통계 (2012_1).txt
data/population/서울시 고령자현황 (동별) 통계 (2012_2).txt
data/population/서울시 고령자현황 (동별) 통계 (2012_3).txt
data/population/서울시 고령자현황 (동별) 통계 (2012_4).txt
-----
data/population/서울시 고령자현황 (동별) 통계 (2013_1).txt
data/population/서울시 고령자현황 (동별) 통계 (2013_2).txt
data/population/서울시 고령자현황 (동별) 통계 (2013_3).txt
data/population/서울시 고령자현황 (동별) 통계 (2013_4).txt
-----
data/population/서울시 고령자현황 (동별) 통계 (2014_1).txt
data/population/서울시 고령자현황 (동별) 통계 (2014_2).txt
data/population/서울시 고령자현황 (동별) 통계 (2014_3).txt
data/population/서울시 고령자현황 (동별) 통계 (2014_4).txt
-----
data/population/서울시 고령자현황 (동별) 통계 (2015_1).txt
data/population/서울시 고령자현황 (동별) 통계 (2015_2).txt
data/population/서울시 고령자현황 (동별) 통계 (2015_3).txt
data/population/서울시 고령자현황 (동별) 통계 (2015_4).txt
-----
data/population/서울시 고령자현황 (동별)

In [33]:
# Inspect df_tmp, the per-year frame left over from the 2011-2019 loading loop
df_tmp

Unnamed: 0,기간,자치구,동,전체인구,전체인구.1,전체인구.2,65세이상 인구,65세이상 인구.1,65세이상 인구.2,65세이상 인구.3,65세이상 인구.4,65세이상 인구.5,65세이상 인구.6,65세이상 인구.7,65세이상 인구.8
0,기간,자치구,동,전체인구,전체인구,전체인구,합계,합계,합계,내국인,내국인,내국인,외국인,외국인,외국인
1,기간,자치구,동,계,남자,여자,계,남자,여자,계,남자,여자,계,남자,여자
2,2019.1/4,합계,합계,10054979,4909387,5145592,1436125,634125,802000,1429163,630451,798712,6962,3674,3288
3,2019.1/4,종로구,소계,162913,78963,83950,26981,11922,15059,26776,11805,14971,205,117,88
4,2019.1/4,종로구,사직동,9782,4432,5350,1770,752,1018,1761,747,1014,9,5,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
447,2019.4/4,강동구,둔촌1동,283,134,149,94,35,59,93,34,59,1,1,-
448,2019.4/4,강동구,둔촌2동,27752,13701,14051,4010,1877,2133,4009,1876,2133,1,1,-
449,2019.4/4,강동구,암사1동,36693,18145,18548,5623,2472,3151,5609,2466,3143,14,6,8
450,2019.4/4,강동구,천호2동,35904,17865,18039,5692,2581,3111,5672,2571,3101,20,10,10


In [23]:
# Check the 2011-2019 data
df_1119

Unnamed: 0,기간,자치구,동,전체인구,전체인구.1,전체인구.2,65세이상 인구,65세이상 인구.1,65세이상 인구.2,65세이상 인구.3,65세이상 인구.4,65세이상 인구.5,65세이상 인구.6,65세이상 인구.7,65세이상 인구.8
0,기간,자치구,동,전체인구,전체인구,전체인구,합계,합계,합계,내국인,내국인,내국인,등록외국인,등록외국인,등록외국인
1,기간,자치구,동,계,남자,여자,계,남자,여자,계,남자,여자,계,남자,여자
2,2011.1/4,합계,합계,10581728,5238674,5343054,1018534,440762,577772,1013180,438147,575033,5354,2615,2739
3,2011.1/4,종로구,소계,179068,89119,89949,22595,9846,12749,22434,9767,12667,161,79,82
4,2011.1/4,종로구,사직동,10622,5012,5610,1549,660,889,1539,655,884,10,5,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
447,2019.4/4,강동구,둔촌1동,283,134,149,94,35,59,93,34,59,1,1,-
448,2019.4/4,강동구,둔촌2동,27752,13701,14051,4010,1877,2133,4009,1876,2133,1,1,-
449,2019.4/4,강동구,암사1동,36693,18145,18548,5623,2472,3151,5609,2466,3143,14,6,8
450,2019.4/4,강동구,천호2동,35904,17865,18039,5692,2581,3111,5672,2571,3101,20,10,10


In [24]:
# Data check 2: list the distinct values of the period column
df_1119["기간"].unique()

array(['기간', '2011.1/4', '2011.2/4', '2011.3/4', '2011.4/4', '2012.1/4',
       '2012.2/4', '2012.3/4', '2012.4/4', '2013.1/4', '2013.2/4',
       '2013.3/4', '2013.4/4', '2014.1/4', '2014.2/4', '2014.3/4',
       '2014.4/4', '2015.1/4', '2015.2/4', '2015.3/4', '2015.4/4',
       '2016.1/4', '2016.2/4', '2016.3/4', '2016.4/4', '2017.1/4',
       '2017.2/4', '2017.3/4', '2017.4/4', '2018.1/4', '2018.2/4',
       '2018.3/4', '2018.4/4', '2019.1/4', '2019.2/4', '2019.3/4',
       '2019.4/4'], dtype=object)

In [25]:
# Load the 2020 first-quarter data

path_2020_q1 = "data/population/서울시 고령자현황 (동별) 통계 (2020_1).txt"
df_20 = pd.read_table(path_2020_q1)
df_20

Unnamed: 0,기간,자치구,동,전체인구,전체인구.1,전체인구.2,65세이상 인구,65세이상 인구.1,65세이상 인구.2,65세이상 인구.3,65세이상 인구.4,65세이상 인구.5,65세이상 인구.6,65세이상 인구.7,65세이상 인구.8
0,기간,자치구,동,전체인구,전체인구,전체인구,합계,합계,합계,내국인,내국인,내국인,외국인,외국인,외국인
1,기간,자치구,동,계,남자,여자,계,남자,여자,계,남자,여자,계,남자,여자
2,2020.1/4,합계,합계,10013781,4874995,5138786,1518239,671095,847144,1510460,667011,843449,7779,4084,3695
3,2020.1/4,종로구,소계,161984,78271,83713,28073,12459,15614,27855,12335,15520,218,124,94
4,2020.1/4,종로구,사직동,9841,4469,5372,1818,775,1043,1805,768,1037,13,7,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
448,2020.1/4,강동구,둔촌1동,279,132,147,95,35,60,94,34,60,1,1,-
449,2020.1/4,강동구,둔촌2동,27520,13611,13909,4123,1917,2206,4118,1916,2202,5,1,4
450,2020.1/4,강동구,암사1동,36492,18011,18481,5751,2509,3242,5738,2504,3234,13,5,8
451,2020.1/4,강동구,천호2동,35714,17759,17955,5815,2628,3187,5792,2617,3175,23,11,12


In [26]:
# Combine 2011-2019 with 2020 Q1
df_1120 = pd.concat([df_1119, df_20], axis=0)
df_1120

Unnamed: 0,기간,자치구,동,전체인구,전체인구.1,전체인구.2,65세이상 인구,65세이상 인구.1,65세이상 인구.2,65세이상 인구.3,65세이상 인구.4,65세이상 인구.5,65세이상 인구.6,65세이상 인구.7,65세이상 인구.8
0,기간,자치구,동,전체인구,전체인구,전체인구,합계,합계,합계,내국인,내국인,내국인,등록외국인,등록외국인,등록외국인
1,기간,자치구,동,계,남자,여자,계,남자,여자,계,남자,여자,계,남자,여자
2,2011.1/4,합계,합계,10581728,5238674,5343054,1018534,440762,577772,1013180,438147,575033,5354,2615,2739
3,2011.1/4,종로구,소계,179068,89119,89949,22595,9846,12749,22434,9767,12667,161,79,82
4,2011.1/4,종로구,사직동,10622,5012,5610,1549,660,889,1539,655,884,10,5,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
448,2020.1/4,강동구,둔촌1동,279,132,147,95,35,60,94,34,60,1,1,-
449,2020.1/4,강동구,둔촌2동,27520,13611,13909,4123,1917,2206,4118,1916,2202,5,1,4
450,2020.1/4,강동구,암사1동,36492,18011,18481,5751,2509,3242,5738,2504,3234,13,5,8
451,2020.1/4,강동구,천호2동,35714,17759,17955,5815,2628,3187,5792,2617,3175,23,11,12


In [28]:
# List the distinct period values of the combined 2011-2020 frame
df_1120["기간"].unique()

array(['기간', '2011.1/4', '2011.2/4', '2011.3/4', '2011.4/4', '2012.1/4',
       '2012.2/4', '2012.3/4', '2012.4/4', '2013.1/4', '2013.2/4',
       '2013.3/4', '2013.4/4', '2014.1/4', '2014.2/4', '2014.3/4',
       '2014.4/4', '2015.1/4', '2015.2/4', '2015.3/4', '2015.4/4',
       '2016.1/4', '2016.2/4', '2016.3/4', '2016.4/4', '2017.1/4',
       '2017.2/4', '2017.3/4', '2017.4/4', '2018.1/4', '2018.2/4',
       '2018.3/4', '2018.4/4', '2019.1/4', '2019.2/4', '2019.3/4',
       '2019.4/4', '2020.1/4'], dtype=object)

In [29]:
# Drop the columns that are not needed:
# 전체인구.1, 전체인구.2 and 65세이상 인구.3 through 65세이상 인구.8
unused_columns = ['전체인구.1', '전체인구.2', '65세이상 인구.3', '65세이상 인구.4',
                  '65세이상 인구.5', '65세이상 인구.6', '65세이상 인구.7', '65세이상 인구.8']
df_1120 = df_1120.drop(columns=unused_columns)

df_1120

Unnamed: 0,기간,자치구,동,전체인구,65세이상 인구,65세이상 인구.1,65세이상 인구.2
0,기간,자치구,동,전체인구,합계,합계,합계
1,기간,자치구,동,계,계,남자,여자
2,2011.1/4,합계,합계,10581728,1018534,440762,577772
3,2011.1/4,종로구,소계,179068,22595,9846,12749
4,2011.1/4,종로구,사직동,10622,1549,660,889
...,...,...,...,...,...,...,...
448,2020.1/4,강동구,둔촌1동,279,95,35,60
449,2020.1/4,강동구,둔촌2동,27520,4123,1917,2206
450,2020.1/4,강동구,암사1동,36492,5751,2509,3242
451,2020.1/4,강동구,천호2동,35714,5815,2628,3187


In [30]:
# Rename the columns to short English names
df_1120.columns=['period', 'gu', 'dong', 'pop','olds','male_olds','female_olds']
df_1120

Unnamed: 0,period,gu,dong,pop,olds,male_olds,female_olds
0,기간,자치구,동,전체인구,합계,합계,합계
1,기간,자치구,동,계,계,남자,여자
2,2011.1/4,합계,합계,10581728,1018534,440762,577772
3,2011.1/4,종로구,소계,179068,22595,9846,12749
4,2011.1/4,종로구,사직동,10622,1549,660,889
...,...,...,...,...,...,...,...
448,2020.1/4,강동구,둔촌1동,279,95,35,60
449,2020.1/4,강동구,둔촌2동,27520,4123,1917,2206
450,2020.1/4,강동구,암사1동,36492,5751,2509,3242
451,2020.1/4,강동구,천호2동,35714,5815,2628,3187


In [32]:
# Remove rows that are not needed:
#   - repeated header rows (period == '기간')
#   - subtotal rows (dong == '소계')
# .copy() makes the result an independent frame, so the column assignments in
# the following cells do not trigger SettingWithCopyWarning.
is_header_row = df_1120.period == '기간'
is_subtotal_row = df_1120.dong == '소계'
df_1120 = df_1120[~(is_header_row | is_subtotal_row)].copy()

df_1120

Unnamed: 0,period,gu,dong,pop,olds,male_olds,female_olds
2,2011.1/4,합계,합계,10581728,1018534,440762,577772
4,2011.1/4,종로구,사직동,10622,1549,660,889
5,2011.1/4,종로구,삼청동,3687,591,253,338
6,2011.1/4,종로구,부암동,11635,1544,718,826
7,2011.1/4,종로구,평창동,19974,2523,1039,1484
...,...,...,...,...,...,...,...
448,2020.1/4,강동구,둔촌1동,279,95,35,60
449,2020.1/4,강동구,둔촌2동,27520,4123,1917,2206
450,2020.1/4,강동구,암사1동,36492,5751,2509,3242
451,2020.1/4,강동구,천호2동,35714,5815,2628,3187


In [34]:
# Remove ',' from the values of the count columns.
# assign() returns a new frame rather than writing into the filtered df_1120,
# which avoids SettingWithCopyWarning and replaces four copy-pasted lines.
count_cols = ["pop", "olds", "male_olds", "female_olds"]
df_1120 = df_1120.assign(
    **{col: df_1120[col].replace(',', '', regex=True) for col in count_cols}
)
df_1120

Unnamed: 0,period,gu,dong,pop,olds,male_olds,female_olds
2,2011.1/4,합계,합계,10581728,1018534,440762,577772
4,2011.1/4,종로구,사직동,10622,1549,660,889
5,2011.1/4,종로구,삼청동,3687,591,253,338
6,2011.1/4,종로구,부암동,11635,1544,718,826
7,2011.1/4,종로구,평창동,19974,2523,1039,1484
...,...,...,...,...,...,...,...
448,2020.1/4,강동구,둔촌1동,279,95,35,60
449,2020.1/4,강동구,둔촌2동,27520,4123,1917,2206
450,2020.1/4,강동구,암사1동,36492,5751,2509,3242
451,2020.1/4,강동구,천호2동,35714,5815,2628,3187


In [35]:
# Drop the '기타' rows: headcounts that cannot be attributed to a specific dong

keep_rows = df_1120["dong"] != '기타'
df_1120 = df_1120.loc[keep_rows]
df_1120

Unnamed: 0,period,gu,dong,pop,olds,male_olds,female_olds
2,2011.1/4,합계,합계,10581728,1018534,440762,577772
4,2011.1/4,종로구,사직동,10622,1549,660,889
5,2011.1/4,종로구,삼청동,3687,591,253,338
6,2011.1/4,종로구,부암동,11635,1544,718,826
7,2011.1/4,종로구,평창동,19974,2523,1039,1484
...,...,...,...,...,...,...,...
448,2020.1/4,강동구,둔촌1동,279,95,35,60
449,2020.1/4,강동구,둔촌2동,27520,4123,1917,2206
450,2020.1/4,강동구,암사1동,36492,5751,2509,3242
451,2020.1/4,강동구,천호2동,35714,5815,2628,3187


In [36]:
# Compute proportion = elderly population / total population * 100
count_cols = ['pop', 'olds', 'male_olds', 'female_olds']
df_1120_propo = df_1120.astype({col: int for col in count_cols})

elderly_share = df_1120_propo["olds"] / df_1120_propo["pop"] * 100
df_1120_propo["proportion"] = elderly_share.round(2)

df_1120_propo

Unnamed: 0,period,gu,dong,pop,olds,male_olds,female_olds,proportion
2,2011.1/4,합계,합계,10581728,1018534,440762,577772,9.63
4,2011.1/4,종로구,사직동,10622,1549,660,889,14.58
5,2011.1/4,종로구,삼청동,3687,591,253,338,16.03
6,2011.1/4,종로구,부암동,11635,1544,718,826,13.27
7,2011.1/4,종로구,평창동,19974,2523,1039,1484,12.63
...,...,...,...,...,...,...,...,...
448,2020.1/4,강동구,둔촌1동,279,95,35,60,34.05
449,2020.1/4,강동구,둔촌2동,27520,4123,1917,2206,14.98
450,2020.1/4,강동구,암사1동,36492,5751,2509,3242,15.76
451,2020.1/4,강동구,천호2동,35714,5815,2628,3187,16.28


In [39]:
# Check which periods are present in the result
df_1120_propo["period"].unique()

array(['2011.1/4', '2011.2/4', '2011.3/4', '2011.4/4', '2012.1/4',
       '2012.2/4', '2012.3/4', '2012.4/4', '2013.1/4', '2013.2/4',
       '2013.3/4', '2013.4/4', '2014.1/4', '2014.2/4', '2014.3/4',
       '2014.4/4', '2015.1/4', '2015.2/4', '2015.3/4', '2015.4/4',
       '2016.1/4', '2016.2/4', '2016.3/4', '2016.4/4', '2017.1/4',
       '2017.2/4', '2017.3/4', '2017.4/4', '2018.1/4', '2018.2/4',
       '2018.3/4', '2018.4/4', '2019.1/4', '2019.2/4', '2019.3/4',
       '2019.4/4', '2020.1/4'], dtype=object)

In [40]:
# Concatenate everything from 2010 through 2020
# NOTE: df_total is rebuilt from itself, so re-running this cell appends
# df_1120_propo a second time.
df_total = pd.concat([df_total, df_1120_propo])
df_total

Unnamed: 0,period,gu,dong,pop,olds,male_olds,female_olds,proportion
0,2010.1/4,합계,합계,10464171,949680,408178,541502,9.08
1,2010.1/4,종로구,사직동,10433,1392,596,796,13.34
2,2010.1/4,종로구,삼청동,3689,557,236,321,15.10
3,2010.1/4,종로구,부암동,11566,1456,677,779,12.59
4,2010.1/4,종로구,평창동,19713,2346,1000,1346,11.90
...,...,...,...,...,...,...,...,...
448,2020.1/4,강동구,둔촌1동,279,95,35,60,34.05
449,2020.1/4,강동구,둔촌2동,27520,4123,1917,2206,14.98
450,2020.1/4,강동구,암사1동,36492,5751,2509,3242,15.76
451,2020.1/4,강동구,천호2동,35714,5815,2628,3187,16.28


In [21]:
# Check the type of df_total
type(df_total)

pandas.core.frame.DataFrame

In [41]:
# Renumber the index from 0
df_total = df_total.reset_index(drop=True)
df_total

Unnamed: 0,period,gu,dong,pop,olds,male_olds,female_olds,proportion
0,2010.1/4,합계,합계,10464171,949680,408178,541502,9.08
1,2010.1/4,종로구,사직동,10433,1392,596,796,13.34
2,2010.1/4,종로구,삼청동,3689,557,236,321,15.10
3,2010.1/4,종로구,부암동,11566,1456,677,779,12.59
4,2010.1/4,종로구,평창동,19713,2346,1000,1346,11.90
...,...,...,...,...,...,...,...,...
17409,2020.1/4,강동구,둔촌1동,279,95,35,60,34.05
17410,2020.1/4,강동구,둔촌2동,27520,4123,1917,2206,14.98
17411,2020.1/4,강동구,암사1동,36492,5751,2509,3242,15.76
17412,2020.1/4,강동구,천호2동,35714,5815,2628,3187,16.28


In [42]:
# Save to CSV (so the data can be reused)
# df_total.to_csv("data_csv/클러스터링용/olds_population(cp949).csv", sep=',', encoding = "cp949", index=False)