Data from [KOSIS Resident Registration Population Status](https://kosis.kr/statisticsList/statisticsListIndex.do?parentId=A.1&vwcd=MT_ZTITLE&menuId=M_01_01#content-group)

Download **NON pivot** monthly data (2008-2025) from `Resident registration population by administrative district (city/county/district)/by generation)` dataset


In [1]:
import pandas as pd
import numpy as np
import os
import io

DATA_DIR = '../../data'
RAW_DATA_DIR = os.path.join(DATA_DIR, 'raw')
CLEANED_DATA_DIR = os.path.join(DATA_DIR, 'cleaned')

POPULATION_RAW_DATA_DIR = os.path.join(RAW_DATA_DIR, 'population')
POPULATION_CLEANED_DATA_DIR = os.path.join(CLEANED_DATA_DIR, 'population')

In [2]:
# Fix formatting issues in population data and load into DataFrames

dfs = []
for filename in os.listdir(POPULATION_RAW_DATA_DIR):
    if filename.endswith('.csv'):
        file_path = os.path.join(POPULATION_RAW_DATA_DIR, filename)
        print(f'Processing file: {file_path}')
        with open(file_path, 'r', encoding='euc-kr') as f:
            content = f.read()
        
        # Delete first 2 lines
        content = content.split('\n', 2)[2]
        
        # Convert to DataFrame
        file = io.StringIO(content)
        df = pd.read_csv(file)
        dfs.append(df)

Processing file: ../../data/raw/population/101_DT_1B04006_M_2008.csv
Processing file: ../../data/raw/population/101_DT_1B04006_M_2009.csv
Processing file: ../../data/raw/population/101_DT_1B04006_M_2010.csv
Processing file: ../../data/raw/population/101_DT_1B04006_M_2011.csv
Processing file: ../../data/raw/population/101_DT_1B04006_M_2012.csv
Processing file: ../../data/raw/population/101_DT_1B04006_M_2013.csv
Processing file: ../../data/raw/population/101_DT_1B04006_M_2014.csv
Processing file: ../../data/raw/population/101_DT_1B04006_M_2015.csv
Processing file: ../../data/raw/population/101_DT_1B04006_M_2016.csv
Processing file: ../../data/raw/population/101_DT_1B04006_M_2017.csv
Processing file: ../../data/raw/population/101_DT_1B04006_M_2018.csv
Processing file: ../../data/raw/population/101_DT_1B04006_M_2019.csv
Processing file: ../../data/raw/population/101_DT_1B04006_M_2020.csv
Processing file: ../../data/raw/population/101_DT_1B04006_M_2021.csv
Processing file: ../../data/raw/po

In [3]:
# Combine into single DataFrame
combined_df: pd.DataFrame = pd.concat(dfs, ignore_index=True)
combined_df

Unnamed: 0,C행정구역(시군구)별,행정구역(시군구)별,C연령별,연령별,시점,총인구수 (명),남자인구수 (명),여자인구수 (명)
0,'00,전국,'000,계,200801,49297732.0,24706068.0,24591664.0
1,'00,전국,'000,계,200802,49329973.0,24722970.0,24607003.0
2,'00,전국,'000,계,200803,49324670.0,24717226.0,24607444.0
3,'00,전국,'000,계,200804,49355153.0,24732772.0,24622381.0
4,'00,전국,'000,계,200805,49380296.0,24745488.0,24634808.0
...,...,...,...,...,...,...,...,...
6250045,'50130,서귀포시,'440,100세 이상,202508,65.0,7.0,58.0
6250046,'50130,서귀포시,'440,100세 이상,202509,62.0,7.0,55.0
6250047,'50130,서귀포시,'440,100세 이상,202510,65.0,7.0,58.0
6250048,'50130,서귀포시,'440,100세 이상,202511,60.0,7.0,53.0


In [4]:
# Check for nulls + summary stats
display(combined_df.isnull().sum())
display(combined_df.info())
display(combined_df.describe())

C행정구역(시군구)별    0
행정구역(시군구)별     0
C연령별           0
연령별            0
시점             0
총인구수 (명)       1
남자인구수 (명)      0
여자인구수 (명)      0
dtype: int64

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6250050 entries, 0 to 6250049
Data columns (total 8 columns):
 #   Column       Dtype  
---  ------       -----  
 0   C행정구역(시군구)별  object 
 1   행정구역(시군구)별   object 
 2   C연령별         object 
 3   연령별          object 
 4   시점           int64  
 5   총인구수 (명)     float64
 6   남자인구수 (명)    float64
 7   여자인구수 (명)    float64
dtypes: float64(3), int64(1), object(4)
memory usage: 381.5+ MB


None

Unnamed: 0,시점,총인구수 (명),남자인구수 (명),여자인구수 (명)
count,6250050.0,6250049.0,6250050.0,6250050.0
mean,201666.6,11244.53,5618.242,5626.283
std,520.5085,321982.0,160857.7,161132.9
min,200801.0,0.0,0.0,0.0
25%,201208.0,411.0,184.0,217.0
50%,201702.0,1311.0,630.0,687.0
75%,202108.0,3884.0,1964.0,1922.0
max,202512.0,51851430.0,25868640.0,25990780.0


In [5]:
# Check row with null
combined_df[combined_df.isnull().any(axis=1)]

Unnamed: 0,C행정구역(시군구)별,행정구역(시군구)별,C연령별,연령별,시점,총인구수 (명),남자인구수 (명),여자인구수 (명)
2957359,'44250,계룡시,'4304,98세,201608,,0.0,0.0


In [6]:
# Just one row has nulls, drop it
combined_df.dropna(inplace=True)

In [7]:
# Convert 'Disctrict' to english

# Start by getting korean names with df['District'].unique() and translating them manually with Google Translate
korean_names = list(df["행정구역(시군구)별"].unique())
print(f"Total unique district names: {len(korean_names)}")
# Print in a way that is easy to copy-paste
for i in range(0, len(korean_names), 30):
    print(','.join([name.strip() for name in korean_names[i:i+30]]))

Total unique district names: 271
전국,서울특별시,종로구,중구,용산구,성동구,광진구,동대문구,중랑구,성북구,강북구,도봉구,노원구,은평구,서대문구,마포구,양천구,강서구,구로구,금천구,영등포구,동작구,관악구,서초구,강남구,송파구,강동구,부산광역시,서구,동구
영도구,부산진구,동래구,남구,북구,해운대구,사하구,금정구,연제구,수영구,사상구,기장군,대구광역시,수성구,달서구,달성군,군위군,인천광역시,중구영종출장소,중구용유출장소,미추홀구,연수구,남동구,부평구,계양구,서구검단출장소,강화군,옹진군,광주광역시,광산구
대전광역시,유성구,대덕구,울산광역시,울주군,세종특별자치시,세종특별자치시,경기도,북부출장소,수원시,장안구,권선구,팔달구,영통구,성남시,수정구,중원구,분당구,의정부시,안양시,만안구,동안구,부천시,원미구,소사구,오정구,광명시,평택시,송탄출장소,안중출장소
동두천시,안산시,상록구,단원구,고양시,덕양구,일산동구,일산서구,과천시,구리시,남양주시,풍양출장소,오산시,시흥시,군포시,의왕시,하남시,용인시,처인구,기흥구,수지구,파주시,이천시,안성시,김포시,화성시,화성시동부출장소,화성시동탄출장소,광주시,양주시
포천시,여주시,연천군,가평군,양평군,강원특별자치도,동해출장소,춘천시,원주시,강릉시,동해시,태백시,속초시,삼척시,홍천군,횡성군,영월군,평창군,정선군,철원군,화천군,양구군,인제군,고성군,양양군,충청북도,청주시,상당구,서원구,흥덕구
청원구,충주시,제천시,보은군,옥천군,영동군,증평군,진천군,괴산군,음성군,단양군,충청남도,천안시,동남구,서북구,공주시,보령시,아산시,서산시,논산시,계룡시,당진시,금산군,부여군,서천군,청양군,홍성군,예산군,태안군,전북특별자치도
전주시,완산구,덕진구,군산시,익산시,정읍시,남원시,김제시,완주군,진안군,무주군,장수군,임실군,순창군,고창군,부안군,전라남도,목포시,여수시,순천시,나주시,광양시,담양군,곡성군,구례군,고흥군,보성군,화순군,장흥군,강진군
해남군,영암군,무안군,함평군,영광군,장성군,완도군,진도군,신안군,경상북도,포항시,경주시,김천시,안동시,

In [8]:
# Fix english names
english_names = """Nationwide, Seoul, Jongno-gu, Jung-gu, Yongsan-gu, Seongdong-gu, Gwangjin-gu, Dongdaemun-gu, Jungnang-gu, Seongbuk-gu, Gangbuk-gu, Dobong-gu, Nowon-gu, Eunpyeong-gu, Seodaemun-gu, Mapo-gu, Yangcheon-gu, Gangseo-gu, Guro-gu, Geumcheon-gu, Yeongdeungpo-gu, Dongjak-gu, Gwanak-gu, Seocho-gu, Gangnam-gu, Songpa-gu, Gangdong-gu, Busan, Seo-gu, Dong-gu,
Yeongdo-gu, Busanjin-gu, Dongrae-gu, Nam-gu, Buk-gu, Haeundae-gu, Saha-gu, Geumjeong-gu, Yeonje-gu, Suyeong-gu, Sasang-gu, Gijang-gun, Daegu Metropolitan City, Suseong-gu, Dalseo-gu, Dalseong-gun, Gunwi-gun, Incheon Metropolitan City, Jung-gu Yeongjong Branch Office, Jung-gu Yongyu Branch Office, Michuhol-gu, Yeonsu-gu, Namdong-gu, Bupyeong-gu, Gyeyang-gu, Seo-gu Geomdan Branch Office, Ganghwa-gun, Ongjin-gun, Gwangju Metropolitan City, Gwangsan-gu,
Daejeon Metropolitan City, Yuseong District, Daedeok District, Ulsan Metropolitan City, Ulju County, Sejong Special Self-Governing City, Sejong Special Self-Governing City, Gyeonggi Province, Northern Branch Office, Suwon City, Jangan District, Gwonseon District, Paldal District, Yeongtong District, Seongnam City, Sujeong District, Jungwon District, Bundang District, Uijeongbu City, Anyang City, Manan District, Dongan District, Bucheon City, Wonmi District, Sosa District, Ojeong District, Gwangmyeong City, Pyeongtaek City, Songtan Branch Office, Anjung Branch Office,
Dongducheon-si, Ansan-si, Sangnok-gu, Danwon-gu, Goyang-si, Deogyang-gu, Ilsandong-gu, Ilsanseo-gu, Gwacheon-si, Guri-si, Namyangju-si, Pungyang Branch Office, Osan-si, Siheung-si, Gunpo-si, Uiwang-si, Hanam-si, Yongin-si, Cheoin-gu, Giheung-gu, Suji-gu, Paju-si, Icheon-si, Anseong-si, Gimpo-si, Hwaseong-si, Hwaseong-si Dongbu Branch Office, Hwaseong-si Dongtan Branch Office, Gwangju-si, Yangju-si,
Pocheon-si, Yeoju-si, Yeoncheon-gun, Gapyeong-gun, Yangpyeong-gun, Gangwon-do Special Self-Governing Province, Donghae Branch Office, Chuncheon-si, Wonju-si, Gangneung-si, Donghae-si, Taebaek-si, Sokcho-si, Samcheok-si, Hongcheon-gun, Hoengseong-gun, Yeongwol-gun, Pyeongchang-gun, Jeongseon-gun, Cheorwon-gun, Hwacheon-gun, Yanggu-gun, Inje-gun, Goseong-gun, Yangyang-gun, Chungcheongbuk-do, Cheongju-si, Sangdang-gu, Seowon-gu, Heungdeok-gu,
Cheongwon-gu, Chungju-si, Jecheon-si, Boeun-gun, Okcheon-gun, Yeongdong-gun, Jeungpyeong-gun, Jincheon-gun, Goesan-gun, Eumseong-gun, Danyang-gun, South Chungcheong Province, Cheonan-si, Dongnam-gu, Seobuk-gu, Gongju-si, Boryeong-si, Asan-si, Seosan-si, Nonsan-si, Gyeryong-si, Dangjin-si, Geumsan-gun, Buyeo-gun, Seocheon-gun, Cheongyang-gun, Hongseong-gun, Yesan-gun, Taean-gun, North Jeolla Province,
Jeonju-si, Wansan-gu, Deokjin-gu, Gunsan-si, Iksan-si, Jeongeup-si, Namwon-si, Gimje-si, Wanju-gun, Jinan-gun, Muju-gun, Jangsu-gun, Imsil-gun, Sunchang-gun, Gochang-gun, Buan-gun, Jeollanam-do, Mokpo-si, Yeosu-si, Suncheon-si, Naju-si, Gwangyang-si, Damyang-gun, Gokseong-gun, Gurye-gun, Goheung-gun, Boseong-gun, Hwasun-gun, Jangheung-gun, Gangjin-gun,
Haenam County, Yeongam County, Muan County, Hampyeong County, Yeonggwang County, Jangseong County, Wando County, Jindo County, Sinan County, Gyeongsangbuk-do, Pohang City, Gyeongju City, Gimcheon City, Andong City, Gumi City, Yeongju City, Yeongcheon City, Sangju City, Mungyeong City, Gyeongsan City, Uiseong County, Cheongsong County, Yeongyang County, Yeongdeok County, Cheongdo County, Goryeong County, Seongju County, Chilgok County, Yecheon County, Bonghwa County,
Uljin County, Ulleung County, Gyeongsangnam-do, Changwon City (integrated), Uichang District, Seongsan District, Masan Happo District, Masan Hoewon District, Jinhae District, Jinju City, Tongyeong City, Sacheon City, Sacheon Namyang Branch Office, Gimhae City, Jangyu Branch Office, Miryang City, Geoje City, Yangsan City, Yangsan City Ungsang Branch Office, Uiryeong County, Haman County, Changnyeong County, Namhae County, Hadong County, Sancheong County, Hamyang County, Geochang County, Hapcheon County, Jeju Special Self-Governing Province, Jeju City,
Seogwipo City"""

# Line 3 needed some coercing in going translate to fix
english_names = [name.strip() for name in english_names.split(',')]

assert len(korean_names) == len(english_names), "Korean and English names lists must be of same length"

# Remove endings lise 'County', 'City', 'District' to standardize and for easier merging later
def simplify_name(name: str) -> str:
    name = name.removesuffix('County').split('City')[0].split('District')[0].removesuffix('Province').strip() \
        .removesuffix('Branch Office').removesuffix('(Integrated)').removesuffix('Metropolitan').removesuffix('Special Self-Governing') \
        .removesuffix('-gun').removesuffix('-gu').removesuffix('-si').strip()
    return name

cleaned_name_mapping = {k: simplify_name(v) for k, v in zip(korean_names, english_names)}

In [9]:
# Find the duplicate values
found = set()
for v in cleaned_name_mapping.values():
    if v not in found:
        found.add(v)
    else:
        print(f'Duplicate found: {v}')

Duplicate found: Sejong
Duplicate found: Gwangju
Duplicate found: Donghae
Duplicate found: Yangsan
Duplicate found: Jeju


In [10]:
# All the duplicates seem to be counties that got promoted to cities or subsets of a single place (special self-governing province, etc.), so we can just keep them
cleaned_name_mapping

{'전국': 'Nationwide',
 '서울특별시': 'Seoul',
 ' 종로구': 'Jongno',
 ' 중구': 'Jung',
 ' 용산구': 'Yongsan',
 ' 성동구': 'Seongdong',
 ' 광진구': 'Gwangjin',
 ' 동대문구': 'Dongdaemun',
 ' 중랑구': 'Jungnang',
 ' 성북구': 'Seongbuk',
 ' 강북구': 'Gangbuk',
 ' 도봉구': 'Dobong',
 ' 노원구': 'Nowon',
 ' 은평구': 'Eunpyeong',
 ' 서대문구': 'Seodaemun',
 ' 마포구': 'Mapo',
 ' 양천구': 'Yangcheon',
 ' 강서구': 'Gangseo',
 ' 구로구': 'Guro',
 ' 금천구': 'Geumcheon',
 ' 영등포구': 'Yeongdeungpo',
 ' 동작구': 'Dongjak',
 ' 관악구': 'Gwanak',
 ' 서초구': 'Seocho',
 ' 강남구': 'Gangnam',
 ' 송파구': 'Songpa',
 ' 강동구': 'Gangdong',
 '부산광역시': 'Busan',
 ' 서구': 'Seo',
 ' 동구': 'Dong',
 ' 영도구': 'Yeongdo',
 ' 부산진구': 'Busanjin',
 ' 동래구': 'Dongrae',
 ' 남구': 'Nam',
 ' 북구': 'Buk',
 ' 해운대구': 'Haeundae',
 ' 사하구': 'Saha',
 ' 금정구': 'Geumjeong',
 ' 연제구': 'Yeonje',
 ' 수영구': 'Suyeong',
 ' 사상구': 'Sasang',
 ' 기장군': 'Gijang',
 '대구광역시': 'Daegu',
 ' 수성구': 'Suseong',
 ' 달서구': 'Dalseo',
 ' 달성군': 'Dalseong',
 ' 군위군': 'Gunwi',
 '인천광역시': 'Incheon',
 ' 중구영종출장소': 'Jung-gu Yeongjong',
 ' 중구용유출장소': 'Jung-g

In [11]:
def clean_df(df: pd.DataFrame) -> pd.DataFrame:
    global district_locations

    df = df.copy()

    df.columns = ['del1', 'District', 'del2', 'Age', 'Date', 'Population', 'Population_Male', 'Population_Female']
    df.drop(columns=['del1', 'del2'], inplace=True)
    
    # Strip whitespace from District names
    df['District'] = df['District'].map(cleaned_name_mapping)

    # Convert Population columns to int
    df.Population = df.Population.astype(int)
    df.Population_Male = df.Population_Male.astype(int)
    df.Population_Female = df.Population_Female.astype(int)

    # Define age group mappings
    # These correspond to age groups: 0-5, 6-12, 13-18, 19-64, 65+ which are the age groups for fare calculation by Seoul Metro
    age_mapping = {
        '계': 'Total',  # Total
        **{f'{i}세': '0-5' for i in range(0, 6)},
        **{f'{i}세': '6-12' for i in range(6, 13)},
        **{f'{i}세': '13-18' for i in range(13, 19)},
        **{f'{i}세': '19-64' for i in range(19, 65)},
        **{f'{i}세': '65+' for i in range(65, 100)},
        '100세 이상': '65+'
    }
    
    df['AgeGroup'] = df['Age'].map(age_mapping)
    
    # Pivot the dataframe to convert Age into wide format
    df = df.pivot_table(
        index=['District', 'Date'],
        columns='AgeGroup',
        values=['Population', 'Population_Male', 'Population_Female'],
        aggfunc='sum'
    )
    
    # # Flatten multi-level columns
    df.columns = ['_'.join(col).strip() for col in df.columns.values]
    df.reset_index(inplace=True)
    
    # Fix date
    df['Date'] = pd.to_datetime(df['Date'], format='%Y%m')

    return df

cleaned_df = clean_df(combined_df)

cleaned_df

Unnamed: 0,District,Date,Population_0-5,Population_13-18,Population_19-64,Population_6-12,Population_65+,Population_Total,Population_Female_0-5,Population_Female_13-18,Population_Female_19-64,Population_Female_6-12,Population_Female_65+,Population_Female_Total,Population_Male_0-5,Population_Male_13-18,Population_Male_19-64,Population_Male_6-12,Population_Male_65+,Population_Male_Total
0,Andong,2008-01-01,8291,12383,105297,13233,28651,167855,3991,5457,51523,6278,17048,84297,4300,6926,53774,6955,11603,83558
1,Andong,2008-02-01,8295,12328,105082,13158,28873,167736,3981,5448,51431,6263,17157,84280,4314,6880,53651,6895,11716,83456
2,Andong,2008-03-01,8274,12313,104957,13103,29021,167668,3985,5433,51397,6244,17250,84309,4289,6880,53560,6859,11771,83359
3,Andong,2008-04-01,8236,12319,104791,13078,29116,167540,3961,5439,51307,6241,17321,84269,4275,6880,53484,6837,11795,83271
4,Andong,2008-05-01,8238,12337,104652,13032,29127,167386,3961,5452,51227,6212,17347,84199,4277,6885,53425,6820,11780,83187
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54834,Yuseong,2025-08-01,15229,22894,255046,25150,47997,366316,7301,11093,124123,12272,25899,180688,7928,11801,130923,12878,22098,185628
54835,Yuseong,2025-09-01,15193,22903,255116,25039,48265,366516,7295,11098,124094,12216,26074,180777,7898,11805,131022,12823,22191,185739
54836,Yuseong,2025-10-01,15186,22894,254806,24935,48534,366355,7294,11104,123968,12163,26205,180734,7892,11790,130838,12772,22329,185621
54837,Yuseong,2025-11-01,15206,22976,255117,24814,48836,366949,7281,11123,124146,12117,26347,181014,7925,11853,130971,12697,22489,185935


In [12]:
# Verify Districts
df_districts = set(cleaned_df['District'].unique())
df_districts

{'Andong',
 'Anjung',
 'Ansan',
 'Anseong',
 'Anyang',
 'Asan',
 'Boeun',
 'Bonghwa',
 'Boryeong',
 'Boseong',
 'Buan',
 'Bucheon',
 'Buk',
 'Bundang',
 'Bupyeong',
 'Busan',
 'Busanjin',
 'Buyeo',
 'Changnyeong',
 'Changwon',
 'Cheoin',
 'Cheonan',
 'Cheongdo',
 'Cheongju',
 'Cheongsong',
 'Cheongwon',
 'Cheongyang',
 'Cheorwon',
 'Chilgok',
 'Chuncheon',
 'Chungcheongbuk-do',
 'Chungju',
 'Daedeok',
 'Daegu',
 'Daejeon',
 'Dalseo',
 'Dalseong',
 'Damyang',
 'Dangjin',
 'Danwon',
 'Danyang',
 'Deogyang',
 'Deokjin',
 'Dobong',
 'Dong',
 'Dongan',
 'Dongdaemun',
 'Dongducheon',
 'Donghae',
 'Dongjak',
 'Dongnam',
 'Dongrae',
 'Eumseong',
 'Eunpyeong',
 'Gangbuk',
 'Gangdong',
 'Ganghwa',
 'Gangjin',
 'Gangnam',
 'Gangneung',
 'Gangseo',
 'Gangwon-do',
 'Gapyeong',
 'Geochang',
 'Geoje',
 'Geumcheon',
 'Geumjeong',
 'Geumsan',
 'Giheung',
 'Gijang',
 'Gimcheon',
 'Gimhae',
 'Gimje',
 'Gimpo',
 'Gochang',
 'Goesan',
 'Goheung',
 'Gokseong',
 'Gongju',
 'Goryeong',
 'Goseong',
 'Goyang',


In [13]:
# More stats after cleaning
display(cleaned_df.info())
display(cleaned_df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54839 entries, 0 to 54838
Data columns (total 20 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   District                 54839 non-null  object        
 1   Date                     54839 non-null  datetime64[ns]
 2   Population_0-5           54839 non-null  int64         
 3   Population_13-18         54839 non-null  int64         
 4   Population_19-64         54839 non-null  int64         
 5   Population_6-12          54839 non-null  int64         
 6   Population_65+           54839 non-null  int64         
 7   Population_Total         54839 non-null  int64         
 8   Population_Female_0-5    54839 non-null  int64         
 9   Population_Female_13-18  54839 non-null  int64         
 10  Population_Female_19-64  54839 non-null  int64         
 11  Population_Female_6-12   54839 non-null  int64         
 12  Population_Female_65+    54839 n

None

Unnamed: 0,Date,Population_0-5,Population_13-18,Population_19-64,Population_6-12,Population_65+,Population_Total,Population_Female_0-5,Population_Female_13-18,Population_Female_19-64,Population_Female_6-12,Population_Female_65+,Population_Female_Total,Population_Male_0-5,Population_Male_13-18,Population_Male_19-64,Population_Male_6-12,Population_Male_65+,Population_Male_Total
count,54839,54839.0,54839.0,54839.0,54839.0,54839.0,54839.0,54839.0,54839.0,54839.0,54839.0,54839.0,54839.0,54839.0,54839.0,54839.0,54839.0,54839.0,54839.0
mean,2017-02-03 03:03:37.618848,30131.37,43584.57,432064.3,43197.93,90650.29,639628.5,14642.3,20801.52,211755.4,20857.73,51997.39,320054.3,15489.07,22783.05,220308.9,22340.19,38652.9,319574.1
min,2008-01-01 00:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2012-08-01 00:00:00,2127.0,3439.0,34335.0,3122.5,15080.5,60096.0,1033.0,1609.0,15869.5,1510.0,8961.0,30038.0,1092.0,1832.0,18430.0,1615.0,6071.0,29921.5
50%,2017-03-01 00:00:00,8837.0,12949.0,137418.0,12812.0,26960.0,206780.0,4291.0,6171.0,66595.0,6190.0,15629.0,102164.0,4539.0,6738.0,70782.0,6613.0,11342.0,103881.0
75%,2021-09-01 00:00:00,19517.5,28176.5,274530.5,28219.5,50009.0,398204.0,9481.0,13525.5,136529.5,13631.0,28436.0,199288.5,10015.5,14689.0,138219.5,14568.0,21576.0,197716.5
max,2025-12-01 00:00:00,2821111.0,4242291.0,35370580.0,4511106.0,10840820.0,51851430.0,1369765.0,1992673.0,17318780.0,2148262.0,6006673.0,25990780.0,1452531.0,2250538.0,18051800.0,2362844.0,4834149.0,25868640.0
std,,161081.3,231074.1,2273562.0,228116.8,490449.1,3362038.0,78254.66,110124.9,1114938.0,110080.0,279751.2,1682786.0,82827.7,120965.2,1158689.0,118043.8,210879.0,1679298.0


In [14]:
# Save cleaned data
os.makedirs(POPULATION_CLEANED_DATA_DIR, exist_ok=True)

cleaned_df.to_csv(os.path.join(POPULATION_CLEANED_DATA_DIR, "population-by-age-groups.csv"), index=False)

print("Population data cleaned and saved!")

Population data cleaned and saved!


In [15]:
# Take subset of only seoul districts
seoul_districts = [
    "Seoul",
    "Jongno",
    "Jung",
    "Yongsan",
    "Seongdong",
    "Gwangjin",
    "Dongdaemun",
    "Jungnang",
    "Seongbuk",
    "Gangbuk",
    "Dobong",
    "Nowon",
    "Eunpyeong",
    "Seodaemun",
    "Mapo",
    "Yangcheon",
    "Gangseo",
    "Guro",
    "Geumcheon",
    "Yeongdeungpo",
    "Dongjak",
    "Gwanak",
    "Seocho",
    "Gangnam",
    "Songpa",
    "Gangdong",
]

seoul_df = cleaned_df[cleaned_df['District'].isin(seoul_districts)]

assert seoul_df['District'].nunique() == len(seoul_districts), "Some Seoul districts are missing!"

seoul_df.to_csv(os.path.join(POPULATION_CLEANED_DATA_DIR, "population-by-age-groups-seoul.csv"), index=False)

print("Seoul population data cleaned and saved!")

Seoul population data cleaned and saved!
