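Data source: KOSIS table `DT_13301_013` (orgId 133) — tax revenue by type of tax and district tax office, in million won. Retrieved from https://kosis.kr/statHtml/statHtml.do?sso=ok&returnurl=https%3A%2F%2Fkosis.kr%3A443%2FstatHtml%2FstatHtml.do%3Fconn_path%3DI2%26tblId%3DDT_13301_013%26language%3Den%26orgId%3D133%26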

In [3]:
import pandas as pd
import numpy as np
import os

# Root of the data tree; being a relative path, it is resolved against the
# kernel's current working directory.
DATA_DIR = '../../data'

# Raw stage: source files are read from here (tax files sit in the 'tax' subfolder).
RAW_DATA_DIR = os.path.join(DATA_DIR, 'raw')
TAX_RAW_DATA_DIR = os.path.join(DATA_DIR, 'raw', 'tax')

# Cleaned stage: outputs of this notebook are written here (again under 'tax').
CLEANED_DATA_DIR = os.path.join(DATA_DIR, 'cleaned')
TAX_CLEANED_DATA_DIR = os.path.join(DATA_DIR, 'cleaned', 'tax')

In [5]:
# Location of the raw export inside the raw tax folder; the file is decoded as EUC-KR.
raw_tax_csv_path = os.path.join(TAX_RAW_DATA_DIR, '133_DT_13301_013_20260115162139.csv')
tax_df = pd.read_csv(raw_tax_csv_path, encoding='euc-kr')

# Show the column index first, then the frame itself.
display(tax_df.columns, tax_df)

Index(['By type of tax', 'By district tax office', 'Item', 'UNIT', '2002 Year',
       '2003 Year', '2004 Year', '2005 Year', '2006 Year', '2007 Year',
       '2008 Year', '2009 Year', '2010 Year', '2011 Year', '2012 Year',
       '2013 Year', '2014 Year', '2015 Year', '2016 Year', '2017 Year',
       '2018 Year', '2019 Year', '2020 Year', '2021 Year', '2022 Year',
       '2023 Year', '2024 Year', 'Unnamed: 27'],
      dtype='object')

Unnamed: 0,By type of tax,By district tax office,Item,UNIT,2002 Year,2003 Year,2004 Year,2005 Year,2006 Year,2007 Year,...,2016 Year,2017 Year,2018 Year,2019 Year,2020 Year,2021 Year,2022 Year,2023 Year,2024 Year,Unnamed: 27
0,Total,Total,Tax Revenue[In million won],In million won,,,,93944230,130260883,153062775,...,233329122,255593190,283535518,284412645,277275291,334471443,384249472,335672348,328389625,
1,Total,Seoul Rto,Tax Revenue[In million won],In million won,37082078,39652254,38768431,43003723,49234337,59619400,...,69964067,76963042,86937600,90880285,95017952,115649580,134239085,120700049,115385296,
2,Total,Jongno,Tax Revenue[In million won],In million won,3253060,3842828,3890928,2993144,3189243,3683704,...,5274107,5659321,5817711,6470634,6254894,8380858,7646290,6697593,5903139,
3,Total,Jungbu,Tax Revenue[In million won],In million won,2444410,2519571,2007063,1971823,3451423,3464565,...,2530467,2860813,1928312,1851805,1416451,1841790,1992693,1642453,1558253,
4,Total,Namdaemun,Tax Revenue[In million won],In million won,5578244,5070271,5031381,7031400,8315783,10563742,...,10176566,11591400,13928708,13720566,14788765,18231214,20130223,20475908,18136318,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6890,Comprehensive Real Estate Holding Tax,Tongyeong,Tax Revenue[In million won],In million won,,,,115,691,1156,...,4140,5709,4113,4374,4992,8599,10107,7062,5430,
6891,Comprehensive Real Estate Holding Tax,Jinju,Tax Revenue[In million won],In million won,,,,417,1439,3201,...,20571,25998,26087,29516,43755,67518,75838,66220,58685,
6892,Comprehensive Real Estate Holding Tax,Jeju,Tax Revenue[In million won],In million won,,,,2095,7247,11264,...,12963,21274,24958,38721,42386,70537,85163,74741,66359,
6893,Comprehensive Real Estate Holding Tax,Tax On Lmport,Tax Revenue[In million won],In million won,,,,-,-,-,...,-,-,,,,,,-,-,


In [24]:
# Start the cleaning pipeline from a copy so tax_df stays exactly as loaded,
# and discard the 'Unnamed: 27' column, which is not needed downstream.
cleaned_tax_df = tax_df.copy().drop(columns=['Unnamed: 27'])

In [15]:
# List the distinct values of UNIT and Item to see whether either column
# carries any per-row information worth keeping.
for candidate_column in ('UNIT', 'Item'):
    display(cleaned_tax_df[candidate_column].unique())

array(['In million won'], dtype=object)

array(['Tax Revenue[In million won]'], dtype=object)

In [25]:
# UNIT and Item each hold a single distinct value, so they add nothing per row.
# Drop them and shorten the two descriptive headers in one chained pass.
cleaned_tax_df = (
    cleaned_tax_df
    .drop(columns=['UNIT', 'Item'])
    .rename(columns={'By type of tax': 'Type', 'By district tax office': 'District'})
)

In [26]:
# Reshape wide -> long: every column other than Type/District (the per-year
# columns) is unpivoted into a (Year, Revenue) pair on its own row.
# Revenue is expressed in units of 1 million won.
cleaned_tax_df = pd.melt(
    cleaned_tax_df,
    id_vars=['Type', 'District'],
    var_name='Year',
    value_name='Revenue',
)

cleaned_tax_df

Unnamed: 0,Type,District,Year,Revenue
0,Total,Total,2002 Year,
1,Total,Seoul Rto,2002 Year,37082078
2,Total,Jongno,2002 Year,3253060
3,Total,Jungbu,2002 Year,2444410
4,Total,Namdaemun,2002 Year,5578244
...,...,...,...,...
158580,Comprehensive Real Estate Holding Tax,Tongyeong,2024 Year,5430
158581,Comprehensive Real Estate Holding Tax,Jinju,2024 Year,58685
158582,Comprehensive Real Estate Holding Tax,Jeju,2024 Year,66359
158583,Comprehensive Real Estate Holding Tax,Tax On Lmport,2024 Year,-


In [27]:
# Convert the 'YYYY Year' labels carried over from the melted column headers
# into integer years.
# Casting to str first and extracting the leading four digits keeps this cell
# safe to re-run (Year may already be int from a previous execution) and avoids
# depending on int() tolerating the trailing space that stripping the 'Year'
# suffix would leave behind.
# A label that does not start with exactly four digits yields NaN, which makes
# astype(int) raise, so malformed headers fail loudly instead of being mis-parsed.
cleaned_tax_df['Year'] = (
    cleaned_tax_df['Year']
    .astype(str)
    .str.extract(r'^\s*(\d{4})(?!\d)', expand=False)
    .astype(int)
)

In [29]:
# '-' and '**' are the non-numeric placeholders this cell treats as missing:
# turn both into NaN, then parse what remains as numbers.
revenue_text = cleaned_tax_df['Revenue']
revenue_without_placeholders = revenue_text.replace(['-', '**'], np.nan)
cleaned_tax_df['Revenue'] = pd.to_numeric(revenue_without_placeholders)

In [30]:
# Per-column count of missing values, followed by a preview of the frame itself.
missing_per_column = cleaned_tax_df.isna().sum()
display(missing_per_column)
cleaned_tax_df

Type            0
District        0
Year            0
Revenue     77382
dtype: int64

Unnamed: 0,Type,District,Year,Revenue
0,Total,Total,2002,
1,Total,Seoul Rto,2002,37082078.0
2,Total,Jongno,2002,3253060.0
3,Total,Jungbu,2002,2444410.0
4,Total,Namdaemun,2002,5578244.0
...,...,...,...,...
158580,Comprehensive Real Estate Holding Tax,Tongyeong,2024,5430.0
158581,Comprehensive Real Estate Holding Tax,Jinju,2024,58685.0
158582,Comprehensive Real Estate Holding Tax,Jeju,2024,66359.0
158583,Comprehensive Real Estate Holding Tax,Tax On Lmport,2024,


In [31]:
# Write the cleaned frame to 'tax-data.csv' in the cleaned tax folder,
# creating that folder first if it does not exist; the index is not written.
cleaned_tax_csv_path = os.path.join(TAX_CLEANED_DATA_DIR, 'tax-data.csv')
os.makedirs(TAX_CLEANED_DATA_DIR, exist_ok=True)
cleaned_tax_df.to_csv(cleaned_tax_csv_path, index=False)