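Data source: KOSIS (Korean Statistical Information Service), table `DT_PLCAHTUSE` (orgId 408) — monthly land price change rates [%] by region and use district. Downloaded from https://kosis.kr/statHtml/statHtml.do?sso=ok&returnurl=https%3A%2F%2Fkosis.kr%3A443%2FstatHtml%2FstatHtml.do%3Fconn_path%3DI2%26tblId%3DDT_PLCAHTUSE%26language%3Den%26orgId%3D408%26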

In [2]:
import pandas as pd
import numpy as np
import os

# Directory layout used by this notebook: <data root>/{raw,cleaned}/land-price
DATA_DIR = '../../data'

# Raw inputs and the land-price dataset inside them
RAW_DATA_DIR = os.path.join(DATA_DIR, 'raw')
LAND_PRICE_RAW_DATA_DIR = os.path.join(DATA_DIR, 'raw', 'land-price')

# Cleaned outputs and the land-price dataset inside them
CLEANED_DATA_DIR = os.path.join(DATA_DIR, 'cleaned')
LAND_PRICE_CLEANED_DATA_DIR = os.path.join(DATA_DIR, 'cleaned', 'land-price')

In [None]:
# Read the raw CSV export, then show its column index followed by the frame itself.
land_price_df = pd.read_csv(
    os.path.join(
        LAND_PRICE_RAW_DATA_DIR,
        '408_DT_PLCAHTUSE_20260115142347.csv',
    )
)
display(land_price_df.columns, land_price_df)

Index(['By region', 'By classification', 'PERIOD',
       'Land Price Change Rates[%]', 'Unnamed: 4'],
      dtype='object')

Unnamed: 0,By region,By classification,PERIOD,Land Price Change Rates[%],Unnamed: 4
0,Seoul,Use district_Dwelling District,2005.01 Month,.173,
1,Seoul,Use district_Dwelling District,2005.02 Month,.117,
2,Seoul,Use district_Dwelling District,2005.03 Month,.375,
3,Seoul,Use district_Dwelling District,2005.04 Month,.672,
4,Seoul,Use district_Dwelling District,2005.05 Month,.549,
...,...,...,...,...,...
93600,Ongjin-gun,Use district_Semi-Agriculture District,2015.02 Month,-,
93601,Ongjin-gun,Use district_Semi-Agriculture District,2015.03 Month,-,
93602,Ongjin-gun,Use district_Semi-Agriculture District,2015.04 Month,-,
93603,Ongjin-gun,Use district_Semi-Agriculture District,2015.05 Month,-,


In [None]:
# Build the cleaned frame in one chain; land_price_df itself is left untouched.
land_price_cleaned_df = (
    land_price_df
    # Discard the fifth column (the empty 'Unnamed: 4' one in the raw export)
    .drop(columns=[land_price_df.columns[4]])
    # Give the four remaining columns short names
    .set_axis(['District', 'Land_Type', 'Date', 'Price_Change'], axis=1)
    .assign(
        # '2005.01 Month' -> Timestamp('2005-01-01')
        Date=lambda frame: pd.to_datetime(
            frame['Date'].str.removesuffix('Month').str.strip(),
            format='%Y.%m',
        ),
        # Strip the administrative '-gu' / '-gun' suffix from district names
        District=lambda frame: (
            frame['District']
            .str.removesuffix('-gu')
            .str.removesuffix('-gun')
        ),
        # 'Use district_Dwelling District' -> 'Dwelling'
        Land_Type=lambda frame: (
            frame['Land_Type']
            .str.removeprefix('Use district_')
            .str.removesuffix('District')
            .str.strip()
        ),
        # '-' marks a missing value: turn it into NaN and parse the rest as float
        Price_Change=lambda frame: pd.to_numeric(
            frame['Price_Change'].replace('-', np.nan),
            errors='coerce',
        ),
    )
)

land_price_cleaned_df

Unnamed: 0,District,Land_Type,Date,Price_Change
0,Seoul,Dwelling,2005-01-01,0.173
1,Seoul,Dwelling,2005-02-01,0.117
2,Seoul,Dwelling,2005-03-01,0.375
3,Seoul,Dwelling,2005-04-01,0.672
4,Seoul,Dwelling,2005-05-01,0.549
...,...,...,...,...
93600,Ongjin,Semi-Agriculture,2015-02-01,
93601,Ongjin,Semi-Agriculture,2015-03-01,
93602,Ongjin,Semi-Agriculture,2015-04-01,
93603,Ongjin,Semi-Agriculture,2015-05-01,


In [None]:
# Persist the cleaned table, creating the output folder if it does not exist yet.
os.makedirs(LAND_PRICE_CLEANED_DATA_DIR, exist_ok=True)
land_price_cleaned_df.to_csv(
    os.path.join(LAND_PRICE_CLEANED_DATA_DIR, 'land-price.csv'),
    index=False,
)