In [26]:
# Combine all the cleaned data into a single dataset for training and evaluation of models

import os
import pandas as pd
import numpy as np

DATA_DIR = "../../data/cleaned"

STATION_PASSENGER_COUNT_DATA_DIR = os.path.join(DATA_DIR, "station-passenger-count")
TAX_DATA_DIR = os.path.join(DATA_DIR, "tax")
STATION_LOCATION_DATA_DIR = os.path.join(DATA_DIR, "station-location")
LAND_PRICE_CHANGE_DATA_DIR = os.path.join(DATA_DIR, "land-price")
DISTRICT_LOCATION_DATA_DIR = os.path.join(DATA_DIR, "district-location")
POPULATION_DATA_DIR = os.path.join(DATA_DIR, "population")

In [27]:
# Start by loading the station passenger count data (label)
station_passenger_count_df = pd.read_csv(os.path.join(STATION_PASSENGER_COUNT_DATA_DIR, "passenger-count-monthly-2008-2024.csv"))
station_passenger_count_df

Unnamed: 0,Date,Station Number,Boarding,Line,Total
0,2008-01-31,150,False,1,1503741
1,2008-02-29,150,False,1,1394346
2,2008-03-31,150,False,1,1518669
3,2008-04-30,150,False,1,1495350
4,2008-05-31,150,False,1,1565671
...,...,...,...,...,...
111275,2024-08-31,828,True,8,197693
111276,2024-09-30,828,True,8,196702
111277,2024-10-31,828,True,8,228296
111278,2024-11-30,828,True,8,233108


In [45]:
# Make year and month columns for merging later
station_passenger_count_df['Date'] = pd.to_datetime(station_passenger_count_df['Date'])
station_passenger_count_df['Year'] = station_passenger_count_df['Date'].dt.year
station_passenger_count_df['Month'] = station_passenger_count_df['Date'].dt.month

station_passenger_count_df

Unnamed: 0,Date,Station Number,Boarding,Line,Total,Year,Month
0,2008-01-31,150,False,1,1503741,2008,1
1,2008-02-29,150,False,1,1394346,2008,2
2,2008-03-31,150,False,1,1518669,2008,3
3,2008-04-30,150,False,1,1495350,2008,4
4,2008-05-31,150,False,1,1565671,2008,5
...,...,...,...,...,...,...,...
111275,2024-08-31,828,True,8,197693,2024,8
111276,2024-09-30,828,True,8,196702,2024,9
111277,2024-10-31,828,True,8,228296,2024,10
111278,2024-11-30,828,True,8,233108,2024,11


In [46]:
# Load the station location data
station_location_df = pd.read_csv(os.path.join(STATION_LOCATION_DATA_DIR, "station-locations.csv"))
station_location_df

Unnamed: 0,line,name,lat,lng,no
0,01호선,소요산,37.948100,127.061034,100
1,01호선,동두천,37.927878,127.054790,101
2,01호선,보산,37.913702,127.057277,102
3,01호선,동두천중앙,37.901885,127.056482,103
4,01호선,지행,37.892334,127.055716,104
...,...,...,...,...,...
704,경강선,신둔도예촌,37.317185,127.404760,K416
705,경강선,이천,37.265579,127.442260,K417
706,경강선,부발,37.260192,127.490277,K418
707,경강선,세종대왕릉,37.295309,127.570938,K419


In [47]:
# Try to merge the two datasets on station number
# First we need to convert no in station_location_df to be an int
station_location_df['no'] = pd.to_numeric(station_location_df['no'], errors='coerce')

station_passenger_count_location_df = pd.merge(station_passenger_count_df, station_location_df, left_on='Station Number', right_on='no', how='left')

# Check how many missing values there are in the merged dataframe
print(station_passenger_count_location_df.count())
display(station_passenger_count_location_df.isnull().sum())
station_passenger_count_location_df

Date              111280
Station Number    111280
Boarding          111280
Line              111280
Total             111280
Year              111280
Month             111280
line              105692
name              105692
lat               105692
lng               105692
no                105692
dtype: int64


Date                 0
Station Number       0
Boarding             0
Line                 0
Total                0
Year                 0
Month                0
line              5588
name              5588
lat               5588
lng               5588
no                5588
dtype: int64

Unnamed: 0,Date,Station Number,Boarding,Line,Total,Year,Month,line,name,lat,lng,no
0,2008-01-31,150,False,1,1503741,2008,1,01호선,송내,37.4876,126.753664,150.0
1,2008-02-29,150,False,1,1394346,2008,2,01호선,송내,37.4876,126.753664,150.0
2,2008-03-31,150,False,1,1518669,2008,3,01호선,송내,37.4876,126.753664,150.0
3,2008-04-30,150,False,1,1495350,2008,4,01호선,송내,37.4876,126.753664,150.0
4,2008-05-31,150,False,1,1565671,2008,5,01호선,송내,37.4876,126.753664,150.0
...,...,...,...,...,...,...,...,...,...,...,...,...
111275,2024-08-31,828,True,8,197693,2024,8,,,,,
111276,2024-09-30,828,True,8,196702,2024,9,,,,,
111277,2024-10-31,828,True,8,228296,2024,10,,,,,
111278,2024-11-30,828,True,8,233108,2024,11,,,,,


In [48]:
# Most of the data has locations so we will drop rows with missing location data and remove extra columns
station_passenger_count_location_df.dropna(inplace=True)
station_passenger_count_location_df.drop(columns=['no', 'name', 'line'], inplace=True)

station_passenger_count_location_df

Unnamed: 0,Date,Station Number,Boarding,Line,Total,Year,Month,lat,lng
0,2008-01-31,150,False,1,1503741,2008,1,37.48760,126.753664
1,2008-02-29,150,False,1,1394346,2008,2,37.48760,126.753664
2,2008-03-31,150,False,1,1518669,2008,3,37.48760,126.753664
3,2008-04-30,150,False,1,1495350,2008,4,37.48760,126.753664
4,2008-05-31,150,False,1,1565671,2008,5,37.48760,126.753664
...,...,...,...,...,...,...,...,...,...
110793,2024-08-31,826,True,8,161087,2024,8,37.43213,127.129087
110794,2024-09-30,826,True,8,155338,2024,9,37.43213,127.129087
110795,2024-10-31,826,True,8,171370,2024,10,37.43213,127.129087
110796,2024-11-30,826,True,8,172851,2024,11,37.43213,127.129087


In [49]:
import geopandas as gpd

# Add district information based on lat/lng
district_location_gdf = gpd.read_file(os.path.join(DISTRICT_LOCATION_DATA_DIR, "district-location.geojson"))
district_location_gdf

Unnamed: 0,GID_2,GID_0,COUNTRY,GID_1,NAME_1,NL_NAME_1,NAME_2,VARNAME_2,NL_NAME_2,TYPE_2,ENGTYPE_2,CC_2,HASC_2,geometry
0,KOR.1.1_2,KOR,SouthKorea,KOR.1_1,Busan,부산광역시|釜山廣域市,Buk,,,Gu,District,,,"MULTIPOLYGON (((129.0256 35.1917, 129.0211 35...."
1,KOR.1.2_2,KOR,SouthKorea,KOR.1_1,Busan,부산광역시|釜山廣域市,Busanjin,,,Gu,District,,,"MULTIPOLYGON (((129.0332 35.1371, 129.0256 35...."
2,KOR.1.3_2,KOR,SouthKorea,KOR.1_1,Busan,부산광역시|釜山廣域市,Dong,,,Gu,District,,,"MULTIPOLYGON (((129.0365 35.1129, 129.0323 35...."
3,KOR.1.4_2,KOR,SouthKorea,KOR.1_1,Busan,부산광역시|釜山廣域市,Dongnae,,,Gu,District,,,"MULTIPOLYGON (((129.116 35.2011, 129.1143 35.1..."
4,KOR.1.5_2,KOR,SouthKorea,KOR.1_1,Busan,부산광역시|釜山廣域市,Gangseo,,,Gu,District,,,"MULTIPOLYGON (((128.7742 35.0148, 128.7763 35...."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
224,KOR.17.1_2,KOR,SouthKorea,KOR.17_1,Ulsan,울산광역시|蔚山廣域市,Buk,,,Gu,District,,,"MULTIPOLYGON (((129.4133 35.5547, 129.4076 35...."
225,KOR.17.2_2,KOR,SouthKorea,KOR.17_1,Ulsan,울산광역시|蔚山廣域市,Dong,,,Gu,District,,,"MULTIPOLYGON (((129.3998 35.4666, 129.4065 35...."
226,KOR.17.3_2,KOR,SouthKorea,KOR.17_1,Ulsan,울산광역시|蔚山廣域市,Jung,,,Gu,District,,,"MULTIPOLYGON (((129.3515 35.5594, 129.3493 35...."
227,KOR.17.4_2,KOR,SouthKorea,KOR.17_1,Ulsan,울산광역시|蔚山廣域市,Nam,,,Gu,District,,,"MULTIPOLYGON (((129.3665 35.5424, 129.3694 35...."


In [50]:
station_passenger_count_location_gdf = gpd.GeoDataFrame(
    station_passenger_count_location_df,
    geometry=gpd.points_from_xy(station_passenger_count_location_df.lng, station_passenger_count_location_df.lat),
    crs="EPSG:4326"
)

district_location_gdf = district_location_gdf.to_crs("EPSG:4326")

station_passenger_count_location_district_gdf = gpd.sjoin(
    station_passenger_count_location_gdf,
    district_location_gdf[["NAME_1", "NAME_2", "geometry"]],
    how="left",
    predicate="within"
)

station_passenger_count_location_district_gdf

Unnamed: 0,Date,Station Number,Boarding,Line,Total,Year,Month,lat,lng,geometry,index_right,NAME_1,NAME_2
0,2008-01-31,150,False,1,1503741,2008,1,37.48760,126.753664,POINT (126.75366 37.4876),81.0,Gyeonggi-do,Bucheon
1,2008-02-29,150,False,1,1394346,2008,2,37.48760,126.753664,POINT (126.75366 37.4876),81.0,Gyeonggi-do,Bucheon
2,2008-03-31,150,False,1,1518669,2008,3,37.48760,126.753664,POINT (126.75366 37.4876),81.0,Gyeonggi-do,Bucheon
3,2008-04-30,150,False,1,1495350,2008,4,37.48760,126.753664,POINT (126.75366 37.4876),81.0,Gyeonggi-do,Bucheon
4,2008-05-31,150,False,1,1565671,2008,5,37.48760,126.753664,POINT (126.75366 37.4876),81.0,Gyeonggi-do,Bucheon
...,...,...,...,...,...,...,...,...,...,...,...,...,...
110793,2024-08-31,826,True,8,161087,2024,8,37.43213,127.129087,POINT (127.12909 37.43213),99.0,Gyeonggi-do,Seongnam
110794,2024-09-30,826,True,8,155338,2024,9,37.43213,127.129087,POINT (127.12909 37.43213),99.0,Gyeonggi-do,Seongnam
110795,2024-10-31,826,True,8,171370,2024,10,37.43213,127.129087,POINT (127.12909 37.43213),99.0,Gyeonggi-do,Seongnam
110796,2024-11-30,826,True,8,172851,2024,11,37.43213,127.129087,POINT (127.12909 37.43213),99.0,Gyeonggi-do,Seongnam


In [51]:
station_passenger_count_location_district_gdf.isnull().sum()

Date                 0
Station Number       0
Boarding             0
Line                 0
Total                0
Year                 0
Month                0
lat                  0
lng                  0
geometry             0
index_right       2040
NAME_1            2040
NAME_2            2040
dtype: int64

In [52]:
# Drop rows with missing district information and remove extra columns
station_passenger_count_location_district_gdf.dropna(inplace=True)
station_passenger_count_location_district_gdf.drop(columns=['index_right', 'geometry'], inplace=True)
station_passenger_count_location_district_gdf.rename(columns={"NAME_1": "Province", "NAME_2": "District"}, inplace=True)

station_passenger_count_location_district_df = station_passenger_count_location_district_gdf
station_passenger_count_location_district_df

Unnamed: 0,Date,Station Number,Boarding,Line,Total,Year,Month,lat,lng,Province,District
0,2008-01-31,150,False,1,1503741,2008,1,37.48760,126.753664,Gyeonggi-do,Bucheon
1,2008-02-29,150,False,1,1394346,2008,2,37.48760,126.753664,Gyeonggi-do,Bucheon
2,2008-03-31,150,False,1,1518669,2008,3,37.48760,126.753664,Gyeonggi-do,Bucheon
3,2008-04-30,150,False,1,1495350,2008,4,37.48760,126.753664,Gyeonggi-do,Bucheon
4,2008-05-31,150,False,1,1565671,2008,5,37.48760,126.753664,Gyeonggi-do,Bucheon
...,...,...,...,...,...,...,...,...,...,...,...
110793,2024-08-31,826,True,8,161087,2024,8,37.43213,127.129087,Gyeonggi-do,Seongnam
110794,2024-09-30,826,True,8,155338,2024,9,37.43213,127.129087,Gyeonggi-do,Seongnam
110795,2024-10-31,826,True,8,171370,2024,10,37.43213,127.129087,Gyeonggi-do,Seongnam
110796,2024-11-30,826,True,8,172851,2024,11,37.43213,127.129087,Gyeonggi-do,Seongnam


In [53]:
# Add Tax Data
tax_df = pd.read_csv(os.path.join(TAX_DATA_DIR, "tax-data.csv"))
tax_df

Unnamed: 0,Type,District,Year,Revenue
0,Total,Total,2002,
1,Total,Seoul Rto,2002,37082078.0
2,Total,Jongno,2002,3253060.0
3,Total,Jungbu,2002,2444410.0
4,Total,Namdaemun,2002,5578244.0
...,...,...,...,...
158580,Comprehensive Real Estate Holding Tax,Tongyeong,2024,5430.0
158581,Comprehensive Real Estate Holding Tax,Jinju,2024,58685.0
158582,Comprehensive Real Estate Holding Tax,Jeju,2024,66359.0
158583,Comprehensive Real Estate Holding Tax,Tax On Lmport,2024,


In [54]:
# Convert to wide format based on Type column
tax_wide_df = tax_df.pivot_table(index=['District', 'Year'], columns='Type', values='Revenue', aggfunc='sum').reset_index()
display(tax_wide_df.info())
tax_wide_df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3358 entries, 0 to 3357
Data columns (total 46 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   District                                   3358 non-null   object 
 1   Year                                       3358 non-null   int64  
 2   Asset Revaluation Tax                      3358 non-null   float64
 3   Business Incometax                         3358 non-null   float64
 4   Capital Gains Tax                          3358 non-null   float64
 5   Child Tax Credit                           3358 non-null   float64
 6   Comprehensive Real Estate Holding Tax      3358 non-null   float64
 7   Corporation Tax                            3358 non-null   float64
 8   Defense Tax                                3358 non-null   float64
 9   Direct Tax                                 2668 non-null   float64
 10  Dividend Income Tax     

None

Type,District,Year,Asset Revaluation Tax,Business Incometax,Capital Gains Tax,Child Tax Credit,Comprehensive Real Estate Holding Tax,Corporation Tax,Defense Tax,Direct Tax,...,Securities Transaction Tax,Selective Excise Tax,Special Tax For Rural Development,Stamp Tax,Taxpayers Association,Telephone Tax,Total,Transportation Tax,Transportation¡¤Energy ¡¤ Environment Tax,Value Added Tax
0,Andong,2002,43.0,0.0,0.0,0.0,0.0,11097.0,1.0,56170.0,...,121.0,200.0,595.0,2.0,0.0,0.0,97490.0,0.0,0.0,34024.0
1,Andong,2003,0.0,0.0,0.0,0.0,0.0,12858.0,0.0,52459.0,...,116.0,136.0,427.0,1.0,0.0,0.0,104370.0,0.0,0.0,45750.0
2,Andong,2004,0.0,0.0,0.0,0.0,0.0,18840.0,0.0,68744.0,...,109.0,207.0,515.0,0.0,0.0,0.0,131360.0,49.0,0.0,55744.0
3,Andong,2005,0.0,0.0,0.0,0.0,91.0,18291.0,0.0,66847.0,...,175.0,91.0,661.0,1.0,0.0,0.0,118999.0,1.0,0.0,44751.0
4,Andong,2006,0.0,0.0,0.0,0.0,567.0,21563.0,0.0,79520.0,...,210.0,235.0,840.0,0.0,0.0,0.0,131333.0,23.0,0.0,43727.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3353,Yongsan,2020,0.0,24539.0,1002829.0,-1080.0,142051.0,746891.0,1.0,0.0,...,14597.0,960.0,35826.0,1883.0,19227.0,0.0,5652493.0,0.0,0.0,761193.0
3354,Yongsan,2021,0.0,32173.0,1136402.0,-951.0,217155.0,737690.0,0.0,0.0,...,14024.0,863.0,65334.0,941.0,22100.0,0.0,8236918.0,0.0,0.0,771902.0
3355,Yongsan,2022,0.0,38371.0,1202413.0,-758.0,212684.0,917213.0,0.0,0.0,...,10048.0,1416.0,58867.0,838.0,13654.0,0.0,9690786.0,0.0,0.0,802817.0
3356,Yongsan,2023,0.0,43506.0,1250789.0,-779.0,138853.0,861437.0,0.0,0.0,...,20417.0,2022.0,53775.0,953.0,11053.0,0.0,9190764.0,0.0,0.0,799314.0


In [55]:
# Combine with main dataframe
station_passenger_count_location_district_tax_df = pd.merge(station_passenger_count_location_district_df, tax_wide_df, left_on=['Year', 'District'], right_on=['Year', 'District'], how='left')

display(station_passenger_count_location_district_tax_df.info())
station_passenger_count_location_district_tax_df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104468 entries, 0 to 104467
Data columns (total 55 columns):
 #   Column                                     Non-Null Count   Dtype         
---  ------                                     --------------   -----         
 0   Date                                       104468 non-null  datetime64[ns]
 1   Station Number                             104468 non-null  int64         
 2   Boarding                                   104468 non-null  bool          
 3   Line                                       104468 non-null  int64         
 4   Total_x                                    104468 non-null  int64         
 5   Year                                       104468 non-null  int32         
 6   Month                                      104468 non-null  int32         
 7   lat                                        104468 non-null  float64       
 8   lng                                        104468 non-null  float64       
 9   Prov

None

Unnamed: 0,Date,Station Number,Boarding,Line,Total_x,Year,Month,lat,lng,Province,...,Securities Transaction Tax,Selective Excise Tax,Special Tax For Rural Development,Stamp Tax,Taxpayers Association,Telephone Tax,Total_y,Transportation Tax,Transportation¡¤Energy ¡¤ Environment Tax,Value Added Tax
0,2008-01-31,150,False,1,1503741,2008,1,37.48760,126.753664,Gyeonggi-do,...,1488.0,2194.0,4270.0,5.0,0.0,1.0,764565.0,0.0,-27.0,255021.0
1,2008-02-29,150,False,1,1394346,2008,2,37.48760,126.753664,Gyeonggi-do,...,1488.0,2194.0,4270.0,5.0,0.0,1.0,764565.0,0.0,-27.0,255021.0
2,2008-03-31,150,False,1,1518669,2008,3,37.48760,126.753664,Gyeonggi-do,...,1488.0,2194.0,4270.0,5.0,0.0,1.0,764565.0,0.0,-27.0,255021.0
3,2008-04-30,150,False,1,1495350,2008,4,37.48760,126.753664,Gyeonggi-do,...,1488.0,2194.0,4270.0,5.0,0.0,1.0,764565.0,0.0,-27.0,255021.0
4,2008-05-31,150,False,1,1565671,2008,5,37.48760,126.753664,Gyeonggi-do,...,1488.0,2194.0,4270.0,5.0,0.0,1.0,764565.0,0.0,-27.0,255021.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104463,2024-08-31,826,True,8,161087,2024,8,37.43213,127.129087,Gyeonggi-do,...,802.0,245.0,14337.0,70.0,0.0,0.0,1781928.0,0.0,-19.0,529404.0
104464,2024-09-30,826,True,8,155338,2024,9,37.43213,127.129087,Gyeonggi-do,...,802.0,245.0,14337.0,70.0,0.0,0.0,1781928.0,0.0,-19.0,529404.0
104465,2024-10-31,826,True,8,171370,2024,10,37.43213,127.129087,Gyeonggi-do,...,802.0,245.0,14337.0,70.0,0.0,0.0,1781928.0,0.0,-19.0,529404.0
104466,2024-11-30,826,True,8,172851,2024,11,37.43213,127.129087,Gyeonggi-do,...,802.0,245.0,14337.0,70.0,0.0,0.0,1781928.0,0.0,-19.0,529404.0


In [56]:
station_passenger_count_location_district_tax_df.isnull().sum()

Date                                             0
Station Number                                   0
Boarding                                         0
Line                                             0
Total_x                                          0
Year                                             0
Month                                            0
lat                                              0
lng                                              0
Province                                         0
District                                         0
Asset Revaluation Tax                        19992
Business Incometax                           19992
Capital Gains Tax                            19992
Child Tax Credit                             19992
Comprehensive Real Estate Holding Tax        19992
Corporation Tax                              19992
Defense Tax                                  19992
Direct Tax                                   30666
Dividend Income Tax            

In [57]:
# For right now, we will fill missing tax data with 0
station_passenger_count_location_district_tax_df.fillna(0, inplace=True)

In [58]:
# Fix column names
station_passenger_count_location_district_tax_df.rename(columns={'Total_x': 'Passengers', 'Total_y': 'Total_Tax'}, inplace=True)

In [60]:
# Add population data
population_df = pd.read_csv(os.path.join(POPULATION_DATA_DIR, "population-by-age-groups.csv"), index_col=0)
population_df

Unnamed: 0,District,Date,Population_0-5,Population_13-18,Population_19-64,Population_6-12,Population_65+,Population_Total,Population_Female_0-5,Population_Female_13-18,Population_Female_19-64,Population_Female_6-12,Population_Female_65+,Population_Female_Total,Population_Male_0-5,Population_Male_13-18,Population_Male_19-64,Population_Male_6-12,Population_Male_65+,Population_Male_Total
0,Andong,2008-01-01,8291,12383,105297,13233,28651,167855,3991,5457,51523,6278,17048,84297,4300,6926,53774,6955,11603,83558
1,Andong,2008-02-01,8295,12328,105082,13158,28873,167736,3981,5448,51431,6263,17157,84280,4314,6880,53651,6895,11716,83456
2,Andong,2008-03-01,8274,12313,104957,13103,29021,167668,3985,5433,51397,6244,17250,84309,4289,6880,53560,6859,11771,83359
3,Andong,2008-04-01,8236,12319,104791,13078,29116,167540,3961,5439,51307,6241,17321,84269,4275,6880,53484,6837,11795,83271
4,Andong,2008-05-01,8238,12337,104652,13032,29127,167386,3961,5452,51227,6212,17347,84199,4277,6885,53425,6820,11780,83187
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54834,Yuseong,2025-08-01,15229,22894,255046,25150,47997,366316,7301,11093,124123,12272,25899,180688,7928,11801,130923,12878,22098,185628
54835,Yuseong,2025-09-01,15193,22903,255116,25039,48265,366516,7295,11098,124094,12216,26074,180777,7898,11805,131022,12823,22191,185739
54836,Yuseong,2025-10-01,15186,22894,254806,24935,48534,366355,7294,11104,123968,12163,26205,180734,7892,11790,130838,12772,22329,185621
54837,Yuseong,2025-11-01,15206,22976,255117,24814,48836,366949,7281,11123,124146,12117,26347,181014,7925,11853,130971,12697,22489,185935


In [62]:
# Extract year and month from both dataframes for merging
population_df['Date'] = pd.to_datetime(population_df['Date'])
population_df['Year'] = population_df['Date'].dt.year
population_df['Month'] = population_df['Date'].dt.month

station_passenger_count_location_district_tax_population_df = pd.merge(station_passenger_count_location_district_tax_df, population_df, on=['Year', 'Month', 'District'], how='left')
display(station_passenger_count_location_district_tax_population_df.info())
display(station_passenger_count_location_district_tax_population_df.isnull().sum())
station_passenger_count_location_district_tax_population_df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104468 entries, 0 to 104467
Data columns (total 74 columns):
 #   Column                                     Non-Null Count   Dtype         
---  ------                                     --------------   -----         
 0   Date_x                                     104468 non-null  datetime64[ns]
 1   Station Number                             104468 non-null  int64         
 2   Boarding                                   104468 non-null  bool          
 3   Line                                       104468 non-null  int64         
 4   Passengers                                 104468 non-null  int64         
 5   Year                                       104468 non-null  int32         
 6   Month                                      104468 non-null  int32         
 7   lat                                        104468 non-null  float64       
 8   lng                                        104468 non-null  float64       
 9   Prov

None

Date_x                     0
Station Number             0
Boarding                   0
Line                       0
Passengers                 0
                        ... 
Population_Male_13-18    756
Population_Male_19-64    756
Population_Male_6-12     756
Population_Male_65+      756
Population_Male_Total    756
Length: 74, dtype: int64

Unnamed: 0,Date_x,Station Number,Boarding,Line,Passengers,Year,Month,lat,lng,Province,...,Population_Female_19-64,Population_Female_6-12,Population_Female_65+,Population_Female_Total,Population_Male_0-5,Population_Male_13-18,Population_Male_19-64,Population_Male_6-12,Population_Male_65+,Population_Male_Total
0,2008-01-31,150,False,1,1503741,2008,1,37.48760,126.753664,Gyeonggi-do,...,295535.0,38217.0,34527.0,430628.0,26169.0,42352.0,302800.0,41748.0,21679.0,434748.0
1,2008-02-29,150,False,1,1394346,2008,2,37.48760,126.753664,Gyeonggi-do,...,296025.0,38046.0,34698.0,431029.0,26175.0,42328.0,303366.0,41444.0,21843.0,435156.0
2,2008-03-31,150,False,1,1518669,2008,3,37.48760,126.753664,Gyeonggi-do,...,295999.0,37838.0,34874.0,431017.0,26060.0,42549.0,303220.0,41255.0,21981.0,435065.0
3,2008-04-30,150,False,1,1495350,2008,4,37.48760,126.753664,Gyeonggi-do,...,296317.0,37681.0,35014.0,431307.0,25935.0,42613.0,303688.0,41099.0,22081.0,435416.0
4,2008-05-31,150,False,1,1565671,2008,5,37.48760,126.753664,Gyeonggi-do,...,296481.0,37509.0,35043.0,431366.0,25851.0,42541.0,303882.0,40964.0,22158.0,435396.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104463,2024-08-31,826,True,8,161087,2024,8,37.43213,127.129087,Gyeonggi-do,...,311137.0,25588.0,86965.0,462719.0,15171.0,25372.0,314323.0,26786.0,71500.0,453152.0
104464,2024-09-30,826,True,8,155338,2024,9,37.43213,127.129087,Gyeonggi-do,...,310401.0,25414.0,87307.0,462237.0,15115.0,25387.0,313632.0,26673.0,71788.0,452595.0
104465,2024-10-31,826,True,8,171370,2024,10,37.43213,127.129087,Gyeonggi-do,...,310074.0,25277.0,87625.0,462107.0,15099.0,25375.0,313014.0,26528.0,72111.0,452127.0
104466,2024-11-30,826,True,8,172851,2024,11,37.43213,127.129087,Gyeonggi-do,...,309700.0,25195.0,88004.0,461964.0,15083.0,25413.0,312470.0,26403.0,72448.0,451817.0


In [63]:
# Only 700 null values so we can drop them
# Also drop unnecessary columns and fix names
station_passenger_count_location_district_tax_population_df.dropna(inplace=True)
station_passenger_count_location_district_tax_population_df.drop(columns=['Date_x'], inplace=True)
station_passenger_count_location_district_tax_population_df.rename(columns={'Date_y': 'Date'}, inplace=True)

# Rename to something easier
station_all_data_tax_population_df = station_passenger_count_location_district_tax_population_df

station_all_data_tax_population_df

Unnamed: 0,Station Number,Boarding,Line,Passengers,Year,Month,lat,lng,Province,District,...,Population_Female_19-64,Population_Female_6-12,Population_Female_65+,Population_Female_Total,Population_Male_0-5,Population_Male_13-18,Population_Male_19-64,Population_Male_6-12,Population_Male_65+,Population_Male_Total
0,150,False,1,1503741,2008,1,37.48760,126.753664,Gyeonggi-do,Bucheon,...,295535.0,38217.0,34527.0,430628.0,26169.0,42352.0,302800.0,41748.0,21679.0,434748.0
1,150,False,1,1394346,2008,2,37.48760,126.753664,Gyeonggi-do,Bucheon,...,296025.0,38046.0,34698.0,431029.0,26175.0,42328.0,303366.0,41444.0,21843.0,435156.0
2,150,False,1,1518669,2008,3,37.48760,126.753664,Gyeonggi-do,Bucheon,...,295999.0,37838.0,34874.0,431017.0,26060.0,42549.0,303220.0,41255.0,21981.0,435065.0
3,150,False,1,1495350,2008,4,37.48760,126.753664,Gyeonggi-do,Bucheon,...,296317.0,37681.0,35014.0,431307.0,25935.0,42613.0,303688.0,41099.0,22081.0,435416.0
4,150,False,1,1565671,2008,5,37.48760,126.753664,Gyeonggi-do,Bucheon,...,296481.0,37509.0,35043.0,431366.0,25851.0,42541.0,303882.0,40964.0,22158.0,435396.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104463,826,True,8,161087,2024,8,37.43213,127.129087,Gyeonggi-do,Seongnam,...,311137.0,25588.0,86965.0,462719.0,15171.0,25372.0,314323.0,26786.0,71500.0,453152.0
104464,826,True,8,155338,2024,9,37.43213,127.129087,Gyeonggi-do,Seongnam,...,310401.0,25414.0,87307.0,462237.0,15115.0,25387.0,313632.0,26673.0,71788.0,452595.0
104465,826,True,8,171370,2024,10,37.43213,127.129087,Gyeonggi-do,Seongnam,...,310074.0,25277.0,87625.0,462107.0,15099.0,25375.0,313014.0,26528.0,72111.0,452127.0
104466,826,True,8,172851,2024,11,37.43213,127.129087,Gyeonggi-do,Seongnam,...,309700.0,25195.0,88004.0,461964.0,15083.0,25413.0,312470.0,26403.0,72448.0,451817.0


In [65]:
# Add Land Price Change Data
land_price_df = pd.read_csv(os.path.join(LAND_PRICE_CHANGE_DATA_DIR, "land-price.csv"))
land_price_df

Unnamed: 0,District,Land_Type,Date,Price_Change
0,Seoul,Dwelling,2005-01-01,0.173
1,Seoul,Dwelling,2005-02-01,0.117
2,Seoul,Dwelling,2005-03-01,0.375
3,Seoul,Dwelling,2005-04-01,0.672
4,Seoul,Dwelling,2005-05-01,0.549
...,...,...,...,...
93600,Ongjin,Semi-Agriculture,2015-02-01,
93601,Ongjin,Semi-Agriculture,2015-03-01,
93602,Ongjin,Semi-Agriculture,2015-04-01,
93603,Ongjin,Semi-Agriculture,2015-05-01,


In [70]:
# Pivot to wide format
land_price_wide_df = land_price_df.pivot_table(index=['District', 'Date'], columns='Land_Type', values='Price_Change', aggfunc='mean').reset_index()
display(land_price_wide_df.info())
display(land_price_wide_df.isnull().sum())
land_price_wide_df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9036 entries, 0 to 9035
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   District                  9036 non-null   object 
 1   Date                      9036 non-null   object 
 2   Agriculture               1036 non-null   float64
 3   Commercial                8786 non-null   float64
 4   Comprehensive Management  92 non-null     float64
 5   Dwelling                  8786 non-null   float64
 6   Green                     7338 non-null   float64
 7   Industry                  4231 non-null   float64
 8   Management                274 non-null    float64
 9   Nature Conservation       36 non-null     float64
 10  Preservation Management   717 non-null    float64
 11  Production Management     717 non-null    float64
 12  Program Management        717 non-null    float64
dtypes: float64(11), object(2)
memory usage: 917.8+ KB


None

Land_Type
District                       0
Date                           0
Agriculture                 8000
Commercial                   250
Comprehensive Management    8944
Dwelling                     250
Green                       1698
Industry                    4805
Management                  8762
Nature Conservation         9000
Preservation Management     8319
Production Management       8319
Program Management          8319
dtype: int64

Land_Type,District,Date,Agriculture,Commercial,Comprehensive Management,Dwelling,Green,Industry,Management,Nature Conservation,Preservation Management,Production Management,Program Management
0,Bupyeong,2005-01-01,,0.097,,0.171,0.271,0.107,,,,,
1,Bupyeong,2005-02-01,,0.233,,0.177,0.841,0.199,,,,,
2,Bupyeong,2005-03-01,,0.333,,0.302,1.392,0.451,,,,,
3,Bupyeong,2005-04-01,,0.831,,0.619,1.892,0.672,,,,,
4,Bupyeong,2005-05-01,,0.809,,0.679,2.122,0.577,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9031,Yongsan,2025-07-01,,0.761,,0.550,,,,,,,
9032,Yongsan,2025-08-01,,0.553,,0.632,,,,,,,
9033,Yongsan,2025-09-01,,0.557,,0.777,,,,,,,
9034,Yongsan,2025-10-01,,0.632,,0.470,,,,,,,


In [72]:
# Drop columns that are mostly null and 0 the rest
land_price_wide_cleaned_df = land_price_wide_df.drop(columns=['Agriculture', 'Comprehensive Management', 'Nature Conservation', 'Management', 'Preservation Management', 'Production Management', 'Program Management'])
land_price_wide_cleaned_df.fillna(0, inplace=True)

land_price_wide_cleaned_df

Land_Type,District,Date,Commercial,Dwelling,Green,Industry
0,Bupyeong,2005-01-01,0.097,0.171,0.271,0.107
1,Bupyeong,2005-02-01,0.233,0.177,0.841,0.199
2,Bupyeong,2005-03-01,0.333,0.302,1.392,0.451
3,Bupyeong,2005-04-01,0.831,0.619,1.892,0.672
4,Bupyeong,2005-05-01,0.809,0.679,2.122,0.577
...,...,...,...,...,...,...
9031,Yongsan,2025-07-01,0.761,0.550,0.000,0.000
9032,Yongsan,2025-08-01,0.553,0.632,0.000,0.000
9033,Yongsan,2025-09-01,0.557,0.777,0.000,0.000
9034,Yongsan,2025-10-01,0.632,0.470,0.000,0.000


In [74]:
# Extract year and month from df for merging by district and month
land_price_wide_cleaned_df['Date'] = pd.to_datetime(land_price_wide_cleaned_df['Date'])
land_price_wide_cleaned_df['Year'] = land_price_wide_cleaned_df['Date'].dt.year
land_price_wide_cleaned_df['Month'] = land_price_wide_cleaned_df['Date'].dt.month
land_price_wide_cleaned_df.drop(columns=["Date"], inplace=True)

station_all_data_tax_population_land_price_df = pd.merge(station_all_data_tax_population_df, land_price_wide_cleaned_df, on=['Month', 'Year', 'District'], how='left')

display(station_all_data_tax_population_land_price_df.info())
display(station_all_data_tax_population_land_price_df.isnull().sum())
station_all_data_tax_population_land_price_df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103712 entries, 0 to 103711
Data columns (total 77 columns):
 #   Column                                     Non-Null Count   Dtype         
---  ------                                     --------------   -----         
 0   Station Number                             103712 non-null  int64         
 1   Boarding                                   103712 non-null  bool          
 2   Line                                       103712 non-null  int64         
 3   Passengers                                 103712 non-null  int64         
 4   Year                                       103712 non-null  int32         
 5   Month                                      103712 non-null  int32         
 6   lat                                        103712 non-null  float64       
 7   lng                                        103712 non-null  float64       
 8   Province                                   103712 non-null  object        
 9   Dist

None

Station Number               0
Boarding                     0
Line                         0
Passengers                   0
Year                         0
                         ...  
Population_Male_Total        0
Commercial               11520
Dwelling                 11520
Green                    11520
Industry                 11520
Length: 77, dtype: int64

Unnamed: 0,Station Number,Boarding,Line,Passengers,Year,Month,lat,lng,Province,District,...,Population_Male_0-5,Population_Male_13-18,Population_Male_19-64,Population_Male_6-12,Population_Male_65+,Population_Male_Total,Commercial,Dwelling,Green,Industry
0,150,False,1,1503741,2008,1,37.48760,126.753664,Gyeonggi-do,Bucheon,...,26169.0,42352.0,302800.0,41748.0,21679.0,434748.0,,,,
1,150,False,1,1394346,2008,2,37.48760,126.753664,Gyeonggi-do,Bucheon,...,26175.0,42328.0,303366.0,41444.0,21843.0,435156.0,,,,
2,150,False,1,1518669,2008,3,37.48760,126.753664,Gyeonggi-do,Bucheon,...,26060.0,42549.0,303220.0,41255.0,21981.0,435065.0,,,,
3,150,False,1,1495350,2008,4,37.48760,126.753664,Gyeonggi-do,Bucheon,...,25935.0,42613.0,303688.0,41099.0,22081.0,435416.0,,,,
4,150,False,1,1565671,2008,5,37.48760,126.753664,Gyeonggi-do,Bucheon,...,25851.0,42541.0,303882.0,40964.0,22158.0,435396.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103707,826,True,8,161087,2024,8,37.43213,127.129087,Gyeonggi-do,Seongnam,...,15171.0,25372.0,314323.0,26786.0,71500.0,453152.0,,,,
103708,826,True,8,155338,2024,9,37.43213,127.129087,Gyeonggi-do,Seongnam,...,15115.0,25387.0,313632.0,26673.0,71788.0,452595.0,,,,
103709,826,True,8,171370,2024,10,37.43213,127.129087,Gyeonggi-do,Seongnam,...,15099.0,25375.0,313014.0,26528.0,72111.0,452127.0,,,,
103710,826,True,8,172851,2024,11,37.43213,127.129087,Gyeonggi-do,Seongnam,...,15083.0,25413.0,312470.0,26403.0,72448.0,451817.0,,,,


In [76]:
# Around 11,000 NaN values, fill with 0 for now
station_all_data_tax_population_land_price_df.fillna(0, inplace=True)

In [78]:
# Save final dataset
station_all_data_tax_population_land_price_df.to_csv(os.path.join(DATA_DIR, "final-dataset.csv"), index=False)