In [111]:
# Combine all the cleaned data into a single dataset for training and evaluation of models

import os
import pandas as pd
import numpy as np

# Root folder holding every cleaned input dataset (relative to this notebook).
DATA_DIR = "../../data/cleaned"

# One sub-folder per data source; the tuple order below matches the name order.
(
    STATION_PASSENGER_COUNT_DATA_DIR,
    TAX_DATA_DIR,
    STATION_LOCATION_DATA_DIR,
    LAND_PRICE_CHANGE_DATA_DIR,
    DISTRICT_LOCATION_DATA_DIR,
    POPULATION_DATA_DIR,
) = (
    os.path.join(DATA_DIR, subdir)
    for subdir in (
        "station-passenger-count",
        "tax",
        "station-location",
        "land-price",
        "district-location",
        "population",
    )
)

In [112]:
# Load the monthly station passenger counts; this dataset provides the label.
passenger_count_csv = os.path.join(
    STATION_PASSENGER_COUNT_DATA_DIR, "passenger-count-monthly-2008-2024.csv"
)
station_passenger_count_df = pd.read_csv(passenger_count_csv)
station_passenger_count_df

Unnamed: 0,Date,Station Number,Boarding,Line,Total
0,2008-01-31,150,False,1,1503741
1,2008-02-29,150,False,1,1394346
2,2008-03-31,150,False,1,1518669
3,2008-04-30,150,False,1,1495350
4,2008-05-31,150,False,1,1565671
...,...,...,...,...,...
111275,2024-08-31,828,True,8,197693
111276,2024-09-30,828,True,8,196702
111277,2024-10-31,828,True,8,228296
111278,2024-11-30,828,True,8,233108


In [113]:
# Read the per-station coordinates (line, name, lat, lng, station number).
station_location_csv = os.path.join(STATION_LOCATION_DATA_DIR, "station-locations.csv")
station_location_df = pd.read_csv(station_location_csv)
station_location_df

Unnamed: 0,line,name,lat,lng,no
0,01호선,소요산,37.948100,127.061034,100
1,01호선,동두천,37.927878,127.054790,101
2,01호선,보산,37.913702,127.057277,102
3,01호선,동두천중앙,37.901885,127.056482,103
4,01호선,지행,37.892334,127.055716,104
...,...,...,...,...,...
704,경강선,신둔도예촌,37.317185,127.404760,K416
705,경강선,이천,37.265579,127.442260,K417
706,경강선,부발,37.260192,127.490277,K418
707,경강선,세종대왕릉,37.295309,127.570938,K419


In [114]:
# Join passenger counts to station coordinates via the station number.
# `no` is read as text, so make it numeric first; values that cannot be parsed
# become NaN (errors='coerce') and therefore never match a passenger-count row.
station_location_df['no'] = pd.to_numeric(station_location_df['no'], errors='coerce')

merged_df = station_passenger_count_df.merge(
    station_location_df,
    how='left',
    left_on='Station Number',
    right_on='no',
)

# Report non-null counts and null counts per column to see how many rows lack a location.
print(merged_df.count())
display(merged_df.isnull().sum())
merged_df

Date              111280
Station Number    111280
Boarding          111280
Line              111280
Total             111280
line              105692
name              105692
lat               105692
lng               105692
no                105692
dtype: int64


Date                 0
Station Number       0
Boarding             0
Line                 0
Total                0
line              5588
name              5588
lat               5588
lng               5588
no                5588
dtype: int64

Unnamed: 0,Date,Station Number,Boarding,Line,Total,line,name,lat,lng,no
0,2008-01-31,150,False,1,1503741,01호선,송내,37.4876,126.753664,150.0
1,2008-02-29,150,False,1,1394346,01호선,송내,37.4876,126.753664,150.0
2,2008-03-31,150,False,1,1518669,01호선,송내,37.4876,126.753664,150.0
3,2008-04-30,150,False,1,1495350,01호선,송내,37.4876,126.753664,150.0
4,2008-05-31,150,False,1,1565671,01호선,송내,37.4876,126.753664,150.0
...,...,...,...,...,...,...,...,...,...,...
111275,2024-08-31,828,True,8,197693,,,,,
111276,2024-09-30,828,True,8,196702,,,,,
111277,2024-10-31,828,True,8,228296,,,,,
111278,2024-11-30,828,True,8,233108,,,,,


In [115]:
# Keep only the rows that received a location from the join, then discard the
# station-location helper columns that are no longer needed.
merged_df = (
    merged_df
    .dropna(subset=['no', 'lat', 'lng'])
    .drop(columns=['no', 'name', 'line'])
)

merged_df

Unnamed: 0,Date,Station Number,Boarding,Line,Total,lat,lng
0,2008-01-31,150,False,1,1503741,37.48760,126.753664
1,2008-02-29,150,False,1,1394346,37.48760,126.753664
2,2008-03-31,150,False,1,1518669,37.48760,126.753664
3,2008-04-30,150,False,1,1495350,37.48760,126.753664
4,2008-05-31,150,False,1,1565671,37.48760,126.753664
...,...,...,...,...,...,...,...
110793,2024-08-31,826,True,8,161087,37.43213,127.129087
110794,2024-09-30,826,True,8,155338,37.43213,127.129087
110795,2024-10-31,826,True,8,171370,37.43213,127.129087
110796,2024-11-30,826,True,8,172851,37.43213,127.129087


In [116]:
import geopandas as gpd
from shapely.geometry import Point

# Load the district boundary polygons used to map each station's lat/lng to a district.
district_geojson = os.path.join(DISTRICT_LOCATION_DATA_DIR, "district-location.geojson")
district_location_gdf = gpd.read_file(district_geojson)
district_location_gdf

Unnamed: 0,GID_2,GID_0,COUNTRY,GID_1,NAME_1,NL_NAME_1,NAME_2,VARNAME_2,NL_NAME_2,TYPE_2,ENGTYPE_2,CC_2,HASC_2,geometry
0,KOR.1.1_2,KOR,SouthKorea,KOR.1_1,Busan,부산광역시|釜山廣域市,Buk,,,Gu,District,,,"MULTIPOLYGON (((129.0256 35.1917, 129.0211 35...."
1,KOR.1.2_2,KOR,SouthKorea,KOR.1_1,Busan,부산광역시|釜山廣域市,Busanjin,,,Gu,District,,,"MULTIPOLYGON (((129.0332 35.1371, 129.0256 35...."
2,KOR.1.3_2,KOR,SouthKorea,KOR.1_1,Busan,부산광역시|釜山廣域市,Dong,,,Gu,District,,,"MULTIPOLYGON (((129.0365 35.1129, 129.0323 35...."
3,KOR.1.4_2,KOR,SouthKorea,KOR.1_1,Busan,부산광역시|釜山廣域市,Dongnae,,,Gu,District,,,"MULTIPOLYGON (((129.116 35.2011, 129.1143 35.1..."
4,KOR.1.5_2,KOR,SouthKorea,KOR.1_1,Busan,부산광역시|釜山廣域市,Gangseo,,,Gu,District,,,"MULTIPOLYGON (((128.7742 35.0148, 128.7763 35...."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
224,KOR.17.1_2,KOR,SouthKorea,KOR.17_1,Ulsan,울산광역시|蔚山廣域市,Buk,,,Gu,District,,,"MULTIPOLYGON (((129.4133 35.5547, 129.4076 35...."
225,KOR.17.2_2,KOR,SouthKorea,KOR.17_1,Ulsan,울산광역시|蔚山廣域市,Dong,,,Gu,District,,,"MULTIPOLYGON (((129.3998 35.4666, 129.4065 35...."
226,KOR.17.3_2,KOR,SouthKorea,KOR.17_1,Ulsan,울산광역시|蔚山廣域市,Jung,,,Gu,District,,,"MULTIPOLYGON (((129.3515 35.5594, 129.3493 35...."
227,KOR.17.4_2,KOR,SouthKorea,KOR.17_1,Ulsan,울산광역시|蔚山廣域市,Nam,,,Gu,District,,,"MULTIPOLYGON (((129.3665 35.5424, 129.3694 35...."


In [117]:
# Put the district polygons in EPSG:4326 so they share a CRS with the station points.
district_location_gdf = district_location_gdf.to_crs("EPSG:4326")

# One point per row of merged_df, built from lng (x) and lat (y).
station_points = gpd.points_from_xy(x=merged_df["lng"], y=merged_df["lat"])
merged_gdf = gpd.GeoDataFrame(merged_df, geometry=station_points, crs="EPSG:4326")

# Left spatial join: every station row is kept; rows whose point is not within
# any district polygon get NaN for NAME_1 / NAME_2.
district_polygons = district_location_gdf[["NAME_1", "NAME_2", "geometry"]]
stations_with_district = gpd.sjoin(
    merged_gdf,
    district_polygons,
    how="left",
    predicate="within",
)


In [118]:
# Inspect the spatial-join result: station rows plus geometry, index_right, NAME_1 and NAME_2.
stations_with_district

Unnamed: 0,Date,Station Number,Boarding,Line,Total,lat,lng,geometry,index_right,NAME_1,NAME_2
0,2008-01-31,150,False,1,1503741,37.48760,126.753664,POINT (126.75366 37.4876),81.0,Gyeonggi-do,Bucheon
1,2008-02-29,150,False,1,1394346,37.48760,126.753664,POINT (126.75366 37.4876),81.0,Gyeonggi-do,Bucheon
2,2008-03-31,150,False,1,1518669,37.48760,126.753664,POINT (126.75366 37.4876),81.0,Gyeonggi-do,Bucheon
3,2008-04-30,150,False,1,1495350,37.48760,126.753664,POINT (126.75366 37.4876),81.0,Gyeonggi-do,Bucheon
4,2008-05-31,150,False,1,1565671,37.48760,126.753664,POINT (126.75366 37.4876),81.0,Gyeonggi-do,Bucheon
...,...,...,...,...,...,...,...,...,...,...,...
110793,2024-08-31,826,True,8,161087,37.43213,127.129087,POINT (127.12909 37.43213),99.0,Gyeonggi-do,Seongnam
110794,2024-09-30,826,True,8,155338,37.43213,127.129087,POINT (127.12909 37.43213),99.0,Gyeonggi-do,Seongnam
110795,2024-10-31,826,True,8,171370,37.43213,127.129087,POINT (127.12909 37.43213),99.0,Gyeonggi-do,Seongnam
110796,2024-11-30,826,True,8,172851,37.43213,127.129087,POINT (127.12909 37.43213),99.0,Gyeonggi-do,Seongnam


In [119]:
# The point geometry and the sjoin bookkeeping index are not needed as features.
stations_with_district.drop(['geometry', 'index_right'], axis='columns', inplace=True)

In [120]:
# Per-column count of missing values after the district join.
stations_with_district.isna().sum()

Date                 0
Station Number       0
Boarding             0
Line                 0
Total                0
lat                  0
lng                  0
NAME_1            2040
NAME_2            2040
dtype: int64

In [121]:
# Remove every row that has a missing value in any column, then confirm none remain.
stations_with_district.dropna(axis=0, how='any', inplace=True)
stations_with_district.isna().sum()

Date              0
Station Number    0
Boarding          0
Line              0
Total             0
lat               0
lng               0
NAME_1            0
NAME_2            0
dtype: int64

In [122]:
# Continue the pipeline from the district-enriched frame.
# This is a plain alias, not a copy: in-place changes made through either name
# affect the same underlying object.
merged_df = stations_with_district
merged_df

Unnamed: 0,Date,Station Number,Boarding,Line,Total,lat,lng,NAME_1,NAME_2
0,2008-01-31,150,False,1,1503741,37.48760,126.753664,Gyeonggi-do,Bucheon
1,2008-02-29,150,False,1,1394346,37.48760,126.753664,Gyeonggi-do,Bucheon
2,2008-03-31,150,False,1,1518669,37.48760,126.753664,Gyeonggi-do,Bucheon
3,2008-04-30,150,False,1,1495350,37.48760,126.753664,Gyeonggi-do,Bucheon
4,2008-05-31,150,False,1,1565671,37.48760,126.753664,Gyeonggi-do,Bucheon
...,...,...,...,...,...,...,...,...,...
110793,2024-08-31,826,True,8,161087,37.43213,127.129087,Gyeonggi-do,Seongnam
110794,2024-09-30,826,True,8,155338,37.43213,127.129087,Gyeonggi-do,Seongnam
110795,2024-10-31,826,True,8,171370,37.43213,127.129087,Gyeonggi-do,Seongnam
110796,2024-11-30,826,True,8,172851,37.43213,127.129087,Gyeonggi-do,Seongnam


In [123]:
# Load the tax revenue table (columns: Type, District, Year, Revenue).
tax_csv = os.path.join(TAX_DATA_DIR, "tax-data.csv")
tax_df = pd.read_csv(tax_csv)
tax_df

Unnamed: 0,Type,District,Year,Revenue
0,Total,Total,2002,
1,Total,Seoul Rto,2002,37082078.0
2,Total,Jongno,2002,3253060.0
3,Total,Jungbu,2002,2444410.0
4,Total,Namdaemun,2002,5578244.0
...,...,...,...,...
158580,Comprehensive Real Estate Holding Tax,Tongyeong,2024,5430.0
158581,Comprehensive Real Estate Holding Tax,Jinju,2024,58685.0
158582,Comprehensive Real Estate Holding Tax,Jeju,2024,66359.0
158583,Comprehensive Real Estate Holding Tax,Tax On Lmport,2024,


In [124]:
# Attach yearly tax revenue to every station-month row, keyed on (Year, district name).
#
# The join is built from `stations_with_district` (the frame `merged_df` was aliased
# to) instead of reading and rebinding `merged_df`. Re-running this cell therefore
# reproduces the same result rather than stacking a second tax merge on top of the
# first (which would create *_x / *_y columns and multiply the rows again).
#
# NOTE(review): tax_df is long-format (one row per Type / District / Year), so this
# left join repeats each station-month row once per matching tax row. In the recorded
# run the frame grows from roughly 104k rows to about 4.16M rows. If one row per
# station-month is wanted, consider pivoting tax_df to one column per Type before
# merging -- TODO confirm with downstream consumers, since that changes the schema.
tax_df['Year'] = tax_df['Year'].astype(int)

merged_df = pd.merge(
    # Derive the calendar year from the month-end Date without mutating the input frame.
    stations_with_district.assign(
        Year=pd.DatetimeIndex(stations_with_district['Date']).year
    ),
    tax_df,
    left_on=['Year', 'NAME_2'],
    right_on=['Year', 'District'],
    how='left',
)
merged_df

Unnamed: 0,Date,Station Number,Boarding,Line,Total,lat,lng,NAME_1,NAME_2,Year,Type,District,Revenue
0,2008-01-31,150,False,1,1503741,37.48760,126.753664,Gyeonggi-do,Bucheon,2008,Total,Bucheon,764565.0
1,2008-01-31,150,False,1,1503741,37.48760,126.753664,Gyeonggi-do,Bucheon,2008,Total,Bucheon,
2,2008-01-31,150,False,1,1503741,37.48760,126.753664,Gyeonggi-do,Bucheon,2008,Internal Tax,Bucheon,752804.0
3,2008-01-31,150,False,1,1503741,37.48760,126.753664,Gyeonggi-do,Bucheon,2008,Internal Tax,Bucheon,
4,2008-01-31,150,False,1,1503741,37.48760,126.753664,Gyeonggi-do,Bucheon,2008,Direct Tax,Bucheon,438806.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4158899,2024-12-31,826,True,8,174013,37.43213,127.129087,Gyeonggi-do,Seongnam,2024,Transportation¡¤Energy ¡¤ Environment Tax,Seongnam,-19.0
4158900,2024-12-31,826,True,8,174013,37.43213,127.129087,Gyeonggi-do,Seongnam,2024,Defense Tax,Seongnam,
4158901,2024-12-31,826,True,8,174013,37.43213,127.129087,Gyeonggi-do,Seongnam,2024,Education Tax,Seongnam,118.0
4158902,2024-12-31,826,True,8,174013,37.43213,127.129087,Gyeonggi-do,Seongnam,2024,Special Tax For Rural Development,Seongnam,14337.0


In [125]:
# Per-column count of missing values after the tax merge.
merged_df.isna().sum()

Date                    0
Station Number          0
Boarding                0
Line                    0
Total                   0
lat                     0
lng                     0
NAME_1                  0
NAME_2                  0
Year                    0
Type                19992
District            19992
Revenue           1703308
dtype: int64

In [126]:
# Discard every row that still has a missing value in any column.
merged_df.dropna(axis=0, how='any', inplace=True)

In [127]:
# Load the monthly district population broken down by age group and sex.
population_csv = os.path.join(POPULATION_DATA_DIR, "population-by-age-groups.csv")
population_df = pd.read_csv(population_csv)
population_df

Unnamed: 0.1,Unnamed: 0,District,Date,Population_0-5,Population_13-18,Population_19-64,Population_6-12,Population_65+,Population_Total,Population_Female_0-5,...,Population_Female_19-64,Population_Female_6-12,Population_Female_65+,Population_Female_Total,Population_Male_0-5,Population_Male_13-18,Population_Male_19-64,Population_Male_6-12,Population_Male_65+,Population_Male_Total
0,0,Andong,2008-01-01,8291,12383,105297,13233,28651,167855,3991,...,51523,6278,17048,84297,4300,6926,53774,6955,11603,83558
1,1,Andong,2008-02-01,8295,12328,105082,13158,28873,167736,3981,...,51431,6263,17157,84280,4314,6880,53651,6895,11716,83456
2,2,Andong,2008-03-01,8274,12313,104957,13103,29021,167668,3985,...,51397,6244,17250,84309,4289,6880,53560,6859,11771,83359
3,3,Andong,2008-04-01,8236,12319,104791,13078,29116,167540,3961,...,51307,6241,17321,84269,4275,6880,53484,6837,11795,83271
4,4,Andong,2008-05-01,8238,12337,104652,13032,29127,167386,3961,...,51227,6212,17347,84199,4277,6885,53425,6820,11780,83187
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54834,54834,Yuseong,2025-08-01,15229,22894,255046,25150,47997,366316,7301,...,124123,12272,25899,180688,7928,11801,130923,12878,22098,185628
54835,54835,Yuseong,2025-09-01,15193,22903,255116,25039,48265,366516,7295,...,124094,12216,26074,180777,7898,11805,131022,12823,22191,185739
54836,54836,Yuseong,2025-10-01,15186,22894,254806,24935,48534,366355,7294,...,123968,12163,26205,180734,7892,11790,130838,12772,22329,185621
54837,54837,Yuseong,2025-11-01,15206,22976,255117,24814,48836,366949,7281,...,124146,12117,26347,181014,7925,11853,130971,12697,22489,185935


In [128]:
# Population rows are dated on the first of the month while station rows use the
# month end, so both sides are reduced to a monthly Period before joining.
population_df['YearMonth'] = pd.to_datetime(population_df['Date']).dt.to_period('M')

merged_df = (
    merged_df
    .assign(YearMonth=pd.to_datetime(merged_df['Date']).dt.to_period('M'))
    .merge(
        population_df,
        how='left',
        left_on=['YearMonth', 'NAME_2'],
        right_on=['YearMonth', 'District'],
    )
    # Drop the CSV's saved index column and the duplicated Date / District columns.
    .drop(columns=['Unnamed: 0', 'Date_y', 'District_y', 'District_x'])
    # Restore the station-side date column to its original name.
    .rename(columns={'Date_x': 'Date'})
)
merged_df

Unnamed: 0,Date,Station Number,Boarding,Line,Total,lat,lng,NAME_1,NAME_2,Year,...,Population_Female_19-64,Population_Female_6-12,Population_Female_65+,Population_Female_Total,Population_Male_0-5,Population_Male_13-18,Population_Male_19-64,Population_Male_6-12,Population_Male_65+,Population_Male_Total
0,2008-01-31,150,False,1,1503741,37.48760,126.753664,Gyeonggi-do,Bucheon,2008,...,295535,38217,34527,430628,26169,42352,302800,41748,21679,434748
1,2008-01-31,150,False,1,1503741,37.48760,126.753664,Gyeonggi-do,Bucheon,2008,...,295535,38217,34527,430628,26169,42352,302800,41748,21679,434748
2,2008-01-31,150,False,1,1503741,37.48760,126.753664,Gyeonggi-do,Bucheon,2008,...,295535,38217,34527,430628,26169,42352,302800,41748,21679,434748
3,2008-01-31,150,False,1,1503741,37.48760,126.753664,Gyeonggi-do,Bucheon,2008,...,295535,38217,34527,430628,26169,42352,302800,41748,21679,434748
4,2008-01-31,150,False,1,1503741,37.48760,126.753664,Gyeonggi-do,Bucheon,2008,...,295535,38217,34527,430628,26169,42352,302800,41748,21679,434748
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2455591,2024-12-31,826,True,8,174013,37.43213,127.129087,Gyeonggi-do,Seongnam,2024,...,308892,25113,88508,461601,15095,25361,311829,26332,72791,451408
2455592,2024-12-31,826,True,8,174013,37.43213,127.129087,Gyeonggi-do,Seongnam,2024,...,308892,25113,88508,461601,15095,25361,311829,26332,72791,451408
2455593,2024-12-31,826,True,8,174013,37.43213,127.129087,Gyeonggi-do,Seongnam,2024,...,308892,25113,88508,461601,15095,25361,311829,26332,72791,451408
2455594,2024-12-31,826,True,8,174013,37.43213,127.129087,Gyeonggi-do,Seongnam,2024,...,308892,25113,88508,461601,15095,25361,311829,26332,72791,451408


In [129]:
# Per-column count of missing values after the population merge.
merged_df.isna().sum()

Date                       0
Station Number             0
Boarding                   0
Line                       0
Total                      0
lat                        0
lng                        0
NAME_1                     0
NAME_2                     0
Year                       0
Type                       0
Revenue                    0
YearMonth                  0
Population_0-5             0
Population_13-18           0
Population_19-64           0
Population_6-12            0
Population_65+             0
Population_Total           0
Population_Female_0-5      0
Population_Female_13-18    0
Population_Female_19-64    0
Population_Female_6-12     0
Population_Female_65+      0
Population_Female_Total    0
Population_Male_0-5        0
Population_Male_13-18      0
Population_Male_19-64      0
Population_Male_6-12       0
Population_Male_65+        0
Population_Male_Total      0
dtype: int64

In [130]:
# Load the land price change table (columns: District, Land_Type, Date, Price_Change).
land_price_csv = os.path.join(LAND_PRICE_CHANGE_DATA_DIR, "land-price.csv")
land_price_df = pd.read_csv(land_price_csv)
land_price_df

Unnamed: 0,District,Land_Type,Date,Price_Change
0,Seoul,Dwelling,2005-01-01,0.173
1,Seoul,Dwelling,2005-02-01,0.117
2,Seoul,Dwelling,2005-03-01,0.375
3,Seoul,Dwelling,2005-04-01,0.672
4,Seoul,Dwelling,2005-05-01,0.549
...,...,...,...,...
93600,Ongjin,Semi-Agriculture,2015-02-01,
93601,Ongjin,Semi-Agriculture,2015-03-01,
93602,Ongjin,Semi-Agriculture,2015-04-01,
93603,Ongjin,Semi-Agriculture,2015-05-01,


In [None]:
# NOTE(review): this land-price merge is disabled (every line is commented out).
# The stale output kept below this cell shows the joined frame reaching 16,819,041
# rows, with District / Land_Type / Date_y / Price_Change empty in every visible row.
# Presumably land_price_df holds one row per (District, Land_Type, month), so joining
# on (YearMonth, NAME_2) repeats each row per Land_Type -- TODO confirm, and check
# that the district names match, before re-enabling.
# # Extract year-month from land_price_df for merging by district and month
# land_price_df['YearMonth'] = pd.to_datetime(land_price_df['Date']).dt.to_period('M')

# merged_df = pd.merge(merged_df, land_price_df, left_on=['YearMonth', 'NAME_2'], right_on=['YearMonth', 'District'], how='left')
# # merged_df.drop(columns=['YearMonth', 'Date_y', 'District_y'], inplace=True)
# merged_df



Unnamed: 0,Date_x,Station Number,Boarding,Line,Total,lat,lng,NAME_1,NAME_2,Year,...,Population_Male_0-5,Population_Male_13-18,Population_Male_19-64,Population_Male_6-12,Population_Male_65+,Population_Male_Total,District,Land_Type,Date_y,Price_Change
0,2008-01-31,150,False,1,1503741,37.48760,126.753664,Gyeonggi-do,Bucheon,2008,...,26169,42352,302800,41748,21679,434748,,,,
1,2008-01-31,150,False,1,1503741,37.48760,126.753664,Gyeonggi-do,Bucheon,2008,...,26169,42352,302800,41748,21679,434748,,,,
2,2008-01-31,150,False,1,1503741,37.48760,126.753664,Gyeonggi-do,Bucheon,2008,...,26169,42352,302800,41748,21679,434748,,,,
3,2008-01-31,150,False,1,1503741,37.48760,126.753664,Gyeonggi-do,Bucheon,2008,...,26169,42352,302800,41748,21679,434748,,,,
4,2008-01-31,150,False,1,1503741,37.48760,126.753664,Gyeonggi-do,Bucheon,2008,...,26169,42352,302800,41748,21679,434748,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16819037,2024-12-31,826,True,8,174013,37.43213,127.129087,Gyeonggi-do,Seongnam,2024,...,15095,25361,311829,26332,72791,451408,,,,
16819038,2024-12-31,826,True,8,174013,37.43213,127.129087,Gyeonggi-do,Seongnam,2024,...,15095,25361,311829,26332,72791,451408,,,,
16819039,2024-12-31,826,True,8,174013,37.43213,127.129087,Gyeonggi-do,Seongnam,2024,...,15095,25361,311829,26332,72791,451408,,,,
16819040,2024-12-31,826,True,8,174013,37.43213,127.129087,Gyeonggi-do,Seongnam,2024,...,15095,25361,311829,26332,72791,451408,,,,


In [None]:
# NOTE(review): disabled null check; the output shown below this cell is stale
# (it lists Date_x, a column name that only exists after the commented-out land-price merge).
# merged_df.isnull().sum()

Date_x                            0
Station Number                    0
Boarding                          0
Line                              0
Total                             0
lat                               0
lng                               0
NAME_1                            0
NAME_2                            0
Year                              0
Type                              0
Revenue                           0
YearMonth                         0
Population_0-5                    0
Population_13-18                  0
Population_19-64                  0
Population_6-12                   0
Population_65+                    0
Population_Total                  0
Population_Female_0-5             0
Population_Female_13-18           0
Population_Female_19-64           0
Population_Female_6-12            0
Population_Female_65+             0
Population_Female_Total           0
Population_Male_0-5               0
Population_Male_13-18             0
Population_Male_19-64       