Data from https://data.seoul.go.kr/dataList/OA-12921/F/1/datasetView.do file numbers `1-2, 6-20`

Each file should contain hourly data for a full year (2008-2025) and should be renamed to `<year>.csv` or `<year>.xlsx`

## **THIS NEEDS THE STATION LOCATION DATA**
Make sure [station-location.ipynb](station-location.ipynb) has been run!

In [None]:
import pandas as pd
import numpy as np
import os

DATA_DIR = '../../data'
RAW_DATA_DIR = os.path.join(DATA_DIR, 'raw')
CLEANED_DATA_DIR = os.path.join(DATA_DIR, 'cleaned')

PASSENGER_COUNT_RAW_DATA_DIR = os.path.join(RAW_DATA_DIR, 'station-passenger-count')
PASSENGER_COUNT_FORMATTED_DATA_DIR = os.path.join(CLEANED_DATA_DIR, 'station-passenger-count')

## Convert Excel to CSV

In [None]:
# Convert xlsx files to csv files for 2017-2019 (makes it faster to load in the future)
# DONT RUN if csv files already exist

# for year in range(2017, 2020):
#     print(f"Converting {year}.xlsx to {year}.csv...")
#     xlsx_path = os.path.join(PASSENGER_COUNT_RAW_DATA_DIR, f"{year}.xlsx")
#     csv_path = os.path.join(PASSENGER_COUNT_RAW_DATA_DIR, f"{year}.csv")
#     data = pd.read_excel(xlsx_path, header=1)
#     data.to_csv(csv_path, index=False, encoding='euc-kr')

Converting 2017.xlsx to 2017.csv...
Converting 2018.xlsx to 2018.csv...
Converting 2019.xlsx to 2019.csv...


## Load and Format Data

In [8]:
def clean_passenger_count_data(df: pd.DataFrame, includes_line: bool = False, includes_total: bool = False, includes_24: bool = True) -> pd.DataFrame:
    # Rename columns to English
    english_columns = ['Date', 'Station Number', 'Station Name', 'Boarding', '05', '06', '07',
    '08', '09', '10', '11', '12', '13',
    '14', '15', '16', '17', '18', '19',
    '20', '21', '22', '23']
    if includes_24:
        english_columns.append('24')
    if includes_line:
        english_columns.insert(1, 'Line')
    if includes_total:
        english_columns.append('Total')
    df.columns = english_columns
    
    # Remove unnecessary columns
    df.drop(columns=['Station Name'], inplace=True)
    if includes_total:
        df.drop(columns=['Total'], inplace=True)
    if includes_line:
        df.drop(columns=['Line'], inplace=True)

    # Convert Boarding to boolean
    df.Boarding = df.Boarding.apply(lambda x: True if "승차" in x else False)

    # Ensure numeric columns are properly formatted
    col_start = 4 if includes_line else 3
    hours = 20 if includes_24 else 19
    col_end = col_start + hours
    hours_columns = df.columns[col_start:col_end]
    numeric_columns = list(hours_columns)
    numeric_columns.append('Station Number')
    for col in numeric_columns:
        if df[col].dtype != 'int64':
            df[col] = df[col].astype(str)
            df[col] = pd.to_numeric(df[col].str.strip().str.replace(',', ''), errors='coerce').astype('Int64')

    # Fix formatting of station numbers (2xxx -> xxx)
    df['Station Number'] = df['Station Number'] % 1000

    # Make line column
    df["Line"] = (df["Station Number"] // np.pow(10, np.floor(np.log10(df["Station Number"])))).astype('Int64')

    # Convert Date to datetime
    df.Date = df.Date.str.strip()
    df.Date = pd.to_datetime(df.Date, format='mixed')

    # Set datetime as index
    df.set_index('Date', inplace=True)

    return df

In [9]:
# Test
df = pd.read_csv(os.path.join(PASSENGER_COUNT_RAW_DATA_DIR, '2010.csv'), encoding='euc-kr')
clean_passenger_count_data(df)

Unnamed: 0_level_0,Station Number,Boarding,05,06,07,08,09,10,11,12,...,16,17,18,19,20,21,22,23,24,Line
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-01-01,150,True,390,400,452,1008,1326,1688,2265,2363,...,2794,2961,2463,2831,2070,2224,1716,804,18,1
2010-01-01,150,False,187,1169,1171,1715,1984,2396,2209,2149,...,2015,1924,1898,1360,1147,1171,932,548,216,1
2010-01-01,151,True,114,106,144,160,218,194,272,270,...,822,871,965,853,839,812,594,176,0,1
2010-01-01,151,False,40,197,229,359,368,328,490,528,...,662,641,643,472,318,250,155,121,13,1
2010-01-01,152,True,917,714,364,304,306,365,502,754,...,1851,2210,2243,2210,2090,2575,2583,1207,22,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2010-12-31,825,False,53,93,119,226,181,179,185,239,...,503,568,703,625,454,456,395,268,159,8
2010-12-31,826,True,127,186,594,762,421,234,234,330,...,341,393,388,263,205,184,161,67,36,8
2010-12-31,826,False,41,60,164,279,202,144,173,214,...,402,523,487,485,365,325,324,243,131,8
2010-12-31,827,True,36,74,274,284,187,173,191,251,...,373,403,395,299,209,210,183,88,44,8


In [10]:
# Load data between 2008 and 2016
passenger_count_datasets = {}
for year in range(2008, 2017):
    print(f"Processing data for year: {year}...")
    file_path = os.path.join(PASSENGER_COUNT_RAW_DATA_DIR, f"{year}.csv")
    data = pd.read_csv(file_path, encoding='euc-kr')
    cleaned_data = clean_passenger_count_data(data)
    passenger_count_datasets[year] = cleaned_data
print("Done!")

Processing data for year: 2008...
Processing data for year: 2009...
Processing data for year: 2010...
Processing data for year: 2011...
Processing data for year: 2012...
Processing data for year: 2013...
Processing data for year: 2014...
Processing data for year: 2015...


  data = pd.read_csv(file_path, encoding='euc-kr')


Processing data for year: 2016...
Done!


In [11]:
# Load 2017
print("Processing data for year: 2017...")
df = pd.read_csv(os.path.join(PASSENGER_COUNT_RAW_DATA_DIR, '2017.csv'), encoding='euc-kr')
# Drop unnecessary columns
df.drop(columns=['구분'], inplace=True)

passenger_count_datasets[2017] = clean_passenger_count_data(df, includes_line=True, includes_total=True)

# Load 2018
print("Processing data for year: 2018...")
df = pd.read_csv(os.path.join(PASSENGER_COUNT_RAW_DATA_DIR, '2018.csv'), encoding='euc-kr')

passenger_count_datasets[2018] = clean_passenger_count_data(df, includes_line=True, includes_total=True)

# Load 2019
print("Processing data for year: 2019...")
df = pd.read_csv(os.path.join(PASSENGER_COUNT_RAW_DATA_DIR, '2019.csv'), encoding='euc-kr')

passenger_count_datasets[2019] = clean_passenger_count_data(df, includes_line=True, includes_total=True)

print("Done!")


Processing data for year: 2017...
Processing data for year: 2018...
Processing data for year: 2019...
Done!


In [12]:
# Load 2020
print("Processing data for year: 2020...")
df = pd.read_csv(os.path.join(PASSENGER_COUNT_RAW_DATA_DIR, '2020.csv'), encoding='euc-kr')

passenger_count_datasets[2020] = clean_passenger_count_data(df, includes_line=True)

# Load 2021
print("Processing data for year: 2021...")
df = pd.read_csv(os.path.join(PASSENGER_COUNT_RAW_DATA_DIR, '2021.csv'), encoding='euc-kr')
df.drop(columns=['연번'], inplace=True)

passenger_count_datasets[2021] = clean_passenger_count_data(df, includes_line=True, includes_total=True, includes_24=False)

# Load 2022
print("Processing data for year: 2022...")
df = pd.read_csv(os.path.join(PASSENGER_COUNT_RAW_DATA_DIR, '2022.csv'), encoding='euc-kr')
df.drop(columns=['연번'], inplace=True)

passenger_count_datasets[2022] = clean_passenger_count_data(df, includes_line=True, includes_total=False)

# Load 2023
print("Processing data for year 2023...")
df = pd.read_csv(os.path.join(PASSENGER_COUNT_RAW_DATA_DIR, '2023.csv'), encoding='euc-kr')
df.drop(columns=['연번'], inplace=True)

passenger_count_datasets[2023] = clean_passenger_count_data(df, includes_line=True)

# Load 2024
print("Processing data for year: 2024...")
df = pd.read_csv(os.path.join(PASSENGER_COUNT_RAW_DATA_DIR, '2024.csv'), encoding='euc-kr')
df.drop(columns=['연번'], inplace=True)

passenger_count_datasets[2024] = clean_passenger_count_data(df, includes_line=True)

print("All data processing complete!")

Processing data for year: 2020...
Processing data for year: 2021...
Processing data for year: 2022...


  df = pd.read_csv(os.path.join(PASSENGER_COUNT_RAW_DATA_DIR, '2022.csv'), encoding='euc-kr')


Processing data for year 2023...
Processing data for year: 2024...
All data processing complete!


## Combine Data

In [13]:
# Merge all datasets into a single DataFrame
passenger_count_all_data = pd.concat(passenger_count_datasets.values())
passenger_count_all_data

Unnamed: 0_level_0,Station Number,Boarding,05,06,07,08,09,10,11,12,...,16,17,18,19,20,21,22,23,24,Line
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2008-01-01,150,True,379,287,371,876,965,1389,1989,2375,...,3078,3495,3055,2952,2726,3307,2584,1059,264,1
2008-01-01,150,False,145,707,689,1037,1170,1376,1451,1743,...,2304,2203,2128,1747,1593,1078,744,406,558,1
2008-01-01,151,True,131,131,101,152,191,202,275,361,...,900,1154,1706,1444,1267,928,531,233,974,1
2008-01-01,151,False,35,158,203,393,375,460,591,841,...,1153,1303,1190,830,454,284,141,107,185,1
2008-01-01,152,True,1287,867,400,330,345,338,595,791,...,2269,2777,2834,2646,2784,2920,2290,802,1559,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-12-31,826,False,14,85,152,477,233,191,191,250,...,414,465,468,376,261,248,246,175,68,8
2024-12-31,827,True,80,103,332,400,241,241,268,336,...,537,440,381,247,136,170,140,90,50,8
2024-12-31,827,False,19,109,138,428,219,197,234,266,...,340,340,374,281,193,212,207,141,128,8
2024-12-31,828,True,44,236,686,811,437,279,346,388,...,469,556,513,285,245,216,213,75,46,8


## Save Formatted Data

In [15]:
os.makedirs(PASSENGER_COUNT_FORMATTED_DATA_DIR, exist_ok=True)

passenger_count_all_data.to_csv(os.path.join(PASSENGER_COUNT_FORMATTED_DATA_DIR, 'passenger-count-full-2008-2024.csv'))

In [22]:
# Save only daily totals
daily_totals = passenger_count_all_data.copy()
hour_cols = ['05', '06', '07', '08', '09', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24']
daily_totals['Total'] = daily_totals[hour_cols].sum(axis=1)

daily_totals.drop(columns=hour_cols, inplace=True)

daily_totals

daily_totals.to_csv(os.path.join(PASSENGER_COUNT_FORMATTED_DATA_DIR, 'passenger-count-daily-2008-2024.csv'))

In [34]:
# Save only mounthly totals
monthly_totals = daily_totals.copy()

monthly_totals = monthly_totals.groupby([
    'Station Number',
    'Boarding',
    'Line',
    pd.Grouper(level='Date', freq='ME')
]).sum().reset_index().set_index('Date')

monthly_totals.to_csv(os.path.join(PASSENGER_COUNT_FORMATTED_DATA_DIR, 'passenger-count-monthly-2008-2024.csv'))