In [2]:
import pandas as pd
import numpy as np
import os
import math

DATA_DIR = '../data'
RAW_DATA_DIR = os.path.join(DATA_DIR, 'raw')
FORMATTED_DATA_DIR = os.path.join(DATA_DIR, 'formatted')

# Station Location Data

In [3]:
STATION_LOCATION_RAW_DATA_PATH = os.path.join(RAW_DATA_DIR, 'station-location', 'subway-station-cords.csv')

In [11]:
df = pd.read_csv(STATION_LOCATION_RAW_DATA_PATH)
df = df[["lat", "lng", "no"]]
display(df[df["no"] == "K215"])
display(df[df["no"] == "215"])

Unnamed: 0,lat,lng,no
448,37.504503,127.049008,K215


Unnamed: 0,lat,lng,no
116,37.520733,127.10379,215


In [None]:
print(passenger_count_all_data.count())
print(passenger_count_all_data[passenger_count_all_data.apply(lambda row: row['Station Number'] in station_location_station_numbers, axis=1)].count())

# Station Passenger Count Data

In [72]:
PASSENGER_COUNT_RAW_DATA_DIR = os.path.join(RAW_DATA_DIR, 'passenger-count')
PASSENGER_COUNT_FORMATTED_DATA_DIR = FORMATTED_DATA_DIR

## Convert Excel to CSV

In [73]:
# Convert xlsx files to csv files for 2017-2019 (makes it faster to load in the future)
# DONT RUN if csv files already exist

for year in range(2017, 2020):
    print(f"Converting {year}.xlsx to {year}.csv...")
    xlsx_path = os.path.join(PASSENGER_COUNT_RAW_DATA_DIR, f"{year}.xlsx")
    csv_path = os.path.join(PASSENGER_COUNT_RAW_DATA_DIR, f"{year}.csv")
    data = pd.read_excel(xlsx_path, header=1)
    data.to_csv(csv_path, index=False, encoding='euc-kr')

Converting 2017.xlsx to 2017.csv...
Converting 2018.xlsx to 2018.csv...
Converting 2019.xlsx to 2019.csv...


## Load and Format Data

In [74]:
def clean_passenger_count_data(df: pd.DataFrame, includes_line: bool = False, includes_total: bool = False, includes_24: bool = True) -> pd.DataFrame:
    # Rename columns to English
    english_columns = ['Date', 'Station Number', 'Station Name', 'Boarding', '05', '06', '07',
    '08', '09', '10', '11', '12', '13',
    '14', '15', '16', '17', '18', '19',
    '20', '21', '22', '23']
    if includes_24:
        english_columns.append('24')
    if includes_line:
        english_columns.insert(1, 'Line')
    if includes_total:
        english_columns.append('Total')
    df.columns = english_columns
    
    # Remove unnecessary columns
    df.drop(columns=['Station Name'], inplace=True)
    if includes_total:
        df.drop(columns=['Total'], inplace=True)
    if includes_line:
        df.drop(columns=['Line'], inplace=True)

    # Convert Boarding to boolean
    df.Boarding = df.Boarding.apply(lambda x: True if "승차" in x else False)


    # Ensure numeric columns are properly formatted
    col_start = 4 if includes_line else 3
    hours = 20 if includes_24 else 19
    col_end = col_start + hours
    hours_columns = df.columns[col_start:col_end]
    numeric_columns = list(hours_columns)
    numeric_columns.append('Station Number')
    for col in numeric_columns:
        if df[col].dtype != 'int64':
            df[col] = df[col].astype(str)
            df[col] = pd.to_numeric(df[col].str.strip().str.replace(',', ''), errors='coerce').astype('Int64')

    # Fix formatting of station numbers (2xxx -> xxx)
    df['Station Number'] = df['Station Number'] % 1000

    # Make line column
    df["Line"] = (df["Station Number"] // np.pow(10, np.floor(np.log10(df["Station Number"])))).astype('Int64')


    # Convert Date to datetime
    df.Date = df.Date.str.strip()
    df.Date = pd.to_datetime(df.Date, format='mixed')

    # Convert from wide to long format
    id_vars = ['Date', 'Station Number', 'Boarding', 'Line']
    df = df.melt(id_vars=id_vars, value_vars=hours_columns,
                 var_name='Hour', value_name='Passenger Count')

    # Convert Hour to integer
    df.Hour = df.Hour.astype(int)

    # Combine Date and Hour into a single datetime column
    df["datetime"] = pd.to_datetime(df["Date"]) + pd.to_timedelta(df["Hour"], unit="h")
    df.drop(columns=['Date', 'Hour'], inplace=True)

    # Set datetime as index
    df.set_index('datetime', inplace=True)

    return df

In [75]:
# Test
df = pd.read_csv(os.path.join(PASSENGER_COUNT_RAW_DATA_DIR, '2010.csv'), encoding='euc-kr')
clean_passenger_count_data(df)

Unnamed: 0_level_0,Station Number,Boarding,Line,Passenger Count
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-01 05:00:00,150,True,1,390
2010-01-01 05:00:00,150,False,1,187
2010-01-01 05:00:00,151,True,1,114
2010-01-01 05:00:00,151,False,1,40
2010-01-01 05:00:00,152,True,1,917
...,...,...,...,...
2011-01-01 00:00:00,825,False,8,159
2011-01-01 00:00:00,826,True,8,36
2011-01-01 00:00:00,826,False,8,131
2011-01-01 00:00:00,827,True,8,44


In [76]:
# Load data between 2008 and 2016
passenger_count_datasets = {}
for year in range(2008, 2017):
    print(f"Processing data for year: {year}...")
    file_path = os.path.join(PASSENGER_COUNT_RAW_DATA_DIR, f"{year}.csv")
    data = pd.read_csv(file_path, encoding='euc-kr')
    cleaned_data = clean_passenger_count_data(data)
    passenger_count_datasets[year] = cleaned_data
print("Done!")

Processing data for year: 2008...
Processing data for year: 2009...
Processing data for year: 2010...
Processing data for year: 2011...
Processing data for year: 2012...
Processing data for year: 2013...
Processing data for year: 2014...
Processing data for year: 2015...


  data = pd.read_csv(file_path, encoding='euc-kr')


Processing data for year: 2016...
Done!


In [77]:
# Load 2017
print("Processing data for year: 2017...")
df = pd.read_csv(os.path.join(PASSENGER_COUNT_RAW_DATA_DIR, '2017.csv'), encoding='euc-kr')
# Drop unnecessary columns
df.drop(columns=['구분'], inplace=True)

passenger_count_datasets[2017] = clean_passenger_count_data(df, includes_line=True, includes_total=True)

# Load 2018
print("Processing data for year: 2018...")
df = pd.read_csv(os.path.join(PASSENGER_COUNT_RAW_DATA_DIR, '2018.csv'), encoding='euc-kr')

passenger_count_datasets[2018] = clean_passenger_count_data(df, includes_line=True, includes_total=True)

# Load 2019
print("Processing data for year: 2019...")
df = pd.read_csv(os.path.join(PASSENGER_COUNT_RAW_DATA_DIR, '2019.csv'), encoding='euc-kr')

passenger_count_datasets[2019] = clean_passenger_count_data(df, includes_line=True, includes_total=True)

print("Done!")


Processing data for year: 2017...
Processing data for year: 2018...
Processing data for year: 2019...
Done!


In [78]:
# Load 2020
print("Processing data for year: 2020...")
df = pd.read_csv(os.path.join(PASSENGER_COUNT_RAW_DATA_DIR, '2020.csv'), encoding='euc-kr')

passenger_count_datasets[2020] = clean_passenger_count_data(df, includes_line=True)

# Load 2021
print("Processing data for year: 2021...")
df = pd.read_csv(os.path.join(PASSENGER_COUNT_RAW_DATA_DIR, '2021.csv'), encoding='euc-kr')
df.drop(columns=['연번'], inplace=True)

passenger_count_datasets[2021] = clean_passenger_count_data(df, includes_line=True, includes_total=True, includes_24=False)

# Load 2022
print("Processing data for year: 2022...")
df = pd.read_csv(os.path.join(PASSENGER_COUNT_RAW_DATA_DIR, '2022.csv'), encoding='euc-kr')
df.drop(columns=['연번'], inplace=True)

passenger_count_datasets[2022] = clean_passenger_count_data(df, includes_line=True, includes_total=False)

# Load 2023
print("Processing data for year 2023...")
df = pd.read_csv(os.path.join(PASSENGER_COUNT_RAW_DATA_DIR, '2023.csv'), encoding='euc-kr')
df.drop(columns=['연번'], inplace=True)

passenger_count_datasets[2023] = clean_passenger_count_data(df, includes_line=True)

# Load 2024
print("Processing data for year: 2024...")
df = pd.read_csv(os.path.join(PASSENGER_COUNT_RAW_DATA_DIR, '2024.csv'), encoding='euc-kr')
df.drop(columns=['연번'], inplace=True)

passenger_count_datasets[2024] = clean_passenger_count_data(df, includes_line=True)

print("All data processing complete!")

Processing data for year: 2020...
Processing data for year: 2021...
Processing data for year: 2022...


  df = pd.read_csv(os.path.join(PASSENGER_COUNT_RAW_DATA_DIR, '2022.csv'), encoding='euc-kr')


Processing data for year 2023...
Processing data for year: 2024...
All data processing complete!


## Combine Data

In [79]:
# Merge all datasets into a single DataFrame
passenger_count_all_data = pd.concat(passenger_count_datasets.values())
passenger_count_all_data

Unnamed: 0_level_0,Station Number,Boarding,Line,Passenger Count
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2008-01-01 05:00:00,150,True,1,379
2008-01-01 05:00:00,150,False,1,145
2008-01-01 05:00:00,151,True,1,131
2008-01-01 05:00:00,151,False,1,35
2008-01-01 05:00:00,152,True,1,1287
...,...,...,...,...
2025-01-01 00:00:00,826,False,8,68
2025-01-01 00:00:00,827,True,8,50
2025-01-01 00:00:00,827,False,8,128
2025-01-01 00:00:00,828,True,8,46


## Save Formatted Data

In [80]:
passenger_count_all_data.to_csv(os.path.join(PASSENGER_COUNT_FORMATTED_DATA_DIR, 'passenger-count-2008-2024.csv'))