In [1]:
import os
from glob import glob
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
class MetroPreproc:
    """Normalise a raw metro ridership frame before scoring.

    Calling the instance returns a new frame in which the "날짜" column has
    been parsed with ``pd.to_datetime``, every column listed in
    ``time_columns`` has been replaced by its sum over all rows sharing the
    same ("날짜", "역명", "구분") key, and the line column "호선" has been
    removed. The frame passed to the constructor is not modified.

    Args:
        metro_df: Frame holding at least the columns "날짜", "역명", "구분",
            "호선" and every name in ``time_columns``.
        time_columns: List of column names that hold the per-time-slot counts.
    """

    def __init__(self, metro_df, time_columns) -> None:
        self.metro_df = metro_df
        self.time_columns = time_columns

    @staticmethod
    def __format_date(df):
        """Parse the "날짜" column to datetimes on ``df`` itself and return it."""
        parsed_dates = pd.to_datetime(df["날짜"])
        df["날짜"] = parsed_dates
        return df

    @staticmethod
    def __concat_line(df, time_columns):
        """Replace the time-slot columns by their per-key sums and drop "호선".

        NOTE(review): the left side of the merge keeps one row per input row,
        so a key that occurs on several rows yields that many output rows, all
        carrying the same summed values. Confirm this repetition is intended.
        """
        group_keys = ["날짜", "역명", "구분"]
        summed_slots = df.groupby(group_keys)[time_columns].sum().reset_index()
        descriptive_part = df.drop(columns=time_columns)
        merged = pd.merge(left=descriptive_part, right=summed_slots, on=group_keys)
        return merged.drop(columns=["호선"])

    def __call__(self):
        """Run the date parsing and the per-key summation on private copies."""
        slot_columns = self.time_columns.copy()
        # Copy first: __format_date assigns into the frame it receives.
        working_df = self.__format_date(self.metro_df.copy())
        return self.__concat_line(working_df, slot_columns)

In [3]:
class CalcMetroOutlierScore:
    """Score how far each station's peak alighting total exceeds its average.

    On construction the rows whose "구분" equals "하차" are copied out of
    ``metro_df`` and given a "total_out" column: the row-wise sum of
    ``time_columns``. Calling the instance returns, per station ("역명"),
    the maximum "total_out" divided by the mean "total_out".

    Args:
        metro_df: Frame holding at least "날짜", "역명", "구분" and every
            column named in ``time_columns``. It is not modified.
        time_columns: Names of the columns summed into "total_out".

    Attributes:
        metro_out_df: The filtered copy with the extra "total_out" column.
    """

    def __init__(self, metro_df, time_columns) -> None:
        self.metro_df = metro_df
        self.time_columns = time_columns
        self.metro_out_df = self.__append_total_out(
            self.__filter_out(metro_df), time_columns
        )

    @staticmethod
    def __filter_out(metro_df):
        """Return a copy of the rows whose "구분" is "하차"."""
        # Copy so that adding "total_out" never writes into the caller's frame.
        metro_out_df = metro_df[metro_df["구분"] == "하차"].copy()
        return metro_out_df

    @staticmethod
    def __append_total_out(metro_out_df, time_columns):
        """Add "total_out", the row-wise sum of ``time_columns``, and return the frame."""
        metro_out_df["total_out"] = metro_out_df[time_columns].sum(axis=1)
        return metro_out_df

    def calc_max_out(self):
        """Return the largest "total_out" of each station, indexed by "역명"."""
        # Per-station maximum of the per-row alighting total.
        station_max_out_size = self.metro_out_df.groupby("역명")[
            "total_out"
        ].max()
        return station_max_out_size

    def calc_mean_out(self):
        """Return the mean "total_out" of each station, indexed by "역명"."""
        # Per-station average of the per-row alighting total.
        station_mean_out_size = self.metro_out_df.groupby("역명")[
            "total_out"
        ].mean()
        return station_mean_out_size

    def calc_max_out_date(self):
        """Return the "날짜" of each station's largest "total_out" row.

        The result is a Series indexed by "역명" with exactly one entry per
        station; when several rows tie for the maximum, the first one wins
        (pandas ``idxmax`` semantics).
        """
        # idxmax yields index *labels*. If the incoming frame carries repeated
        # labels (e.g. it was built with pd.concat), .loc would return every
        # row sharing a label and emit several dates per station. Looking the
        # rows up on a freshly numbered copy makes each label address one row.
        metro_out_df = self.metro_out_df.reset_index(drop=True)
        peak_row_labels = metro_out_df.groupby("역명")["total_out"].idxmax()
        station_max_out_date = metro_out_df.loc[peak_row_labels, :].set_index(
            "역명"
        )["날짜"]
        return station_max_out_date

    def __call__(self):
        """Return max("total_out") / mean("total_out") per station.

        The division follows pandas semantics, so a station whose mean is 0
        yields NaN or inf rather than raising.
        """
        max_out = self.calc_max_out()
        mean_out = self.calc_mean_out()
        outlier_out_score = max_out.div(mean_out)
        return outlier_out_score

In [4]:
# Code: score every preprocessed metro CSV and combine the results

In [5]:
# Names of the per-time-slot count columns that are summed for each row.
time_columns = [
    "05~06", "06~07", "07~08", "08~09", "09~10", "10~11", "11~12",
    "12~13", "13~14", "14~15", "15~16", "16~17", "17~18", "18~19",
    "19~20", "20~21", "21~22", "22~23", "23~24", "00~01",
]

# The combined result is saved into the very directory that is globbed here.
# Without this exclusion a second "Restart & Run All" would pick up the
# notebook's own output as if it were an input file.
OUTPUT_FILE_NAME = "metro_v1_final.csv"

file_paths = sorted(
    path
    for path in glob("./../data/preproc/metro/*.csv")
    if os.path.basename(path) != OUTPUT_FILE_NAME
)

# One frame per input file: index "역명", the date of the peak row ("날짜")
# and the max/mean ratio (the ratio column inherits the name "total_out").
results = list()
for file_path in tqdm(file_paths):
    # NOTE(review): the recorded output shows a (truncated) warning raised by
    # this call - possibly a mixed-dtype warning. Confirm, and pin dtypes if so.
    df = pd.read_csv(file_path, index_col=0)

    metro_preproc = MetroPreproc(df, time_columns)
    metro_df = metro_preproc()

    calc_metro_outlier_score = CalcMetroOutlierScore(metro_df, time_columns)
    metro_outlier_df = pd.concat(
        [calc_metro_outlier_score.calc_max_out_date(), calc_metro_outlier_score()],
        axis=1,
    )
    results.append(metro_outlier_df)

  df = pd.read_csv(file_path, index_col=0)
100%|██████████| 6/6 [00:02<00:00,  2.04it/s]


In [6]:
# Stack the per-file station tables on top of each other (row-wise).
metro_final_df = pd.concat(results, axis=0)

In [7]:
# Save the combined table; the frame's index is written as the first CSV column.
metro_final_df.to_csv(path_or_buf="./../data/preproc/metro/metro_v1_final.csv")