In [53]:
import numpy as np
import pandas as pd

In [34]:
# Read the metro CSV; its first column is used as the row index.
metro_csv_path = "./../data/preproc/metro/metro_2017.csv"
df = pd.read_csv(metro_csv_path, index_col=0)

In [35]:
class MetroPreproc:
    """Parse the date column and sum the hourly counts per date/station/kind.

    Calling an instance works on a copy of the frame given to the
    constructor, so that frame is left untouched.

    NOTE(review): every input row is kept by the merge, so rows that differ
    only in "호선" come out as repeated rows once that column is dropped --
    confirm this is intended.
    """

    def __init__(self, metro_df, time_columns) -> None:
        # Raw frame and the names of the hourly count columns to aggregate.
        self.metro_df = metro_df
        self.time_columns = time_columns

    @staticmethod
    def __format_date(df):
        """Convert the "날짜" column of ``df`` to datetime (in place) and return ``df``."""
        parsed_dates = pd.to_datetime(df["날짜"])
        df["날짜"] = parsed_dates
        return df

    @staticmethod
    def __concat_line(df, time_columns):
        """Swap each row's hourly counts for the group totals and drop "호선".

        Groups are defined by ("날짜", "역명", "구분"); the totals are joined
        back onto the non-hourly columns of ``df``.
        """
        group_keys = ["날짜", "역명", "구분"]
        descriptive_part = df.drop(columns=time_columns)
        summed_counts = df.groupby(group_keys)[time_columns].sum().reset_index()
        merged = pd.merge(
            left=descriptive_part, right=summed_counts, on=group_keys
        )
        return merged.drop(columns=["호선"])

    def __call__(self):
        """Run date parsing and line aggregation on copies of the stored inputs."""
        working_df = self.metro_df.copy()
        hour_columns = self.time_columns.copy()
        return self.__concat_line(self.__format_date(working_df), hour_columns)

In [36]:
# Hourly column labels "05~06" ... "23~24", followed by the "00~01" slot.
time_columns = ["{:02d}~{:02d}".format(hour, hour + 1) for hour in range(5, 24)]
time_columns.append("00~01")

# Build the preprocessor and run it to get the aggregated frame.
metro_preproc = MetroPreproc(df, time_columns)
metro_df = metro_preproc()

In [115]:
class CalcMetroOutlierScore:
    """Score how far each station's busiest alighting day is above its average.

    Only rows whose "구분" equals "하차" are used. The hourly columns of each
    row are summed into "total_out"; a station's score is its maximum
    "total_out" divided by its mean "total_out".
    """

    def __init__(self, metro_df, time_columns=None) -> None:
        """Filter the alighting rows and pre-compute their row totals.

        Args:
            metro_df: frame with "역명", "구분", "날짜" and the hourly columns.
            time_columns: names of the hourly columns to sum. When omitted,
                the notebook-level ``time_columns`` list is used, so existing
                one-argument calls keep working.
        """
        if time_columns is None:
            # Backward-compatible fallback to the notebook global; passing the
            # columns explicitly avoids depending on cell execution order.
            time_columns = globals()["time_columns"]
        self.metro_df = metro_df
        self.time_columns = list(time_columns)
        self.metro_out_df = self.__append_total_out(
            self.__filter_out(metro_df), self.time_columns
        )

    @staticmethod
    def __filter_out(metro_df):
        """Return a copy of the rows whose "구분" is "하차"."""
        return metro_df[metro_df["구분"] == "하차"].copy()

    @staticmethod
    def __append_total_out(metro_out_df, time_columns):
        """Add a "total_out" column holding the row-wise sum of ``time_columns``."""
        metro_out_df["total_out"] = metro_out_df[time_columns].sum(axis=1)
        return metro_out_df

    def calc_max_out(self):
        """Return the largest "total_out" per station (Series indexed by "역명")."""
        return self.metro_out_df.groupby("역명")["total_out"].max()

    def calc_mean_out(self):
        """Return the mean "total_out" per station (Series indexed by "역명")."""
        return self.metro_out_df.groupby("역명")["total_out"].mean()

    def calc_max_out_date(self):
        """Return, per station, the "날짜" of the row with the largest "total_out"."""
        out_df = self.metro_out_df
        # idxmax yields one row label per station; .loc only reads, so no copy is needed.
        busiest_rows = out_df.loc[out_df.groupby("역명")["total_out"].idxmax(), :]
        return busiest_rows.set_index("역명")["날짜"]

    def __call__(self):
        """Return the per-station ratio of maximum to mean "total_out"."""
        return self.calc_max_out().div(self.calc_mean_out())

In [117]:
# Build the scorer once, then place each station's busiest date next to its score.
calc_metro_outlier_score = CalcMetroOutlierScore(metro_df)

busiest_date_by_station = calc_metro_outlier_score.calc_max_out_date()
outlier_score_by_station = calc_metro_outlier_score()
metro_outlier_df = pd.concat(
    [busiest_date_by_station, outlier_score_by_station], axis=1
)

In [37]:
class MetroWeekdayFilter:
    """Split a metro frame by day of week using its "날짜" column.

    Weekday numbers 0-3 form the "wkd" subset, 4 is Friday, 5 is Saturday and
    6 is Sunday. All subsets are taken from the frame passed to the
    constructor, not from any notebook-level variable.
    """

    def __init__(self, metro_df) -> None:
        self.metro_df = metro_df
        # Weekday number of every row, aligned with metro_df's index.
        self.weekday_series = self.__calc_weekday_series(metro_df)

    @staticmethod
    def __calc_weekday_series(df):
        """Return the weekday number of each "날짜" value in ``df``."""
        # to_datetime is a no-op for datetime columns and lets .dt work on
        # columns that still hold date strings or plain date objects.
        return pd.to_datetime(df["날짜"]).dt.weekday

    def __select_days(self, weekdays):
        """Return the rows of the stored frame whose weekday is in ``weekdays``."""
        return self.metro_df[self.weekday_series.isin(weekdays)]

    def get_wkd_df(self):
        """Return the rows with weekday 0, 1, 2 or 3."""
        return self.__select_days([0, 1, 2, 3])

    def get_fri_df(self):
        """Return the rows with weekday 4."""
        return self.__select_days([4])

    def get_sat_df(self):
        """Return the rows with weekday 5."""
        return self.__select_days([5])

    def get_sun_df(self):
        """Return the rows with weekday 6."""
        return self.__select_days([6])

In [38]:
metro_weekday_filter = MetroWeekdayFilter(metro_df)

# One subset per day type, in the order wkd, fri, sat, sun.
wkd_df, fri_df, sat_df, sun_df = (
    metro_weekday_filter.get_wkd_df(),
    metro_weekday_filter.get_fri_df(),
    metro_weekday_filter.get_sat_df(),
    metro_weekday_filter.get_sun_df(),
)

In [31]:
class MetroTimelyConcat:
    """Mean hourly counts per station and kind, grouped by year, month or week.

    Each public method returns a frame with the columns "역명", "구분", the
    period column ("year", "month" or "week") and the hourly columns, whose
    means are cast to int.
    """

    @staticmethod
    def _period_mean(metro_df, period, time_columns=None):
        """Shared implementation: group by ``period`` and average the hourly columns.

        Args:
            metro_df: frame with "날짜", "역명", "구분" and the hourly columns.
            period: attribute read from each "날짜" value ("year", "month"
                or "week"); also used as the name of the new column.
            time_columns: hourly column names. When omitted, the
                notebook-level ``time_columns`` list is used.
        """
        if time_columns is None:
            # Backward-compatible fallback so one-argument calls keep working.
            time_columns = globals()["time_columns"]
        df = metro_df.copy()  # never add the period column to the caller's frame
        df[period] = df["날짜"].apply(lambda ts: getattr(ts, period))
        return (
            df.groupby(["역명", "구분", period])[list(time_columns)]
            .mean()
            .astype(int)
            .reset_index()
        )

    @staticmethod
    def yearly_mean(metro_df, time_columns=None):
        """Return the per-year means of the hourly columns."""
        return MetroTimelyConcat._period_mean(metro_df, "year", time_columns)

    @staticmethod
    def monthly_mean(metro_df, time_columns=None):
        """Return the per-month means of the hourly columns."""
        return MetroTimelyConcat._period_mean(metro_df, "month", time_columns)

    @staticmethod
    def weekly_mean(metro_df, time_columns=None):
        """Return the per-week means of the hourly columns."""
        return MetroTimelyConcat._period_mean(metro_df, "week", time_columns)

In [44]:
# wkd_average_out_df = (
#     wkd_df[wkd_df["구분"] == "하차"].groupby("역명")[time_columns].mean()
# )

# fri_average_out_df = (
#     fri_df[fri_df["구분"] == "하차"].groupby("역명")[time_columns].mean()
# )

# sat_average_out_df = (
#     sat_df[sat_df["구분"] == "하차"].groupby("역명")[time_columns].mean()
# )


Unnamed: 0_level_0,05~06,06~07,07~08,08~09,09~10,10~11,11~12,12~13,13~14,14~15,15~16,16~17,17~18,18~19,19~20,20~21,21~22,22~23,23~24,00~01
역명,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
가락시장,114.894231,430.682692,993.769231,3095.778846,1307.475962,798.971154,797.836538,799.826923,859.471154,906.408654,985.317308,1053.774038,1257.778846,1863.259615,1641.115385,1060.750000,923.168269,879.548077,515.490385,210.384615
가산디지털단지,175.067308,878.687500,5154.889423,18275.677885,8348.615385,2112.831731,1410.076923,1279.432692,1311.947115,1195.865385,1099.177885,1125.067308,1190.331731,1334.240385,1147.725962,592.346154,455.841346,403.543269,277.605769,113.346154
강남,486.778846,2513.250000,8327.341346,15525.793269,14131.490385,6430.509615,5146.000000,4692.634615,5850.769231,5066.826923,4826.052885,4927.567308,6070.841346,9538.519231,7590.841346,3116.620192,2304.735577,1972.870192,1164.480769,425.754808
강남구청,62.149038,484.711538,1316.692308,4625.375000,3554.105769,1256.038462,758.610577,638.788462,706.418269,630.653846,574.913462,587.298077,625.153846,774.990385,606.187500,356.375000,292.932692,240.019231,169.302885,93.000000
강동,50.158654,557.721154,857.024038,1460.379808,884.211538,612.870192,551.423077,577.471154,636.870192,673.682692,809.307692,956.038462,1214.884615,1922.360577,1983.120192,1224.745192,1106.158654,1120.394231,720.711538,330.033654
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
홍제,56.336538,330.105769,512.192308,1290.115385,990.725962,783.360577,718.298077,789.923077,896.350962,933.653846,1086.899038,1281.201923,1529.485577,2317.610577,2373.341346,1403.355769,1263.754808,1241.451923,772.673077,265.048077
화곡,65.038462,354.725962,615.639423,1271.418269,1020.682692,702.259615,727.033654,815.423077,918.533654,1035.177885,1297.139423,1674.875000,2256.576923,4214.394231,4417.591346,2669.182692,2176.293269,2175.206731,1351.009615,421.927885
화랑대(서울여대입구),33.524038,143.596154,313.769231,758.038462,494.009615,351.038462,333.447115,334.000000,342.596154,364.730769,406.461538,572.442308,690.740385,1381.500000,1527.913462,921.110577,770.721154,888.668269,588.629808,220.062500
회현(남대문시장),32.990385,1139.403846,2698.538462,6766.298077,4246.038462,3124.163462,3337.865385,2654.144231,2260.908654,2163.307692,1830.158654,1370.721154,1032.461538,922.312500,575.894231,329.504808,337.908654,326.019231,218.423077,78.201923


In [132]:
metro_timely_concat = MetroTimelyConcat()

# Yearly means for each day-type subset, in the order wkd, fri, sat, sun.
(
    wkd_yearly_mean_df,
    fri_yearly_mean_df,
    sat_yearly_mean_df,
    sun_yearly_mean_df,
) = (
    metro_timely_concat.yearly_mean(day_df)
    for day_df in (wkd_df, fri_df, sat_df, sun_df)
)

In [None]:
class ProcessingDiff:
    """Unimplemented stub.

    NOTE(review): this class was left with an empty body, which does not
    parse. The docstring keeps the cell runnable until the class is written;
    its intended behavior is not visible in this notebook.
    """

In [133]:
# Reshape to long format: one row per (station, kind, year, time slot), value column "wkd".
# This rebinds wkd_yearly_mean_df, so the cell is meant to run once per kernel session.
wkd_stacked = wkd_yearly_mean_df.set_index(["역명", "구분", "year"]).stack()
wkd_yearly_mean_df = wkd_stacked.rename_axis(
    ["역명", "구분", "year", "time"]
).reset_index(name="wkd")

In [134]:
# Reshape to long format: one row per (station, kind, year, time slot), value column "fri".
# This rebinds fri_yearly_mean_df, so the cell is meant to run once per kernel session.
fri_stacked = fri_yearly_mean_df.set_index(["역명", "구분", "year"]).stack()
fri_yearly_mean_df = fri_stacked.rename_axis(
    ["역명", "구분", "year", "time"]
).reset_index(name="fri")

In [135]:
# Join the two long tables on their shared keys so "wkd" and "fri" sit side by side.
join_keys = ["역명", "구분", "year", "time"]
wkd_fri_yearly_mean_df = wkd_yearly_mean_df.merge(fri_yearly_mean_df, on=join_keys)