In [1]:
import os
from glob import glob

import pandas as pd
from tqdm import tqdm

In [2]:
class DongCodePreproc:
    """Derive a ``code -> name`` lookup from an administrative-dong code table.

    The table is expected to carry the columns ``행정동코드``, ``시도명``,
    ``시군구명`` and ``읍면동명``.  The frame given to the constructor is not
    modified; all derived columns are added to a copy.
    """

    def __init__(self, dong_code_df) -> None:
        self.dong_code_df = dong_code_df

    @staticmethod
    def __format_code(dong_code_df):
        """Add a ``code`` column: ``행정동코드`` as text without its last two characters.

        The frame is modified in place and also returned.
        """
        code_text = dong_code_df["행정동코드"].astype(str)
        # NOTE(review): presumably trims a 10-digit code down to the 8-digit
        # form used by the population files — confirm against the data.
        dong_code_df["code"] = code_text.apply(lambda text: text[:-2])
        return dong_code_df

    @staticmethod
    def __format_name(dong_code_df):
        """Add a ``name`` column joining the three name levels with ``_``.

        Missing levels are replaced by an empty string first, so the result
        always contains two underscores (e.g. ``"A_B_"`` when the last level
        is missing).  The frame is modified in place and also returned.
        """
        name_parts = dong_code_df[["시도명", "시군구명", "읍면동명"]].fillna("")
        dong_code_df["name"] = name_parts.agg("_".join, axis=1)
        return dong_code_df

    def get_code_name_dict(self):
        """Return ``{code: name}`` built from a copy of the stored table."""
        working = self.dong_code_df.copy()
        working = self.__format_name(self.__format_code(working))
        return working.set_index("code")["name"].to_dict()

In [3]:
class PopulationPreproc:
    """Normalise a raw living-population frame for per-dong analysis.

    Calling the instance returns a new frame in which:

    * ``기준일ID`` and ``시간대구분`` are replaced by one datetime column ``date``;
    * ``행정동코드`` is cast to ``str``;
    * the given male / female columns are replaced by their row-wise totals
      ``man`` / ``woman``;
    * the index is reset to a fresh ``RangeIndex``.

    The frame passed to the constructor is never modified, so the instance can
    be called repeatedly and the caller's frame stays intact.
    """

    def __init__(self, ppltn_df) -> None:
        self.ppltn_df = ppltn_df

    @staticmethod
    def __format_date(ppltn_df):
        """Combine ``기준일ID`` + zero-padded ``시간대구분`` into a ``date`` column.

        The two values are concatenated as text and parsed with the format
        ``%Y%m%d%H``.  ``ppltn_df`` must be a private working copy: the new
        column is assigned onto it before the source columns are dropped.
        """
        day = ppltn_df["기준일ID"].astype(str)
        # Pad to two characters so a single-digit value still matches ``%H``.
        hour = ppltn_df["시간대구분"].astype(str).str.zfill(2)
        ppltn_df["date"] = pd.to_datetime(day + hour, format="%Y%m%d%H")
        return ppltn_df.drop(columns=["기준일ID", "시간대구분"])

    @staticmethod
    def __format_code(ppltn_df):
        """Cast ``행정동코드`` to ``str`` (``ppltn_df`` must be a working copy)."""
        ppltn_df["행정동코드"] = ppltn_df["행정동코드"].astype(str)
        return ppltn_df

    @staticmethod
    def __sum_columns(ppltn_df, columns, total_name):
        """Replace ``columns`` by their row-wise sum stored as ``total_name``.

        ``ppltn_df`` must be a private working copy; the total is appended as
        the last column before the summed columns are dropped.
        """
        ppltn_df[total_name] = ppltn_df[columns].sum(axis=1)
        return ppltn_df.drop(columns=columns)

    def __call__(self, man_columns, woman_columns):
        """Return the preprocessed frame.

        Args:
            man_columns: column names summed into ``man`` and then dropped.
            woman_columns: column names summed into ``woman`` and then dropped.

        Returns:
            A new DataFrame; ``self.ppltn_df`` is left unchanged.
        """
        # Work on a copy: the helpers assign columns onto the frame they get,
        # and the stored frame must survive for repeated calls.
        df = self.ppltn_df.copy()
        df = self.__format_date(df)
        df = self.__format_code(df)
        df = self.__sum_columns(df, man_columns, "man")
        df = self.__sum_columns(df, woman_columns, "woman")
        return df.reset_index(drop=True)

In [4]:
class CalcDongOutlierScore:
    """Per-dong summary statistics of ``총생활인구수``.

    Every method groups the stored frame by ``행정동코드`` and returns a Series
    indexed by that code.
    """

    def __init__(self, ppltn_df) -> None:
        self.ppltn_df = ppltn_df

    def _total_by_dong(self):
        """Return ``총생활인구수`` grouped by ``행정동코드``."""
        return self.ppltn_df.groupby("행정동코드")["총생활인구수"]

    def calc_mean_size(self):
        """Return the mean ``총생활인구수`` of each dong."""
        return self._total_by_dong().mean()

    def calc_max_size(self):
        """Return the maximum ``총생활인구수`` of each dong."""
        return self._total_by_dong().max()

    def calc_max_size_time(self):
        """Return, per dong, the ``date`` of the row holding its maximum.

        ``idxmax`` yields index labels, so the lookup relies on the frame's
        index labels identifying rows unambiguously.
        """
        peak_labels = self._total_by_dong().idxmax()
        peak_rows = self.ppltn_df.loc[peak_labels, :]
        return peak_rows.set_index("행정동코드")["date"]
    

In [5]:
# Age bands appearing in the per-sex population column names: "0세부터9세",
# the five-year bands "10세부터14세" ... "65세부터69세", and "70세이상".
AGE_BANDS = (
    ["0세부터9세"]
    + [f"{start}세부터{start + 4}세" for start in range(10, 70, 5)]
    + ["70세이상"]
)

# One column name per age band for each sex prefix ("남자" / "여자").
man_columns = [f"남자{band}생활인구수" for band in AGE_BANDS]

woman_columns = [f"여자{band}생활인구수" for band in AGE_BANDS]

In [6]:
# Read the administrative-dong code table and derive the code -> name lookup
# that is later applied to the index of the final result.
DONG_CODE_XLSX = "./../data/asset/KIKcd_H.20240208.xlsx"
dong_code_df = pd.read_excel(DONG_CODE_XLSX)

dong_code_preproc = DongCodePreproc(dong_code_df)
code_name_dict = dong_code_preproc.get_code_name_dict()

In [7]:
# The final CSV written at the end of this notebook lands in this same
# directory.  Without the filter below, a second full run would pick that
# output up as an input file, and it has none of the raw population columns
# the preprocessing expects.
OUTPUT_FILE_NAME = "population_v1_final.csv"
file_paths = sorted(
    path
    for path in glob("./../data/preproc/population/*.csv")
    if os.path.basename(path) != OUTPUT_FILE_NAME
)

In [8]:
# One summary frame per input file; the frames are combined after the loop.
results = []
for file_path in tqdm(file_paths):
    # Load the file and normalise it (date column, string codes, man/woman totals).
    df = pd.read_csv(file_path, index_col=0)
    population_preproc = PopulationPreproc(df)
    ppltn_df = population_preproc(man_columns, woman_columns)

    # Per-dong peak, mean, and time of peak of the total living population.
    calc_dong_outlier_score = CalcDongOutlierScore(ppltn_df)
    dong_max_size = calc_dong_outlier_score.calc_max_size()
    dong_mean_size = calc_dong_outlier_score.calc_mean_size()
    dong_max_size_time = calc_dong_outlier_score.calc_max_size_time()

    # Outlier score = peak / mean.  Both operands are named "총생활인구수", so
    # the ratio column keeps that name even though it is no longer a count.
    dong_outlier_score = dong_max_size / dong_mean_size
    dong_outlier_df = pd.concat([dong_outlier_score, dong_max_size_time], axis=1)
    results.append(dong_outlier_df)

  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:30<00:00, 10.09s/it]


In [9]:
# Stack the per-file summaries row-wise; a dong code appears once per file.
dong_final_df = pd.concat(results, axis=0)

In [10]:
# Relabel the index from dong codes to readable names.  ``rename`` is used
# instead of assigning ``index.map(...)`` because it leaves labels that are not
# keys of ``code_name_dict`` untouched.  That has two effects:
#   * a code missing from the lookup stays visible instead of becoming NaN;
#   * re-running this cell does nothing (as long as no name is itself a key),
#     whereas mapping a second time would turn every mapped name into NaN.
dong_final_df = dong_final_df.rename(index=code_name_dict)

In [11]:
# Persist the final table.  Note that this file is written into the same
# directory the input glob reads its ``*.csv`` files from.
final_csv_path = "./../data/preproc/population/population_v1_final.csv"
dong_final_df.to_csv(final_csv_path)