In [None]:
import os
import re
import json
from glob import glob

import pandas as pd
from tqdm import tqdm

In [None]:
# Params
# Key column included in every filtered table (store ID number).
BASE_COLS = [
    "상가업소번호",
]

# Feature column groups; each group is combined with BASE_COLS and written
# to its own output file.
# Business name plus major / middle / minor industry category names.
CATEGORY_COLS = [
    "상호명",
    "상권업종대분류명",
    "상권업종중분류명",
    "상권업종소분류명",
]
# Administrative address hierarchy names.
ADDR_COLS = [
    "시도명",
    "시군구명",
    "행정동명",
    "법정동명",
]
# Road name, latitude, longitude.
ADDR_DETAIL_COLS = [
    "도로명",
    "위도",
    "경도",
]

In [None]:
# Loads
# Mapping used by read_csv: CSV file path -> keyword arguments for pd.read_csv.
# NOTE(review): no explicit encoding is passed to open(), so the platform
# default is used -- confirm the JSON file is readable that way everywhere.
with open('./data/file_read_params.json', 'r') as f:
    file_read_params = json.load(f)

# glob returns matches in arbitrary order; sort so the processing order (and
# therefore which directory ends up first in failed_dir_paths) is reproducible.
dir_paths = sorted(glob("./data/raw_data/*"))

In [None]:
# Functions

def get_time(dir_paths):
    """Extract the time tag from a raw-data directory path.

    The tag is the last run of six consecutive digits found in the final
    path component (presumably a YYYYMM stamp -- confirm against the
    directory naming scheme).

    Args:
        dir_paths: Path of a single directory (despite the plural name).

    Returns:
        The six-digit string.

    Raises:
        ValueError: If the directory name contains no run of six digits.
    """
    # normpath drops a trailing separator, which would otherwise make
    # basename return an empty string.
    dir_name = os.path.basename(os.path.normpath(dir_paths))
    matches = re.findall(r"\d{6}", dir_name)
    # Check the match list before indexing: indexing an empty list would
    # raise IndexError and make the ValueError below unreachable.
    if not matches:
        raise ValueError(f"no 6-digit date found in directory name: {dir_name!r}")
    return matches[-1]

def read_csv(file_path, file_read_params):
    """Load one CSV file with the reader options registered for its path.

    Args:
        file_path: Path of the CSV file; also the lookup key into
            ``file_read_params``.
        file_read_params: Mapping of file path -> dict of keyword arguments
            forwarded to ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.

    Raises:
        KeyError: If ``file_path`` has no entry in ``file_read_params``.
    """
    reader_kwargs = file_read_params[file_path]
    return pd.read_csv(file_path, low_memory=False, **reader_kwargs)

def read_csv_with_params(file_path):
    """Read ``file_path`` via ``read_csv`` using the notebook-level ``file_read_params``."""
    return read_csv(file_path, file_read_params)

    
def filter_df(df, base_cols, feature_cols):
    """Select the base columns followed by the feature columns, all rows.

    Args:
        df: Source DataFrame.
        base_cols: List of column labels placed first in the result.
        feature_cols: List of column labels placed after ``base_cols``.

    Returns:
        DataFrame containing only the requested columns, in that order.
    """
    selected_cols = base_cols + feature_cols
    return df.loc[:, selected_cols]


def filter_category_df(df):
    """Keep BASE_COLS plus CATEGORY_COLS."""
    return filter_df(df, BASE_COLS, CATEGORY_COLS)


def filter_addr_df(df):
    """Keep BASE_COLS plus ADDR_COLS."""
    return filter_df(df, BASE_COLS, ADDR_COLS)


def filter_addr_detail_df(df):
    """Keep BASE_COLS plus ADDR_DETAIL_COLS."""
    return filter_df(df, BASE_COLS, ADDR_DETAIL_COLS)

def append_time(df, time):
    """Return a copy of ``df`` with a ``"time"`` column set to ``time``.

    The input frame is left untouched; previously the column was written
    into the caller's DataFrame in place.

    Args:
        df: DataFrame to tag.
        time: Value stored in the ``"time"`` column (a scalar is broadcast
            to every row).

    Returns:
        A new DataFrame with the ``"time"`` column added, or replaced if it
        already existed.
    """
    return df.assign(time=time)

def preproc_category(df, time):
    """Filter to the category columns, then add the ``time`` column."""
    return append_time(filter_category_df(df), time)


def preproc_addr(df, time):
    """Filter to the address columns, then add the ``time`` column."""
    return append_time(filter_addr_df(df), time)


def preproc_addr_detail(df, time):
    """Filter to the address-detail columns, then add the ``time`` column."""
    return append_time(filter_addr_detail_df(df), time)

In [None]:
# Directories whose processing raised; filled by the loop below.
failed_dir_paths = []

for dir_path in tqdm(dir_paths):
    try:
        # get_time is inside the try so that a directory whose name it cannot
        # parse is recorded as failed instead of aborting the whole batch.
        time = get_time(dir_path)
        file_paths = glob(os.path.join(dir_path, "*.csv"))
        # pd.concat raises on an empty list, so a directory without CSV
        # files also ends up in failed_dir_paths.
        df = pd.concat([read_csv_with_params(file_path) for file_path in file_paths], axis=0)

        preproc_category(df, time).to_csv(f"./data/preproc_data/category_{time}.csv")
        preproc_addr(df, time).to_csv(f"./data/preproc_data/addr_{time}.csv")
        preproc_addr_detail(df, time).to_csv(f"./data/preproc_data/addr_detail_{time}.csv")
    except Exception as exc:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt / SystemExit can still stop this loop, and report
        # the error so the failure can be diagnosed.
        failed_dir_paths.append(dir_path)
        print(f"{dir_path}: {exc!r}")

In [None]:
# Display the directories recorded as failed by the batch loop.
failed_dir_paths

In [None]:
# Select the first failed directory for a manual re-run.
# Raises IndexError when failed_dir_paths is empty (i.e. nothing failed).
dir_path = failed_dir_paths[0]

In [None]:
# Re-run the pipeline for the selected directory without a try/except, so
# any error surfaces with its full traceback.
time = get_time(dir_path)
file_paths = glob(os.path.join(dir_path, "*.csv"))

df = pd.concat(list(map(read_csv_with_params, file_paths)), axis=0)

# (preprocessing function, output path) pairs, written in this order.
outputs = (
    (preproc_category, f"./data/preproc_data/category_{time}.csv"),
    (preproc_addr, f"./data/preproc_data/addr_{time}.csv"),
    (preproc_addr_detail, f"./data/preproc_data/addr_detail_{time}.csv"),
)
for preproc, out_path in outputs:
    preproc(df, time).to_csv(out_path)