In [None]:
from functools import reduce
from collections import Counter
from glob import glob

import pandas as pd
from tqdm import tqdm
import dill

In [None]:
# Sort the matches so the processing order (and therefore the insertion order of
# the dicts/Counters built from it) is reproducible; glob() on its own returns
# paths in arbitrary, filesystem-dependent order.
file_paths = sorted(glob("./data/raw_data/*.csv"))

In [None]:
def read_csv(file_path):
    """Load a CSV file, probing a fixed list of text encodings.

    The encodings "utf-8" and "cp949" are tried in that order.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        ``(df, encoding)`` for the first encoding that decodes the file, where
        ``df`` is the parsed ``pandas.DataFrame``. ``(None, None)`` is returned
        when pandas raises ``ParserError`` or when none of the candidate
        encodings can decode the file.
    """
    encodings = ("utf-8", "cp949")
    for encoding in encodings:
        try:
            df = pd.read_csv(file_path, low_memory=False, encoding=encoding)
        except UnicodeDecodeError:
            # Wrong encoding guess: move on to the next candidate.
            continue
        except pd.errors.ParserError:
            return (None, None)
        return (df, encoding)

    # Every candidate failed to decode. Return the same sentinel as a parse
    # failure so callers that unpack the result do not crash on a bare None.
    return (None, None)

In [None]:
# Per-file bookkeeping, keyed by file path:
#   file_encoding_dict -> encoding that worked, or "ParseError"
#   file_column_dict   -> list of column names, or "ParseError"
file_encoding_dict = {}
file_column_dict = {}

for file_path in tqdm(file_paths):
    df, encoding = read_csv(file_path)

    if df is None:
        # read_csv signalled failure: record the same marker in both dicts.
        file_encoding_dict[file_path] = file_column_dict[file_path] = "ParseError"
        continue

    file_encoding_dict[file_path] = encoding
    file_column_dict[file_path] = list(df.columns)

In [None]:
# How many files ended up under each encoding (or the "ParseError" marker).
Counter(file_encoding_dict.values())

In [None]:
# NOTE(review): this rebinds the name `read_csv`. The encoding-probing function
# defined earlier in the notebook returns a (df, encoding) tuple; this
# replacement returns only the DataFrame, so re-running the file-scanning loop
# after this cell would no longer receive the tuple it unpacks.
# The encoding is fixed to cp949 here - presumably chosen from the encoding
# counts computed above; TODO confirm.
read_csv = lambda file_path: pd.read_csv(file_path, low_memory=False, encoding='cp949')
# Persist the single-encoding loader to ./read_csv.pkl with dill.
with open("./read_csv.pkl", 'wb') as f:
    dill.dump(read_csv, f)

In [None]:
# One list of column names per successfully read file. Files that could not be
# read are stored in file_column_dict as the string "ParseError"; they are
# skipped here because a bare string would break list concatenation and would
# be iterated character by character by the per-column preprocessing below.
columns_list = [
    cols for cols in file_column_dict.values() if isinstance(cols, list)
]

In [None]:
# Count every raw column name across all files. A generator is used instead of
# sum(columns_list, []) because that form re-copies the accumulated list on
# every step; non-list entries (the "ParseError" placeholder) are skipped.
Counter(
    col
    for cols in columns_list
    if isinstance(cols, list)
    for col in cols
)

In [None]:
# Keep only the first whitespace-delimited token of a column name.
get_first = lambda x: x.split()[0]

# Apply get_first to every name in one file's column list.
preproc = lambda cols: [get_first(col) for col in cols]
# Flatten lazily rather than with sum(..., []) (which re-copies the accumulated
# list at each step) and skip "ParseError" placeholder strings, which preproc
# would otherwise walk character by character.
Counter(
    name
    for cols in columns_list
    if isinstance(cols, list)
    for name in preproc(cols)
)

In [None]:
# Keep only the first whitespace-delimited token of a column name.
get_first = lambda x: x.split()[0]
# Remove leading/trailing whitespace.
do_strip = lambda x: x.strip()

# Apply get_first, then do_strip, to every name in one file's column list.
preproc = lambda cols: [do_strip(get_first(col)) for col in cols]
# Flatten lazily rather than with sum(..., []) (which re-copies the accumulated
# list at each step) and skip "ParseError" placeholder strings, which preproc
# would otherwise walk character by character.
Counter(
    name
    for cols in columns_list
    if isinstance(cols, list)
    for name in preproc(cols)
)

In [None]:
# Keep only the first whitespace-delimited token of a column name.
get_first = lambda x: x.split()[0]
# Remove leading/trailing whitespace.
do_strip = lambda x: x.strip()
# Keep the text before the first "(".
drop_bracket = lambda x : x.split("(")[0]

# Apply get_first -> do_strip -> drop_bracket to every name in one column list.
preproc = lambda cols: [drop_bracket(do_strip(get_first(col))) for col in cols]
# Flatten lazily rather than with sum(..., []) (which re-copies the accumulated
# list at each step) and skip "ParseError" placeholder strings, which preproc
# would otherwise walk character by character.
Counter(
    name
    for cols in columns_list
    if isinstance(cols, list)
    for name in preproc(cols)
)

In [None]:
def composer(*funcs):
    """Compose single-argument callables, applied left to right.

    ``composer(f, g, h)(x)`` evaluates ``h(g(f(x)))``.

    Args:
        *funcs: Callables that each take one argument. May be empty.

    Returns:
        A function of one argument that feeds its input through every callable
        in order. With no callables the result is the identity function (a
        ``reduce`` without an initial value would raise ``TypeError`` there).
    """
    def composed(x):
        # A plain loop avoids one nested call frame per composed function.
        for func in funcs:
            x = func(x)
        return x

    return composed


def get_first(x):
    """Return the first whitespace-delimited token of ``x``.

    Args:
        x: A string, e.g. a raw column name.

    Returns:
        The first token, or ``""`` when ``x`` is empty or contains only
        whitespace (indexing the empty split result would raise ``IndexError``).
    """
    # maxsplit=1 is enough: only the first token is needed.
    tokens = x.split(None, 1)
    return tokens[0] if tokens else ""


def do_strip(x):
    """Return ``x`` with leading and trailing whitespace removed."""
    trimmed = x.strip()
    return trimmed


def drop_bracket(x):
    """Return the part of ``x`` that precedes the first "(".

    If ``x`` contains no "(", it is returned unchanged.
    """
    head, _sep, _tail = x.partition("(")
    return head

# Per-column normaliser: get_first, then do_strip, then drop_bracket.
# NOTE: unlike the earlier exploratory `preproc` lambdas, which took a list of
# column names, this composed function takes a single column-name string.
preproc = composer(get_first, do_strip, drop_bracket)
# The list-based call from the previous cells is left disabled; it maps preproc
# over whole column lists, which this string-level version does not accept:
# Counter(sum(list(map(preproc,columns_list)),[]))

In [None]:
import dill

# Serialise the composed column normaliser to ./preproc_column.pkl.
with open("./preproc_column.pkl", mode="wb") as pickle_file:
    dill.dump(preproc, pickle_file)

In [None]:
# Column names of interest, each mapped to the same value, 100.
# NOTE(review): what 100 represents is not evident from this notebook - TODO confirm.
columns = dict.fromkeys(
    (
        # base_cols
        '자료생성년월',
        '사업자등록번호',
        # corp_cols
        '사업장명',
        '사업장가입상태코드',
        '사업장형태구분코드',
        '사업장업종코드',
        '사업장업종코드명',
        '적용일자',
        '재등록일자',
        '탈퇴일자',
        # addr_cols
        '사업장지번상세주소',
        '사업장도로명상세주소',
        '고객법정동주소코드',
        '고객행정동주소코드',
        '법정동주소광역시도코드',
        '법정동주소광역시시군구코드',
        '법정동주소광역시시군구읍면동코드',
        '우편번호',
        # nps_cols (National Pension Service figures)
        '가입자수',
        '당월고지금액',
        '신규취득자수',
        '상실가입자수',
    ),
    100,
)

# Serialise the column mapping to ./columns.pkl.
with open("./columns.pkl", mode="wb") as pickle_file:
    dill.dump(columns, pickle_file)