In [1]:
from functools import reduce
from collections import Counter
from glob import glob

import pandas as pd
from tqdm import tqdm
import dill

In [2]:
# Collect every raw CSV to scan. glob() yields matches in arbitrary,
# filesystem-dependent order, so the result is sorted to keep the scan order
# (and the key order of the dictionaries built from it) reproducible.
file_paths = sorted(glob("./data/raw_data/*.csv"))

In [3]:
def read_csv(file_path):
    """Load a CSV file, trying each candidate encoding in turn.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file to read.

    Returns
    -------
    tuple
        ``(df, encoding)`` where ``df`` is the loaded DataFrame and
        ``encoding`` is the codec name that successfully decoded it.
        ``(None, None)`` when pandas raises ``ParserError`` or when none of
        the candidate encodings can decode the file.
    """
    encodings = ["utf-8", "cp949"]
    for encoding in encodings:
        try:
            df = pd.read_csv(file_path, low_memory=False, encoding=encoding)
        except UnicodeDecodeError:
            # Wrong codec for this file; fall through to the next candidate.
            continue
        except pd.errors.ParserError:
            return (None, None)
        else:
            return (df, encoding)
    # Every candidate failed to decode. Return the same failure pair as the
    # parser-error path so callers can always unpack two values.
    return (None, None)

In [4]:
# Scan each raw file once and record, per path, the encoding that decoded it
# and the list of header names it carries.
file_encoding_dict = {}
file_column_dict = {}

for file_path in tqdm(file_paths):
    df, encoding = read_csv(file_path)

    if df is None:
        # read_csv hands back a None frame for a file it could not load;
        # both dictionaries then store the "ParseError" placeholder string.
        file_encoding_dict[file_path] = "ParseError"
        file_column_dict[file_path] = "ParseError"
        continue

    file_encoding_dict[file_path] = encoding
    file_column_dict[file_path] = list(df.columns)

100%|██████████| 100/100 [02:38<00:00,  1.58s/it]


In [17]:
# Rebinds the name `read_csv`: from this cell on it is a one-argument lambda
# that always decodes as cp949 and returns only the DataFrame, replacing the
# earlier `read_csv` function that returned a `(df, encoding)` pair.
# NOTE(review): code that unpacks `df, encoding = read_csv(...)` will not get
# that pair back once this cell has run — consider a distinct name; confirm
# nothing else relies on the rebinding.
read_csv = lambda file_path: pd.read_csv(file_path, low_memory=False, encoding='cp949')
# Serialize the reader with dill into ./read_csv.pkl (presumably for reuse
# outside this notebook — confirm against the consumers of that file).
with open("./read_csv.pkl", 'wb') as f:
    dill.dump(read_csv, f)

In [5]:
# Tally how many files were decoded with each encoding (or marked "ParseError").
Counter(file_encoding_dict.values())

Counter({'cp949': 100})

In [6]:
# One entry per scanned file: its header list (or the placeholder string
# stored for a file that could not be read).
columns_list = [*file_column_dict.values()]

In [7]:
# Count in how many files each raw header string appears.
# A single generator pass replaces sum(columns_list, []), which copies the
# growing list on every addition and raises TypeError as soon as an entry is
# not a list.
Counter(
    column
    for columns in columns_list
    # Skip the "ParseError" placeholder string stored for unreadable files.
    if isinstance(columns, list)
    for column in columns
)

Counter({'자료생성년월 DATA_CRT_YM VARCHAR(6)': 50,
         '사업장명 WKPL_NM\tVARCHAR(100)': 50,
         '사업자등록번호 BZOWR_RGST_NO VARCHAR(10)': 50,
         '사업장가입상태코드 1:등록2:탈퇴 WKPL_JNNG_STCD VARCHAR(1)': 50,
         '우편번호 ZIP\tVARCHAR(6)': 50,
         '사업장지번상세주소 WKPL_LTNO_DTL_ADDR\tVARCHAR(300)': 50,
         '사업장도로명상세주소 WKPL_ROAD_NM_DTL_ADDR VARCHAR(300)': 50,
         '고객법정동주소코드 CUST_LDONG_ADDR_CD\tVARCHAR(10)': 50,
         '고객행정동주소코드 CUST_PADONG_ADDR_CD\tVARCHAR(10)': 50,
         '법정동주소광역시도코드 LDONG_ADDR_MGPL_DG_CD\tVARCHAR(2)': 50,
         '법정동주소광역시시군구코드 LDONG_ADDR_MGPL_SGGU_CD\tVARCHAR(5)': 50,
         '법정동주소광역시시군구읍면동코드 LDONG_ADDR_MGPL_SGGU_EMD_CD\tVARCHAR(8)': 50,
         '사업장형태구분코드 1:법인2:개인 WKPL_STYL_DVCD\tVARCHAR(1)': 50,
         '사업장업종코드 WKPL_INTP_CD\t국세청업종코드참조 VARCHAR(6)': 50,
         '사업장업종코드명 VLDT_VL_KRN_NM VARCHAR(200)': 50,
         '적용일자 ADPT_DT VARCHAR(8)': 50,
         '재등록일자 RRG_DT\tVARCHAR(8)': 50,
         '탈퇴일자 SCSN_DT VARCHAR(8)': 50,
         '가입자수 JNNGP_CNT INTE

In [8]:
def get_first(x):
    """Return the first whitespace-delimited token of ``x``."""
    return x.split()[0]


def preproc(cols):
    """Reduce every header in ``cols`` to its first token."""
    return list(map(get_first, cols))


# Count the cleaned header names across all files.
Counter(name for cols in columns_list for name in preproc(cols))

Counter({'사업장명': 100,
         '사업자등록번호': 100,
         '사업장가입상태코드': 100,
         '우편번호': 100,
         '사업장지번상세주소': 100,
         '사업장도로명상세주소': 100,
         '고객법정동주소코드': 100,
         '고객행정동주소코드': 100,
         '법정동주소광역시도코드': 100,
         '법정동주소광역시시군구코드': 100,
         '법정동주소광역시시군구읍면동코드': 100,
         '사업장형태구분코드': 100,
         '사업장업종코드': 100,
         '사업장업종코드명': 100,
         '적용일자': 100,
         '재등록일자': 100,
         '탈퇴일자': 100,
         '자료생성년월': 82,
         '가입자수': 82,
         '당월고지금액': 82,
         '신규취득자수': 82,
         '상실가입자수': 82,
         '자료생성년월(자격마감일(사유발생일이': 18,
         '가입자수(고지인원': 18,
         '당월고지금액(': 18,
         '신규취득자수(납부재개': 18,
         '상실가입자수(납부예외': 18})

In [9]:
def get_first(x):
    """Return the first whitespace-delimited token of ``x``."""
    return x.split()[0]


def do_strip(x):
    """Return ``x`` without leading or trailing whitespace."""
    return x.strip()


def preproc(cols):
    """Reduce every header in ``cols`` to its stripped first token."""
    return [do_strip(get_first(col)) for col in cols]


# Count the cleaned header names across all files.
Counter(name for cols in columns_list for name in preproc(cols))

Counter({'사업장명': 100,
         '사업자등록번호': 100,
         '사업장가입상태코드': 100,
         '우편번호': 100,
         '사업장지번상세주소': 100,
         '사업장도로명상세주소': 100,
         '고객법정동주소코드': 100,
         '고객행정동주소코드': 100,
         '법정동주소광역시도코드': 100,
         '법정동주소광역시시군구코드': 100,
         '법정동주소광역시시군구읍면동코드': 100,
         '사업장형태구분코드': 100,
         '사업장업종코드': 100,
         '사업장업종코드명': 100,
         '적용일자': 100,
         '재등록일자': 100,
         '탈퇴일자': 100,
         '자료생성년월': 82,
         '가입자수': 82,
         '당월고지금액': 82,
         '신규취득자수': 82,
         '상실가입자수': 82,
         '자료생성년월(자격마감일(사유발생일이': 18,
         '가입자수(고지인원': 18,
         '당월고지금액(': 18,
         '신규취득자수(납부재개': 18,
         '상실가입자수(납부예외': 18})

In [5]:
def composer(*funcs):
    """Compose single-argument callables, applied left to right.

    ``composer(f, g, h)(x)`` evaluates ``h(g(f(x)))``.

    Parameters
    ----------
    *funcs : callable
        Functions to chain; each receives the previous function's result.

    Returns
    -------
    callable
        The composed function. With no arguments an identity function is
        returned instead of letting ``reduce`` raise ``TypeError`` on an
        empty sequence.
    """
    if not funcs:
        return lambda x: x
    return reduce(lambda f, g: lambda x: g(f(x)), funcs)


def get_first(x):
    """Return the first whitespace-delimited token of ``x``.

    Raises ``IndexError`` when ``x`` contains no token at all.
    """
    # Only the leading token is needed, so stop splitting after it.
    return x.split(None, 1)[0]


def do_strip(x):
    """Return ``x`` with leading and trailing whitespace removed."""
    trimmed = x.strip()
    return trimmed


def drop_bracket(x):
    """Return the part of ``x`` that precedes its first ``(``.

    ``x`` is returned unchanged when it contains no opening parenthesis.
    """
    head, _sep, _rest = x.partition("(")
    return head


# Normalizer for ONE raw header string: take its first whitespace-delimited
# token, strip it, then drop everything from the first "(" onward.
# Unlike the earlier list-based `preproc(cols)` variants, this callable takes a
# single header, so a whole header list is cleaned with
# `[preproc(col) for col in cols]`.
preproc = composer(get_first, do_strip, drop_bracket)

In [6]:
import dill

# Serialize the header normalizer with dill into ./preproc_column.pkl.
with open("./preproc_column.pkl", mode="wb") as f:
    dill.dump(preproc, f)

In [None]:
# Cleaned column names, grouped by topic, each mapped to the same value 100.
# NOTE(review): 100 presumably is the number of scanned files that contain the
# column (it matches the per-column tallies shown above) — confirm.
columns = dict.fromkeys(
    (
        # base_cols
        '자료생성년월',
        '사업자등록번호',
        # corp_cols
        '사업장명',
        '사업장가입상태코드',
        '사업장형태구분코드',
        '사업장업종코드',
        '사업장업종코드명',
        '적용일자',
        '재등록일자',
        '탈퇴일자',
        # addr_cols
        '사업장지번상세주소',
        '사업장도로명상세주소',
        '고객법정동주소코드',
        '고객행정동주소코드',
        '법정동주소광역시도코드',
        '법정동주소광역시시군구코드',
        '법정동주소광역시시군구읍면동코드',
        '우편번호',
        # nps_cols (national pension service)
        '가입자수',
        '당월고지금액',
        '신규취득자수',
        '상실가입자수',
    ),
    100,
)

# Serialize the mapping with dill into ./columns.pkl.
with open("./columns.pkl", mode="wb") as f:
    dill.dump(columns, f)