In [1]:
import os
from glob import glob

import pandas as pd
from tqdm import tqdm

In [2]:
# Column selections used to split each raw snapshot into per-topic tables.

# Key column kept in every output table.
BASE_COLS = [
    "상가업소번호",  # store ID number
]

# Business name plus the three category-name levels.
CATEGORY_COLS = [
    "상호명",  # business name
    "상권업종대분류명",  # major category name
    "상권업종중분류명",  # middle category name
    "상권업종소분류명",  # minor category name
]

# Administrative address components.
ADDR_COLS = [
    "시도명",  # province / metropolitan city name
    "시군구명",  # city / county / district name
    "행정동명",  # administrative dong name
    "법정동명",  # legal dong name
]

# Road name and coordinates.
ADDR_DETAIL_COLS = [
    "도로명",  # road name
    "위도",  # latitude
    "경도",  # longitude
]

In [3]:
def read_csv(file_path, encodings=("utf-8", "cp949")):
    """Read a CSV file, trying each candidate encoding in order.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file.
    encodings : sequence of str, optional
        Encodings to attempt, in order. Defaults to ``("utf-8", "cp949")``.

    Returns
    -------
    pandas.DataFrame
        The frame parsed with the first encoding that decodes the file.

    Raises
    ------
    ValueError
        If none of the encodings can decode the file. The function used to
        return ``None`` in that case; ``pd.concat`` silently drops ``None``
        entries, so an unreadable file simply vanished from the result.
    """
    last_error = None
    for encoding in encodings:
        try:
            return pd.read_csv(file_path, low_memory=False, encoding=encoding)
        except UnicodeDecodeError as exc:
            # Remember the failure and fall through to the next encoding.
            last_error = exc
    raise ValueError(
        f"Could not decode {file_path!r} with any of the encodings {list(encodings)}"
    ) from last_error

In [4]:
def filter_df(df, base_cols, feature_cols):
    """Select the base columns followed by the feature columns from ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to select from.
    base_cols : list
        Column labels placed first in the result.
    feature_cols : list
        Column labels placed after ``base_cols``.

    Returns
    -------
    pandas.DataFrame
        All rows of ``df`` restricted to ``base_cols + feature_cols``.
    """
    wanted_cols = base_cols + feature_cols
    return df.loc[:, wanted_cols]


def filter_category_df(df):
    """Keep BASE_COLS plus CATEGORY_COLS of ``df``."""
    return filter_df(df, BASE_COLS, CATEGORY_COLS)


def filter_addr_df(df):
    """Keep BASE_COLS plus ADDR_COLS of ``df``."""
    return filter_df(df, BASE_COLS, ADDR_COLS)


def filter_addr_detail_df(df):
    """Keep BASE_COLS plus ADDR_DETAIL_COLS of ``df``."""
    return filter_df(df, BASE_COLS, ADDR_DETAIL_COLS)

In [5]:
def append_time(df, time):
    """Return a copy of ``df`` with a ``time`` column set to ``time``.

    The input frame is left untouched. The function used to add the column
    in place, which changed the caller's frame as a side effect and made
    re-running a cell non-idempotent.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to tag.
    time : scalar
        Value stored in every row of the new ``time`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with the columns of ``df`` plus ``time``.
    """
    return df.assign(time=time)

def preproc_category(df, time):
    """Select the category columns of ``df`` and tag the result with ``time``."""
    return append_time(filter_category_df(df), time)


def preproc_addr(df, time):
    """Select the address columns of ``df`` and tag the result with ``time``."""
    return append_time(filter_addr_df(df), time)


def preproc_addr_detail(df, time):
    """Select the address-detail columns of ``df`` and tag the result with ``time``."""
    return append_time(filter_addr_detail_df(df), time)

In [6]:
def get_time(dir_path):
    """Return the text after the last underscore in ``dir_path``.

    When ``dir_path`` contains no underscore, the whole string is returned.
    """
    _, _, suffix = dir_path.rpartition("_")
    return suffix

In [7]:
# Snapshot directories that could not be processed; they are inspected one by one further down.
failed_dir_paths = list()

dir_paths = sorted(glob("./data/raw_data/*"))

for dir_path in tqdm(dir_paths):
    time = get_time(dir_path)
    file_paths = glob(os.path.join(dir_path, "*.csv"))
    try:
        frames = [read_csv(file_path) for file_path in file_paths]
        # pd.concat silently drops None entries, so a file that could not be
        # read must fail the whole directory instead of vanishing from the output.
        if any(frame is None for frame in frames):
            raise ValueError(f"unreadable CSV file in {dir_path}")
        df = pd.concat(frames, axis=0)
        del frames  # release the per-file frames before writing the outputs

        preproc_category(df, time).to_csv(f"./data/preproc_data/category_{time}.csv")
        preproc_addr(df, time).to_csv(f"./data/preproc_data/addr_{time}.csv")
        preproc_addr_detail(df, time).to_csv(f"./data/preproc_data/addr_detail_{time}.csv")
    except Exception:
        # Catch Exception rather than using a bare except, so that
        # KeyboardInterrupt / SystemExit still stop the loop.
        failed_dir_paths.append(dir_path)
        print(dir_path)

  0%|          | 0/17 [00:00<?, ?it/s]

./data/raw_data/소상공인시장진흥공단_상가(상권)정보_20200331
./data/raw_data/소상공인시장진흥공단_상가(상권)정보_20200630
./data/raw_data/소상공인시장진흥공단_상가(상권)정보_20200930
./data/raw_data/소상공인시장진흥공단_상가(상권)정보_20201231


 76%|███████▋  | 13/17 [03:18<01:11, 17.93s/it]

./data/raw_data/소상공인시장진흥공단_상가(상권)정보_20230331


100%|██████████| 17/17 [05:08<00:00, 18.16s/it]


In [9]:
# Show the snapshot directories that the batch loop could not process.
failed_dir_paths

['./data/raw_data/소상공인시장진흥공단_상가(상권)정보_20200331',
 './data/raw_data/소상공인시장진흥공단_상가(상권)정보_20200630',
 './data/raw_data/소상공인시장진흥공단_상가(상권)정보_20200930',
 './data/raw_data/소상공인시장진흥공단_상가(상권)정보_20201231',
 './data/raw_data/소상공인시장진흥공단_상가(상권)정보_20230331']

In [10]:
# Pick the first failed snapshot directory for manual inspection.
dir_path = failed_dir_paths[0]
dir_path

'./data/raw_data/소상공인시장진흥공단_상가(상권)정보_20200331'

In [11]:
# Collect the CSV files of the directory under inspection.
# NOTE(review): glob returns paths in arbitrary order, so file_paths[0] used below may differ between runs.
file_paths = glob(os.path.join(dir_path, "*.csv"))
file_paths

['./data/raw_data/소상공인시장진흥공단_상가(상권)정보_20200331/상가업소정보_세종_202003.csv',
 './data/raw_data/소상공인시장진흥공단_상가(상권)정보_20200331/상가업소정보_강원_202003.csv',
 './data/raw_data/소상공인시장진흥공단_상가(상권)정보_20200331/상가업소정보_전남_202003.csv',
 './data/raw_data/소상공인시장진흥공단_상가(상권)정보_20200331/상가업소정보_경남_202003.csv',
 './data/raw_data/소상공인시장진흥공단_상가(상권)정보_20200331/상가업소정보_인천_202003.csv',
 './data/raw_data/소상공인시장진흥공단_상가(상권)정보_20200331/상가업소정보_경북_202003.csv',
 './data/raw_data/소상공인시장진흥공단_상가(상권)정보_20200331/상가업소정보_경기1_202003.csv',
 './data/raw_data/소상공인시장진흥공단_상가(상권)정보_20200331/상가업소정보_전북_202003.csv',
 './data/raw_data/소상공인시장진흥공단_상가(상권)정보_20200331/상가업소정보_울산_202003.csv',
 '

In [12]:
# Probe one file using "|" as the separator and UTF-8 decoding.
# NOTE(review): on_bad_lines="skip" discards malformed rows without raising or warning.
file_path = file_paths[0]
df = pd.read_csv(file_path, sep="|", low_memory=False, on_bad_lines="skip", encoding="utf-8")
df.head()

Unnamed: 0,상가업소번호,상호명,지점명,상권업종대분류코드,상권업종대분류명,상권업종중분류코드,상권업종중분류명,상권업종소분류코드,상권업종소분류명,표준산업분류코드,...,건물관리번호,건물명,도로명주소,구우편번호,신우편번호,동정보,층정보,호정보,경도,위도
0,21995886,대일공구,,D,소매,D21,철물/난방/건설자재소매,D21A06,철물/건설자재소매,G47511,...,4473025025100870013042285,,세종특별자치시 세종특별자치시 조치원읍 새내로 90-1,339882,30027,,1.0,,127.298552,36.59992
1,22183786,경동보일러,,D,소매,D21,철물/난방/건설자재소매,D21A02,보일러/냉난방용품,G47511,...,4473035022102160003003913,,세종특별자치시 세종특별자치시 전의면 왕의물로 40,339853,30004,,1.0,,127.204297,36.682526
2,22029920,종촌설비,,F,생활서비스,F15,주택수리,F15A03,배관난방보일러,F42201,...,4473032033106240001015893,,세종특별자치시 세종특별자치시 연서면 당산로 322,339813,30047,,1.0,,127.28099,36.560523
3,22463784,모애선교아가방,,D,소매,D11,유아용품,D11A03,유아용품판매,G47599,...,4473025028100890000033547,욱일아파트,세종특별자치시 세종특별자치시 조치원읍 충현로 159,339752,30021,,1.0,,127.294177,36.602922
4,22878158,채낭골통닭분식,,Q,음식,Q04,분식,Q04A01,라면김밥분식,I56194,...,4415034025101440000013420,,세종특별자치시 세종특별자치시 금남면 채나무길 61-3,339837,30086,,,,127.270561,36.426637


In [13]:
# Check that every file of the directory parses with the "|" / UTF-8 settings;
# print the path of each file that does not.
for file_path in file_paths:
    try:
        df = pd.read_csv(file_path, sep="|", low_memory=False, on_bad_lines="skip", encoding="utf-8")
    except Exception:
        # Exception (not a bare except) so KeyboardInterrupt / SystemExit still abort the scan.
        print(file_path)

In [14]:
def read_csv(file_path):
    """Read one "|"-separated, UTF-8 encoded file, skipping malformed lines."""
    return pd.read_csv(
        file_path,
        sep="|",
        low_memory=False,
        on_bad_lines="skip",
        encoding="utf-8",
    )


# Stack all files of the directory into one frame and preview it.
df = pd.concat([read_csv(path) for path in file_paths], axis=0)
df.head()

Unnamed: 0,상가업소번호,상호명,지점명,상권업종대분류코드,상권업종대분류명,상권업종중분류코드,상권업종중분류명,상권업종소분류코드,상권업종소분류명,표준산업분류코드,...,건물관리번호,건물명,도로명주소,구우편번호,신우편번호,동정보,층정보,호정보,경도,위도
0,21995886,대일공구,,D,소매,D21,철물/난방/건설자재소매,D21A06,철물/건설자재소매,G47511,...,4473025025100870013042285,,세종특별자치시 세종특별자치시 조치원읍 새내로 90-1,339882.0,30027.0,,1.0,,127.298552,36.59992
1,22183786,경동보일러,,D,소매,D21,철물/난방/건설자재소매,D21A02,보일러/냉난방용품,G47511,...,4473035022102160003003913,,세종특별자치시 세종특별자치시 전의면 왕의물로 40,339853.0,30004.0,,1.0,,127.204297,36.682526
2,22029920,종촌설비,,F,생활서비스,F15,주택수리,F15A03,배관난방보일러,F42201,...,4473032033106240001015893,,세종특별자치시 세종특별자치시 연서면 당산로 322,339813.0,30047.0,,1.0,,127.28099,36.560523
3,22463784,모애선교아가방,,D,소매,D11,유아용품,D11A03,유아용품판매,G47599,...,4473025028100890000033547,욱일아파트,세종특별자치시 세종특별자치시 조치원읍 충현로 159,339752.0,30021.0,,1.0,,127.294177,36.602922
4,22878158,채낭골통닭분식,,Q,음식,Q04,분식,Q04A01,라면김밥분식,I56194,...,4415034025101440000013420,,세종특별자치시 세종특별자치시 금남면 채나무길 61-3,339837.0,30086.0,,,,127.270561,36.426637


In [16]:
# Write the three preprocessed tables for this directory; the file names carry
# the suffix that get_time extracts from dir_path.
time = get_time(dir_path)

preproc_category(df, time).to_csv(f"./data/preproc_data/category_{time}.csv")
preproc_addr(df, time).to_csv(f"./data/preproc_data/addr_{time}.csv")
preproc_addr_detail(df, time).to_csv(f"./data/preproc_data/addr_detail_{time}.csv")

In [17]:
# Pick the second failed snapshot directory for manual inspection.
dir_path = failed_dir_paths[1]
dir_path

'./data/raw_data/소상공인시장진흥공단_상가(상권)정보_20200630'

In [18]:
# Collect the CSV files of the directory under inspection.
# NOTE(review): glob returns paths in arbitrary order, so file_paths[0] used below may differ between runs.
file_paths = glob(os.path.join(dir_path, "*.csv"))
file_paths

['./data/raw_data/소상공인시장진흥공단_상가(상권)정보_20200630/소상공인시장진흥공단_상가(상권)정보_부산_20200630.csv',
 './data/raw_data/소상공인시장진흥공단_상가(상권)정보_20200630/소상공인시장진흥공단_상가(상권)정보_경남_20200630.csv',
 './data/raw_data/소상공인시장진흥공단_상가(상권)정보_20200630/소상공인시장진흥공단_상가(상권)정보_세종_20200630.csv',
 './data/raw_data/소상공인시장진흥공단_상가(상권)정보_20200630/소상공인시장진흥공단_상가(상권)정보_강원_20200630.csv',
 './data/raw_data/소상공인시장진흥공단_상가(상권)정보_20200630/소상공인시장진흥공단_상가(상권)정보_충남_20200630.csv',
 './data/raw_data/소상공인시장진흥공단_상가(상권)정보_20200630/소상공인시장진흥공단_상가(상권)정보_전북_20200630.csv',
 './data/raw_data/소상공인시장진흥공단_상가(상권)정보_20200630/소상공인시장진흥공단_상가(상권)정보_경기_202

In [19]:
# Probe one file using "|" as the separator and UTF-8 decoding.
# NOTE(review): on_bad_lines="skip" discards malformed rows without raising or warning.
file_path = file_paths[0]
df = pd.read_csv(file_path, sep="|", low_memory=False, on_bad_lines="skip", encoding="utf-8")
df.head()

Unnamed: 0,상가업소번호,상호명,지점명,상권업종대분류코드,상권업종대분류명,상권업종중분류코드,상권업종중분류명,상권업종소분류코드,상권업종소분류명,표준산업분류코드,...,건물관리번호,건물명,도로명주소,구우편번호,신우편번호,동정보,층정보,호정보,경도,위도
0,20031733,동해제일산오징어,,Q,음식,Q03,일식/수산물,Q03A13,낙지/오징어,I56111,...,2626010800113800002003460,,"부산광역시 동래구 아시아드대로220번길 30, (온천동)",607060,47838.0,,1,,129.068324,35.202902
1,19912201,싱싱커피&토스트,,Q,음식,Q07,패스트푸드,Q07A10,토스트전문,I56192,...,2653010400105780000002037,산업용품유통상가,"부산광역시 사상구 괘감로 37, (괘법동)",617726,46977.0,,1,26.0,128.980455,35.159774
2,20388346,옥이영양탕,,Q,음식,Q01,한식,Q01A01,한식/백반/한정식,I56111,...,2623010400108450056015744,,"부산광역시 부산진구 범일로142번가길 16, (범천동)",614020,47364.0,,1,,129.060972,35.143421
3,20476586,대성식당,,Q,음식,Q01,한식,Q01A01,한식/백반/한정식,I56111,...,2644010400100010377015257,,"부산광역시 강서구 신포길 13-2, (명지동)",618813,46717.0,,1,,128.932142,35.112985
4,20033259,리지호프소주,,Q,음식,Q06,양식,Q06A01,정통양식/경양식,I56114,...,2623010400108470015054042,,"부산광역시 부산진구 범일로 166, (범천동)",614020,47361.0,,1,,129.059574,35.144776


In [20]:
# Check that every file of the directory parses with the "|" / UTF-8 settings;
# print the path of each file that does not.
for file_path in file_paths:
    try:
        df = pd.read_csv(file_path, sep="|", low_memory=False, on_bad_lines="skip", encoding="utf-8")
    except Exception:
        # Exception (not a bare except) so KeyboardInterrupt / SystemExit still abort the scan.
        print(file_path)

In [21]:
def read_csv(file_path):
    """Read one "|"-separated, UTF-8 encoded file, skipping malformed lines."""
    return pd.read_csv(
        file_path,
        sep="|",
        low_memory=False,
        on_bad_lines="skip",
        encoding="utf-8",
    )


# Stack all files of the directory into one frame and preview it.
df = pd.concat([read_csv(path) for path in file_paths], axis=0)
df.head()

Unnamed: 0,상가업소번호,상호명,지점명,상권업종대분류코드,상권업종대분류명,상권업종중분류코드,상권업종중분류명,상권업종소분류코드,상권업종소분류명,표준산업분류코드,...,건물관리번호,건물명,도로명주소,구우편번호,신우편번호,동정보,층정보,호정보,경도,위도
0,20031733,동해제일산오징어,,Q,음식,Q03,일식/수산물,Q03A13,낙지/오징어,I56111,...,2626010800113800002003460,,"부산광역시 동래구 아시아드대로220번길 30, (온천동)",607060.0,47838.0,,1,,129.068324,35.202902
1,19912201,싱싱커피&토스트,,Q,음식,Q07,패스트푸드,Q07A10,토스트전문,I56192,...,2653010400105780000002037,산업용품유통상가,"부산광역시 사상구 괘감로 37, (괘법동)",617726.0,46977.0,,1,26.0,128.980455,35.159774
2,20388346,옥이영양탕,,Q,음식,Q01,한식,Q01A01,한식/백반/한정식,I56111,...,2623010400108450056015744,,"부산광역시 부산진구 범일로142번가길 16, (범천동)",614020.0,47364.0,,1,,129.060972,35.143421
3,20476586,대성식당,,Q,음식,Q01,한식,Q01A01,한식/백반/한정식,I56111,...,2644010400100010377015257,,"부산광역시 강서구 신포길 13-2, (명지동)",618813.0,46717.0,,1,,128.932142,35.112985
4,20033259,리지호프소주,,Q,음식,Q06,양식,Q06A01,정통양식/경양식,I56114,...,2623010400108470015054042,,"부산광역시 부산진구 범일로 166, (범천동)",614020.0,47361.0,,1,,129.059574,35.144776


In [23]:
# Write the three preprocessed tables for this directory; the file names carry
# the suffix that get_time extracts from dir_path.
time = get_time(dir_path)

preproc_category(df, time).to_csv(f"./data/preproc_data/category_{time}.csv")
preproc_addr(df, time).to_csv(f"./data/preproc_data/addr_{time}.csv")
preproc_addr_detail(df, time).to_csv(f"./data/preproc_data/addr_detail_{time}.csv")

In [24]:
# Pick the third failed snapshot directory for manual inspection.
dir_path = failed_dir_paths[2]
dir_path

'./data/raw_data/소상공인시장진흥공단_상가(상권)정보_20200930'

In [25]:
# Collect the CSV files of the directory under inspection.
# NOTE(review): glob returns paths in arbitrary order, so file_paths[0] used below may differ between runs.
file_paths = glob(os.path.join(dir_path, "*.csv"))
file_paths

['./data/raw_data/소상공인시장진흥공단_상가(상권)정보_20200930/소상공인시장진흥공단_상가(상권)정보_전북_202009.csv',
 './data/raw_data/소상공인시장진흥공단_상가(상권)정보_20200930/소상공인시장진흥공단_상가(상권)정보_경북_202009.csv',
 './data/raw_data/소상공인시장진흥공단_상가(상권)정보_20200930/소상공인시장진흥공단_상가(상권)정보_경남_202009.csv',
 './data/raw_data/소상공인시장진흥공단_상가(상권)정보_20200930/소상공인시장진흥공단_상가(상권)정보_전남_202009.csv',
 './data/raw_data/소상공인시장진흥공단_상가(상권)정보_20200930/소상공인시장진흥공단_상가(상권)정보_인천_202009.csv',
 './data/raw_data/소상공인시장진흥공단_상가(상권)정보_20200930/소상공인시장진흥공단_상가(상권)정보_광주_202009.csv',
 './data/raw_data/소상공인시장진흥공단_상가(상권)정보_20200930/소상공인시장진흥공단_상가(상권)정보_강원_202009.csv',


In [26]:
# Probe one file using "|" as the separator and UTF-8 decoding.
# NOTE(review): on_bad_lines="skip" discards malformed rows without raising or warning.
file_path = file_paths[0]
df = pd.read_csv(file_path, sep="|", low_memory=False, on_bad_lines="skip", encoding="utf-8")
df.head()

Unnamed: 0,상가업소번호,상호명,지점명,상권업종대분류코드,상권업종대분류명,상권업종중분류코드,상권업종중분류명,상권업종소분류코드,상권업종소분류명,표준산업분류코드,...,건물관리번호,건물명,도로명주소,구우편번호,신우편번호,동정보,층정보,호정보,경도,위도
0,20179431,내고향식당,,Q,음식,Q01,한식,Q01A01,한식/백반/한정식,I56111,...,4514025021105470008041046,미니스톱함열점,전라북도 익산시 함열읍 함열중앙로 14,570804,54511.0,,1.0,,126.958849,36.078892
1,20349196,브이는비프로그램,,D,소매,D10,건강/미용식품,D10A01,다이어트상품판매,G47216,...,4511112900109710000006915,롯데백화점,전라북도 전주시 완산구 온고을로 2,560734,54946.0,,1.0,,127.121847,35.83446
2,20427824,용다방,,Q,음식,Q12,커피점/카페,Q12A01,커피전문점/카페/다방,I56220,...,4575038026104960010116609,"신광사진관,용다방",전라북도 임실군 관촌면 사선로 46-1,566812,55910.0,,1.0,,127.270176,35.674618
3,20354267,가족마트양념육,,D,소매,D01,음/식료품소매,D01A03,정육점,G47212,...,4511311500105940032015575,마트,전라북도 전주시 덕진구 호성1길 6,561823,54905.0,,,,127.15582,35.856046
4,20400064,황토건어물할인매장,,D,소매,D01,음/식료품소매,D01A11,건어물상,G47213,...,4511112900107900000027069,,전라북도 전주시 완산구 안터6길 9,560821,54950.0,,1.0,,127.118094,35.831521


In [27]:
# Check that every file of the directory parses with the "|" / UTF-8 settings;
# print the path of each file that does not.
for file_path in file_paths:
    try:
        df = pd.read_csv(file_path, sep="|", low_memory=False, on_bad_lines="skip", encoding="utf-8")
    except Exception:
        # Exception (not a bare except) so KeyboardInterrupt / SystemExit still abort the scan.
        print(file_path)

./data/raw_data/소상공인시장진흥공단_상가(상권)정보_20200930/소상공인시장진흥공단_상가(상권)정보_대전_20200930.csv


In [29]:
# Re-read the one file that failed the "|" / UTF-8 check, this time with ","
# as the separator and cp949 decoding.
file_path = "./data/raw_data/소상공인시장진흥공단_상가(상권)정보_20200930/소상공인시장진흥공단_상가(상권)정보_대전_20200930.csv"
df = pd.read_csv(file_path, sep=",", low_memory=False, on_bad_lines="skip", encoding="cp949")

In [31]:
def read_csv(file_path):
    """Read a snapshot file, falling back to the comma/cp949 layout.

    The file is first parsed as "|"-separated UTF-8. If decoding fails with
    ``UnicodeDecodeError`` it is parsed again as ","-separated cp949.
    Malformed lines are skipped in both attempts.
    """
    shared_kwargs = dict(low_memory=False, on_bad_lines="skip")
    try:
        return pd.read_csv(file_path, sep="|", encoding="utf-8", **shared_kwargs)
    except UnicodeDecodeError:
        return pd.read_csv(file_path, sep=",", encoding="cp949", **shared_kwargs)


# Stack all files of the directory into one frame and preview it.
df = pd.concat([read_csv(file_path) for file_path in file_paths], axis=0)
df.head()

Unnamed: 0,상가업소번호,상호명,지점명,상권업종대분류코드,상권업종대분류명,상권업종중분류코드,상권업종중분류명,상권업종소분류코드,상권업종소분류명,표준산업분류코드,...,건물관리번호,건물명,도로명주소,구우편번호,신우편번호,동정보,층정보,호정보,경도,위도
0,20179431,내고향식당,,Q,음식,Q01,한식,Q01A01,한식/백반/한정식,I56111,...,4514025021105470008041046,미니스톱함열점,전라북도 익산시 함열읍 함열중앙로 14,570804.0,54511.0,,1.0,,126.958849,36.078892
1,20349196,브이는비프로그램,,D,소매,D10,건강/미용식품,D10A01,다이어트상품판매,G47216,...,4511112900109710000006915,롯데백화점,전라북도 전주시 완산구 온고을로 2,560734.0,54946.0,,1.0,,127.121847,35.83446
2,20427824,용다방,,Q,음식,Q12,커피점/카페,Q12A01,커피전문점/카페/다방,I56220,...,4575038026104960010116609,"신광사진관,용다방",전라북도 임실군 관촌면 사선로 46-1,566812.0,55910.0,,1.0,,127.270176,35.674618
3,20354267,가족마트양념육,,D,소매,D01,음/식료품소매,D01A03,정육점,G47212,...,4511311500105940032015575,마트,전라북도 전주시 덕진구 호성1길 6,561823.0,54905.0,,,,127.15582,35.856046
4,20400064,황토건어물할인매장,,D,소매,D01,음/식료품소매,D01A11,건어물상,G47213,...,4511112900107900000027069,,전라북도 전주시 완산구 안터6길 9,560821.0,54950.0,,1.0,,127.118094,35.831521


In [32]:
# Write the three preprocessed tables for this directory; the file names carry
# the suffix that get_time extracts from dir_path.
time = get_time(dir_path)

preproc_category(df, time).to_csv(f"./data/preproc_data/category_{time}.csv")
preproc_addr(df, time).to_csv(f"./data/preproc_data/addr_{time}.csv")
preproc_addr_detail(df, time).to_csv(f"./data/preproc_data/addr_detail_{time}.csv")

In [36]:
# Pick the fourth failed snapshot directory for manual inspection.
dir_path = failed_dir_paths[3]
dir_path

'./data/raw_data/소상공인시장진흥공단_상가(상권)정보_20201231'

In [37]:
# Collect the CSV files of the directory under inspection.
# NOTE(review): glob returns paths in arbitrary order, so file_paths[0] used below may differ between runs.
file_paths = glob(os.path.join(dir_path, "*.csv"))
file_paths

['./data/raw_data/소상공인시장진흥공단_상가(상권)정보_20201231/소상공인시장진흥공단_상가(상권)정보_부산_202012.csv',
 './data/raw_data/소상공인시장진흥공단_상가(상권)정보_20201231/소상공인시장진흥공단_상가(상권)정보_대구_202012.csv',
 './data/raw_data/소상공인시장진흥공단_상가(상권)정보_20201231/소상공인시장진흥공단_상가(상권)정보_제주_202012.csv',
 './data/raw_data/소상공인시장진흥공단_상가(상권)정보_20201231/소상공인시장진흥공단_상가(상권)정보_세종_202012.csv',
 './data/raw_data/소상공인시장진흥공단_상가(상권)정보_20201231/소상공인시장진흥공단_상가(상권)정보_대전_202012.csv',
 './data/raw_data/소상공인시장진흥공단_상가(상권)정보_20201231/소상공인시장진흥공단_상가(상권)정보_서울_202012.csv',
 './data/raw_data/소상공인시장진흥공단_상가(상권)정보_20201231/소상공인시장진흥공단_상가(상권)정보_경기_202012.csv',
 './data

In [38]:
# Probe one file using "|" as the separator and UTF-8 decoding.
# NOTE(review): on_bad_lines="skip" discards malformed rows without raising or warning.
file_path = file_paths[0]
df = pd.read_csv(file_path, sep="|", low_memory=False, on_bad_lines="skip", encoding="utf-8")
df.head()

Unnamed: 0,상가업소번호,상호명,지점명,상권업종대분류코드,상권업종대분류명,상권업종중분류코드,상권업종중분류명,상권업종소분류코드,상권업종소분류명,표준산업분류코드,...,건물관리번호,건물명,도로명주소,구우편번호,신우편번호,동정보,층정보,호정보,경도,위도
0,17175332,정성밥상,,Q,음식,Q01,한식,Q01A01,한식/백반/한정식,I56111,...,2635010500105520002023290,,부산광역시 해운대구 해운대로 635-7,612819,48087.0,,,,129.159641,35.164474
1,23206623,미광장,,O,숙박,O02,모텔/여관/여인숙,O02A01,모텔/여관/여인숙,I55112,...,2611012600100250001004473,,부산광역시 중구 보수대로44번길 5,600074,48974.0,,,,129.023668,35.100979
2,20418637,BHC치킨,동래점,Q,음식,Q05,닭/오리요리,Q05A08,후라이드/양념치킨,I56193,...,2626010500101850000016529,,부산광역시 동래구 동래로147번길 18,607020,47802.0,,,,129.087156,35.205267
3,24659633,스텔라,,D,소매,D05,의복의류,D05A01,일반의류,G47416,...,2650010500101480004001694,비치아파트,부산광역시 수영구 광안해변로 100,613751,48305.0,,,20.0,129.115397,35.14377
4,17174094,도란도란,,Q,음식,Q01,한식,Q01A01,한식/백반/한정식,I56111,...,2623011100101740004050547,,부산광역시 부산진구 가야대로482번길 29-3,614813,47327.0,,,,129.024938,35.151645


In [39]:
# Check that every file of the directory parses with the "|" / UTF-8 settings;
# print the path of each file that does not.
for file_path in file_paths:
    try:
        df = pd.read_csv(file_path, sep="|", low_memory=False, on_bad_lines="skip", encoding="utf-8")
    except Exception:
        # Exception (not a bare except) so KeyboardInterrupt / SystemExit still abort the scan.
        print(file_path)

In [40]:
def read_csv(file_path):
    """Read one "|"-separated, UTF-8 encoded file, skipping malformed lines."""
    return pd.read_csv(
        file_path,
        sep="|",
        low_memory=False,
        on_bad_lines="skip",
        encoding="utf-8",
    )


# Stack all files of the directory into one frame and preview it.
df = pd.concat([read_csv(path) for path in file_paths], axis=0)
df.head()

Unnamed: 0,상가업소번호,상호명,지점명,상권업종대분류코드,상권업종대분류명,상권업종중분류코드,상권업종중분류명,상권업종소분류코드,상권업종소분류명,표준산업분류코드,...,건물관리번호,건물명,도로명주소,구우편번호,신우편번호,동정보,층정보,호정보,경도,위도
0,17175332,정성밥상,,Q,음식,Q01,한식,Q01A01,한식/백반/한정식,I56111,...,2635010500105520002023290,,부산광역시 해운대구 해운대로 635-7,612819.0,48087.0,,,,129.159641,35.164474
1,23206623,미광장,,O,숙박,O02,모텔/여관/여인숙,O02A01,모텔/여관/여인숙,I55112,...,2611012600100250001004473,,부산광역시 중구 보수대로44번길 5,600074.0,48974.0,,,,129.023668,35.100979
2,20418637,BHC치킨,동래점,Q,음식,Q05,닭/오리요리,Q05A08,후라이드/양념치킨,I56193,...,2626010500101850000016529,,부산광역시 동래구 동래로147번길 18,607020.0,47802.0,,,,129.087156,35.205267
3,24659633,스텔라,,D,소매,D05,의복의류,D05A01,일반의류,G47416,...,2650010500101480004001694,비치아파트,부산광역시 수영구 광안해변로 100,613751.0,48305.0,,,20.0,129.115397,35.14377
4,17174094,도란도란,,Q,음식,Q01,한식,Q01A01,한식/백반/한정식,I56111,...,2623011100101740004050547,,부산광역시 부산진구 가야대로482번길 29-3,614813.0,47327.0,,,,129.024938,35.151645


In [41]:
# Write the three preprocessed tables for this directory; the file names carry
# the suffix that get_time extracts from dir_path.
time = get_time(dir_path)

preproc_category(df, time).to_csv(f"./data/preproc_data/category_{time}.csv")
preproc_addr(df, time).to_csv(f"./data/preproc_data/addr_{time}.csv")
preproc_addr_detail(df, time).to_csv(f"./data/preproc_data/addr_detail_{time}.csv")

In [44]:
# Pick the fifth failed snapshot directory for manual inspection.
dir_path = failed_dir_paths[4]
dir_path

'./data/raw_data/소상공인시장진흥공단_상가(상권)정보_20230331'

In [45]:
# Collect the CSV files of the directory under inspection.
# NOTE(review): glob returns paths in arbitrary order, so file_paths[0] used below may differ between runs.
file_paths = glob(os.path.join(dir_path, "*.csv"))
file_paths

['./data/raw_data/소상공인시장진흥공단_상가(상권)정보_20230331/소상공인시장진흥공단_상가(상권)정보_세종_202303.csv',
 './data/raw_data/소상공인시장진흥공단_상가(상권)정보_20230331/소상공인시장진흥공단_상가(상권)정보_제주_202303.csv',
 './data/raw_data/소상공인시장진흥공단_상가(상권)정보_20230331/소상공인시장진흥공단_상가(상권)정보_대구_202303.csv',
 './data/raw_data/소상공인시장진흥공단_상가(상권)정보_20230331/소상공인시장진흥공단_상가(상권)정보_부산_202303.csv',
 './data/raw_data/소상공인시장진흥공단_상가(상권)정보_20230331/소상공인시장진흥공단_상가(상권)정보_인천_202303.csv',
 './data/raw_data/소상공인시장진흥공단_상가(상권)정보_20230331/소상공인시장진흥공단_상가(상권)정보_경남_202303.csv',
 './data/raw_data/소상공인시장진흥공단_상가(상권)정보_20230331/소상공인시장진흥공단_상가(상권)정보_전남_202303.csv',
 './d

In [52]:
# Drop every path whose name contains '파일열람방법' ("how to view the file").
# NOTE(review): presumably an instructions file rather than data — confirm.
file_paths = [file_path for file_path in file_paths if '파일열람방법' not in file_path]

In [53]:
# Probe one file using "," as the separator and cp949 decoding.
# NOTE(review): on_bad_lines="skip" discards malformed rows without raising or warning.
file_path = file_paths[0]
df = pd.read_csv(file_path, sep=",", low_memory=False, on_bad_lines="skip", encoding="cp949")
df.head()

Unnamed: 0,상가업소번호,상호명,지점명,상권업종대분류코드,상권업종대분류명,상권업종중분류코드,상권업종중분류명,상권업종소분류코드,상권업종소분류명,표준산업분류코드,...,건물관리번호,건물명,도로명주소,구우편번호,신우편번호,동정보,층정보,호정보,경도,위도
0,MA0101202210A0084499,베트남쌀국수퍼짱,,I2,음식,I205,동남아시아,I20501,베트남식 전문,I56194,...,3611010700103610086000001,모닝시티2,세종특별자치시 세종특별자치시 한누리대로 311,339003,30123,,1.0,,127.25584,36.492065
1,MA010120220805431075,바이루이,,S2,수리·개인,S207,이용·미용,S20701,미용실,S96112,...,3611010800103290000000001,새뜸마을3단지,세종특별자치시 세종특별자치시 새롬북로 14,339009,30126,,,,127.24592,36.486028
2,MA010120220805431936,책통클럽읽기훈련센타,,P1,교육,P106,기타 교육,P10629,그 외 기타 교육기관,P85699,...,3611010800200190000000001,해피라움W,세종특별자치시 세종특별자치시 새롬중앙로 62-15,339009,30127,,,,127.251201,36.487016
3,MA010120220805432402,세종유진부동산공인중개사사무소,,L1,부동산,L102,부동산 서비스,L10203,부동산 중개/대리업,L68221,...,3611010800103290000000001,새뜸마을3단지,세종특별자치시 세종특별자치시 새롬북로 14,339009,30126,,,,127.24592,36.486028
4,MA010120220805434190,더함인테리어컨설팅,,M1,과학·기술,M112,전문 디자인,M11201,인테리어 디자인업,M73201,...,3611036033106370003000001,,세종특별자치시 세종특별자치시 연서면 당산로 294,339813,30047,,,,127.278712,36.559184


In [54]:
def read_csv(file_path):
    """Read one ","-separated, cp949 encoded file, skipping malformed lines."""
    return pd.read_csv(
        file_path,
        sep=",",
        low_memory=False,
        on_bad_lines="skip",
        encoding="cp949",
    )


# Stack all files of the directory into one frame and preview it.
df = pd.concat([read_csv(path) for path in file_paths], axis=0)
df.head()

Unnamed: 0,상가업소번호,상호명,지점명,상권업종대분류코드,상권업종대분류명,상권업종중분류코드,상권업종중분류명,상권업종소분류코드,상권업종소분류명,표준산업분류코드,...,건물관리번호,건물명,도로명주소,구우편번호,신우편번호,동정보,층정보,호정보,경도,위도
0,MA0101202210A0084499,베트남쌀국수퍼짱,,I2,음식,I205,동남아시아,I20501,베트남식 전문,I56194,...,3611010700103610086000001,모닝시티2,세종특별자치시 세종특별자치시 한누리대로 311,339003,30123,,1.0,,127.25584,36.492065
1,MA010120220805431075,바이루이,,S2,수리·개인,S207,이용·미용,S20701,미용실,S96112,...,3611010800103290000000001,새뜸마을3단지,세종특별자치시 세종특별자치시 새롬북로 14,339009,30126,,,,127.24592,36.486028
2,MA010120220805431936,책통클럽읽기훈련센타,,P1,교육,P106,기타 교육,P10629,그 외 기타 교육기관,P85699,...,3611010800200190000000001,해피라움W,세종특별자치시 세종특별자치시 새롬중앙로 62-15,339009,30127,,,,127.251201,36.487016
3,MA010120220805432402,세종유진부동산공인중개사사무소,,L1,부동산,L102,부동산 서비스,L10203,부동산 중개/대리업,L68221,...,3611010800103290000000001,새뜸마을3단지,세종특별자치시 세종특별자치시 새롬북로 14,339009,30126,,,,127.24592,36.486028
4,MA010120220805434190,더함인테리어컨설팅,,M1,과학·기술,M112,전문 디자인,M11201,인테리어 디자인업,M73201,...,3611036033106370003000001,,세종특별자치시 세종특별자치시 연서면 당산로 294,339813,30047,,,,127.278712,36.559184


In [55]:
# Write the three preprocessed tables for this directory; the file names carry
# the suffix that get_time extracts from dir_path.
time = get_time(dir_path)

preproc_category(df, time).to_csv(f"./data/preproc_data/category_{time}.csv")
preproc_addr(df, time).to_csv(f"./data/preproc_data/addr_{time}.csv")
preproc_addr_detail(df, time).to_csv(f"./data/preproc_data/addr_detail_{time}.csv")