In [None]:
from glob import glob

import pandas as pd
from tqdm import tqdm

In [None]:
# Gather every CSV located exactly one directory below ./data/raw_data,
# then order the paths lexicographically so later cells see a stable order.
file_paths = glob("./data/raw_data/*/*.csv")
file_paths.sort()

In [None]:
def is_instruction(x):
    """Return True when the path string ``x`` contains the substring '방법'.

    NOTE(review): '방법' is Korean for "method"/"how-to"; going by the function
    name this presumably marks instruction files rather than data files —
    confirm against the raw data layout.
    """
    return '방법' in x


def is_not_instruction(x):
    """Return True when ``x`` does not contain '방법' (negation of is_instruction)."""
    return not is_instruction(x)


# Keep only the paths that are not flagged as instruction files.
# NOTE(review): `filtered_file_paths` is not consumed by any cell visible here;
# verify whether the probing loop was meant to iterate it instead of `file_paths`.
filtered_file_paths = [path for path in file_paths if is_not_instruction(path)]

In [None]:
def read_csv(file_path, encodings=("utf-8", "cp949"), seps=(",", "|")):
    """Read a CSV file by probing encoding/separator combinations.

    Encodings are tried in order; for each encoding every separator is tried.
    A parse that yields more than one column is returned immediately. A parse
    that yields a single column is only kept as a fallback, because a file
    delimited by another character (e.g. ``|``) still parses "successfully"
    with ``sep=","`` as one wide column, which would otherwise hide the real
    separator.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file to read.
    encodings : sequence of str, optional
        Candidate encodings, tried in order. Defaults to ``("utf-8", "cp949")``.
    seps : sequence of str, optional
        Candidate field separators, tried in order. Defaults to ``(",", "|")``.

    Returns
    -------
    tuple
        ``(df, encoding, sep)`` for the chosen combination, or
        ``(None, None, None)`` when no combination could be read.
    """
    for encoding in encodings:
        # First parse under this encoding that succeeded but produced a single
        # column; used only if no separator yields several columns.
        fallback = None
        for sep in seps:
            try:
                df = pd.read_csv(file_path, encoding=encoding, sep=sep, low_memory=False)
            except (UnicodeDecodeError, pd.errors.ParserError):
                # Expected while probing: wrong encoding or inconsistent field counts.
                continue
            except Exception as exc:
                # Unexpected failure (missing file, empty file, ...): report it
                # with enough detail to diagnose, then keep probing.
                print(f"{file_path}: {type(exc).__name__}: {exc}")
                continue
            if df.shape[1] > 1:
                return (df, encoding, sep)
            if fallback is None:
                fallback = (df, encoding, sep)
        if fallback is not None:
            # Genuinely single-column file (or no separator split it further).
            return fallback
    return (None, None, None)

In [None]:
# Probe each discovered CSV and record how it could be read.
#   failed_files     - paths for which read_csv returned no DataFrame
#   file_read_params - path -> {"encoding", "sep"} of the successful read
#   file_columns     - path -> list of column labels of the parsed frame
# NOTE(review): this iterates `file_paths`, not `filtered_file_paths` — confirm
# that instruction files are meant to be probed as well.
failed_files = []
file_read_params = {}
file_columns = {}

for file_path in tqdm(file_paths):
    df, encoding, sep = read_csv(file_path)
    if df is None:
        failed_files.append(file_path)
        continue
    file_read_params[file_path] = dict(encoding=encoding, sep=sep)
    file_columns[file_path] = [column for column in df.columns]

In [None]:
# One list of column labels per successfully read file, in insertion order.
all_columns = [columns for columns in file_columns.values()]

In [None]:
from collections import Counter

In [None]:
# Count how many files contain each column label.
# A generator feeds Counter directly; sum(list_of_lists, []) re-copies the
# accumulated list on every addition and is quadratic in the total label count.
Counter(column for columns in all_columns for column in columns)

In [None]:
# Display the paths for which read_csv returned no DataFrame
# (no attempted encoding/separator combination could be read).
failed_files

In [None]:
# Pick the first unreadable file for manual inspection.
# NOTE(review): raises IndexError when failed_files is empty, and any further
# entries in failed_files are not handled in the cells visible here.
failed_file = failed_files[0]

In [None]:
# Preview the failed file read as UTF-8 with '|' as separator, skipping
# lines the parser rejects, to check that these settings give a usable frame.
pd.read_csv(
    failed_file,
    encoding='utf-8',
    sep='|',
    on_bad_lines='skip',
).head()

In [None]:
# Register the manually determined read settings for the failed file so it is
# included in the saved parameters alongside the automatically probed files.
file_read_params[failed_file] = dict(
    encoding='utf-8',
    sep='|',
    on_bad_lines='skip',
)

In [None]:
import json

# Persist the per-file read settings as JSON.
# The file is opened explicitly as UTF-8: ensure_ascii=False writes non-ASCII
# characters (e.g. in the path keys) verbatim, so relying on the platform's
# default encoding could raise UnicodeEncodeError or produce a file that
# cannot be read back as UTF-8 on another machine.
with open("./data/file_read_params.json", "w", encoding="utf-8") as f:
    json.dump(file_read_params, f, indent=4, ensure_ascii=False)