## AIHub JSON Parsing

### Development Environment

In [None]:
%pip install kss==3.7.3

If KSS raises an argument error after installation, restart the Jupyter kernel (runtime) and run the notebook again.

In [None]:
%pip install python-mecab-ko

KSS 3.7.3 is compatible with `python-mecab-ko`.

In [None]:
%pip install pandas

In [1]:
import re
import os
import kss
import json
from mecab import MeCab
import pandas as pd
from glob import glob

In [30]:
pwd

'd:\\AIHUB'

### Function

In [3]:
def json_file_name_list(path_list):
    """Return the files matched by all glob patterns in ``path_list``.

    Args:
        path_list: Iterable of glob patterns; ``**`` is expanded recursively.

    Returns:
        A single list with the matches of every pattern, in pattern order.
        An empty ``path_list`` yields an empty list.
    """
    file_name = []
    for pattern in path_list:
        # Accumulate instead of overwriting, so earlier patterns are not lost.
        file_name.extend(glob(pattern, recursive=True))
    return file_name

In [4]:
def train_valid_json_file_name_list(path_list):
    """Glob every pattern in ``path_list`` and split the matches by data split.

    A pattern containing ``'rain'`` (e.g. "Training"/"train") is treated as
    training data; otherwise a pattern containing ``'alid'`` (e.g.
    "Validation"/"valid") is treated as validation data. Patterns matching
    neither substring are ignored.

    Args:
        path_list: Iterable of glob patterns; ``**`` is expanded recursively.

    Returns:
        A ``(train_file_name, valid_file_name)`` tuple of lists. Either list is
        empty when no pattern of that split was given.
    """
    train_file_name = []
    valid_file_name = []
    for pattern in path_list:
        # Accumulate per split so that several patterns of the same split
        # all contribute, instead of the last one overwriting the others.
        if 'rain' in pattern:
            train_file_name.extend(glob(pattern, recursive=True))
        elif 'alid' in pattern:
            valid_file_name.extend(glob(pattern, recursive=True))
    return train_file_name, valid_file_name

In [11]:
def divide_source_file_list(l, n):
    """Yield consecutive slices of ``l`` that hold at most ``n`` items each.

    The last slice is shorter when ``len(l)`` is not a multiple of ``n``.
    """
    chunk_starts = range(0, len(l), n)
    yield from (l[start:start + n] for start in chunk_starts)

In [5]:
def txt_file_name_list(source_file_nested_list, folder_corpus_type_name):
  """Build one ``.txt`` file name per chunk of ``source_file_nested_list``.

  Each name is ``folder_corpus_type_name`` followed by the zero-based chunk
  index and the ``.txt`` extension.
  """
  chunk_count = len(source_file_nested_list)
  return [folder_corpus_type_name + str(index) + ".txt"
          for index in range(chunk_count)]

In [6]:
def train_valid_txt_file_name_list(source_file_nested_list, folder_corpus_type_name):
  """Build one ``.txt`` file name per chunk of a training/validation file list.

  Names are only produced when the first source path contains ``'rain'``
  (e.g. "Training") or ``'alid'`` (e.g. "Validation"); both cases use the
  same naming scheme: ``folder_corpus_type_name`` + chunk index + ``.txt``.

  Args:
    source_file_nested_list: List of chunks, each chunk a list of source paths.
    folder_corpus_type_name: Prefix (folder and corpus name) of every file name.

  Returns:
    A list with one file name per chunk, or an empty list when there are no
    chunks or the first source path belongs to neither split.
  """
  # A category without any source file produces no chunks; indexing [0][0]
  # would raise IndexError in that case, so bail out early.
  if not source_file_nested_list or not source_file_nested_list[0]:
    return []

  train_valid_set = source_file_nested_list[0][0]
  if 'rain' not in train_valid_set and 'alid' not in train_valid_set:
    return []

  return [folder_corpus_type_name + str(i) + ".txt"
          for i in range(len(source_file_nested_list))]

In [7]:
def formal_preprocessing_text(source):
    """Split ``source`` into sentences and return the cleaned, mostly-Hangul ones.

    A sentence is kept only if it starts with a letter, digit, Hangul or
    Hanja character, ends with ``.``, ``!`` or ``?``, has more than five
    whitespace-separated tokens, and, after cleaning, at least half of its
    non-space characters are Hangul.

    Args:
        source: Raw text to split and clean.

    Returns:
        List of cleaned sentences (special characters replaced by spaces,
        ``[...]`` and ``{...}`` spans removed).
    """
    preprocessing_sentence_list = []

    # Remove leading/trailing whitespace before sentence splitting.
    source = source.strip()

    # use_heuristic=False is the setting recommended for formal text
    # (wiki, news, essays).
    for sentence in kss.split_sentences(source, use_heuristic=False,
                                        num_workers=32):
        if not sentence:
            # Indexing an empty string below would raise IndexError.
            continue

        # The sentence must start with a letter/digit/Hangul/Hanja character ...
        starts_with_letter = re.search("^[A-Za-z0-9ㄱ-ㅎ가-힣一-鿕㐀-䶵豈-龎]", sentence[0]) is not None
        # ... and end with one of [. ! ?].
        ends_with_terminator = re.match(r'[.]|[!]|[?]', sentence[-1]) is not None
        if not (starts_with_letter and ends_with_terminator and len(sentence.split()) > 5):
            continue

        # Drop square/curly brackets together with everything inside them.
        sentence = re.sub(r"\[.*?\]|\{.*?\}", "",  sentence)

        # Replace special characters with a space (keeps English letters,
        # Hangul, Hanja, digits, '-', '+' and parentheses).
        sentence = re.sub(r"[^A-Za-z0-9ㄱ-ㅎ가-힣一-鿕㐀-䶵豈-龎()+-]", " ", sentence)

        # Remove leading/trailing whitespace left by the replacements.
        sentence = sentence.strip()

        total_length = len(sentence.replace(" " , ""))
        if total_length == 0:
            continue
        # Count Hangul characters only. Counting whitespace as well (while the
        # denominator excludes spaces) would let spaced non-Hangul sentences
        # pass the filter.
        hangeul_length = len(re.findall(r"[ㄱ-ㅣ가-힣]", sentence))
        hangeul_ratio = hangeul_length / total_length
        # Exclude sentences in which non-Hangul characters exceed 50%.
        if hangeul_ratio >= 0.5:
            preprocessing_sentence_list.append(sentence)

    return preprocessing_sentence_list

In [8]:
def informal_preprocessing_text(source):
    """Split ``source`` into sentences and return the cleaned, mostly-Hangul ones.

    Same filtering and cleaning as ``formal_preprocessing_text`` but the
    sentence splitter runs with ``use_heuristic=True``. A sentence is kept only
    if it starts with a letter, digit, Hangul or Hanja character, ends with
    ``.``, ``!`` or ``?``, has more than five whitespace-separated tokens, and,
    after cleaning, at least half of its non-space characters are Hangul.

    Args:
        source: Raw text to split and clean.

    Returns:
        List of cleaned sentences (special characters replaced by spaces,
        ``[...]`` and ``{...}`` spans removed).
    """
    preprocessing_sentence_list = []

    # Remove leading/trailing whitespace before sentence splitting.
    source = source.strip()

    # use_heuristic=True here; False is the setting recommended for formal
    # text (wiki, news, essays).
    for sentence in kss.split_sentences(source, use_heuristic=True,
                                        num_workers=32):
        if not sentence:
            # Indexing an empty string below would raise IndexError.
            continue

        # The sentence must start with a letter/digit/Hangul/Hanja character ...
        starts_with_letter = re.search("^[A-Za-z0-9ㄱ-ㅎ가-힣一-鿕㐀-䶵豈-龎]", sentence[0]) is not None
        # ... and end with one of [. ! ?].
        ends_with_terminator = re.match(r'[.]|[!]|[?]', sentence[-1]) is not None
        if not (starts_with_letter and ends_with_terminator and len(sentence.split()) > 5):
            continue

        # Drop square/curly brackets together with everything inside them.
        sentence = re.sub(r"\[.*?\]|\{.*?\}", "",  sentence)

        # Replace special characters with a space (keeps English letters,
        # Hangul, Hanja, digits, '-', '+' and parentheses).
        sentence = re.sub(r"[^A-Za-z0-9ㄱ-ㅎ가-힣一-鿕㐀-䶵豈-龎()+-]", " ", sentence)

        # Remove leading/trailing whitespace left by the replacements.
        sentence = sentence.strip()

        total_length = len(sentence.replace(" " , ""))
        if total_length == 0:
            continue
        # Count Hangul characters only. Counting whitespace as well (while the
        # denominator excludes spaces) would let spaced non-Hangul sentences
        # pass the filter.
        hangeul_length = len(re.findall(r"[ㄱ-ㅣ가-힣]", sentence))
        hangeul_ratio = hangeul_length / total_length
        # Exclude sentences in which non-Hangul characters exceed 50%.
        if hangeul_ratio >= 0.5:
            preprocessing_sentence_list.append(sentence)

    return preprocessing_sentence_list

### AIHUB 대규모 웹데이터 기반 한국어 말뭉치 데이터

[Source](https://www.aihub.or.kr/aihubdata/data/view.do?currMenu=115&topMenu=100&aihubDataSe=realm&dataSetSn=624)

In [20]:
# Recursive glob patterns for the source JSON files: the 17 Training (TS1)
# categories first, followed by the same 17 categories of Validation (VS1).
_dataset_root = 'AIHUB_웹데이터 기반 한국어 말뭉치 데이터/'
_category_names = ['IT_과학', '건강', '경제', '교육', '국제', '라이프스타일',
                   '문화', '사건사고', '사회일반', '산업', '스포츠', '여성복지',
                   '여행레저', '연예', '정치', '지역', '취미']

path_list = [
    _dataset_root + _split_dir + _category + '/' + '**/*.json'
    for _split_dir in ('Training/원천데이터/TS1/', 'Validation/원천데이터/VS1/')
    for _category in _category_names
]

# One list of matched JSON files per training category (patterns 0-16).
(train_file_name_01, train_file_name_02, train_file_name_03,
 train_file_name_04, train_file_name_05, train_file_name_06,
 train_file_name_07, train_file_name_08, train_file_name_09,
 train_file_name_10, train_file_name_11, train_file_name_12,
 train_file_name_13, train_file_name_14, train_file_name_15,
 train_file_name_16, train_file_name_17) = [
    glob(_pattern, recursive = True) for _pattern in path_list[:17]
]

# One list of matched JSON files per validation category (patterns 17-33).
(valid_file_name_01, valid_file_name_02, valid_file_name_03,
 valid_file_name_04, valid_file_name_05, valid_file_name_06,
 valid_file_name_07, valid_file_name_08, valid_file_name_09,
 valid_file_name_10, valid_file_name_11, valid_file_name_12,
 valid_file_name_13, valid_file_name_14, valid_file_name_15,
 valid_file_name_16, valid_file_name_17) = [
    glob(_pattern, recursive = True) for _pattern in path_list[17:]
]

In [21]:
# The chunk size is derived from the first training category only (about ten
# chunks for that category) and reused for every other category.
the_number_of_file = len(train_file_name_01)
print("The number of file: ", the_number_of_file)
# Clamp to at least 1: with fewer than 10 files the integer division gives 0,
# and range() inside divide_source_file_list rejects a step of zero.
n = max(1, the_number_of_file // 10)
print("The number of list element:", n)

# Split every training category into chunks of at most n files.
train_file_nested_list_01 = list(divide_source_file_list(train_file_name_01, n))
train_file_nested_list_02 = list(divide_source_file_list(train_file_name_02, n))
train_file_nested_list_03 = list(divide_source_file_list(train_file_name_03, n))
train_file_nested_list_04 = list(divide_source_file_list(train_file_name_04, n))
train_file_nested_list_05 = list(divide_source_file_list(train_file_name_05, n))
train_file_nested_list_06 = list(divide_source_file_list(train_file_name_06, n))
train_file_nested_list_07 = list(divide_source_file_list(train_file_name_07, n))
train_file_nested_list_08 = list(divide_source_file_list(train_file_name_08, n))
train_file_nested_list_09 = list(divide_source_file_list(train_file_name_09, n))
train_file_nested_list_10 = list(divide_source_file_list(train_file_name_10, n))
train_file_nested_list_11 = list(divide_source_file_list(train_file_name_11, n))
train_file_nested_list_12 = list(divide_source_file_list(train_file_name_12, n))
train_file_nested_list_13 = list(divide_source_file_list(train_file_name_13, n))
train_file_nested_list_14 = list(divide_source_file_list(train_file_name_14, n))
train_file_nested_list_15 = list(divide_source_file_list(train_file_name_15, n))
train_file_nested_list_16 = list(divide_source_file_list(train_file_name_16, n))
train_file_nested_list_17 = list(divide_source_file_list(train_file_name_17, n))

# Split every validation category into chunks of at most n files.
valid_file_nested_list_01 = list(divide_source_file_list(valid_file_name_01, n))
valid_file_nested_list_02 = list(divide_source_file_list(valid_file_name_02, n))
valid_file_nested_list_03 = list(divide_source_file_list(valid_file_name_03, n))
valid_file_nested_list_04 = list(divide_source_file_list(valid_file_name_04, n))
valid_file_nested_list_05 = list(divide_source_file_list(valid_file_name_05, n))
valid_file_nested_list_06 = list(divide_source_file_list(valid_file_name_06, n))
valid_file_nested_list_07 = list(divide_source_file_list(valid_file_name_07, n))
valid_file_nested_list_08 = list(divide_source_file_list(valid_file_name_08, n))
valid_file_nested_list_09 = list(divide_source_file_list(valid_file_name_09, n))
valid_file_nested_list_10 = list(divide_source_file_list(valid_file_name_10, n))
valid_file_nested_list_11 = list(divide_source_file_list(valid_file_name_11, n))
valid_file_nested_list_12 = list(divide_source_file_list(valid_file_name_12, n))
valid_file_nested_list_13 = list(divide_source_file_list(valid_file_name_13, n))
valid_file_nested_list_14 = list(divide_source_file_list(valid_file_name_14, n))
valid_file_nested_list_15 = list(divide_source_file_list(valid_file_name_15, n))
valid_file_nested_list_16 = list(divide_source_file_list(valid_file_name_16, n))
valid_file_nested_list_17 = list(divide_source_file_list(valid_file_name_17, n))

The number of file:  1217
The number of list element: 121


In [None]:
# Build the output .txt file names for every category chunk. Each name is
#   <prefix><split>_<category>_<two-digit category number>_<chunk index>.txt
# where the chunk index and ".txt" are appended by
# train_valid_txt_file_name_list.
_txt_prefix = "demo/web_data_based_korean_corpus_data/AIHUB_web_data_based_korean_corpus_data_"
_txt_categories = ["IT_과학", "건강", "경제", "교육", "국제", "라이프스타일",
                   "문화", "사건사고", "사회일반", "산업", "스포츠", "여성복지",
                   "여행레저", "연예", "정치", "지역", "취미"]

_train_nested_lists = [
    train_file_nested_list_01, train_file_nested_list_02, train_file_nested_list_03,
    train_file_nested_list_04, train_file_nested_list_05, train_file_nested_list_06,
    train_file_nested_list_07, train_file_nested_list_08, train_file_nested_list_09,
    train_file_nested_list_10, train_file_nested_list_11, train_file_nested_list_12,
    train_file_nested_list_13, train_file_nested_list_14, train_file_nested_list_15,
    train_file_nested_list_16, train_file_nested_list_17,
]
_valid_nested_lists = [
    valid_file_nested_list_01, valid_file_nested_list_02, valid_file_nested_list_03,
    valid_file_nested_list_04, valid_file_nested_list_05, valid_file_nested_list_06,
    valid_file_nested_list_07, valid_file_nested_list_08, valid_file_nested_list_09,
    valid_file_nested_list_10, valid_file_nested_list_11, valid_file_nested_list_12,
    valid_file_nested_list_13, valid_file_nested_list_14, valid_file_nested_list_15,
    valid_file_nested_list_16, valid_file_nested_list_17,
]

_train_text_file_name_lists = [
    train_valid_txt_file_name_list(
        _nested_list,
        "{}train_{}_{:02d}_".format(_txt_prefix, _category, _number))
    for _number, (_nested_list, _category)
    in enumerate(zip(_train_nested_lists, _txt_categories), start=1)
]
_valid_text_file_name_lists = [
    train_valid_txt_file_name_list(
        _nested_list,
        "{}valid_{}_{:02d}_".format(_txt_prefix, _category, _number))
    for _number, (_nested_list, _category)
    in enumerate(zip(_valid_nested_lists, _txt_categories), start=1)
]

# Keep the per-category names that the later make_corpus_txt cells use.
(train_text_file_name_list_01, train_text_file_name_list_02, train_text_file_name_list_03,
 train_text_file_name_list_04, train_text_file_name_list_05, train_text_file_name_list_06,
 train_text_file_name_list_07, train_text_file_name_list_08, train_text_file_name_list_09,
 train_text_file_name_list_10, train_text_file_name_list_11, train_text_file_name_list_12,
 train_text_file_name_list_13, train_text_file_name_list_14, train_text_file_name_list_15,
 train_text_file_name_list_16, train_text_file_name_list_17) = _train_text_file_name_lists

(valid_text_file_name_list_01, valid_text_file_name_list_02, valid_text_file_name_list_03,
 valid_text_file_name_list_04, valid_text_file_name_list_05, valid_text_file_name_list_06,
 valid_text_file_name_list_07, valid_text_file_name_list_08, valid_text_file_name_list_09,
 valid_text_file_name_list_10, valid_text_file_name_list_11, valid_text_file_name_list_12,
 valid_text_file_name_list_13, valid_text_file_name_list_14, valid_text_file_name_list_15,
 valid_text_file_name_list_16, valid_text_file_name_list_17) = _valid_text_file_name_lists

# Total number of output files over both splits. sum() replaces the long
# backslash-continued addition, which contained a stray "/" (a SyntaxError).
the_numer_of_txt_file = sum(
    len(_names)
    for _names in _train_text_file_name_lists + _valid_text_file_name_lists
)

print("The number of txt file:", the_numer_of_txt_file )

In [28]:
def make_corpus_txt(source_file_nested_list, text_file_name_list):
  """Preprocess chunks of AIHub JSON files into one text file per chunk.

  For every (chunk, file name) pair, each JSON file of the chunk is loaded,
  the ``content`` values under ``['SJML']['text']`` are passed through
  ``formal_preprocessing_text``, and the resulting sentences are appended,
  one per line, to ``AIHUB_corpus/<file name>``. Progress is printed as a
  running chunk counter.

  Args:
    source_file_nested_list: List of chunks. A chunk is a list of JSON file
      paths; a bare path string is accepted and treated as a one-file chunk.
    text_file_name_list: Output file names (relative to ``AIHUB_corpus/``),
      paired with the chunks by position.

  Returns:
    None. The output files are opened in append mode, so running the function
    again for the same names adds the sentences a second time.
  """
  print("[Size]")
  print("The numnber of preprocessing corpus: " + str(len(source_file_nested_list)))
  print("\n[Order]")

  for num, (source_file_list, txt_file_name) in enumerate(
      zip(source_file_nested_list, text_file_name_list), start=1):
    print(str(num), end=" ")

    # A bare string is a single path. Wrap it so the loop below reads the file
    # once instead of once per character of the path.
    if isinstance(source_file_list, str):
      source_file_list = [source_file_list]

    sentence_list = []
    for source_file in source_file_list:
      with open(source_file, 'r', encoding='utf-8') as one_json_file:
        one_json_sample = json.load(one_json_file)

      source_list = list(pd.DataFrame(one_json_sample['SJML']['text'])['content'])
      for source in source_list:
        sentence_list.extend(formal_preprocessing_text(source))

    output_path = os.path.join('AIHUB_corpus/', txt_file_name)
    # Create the target folder up front so a missing directory does not
    # discard the work done for this chunk.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, "a", encoding='utf-8') as fp:
      # Terminate every sentence with a newline: the file is opened in append
      # mode and later concatenated with other files, so a missing final
      # newline would glue two sentences onto one line.
      fp.write("".join(sentence + "\n" for sentence in sentence_list))

In [None]:
make_corpus_txt(train_file_nested_list_01, train_text_file_name_list_01)

In [None]:
make_corpus_txt(train_file_nested_list_02, train_text_file_name_list_02)

In [None]:
make_corpus_txt(train_file_nested_list_03, train_text_file_name_list_03)

In [None]:
make_corpus_txt(train_file_nested_list_04, train_text_file_name_list_04)

In [None]:
make_corpus_txt(train_file_nested_list_05, train_text_file_name_list_05)

In [None]:
make_corpus_txt(train_file_nested_list_06, train_text_file_name_list_06)

In [None]:
make_corpus_txt(train_file_nested_list_07, train_text_file_name_list_07)

In [None]:
make_corpus_txt(train_file_nested_list_08, train_text_file_name_list_08)

In [None]:
make_corpus_txt(train_file_nested_list_09, train_text_file_name_list_09)

In [None]:
make_corpus_txt(train_file_nested_list_10, train_text_file_name_list_10)

In [None]:
make_corpus_txt(train_file_nested_list_11, train_text_file_name_list_11)

In [None]:
make_corpus_txt(train_file_nested_list_12, train_text_file_name_list_12)

In [None]:
make_corpus_txt(train_file_nested_list_13, train_text_file_name_list_13)

In [None]:
make_corpus_txt(train_file_nested_list_14, train_text_file_name_list_14)

In [None]:
make_corpus_txt(train_file_nested_list_15, train_text_file_name_list_15)

In [None]:
make_corpus_txt(train_file_nested_list_16, train_text_file_name_list_16)

In [None]:
make_corpus_txt(train_file_nested_list_17, train_text_file_name_list_17)

In [None]:
make_corpus_txt(valid_file_nested_list_01, valid_text_file_name_list_01)

In [None]:
make_corpus_txt(valid_file_nested_list_02, valid_text_file_name_list_02)

In [None]:
make_corpus_txt(valid_file_nested_list_03, valid_text_file_name_list_03)

In [None]:
make_corpus_txt(valid_file_nested_list_04, valid_text_file_name_list_04)

In [None]:
make_corpus_txt(valid_file_nested_list_05, valid_text_file_name_list_05)

In [None]:
make_corpus_txt(valid_file_nested_list_06, valid_text_file_name_list_06) 

In [None]:
make_corpus_txt(valid_file_nested_list_07, valid_text_file_name_list_07)

In [None]:
make_corpus_txt(valid_file_nested_list_08, valid_text_file_name_list_08)

In [None]:
make_corpus_txt(valid_file_nested_list_09, valid_text_file_name_list_09)

In [None]:
make_corpus_txt(valid_file_nested_list_10, valid_text_file_name_list_10)

In [None]:
make_corpus_txt(valid_file_nested_list_11, valid_text_file_name_list_11)

In [None]:
make_corpus_txt(valid_file_nested_list_12, valid_text_file_name_list_12)

In [None]:
make_corpus_txt(valid_file_nested_list_13, valid_text_file_name_list_13)

In [None]:
make_corpus_txt(valid_file_nested_list_14, valid_text_file_name_list_14)

In [None]:
make_corpus_txt(valid_file_nested_list_15, valid_text_file_name_list_15)

In [None]:
make_corpus_txt(valid_file_nested_list_16, valid_text_file_name_list_16)

In [None]:
make_corpus_txt(valid_file_nested_list_17, valid_text_file_name_list_17)

In [None]:
# Gather every per-chunk corpus file: training files first, then validation.
corpus_list = []
for _split_prefix in ("train_", "valid_"):
    _pattern = ("AIHUB_corpus/demo/web_data_based_korean_corpus_data/AIHUB_web_data_based_korean_corpus_data_"
                + _split_prefix + "*.txt")
    corpus_list.extend(glob(_pattern))

In [None]:
# Concatenate every per-chunk corpus file into a single corpus file.
with open('AIHUB_corpus/demo/AIHUB_web_data_based_korean_corpus_data.txt', 'w', encoding='utf-8') as f:
    for corpus in corpus_list:
        last_line = ""
        with open(corpus, encoding='utf-8') as text:
            for line in text:
                f.write(line)
                last_line = line
        # make_corpus_txt writes "\n".join(sentences), which leaves no
        # trailing newline. Add one here so the last sentence of this file is
        # not merged with the first sentence of the next file.
        if last_line and not last_line.endswith("\n"):
            f.write("\n")