## AIHub Json Parsing

### Development Environment

In [None]:
%pip install kss==3.7.3

If KSS raises an argument error after installation, restart the Jupyter kernel (runtime) and run the notebook again.

In [None]:
%pip install python-mecab-ko

KSS 3.7.3 is the version that works with python-mecab-ko.

In [None]:
%pip install pandas

In [None]:
%pip install ray

In [1]:
import re
import os
import kss
import ray
import json
from mecab import MeCab
import pandas as pd
from glob import glob
from itertools import chain

In [2]:
pwd

'D:\\AIHUB'

### Function

In [3]:
def sorted_list(path_list):
    """Return a new list ordered by string length, then in ascending order.

    Shorter entries come first (so ``"x_2"`` precedes ``"x_10"``); entries of
    equal length keep plain ascending order. The input is left untouched.
    """
    return sorted(path_list, key=lambda path: (len(path), path))

In [4]:
def json_file_name_list(path_list):
    """Return the sorted files matched by the first glob pattern in ``path_list``.

    Every pattern is expanded recursively, but only the matches of the first
    pattern are kept and ordered with ``sorted_list``.
    NOTE(review): matches of any further patterns are discarded — confirm that
    callers only ever pass a single pattern.
    """
    matches_per_pattern = []
    for pattern in path_list:
        matches_per_pattern.append(glob(pattern, recursive=True))

    return sorted_list(matches_per_pattern[0])

In [5]:
def train_valid_json_file_name_list(path_list):
  """Expand a (train, valid) pair of glob patterns into two sorted file lists.

  ``path_list`` must hold exactly two patterns, training first: the results
  are unpacked positionally, so any other length raises ``ValueError``.
  Both patterns are expanded recursively and ordered with ``sorted_list``.
  """
  train_matches, valid_matches = [glob(pattern, recursive=True)
                                  for pattern in path_list]

  return sorted_list(train_matches), sorted_list(valid_matches)

In [6]:
def divide_source_file_list(l, n):
  """Yield consecutive slices of ``l`` that hold at most ``n`` items each.

  The last slice may be shorter. ``n`` is used as the ``range`` step.
  """
  chunk_starts = range(0, len(l), n)
  yield from (l[start:start + n] for start in chunk_starts)

In [7]:
def txt_file_name_list(source_file_nested_list, folder_corpus_type_name):
  """Build one ``<prefix><index>.txt`` name per group of source files.

  ``folder_corpus_type_name`` is the prefix; the index runs from 0 to
  ``len(source_file_nested_list) - 1``.
  """
  text_file_name_list = []
  for group_index in range(len(source_file_nested_list)):
    text_file_name_list.append(folder_corpus_type_name + str(group_index) + ".txt")

  return text_file_name_list

In [8]:
def post_txt_file_name_list(corpus_list):
  """Derive the "post" file names from the "pro" ones and return them sorted.

  Every occurrence of the substring "pro" in each name is replaced with
  "post" (not only the directory suffix), then the names are ordered with
  ``sorted_list``.
  """
  renamed_files = []
  for corpus_file in corpus_list:
    renamed_files.append(corpus_file.replace("pro", "post"))

  return sorted_list(renamed_files)

In [9]:
def _strip_parentheticals(source):
    """Drop or unwrap round-bracketed text ahead of sentence segmentation."""
    for inner in re.findall(r"\(([^)]+)", source):
        # "(inner)." -> ".": a parenthetical directly before a full stop is
        # removed together with its brackets. Plain string replacement avoids
        # the index arithmetic that used to read past the end of the text.
        source = source.replace("(" + inner + ").", ".")

        # A parenthetical of more than five words that ends like a sentence
        # keeps its text and only loses the brackets.
        if len(inner.split()) > 5 and inner[-1] in ".!?":
            source = source.replace("(" + inner + ")", inner)

    return source


def _strip_enumerators(source):
    """Remove list markers of the form '가.', '나.', ... '하.'."""
    # Markers at the very start of the text (several may follow each other).
    source = re.sub(r"^(?:[가나다라마바사아자차카타파하]\.\s*)+", "", source)
    # Markers inside the text, preceded by a space. The dot is escaped so that
    # ordinary words starting with these syllables are left alone.
    return re.sub(r" [가나다라마바사아자차카타파하]\.", "", source)


def formal_preprocessing_text(source):
    """Clean one formal-register text and return its usable sentences.

    Steps, in order:
      1. Strip surrounding whitespace and delete ``[...]`` / ``{...}`` spans.
      2. Drop parentheticals that sit right before a full stop; unwrap long
         parentheticals that are sentences themselves.
      3. Remove '가.', '나.', ... list markers.
      4. Split into sentences with KSS and keep only sentences that start with
         a letter, digit, Hangul or Hanja character, end with '.', '!' or '?',
         and contain more than five words.
      5. Replace every character outside the allowed set with a space and drop
         sentences in which Hangul makes up less than half of the characters.
      6. Re-split each surviving sentence with KSS twice more.

    Args:
        source: raw text (``str``).

    Returns:
        list of cleaned sentence strings, possibly empty.
    """
    preprocessing_sentence_list = []

    source = source.strip()
    # Square and curly brackets are removed together with their contents.
    source = re.sub(r"\[.*?\]|\{.*?\}", "", source)
    source = _strip_parentheticals(source)
    source = _strip_enumerators(source)

    # use_heuristic=False is what KSS recommends for formal articles
    # (wiki, news, essays).
    for sentence in kss.split_sentences(source, use_heuristic=False,
                                        num_workers=32):
        if not sentence:
            continue

        starts_with_text = re.match("[A-Za-z0-9ㄱ-ㅎ가-힣一-鿕㐀-䶵豈-龎]", sentence) is not None
        ends_like_sentence = sentence[-1] in ".!?"
        if not (starts_with_text and ends_like_sentence and len(sentence.split()) > 5):
            continue

        # A lone closing bracket is the tail of a "name/title]" prefix:
        # remove everything up to and including it.
        if ']' in sentence and '[' not in sentence:
            sentence = re.sub(r".*?]", "", sentence)

        # Keep letters, digits, Hangul, Hanja, round brackets and the
        # characters '+' through '.' (which includes ',' and '-').
        sentence = re.sub(r"[^A-Za-z0-9ㄱ-ㅎ가-힣一-鿕㐀-䶵豈-龎()+-.,]", " ", sentence)
        sentence = sentence.strip()

        compact_sentence = sentence.replace(" ", "")
        total_length = len(compact_sentence)
        if total_length == 0:
            # Cleaning removed everything; nothing to measure or keep.
            continue

        hangeul_length = len(re.sub(r"[^ㄱ-ㅣ가-힣\s]", "", compact_sentence))
        if hangeul_length / total_length >= 0.5:
            # Sentences that are less than half Hangul are discarded; the rest
            # are deliberately segmented two more times.
            for sentence2 in kss.split_sentences(sentence, use_heuristic=False,
                                                 num_workers=32):
                for sentence3 in kss.split_sentences(sentence2, use_heuristic=False,
                                                     num_workers=32):
                    preprocessing_sentence_list.append(sentence3)

    return preprocessing_sentence_list

### AIHUB 대규모 웹데이터 기반 한국어 말뭉치 데이터

[Source](https://www.aihub.or.kr/aihubdata/data/view.do?currMenu=115&topMenu=100&aihubDataSe=realm&dataSetSn=624)

In [10]:
# One recursive JSON pattern per (split, category): the 17 training categories
# first, then the same 17 categories of the validation split.
path_list = [
    'AIHUB_웹데이터 기반 한국어 말뭉치 데이터/' + split_dir + category + '/' + '**/*.json'
    for split_dir in ('Training/원천데이터/TS1/', 'Validation/원천데이터/VS1/')
    for category in ('IT_과학', '건강', '경제', '교육', '국제', '라이프스타일',
                     '문화', '사건사고', '사회일반', '산업', '스포츠', '여성복지',
                     '여행레저', '연예', '정치', '지역', '취미')
]

# Expand every pattern once, in list order, and bind the results to the
# numbered names used by the following cells.
(train_file_name_01, train_file_name_02, train_file_name_03,
 train_file_name_04, train_file_name_05, train_file_name_06,
 train_file_name_07, train_file_name_08, train_file_name_09,
 train_file_name_10, train_file_name_11, train_file_name_12,
 train_file_name_13, train_file_name_14, train_file_name_15,
 train_file_name_16, train_file_name_17,
 valid_file_name_01, valid_file_name_02, valid_file_name_03,
 valid_file_name_04, valid_file_name_05, valid_file_name_06,
 valid_file_name_07, valid_file_name_08, valid_file_name_09,
 valid_file_name_10, valid_file_name_11, valid_file_name_12,
 valid_file_name_13, valid_file_name_14, valid_file_name_15,
 valid_file_name_16, valid_file_name_17) = [
    glob(pattern, recursive=True) for pattern in path_list
]

In [11]:
# The shard size for every category is derived from the first training
# category: one tenth of its file count. It is clamped to at least 1 because a
# step of 0 (fewer than 10 files) makes divide_source_file_list fail inside range().
the_number_of_file = len(train_file_name_01)
print("The number of file: ", the_number_of_file)
n = max(1, the_number_of_file // 10)
print("The number of list element:", n)

# Split each category's file list into shards of at most n files.
(train_file_nested_list_01, train_file_nested_list_02, train_file_nested_list_03,
 train_file_nested_list_04, train_file_nested_list_05, train_file_nested_list_06,
 train_file_nested_list_07, train_file_nested_list_08, train_file_nested_list_09,
 train_file_nested_list_10, train_file_nested_list_11, train_file_nested_list_12,
 train_file_nested_list_13, train_file_nested_list_14, train_file_nested_list_15,
 train_file_nested_list_16, train_file_nested_list_17) = [
    list(divide_source_file_list(file_names, n))
    for file_names in (
        train_file_name_01, train_file_name_02, train_file_name_03,
        train_file_name_04, train_file_name_05, train_file_name_06,
        train_file_name_07, train_file_name_08, train_file_name_09,
        train_file_name_10, train_file_name_11, train_file_name_12,
        train_file_name_13, train_file_name_14, train_file_name_15,
        train_file_name_16, train_file_name_17)
]

(valid_file_nested_list_01, valid_file_nested_list_02, valid_file_nested_list_03,
 valid_file_nested_list_04, valid_file_nested_list_05, valid_file_nested_list_06,
 valid_file_nested_list_07, valid_file_nested_list_08, valid_file_nested_list_09,
 valid_file_nested_list_10, valid_file_nested_list_11, valid_file_nested_list_12,
 valid_file_nested_list_13, valid_file_nested_list_14, valid_file_nested_list_15,
 valid_file_nested_list_16, valid_file_nested_list_17) = [
    list(divide_source_file_list(file_names, n))
    for file_names in (
        valid_file_name_01, valid_file_name_02, valid_file_name_03,
        valid_file_name_04, valid_file_name_05, valid_file_name_06,
        valid_file_name_07, valid_file_name_08, valid_file_name_09,
        valid_file_name_10, valid_file_name_11, valid_file_name_12,
        valid_file_name_13, valid_file_name_14, valid_file_name_15,
        valid_file_name_16, valid_file_name_17)
]

The number of file:  1217
The number of list element: 121


In [16]:
# Output-name prefixes have the form "<directory>/<corpus>_<split>_<NN>_<category>_".
# They are generated from a single template so every category lands in the
# same "exploration/" directory (two validation prefixes used to be misspelled
# "explorationo/").
corpus_category_names = ['IT_과학', '건강', '경제', '교육', '국제', '라이프스타일',
                         '문화', '사건사고', '사회일반', '산업', '스포츠', '여성복지',
                         '여행레저', '연예', '정치', '지역', '취미']
text_file_prefix_template = ("exploration/web_data_based_korean_corpus_data_pro/"
                             "AIHUB_web_data_based_korean_corpus_data_{split}_{number:02d}_{category}_")

train_text_file_name_lists = [
    txt_file_name_list(
        nested_list,
        text_file_prefix_template.format(split="train", number=number, category=category))
    for number, (category, nested_list) in enumerate(
        zip(corpus_category_names,
            (train_file_nested_list_01, train_file_nested_list_02, train_file_nested_list_03,
             train_file_nested_list_04, train_file_nested_list_05, train_file_nested_list_06,
             train_file_nested_list_07, train_file_nested_list_08, train_file_nested_list_09,
             train_file_nested_list_10, train_file_nested_list_11, train_file_nested_list_12,
             train_file_nested_list_13, train_file_nested_list_14, train_file_nested_list_15,
             train_file_nested_list_16, train_file_nested_list_17)),
        start=1)
]
(train_text_file_name_list_01, train_text_file_name_list_02, train_text_file_name_list_03,
 train_text_file_name_list_04, train_text_file_name_list_05, train_text_file_name_list_06,
 train_text_file_name_list_07, train_text_file_name_list_08, train_text_file_name_list_09,
 train_text_file_name_list_10, train_text_file_name_list_11, train_text_file_name_list_12,
 train_text_file_name_list_13, train_text_file_name_list_14, train_text_file_name_list_15,
 train_text_file_name_list_16, train_text_file_name_list_17) = train_text_file_name_lists

valid_text_file_name_lists = [
    txt_file_name_list(
        nested_list,
        text_file_prefix_template.format(split="valid", number=number, category=category))
    for number, (category, nested_list) in enumerate(
        zip(corpus_category_names,
            (valid_file_nested_list_01, valid_file_nested_list_02, valid_file_nested_list_03,
             valid_file_nested_list_04, valid_file_nested_list_05, valid_file_nested_list_06,
             valid_file_nested_list_07, valid_file_nested_list_08, valid_file_nested_list_09,
             valid_file_nested_list_10, valid_file_nested_list_11, valid_file_nested_list_12,
             valid_file_nested_list_13, valid_file_nested_list_14, valid_file_nested_list_15,
             valid_file_nested_list_16, valid_file_nested_list_17)),
        start=1)
]
(valid_text_file_name_list_01, valid_text_file_name_list_02, valid_text_file_name_list_03,
 valid_text_file_name_list_04, valid_text_file_name_list_05, valid_text_file_name_list_06,
 valid_text_file_name_list_07, valid_text_file_name_list_08, valid_text_file_name_list_09,
 valid_text_file_name_list_10, valid_text_file_name_list_11, valid_text_file_name_list_12,
 valid_text_file_name_list_13, valid_text_file_name_list_14, valid_text_file_name_list_15,
 valid_text_file_name_list_16, valid_text_file_name_list_17) = valid_text_file_name_lists

# Total number of output names across both splits (the variable keeps its
# original spelling because other cells may refer to it).
the_numer_of_txt_file = sum(
    len(names) for names in train_text_file_name_lists + valid_text_file_name_lists)

print("The number of txt file:", the_numer_of_txt_file)

The number of txt file: 503


In [None]:
def list_length_checker(source_file_nested_list, batch_size):
    """Count how many batched text files each source JSON file will produce.

    Each JSON file is read, the ``content`` column of ``SJML.text`` is taken
    as its list of texts, and the number of batches of at most ``batch_size``
    texts is computed with ceiling division, i.e. the number of chunks that
    ``divide_source_file_list`` yields for that list. Files that need more
    than one batch are reported on stdout.

    Args:
        source_file_nested_list: iterable of groups of JSON file paths; a
            group given as a single path string is treated as one file.
        batch_size: maximum number of texts per output file (>= 1).

    Returns:
        ``(total, per_file)``: the total number of output files and the list
        of per-file counts in processing order.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    the_number_of_total_txt_file = 0
    the_number_of_txt_file_list = []

    for source_file_list in source_file_nested_list:
        # A bare path string is one file, not a sequence of characters.
        if isinstance(source_file_list, str):
            source_file_list = [source_file_list]

        for i, source_file in enumerate(source_file_list):
            with open(source_file, 'r', encoding='utf-8') as one_json_file:
                one_json_sample = json.load(one_json_file)

            source_list = list(pd.DataFrame(one_json_sample['SJML']['text'])['content'])

            # Ceiling division: an exact multiple of batch_size must not be
            # counted as one extra (empty) file.
            the_number_of_txt_file = -(-len(source_list) // batch_size)

            if the_number_of_txt_file > 1:
                print("File:", source_file)
                print("Index:", i, "  ", "Length of Source List:", len(source_list),
                      "  ", "The number of txt file:", the_number_of_txt_file, "\n")

            the_number_of_txt_file_list.append(the_number_of_txt_file)
            the_number_of_total_txt_file += the_number_of_txt_file

    print("Batch Size:", batch_size)
    print("The number of txt file:", the_number_of_total_txt_file)

    return the_number_of_total_txt_file, the_number_of_txt_file_list

In [28]:
def make_corpus_txt_with_batch_list(source_file_nested_list,
                                    text_file_name_list,
                                    batch_size, the_number_of_total_txt_file_list):
  """Write the texts of every source JSON file into batched ``.txt`` files.

  For each group ``i`` of JSON files, the ``content`` column of ``SJML.text``
  is split into batches of at most ``batch_size`` texts. Every batch is
  written, one text per line, to
  ``AIHUB_corpus/<text_file_name_list[i] without ".txt">_<num>.txt`` where
  ``num`` is a counter running over all batches of this call. The counter is
  printed as progress.

  Args:
      source_file_nested_list: sequence of groups of JSON file paths; a group
          given as a single path string is treated as one file.
      text_file_name_list: output base names, one per group, ending in ".txt".
      batch_size: maximum number of texts per output file.
      the_number_of_total_txt_file_list: per-file batch counts; only their
          sum is printed.

  Returns:
      None.
  """
  print("[Size]")
  print("The number of preprocessing corpus: " + str(sum(the_number_of_total_txt_file_list)))
  print("\n[Order]")

  num = 0
  for i, source_file_list in enumerate(source_file_nested_list):
    # A bare path string is one file, not a sequence of characters.
    if isinstance(source_file_list, str):
      source_file_list = [source_file_list]

    for source_file in source_file_list:
      with open(source_file, 'r', encoding='utf-8') as one_json_file:
        one_json_sample = json.load(one_json_file)

      source_list = list(pd.DataFrame(one_json_sample['SJML']['text'])['content'])

      for source_batch in divide_source_file_list(source_list, batch_size):
        num += 1
        print(str(num), end=" ")

        output_path = 'AIHUB_corpus/' + text_file_name_list[i][:-4] + "_" + str(num) + ".txt"
        # The target directory is created on demand so a fresh checkout works.
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

        # Each path is unique within a call, so "w" writes the same content as
        # "a" on a first run and keeps a re-run from appending duplicates.
        with open(output_path, "w", encoding='utf-8') as fp:
          fp.write("\n".join(source_batch))
    

In [None]:
# Training 01 (IT_과학): total and per-file counts of batched text files.
batch_size = 1000
(the_number_of_train_txt_file_01,
 the_number_of_train_txt_file_list_01) = list_length_checker(train_file_nested_list_01, batch_size)

In [None]:
# Training 02 (건강): total and per-file counts of batched text files.
batch_size = 1000
(the_number_of_train_txt_file_02,
 the_number_of_train_txt_file_list_02) = list_length_checker(train_file_nested_list_02, batch_size)

In [None]:
# Training 03 (경제): total and per-file counts of batched text files.
batch_size = 1000
(the_number_of_train_txt_file_03,
 the_number_of_train_txt_file_list_03) = list_length_checker(train_file_nested_list_03, batch_size)

In [None]:
# Training 04 (교육): total and per-file counts of batched text files.
batch_size = 1000
(the_number_of_train_txt_file_04,
 the_number_of_train_txt_file_list_04) = list_length_checker(train_file_nested_list_04, batch_size)

In [None]:
# Training 05 (국제): total and per-file counts of batched text files.
batch_size = 1000
(the_number_of_train_txt_file_05,
 the_number_of_train_txt_file_list_05) = list_length_checker(train_file_nested_list_05, batch_size)

In [None]:
# Training 06 (라이프스타일): total and per-file counts of batched text files.
batch_size = 1000
(the_number_of_train_txt_file_06,
 the_number_of_train_txt_file_list_06) = list_length_checker(train_file_nested_list_06, batch_size)

In [None]:
# Training 07 (문화): total and per-file counts of batched text files.
batch_size = 1000
(the_number_of_train_txt_file_07,
 the_number_of_train_txt_file_list_07) = list_length_checker(train_file_nested_list_07, batch_size)

In [None]:
# Training 08 (사건사고): total and per-file counts of batched text files.
batch_size = 1000
(the_number_of_train_txt_file_08,
 the_number_of_train_txt_file_list_08) = list_length_checker(train_file_nested_list_08, batch_size)

In [None]:
# Training 09 (사회일반): total and per-file counts of batched text files.
batch_size = 1000
(the_number_of_train_txt_file_09,
 the_number_of_train_txt_file_list_09) = list_length_checker(train_file_nested_list_09, batch_size)

In [None]:
# Training 10 (산업): total and per-file counts of batched text files.
batch_size = 1000
(the_number_of_train_txt_file_10,
 the_number_of_train_txt_file_list_10) = list_length_checker(train_file_nested_list_10, batch_size)

In [None]:
# Training 11 (스포츠): total and per-file counts of batched text files.
batch_size = 1000
(the_number_of_train_txt_file_11,
 the_number_of_train_txt_file_list_11) = list_length_checker(train_file_nested_list_11, batch_size)

In [None]:
# Training 12 (여성복지): total and per-file counts of batched text files.
batch_size = 1000
(the_number_of_train_txt_file_12,
 the_number_of_train_txt_file_list_12) = list_length_checker(train_file_nested_list_12, batch_size)

In [None]:
# Training 13 (여행레저): total and per-file counts of batched text files.
batch_size = 1000
(the_number_of_train_txt_file_13,
 the_number_of_train_txt_file_list_13) = list_length_checker(train_file_nested_list_13, batch_size)

In [None]:
# Training 14 (연예): total and per-file counts of batched text files.
batch_size = 1000
(the_number_of_train_txt_file_14,
 the_number_of_train_txt_file_list_14) = list_length_checker(train_file_nested_list_14, batch_size)

In [None]:
# Training 15 (정치): total and per-file counts of batched text files.
batch_size = 1000
(the_number_of_train_txt_file_15,
 the_number_of_train_txt_file_list_15) = list_length_checker(train_file_nested_list_15, batch_size)

In [None]:
# Training 16 (지역): total and per-file counts of batched text files.
batch_size = 1000
(the_number_of_train_txt_file_16,
 the_number_of_train_txt_file_list_16) = list_length_checker(train_file_nested_list_16, batch_size)

In [None]:
# Training 17 (취미): total and per-file counts of batched text files.
batch_size = 1000
(the_number_of_train_txt_file_17,
 the_number_of_train_txt_file_list_17) = list_length_checker(train_file_nested_list_17, batch_size)

In [None]:
# Validation 01 (IT_과학): total and per-file counts of batched text files.
batch_size = 1000
(the_number_of_valid_txt_file_01,
 the_number_of_valid_txt_file_list_01) = list_length_checker(valid_file_nested_list_01, batch_size)

In [None]:
# Validation 02 (건강): total and per-file counts of batched text files.
batch_size = 1000
(the_number_of_valid_txt_file_02,
 the_number_of_valid_txt_file_list_02) = list_length_checker(valid_file_nested_list_02, batch_size)

In [None]:
# Validation 03 (경제): total and per-file counts of batched text files.
batch_size = 1000
(the_number_of_valid_txt_file_03,
 the_number_of_valid_txt_file_list_03) = list_length_checker(valid_file_nested_list_03, batch_size)

In [None]:
# Validation 04 (교육): total and per-file counts of batched text files.
batch_size = 1000
(the_number_of_valid_txt_file_04,
 the_number_of_valid_txt_file_list_04) = list_length_checker(valid_file_nested_list_04, batch_size)

In [None]:
# Validation 05 (국제): total and per-file counts of batched text files.
batch_size = 1000
(the_number_of_valid_txt_file_05,
 the_number_of_valid_txt_file_list_05) = list_length_checker(valid_file_nested_list_05, batch_size)

In [None]:
# Validation 06 (라이프스타일): total and per-file counts of batched text files.
batch_size = 1000
(the_number_of_valid_txt_file_06,
 the_number_of_valid_txt_file_list_06) = list_length_checker(valid_file_nested_list_06, batch_size)

In [None]:
# Validation 07 (문화): total and per-file counts of batched text files.
batch_size = 1000
(the_number_of_valid_txt_file_07,
 the_number_of_valid_txt_file_list_07) = list_length_checker(valid_file_nested_list_07, batch_size)

In [None]:
# Validation 08 (사건사고): total and per-file counts of batched text files.
batch_size = 1000
(the_number_of_valid_txt_file_08,
 the_number_of_valid_txt_file_list_08) = list_length_checker(valid_file_nested_list_08, batch_size)

In [None]:
# Validation 09 (사회일반): total and per-file counts of batched text files.
batch_size = 1000
(the_number_of_valid_txt_file_09,
 the_number_of_valid_txt_file_list_09) = list_length_checker(valid_file_nested_list_09, batch_size)

In [None]:
# Validation 10 (산업): total and per-file counts of batched text files.
batch_size = 1000
(the_number_of_valid_txt_file_10,
 the_number_of_valid_txt_file_list_10) = list_length_checker(valid_file_nested_list_10, batch_size)

In [None]:
# Validation 11 (스포츠): total and per-file counts of batched text files.
batch_size = 1000
(the_number_of_valid_txt_file_11,
 the_number_of_valid_txt_file_list_11) = list_length_checker(valid_file_nested_list_11, batch_size)

In [None]:
# Validation 12 (여성복지): total and per-file counts of batched text files.
batch_size = 1000
(the_number_of_valid_txt_file_12,
 the_number_of_valid_txt_file_list_12) = list_length_checker(valid_file_nested_list_12, batch_size)

In [None]:
# Validation 13 (여행레저): total and per-file counts of batched text files.
batch_size = 1000
(the_number_of_valid_txt_file_13,
 the_number_of_valid_txt_file_list_13) = list_length_checker(valid_file_nested_list_13, batch_size)

In [None]:
# Validation 14 (연예): total and per-file counts of batched text files.
batch_size = 1000
(the_number_of_valid_txt_file_14,
 the_number_of_valid_txt_file_list_14) = list_length_checker(valid_file_nested_list_14, batch_size)

In [None]:
# Validation 15 (정치): total and per-file counts of batched text files.
batch_size = 1000
(the_number_of_valid_txt_file_15,
 the_number_of_valid_txt_file_list_15) = list_length_checker(valid_file_nested_list_15, batch_size)

In [None]:
# Validation 16 (지역): total and per-file counts of batched text files.
batch_size = 1000
(the_number_of_valid_txt_file_16,
 the_number_of_valid_txt_file_list_16) = list_length_checker(valid_file_nested_list_16, batch_size)

In [None]:
batch_size = 1000
the_number_of_valid_txt_file_17, the_number_of_valid_txt_file_list_17 = list_length_checker(valid_file_nested_list_17, batch_size)

In [None]:
# Training splits 01-17: each cell passes the split's nested source-file list,
# its output txt file-name list, batch_size and a per-split count list to
# make_corpus_txt_with_batch_list (defined outside this section).
# NOTE(review): the fourth argument is spelled the_number_of_train_txt_file_NN_list,
# while the validation list_length_checker cells assign names of the form
# the_number_of_valid_txt_file_list_NN. Confirm that the train list_length_checker
# cells really assign the `_NN_list` spelling; otherwise these cells raise
# NameError on a fresh kernel.
batch_size = 1000
make_corpus_txt_with_batch_list(train_file_nested_list_01, train_text_file_name_list_01, batch_size, the_number_of_train_txt_file_01_list)

In [None]:
batch_size = 1000
make_corpus_txt_with_batch_list(train_file_nested_list_02, train_text_file_name_list_02, batch_size, the_number_of_train_txt_file_02_list)

In [None]:
batch_size = 1000
make_corpus_txt_with_batch_list(train_file_nested_list_03, train_text_file_name_list_03, batch_size, the_number_of_train_txt_file_03_list)

In [None]:
batch_size = 1000
make_corpus_txt_with_batch_list(train_file_nested_list_04, train_text_file_name_list_04, batch_size, the_number_of_train_txt_file_04_list)

In [None]:
batch_size = 1000
make_corpus_txt_with_batch_list(train_file_nested_list_05, train_text_file_name_list_05, batch_size, the_number_of_train_txt_file_05_list)

In [None]:
batch_size = 1000
make_corpus_txt_with_batch_list(train_file_nested_list_06, train_text_file_name_list_06, batch_size, the_number_of_train_txt_file_06_list)

In [None]:
batch_size = 1000
make_corpus_txt_with_batch_list(train_file_nested_list_07, train_text_file_name_list_07, batch_size, the_number_of_train_txt_file_07_list)

In [None]:
batch_size = 1000
make_corpus_txt_with_batch_list(train_file_nested_list_08, train_text_file_name_list_08, batch_size, the_number_of_train_txt_file_08_list)

In [None]:
batch_size = 1000
make_corpus_txt_with_batch_list(train_file_nested_list_09, train_text_file_name_list_09, batch_size, the_number_of_train_txt_file_09_list)

In [None]:
batch_size = 1000
make_corpus_txt_with_batch_list(train_file_nested_list_10, train_text_file_name_list_10, batch_size, the_number_of_train_txt_file_10_list)

In [None]:
batch_size = 1000
make_corpus_txt_with_batch_list(train_file_nested_list_11, train_text_file_name_list_11, batch_size, the_number_of_train_txt_file_11_list)

In [None]:
batch_size = 1000
make_corpus_txt_with_batch_list(train_file_nested_list_12, train_text_file_name_list_12, batch_size, the_number_of_train_txt_file_12_list)

In [None]:
batch_size = 1000
make_corpus_txt_with_batch_list(train_file_nested_list_13, train_text_file_name_list_13, batch_size, the_number_of_train_txt_file_13_list)

In [None]:
batch_size = 1000
make_corpus_txt_with_batch_list(train_file_nested_list_14, train_text_file_name_list_14, batch_size, the_number_of_train_txt_file_14_list)

In [None]:
batch_size = 1000
make_corpus_txt_with_batch_list(train_file_nested_list_15, train_text_file_name_list_15, batch_size, the_number_of_train_txt_file_15_list)

In [None]:
batch_size = 1000
make_corpus_txt_with_batch_list(train_file_nested_list_16, train_text_file_name_list_16, batch_size, the_number_of_train_txt_file_16_list)

In [None]:
batch_size = 1000
make_corpus_txt_with_batch_list(train_file_nested_list_17, train_text_file_name_list_17, batch_size, the_number_of_train_txt_file_17_list)

In [None]:
batch_size = 1000
make_corpus_txt_with_batch_list(valid_file_nested_list_01, valid_text_file_name_list_01, batch_size, the_number_of_valid_txt_file_01_list)

In [None]:
batch_size = 1000
make_corpus_txt_with_batch_list(valid_file_nested_list_02, valid_text_file_name_list_02, batch_size, the_number_of_valid_txt_file_02_list)

In [None]:
batch_size = 1000
make_corpus_txt_with_batch_list(valid_file_nested_list_03, valid_text_file_name_list_03, batch_size, the_number_of_valid_txt_file_03_list)

In [None]:
batch_size = 1000
make_corpus_txt_with_batch_list(valid_file_nested_list_04, valid_text_file_name_list_04, batch_size, the_number_of_valid_txt_file_04_list)

In [None]:
batch_size = 1000
make_corpus_txt_with_batch_list(valid_file_nested_list_05, valid_text_file_name_list_05, batch_size, the_number_of_valid_txt_file_05_list)

In [None]:
batch_size = 1000
make_corpus_txt_with_batch_list(valid_file_nested_list_06, valid_text_file_name_list_06, batch_size, the_number_of_valid_txt_file_06_list)

In [None]:
batch_size = 1000
make_corpus_txt_with_batch_list(valid_file_nested_list_07, valid_text_file_name_list_07, batch_size, the_number_of_valid_txt_file_06_list)

In [None]:
batch_size = 1000
make_corpus_txt_with_batch_list(valid_file_nested_list_08, valid_text_file_name_list_08, batch_size, the_number_of_valid_txt_file_08_list)

In [None]:
batch_size = 1000
make_corpus_txt_with_batch_list(valid_file_nested_list_09, valid_text_file_name_list_09, batch_size, the_number_of_valid_txt_file_09_list)

In [None]:
batch_size = 1000
make_corpus_txt_with_batch_list(valid_file_nested_list_10, valid_text_file_name_list_10, batch_size, the_number_of_valid_txt_file_10_list)

In [None]:
batch_size = 1000
make_corpus_txt_with_batch_list(valid_file_nested_list_11, valid_text_file_name_list_11, batch_size, the_number_of_valid_txt_file_11_list)

In [None]:
batch_size = 1000
make_corpus_txt_with_batch_list(valid_file_nested_list_12, valid_text_file_name_list_12, batch_size, the_number_of_valid_txt_file_12_list)

In [None]:
batch_size = 1000
make_corpus_txt_with_batch_list(valid_file_nested_list_13, valid_text_file_name_list_13, batch_size, the_number_of_valid_txt_file_13_list)

In [None]:
batch_size = 1000
make_corpus_txt_with_batch_list(valid_file_nested_list_14, valid_text_file_name_list_14, batch_size, the_number_of_valid_txt_file_14_list)

In [None]:
batch_size = 1000
make_corpus_txt_with_batch_list(valid_file_nested_list_15, valid_text_file_name_list_15, batch_size, the_number_of_valid_txt_file_15_list)

In [None]:
batch_size = 1000
make_corpus_txt_with_batch_list(valid_file_nested_list_16, valid_text_file_name_list_16, batch_size, the_number_of_valid_txt_file_16_list)

In [None]:
batch_size = 1000
make_corpus_txt_with_batch_list(valid_file_nested_list_17, valid_text_file_name_list_17, batch_size, the_number_of_valid_txt_file_17_list)

In [13]:
pro_total_corpus_list = glob("AIHUB_corpus/exploration/web_data_based_korean_corpus_data_pro/AIHUB_web_data_based_korean_corpus_data_" +"*.txt")

In [None]:
len(pro_total_corpus_list)

In [None]:
post_total_corpus_list = post_txt_file_name_list(pro_total_corpus_list)

In [None]:
# Preview: show the first raw line of the first pre-processed corpus file.
with open(pro_total_corpus_list[0], 'r', encoding='utf-8') as f:
    lines = f.read().splitlines()

line_num = len(lines)    # total number of lines read
line_list = lines[:1]    # keep only the first line (empty if the file is empty)

for line in line_list:
    print(line, end="\n\n")

In [None]:
# Preview: run formal_preprocessing_text on the first line of the first
# pre-processed corpus file and print every sentence it returns.
with open(pro_total_corpus_list[0], 'r', encoding='utf-8') as f:
    lines = f.read().splitlines()

line_num = len(lines)    # total number of lines read
line_list = []
for line in lines[:1]:
    sentences = formal_preprocessing_text(line)
    line_list.extend(sentences)

for line in line_list:
    print(line, end="\n\n")

In [None]:
# ignore_reinit_error lets this cell be re-run in a live kernel; a second plain
# ray.init() call raises because the runtime is already started.
ray.init(ignore_reinit_error=True)


@ray.remote
def formal_preprocessing_text(source):
    """Clean one raw corpus line and split it into filtered sentences.

    This is the Ray remote version: call it as
    ``formal_preprocessing_text.remote(line)`` and fetch the result with
    ``ray.get``.

    Steps:
      1. Strip the line and drop ``[...]`` / ``{...}`` spans with their content.
      2. Handle parenthesised text: drop ``(text)`` directly before a period;
         unwrap the parentheses when the inner text has more than five
         whitespace-separated tokens and ends with ``.``, ``!`` or ``?``.
      3. Remove Korean enumeration markers such as ``가.``, ``나.``.
      4. Split into sentences with KSS and keep only sentences that start with
         an alphanumeric/Hangul/Hanja character, end with ``.``/``!``/``?``,
         have more than five tokens and are at least 50% Hangul.

    Args:
        source (str): one raw line of corpus text.

    Returns:
        list: the cleaned sentences, in order of appearance (may be empty).
    """
    preprocessing_sentence_list = []

    # Remove leading/trailing whitespace.
    source = source.strip()

    # Remove square/curly brackets together with all text inside them.
    source = re.sub(r"\[.*?\]|\{.*?\}", "", source)

    # re.findall always returns a list, so a single loop covers every case.
    for text in re.findall(r'\(([^)]+)', source):
        start = source.find(text)
        end = start + len(text)  # index of the closing ')' when present

        # "(text)." -> ".": drop a parenthesised remark directly before a
        # period. Slicing (instead of indexing) cannot raise IndexError when
        # the line ends right after ')'.
        if start > 0 and source[start - 1] == '(' and source[end:end + 2] == ").":
            source = source.replace("(" + text + ").", ".")

        # Unwrap the parentheses when the inner text is itself a sentence:
        # more than five tokens, ending with '.', '!' or '?'.
        if len(text.split()) > 5 and text[-1] in ".!?":
            small_bracket = "(" + text + ")"
            source = source.replace(small_bracket, text)

    # Remove enumeration markers of the form '가.', '나.', ...
    # The dot is escaped so only a literal period matches; an unescaped '.'
    # would also delete ordinary words that start with these syllables.
    source = re.sub(r'^(?:[가나다라마바사아자차카타파하]\.)+', "", source)
    source = re.sub(r' [가나다라마바사아자차카타파하]\.', "", source)

    # Split into sentences with KSS (Korean Sentence Segmentation).
    # use_heuristic=False is the setting recommended for formal text
    # (wiki, news, essays).
    # NOTE(review): num_workers=32 is requested inside every Ray task; confirm
    # this does not oversubscribe the machine.
    for sentence in kss.split_sentences(source, use_heuristic=False,
                                        num_workers=32):
        if not sentence:
            continue

        # Keep only sentences that
        #  - start with an English letter, digit, Hangul or Hanja character,
        #  - end with '.', '!' or '?',
        #  - contain more than five whitespace-separated tokens.
        starts_ok = re.search("^[A-Za-z0-9ㄱ-ㅎ가-힣一-鿕㐀-䶵豈-龎]", sentence[0]) is not None
        ends_ok = sentence[-1] in ".!?"
        if not (starts_ok and ends_ok and len(sentence.split()) > 5):
            continue

        # Remove a leading 'name/title]' fragment whose opening bracket is missing.
        if ']' in sentence and '[' not in sentence:
            sentence = re.sub(r".*?]", "", sentence)

        # Replace special characters with a space. Kept: English letters,
        # digits, Hangul, Hanja, parentheses, '+', ',', '-', '.'.
        # Inside the class, `+-.` is a range from '+' to '.', which covers
        # exactly '+', ',', '-' and '.'.
        sentence = re.sub(r"[^A-Za-z0-9ㄱ-ㅎ가-힣一-鿕㐀-䶵豈-龎()+-.,]", " ", sentence)

        # Remove leading/trailing whitespace.
        sentence = sentence.strip()

        compact = sentence.replace(" ", "")
        total_length = len(compact)
        if total_length == 0:
            # Nothing left after cleaning; also avoids a division by zero.
            continue

        # Drop sentences in which less than 50% of the characters are Hangul.
        hangeul_length = len(re.sub(r"[^ㄱ-ㅣ가-힣\s]", "", compact))
        hangeul_ratio = hangeul_length / total_length
        if hangeul_ratio < 0.5:
            continue

        # Run KSS two more times on the cleaned sentence, because removing
        # characters can expose new sentence boundaries.
        for sentence2 in kss.split_sentences(sentence, use_heuristic=False,
                                             num_workers=32):
            for sentence3 in kss.split_sentences(sentence2, use_heuristic=False,
                                                 num_workers=32):
                preprocessing_sentence_list.append(sentence3)

    return preprocessing_sentence_list

In [None]:
# Pre-process every "pro" corpus file with Ray and write the surviving
# sentences to the matching "post" file.
print("[Size]")
print("The number of preprocessing corpus: " + str(len(pro_total_corpus_list)))
print("\n[Order]")
num = 0
process_num = 10  # number of lines submitted to Ray per batch
for pro, post in zip(pro_total_corpus_list, post_total_corpus_list):

    with open(pro, 'r', encoding='utf-8') as f:
        lines = f.read().splitlines()

    # Collect the results of EVERY batch. Previously each batch overwrote
    # `results`, so only the final remainder was written, and files shorter
    # than two batches reused results left over from the previous file.
    sentence_list = []
    for start_line in range(0, len(lines), process_num):
        futures = [formal_preprocessing_text.remote(line)
                   for line in lines[start_line:start_line + process_num]]
        sentence_list.extend(chain.from_iterable(ray.get(futures)))

    num += 1
    print(str(num), end=" ")

    # Make sure the output folder exists before writing.
    os.makedirs(os.path.dirname(post) or ".", exist_ok=True)

    # 'w' (not 'a') keeps the cell idempotent on re-run. Each sentence ends
    # with a newline so that concatenating post files later cannot fuse the
    # last sentence of one file with the first sentence of the next.
    with open(post, 'w', encoding='utf-8') as fp:
        fp.writelines(sentence + "\n" for sentence in sentence_list)

In [None]:
# Shut down the Ray runtime that was started with ray.init() once the
# parallel pre-processing is finished.
ray.shutdown()

In [None]:
corpus_list = glob("AIHUB_corpus/exploration/web_data_based_korean_corpus_data_post/AIHUB_web_data_based_korean_corpus_data_" +"*.txt")

In [None]:
topic_name_list = ['01_IT_과학', '02_건강', '03_경제', '04_교육', '05_국제', '06_라이프스타일', '07_문화',
                  '08_사건사고', '09_사회일반', '10_산업', '11_스포츠', '12_여성복지', '13_여행레저',
                  '14_연예', '15_정치', '16_지역', '17_취미']

for i in range(len(topic_name_list)):
    with open('AIHUB_corpus/duplicate/AIHUB_web_data_based_korean_corpus_data_' + topic_name_list[i] + '.txt', 'w') as f:
        topic_corpus_list = [j if topic_name_list[i] in j else None for j in corpust_list]
        for corpus in topic_corpus_list:
            with open(corpus) as text:
                for line in text:
                    f.write(line)

In [None]:
# Merge all per-topic files into one corpus file with duplicate sentences
# removed. The wildcard pattern has to be expanded with glob(): open() does not
# expand '*', and the previous code also opened the source in write mode and
# then tried to read from it.
duplicate_corpus_list = sorted_list(glob('AIHUB_corpus/duplicate/AIHUB_web_data_based_korean_corpus_data_' + '*.txt'))

# A dict keeps insertion order, so the first occurrence of every sentence
# decides its position; later duplicates are ignored.
single_sentence_dict = {}
for duplicate_corpus in duplicate_corpus_list:
    with open(duplicate_corpus, 'r', encoding='utf-8') as f2:
        lines = f2.read().splitlines()
        single_sentence_dict.update(dict.fromkeys(lines))

single_sentence_list = list(single_sentence_dict)

with open('AIHUB_corpus/AIHUB_web_data_based_korean_corpus_data.txt', 'w', encoding='utf-8') as f1:
    f1.write("\n".join(single_sentence_list))