## AIHub Json Preprocessing

### Development Environment

In [None]:
%pip install kss==3.7.3

If KSS raises an argument error after installation, restart the Jupyter kernel (runtime).

In [None]:
%pip install python-mecab-ko

KSS 3.7.3 is compatible with python-mecab-ko.

In [None]:
%pip install pandas

In [None]:
%pip install ray

In [12]:
import re
import os
import kss
import ray
import json
import time
import inspect
from time import sleep
from tqdm import tqdm
from mecab import MeCab
import pandas as pd
from glob import glob
from itertools import chain

In [2]:
pwd

'c:\\Users\\MinSeok\\Documents\\text-preprocessing\\sentence-segmentation'

### AIHUB 대규모 웹데이터 기반 한국어 말뭉치 데이터

[Source](https://www.aihub.or.kr/aihubdata/data/view.do?currMenu=115&topMenu=100&aihubDataSe=realm&dataSetSn=624)

#### Convert JSON File to TXT File

In [None]:
from data_preprocessing import make_topic_json_txt_file_path_list
from data_preprocessing import divide_source_file_list
from extract_source_text import make_sources

In [64]:
# Folders holding the raw AIHub JSON files: Training (TS1) first, Validation (VS1) second.
json_folder_list = ['AIHUB_웹데이터 기반 한국어 말뭉치 데이터/Training/원천데이터/TS1/',
             'AIHUB_웹데이터 기반 한국어 말뭉치 데이터/Validation/원천데이터/VS1/']

# The 17 numbered topic names of the corpus.
# NOTE(review): presumably these are the sub-folder names under each JSON folder — confirm
# against make_topic_json_txt_file_path_list.
topic_name_list = ['01_IT_과학', '02_건강', '03_경제', '04_교육', '05_국제', '06_라이프스타일',
                   '07_문화', '08_사건사고', '09_사회일반', '10_산업', '11_스포츠', '12_여성복지',
                   '13_여행레저', '14_연예', '15_정치', '16_지역', '17_취미']

# Path prefixes for the generated TXT files (train first, valid second).
# NOTE(review): assumed to be parallel to json_folder_list — confirm in data_preprocessing.
txt_path_list = ["exploration/web_data_based_korean_corpus_data_pro/AIHUB_web_data_based_korean_corpus_data_train_", 
                 "exploration/web_data_based_korean_corpus_data_pro/AIHUB_web_data_based_korean_corpus_data_valid_"]


In [65]:
# NOTE(review): the per-topic variables used by later cells (train_json_file_list_01,
# train_txt_file_path_list_01, valid_json_file_list_01, ...) are never assigned in this
# notebook; presumably this call creates them as globals — confirm in data_preprocessing.
make_topic_json_txt_file_path_list(json_folder_list, topic_name_list, txt_path_list)

The number of file:  58997


In [27]:
def count_number_of_txt_file_with_batch_list(source_file_list, batch_size):
    """Count how many batched TXT files each JSON source file will produce.

    Every file in ``source_file_list`` is loaded as JSON and converted to a
    list of sources with ``make_sources``. A file with ``n`` sources is
    assigned ``ceil(n / batch_size)`` TXT files, with a minimum of one. A
    per-file summary is written to an Excel workbook under
    ``source_file_by_batch/`` and the totals are printed.

    The workbook name embeds the first run of digits found in the *source text
    of the calling line's first argument* (e.g. ``train_json_file_list_01``
    gives ``01``). The function therefore has to be called directly, with a
    variable whose name contains the topic number; calling it through a
    wrapper or with a digit-free argument name makes the lookup fail.

    Args:
        source_file_list: Sequence of paths to the JSON files to inspect.
        batch_size: Maximum number of sources per TXT file.

    Returns:
        A tuple ``(total, per_file)``: ``per_file`` is the list of TXT-file
        counts in input order and ``total`` is their sum.

    Raises:
        IndexError: If the caller's first argument contains no digits.
    """
    # Recover the topic number from the text of the caller's line. Index 1 of
    # the outer frames is the direct caller, so this must stay in this function.
    caller_frame = inspect.getouterframes(inspect.currentframe())[1]
    call_text = inspect.getframeinfo(caller_frame[0]).code_context[0].strip()
    first_argument = call_text[call_text.find('(') + 1:-1].split(',')[0]
    file_name_number = re.findall(r'\d+', first_argument)[0]

    column_names = ['File', 'Length of Source List',
                    'The Number of TXT File', 'Description']
    records = []
    the_number_of_txt_file_list = []

    for source_file in source_file_list:

        with open(source_file, 'r', encoding='utf-8') as one_json_file:
            one_json_sample = json.load(one_json_file)

        source_count = len(make_sources(one_json_sample))

        # Ceiling division: the former `n // batch_size + 1` reported one file
        # too many whenever n was an exact multiple of batch_size. A list
        # shorter than batch_size (including an empty one) still counts as
        # one file, as before.
        the_number_of_txt_file = max(1, (source_count + batch_size - 1) // batch_size)

        if source_count >= batch_size:
            description = ""
        else:
            description = "not subject of batch. small source list."

        records.append(dict(zip(column_names,
                                [source_file, source_count,
                                 the_number_of_txt_file, description])))
        the_number_of_txt_file_list.append(the_number_of_txt_file)

    the_number_of_total_txt_file = sum(the_number_of_txt_file_list)

    # Build the frame once instead of seeding it with a dummy row and growing
    # it with one .loc assignment per file.
    source_file_by_batch_df = pd.DataFrame(records, columns=column_names)

    print("Batch Size:", batch_size)
    print("The number of txt file:", the_number_of_total_txt_file)

    # The split is inferred from the last source path; an empty input list
    # falls through to the un-prefixed report name instead of raising NameError.
    last_source_file = source_file_list[-1] if len(source_file_list) > 0 else ""
    if 'rain' in last_source_file:
        split_prefix = "train_"
    elif 'alid' in last_source_file:
        split_prefix = "valid_"
    else:
        split_prefix = ""

    report_file = ("source_file_by_batch/web_data_based_korean_corpus_data_"
                   + split_prefix + file_name_number + ".xlsx")
    source_file_by_batch_df.to_excel(report_file, index=False)

    return the_number_of_total_txt_file, the_number_of_txt_file_list

In [28]:
def write_jsontext_to_txt_file_with_batch_list(source_file_list, text_file_path_list, batch_size, the_number_of_txt_file_list):
    """Write the sources of each JSON file to TXT files of at most ``batch_size`` lines.

    Each JSON file is converted to a list of sources with ``make_sources`` and
    split with ``divide_source_file_list``. Every batch is appended to its own
    file ``AIHUB_corpus/<text path without its last 4 characters>_<num>.txt``,
    where ``num`` is a counter running across all batches of this call.

    Args:
        source_file_list: Paths of the JSON files to convert.
        text_file_path_list: Output paths, indexed in parallel with
            ``source_file_list``. The last four characters are dropped
            (assumed to be a ``.txt`` suffix — TODO confirm).
        batch_size: Batch size passed to ``divide_source_file_list``.
        the_number_of_txt_file_list: Expected TXT-file count per source file;
            only its sum is used, as the progress-bar total.
    """
    progress_length = sum(the_number_of_txt_file_list)
    print("[Size]")
    print("The number of preprocessing corpus: " + str(progress_length))
    print("\n[Order]")

    num = 0

    # The context manager closes the bar even if a file fails to load or write.
    with tqdm(total=progress_length) as pbar:
        for i, source_file in enumerate(source_file_list):

            with open(source_file, 'r', encoding='utf-8') as one_json_file:
                one_json_sample = json.load(one_json_file)

            source_list = make_sources(one_json_sample)

            for source_batch in divide_source_file_list(source_list, batch_size):
                txt_file = 'AIHUB_corpus/' + text_file_path_list[i][:-4] + "_" + str(num) + ".txt"
                # NOTE(review): append mode means re-running this cell adds the
                # same text to existing files again; kept as in the original.
                with open(txt_file, "a", encoding='utf-8') as fp:
                    fp.write("\n".join(source_batch))
                num += 1
                # update() replaces the manual `pbar.n += 1` / refresh() and
                # the 10 ms sleep per file, which only slowed the run down.
                pbar.update(1)

In [29]:
batch_size = 1000
# NOTE: keep this call on one line with the literally named variable as first argument.
# count_number_of_txt_file_with_batch_list reads the text of the calling line and uses the
# first digits in that argument name ("01") to name the Excel report it writes, so these
# per-topic calls cannot simply be folded into a loop over a generic variable.
the_number_of_train_txt_file_01, the_number_of_train_txt_file_list_01 = count_number_of_txt_file_with_batch_list(train_json_file_list_01, batch_size)

Batch Size: 1000
The number of txt file: 1217


In [30]:
batch_size = 1000
the_number_of_train_txt_file_02, the_number_of_train_txt_file_list_02 = count_number_of_txt_file_with_batch_list(train_json_file_list_02, batch_size)

Batch Size: 1000
The number of txt file: 3247


In [31]:
batch_size = 1000
the_number_of_train_txt_file_03, the_number_of_train_txt_file_list_03 = count_number_of_txt_file_with_batch_list(train_json_file_list_03, batch_size)

Batch Size: 1000
The number of txt file: 5072


In [32]:
batch_size = 1000
the_number_of_train_txt_file_04, the_number_of_train_txt_file_list_04 = count_number_of_txt_file_with_batch_list(train_json_file_list_04, batch_size)

Batch Size: 1000
The number of txt file: 2789


In [33]:
batch_size = 1000
the_number_of_train_txt_file_05, the_number_of_train_txt_file_list_05 = count_number_of_txt_file_with_batch_list(train_json_file_list_05, batch_size)

Batch Size: 1000
The number of txt file: 2162


In [34]:
batch_size = 1000
the_number_of_train_txt_file_06, the_number_of_train_txt_file_list_06 = count_number_of_txt_file_with_batch_list(train_json_file_list_06, batch_size)

Batch Size: 1000
The number of txt file: 2913


In [35]:
batch_size = 1000
the_number_of_train_txt_file_07, the_number_of_train_txt_file_list_07 = count_number_of_txt_file_with_batch_list(train_json_file_list_07, batch_size)

Batch Size: 1000
The number of txt file: 1683


In [36]:
batch_size = 1000
the_number_of_train_txt_file_08, the_number_of_train_txt_file_list_08 = count_number_of_txt_file_with_batch_list(train_json_file_list_08, batch_size)

Batch Size: 1000
The number of txt file: 2813


In [37]:
batch_size = 1000
the_number_of_train_txt_file_09, the_number_of_train_txt_file_list_09 = count_number_of_txt_file_with_batch_list(train_json_file_list_09, batch_size)

Batch Size: 1000
The number of txt file: 3474


In [38]:
batch_size = 1000
the_number_of_train_txt_file_10, the_number_of_train_txt_file_list_10 = count_number_of_txt_file_with_batch_list(train_json_file_list_10, batch_size)

Batch Size: 1000
The number of txt file: 4031


In [39]:
batch_size = 1000
the_number_of_train_txt_file_11, the_number_of_train_txt_file_list_11 = count_number_of_txt_file_with_batch_list(train_json_file_list_11, batch_size)

Batch Size: 1000
The number of txt file: 2550


In [40]:
batch_size = 1000
the_number_of_train_txt_file_12, the_number_of_train_txt_file_list_12 = count_number_of_txt_file_with_batch_list(train_json_file_list_12, batch_size)

Batch Size: 1000
The number of txt file: 2786


In [41]:
batch_size = 1000
the_number_of_train_txt_file_13, the_number_of_train_txt_file_list_13 = count_number_of_txt_file_with_batch_list(train_json_file_list_13, batch_size)

Batch Size: 1000
The number of txt file: 3162


In [42]:
batch_size = 1000
the_number_of_train_txt_file_14, the_number_of_train_txt_file_list_14 = count_number_of_txt_file_with_batch_list(train_json_file_list_14, batch_size)

Batch Size: 1000
The number of txt file: 4603


In [43]:
batch_size = 1000
the_number_of_train_txt_file_15, the_number_of_train_txt_file_list_15 = count_number_of_txt_file_with_batch_list(train_json_file_list_15, batch_size)

Batch Size: 1000
The number of txt file: 3286


In [44]:
batch_size = 1000
the_number_of_train_txt_file_16, the_number_of_train_txt_file_list_16 = count_number_of_txt_file_with_batch_list(train_json_file_list_16, batch_size)

Batch Size: 1000
The number of txt file: 3296


In [45]:
batch_size = 1000
the_number_of_train_txt_file_17, the_number_of_train_txt_file_list_17 = count_number_of_txt_file_with_batch_list(train_json_file_list_17, batch_size)

Batch Size: 1000
The number of txt file: 2746


In [46]:
# Load the per-topic batch reports for the training split, one DataFrame per topic.
# (The two extra reads of the topic-10 workbook were copy-paste leftovers and are removed.)
source_file_by_batch_train_01_df = pd.read_excel('source_file_by_batch/web_data_based_korean_corpus_data_train_01.xlsx', engine='openpyxl')
source_file_by_batch_train_02_df = pd.read_excel('source_file_by_batch/web_data_based_korean_corpus_data_train_02.xlsx', engine='openpyxl')
source_file_by_batch_train_03_df = pd.read_excel('source_file_by_batch/web_data_based_korean_corpus_data_train_03.xlsx', engine='openpyxl')
source_file_by_batch_train_04_df = pd.read_excel('source_file_by_batch/web_data_based_korean_corpus_data_train_04.xlsx', engine='openpyxl')
source_file_by_batch_train_05_df = pd.read_excel('source_file_by_batch/web_data_based_korean_corpus_data_train_05.xlsx', engine='openpyxl')
source_file_by_batch_train_06_df = pd.read_excel('source_file_by_batch/web_data_based_korean_corpus_data_train_06.xlsx', engine='openpyxl')
source_file_by_batch_train_07_df = pd.read_excel('source_file_by_batch/web_data_based_korean_corpus_data_train_07.xlsx', engine='openpyxl')
source_file_by_batch_train_08_df = pd.read_excel('source_file_by_batch/web_data_based_korean_corpus_data_train_08.xlsx', engine='openpyxl')
source_file_by_batch_train_09_df = pd.read_excel('source_file_by_batch/web_data_based_korean_corpus_data_train_09.xlsx', engine='openpyxl')
source_file_by_batch_train_10_df = pd.read_excel('source_file_by_batch/web_data_based_korean_corpus_data_train_10.xlsx', engine='openpyxl')
source_file_by_batch_train_11_df = pd.read_excel('source_file_by_batch/web_data_based_korean_corpus_data_train_11.xlsx', engine='openpyxl')
source_file_by_batch_train_12_df = pd.read_excel('source_file_by_batch/web_data_based_korean_corpus_data_train_12.xlsx', engine='openpyxl')
source_file_by_batch_train_13_df = pd.read_excel('source_file_by_batch/web_data_based_korean_corpus_data_train_13.xlsx', engine='openpyxl')
source_file_by_batch_train_14_df = pd.read_excel('source_file_by_batch/web_data_based_korean_corpus_data_train_14.xlsx', engine='openpyxl')
source_file_by_batch_train_15_df = pd.read_excel('source_file_by_batch/web_data_based_korean_corpus_data_train_15.xlsx', engine='openpyxl')
source_file_by_batch_train_16_df = pd.read_excel('source_file_by_batch/web_data_based_korean_corpus_data_train_16.xlsx', engine='openpyxl')
source_file_by_batch_train_17_df = pd.read_excel('source_file_by_batch/web_data_based_korean_corpus_data_train_17.xlsx', engine='openpyxl')

In [47]:
source_file_by_batch_train_01_df

Unnamed: 0.1,Unnamed: 0,File,Length of Source List,The Number of txt File,Description
0,0,AIHUB_웹데이터 기반 한국어 말뭉치 데이터/Training/원천데이터/TS1/0...,99,1,not subject of batch. small source list.
1,1,AIHUB_웹데이터 기반 한국어 말뭉치 데이터/Training/원천데이터/TS1/0...,99,1,not subject of batch. small source list.
2,2,AIHUB_웹데이터 기반 한국어 말뭉치 데이터/Training/원천데이터/TS1/0...,97,1,not subject of batch. small source list.
3,3,AIHUB_웹데이터 기반 한국어 말뭉치 데이터/Training/원천데이터/TS1/0...,98,1,not subject of batch. small source list.
4,4,AIHUB_웹데이터 기반 한국어 말뭉치 데이터/Training/원천데이터/TS1/0...,99,1,not subject of batch. small source list.
...,...,...,...,...,...
1212,1212,AIHUB_웹데이터 기반 한국어 말뭉치 데이터/Training/원천데이터/TS1/0...,100,1,not subject of batch. small source list.
1213,1213,AIHUB_웹데이터 기반 한국어 말뭉치 데이터/Training/원천데이터/TS1/0...,100,1,not subject of batch. small source list.
1214,1214,AIHUB_웹데이터 기반 한국어 말뭉치 데이터/Training/원천데이터/TS1/0...,99,1,not subject of batch. small source list.
1215,1215,AIHUB_웹데이터 기반 한국어 말뭉치 데이터/Training/원천데이터/TS1/0...,99,1,not subject of batch. small source list.


In [67]:
batch_size = 1000
the_number_of_valid_txt_file_01, the_number_of_valid_txt_file_list_01 = count_number_of_txt_file_with_batch_list(valid_json_file_list_01, batch_size)

Batch Size: 1000
The number of txt file: 165


In [68]:
batch_size = 1000
the_number_of_valid_txt_file_02, the_number_of_valid_txt_file_list_02 = count_number_of_txt_file_with_batch_list(valid_json_file_list_02, batch_size)

Batch Size: 1000
The number of txt file: 573


In [69]:
batch_size = 1000
the_number_of_valid_txt_file_03, the_number_of_valid_txt_file_list_03 = count_number_of_txt_file_with_batch_list(valid_json_file_list_03, batch_size)

Batch Size: 1000
The number of txt file: 496


In [70]:
batch_size = 1000
the_number_of_valid_txt_file_04, the_number_of_valid_txt_file_list_04 = count_number_of_txt_file_with_batch_list(valid_json_file_list_04, batch_size)

Batch Size: 1000
The number of txt file: 337


In [71]:
batch_size = 1000
the_number_of_valid_txt_file_05, the_number_of_valid_txt_file_list_05 = count_number_of_txt_file_with_batch_list(valid_json_file_list_05, batch_size)

Batch Size: 1000
The number of txt file: 312


In [72]:
batch_size = 1000
the_number_of_valid_txt_file_06, the_number_of_valid_txt_file_list_06 = count_number_of_txt_file_with_batch_list(valid_json_file_list_06, batch_size)

Batch Size: 1000
The number of txt file: 421


In [73]:
batch_size = 1000
the_number_of_valid_txt_file_07, the_number_of_valid_txt_file_list_07 = count_number_of_txt_file_with_batch_list(valid_json_file_list_07, batch_size)

Batch Size: 1000
The number of txt file: 307


In [74]:
batch_size = 1000
the_number_of_valid_txt_file_08, the_number_of_valid_txt_file_list_08 = count_number_of_txt_file_with_batch_list(valid_json_file_list_08, batch_size)

Batch Size: 1000
The number of txt file: 380


In [75]:
batch_size = 1000
the_number_of_valid_txt_file_09, the_number_of_valid_txt_file_list_09 = count_number_of_txt_file_with_batch_list(valid_json_file_list_09, batch_size)

Batch Size: 1000
The number of txt file: 598


In [76]:
batch_size = 1000
the_number_of_valid_txt_file_10, the_number_of_valid_txt_file_list_10 = count_number_of_txt_file_with_batch_list(valid_json_file_list_10, batch_size)

Batch Size: 1000
The number of txt file: 521


In [77]:
batch_size = 1000
the_number_of_valid_txt_file_11, the_number_of_valid_txt_file_list_11 = count_number_of_txt_file_with_batch_list(valid_json_file_list_11, batch_size)

Batch Size: 1000
The number of txt file: 331


In [78]:
batch_size = 1000
the_number_of_valid_txt_file_12, the_number_of_valid_txt_file_list_12 = count_number_of_txt_file_with_batch_list(valid_json_file_list_12, batch_size)

Batch Size: 1000
The number of txt file: 306


In [79]:
batch_size = 1000
the_number_of_valid_txt_file_13, the_number_of_valid_txt_file_list_13 = count_number_of_txt_file_with_batch_list(valid_json_file_list_13, batch_size)

Batch Size: 1000
The number of txt file: 445


In [80]:
batch_size = 1000
the_number_of_valid_txt_file_14, the_number_of_valid_txt_file_list_14 = count_number_of_txt_file_with_batch_list(valid_json_file_list_14, batch_size)

Batch Size: 1000
The number of txt file: 720


In [81]:
batch_size = 1000
the_number_of_valid_txt_file_15, the_number_of_valid_txt_file_list_15 = count_number_of_txt_file_with_batch_list(valid_json_file_list_15, batch_size)

Batch Size: 1000
The number of txt file: 428


In [82]:
batch_size = 1000
the_number_of_valid_txt_file_16, the_number_of_valid_txt_file_list_16 = count_number_of_txt_file_with_batch_list(valid_json_file_list_16, batch_size)

Batch Size: 1000
The number of txt file: 359


In [83]:
batch_size = 1000
the_number_of_valid_txt_file_17, the_number_of_valid_txt_file_list_17 = count_number_of_txt_file_with_batch_list(valid_json_file_list_17, batch_size)

Batch Size: 1000
The number of txt file: 468


In [84]:
# Load the per-topic batch reports for the validation split, one DataFrame per topic.
# (The two extra reads of the topic-10 workbook were copy-paste leftovers and are removed.)
source_file_by_batch_valid_01_df = pd.read_excel('source_file_by_batch/web_data_based_korean_corpus_data_valid_01.xlsx', engine='openpyxl')
source_file_by_batch_valid_02_df = pd.read_excel('source_file_by_batch/web_data_based_korean_corpus_data_valid_02.xlsx', engine='openpyxl')
source_file_by_batch_valid_03_df = pd.read_excel('source_file_by_batch/web_data_based_korean_corpus_data_valid_03.xlsx', engine='openpyxl')
source_file_by_batch_valid_04_df = pd.read_excel('source_file_by_batch/web_data_based_korean_corpus_data_valid_04.xlsx', engine='openpyxl')
source_file_by_batch_valid_05_df = pd.read_excel('source_file_by_batch/web_data_based_korean_corpus_data_valid_05.xlsx', engine='openpyxl')
source_file_by_batch_valid_06_df = pd.read_excel('source_file_by_batch/web_data_based_korean_corpus_data_valid_06.xlsx', engine='openpyxl')
source_file_by_batch_valid_07_df = pd.read_excel('source_file_by_batch/web_data_based_korean_corpus_data_valid_07.xlsx', engine='openpyxl')
source_file_by_batch_valid_08_df = pd.read_excel('source_file_by_batch/web_data_based_korean_corpus_data_valid_08.xlsx', engine='openpyxl')
source_file_by_batch_valid_09_df = pd.read_excel('source_file_by_batch/web_data_based_korean_corpus_data_valid_09.xlsx', engine='openpyxl')
source_file_by_batch_valid_10_df = pd.read_excel('source_file_by_batch/web_data_based_korean_corpus_data_valid_10.xlsx', engine='openpyxl')
source_file_by_batch_valid_11_df = pd.read_excel('source_file_by_batch/web_data_based_korean_corpus_data_valid_11.xlsx', engine='openpyxl')
source_file_by_batch_valid_12_df = pd.read_excel('source_file_by_batch/web_data_based_korean_corpus_data_valid_12.xlsx', engine='openpyxl')
source_file_by_batch_valid_13_df = pd.read_excel('source_file_by_batch/web_data_based_korean_corpus_data_valid_13.xlsx', engine='openpyxl')
source_file_by_batch_valid_14_df = pd.read_excel('source_file_by_batch/web_data_based_korean_corpus_data_valid_14.xlsx', engine='openpyxl')
source_file_by_batch_valid_15_df = pd.read_excel('source_file_by_batch/web_data_based_korean_corpus_data_valid_15.xlsx', engine='openpyxl')
source_file_by_batch_valid_16_df = pd.read_excel('source_file_by_batch/web_data_based_korean_corpus_data_valid_16.xlsx', engine='openpyxl')
source_file_by_batch_valid_17_df = pd.read_excel('source_file_by_batch/web_data_based_korean_corpus_data_valid_17.xlsx', engine='openpyxl')

In [85]:
source_file_by_batch_valid_01_df

Unnamed: 0.1,Unnamed: 0,File,Length of Source List,The Number of txt File,Description
0,0,AIHUB_웹데이터 기반 한국어 말뭉치 데이터/Validation/원천데이터/VS1...,100,1,not subject of batch. small source list.
1,1,AIHUB_웹데이터 기반 한국어 말뭉치 데이터/Validation/원천데이터/VS1...,100,1,not subject of batch. small source list.
2,2,AIHUB_웹데이터 기반 한국어 말뭉치 데이터/Validation/원천데이터/VS1...,99,1,not subject of batch. small source list.
3,3,AIHUB_웹데이터 기반 한국어 말뭉치 데이터/Validation/원천데이터/VS1...,100,1,not subject of batch. small source list.
4,4,AIHUB_웹데이터 기반 한국어 말뭉치 데이터/Validation/원천데이터/VS1...,100,1,not subject of batch. small source list.
...,...,...,...,...,...
160,160,AIHUB_웹데이터 기반 한국어 말뭉치 데이터/Validation/원천데이터/VS1...,100,1,not subject of batch. small source list.
161,161,AIHUB_웹데이터 기반 한국어 말뭉치 데이터/Validation/원천데이터/VS1...,100,1,not subject of batch. small source list.
162,162,AIHUB_웹데이터 기반 한국어 말뭉치 데이터/Validation/원천데이터/VS1...,99,1,not subject of batch. small source list.
163,163,AIHUB_웹데이터 기반 한국어 말뭉치 데이터/Validation/원천데이터/VS1...,100,1,not subject of batch. small source list.


In [None]:
batch_size = 1000
write_jsontext_to_txt_file_with_batch_list(train_json_file_list_01, train_txt_file_path_list_01, batch_size, the_number_of_train_txt_file_list_01)

In [None]:
batch_size = 1000
write_jsontext_to_txt_file_with_batch_list(train_json_file_list_02, train_txt_file_path_list_02, batch_size, the_number_of_train_txt_file_list_02)

In [None]:
batch_size = 1000
write_jsontext_to_txt_file_with_batch_list(train_json_file_list_03, train_txt_file_path_list_03, batch_size, the_number_of_train_txt_file_list_03)

In [None]:
batch_size = 1000
write_jsontext_to_txt_file_with_batch_list(train_json_file_list_04, train_txt_file_path_list_04, batch_size, the_number_of_train_txt_file_list_04)

In [None]:
batch_size = 1000
write_jsontext_to_txt_file_with_batch_list(train_json_file_list_05, train_txt_file_path_list_05, batch_size, the_number_of_train_txt_file_list_05)

In [None]:
batch_size = 1000
write_jsontext_to_txt_file_with_batch_list(train_json_file_list_06, train_txt_file_path_list_06, batch_size, the_number_of_train_txt_file_list_06)

In [None]:
batch_size = 1000
write_jsontext_to_txt_file_with_batch_list(train_json_file_list_07, train_txt_file_path_list_07, batch_size, the_number_of_train_txt_file_list_07)

In [None]:
batch_size = 1000
write_jsontext_to_txt_file_with_batch_list(train_json_file_list_08, train_txt_file_path_list_08, batch_size, the_number_of_train_txt_file_list_08)

In [None]:
batch_size = 1000
write_jsontext_to_txt_file_with_batch_list(train_json_file_list_09, train_txt_file_path_list_09, batch_size, the_number_of_train_txt_file_list_09)

In [None]:
batch_size = 1000
write_jsontext_to_txt_file_with_batch_list(train_json_file_list_10, train_txt_file_path_list_10, batch_size, the_number_of_train_txt_file_list_10)

In [None]:
batch_size = 1000
write_jsontext_to_txt_file_with_batch_list(train_json_file_list_11, train_txt_file_path_list_11, batch_size, the_number_of_train_txt_file_list_11)

In [None]:
batch_size = 1000
write_jsontext_to_txt_file_with_batch_list(train_json_file_list_12, train_txt_file_path_list_12, batch_size, the_number_of_train_txt_file_list_12)

In [None]:
batch_size = 1000
write_jsontext_to_txt_file_with_batch_list(train_json_file_list_13, train_txt_file_path_list_13, batch_size, the_number_of_train_txt_file_list_13)

In [None]:
batch_size = 1000
write_jsontext_to_txt_file_with_batch_list(train_json_file_list_14, train_txt_file_path_list_14, batch_size, the_number_of_train_txt_file_list_14)

In [None]:
batch_size = 1000
write_jsontext_to_txt_file_with_batch_list(train_json_file_list_15, train_txt_file_path_list_15, batch_size, the_number_of_train_txt_file_list_15)

In [None]:
batch_size = 1000
write_jsontext_to_txt_file_with_batch_list(train_json_file_list_16, train_txt_file_path_list_16, batch_size, the_number_of_train_txt_file_list_16)

In [None]:
batch_size = 1000
write_jsontext_to_txt_file_with_batch_list(train_json_file_list_17, train_txt_file_path_list_17, batch_size, the_number_of_train_txt_file_list_17)

In [None]:
batch_size = 1000
write_jsontext_to_txt_file_with_batch_list(valid_json_file_list_01, valid_txt_file_path_list_01, batch_size, the_number_of_valid_txt_file_list_01)

In [None]:
batch_size = 1000
write_jsontext_to_txt_file_with_batch_list(valid_json_file_list_02, valid_txt_file_path_list_02, batch_size, the_number_of_valid_txt_file_list_02)

In [None]:
batch_size = 1000
write_jsontext_to_txt_file_with_batch_list(valid_json_file_list_03, valid_txt_file_path_list_03, batch_size, the_number_of_valid_txt_file_list_03)

In [None]:
batch_size = 1000
write_jsontext_to_txt_file_with_batch_list(valid_json_file_list_04, valid_txt_file_path_list_04, batch_size, the_number_of_valid_txt_file_list_04)

In [None]:
batch_size = 1000
write_jsontext_to_txt_file_with_batch_list(valid_json_file_list_05, valid_txt_file_path_list_05, batch_size, the_number_of_valid_txt_file_list_05)

In [None]:
batch_size = 1000
write_jsontext_to_txt_file_with_batch_list(valid_json_file_list_06, valid_txt_file_path_list_06, batch_size, the_number_of_valid_txt_file_list_06)

In [None]:
batch_size = 1000
write_jsontext_to_txt_file_with_batch_list(valid_json_file_list_07, valid_txt_file_path_list_07, batch_size, the_number_of_valid_txt_file_list_07)

In [None]:
batch_size = 1000
write_jsontext_to_txt_file_with_batch_list(valid_json_file_list_08, valid_txt_file_path_list_08, batch_size, the_number_of_valid_txt_file_list_08)

In [None]:
batch_size = 1000
write_jsontext_to_txt_file_with_batch_list(valid_json_file_list_09, valid_txt_file_path_list_09, batch_size, the_number_of_valid_txt_file_list_09)

In [None]:
batch_size = 1000
write_jsontext_to_txt_file_with_batch_list(valid_json_file_list_10, valid_txt_file_path_list_10, batch_size, the_number_of_valid_txt_file_list_10)

In [None]:
batch_size = 1000
write_jsontext_to_txt_file_with_batch_list(valid_json_file_list_11, valid_txt_file_path_list_11, batch_size, the_number_of_valid_txt_file_list_11)

In [None]:
batch_size = 1000
write_jsontext_to_txt_file_with_batch_list(valid_json_file_list_12, valid_txt_file_path_list_12, batch_size, the_number_of_valid_txt_file_list_12)

In [None]:
batch_size = 1000
write_jsontext_to_txt_file_with_batch_list(valid_json_file_list_13, valid_txt_file_path_list_13, batch_size, the_number_of_valid_txt_file_list_13)

In [None]:
batch_size = 1000
write_jsontext_to_txt_file_with_batch_list(valid_json_file_list_14, valid_txt_file_path_list_14, batch_size, the_number_of_valid_txt_file_list_14)

In [None]:
batch_size = 1000
write_jsontext_to_txt_file_with_batch_list(valid_json_file_list_15, valid_txt_file_path_list_15, batch_size, the_number_of_valid_txt_file_list_15)

In [None]:
batch_size = 1000
write_jsontext_to_txt_file_with_batch_list(valid_json_file_list_16, valid_txt_file_path_list_16, batch_size, the_number_of_valid_txt_file_list_16)

In [None]:
batch_size = 1000
write_jsontext_to_txt_file_with_batch_list(valid_json_file_list_17, valid_txt_file_path_list_17, batch_size, the_number_of_valid_txt_file_list_17)

#### Preprocess TXT File

In [None]:
from sentence_segmentation import preprocessing_text
from data_preprocessing import make_pro_post_txt_file_path_list
from data_preprocessing import make_topic_pro_post_txt_file_name_list
from data_preprocessing import merge_and_deduplicate_topic_corpus_txt
from reading_data import reading_txt

In [8]:
pro_corpus_path = "AIHUB_corpus/exploration/web_data_based_korean_corpus_data_pro/AIHUB_web_data_based_korean_corpus_data_" + "*.txt"
pro_total_corpus_path_list, post_total_corpus_path_list = make_pro_post_txt_file_path_list(pro_corpus_path)

In [None]:
len(pro_total_corpus_path_list)

In [None]:
pro_coprus_file = pro_total_corpus_path_list[0]
line_length = 1
data_type = "source"

reading_txt(pro_coprus_file, line_length, data_type)

In [None]:
pro_coprus_file = pro_total_corpus_path_list[0]
line_length = 1
data_type = "preprocessing"

reading_txt(pro_coprus_file, line_length, data_type)

In [179]:
# Start a local Ray instance limited to 4 CPUs. ignore_reinit_error=True lets
# this cell be re-run in a live kernel without raising because Ray is already
# initialised.
ray.init(num_cpus=4, ignore_reinit_error=True)


@ray.remote
def ray_preprocessing_text(source, corpus_path):
    """Ray task that forwards one piece of text to ``preprocessing_text``.

    Args:
        source: Text to preprocess (callers in this notebook pass a single
            line of a corpus file).
        corpus_path: Path of the corpus file the text came from, passed
            through unchanged.

    Returns:
        Whatever ``preprocessing_text(source, corpus_path)`` returns; the
        caller in this notebook treats it as an iterable of sentences.
    """
    return preprocessing_text(source, corpus_path)

2023-05-29 18:06:33,632	INFO worker.py:1625 -- Started a local Ray instance.


In [None]:
def preprocessing_corpus_txt(pro_total_corpus_path_list, post_total_corpus_path_list):
    """Preprocess every line of each input TXT file with Ray and write the sentences out.

    For each ``(pro, post)`` pair the input file ``pro`` is read, its lines
    are sent to ``ray_preprocessing_text`` in rounds of ``process_num`` lines,
    and the resulting sentences are appended to ``post`` joined by newlines,
    in input order.

    Args:
        pro_total_corpus_path_list: Paths of the TXT files to preprocess.
        post_total_corpus_path_list: Output paths, paired positionally with
            ``pro_total_corpus_path_list``.
    """
    progress_length = len(pro_total_corpus_path_list)
    print("[Size]")
    print("The number of preprocessing corpus: " + str(progress_length))
    print("\n[Order]")

    process_num = 10  # number of lines submitted to Ray per round

    # The context manager closes the bar even if preprocessing raises.
    with tqdm(total=progress_length) as pbar:
        for pro, post in zip(pro_total_corpus_path_list, post_total_corpus_path_list):

            with open(pro, 'r', encoding='utf-8') as f:
                lines = f.read().splitlines()

            sentence_list = []

            # Walk the file in fixed-size slices so every line is processed
            # exactly once. The previous loop overwrote the results of the
            # second-to-last round with the tail round, and produced nothing
            # at all for files shorter than 2 * process_num lines.
            for start_line in range(0, len(lines), process_num):
                futures = [ray_preprocessing_text.remote(line, pro)
                           for line in lines[start_line:start_line + process_num]]
                results = ray.get(futures)
                sentence_list.extend(chain.from_iterable(results))

            # NOTE(review): append mode means re-running this cell adds the
            # same sentences to an existing output file; kept as in the original.
            with open(post, 'a', encoding='utf-8') as fp:
                fp.write("\n".join(sentence_list))

            # update() replaces the manual counter bump and the 10 ms sleep.
            pbar.update(1)

In [180]:
# Path prefix of the per-batch TXT files written by the JSON-to-TXT step.
# NOTE(review): pro_corpus_name is not referenced by any later cell in this notebook; the
# next cell passes pro_corpus_path (the "*.txt" glob pattern) instead — confirm which one
# make_topic_pro_post_txt_file_name_list expects.
pro_corpus_name = "AIHUB_corpus/exploration/web_data_based_korean_corpus_data_pro/AIHUB_web_data_based_korean_corpus_data_"

# The 17 numbered topic names of the corpus (same values as in the JSON-to-TXT section).
topic_name_list = ['01_IT_과학', '02_건강', '03_경제', '04_교육', '05_국제', '06_라이프스타일', '07_문화',
                  '08_사건사고', '09_사회일반', '10_산업', '11_스포츠', '12_여성복지', '13_여행레저',
                  '14_연예', '15_정치', '16_지역', '17_취미']

In [None]:
make_topic_pro_post_txt_file_name_list(pro_corpus_path, topic_name_list)

In [None]:
preprocessing_corpus_txt(pro_total_corpus_path_list_train_01, post_total_corpus_path_list_train_01)

In [None]:
preprocessing_corpus_txt(pro_total_corpus_path_list_train_02, post_total_corpus_path_list_train_02)

In [None]:
preprocessing_corpus_txt(pro_total_corpus_path_list_train_03, post_total_corpus_path_list_train_03)

In [None]:
preprocessing_corpus_txt(pro_total_corpus_path_list_train_04, post_total_corpus_path_list_train_04)

In [None]:
preprocessing_corpus_txt(pro_total_corpus_path_list_train_05, post_total_corpus_path_list_train_05)

In [None]:
preprocessing_corpus_txt(pro_total_corpus_path_list_train_06, post_total_corpus_path_list_train_06)

In [None]:
preprocessing_corpus_txt(pro_total_corpus_path_list_train_07, post_total_corpus_path_list_train_07)

In [None]:
preprocessing_corpus_txt(pro_total_corpus_path_list_train_08, post_total_corpus_path_list_train_08)

In [None]:
preprocessing_corpus_txt(pro_total_corpus_path_list_train_09, post_total_corpus_path_list_train_09)

In [None]:
preprocessing_corpus_txt(pro_total_corpus_path_list_train_10, post_total_corpus_path_list_train_10)

In [None]:
preprocessing_corpus_txt(pro_total_corpus_path_list_train_11, post_total_corpus_path_list_train_11)

In [None]:
preprocessing_corpus_txt(pro_total_corpus_path_list_train_12, post_total_corpus_path_list_train_12)

In [None]:
preprocessing_corpus_txt(pro_total_corpus_path_list_train_13, post_total_corpus_path_list_train_13)

In [None]:
preprocessing_corpus_txt(pro_total_corpus_path_list_train_14, post_total_corpus_path_list_train_14)

In [None]:
preprocessing_corpus_txt(pro_total_corpus_path_list_train_15, post_total_corpus_path_list_train_15)

In [None]:
preprocessing_corpus_txt(pro_total_corpus_path_list_train_16, post_total_corpus_path_list_train_16)

In [None]:
preprocessing_corpus_txt(pro_total_corpus_path_list_train_17, post_total_corpus_path_list_train_17)

In [None]:
preprocessing_corpus_txt(pro_total_corpus_path_list_valid_01, post_total_corpus_path_list_valid_01)

In [None]:
preprocessing_corpus_txt(pro_total_corpus_path_list_valid_02, post_total_corpus_path_list_valid_02)

In [None]:
preprocessing_corpus_txt(pro_total_corpus_path_list_valid_03, post_total_corpus_path_list_valid_03)

In [None]:
preprocessing_corpus_txt(pro_total_corpus_path_list_valid_04, post_total_corpus_path_list_valid_04)

In [None]:
preprocessing_corpus_txt(pro_total_corpus_path_list_valid_05, post_total_corpus_path_list_valid_05)

In [None]:
preprocessing_corpus_txt(pro_total_corpus_path_list_valid_06, post_total_corpus_path_list_valid_06)

In [None]:
preprocessing_corpus_txt(pro_total_corpus_path_list_valid_07, post_total_corpus_path_list_valid_07)

In [None]:
preprocessing_corpus_txt(pro_total_corpus_path_list_valid_08, post_total_corpus_path_list_valid_08)

In [None]:
preprocessing_corpus_txt(pro_total_corpus_path_list_valid_09, post_total_corpus_path_list_valid_09)

In [None]:
preprocessing_corpus_txt(pro_total_corpus_path_list_valid_10, post_total_corpus_path_list_valid_10)

In [None]:
preprocessing_corpus_txt(pro_total_corpus_path_list_valid_11, post_total_corpus_path_list_valid_11)

In [None]:
preprocessing_corpus_txt(pro_total_corpus_path_list_valid_12, post_total_corpus_path_list_valid_12)

In [None]:
preprocessing_corpus_txt(pro_total_corpus_path_list_valid_13, post_total_corpus_path_list_valid_13)

In [None]:
preprocessing_corpus_txt(pro_total_corpus_path_list_valid_14, post_total_corpus_path_list_valid_14)

In [None]:
preprocessing_corpus_txt(pro_total_corpus_path_list_valid_15, post_total_corpus_path_list_valid_15)

In [None]:
preprocessing_corpus_txt(pro_total_corpus_path_list_valid_16, post_total_corpus_path_list_valid_16)

In [None]:
preprocessing_corpus_txt(pro_total_corpus_path_list_valid_17, post_total_corpus_path_list_valid_17)

In [178]:
ray.shutdown()

In [None]:
preprocessing_corpus_path = "AIHUB_corpus/exploration/web_data_based_korean_corpus_data_post/AIHUB_web_data_based_korean_corpus_data_" +"*.txt"
merge_corpus_path = 'AIHUB_corpus/duplicate/AIHUB_web_data_based_korean_corpus_data_'
deduplicate_corpus_path = 'AIHUB_corpus/AIHUB_web_data_based_korean_corpus_data.txt'

topic_name_list = ['01_IT_과학', '02_건강', '03_경제', '04_교육', '05_국제', '06_라이프스타일',
                   '07_문화', '08_사건사고', '09_사회일반', '10_산업', '11_스포츠', '12_여성복지',
                   '13_여행레저', '14_연예', '15_정치', '16_지역', '17_취미']

In [None]:
merge_and_deduplicate_topic_corpus_txt(preprocessing_corpus_path, merge_corpus_path,
                                        deduplicate_corpus_path, topic_name_list)