## AIHub Json Preprocessing

### Development Environment

In [None]:
%pip install kss==3.7.3

KSS Argument Error: Restart Jupyter Kernel Runtime

In [None]:
%pip install python-mecab-ko

KSS 3.7.3 matches python-mecab-ko

In [None]:
%pip install pandas

In [1]:
import re
import os
import kss
import ray
import json
import time
from time import sleep
from tqdm import tqdm
from mecab import MeCab
import pandas as pd
from glob import glob
from itertools import chain

In [2]:
pwd

'D:\\AIHUB'

### AIHUB 도서자료 기계독해

[Source](https://aihub.or.kr/aihubdata/data/view.do?currMenu=115&topMenu=100&aihubDataSe=realm&dataSetSn=92)

#### Convert JSON File to TXT File

In [None]:
from data_preprocessing import make_json_txt_file_path_list
from data_preprocessing import divide_source_file_list
from extract_source_text import make_sources

In [43]:
json_path_list = ['AIHUB_도서자료 기계독해/'+ '/**/*.json']
txt_path_list = ["exploration_test/general_common_sense_pro/AIHUB_general_common_sense_"]

In [44]:
json_file_list, txt_file_path_list = \
    make_json_txt_file_path_list(json_path_list, txt_path_list)

The number of file: 2


In [45]:
source_file_index_df = pd.DataFrame(json_file_list, columns=['source_file_name'])
source_file_index_df.to_excel("source_file_index/reading_books_by_machine_source_file_index.xlsx", index=False)

In [47]:
def count_number_of_txt_file_with_batch_list(source_file_list, batch_size):

    source_file_by_batch_df = pd.DataFrame({'File':[0], 'Length of Source List':[0],
                                        'The Number of TXT File':[0], 
                                        'Description':[0]})
                                            
    the_number_of_total_txt_file = 0
    the_number_of_txt_file_list = []
    
    for i in range(len(source_file_list)):    
        
        source_file = source_file_list[i]   

        with open(source_file, 'r', encoding='utf-8') as one_json_file:
            one_json_sample = json.load(one_json_file)

        source_list = make_sources(one_json_sample)
        
        the_number_of_txt_file = ((len(source_list) // batch_size) + 1)

        if len(source_list) >= batch_size:
            source_file_by_batch_df.loc[i] = [source_file,
                                              len(source_list), the_number_of_txt_file, ""]
            the_number_of_txt_file_list.append(the_number_of_txt_file)
            the_number_of_total_txt_file  += the_number_of_txt_file

        elif len(source_list) < batch_size:
            source_file_by_batch_df.loc[i] = [source_file,
                                              len(source_list), the_number_of_txt_file,
                                              "not subject of batch. small source list."]
            the_number_of_txt_file_list.append(1)
            the_number_of_total_txt_file  += 1

    print("Batch Size:", batch_size)
    print("The number of txt file:", the_number_of_total_txt_file)

    if 'rain' in source_file:
        source_file_by_batch_df.to_excel("source_file_by_batch/reading_books_by_machine_train.xlsx", index=False)
    elif 'alid' in source_file:
        source_file_by_batch_df.to_excel("source_file_by_batch/reading_books_by_machine_valid.xlsx", index=False)
    else:
         source_file_by_batch_df.to_excel("source_file_by_batch/reading_books_by_machine.xlsx", index=False)

    return the_number_of_total_txt_file, the_number_of_txt_file_list

In [27]:
def write_jsontext_to_txt_file_with_batch_list(source_file_list,
                                    text_file_path_list,
                                    batch_size, the_number_of_txt_file_list):
  
  progress_length = sum(the_number_of_txt_file_list)
  print("[Size]")
  print("The number of preprocessing corpus: " + str(progress_length))
  print("\n[Order]")
  pbar = tqdm(range(progress_length))
  num = 0

  for i in range(len(source_file_list)):

    source_file = source_file_list[i]
    
    with open(source_file, 'r', encoding='utf-8') as one_json_file:
      one_json_sample = json.load(one_json_file)

    source_list = make_sources(one_json_sample)
    
    n = batch_size
    source_batch_list = list(divide_source_file_list(source_list, n))
      
    for source_list in source_batch_list:
        with open(os.path.join('AIHUB_corpus/' + text_file_path_list[i][:-4] + "_" + str(num) + ".txt"), "a", encoding='utf-8') as fp:        
          fp.write("\n".join(source_list)) 
        num += 1  
        pbar.n += 1
        pbar.refresh()
        time.sleep(0.01)
  pbar.close()  

In [30]:
batch_size = 1000
the_number_of_txt_file, the_number_of_txt_file_list = count_number_of_txt_file_with_batch_list(json_file_list, batch_size)

Batch Size: 1000
The number of txt file: 239


In [48]:
source_file_by_batch_df = pd.read_excel('source_file_by_batch/reading_books_by_machine.xlsx', engine='openpyxl')  
source_file_by_batch_df

Unnamed: 0.1,Unnamed: 0,File,Length of Source List,The Number of txt File,Description
0,0,AIHUB_도서자료 기계독해\Validation\도서.json ~ AIHUB_도...,12500,13,
1,1,AIHUB_도서자료 기계독해\Training\도서_220419_add\도서_2204...,225000,226,


In [20]:
batch_size = 1000
write_jsontext_to_txt_file_with_batch_list(json_file_list, txt_file_path_list,
                batch_size, the_number_of_txt_file_list)

[Size]
The number of preprocessing corpus: 239

[Order]


100%|█████████▉| 238/239 [00:25<00:00,  9.23it/s]


#### Preprocess TXT File

In [None]:
from sentence_segmentation import preprocessing_text
from data_preprocessing import make_pro_post_txt_file_path_list
from data_preprocessing import merge_and_deduplicate_corpus_txt
from reading_data import reading_txt

In [None]:
pro_corpus_path = "AIHUB_corpus/exploration/reading_books_by_machine_pro/AIHUB_reading_books_by_machine_" + "*.txt"
pro_total_corpus_path_list, post_total_corpus_path_list = make_pro_post_txt_file_path_list(pro_corpus_path)

In [29]:
len(pro_total_corpus_path_list)

238

In [None]:
pro_coprus_file = pro_total_corpus_path_list[0]
line_length = 1
data_type = "source"

reading_txt(pro_coprus_file, line_length, data_type)

In [None]:
pro_coprus_file = pro_total_corpus_path_list[0]
line_length = 1
data_type = "preprocessing"

reading_txt(pro_coprus_file, line_length, data_type)

In [None]:
ray.init(num_cpus = 4)

@ray.remote
def ray_preprocessing_text(source, corpus_path):

    preprocessing_sentence_list = preprocessing_text(source, corpus_path)

    return preprocessing_sentence_list

In [None]:
def preprocessing_corpus_txt(pro_total_corpus_path_list, post_total_corpus_path_list):
    
    progress_length = len(pro_total_corpus_path_list)
    print("[Size]")
    print("The number of preprocessing corpus: " + str(progress_length))
    print("\n[Order]")
    pbar = tqdm(range(progress_length))
    process_num = 10    

    for pro, post in zip(pro_total_corpus_path_list, post_total_corpus_path_list):

        sentence_list = []

        with open(pro, 'r', encoding='utf-8') as f:
            lines = f.read().splitlines() 
            nested_lines_num = len(lines) // process_num
            for i in range(nested_lines_num - 1):
                start_line = process_num * i
                end_line = process_num * (i+1)
                futures = [ray_preprocessing_text.remote(lines[start_line:end_line][j], pro) for j in range(process_num)]
                results = ray.get(futures)

                if i == nested_lines_num - 2:
                    futures = [ray_preprocessing_text.remote(lines[end_line:][j], pro) for j in range(len(lines) - end_line)]
                    results = ray.get(futures)

                sentences = list(chain.from_iterable(results))
                sentence_list.append(sentences)

        sentence_list = list(chain.from_iterable(sentence_list))

        with open(post, 'a', encoding='utf-8') as fp:
            fp.write("\n".join(sentence_list))

        pbar.n += 1
        pbar.refresh()
        time.sleep(0.01)

    pbar.close() 

In [None]:
preprocessing_corpus_txt(pro_total_corpus_path_list, post_total_corpus_path_list)

In [None]:
ray.shutdown()

In [None]:
preprocessing_corpus_path = "AIHUB_corpus/exploration/reading_books_by_machine_post/AIHUB_reading_books_by_machine_" + "*.txt"
merge_corpus_path = 'AIHUB_corpus/duplicate/AIHUB_reading_books_by_machine.txt'
deduplicate_corpus_path = 'AIHUB_corpus/AIHUB_reading_books_by_machine.txt'

In [None]:
merge_and_deduplicate_corpus_txt(preprocessing_corpus_path, merge_corpus_path, 
                                  deduplicate_corpus_path)