## AIHub Json Preprocessing

### Development Environment

In [None]:
%pip install kss==3.7.3

If KSS raises an argument error after installation, restart the Jupyter kernel (runtime) and run the notebook again.

In [None]:
%pip install python-mecab-ko

KSS 3.7.3 is compatible with python-mecab-ko.

In [None]:
%pip install pandas

In [None]:
%pip install ray

In [1]:
import re
import os
import kss
import ray
import json
import time
from time import sleep
from tqdm import tqdm
from mecab import MeCab
import pandas as pd
from glob import glob
from itertools import chain

In [2]:
pwd

'c:\\Users\\MinSeok\\Documents\\AIHUB'

### AIHUB 문서요약 텍스트 (Document Summary Text)

[Source](https://aihub.or.kr/aihubdata/data/view.do?currMenu=115&topMenu=100&aihubDataSe=realm&dataSetSn=97)

#### Convert JSON File to TXT File

In [None]:
from data_preprocessing import make_train_valid_json_txt_file_path_list
from data_preprocessing import divide_source_file_list
from extract_source_text import make_sources

In [11]:
# Glob-style patterns for the raw AIHub JSON files: Training first, Validation second.
# NOTE(review): the string concatenation produces a double slash ("Training//**/*.json");
# confirm the path helper tolerates it.
json_path_list = ['AIHUB_문서요약 텍스트/Training/'+ '/**/*.json', 
                  'AIHUB_문서요약 텍스트/Validation/'+ '/**/*.json']
# Output TXT path prefixes, listed in the same train/valid order as the patterns above.
txt_path_list = ["exploration/document_summary_text_pro/AIHUB_document_summary_text_train_", 
                 "exploration/document_summary_text_pro/AIHUB_document_summary_text_valid_"]

In [12]:
# Build the train/valid JSON file lists and their matching TXT output path lists
# from the patterns and prefixes defined in the previous cell.
train_json_file_list, valid_json_file_list, train_txt_file_path_list, valid_txt_file_path_list = \
    make_train_valid_json_txt_file_path_list(json_path_list, txt_path_list)

The number of file: 6


In [13]:
# Save the list of training JSON files as a one-column Excel index.
source_file_index_df = pd.DataFrame(train_json_file_list, columns=['source_file_name'])
source_file_index_df.to_excel("source_file_index/document_summary_text_source_train_file_index.xlsx", index=False)

# Same for the validation JSON files (the variable name is reused for the second frame).
source_file_index_df = pd.DataFrame(valid_json_file_list, columns=['source_file_name'])
source_file_index_df.to_excel("source_file_index/document_summary_text_source_valid_file_index.xlsx", index=False)

In [15]:
def count_number_of_txt_file_with_batch_list(source_file_list, batch_size):
    """Count how many batch TXT files each source JSON file will be split into.

    Every JSON file in ``source_file_list`` is loaded, converted to a list of
    source texts with ``make_sources``, and the number of ``batch_size``-sized
    chunks needed to hold that list is computed. A per-file summary is written
    to an Excel sheet under ``source_file_by_batch/``.

    Args:
        source_file_list: Sequence of paths to the source JSON files.
        batch_size: Maximum number of source texts per TXT file.

    Returns:
        A tuple ``(total, per_file)`` where ``per_file`` holds the number of
        TXT files for each source file (in input order) and ``total`` is
        their sum.
    """
    summary_rows = []
    the_number_of_txt_file_list = []

    for source_file in source_file_list:

        with open(source_file, 'r', encoding='utf-8') as one_json_file:
            one_json_sample = json.load(one_json_file)

        source_length = len(make_sources(one_json_sample))

        if source_length >= batch_size:
            # Ceiling division: an exact multiple of batch_size must not be
            # counted as one extra (empty) file.
            # NOTE(review): assumes divide_source_file_list yields chunks of at
            # most batch_size items with no empty trailing chunk - confirm.
            the_number_of_txt_file = -(-source_length // batch_size)
            description = ""
        else:
            # Lists smaller than one batch are still reported as a single file.
            the_number_of_txt_file = 1
            description = "not subject of batch. small source list."

        summary_rows.append([source_file, source_length,
                             the_number_of_txt_file, description])
        the_number_of_txt_file_list.append(the_number_of_txt_file)

    the_number_of_total_txt_file = sum(the_number_of_txt_file_list)

    # Build the frame once from the collected rows instead of seeding it with
    # a dummy all-zero row and growing it with .loc inside the loop.
    source_file_by_batch_df = pd.DataFrame(
        summary_rows,
        columns=['File', 'Length of Source List',
                 'The Number of TXT File', 'Description'])

    print("Batch Size:", batch_size)
    print("The number of txt file:", the_number_of_total_txt_file)

    # The output name is chosen from the last processed path; an empty input
    # list falls through to the generic file name instead of raising NameError.
    last_source_file = source_file_list[-1] if len(source_file_list) > 0 else ""

    if 'rain' in last_source_file:
        source_file_by_batch_df.to_excel("source_file_by_batch/document_summary_text_train.xlsx", index=False)
    elif 'alid' in last_source_file:
        source_file_by_batch_df.to_excel("source_file_by_batch/document_summary_text_valid.xlsx", index=False)
    else:
        source_file_by_batch_df.to_excel("source_file_by_batch/document_summary_text.xlsx", index=False)

    return the_number_of_total_txt_file, the_number_of_txt_file_list

In [16]:
def write_jsontext_to_txt_file_with_batch_list(source_file_list, text_file_path_list, batch_size, the_number_of_txt_file_list):
    """Split the source texts of each JSON file into batches and write one TXT file per batch.

    Args:
        source_file_list: Sequence of paths to the source JSON files.
        text_file_path_list: Output path for each source file, index-aligned
            with ``source_file_list``. The last four characters are dropped
            before the running file number is appended (presumably a ".txt"
            suffix - TODO confirm against the path helper).
        batch_size: Chunk size passed to ``divide_source_file_list``.
        the_number_of_txt_file_list: Expected TXT file count per source file;
            only its sum is used, as the progress-bar total.

    Returns:
        None. Files are written below ``AIHUB_corpus/``.
    """
    progress_length = sum(the_number_of_txt_file_list)
    print("[Size]")
    print("The number of preprocessing corpus: " + str(progress_length))
    print("\n[Order]")

    # Running index across all source files so every output name is unique.
    txt_file_index = 0

    # The context manager closes the bar even if reading or writing fails.
    with tqdm(total=progress_length) as pbar:
        for i, source_file in enumerate(source_file_list):

            with open(source_file, 'r', encoding='utf-8') as one_json_file:
                one_json_sample = json.load(one_json_file)

            source_list = make_sources(one_json_sample)

            for source_batch in divide_source_file_list(source_list, batch_size):
                txt_file_path = ('AIHUB_corpus/' + text_file_path_list[i][:-4]
                                 + "_" + str(txt_file_index) + ".txt")
                os.makedirs(os.path.dirname(txt_file_path), exist_ok=True)

                # "w" rather than "a": each file name is written once per run,
                # so re-running the cell must not duplicate its content.
                with open(txt_file_path, "w", encoding='utf-8') as fp:
                    fp.write("\n".join(source_batch))

                txt_file_index += 1
                pbar.update(1)

In [17]:
# Number of source texts per TXT file.
batch_size = 1000
# Count the TXT files the training set will produce; also writes the per-file summary sheet.
the_number_of_train_txt_file, the_number_of_train_txt_file_list = count_number_of_txt_file_with_batch_list(train_json_file_list, batch_size)

Batch Size: 1000
The number of txt file: 2474


In [19]:
# Reload the training summary sheet written by count_number_of_txt_file_with_batch_list and display it.
source_file_by_batch_train_df = pd.read_excel('source_file_by_batch/document_summary_text_train.xlsx', engine='openpyxl')  
source_file_by_batch_train_df

Unnamed: 0.1,Unnamed: 0,File,Length of Source List,The Number of txt File,Description
0,0,AIHUB_문서요약 텍스트/Training\법률_train_original\trai...,42178,43,
1,1,AIHUB_문서요약 텍스트/Training\사설_train_original\trai...,242550,243,
2,2,AIHUB_문서요약 텍스트/Training\신문기사_train_original\tr...,2187380,2188,


In [None]:
# Write the training source texts to batch TXT files; batch_size must match the value used for counting.
batch_size = 1000
write_jsontext_to_txt_file_with_batch_list(train_json_file_list, train_txt_file_path_list,
                batch_size, the_number_of_train_txt_file_list)

In [18]:
# Number of source texts per TXT file.
batch_size = 1000
# Count the TXT files the validation set will produce; also writes the per-file summary sheet.
the_number_of_valid_txt_file, the_number_of_valid_txt_file_list = count_number_of_txt_file_with_batch_list(valid_json_file_list, batch_size)

Batch Size: 1000
The number of txt file: 271


In [22]:
# Reload the validation summary sheet written by count_number_of_txt_file_with_batch_list and display it.
source_file_by_batch_valid_df = pd.read_excel('source_file_by_batch/document_summary_text_valid.xlsx', engine='openpyxl')  
source_file_by_batch_valid_df

Unnamed: 0.1,Unnamed: 0,File,Length of Source List,The Number of txt File,Description
0,0,AIHUB_문서요약 텍스트/Validation\법률_valid_original\va...,4780,5,
1,1,AIHUB_문서요약 텍스트/Validation\사설_valid_original\va...,25202,26,
2,2,AIHUB_문서요약 텍스트/Validation\신문기사_valid_original\...,239892,240,


In [23]:
# Write the validation source texts to batch TXT files; batch_size must match the value used for counting.
batch_size = 1000
write_jsontext_to_txt_file_with_batch_list(valid_json_file_list, valid_txt_file_path_list,
                batch_size, the_number_of_valid_txt_file_list)

[Size]
The number of preprocessing corpus: 271

[Order]


100%|██████████| 271/271 [00:26<00:00, 10.24it/s]


#### Preprocess TXT File

In [None]:
from sentence_segmentation import preprocessing_text
from data_preprocessing import make_pro_post_txt_file_path_list
from data_preprocessing import merge_and_deduplicate_corpus_txt
from reading_data import reading_txt

In [None]:
# Glob pattern matching every batch TXT file written in the JSON-to-TXT step.
pro_corpus_path = "AIHUB_corpus/exploration/document_summary_text_pro/AIHUB_document_summary_text_" + "*.txt"
# Input (pre-processing) paths and their corresponding output (post-processing) paths.
pro_total_corpus_path_list, post_total_corpus_path_list = make_pro_post_txt_file_path_list(pro_corpus_path)

In [20]:
len(pro_total_corpus_path_list)

2745

In [63]:
# Preview one line of the first corpus file with data_type="source".
# NOTE(review): `pro_coprus_file` looks like a typo for `pro_corpus_file`.
pro_coprus_file = pro_total_corpus_path_list[0]
line_length = 1
data_type = "source"

reading_txt(pro_coprus_file, line_length, data_type)

 가. 부가가치세법 제22조 제3항 단서에 제1호와 제2호가 동시에 해당한다는 뜻은 제18조의 예정신고와 예정납부끼리, 제19조의 확정신고와 확정납부끼리 동시에 해당하는 경우를 말하는 것이지 제18조의 예정신고나 그 납부와 제19조의 확정신고나 그 납부가 동시에 해당하는 경우를 가리키는 것이 아니라고 해석되므로 부가가치세의 예정신고 또는 그 납부를 아니하고 또한 확정신고 또는 그 납부를 아니한 경우 통털어서 한번만 가산세를 부과하는 것이 아니라 각각 독립하여 가산세부과대상이 된다.



In [64]:
# Preview the same line with data_type="preprocessing", for comparison with the "source" preview.
# NOTE(review): `pro_coprus_file` looks like a typo for `pro_corpus_file`.
pro_coprus_file = pro_total_corpus_path_list[0]
line_length = 1
data_type = "preprocessing"

reading_txt(pro_coprus_file, line_length, data_type)

부가가치세법 제22조 제3항 단서에 제1호와 제2호가 동시에 해당한다는 뜻은 제18조의 예정신고와 예정납부끼리, 제19조의 확정신고와 확정납부끼리 동시에 해당하는 경우를 말하는 것이지 제18조의 예정신고나 그 납부와 제19조의 확정신고나 그 납부가 동시에 해당하는 경우를 가리키는 것이 아니라고 해석되므로 부가가치세의 예정신고 또는 그 납부를 아니하고 또한 확정신고 또는 그 납부를 아니한 경우 통털어서 한번만 가산세를 부과하는 것이 아니라 각각 독립하여 가산세부과대상이 된다.



In [65]:
# Start a local Ray instance with four CPUs. ignore_reinit_error keeps the
# cell re-runnable: without it a second ray.init() in the same kernel raises.
ray.init(num_cpus = 4, ignore_reinit_error=True)

@ray.remote
def ray_preprocessing_text(source, corpus_path):
    """Ray remote task wrapping ``preprocessing_text`` for one source line.

    Args:
        source: One line of text read from a corpus file.
        corpus_path: Path of the corpus file the line came from; forwarded
            unchanged to ``preprocessing_text``.

    Returns:
        Whatever ``preprocessing_text`` returns for this line; the caller
        flattens the results, so this is expected to be a list of sentences.
    """
    return preprocessing_text(source, corpus_path)

2023-05-31 17:08:20,288	INFO worker.py:1625 -- Started a local Ray instance.


In [None]:
def preprocessing_corpus_txt(pro_total_corpus_path_list, post_total_corpus_path_list):
    """Preprocess every corpus TXT file in parallel with Ray and write the results.

    Each input file is read line by line. Lines are submitted to
    ``ray_preprocessing_text`` in groups of ``process_num`` tasks, and the
    returned sentence lists are concatenated in input order and written,
    newline-separated, to the paired output path.

    Every line is processed exactly once. The previous batching logic
    discarded the results of the second-to-last batch and skipped files with
    fewer than ``2 * process_num`` lines entirely.

    Args:
        pro_total_corpus_path_list: Paths of the TXT files to preprocess.
        post_total_corpus_path_list: Output paths, paired with the input
            paths by position.

    Returns:
        None. One output file is written per input file.
    """
    progress_length = len(pro_total_corpus_path_list)
    print("[Size]")
    print("The number of preprocessing corpus: " + str(progress_length))
    print("\n[Order]")

    # Number of Ray tasks submitted and awaited together.
    process_num = 10

    # The context manager closes the bar even if a file fails mid-run.
    with tqdm(total=progress_length) as pbar:
        for pro, post in zip(pro_total_corpus_path_list, post_total_corpus_path_list):

            with open(pro, 'r', encoding='utf-8') as f:
                lines = f.read().splitlines()

            sentence_list = []

            # Step through the file in process_num-sized slices; the final
            # slice simply holds the remaining (possibly fewer) lines.
            for start_line in range(0, len(lines), process_num):
                line_batch = lines[start_line:start_line + process_num]
                futures = [ray_preprocessing_text.remote(line, pro) for line in line_batch]

                # ray.get returns results in submission order, so the
                # sentence order follows the line order.
                for sentences in ray.get(futures):
                    sentence_list.extend(sentences)

            # "w" rather than "a": re-running the cell must not append a
            # second copy of the sentences to an existing output file.
            with open(post, 'w', encoding='utf-8') as fp:
                fp.write("\n".join(sentence_list))

            pbar.update(1)

In [None]:
# Preprocess every corpus file using the Ray remote task defined above.
preprocessing_corpus_txt(pro_total_corpus_path_list, post_total_corpus_path_list)

In [67]:
# Shut down the local Ray instance once preprocessing has finished.
ray.shutdown()

In [None]:
# Glob pattern for the preprocessed TXT files, plus two output paths:
# one under "duplicate/" for the merged corpus and one for the deduplicated corpus.
preprocessing_corpus_path = "AIHUB_corpus/exploration/document_summary_text_post/AIHUB_document_summary_text_" +"*.txt"
merge_corpus_path = 'AIHUB_corpus/duplicate/AIHUB_document_summary_text.txt'
deduplicate_corpus_path = 'AIHUB_corpus/AIHUB_document_summary_text.txt'

In [None]:
# Merge the preprocessed files and deduplicate the result, using the three paths defined in the previous cell.
merge_and_deduplicate_corpus_txt(preprocessing_corpus_path, merge_corpus_path, 
                                  deduplicate_corpus_path)