## AIHub Json Preprocessing

### Development Environment

In [None]:
%pip install kss==3.7.3

KSS Argument Error: Restart Jupyter Kernel Runtime

In [None]:
%pip install python-mecab-ko

KSS 3.7.3 matches python-mecab-ko

In [None]:
%pip install pandas

In [None]:
%pip install ray

In [1]:
import re
import os
import kss
import ray
import json
import time
from time import sleep
from tqdm import tqdm
from mecab import MeCab
import pandas as pd
from glob import glob
from itertools import chain

In [2]:
pwd

'c:\\Users\\MinSeok\\Documents\\AIHUB'

### AIHUB 일반상식

[Source](https://aihub.or.kr/aihubdata/data/view.do?currMenu=116&topMenu=100&aihubDataSe=ty&dataSetSn=106)

#### Convert JSON File to TXT File

In [None]:
from data_preprocessing import make_json_txt_file_path_list
from data_preprocessing import divide_source_file_list
from extract_source_text import make_sources

In [11]:
json_path_list = ['AIHUB_일반상식/'+ '/**/*.json']
txt_path_list = ["exploration/general_common_sense_pro/AIHUB_general_common_sense_"]

In [12]:
json_file_list, txt_file_path_list = \
    make_json_txt_file_path_list(json_path_list, txt_path_list)

The number of file: 125


In [13]:
source_file_index_df = pd.DataFrame(json_file_list, columns=['source_file_name'])
source_file_index_df.to_excel("source_file_index/general_common_sense_source_file_index.xlsx", index=False)

In [31]:
def count_number_of_txt_file_with_batch_list(source_file_list, batch_size):

    source_file_by_batch_df = pd.DataFrame({'File':[0], 'Length of Source List':[0],
                                        'The Number of TXT File':[0], 
                                        'Description':[0]})
                                            
    the_number_of_total_txt_file = 0
    the_number_of_txt_file_list = []
    
    for i in range(len(source_file_list)):    
        
        source_file = source_file_list[i]   

        with open(source_file, 'r', encoding='utf-8') as one_json_file:
            one_json_sample = json.load(one_json_file)
            
        source_list = make_sources(source_file, one_json_sample)
        
        the_number_of_txt_file = ((len(source_list) // batch_size) + 1)

        if len(source_list) >= batch_size:
            source_file_by_batch_df.loc[i] = [source_file,
                                              len(source_list), the_number_of_txt_file, ""]
            the_number_of_txt_file_list.append(the_number_of_txt_file)
            the_number_of_total_txt_file  += the_number_of_txt_file

        elif len(source_list) < batch_size:
            source_file_by_batch_df.loc[i] = [source_file,
                                              len(source_list), the_number_of_txt_file,
                                              "not subject of batch. small source list."]
            the_number_of_txt_file_list.append(1)
            the_number_of_total_txt_file  += 1

    print("Batch Size:", batch_size)
    print("The number of txt file:", the_number_of_total_txt_file)

    if 'rain' in source_file:
        source_file_by_batch_df.to_excel("source_file_by_batch/general_common_sense_train.xlsx", index=False)
    elif 'alid' in source_file:
        source_file_by_batch_df.to_excel("source_file_by_batch/general_common_sense_valid.xlsx", index=False)
    else:
         source_file_by_batch_df.to_excel("source_file_by_batch/general_common_sense.xlsx", index=False)

    return the_number_of_total_txt_file, the_number_of_txt_file_list

In [33]:
def write_jsontext_to_txt_file_with_batch_list(source_file_list,
                                    text_file_path_list,
                                    batch_size, the_number_of_txt_file_list):
  progress_length = sum(the_number_of_txt_file_list)
  print("[Size]")
  print("The number of preprocessing corpus: " + str(progress_length))
  print("\n[Order]")
  pbar = tqdm(range(progress_length))
  num = 0
  
  for i in range(len(source_file_list)):

    source_file = source_file_list[i]     

    with open(source_file, 'r', encoding='utf-8') as one_json_file:
      one_json_sample = json.load(one_json_file)

    if 'ko_wiki_v1_squad' in source_file:
      source_list = make_sources(source_file, one_json_sample)
      
      n = batch_size
      source_batch_list = list(divide_source_file_list(source_list, n))
          
      for source_list in source_batch_list:
        num += 1
        print(str(num), end=" ")  
        
        with open(os.path.join('AIHUB_corpus/' + text_file_path_list[i][:-4] + "_" + str(num) + ".txt"), "a", encoding='utf-8') as fp:        
            fp.write("\n".join(source_list))   
                

    else:
      source_list = make_sources(source_file, one_json_sample)
    
      n = batch_size
      source_batch_list = list(divide_source_file_list(source_list, n))
          
      for source_list in source_batch_list:
  
        with open(os.path.join('AIHUB_corpus/' + text_file_path_list[i][:-4] + "_" + str(num) + ".txt"), "a", encoding='utf-8') as fp:        
            fp.write("\n".join(source_list))    
        num += 1  
        pbar.n += 1
        pbar.refresh()
        time.sleep(0.01)      
  pbar.close()         

In [20]:
batch_size = 10000
the_number_of_txt_file, the_number_of_txt_file_list = count_number_of_txt_file_with_batch_list(json_file_list, batch_size)

Batch Size: 10000
The number of txt file: 131


In [21]:
source_file_by_batch_df = pd.read_excel('source_file_by_batch/general_common_sense.xlsx', engine='openpyxl')  
source_file_by_batch_df

Unnamed: 0.1,Unnamed: 0,File,Length of Source List,The Number of txt File,Description
0,0,AIHUB_일반상식\ko_wiki_v1_squad.json ~ AIHUB_일반상...,68538,7,
1,1,AIHUB_일반상식\mutual_무형대용어_상호참조복원\edited_[A1] 000...,26,1,not subject of batch. small source list.
2,2,AIHUB_일반상식\mutual_무형대용어_상호참조복원\edited_[A1] 006...,37,1,not subject of batch. small source list.
3,3,AIHUB_일반상식\mutual_무형대용어_상호참조복원\edited_[A1] 012...,6,1,not subject of batch. small source list.
4,4,AIHUB_일반상식\mutual_무형대용어_상호참조복원\edited_[A2] 000...,50,1,not subject of batch. small source list.
...,...,...,...,...,...
120,120,AIHUB_일반상식\mutual_무형대용어_상호참조복원\edited_[A3] 000...,51,1,not subject of batch. small source list.
121,121,AIHUB_일반상식\mutual_무형대용어_상호참조복원\edited_[A1] 009...,9,1,not subject of batch. small source list.
122,122,AIHUB_일반상식\mutual_무형대용어_상호참조복원\edited_[A1] 032...,8,1,not subject of batch. small source list.
123,123,AIHUB_일반상식\mutual_무형대용어_상호참조복원\edited_[A1] 010...,7,1,not subject of batch. small source list.


In [34]:
batch_size = 10000
write_jsontext_to_txt_file_with_batch_list(json_file_list, txt_file_path_list, batch_size, the_number_of_txt_file_list)

[Size]
The number of preprocessing corpus: 131

[Order]


  0%|          | 0/131 [03:24<?, ?it/s]


1 2 3 4 5 6 




7 

 95%|█████████▍| 124/131 [00:14<00:00,  8.45it/s]A


#### Preprocess TXT File

In [None]:
from sentence_segmentation import preprocessing_text
from data_preprocessing import make_pro_post_txt_file_path_list
from data_preprocessing import merge_and_deduplicate_corpus_txt
from reading_data import reading_txt

In [None]:
pro_corpus_path = "AIHUB_corpus/exploration/general_common_sense_pro/AIHUB_general_common_sense_" + "*.txt"
pro_total_corpus_path_list, post_total_corpus_path_list = make_pro_post_txt_file_path_list(pro_corpus_path)

In [None]:
len(pro_total_corpus_path_list)

In [None]:
pro_coprus_file = pro_total_corpus_path_list[0]
line_length = 1
data_type = "source"

reading_txt(pro_coprus_file, line_length, data_type)

In [None]:
pro_coprus_file = pro_total_corpus_path_list[0]
line_length = 1
data_type = "preprocessing"

reading_txt(pro_coprus_file, line_length, data_type)

In [None]:
ray.init(num_cpus = 4)

@ray.remote
def ray_preprocessing_text(source, corpus_path):

    preprocessing_sentence_list = preprocessing_text(source, corpus_path)

    return preprocessing_sentence_list

In [None]:
def preprocessing_corpus_txt(pro_total_corpus_path_list, post_total_corpus_path_list):
    
    progress_length = len(pro_total_corpus_path_list)
    print("[Size]")
    print("The number of preprocessing corpus: " + str(progress_length))
    print("\n[Order]")
    pbar = tqdm(range(progress_length))
    process_num = 10    

    for pro, post in zip(pro_total_corpus_path_list, post_total_corpus_path_list):

        sentence_list = []

        with open(pro, 'r', encoding='utf-8') as f:
            lines = f.read().splitlines() 
            nested_lines_num = len(lines) // process_num
            for i in range(nested_lines_num - 1):
                start_line = process_num * i
                end_line = process_num * (i+1)
                futures = [ray_preprocessing_text.remote(lines[start_line:end_line][j], pro) for j in range(process_num)]
                results = ray.get(futures)

                if i == nested_lines_num - 2:
                    futures = [ray_preprocessing_text.remote(lines[end_line:][j], pro) for j in range(len(lines) - end_line)]
                    results = ray.get(futures)

                sentences = list(chain.from_iterable(results))
                sentence_list.append(sentences)

        sentence_list = list(chain.from_iterable(sentence_list))

        with open(post, 'a', encoding='utf-8') as fp:
            fp.write("\n".join(sentence_list))

        pbar.n += 1
        pbar.refresh()
        time.sleep(0.01)

    pbar.close() 

In [None]:
preprocessing_corpus_txt(pro_total_corpus_path_list, post_total_corpus_path_list)

In [None]:
ray.shutdown()

In [None]:
preprocessing_corpus_path = "AIHUB_corpus/exploration/general_common_sense/AIHUB_general_common_sense_" +"*.txt"
merge_corpus_path = 'AIHUB_corpus/duplicate/AIHUB_general_common_sense.txt'
deduplicate_corpus_path = 'AIHUB_corpus/AIHUB_general_common_sense.txt'

In [None]:
merge_and_deduplicate_corpus_txt(preprocessing_corpus_path, merge_corpus_path, 
                                  deduplicate_corpus_path)