## AIHub JSON Preprocessing

### Development Environment

In [None]:
%pip install kss==3.7.3

If KSS raises an argument error after installation, restart the Jupyter kernel (runtime) and run the notebook again.

In [None]:
%pip install python-mecab-ko

KSS 3.7.3 is compatible with python-mecab-ko.

In [None]:
%pip install pandas

In [None]:
%pip install ray

In [1]:
import re
import os
import kss
import ray
import json
import time
from time import sleep
from tqdm import tqdm
from mecab import MeCab
import pandas as pd
from glob import glob
from itertools import chain

In [2]:
pwd

'D:\\AIHUB'

### AIHUB 특허분야 자동분류 데이터

[Source](https://aihub.or.kr/aihubdata/data/view.do?currMenu=116&topMenu=100&aihubDataSe=ty&dataSetSn=547)

#### Convert JSON File to TXT File

In [None]:
from data_preprocessing import make_train_valid_json_txt_file_path_list
from data_preprocessing import divide_source_file_list
from extract_source_text import make_sources

In [11]:
# Glob patterns for the raw AIHub JSON files: training first, validation second.
# Note that the concatenation produces a double slash ("Training//**/*.json").
json_path_list = [
    'AIHUB_특허 분야 자동분류 데이터/Training/' + '/**/*.json',
    'AIHUB_특허 분야 자동분류 데이터/Validation/' + '/**/*.json',
]

# TXT output path prefixes, in the same train/valid order as json_path_list.
txt_path_list = [
    "exploration/automatic_patent_classification_data_pro/AIHUB_automatic_patent_classification_data_train_",
    "exploration/automatic_patent_classification_data_pro/AIHUB_automatic_patent_classification_data_valid_",
]

In [12]:
# Resolve the JSON source files and their TXT output paths for both splits.
(
    train_json_file_list,
    valid_json_file_list,
    train_txt_file_path_list,
    valid_txt_file_path_list,
) = make_train_valid_json_txt_file_path_list(json_path_list, txt_path_list)

The number of file: 1128


In [14]:
# Save the train and validation source-file lists as one-column Excel index sheets.
for source_json_file_list, index_xlsx_path in (
    (train_json_file_list,
     "source_file_index/automatic_patent_classification_data_source_train_file_index.xlsx"),
    (valid_json_file_list,
     "source_file_index/automatic_patent_classification_data_source_valid_file_index.xlsx"),
):
    source_file_index_df = pd.DataFrame(source_json_file_list, columns=['source_file_name'])
    source_file_index_df.to_excel(index_xlsx_path, index=False)

In [21]:
def count_number_of_txt_file_with_batch_list(source_file_list, batch_size):
    """Count how many batched TXT files each source JSON file will be split into.

    Every path in ``source_file_list`` is loaded as JSON and passed to
    ``make_sources``; the length of the returned list is compared against
    ``batch_size``. A per-file summary is saved as an Excel sheet under
    ``source_file_by_batch/``. The sheet name is chosen from the last source
    path: it contains ``'rain'`` -> train sheet, ``'alid'`` -> valid sheet,
    otherwise ``machine_reading.xlsx``. Nothing is written when
    ``source_file_list`` is empty.

    Args:
        source_file_list: Sequence of JSON file paths.
        batch_size: Maximum number of source texts per TXT file (must be >= 1).

    Returns:
        Tuple ``(total_txt_file_count, per_file_txt_file_counts)``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    summary_rows = []
    the_number_of_txt_file_list = []

    for source_file in source_file_list:

        with open(source_file, 'r', encoding='utf-8') as one_json_file:
            one_json_sample = json.load(one_json_file)

        source_list = make_sources(one_json_sample)
        source_length = len(source_list)

        if source_length >= batch_size:
            # Ceiling division: an exact multiple of batch_size must not be
            # counted as one extra file.
            # Assumes divide_source_file_list yields consecutive chunks of at
            # most batch_size items with no empty trailing chunk - TODO confirm.
            the_number_of_txt_file = -(-source_length // batch_size)
            description = ""
        else:
            # Lists shorter than one batch are always counted as a single file.
            the_number_of_txt_file = 1
            description = "not subject of batch. small source list."

        summary_rows.append([source_file, source_length,
                             the_number_of_txt_file, description])
        the_number_of_txt_file_list.append(the_number_of_txt_file)

    the_number_of_total_txt_file = sum(the_number_of_txt_file_list)

    print("Batch Size:", batch_size)
    print("The number of txt file:", the_number_of_total_txt_file)

    if not summary_rows:
        # No source files: there is no path to pick the sheet name from.
        return the_number_of_total_txt_file, the_number_of_txt_file_list

    # Build the frame once from the collected rows instead of growing it row by row.
    source_file_by_batch_df = pd.DataFrame(
        summary_rows,
        columns=['File', 'Length of Source List',
                 'The Number of TXT File', 'Description'])

    # The sheet name is derived from the last processed source path.
    last_source_file = source_file_list[-1]
    if 'rain' in last_source_file:
        source_file_by_batch_df.to_excel("source_file_by_batch/automatic_patent_classification_data_train.xlsx", index=False)
    elif 'alid' in last_source_file:
        source_file_by_batch_df.to_excel("source_file_by_batch/automatic_patent_classification_data_valid.xlsx", index=False)
    else:
        source_file_by_batch_df.to_excel("source_file_by_batch/machine_reading.xlsx", index=False)

    return the_number_of_total_txt_file, the_number_of_txt_file_list

In [22]:
def write_jsontext_to_txt_file_with_batch_list(source_file_list, text_file_path_list, batch_size, the_number_of_txt_file_list):
    """Split the text of each source JSON file into batches and write one TXT file per batch.

    Each JSON file is loaded and passed to ``make_sources``; the result is
    chunked with ``divide_source_file_list(source_list, batch_size)`` and every
    chunk is written, newline-joined, to
    ``AIHUB_corpus/<text_file_path_list[i][:-4]>_<num>.txt``. ``num`` is a
    running counter shared across all source files.

    Args:
        source_file_list: Sequence of JSON file paths.
        text_file_path_list: TXT path templates, indexed in parallel with
            ``source_file_list``.
        batch_size: Chunk size handed to ``divide_source_file_list``.
        the_number_of_txt_file_list: Expected number of TXT files per source
            file; only its sum is used, as the progress-bar total.

    Returns:
        None. Output files are written to disk.
    """
    progress_length = sum(the_number_of_txt_file_list)
    print("[Size]")
    print("The number of preprocessing corpus: " + str(progress_length))
    print("\n[Order]")

    # Running suffix shared by all source files, so every output name is unique.
    num = 0

    # The context manager closes the bar even if a file fails to load or write.
    with tqdm(total=progress_length) as pbar:
        for i, source_file in enumerate(source_file_list):

            with open(source_file, 'r', encoding='utf-8') as one_json_file:
                one_json_sample = json.load(one_json_file)

            source_list = make_sources(one_json_sample)

            for source_batch in divide_source_file_list(source_list, batch_size):
                # NOTE(review): [:-4] presumably strips a ".txt" extension from
                # the template path - verify against
                # make_train_valid_json_txt_file_path_list.
                txt_file_path = ('AIHUB_corpus/' + text_file_path_list[i][:-4]
                                 + "_" + str(num) + ".txt")
                os.makedirs(os.path.dirname(txt_file_path), exist_ok=True)

                # "w" rather than "a": each name is unique within a run, and
                # overwriting keeps a re-run from duplicating the corpus.
                with open(txt_file_path, "w", encoding='utf-8') as fp:
                    fp.write("\n".join(source_batch))

                num += 1
                pbar.update(1)

In [19]:
# Count the TXT files the training split will produce with 1000 texts per file.
batch_size = 1000
(
    the_number_of_train_txt_file,
    the_number_of_train_txt_file_list,
) = count_number_of_txt_file_with_batch_list(train_json_file_list, batch_size)

Batch Size: 1000
The number of txt file: 564


In [20]:
# Load the training batch summary from the directory that
# count_number_of_txt_file_with_batch_list writes to ("source_file_by_batch/").
source_file_by_batch_train_df = pd.read_excel(
    'source_file_by_batch/automatic_patent_classification_data_train.xlsx',
    engine='openpyxl',
)
source_file_by_batch_train_df

Unnamed: 0.1,Unnamed: 0,File,Length of Source List,The Number of txt File,Description
0,0,AIHUB_특허 분야 자동분류 데이터/Training\원천데이터\TS1\B_광업_0...,800,1,not subject of batch. small source list.
1,1,AIHUB_특허 분야 자동분류 데이터/Training\원천데이터\TS1\A_농업_임...,760,1,not subject of batch. small source list.
2,2,AIHUB_특허 분야 자동분류 데이터/Training\원천데이터\TS1\A_농업_임...,799,1,not subject of batch. small source list.
3,3,AIHUB_특허 분야 자동분류 데이터/Training\원천데이터\TS1\A_농업_임...,718,1,not subject of batch. small source list.
4,4,AIHUB_특허 분야 자동분류 데이터/Training\원천데이터\TS1\A_농업_임...,762,1,not subject of batch. small source list.
...,...,...,...,...,...
559,559,AIHUB_특허 분야 자동분류 데이터/Training\원천데이터\TS1\C_제조업_...,748,1,not subject of batch. small source list.
560,560,AIHUB_특허 분야 자동분류 데이터/Training\원천데이터\TS1\J_정보통신...,660,1,not subject of batch. small source list.
561,561,AIHUB_특허 분야 자동분류 데이터/Training\원천데이터\TS1\C_제조업_...,800,1,not subject of batch. small source list.
562,562,AIHUB_특허 분야 자동분류 데이터/Training\원천데이터\TS1\C_제조업_...,800,1,not subject of batch. small source list.


In [23]:
# Write the training split to batched TXT files.
batch_size = 1000
write_jsontext_to_txt_file_with_batch_list(
    source_file_list=train_json_file_list,
    text_file_path_list=train_txt_file_path_list,
    batch_size=batch_size,
    the_number_of_txt_file_list=the_number_of_train_txt_file_list,
)

[Size]
The number of preprocessing corpus: 564

[Order]


100%|██████████| 564/564 [00:31<00:00, 17.97it/s]


In [24]:
# Count the TXT files the validation split will produce with 1000 texts per file.
batch_size = 1000
(
    the_number_of_valid_txt_file,
    the_number_of_valid_txt_file_list,
) = count_number_of_txt_file_with_batch_list(valid_json_file_list, batch_size)

Batch Size: 1000
The number of txt file: 564


In [25]:
# Load the validation batch summary sheet and display it.
source_file_by_batch_valid_df = pd.read_excel(
    'source_file_by_batch/automatic_patent_classification_data_valid.xlsx',
    engine='openpyxl',
)
source_file_by_batch_valid_df

Unnamed: 0.1,Unnamed: 0,File,Length of Source List,The Number of txt File,Description
0,0,AIHUB_특허 분야 자동분류 데이터/Validation\원천데이터\VS1\B_광업...,100,1,not subject of batch. small source list.
1,1,AIHUB_특허 분야 자동분류 데이터/Validation\원천데이터\VS1\A_농업...,100,1,not subject of batch. small source list.
2,2,AIHUB_특허 분야 자동분류 데이터/Validation\원천데이터\VS1\A_농업...,100,1,not subject of batch. small source list.
3,3,AIHUB_특허 분야 자동분류 데이터/Validation\원천데이터\VS1\A_농업...,100,1,not subject of batch. small source list.
4,4,AIHUB_특허 분야 자동분류 데이터/Validation\원천데이터\VS1\A_농업...,100,1,not subject of batch. small source list.
...,...,...,...,...,...
559,559,AIHUB_특허 분야 자동분류 데이터/Validation\원천데이터\VS1\C_제조...,100,1,not subject of batch. small source list.
560,560,AIHUB_특허 분야 자동분류 데이터/Validation\원천데이터\VS1\J_정보...,100,1,not subject of batch. small source list.
561,561,AIHUB_특허 분야 자동분류 데이터/Validation\원천데이터\VS1\C_제조...,100,1,not subject of batch. small source list.
562,562,AIHUB_특허 분야 자동분류 데이터/Validation\원천데이터\VS1\C_제조...,100,1,not subject of batch. small source list.


In [26]:
# Write the validation split to batched TXT files.
batch_size = 1000
write_jsontext_to_txt_file_with_batch_list(
    source_file_list=valid_json_file_list,
    text_file_path_list=valid_txt_file_path_list,
    batch_size=batch_size,
    the_number_of_txt_file_list=the_number_of_valid_txt_file_list,
)

[Size]
The number of preprocessing corpus: 564

[Order]


100%|██████████| 564/564 [00:19<00:00, 29.09it/s]


#### Preprocess TXT File

In [None]:
from sentence_segmentation import preprocessing_text
from data_preprocessing import make_pro_post_txt_file_path_list
from data_preprocessing import merge_and_deduplicate_corpus_txt
from reading_data import reading_txt

In [16]:
# Glob pattern for the TXT files awaiting preprocessing.
pro_corpus_path = "AIHUB_corpus/exploration/automatic_patent_classification_data_pro/AIHUB_automatic_patent_classification_data_" + "*.txt"

# Parallel lists: input ("pro") paths and their output ("post") paths.
(
    pro_total_corpus_path_list,
    post_total_corpus_path_list,
) = make_pro_post_txt_file_path_list(pro_corpus_path)

In [17]:
# Sanity check: number of TXT files queued for preprocessing.
len(pro_total_corpus_path_list)

1128

In [19]:
# Preview the first corpus file with data_type "source".
# NOTE(review): judging by the recorded output, this shows the raw text of the
# first `line_length` line(s) - confirm against reading_txt.
pro_coprus_file = pro_total_corpus_path_list[0]
line_length = 1
data_type = "source"

reading_txt(pro_coprus_file, line_length, data_type)

서로 연통 조립될 수 있는 다수의 재배조가 주의 환경에 맞는 크기로 자유로이 변형 설치되도록 하여 배양수와 양분을 공급하는 점프나 공급장치등의 중복 설치를 배제토록 하고, 배양수가 수생식물의 생장에 필요한 온도로 조절유지되도록 하는 수경재배조를 제공하기 위하여 펌프에 의해 배양수가 공급되며 결합 수단에 의해 다른 재배조와 크기의 조절이 가능하도록 조립되는 재배조와; 이러한 재배조의 내면에 씌워져 조립 부분이 수밀을 유지할 수 있도록 하는 비닐과; 상기 재배조의 내부에 관로를 형성하도록 설치되어 흐르는 유체의 온도에 따라 배양수의 수온을 조절할 수 있도록 하는 온도조절관과; 상기 재배조의 상부에 위치하여 수생식물이 그 뿌리가 침수된 상태로 수면에 부유되도록 하는 재배구로 이루어지도록 한것.



In [20]:
# Preview the same corpus file with data_type "preprocessing".
# NOTE(review): judging by the recorded output, this shows the text after
# preprocessing (e.g. the ";" separators are gone) - confirm against reading_txt.
pro_coprus_file = pro_total_corpus_path_list[0]
line_length = 1
data_type = "preprocessing"

reading_txt(pro_coprus_file, line_length, data_type)

서로 연통 조립될 수 있는 다수의 재배조가 주의 환경에 맞는 크기로 자유로이 변형 설치되도록 하여 배양수와 양분을 공급하는 점프나 공급장치등의 중복 설치를 배제토록 하고, 배양수가 수생식물의 생장에 필요한 온도로 조절유지되도록 하는 수경재배조를 제공하기 위하여 펌프에 의해 배양수가 공급되며 결합 수단에 의해 다른 재배조와 크기의 조절이 가능하도록 조립되는 재배조와 이러한 재배조의 내면에 씌워져 조립 부분이 수밀을 유지할 수 있도록 하는 비닐과 상기 재배조의 내부에 관로를 형성하도록 설치되어 흐르는 유체의 온도에 따라 배양수의 수온을 조절할 수 있도록 하는 온도조절관과 상기 재배조의 상부에 위치하여 수생식물이 그 뿌리가 침수된 상태로 수면에 부유되도록 하는 재배구로 이루어지도록 한것.



In [52]:
# Start a local Ray runtime with 4 CPUs. ignore_reinit_error lets this cell be
# re-run while a runtime is already up instead of raising.
ray.init(num_cpus=4, ignore_reinit_error=True)


@ray.remote
def ray_preprocessing_text(source, corpus_path):
    """Ray remote task that forwards one text to ``preprocessing_text``.

    Args:
        source: The text passed as the first argument of ``preprocessing_text``.
        corpus_path: Path of the corpus file the text came from; forwarded unchanged.

    Returns:
        Whatever ``preprocessing_text(source, corpus_path)`` returns.
    """
    return preprocessing_text(source, corpus_path)

2023-05-29 15:05:51,000	INFO worker.py:1625 -- Started a local Ray instance.


In [None]:
def preprocessing_corpus_txt(pro_total_corpus_path_list, post_total_corpus_path_list):
    """Preprocess every line of each input corpus file with Ray and write the result.

    Each input ("pro") file is read and split into lines. The lines are
    submitted to ``ray_preprocessing_text`` in chunks of ``process_num`` tasks,
    and the sentences returned for every line are appended, newline-joined, to
    the matching output ("post") file. Every line of every file is processed,
    including files with fewer than ``process_num`` lines and a final partial
    chunk.

    Args:
        pro_total_corpus_path_list: Input TXT file paths.
        post_total_corpus_path_list: Output TXT file paths, paired
            positionally with the input paths.

    Returns:
        None. Output files are written to disk.
    """
    progress_length = len(pro_total_corpus_path_list)
    print("[Size]")
    print("The number of preprocessing corpus: " + str(progress_length))
    print("\n[Order]")

    # Number of Ray tasks submitted (and awaited) at a time.
    process_num = 10

    # The context manager closes the bar even if a file fails part-way through.
    with tqdm(total=progress_length) as pbar:
        for pro, post in zip(pro_total_corpus_path_list, post_total_corpus_path_list):

            with open(pro, 'r', encoding='utf-8') as f:
                lines = f.read().splitlines()

            sentence_list = []

            # Step through the file in fixed-size chunks; the slice shortens
            # itself on the last chunk, so no line is skipped or dropped.
            for start_line in range(0, len(lines), process_num):
                chunk_lines = lines[start_line:start_line + process_num]
                futures = [ray_preprocessing_text.remote(line, pro)
                           for line in chunk_lines]
                # ray.get returns results in submission order.
                results = ray.get(futures)
                sentence_list.extend(chain.from_iterable(results))

            # NOTE(review): append mode means a second run adds the same
            # sentences again; kept because it is not visible here whether
            # several inputs can share one output path - confirm and switch to
            # 'w' if the paths are one-to-one.
            with open(post, 'a', encoding='utf-8') as fp:
                fp.write("\n".join(sentence_list))

            pbar.update(1)

In [None]:
# Run the Ray-backed preprocessing over every input/output file pair.
# Requires the Ray runtime started by ray.init to be up.
preprocessing_corpus_txt(pro_total_corpus_path_list, post_total_corpus_path_list)

In [51]:
# Stop the local Ray runtime once preprocessing is finished.
ray.shutdown()

In [None]:
# Paths for the merge and de-duplication step.
# NOTE(review): the glob below uses the directory
# "AIHUB_automatic_patent_classification_data_post" and an "AIHUB_AIHUB_" file
# prefix, while the preprocessing inputs live under
# "automatic_patent_classification_data_pro" with a single "AIHUB_" prefix.
# Confirm this matches the output paths generated by
# make_pro_post_txt_file_path_list, otherwise the glob matches nothing.
preprocessing_corpus_path = "AIHUB_corpus/exploration/AIHUB_automatic_patent_classification_data_post/AIHUB_AIHUB_automatic_patent_classification_data_" +"*.txt"
merge_corpus_path = 'AIHUB_corpus/duplicate/AIHUB_automatic_patent_classification_data.txt'
deduplicate_corpus_path = 'AIHUB_corpus/AIHUB_automatic_patent_classification_data.txt'

In [None]:
# Merge the preprocessed files matched by preprocessing_corpus_path and
# de-duplicate them.
# NOTE(review): presumably the merged file goes to merge_corpus_path and the
# de-duplicated file to deduplicate_corpus_path - confirm against the helper.
merge_and_deduplicate_corpus_txt(preprocessing_corpus_path, merge_corpus_path, 
                                  deduplicate_corpus_path)