## AIHub JSON Preprocessing

### Development Environment

In [None]:
%pip install kss==3.7.3

If KSS raises an argument error after installation, restart the Jupyter kernel (runtime).

In [None]:
%pip install python-mecab-ko

KSS 3.7.3 is compatible with python-mecab-ko.

In [None]:
%pip install pandas

In [None]:
%pip install ray

In [1]:
import re
import os
import kss
import ray
import json
import time
from time import sleep
from tqdm import tqdm
from mecab import MeCab
import pandas as pd
from glob import glob
from itertools import chain

In [2]:
# Show the working directory; every relative path in this notebook resolves against it.
# os.getcwd() is used instead of the bare `pwd` automagic so the cell does not depend
# on IPython's automagic setting or on no variable being named `pwd`.
os.getcwd()

'D:\\AIHUB'

### AIHUB 법률 규정 (판결서 약관 등) 텍스트 분석 데이터

[Source](https://aihub.or.kr/aihubdata/data/view.do?currMenu=116&topMenu=100&aihubDataSe=ty&dataSetSn=580)

#### Convert JSON File to TXT File

In [None]:
from data_preprocessing import make_train_valid_json_txt_file_path_list
from data_preprocessing import divide_source_file_list
from extract_source_text import make_sources

In [10]:
# Recursive glob patterns for the raw JSON files, one per split (Training, Validation).
json_path_list = [
    f"AIHUB_법률 규정 (판결서 약관 등) 텍스트 분석 데이터/{split_dir}/**/*.json"
    for split_dir in ("Training", "Validation")
]
# Path prefixes for the extracted txt files, in the same split order as json_path_list.
txt_path_list = [
    "exploration/legal_regulations_(such_as_terms_and_conditions_of_judgment)_text_analysis_data_pro/AIHUB_legal_regulations_(such_as_terms_and_conditions_of_judgment)_text_analysis_data_" + split_suffix
    for split_suffix in ("train_", "valid_")
]

In [11]:
# Build the JSON input lists and txt output path lists for both splits.
(
    train_json_file_list,
    valid_json_file_list,
    train_txt_file_path_list,
    valid_txt_file_path_list,
) = make_train_valid_json_txt_file_path_list(json_path_list, txt_path_list)

The number of file: 9450


In [12]:
# Save the row-position -> source JSON file mapping for the training split.
source_file_index_df = pd.DataFrame({'source_file_name': train_json_file_list})
source_file_index_df.to_excel(
    "source_file_index/legal_regulations_(such_as_terms_and_conditions_of_judgment)_text_analysis_data_source_train_file_index.xlsx",
    index=False,
)

# Same mapping for the validation split (the DataFrame name is reused, as before).
source_file_index_df = pd.DataFrame({'source_file_name': valid_json_file_list})
source_file_index_df.to_excel(
    "source_file_index/legal_regulations_(such_as_terms_and_conditions_of_judgment)_text_analysis_data_source_valid_file_index.xlsx",
    index=False,
)

In [14]:
def count_number_of_txt_file_with_batch_list(source_file_list, batch_size):
    """Count the sources extracted from each JSON file and report them per file.

    Every file in ``source_file_list`` is loaded as UTF-8 JSON and passed to
    ``make_sources``; the number of items it yields is recorded. A per-file
    report is written to an Excel workbook under ``source_file_by_batch/``.

    Args:
        source_file_list: Paths of the JSON files to inspect.
        batch_size: Number of sources that make up one batch txt file.

    Returns:
        A tuple ``(the_number_of_total_txt_file, the_number_of_txt_file_list)``
        where the first item is ``total source count // batch_size`` and the
        second is the list of per-file source counts, in input order.

    Notes:
        * The report is built from a list of rows instead of assigning rows
          with ``.loc`` onto a dummy first row, so an empty input no longer
          leaves a bogus all-zero row or raises ``NameError``.
        * The 'The Number of txt File' column is always 1, as before.
        * The workbook name is chosen from the *last* path in the list
          ('rain' -> train, 'alid' -> valid, otherwise the generic name);
          an empty list uses the generic name.
    """
    report_columns = ['File', 'Length of Source List',
                      'The Number of txt File', 'Description']
    report_rows = []
    the_number_of_txt_file_list = []

    for source_file in source_file_list:

        with open(source_file, 'r', encoding='utf-8') as one_json_file:
            one_json_sample = json.load(one_json_file)

        sources = make_sources(source_file, one_json_sample)
        source_count = sum(1 for _ in sources)

        if source_count >= batch_size:
            description = ""
        else:
            description = "not subject of batch. small source list."

        report_rows.append([source_file, source_count, 1, description])
        the_number_of_txt_file_list.append(source_count)

    the_number_of_total_txt_file = sum(the_number_of_txt_file_list) // batch_size
    print("Batch Size:", batch_size)
    print("The number of txt file:", the_number_of_total_txt_file)

    source_file_by_batch_df = pd.DataFrame(report_rows, columns=report_columns)
    source_file_by_batch_df = source_file_by_batch_df.astype({'Length of Source List':'int',
                                                              'The Number of txt File':'int'})

    # The split is inferred from the last processed path, matching the previous behaviour.
    last_source_file = source_file_list[-1] if len(source_file_list) > 0 else ""

    if 'rain' in last_source_file:
        report_path = "source_file_by_batch/legal_regulations_(such_as_terms_and_conditions_of_judgment)_text_analysis_data_train.xlsx"
    elif 'alid' in last_source_file:
        report_path = "source_file_by_batch/legal_regulations_(such_as_terms_and_conditions_of_judgment)_text_analysis_data_valid.xlsx"
    else:
        report_path = "source_file_by_batch/legal_regulations_(such_as_terms_and_conditions_of_judgment)_text_analysis_data.xlsx"

    source_file_by_batch_df.to_excel(report_path, index=False)

    return the_number_of_total_txt_file, the_number_of_txt_file_list

In [15]:
def write_jsontext_to_txt_file_with_batch_list(source_file_list,
                                    text_file_path_list,
                                    batch_size, the_number_of_txt_file_list):
    """Extract source text from JSON files and write it to txt files in batches.

    The first element of every item returned by ``make_sources`` is buffered.
    Before a file's sources are added, a buffer that already holds at least
    ``batch_size`` lines is written to the txt path belonging to that file's
    index and then cleared. Whatever is still buffered after the last file is
    written to the txt path of the last index.

    Args:
        source_file_list: Paths of the JSON files to read (UTF-8).
        text_file_path_list: Output paths, indexed like ``source_file_list``.
            The last four characters are replaced by ".txt" and the result is
            placed under ``AIHUB_corpus/``.
        batch_size: Minimum number of buffered lines that triggers a write.
        the_number_of_txt_file_list: Per-file source counts; only used to
            estimate the progress-bar total.

    Notes:
        Fixes to the previous implementation:
        * the final flush wrote ``"\\n".join(source_list[0])`` -- only the
          first buffered string, split into characters -- and dropped the rest;
        * when the last file triggered a full-batch write, that file's own
          sources were buffered but never written.
        Files are opened in append mode, as before, so re-running the cell
        appends to existing outputs.
    """
    progress_length = sum(the_number_of_txt_file_list) // batch_size
    print("[Size]")
    print("The number of preprocessing corpus: " + str(progress_length))
    print("\n[Order]")

    def _append_lines(file_index, lines, leading_newline=False):
        # Append one batch to the txt file that belongs to `file_index`.
        output_path = 'AIHUB_corpus/' + text_file_path_list[file_index][:-4] + ".txt"
        with open(output_path, "a", encoding='utf-8') as fp:
            if leading_newline:
                fp.write("\n")
            fp.write("\n".join(lines))

    source_list = []
    last_written_index = None

    # The total is only an estimate: one extra update occurs when a remainder is flushed.
    with tqdm(total=progress_length) as pbar:
        for i, source_file in enumerate(source_file_list):

            with open(source_file, 'r', encoding='utf-8') as one_json_file:
                one_json_sample = json.load(one_json_file)

            sources = make_sources(source_file, one_json_sample)

            if len(source_list) >= batch_size:
                _append_lines(i, source_list)
                last_written_index = i
                pbar.update(1)
                source_list = []

            source_list.extend(source[0] for source in sources)

        if source_list:
            final_index = len(source_file_list) - 1
            # If a full batch was just written to this same file, separate the
            # remainder from it so the two lines at the seam do not merge.
            _append_lines(final_index, source_list,
                          leading_newline=(last_written_index == final_index))
            pbar.update(1)

In [16]:
# Count sources per training JSON file and write the per-file report.
batch_size = 1000
the_number_of_txt_file, the_number_of_txt_file_list = count_number_of_txt_file_with_batch_list(
    source_file_list=train_json_file_list,
    batch_size=batch_size,
)

Batch Size: 1000
The number of txt file: 30


In [17]:
# Reload the training report from disk and display it.
source_file_by_batch_train_df = pd.read_excel(
    'source_file_by_batch/legal_regulations_(such_as_terms_and_conditions_of_judgment)_text_analysis_data_train.xlsx',
    engine='openpyxl',
)
source_file_by_batch_train_df

Unnamed: 0.1,Unnamed: 0,File,Length of Source List,The Number of txt File,Description
0,0,AIHUB_법률 규정 (판결서 약관 등) 텍스트 분석 데이터/Training\라벨링...,2,1,not subject of batch. small source list.
1,1,AIHUB_법률 규정 (판결서 약관 등) 텍스트 분석 데이터/Training\라벨링...,2,1,not subject of batch. small source list.
2,2,AIHUB_법률 규정 (판결서 약관 등) 텍스트 분석 데이터/Training\라벨링...,2,1,not subject of batch. small source list.
3,3,AIHUB_법률 규정 (판결서 약관 등) 텍스트 분석 데이터/Training\라벨링...,2,1,not subject of batch. small source list.
4,4,AIHUB_법률 규정 (판결서 약관 등) 텍스트 분석 데이터/Training\라벨링...,2,1,not subject of batch. small source list.
...,...,...,...,...,...
8395,8395,AIHUB_법률 규정 (판결서 약관 등) 텍스트 분석 데이터/Training\라벨링...,35,1,not subject of batch. small source list.
8396,8396,AIHUB_법률 규정 (판결서 약관 등) 텍스트 분석 데이터/Training\라벨링...,15,1,not subject of batch. small source list.
8397,8397,AIHUB_법률 규정 (판결서 약관 등) 텍스트 분석 데이터/Training\라벨링...,77,1,not subject of batch. small source list.
8398,8398,AIHUB_법률 규정 (판결서 약관 등) 텍스트 분석 데이터/Training\라벨링...,55,1,not subject of batch. small source list.


In [18]:
# Write the training sources to batch txt files.
batch_size = 1000
write_jsontext_to_txt_file_with_batch_list(
    source_file_list=train_json_file_list,
    text_file_path_list=train_txt_file_path_list,
    batch_size=batch_size,
    the_number_of_txt_file_list=the_number_of_txt_file_list,
)

[Size]
The number of preprocessing corpus: 30

[Order]


  0%|          | 0/30 [00:00<?, ?it/s]

31it [00:10,  2.90it/s]                        


In [19]:
# Count sources per validation JSON file and write the per-file report.
batch_size = 1000
the_number_of_txt_file, the_number_of_txt_file_list = count_number_of_txt_file_with_batch_list(
    source_file_list=valid_json_file_list,
    batch_size=batch_size,
)

Batch Size: 1000
The number of txt file: 4


In [164]:
# Reload the validation report from disk and display it.
source_file_by_batch_valid_df = pd.read_excel(
    'source_file_by_batch/legal_regulations_(such_as_terms_and_conditions_of_judgment)_text_analysis_data_valid.xlsx',
    engine='openpyxl',
)
source_file_by_batch_valid_df

Unnamed: 0.1,Unnamed: 0,File,Length of Source List,The Number of txt File,Description
0,0,AIHUB_법률 규정 (판결서 약관 등) 텍스트 분석 데이터/Validation\라...,2,1,not subject of batch. small source list.
1,1,AIHUB_법률 규정 (판결서 약관 등) 텍스트 분석 데이터/Validation\라...,2,1,not subject of batch. small source list.
2,2,AIHUB_법률 규정 (판결서 약관 등) 텍스트 분석 데이터/Validation\라...,2,1,not subject of batch. small source list.
3,3,AIHUB_법률 규정 (판결서 약관 등) 텍스트 분석 데이터/Validation\라...,2,1,not subject of batch. small source list.
4,4,AIHUB_법률 규정 (판결서 약관 등) 텍스트 분석 데이터/Validation\라...,2,1,not subject of batch. small source list.
...,...,...,...,...,...
1045,1045,AIHUB_법률 규정 (판결서 약관 등) 텍스트 분석 데이터/Validation\라...,56,1,not subject of batch. small source list.
1046,1046,AIHUB_법률 규정 (판결서 약관 등) 텍스트 분석 데이터/Validation\라...,36,1,not subject of batch. small source list.
1047,1047,AIHUB_법률 규정 (판결서 약관 등) 텍스트 분석 데이터/Validation\라...,28,1,not subject of batch. small source list.
1048,1048,AIHUB_법률 규정 (판결서 약관 등) 텍스트 분석 데이터/Validation\라...,30,1,not subject of batch. small source list.


In [134]:
# Write the validation sources to batch txt files.
# Uses the_number_of_txt_file_list from the validation counting cell.
batch_size = 1000
write_jsontext_to_txt_file_with_batch_list(
    source_file_list=valid_json_file_list,
    text_file_path_list=valid_txt_file_path_list,
    batch_size=batch_size,
    the_number_of_txt_file_list=the_number_of_txt_file_list,
)

[Size]
The number of preprocessing corpus: 4

[Order]


100%|██████████| 4/4 [00:01<00:00,  3.37it/s]


#### Preprocess TXT File

In [None]:
from sentence_segmentation import preprocessing_text
from data_preprocessing import make_pro_post_txt_file_path_list
from data_preprocessing import merge_and_deduplicate_corpus_txt
from reading_data import reading_txt

In [138]:
# Glob pattern for the extracted ("pro") txt files of both splits.
pro_corpus_path = (
    "AIHUB_corpus/exploration/legal_regulations_(such_as_terms_and_conditions_of_judgment)_text_analysis_data_pro/"
    "AIHUB_legal_regulations_(such_as_terms_and_conditions_of_judgment)_text_analysis_data_*.txt"
)
(
    pro_total_corpus_path_list,
    post_total_corpus_path_list,
) = make_pro_post_txt_file_path_list(pro_corpus_path)

In [139]:
# Number of "pro" corpus paths returned by make_pro_post_txt_file_path_list.
len(pro_total_corpus_path_list)

35

In [141]:
# Preview the first corpus file with data_type="source".
# NOTE(review): presumably this shows the raw text before preprocessing -- confirm in reading_data.
pro_coprus_file, line_length, data_type = pro_total_corpus_path_list[0], 1, "source"

reading_txt(pro_coprus_file, line_length, data_type)

제2조(보증금액)

 ① 이 보증서에 의한 보증금액은 채권자의 채무자에 대한 보증부대출 예정금액에 보증비율을 곱한 금액으로 합니다.



In [142]:
# Preview the same corpus file with data_type="preprocessing".
# NOTE(review): presumably this shows the text after preprocessing -- confirm in reading_data.
pro_coprus_file, line_length, data_type = pro_total_corpus_path_list[0], 1, "preprocessing"

reading_txt(pro_coprus_file, line_length, data_type)

이 보증서에 의한 보증금액은 채권자의 채무자에 대한 보증부대출 예정금액에 보증비율을 곱한 금액으로 합니다.



In [155]:
# Start a local Ray instance limited to 4 CPUs.
# ignore_reinit_error=True keeps the cell re-runnable while Ray is already running.
ray.init(num_cpus=4, ignore_reinit_error=True)

@ray.remote
def ray_preprocessing_text(source, corpus_path):
    """Ray remote task that forwards one source text to ``preprocessing_text``.

    Args:
        source: The text passed as the first argument to ``preprocessing_text``.
        corpus_path: The corpus path passed as its second argument.

    Returns:
        Whatever ``preprocessing_text(source, corpus_path)`` returns.
    """
    return preprocessing_text(source, corpus_path)

2023-07-11 14:08:05,201	INFO worker.py:1625 -- Started a local Ray instance.


In [156]:
def preprocessing_corpus_txt(pro_total_corpus_path_list, post_total_corpus_path_list,
                             process_num=10):
    """Preprocess every line of each "pro" corpus file with Ray and append the result.

    Each input file is read as UTF-8 and split into lines. The lines are sent
    to Ray in consecutive chunks of ``process_num``; the iterables returned for
    each line are flattened, in line order, into one list that is joined with
    newlines and appended to the paired "post" file.

    Args:
        pro_total_corpus_path_list: Input txt paths.
        post_total_corpus_path_list: Output txt paths, paired with the inputs
            by position (``zip`` stops at the shorter list).
        process_num: Number of lines submitted to Ray per chunk. Defaults to
            10, the value that was previously hard-coded.

    Raises:
        ValueError: If ``process_num`` is smaller than 1.

    Notes:
        Fixes to the previous implementation:
        * on the last loop iteration the results of the current chunk were
          overwritten by the tail chunk, dropping ``process_num`` lines per file;
        * files with fewer than ``2 * process_num`` lines were not processed.
        Output files are opened in append mode, as before.
    """
    if process_num < 1:
        raise ValueError("process_num must be at least 1")

    # NOTE(review): the original called `preprocessing_text.remote`, but this
    # notebook defines the Ray wrapper as `ray_preprocessing_text`. Whether the
    # imported `preprocessing_text` is itself a Ray remote function cannot be
    # told from here, so use it when it exposes `.remote` and fall back to the
    # notebook's wrapper otherwise.
    if hasattr(preprocessing_text, "remote"):
        remote_preprocess = preprocessing_text
    else:
        remote_preprocess = ray_preprocessing_text

    progress_length = len(pro_total_corpus_path_list)
    print("[Size]")
    print("The number of preprocessing corpus: " + str(progress_length))
    print("\n[Order]")

    with tqdm(total=progress_length) as pbar:
        for pro, post in zip(pro_total_corpus_path_list, post_total_corpus_path_list):

            with open(pro, 'r', encoding='utf-8') as f:
                lines = f.read().splitlines()

            sentence_list = []
            for start_line in range(0, len(lines), process_num):
                chunk = lines[start_line:start_line + process_num]
                futures = [remote_preprocess.remote(line, pro) for line in chunk]
                # ray.get returns results in the order the futures were submitted.
                for sentences in ray.get(futures):
                    sentence_list.extend(sentences)

            with open(post, 'a', encoding='utf-8') as fp:
                fp.write("\n".join(sentence_list))

            pbar.update(1)

In [157]:
# Preprocess every "pro" file and append the result to its paired "post" file.
preprocessing_corpus_txt(
    pro_total_corpus_path_list=pro_total_corpus_path_list,
    post_total_corpus_path_list=post_total_corpus_path_list,
)

[Size]
The number of preprocessing corpus: 35

[Order]


100%|██████████| 35/35 [03:55<00:00,  6.72s/it]


In [154]:
# Shut down the local Ray instance once preprocessing has finished.
ray.shutdown()

In [None]:
# Glob pattern for the preprocessed ("post") txt files.
preprocessing_corpus_path = (
    "AIHUB_corpus/exploration/legal_regulations_(such_as_terms_and_conditions_of_judgment)_text_analysis_data_post/"
    "AIHUB_legal_regulations_(such_as_terms_and_conditions_of_judgment)_text_analysis_data_*.txt"
)
# Destination of the merged corpus (may still contain duplicates).
merge_corpus_path = "AIHUB_corpus/duplicate/AIHUB_legal_regulations_(such_as_terms_and_conditions_of_judgment)_text_analysis_data.txt"
# Destination of the deduplicated corpus.
deduplicate_corpus_path = "AIHUB_corpus/AIHUB_legal_regulations_(such_as_terms_and_conditions_of_judgment)_text_analysis_data.txt"

In [None]:
# Merge the "post" files into one corpus, then write the deduplicated version.
merge_and_deduplicate_corpus_txt(
    preprocessing_corpus_path,
    merge_corpus_path,
    deduplicate_corpus_path,
)