## AIHub JSON Parsing

### Development Environment

In [None]:
%pip install kss==3.7.3

If KSS raises an argument error after installation, restart the Jupyter kernel.

In [None]:
%pip install python-mecab-ko

KSS 3.7.3 is compatible with python-mecab-ko.

In [None]:
%pip install pandas

In [1]:
%pip install ray

Note: you may need to restart the kernel to use updated packages.


In [63]:
import re
import os
import kss
import ray
import json
from mecab import MeCab
import pandas as pd
from glob import glob
from itertools import chain

In [64]:
# Show the kernel's working directory; the relative dataset and corpus paths
# used in this notebook are resolved against it. Uses os.getcwd() rather than
# the bare `pwd` automagic, which stops working if automagic is disabled or a
# variable named `pwd` exists.
os.getcwd()

'D:\\AIHUB'

### Functions

In [65]:
def sorted_list(path_list):
    """Return the items ordered by length first, then in ascending order.

    Shorter strings come before longer ones; strings of equal length keep
    plain ascending order, so ``x_2`` sorts ahead of ``x_10``. The input is
    not modified.
    """
    return sorted(path_list, key=lambda item: (len(item), item))

In [66]:
def json_file_name_list(path_list):
    """Return the files matched by the first glob pattern, sorted.

    Only ``path_list[0]`` is used; any further patterns are ignored.

    Args:
        path_list: Sequence of glob patterns (``**`` is expanded recursively).

    Returns:
        list[str]: Matching paths ordered by `sorted_list`.

    Raises:
        IndexError: If `path_list` is empty.
    """
    # Expand only the pattern that is actually used instead of walking the
    # directory tree for every pattern and discarding all but the first.
    file_name = glob(path_list[0], recursive=True)

    return sorted_list(file_name)

In [67]:
def train_valid_json_file_name_list(path_list):
  """Expand the training and validation glob patterns into sorted file lists.

  `path_list` must hold exactly two patterns: the training pattern first and
  the validation pattern second. The split is purely positional.

  Returns:
    A ``(train_files, valid_files)`` tuple, each ordered by `sorted_list`.
  """
  matches_per_pattern = []
  for pattern in path_list:
    matches_per_pattern.append(glob(pattern, recursive=True))

  # Unpacking raises ValueError unless exactly two patterns were supplied.
  train_matches, valid_matches = matches_per_pattern

  return sorted_list(train_matches), sorted_list(valid_matches)

In [68]:
def divide_source_file_list(l, n):
  """Lazily yield consecutive slices of `l`, each holding at most `n` items.

  The final slice is shorter when ``len(l)`` is not a multiple of `n`.
  """
  yield from (l[start:start + n] for start in range(0, len(l), n))

In [69]:
def txt_file_name_list(source_file_nested_list, folder_corpus_type_name):
  """Build one ``<prefix><index>.txt`` name per entry of the nested list.

  Args:
    source_file_nested_list: Sequence whose length decides how many names
      are generated; its contents are not read.
    folder_corpus_type_name: String prefix placed before the running index.

  Returns:
    list[str]: Names numbered from 0 upward.
  """
  text_file_name_list = []
  for index in range(len(source_file_nested_list)):
    text_file_name_list.append(folder_corpus_type_name + str(index) + ".txt")

  return text_file_name_list

In [70]:
def post_txt_file_name_list(corpus_list):
  """Derive post-processing output paths from pre-processing corpus paths.

  Every occurrence of the substring "pro" in each path is replaced with
  "post" (not only the directory suffix), and the renamed paths are returned
  in `sorted_list` order.
  """
  renamed_paths = []
  for corpus_file in corpus_list:
    renamed_paths.append(corpus_file.replace("pro", "post"))

  return sorted_list(renamed_paths)

In [71]:
def formal_preprocessing_text(source):
    """Split `source` into sentences and keep the cleaned, mostly-Hangul ones.

    A sentence is kept only when it
      * starts with an ASCII letter, a digit, Hangul or a CJK ideograph,
      * ends with ".", "!" or "?",
      * has more than five whitespace-separated tokens, and
      * consists of at least 50% Hangul characters (spaces ignored) after
        cleaning.

    Cleaning removes ``[...]`` and ``{...}`` spans together with their content
    and replaces every character outside the allowed set with a space.

    Args:
        source: Raw text to split and clean.

    Returns:
        list[str]: Cleaned sentences in their original order.
    """
    preprocessing_sentence_list = []

    # Trim surrounding whitespace before sentence splitting.
    source = source.strip()

    # use_heuristic=False is the setting noted here as recommended for formal
    # text (wiki, news, essays); True is the one noted for informal text
    # (SNS, blogs, messages).
    for sentence in kss.split_sentences(source, use_heuristic=False,
                                        num_workers=32):
        # An empty chunk has no first/last character to inspect; skip it
        # instead of raising IndexError.
        if not sentence:
            continue

        starts_with_letter = re.search("^[A-Za-z0-9ㄱ-ㅎ가-힣一-鿕㐀-䶵豈-龎]", sentence[0]) is not None
        ends_with_terminator = re.match(r'[.]|[!]|[?]', sentence[-1]) is not None
        # Sentences of five tokens or fewer are dropped.
        long_enough = len(sentence.split()) > 5
        if not (starts_with_letter and ends_with_terminator and long_enough):
            continue

        # Remove square- and curly-bracketed spans including their content.
        sentence = re.sub(r"\[.*?\]|\{.*?\}", "",  sentence)

        # Replace every disallowed character with a space. Inside the class,
        # "+-." is a range from "+" to ".", so it admits "+", ",", "-" and ".".
        sentence = re.sub(r"[^A-Za-z0-9ㄱ-ㅎ가-힣一-鿕㐀-䶵豈-龎()+-.,]", " ", sentence)

        # Trim the spaces the substitution may have left at either end.
        sentence = sentence.strip()

        compact_sentence = sentence.replace(" ", "")
        total_length = len(compact_sentence)
        if total_length == 0:
            # Nothing left after cleaning; avoid dividing by zero below.
            continue

        hangeul_length = len(re.sub(r"[^ㄱ-ㅣ가-힣\s]", "", compact_sentence))
        hangeul_ratio = hangeul_length / total_length
        # Drop sentences in which less than half of the characters are Hangul.
        if hangeul_ratio >= 0.5:
            preprocessing_sentence_list.append(sentence)

    return preprocessing_sentence_list

### AIHUB 특허분야 자동분류 데이터

[Source](https://aihub.or.kr/aihubdata/data/view.do?currMenu=116&topMenu=100&aihubDataSe=ty&dataSetSn=547)

In [72]:
# Recursive glob patterns for the dataset's JSON files: the Training pattern
# first, the Validation pattern second (the order the helper expects).
path_list = ['AIHUB_특허 분야 자동분류 데이터/' + split_name + '/' + '/**/*.json'
             for split_name in ('Training', 'Validation')]
train_file_name, valid_file_name = train_valid_json_file_name_list(path_list)

In [73]:
# Total number of source JSON files across both splits.
the_number_of_file = len(train_file_name) + len(valid_file_name)
print("The number of file:", the_number_of_file)

# Group each split's file list into chunks of at most `n` files.
n = 20
print("The number of list element:", n)
train_file_nested_list, valid_file_nested_list = (
    list(divide_source_file_list(file_names, n))
    for file_names in (train_file_name, valid_file_name)
)

The number of file: 1128
The number of list element: 20


In [74]:
# One output-file name per group of source files; both splits share the same
# folder and base name and differ only in the "train_" / "valid_" suffix.
pro_name_prefix = "exploration/automatic_patent_classification_data_pro/AIHUB_automatic_patent_classification_data_"
train_text_file_name_list = txt_file_name_list(train_file_nested_list,
                                               pro_name_prefix + "train_")
valid_text_file_name_list = txt_file_name_list(valid_file_nested_list,
                                               pro_name_prefix + "valid_")

the_numer_of_txt_file = len(train_text_file_name_list) + len(valid_text_file_name_list)
print("The number of txt file:", the_numer_of_txt_file)

The number of txt file: 58


In [75]:
def list_length_checker(source_file_nested_list, batch_size):
    """Count how many batched txt files the given JSON sources will produce.

    For every JSON file, the non-null ``abstract`` values found under its
    ``dataset`` key are counted and divided into batches of `batch_size`;
    one txt file is expected per batch. A file with no abstracts therefore
    counts as zero txt files.

    Args:
        source_file_nested_list: Groups of JSON file paths. Each entry may be
            a list of paths or a single path string.
        batch_size: Maximum number of abstracts per txt file.

    Returns:
        tuple[int, list[int]]: The total number of txt files, and the
        per-JSON-file counts in processing order.
    """
    the_number_of_total_txt_file = 0
    the_number_of_txt_file_list = []

    for source_file_list in source_file_nested_list:
        # A bare path string is a single file, not a sequence of characters
        # to iterate over.
        if isinstance(source_file_list, str):
            source_file_list = [source_file_list]

        for i, source_file in enumerate(source_file_list):
            with open(source_file, 'r', encoding='utf-8') as one_json_file:
                one_json_sample = json.load(one_json_file)

            source_list = list(pd.DataFrame(one_json_sample['dataset'])['abstract'].dropna())

            # Ceiling division: an exact multiple of batch_size must not be
            # counted as needing one extra (empty) batch.
            the_number_of_txt_file = (len(source_list) + batch_size - 1) // batch_size

            # Report the sources that are large enough to fill a whole batch.
            if len(source_list) >= batch_size:
                print("File:", source_file)
                print("Index:", i, "  ", "Length of Source List:", len(source_list),
                      "  ", "The number of txt file:", the_number_of_txt_file, "\n")

            the_number_of_txt_file_list.append(the_number_of_txt_file)
            the_number_of_total_txt_file += the_number_of_txt_file

    print("Batch Size:", batch_size)
    print("The number of txt file:", the_number_of_total_txt_file)

    return the_number_of_total_txt_file, the_number_of_txt_file_list

In [76]:
def make_corpus_txt_with_batch_list(source_file_nested_list,
                                    text_file_name_list,
                                    batch_size, the_number_of_txt_file_list):
  """Write the abstracts of every JSON source to batched txt files.

  For each JSON file, the non-null ``abstract`` values under its ``dataset``
  key are split into batches of at most `batch_size` and each batch is
  written, one abstract per line, to
  ``AIHUB_corpus/<group name without ".txt">_<running number>.txt``.
  The running number counts batches across all groups and starts at 1.

  Args:
    source_file_nested_list: Groups of JSON file paths. Each entry may be a
      list of paths or a single path string.
    text_file_name_list: One base file name (ending in ".txt") per group,
      index-aligned with `source_file_nested_list`.
    batch_size: Maximum number of abstracts per txt file.
    the_number_of_txt_file_list: Per-source txt-file counts; only their sum
      is printed as a size summary.

  Returns:
    None. Progress is printed and files are written to disk.
  """
  print("[Size]")
  print("The number of preprocessing corpus: " + str(sum(the_number_of_txt_file_list)))
  print("\n[Order]")
  num = 0
  for i, source_file_list in enumerate(source_file_nested_list):
    # A bare path string is a single file, not a sequence of characters.
    if isinstance(source_file_list, str):
      source_file_list = [source_file_list]

    for source_file in source_file_list:
      with open(source_file, 'r', encoding='utf-8') as one_json_file:
        one_json_sample = json.load(one_json_file)

      source_list = list(pd.DataFrame(one_json_sample['dataset'])['abstract'].dropna())

      for start in range(0, len(source_list), batch_size):
        source_batch = source_list[start:start + batch_size]

        num += 1
        print(str(num), end=" ")

        text_file_path = 'AIHUB_corpus/' + text_file_name_list[i][:-4] + "_" + str(num) + ".txt"
        # Create the target folder on first use so a fresh checkout works.
        os.makedirs(os.path.dirname(text_file_path), exist_ok=True)
        # "w" rather than "a": re-running must not append a second copy of
        # the same batch to an existing file.
        with open(text_file_path, "w", encoding='utf-8') as fp:
          fp.write("\n".join(source_batch))

In [78]:
# Count the txt files the training split will produce at this batch size.
batch_size = 1000
the_number_of_train_txt_file, the_number_of_train_txt_file_list = list_length_checker(
    source_file_nested_list=train_file_nested_list,
    batch_size=batch_size,
)

Batch Size: 1000
The number of txt file: 564


In [None]:
# Write the training split's abstracts to batched txt files.
batch_size = 1000
make_corpus_txt_with_batch_list(
    source_file_nested_list=train_file_nested_list,
    text_file_name_list=train_text_file_name_list,
    batch_size=batch_size,
    the_number_of_txt_file_list=the_number_of_train_txt_file_list,
)

In [79]:
# Count the txt files the validation split will produce at this batch size.
batch_size = 1000
the_number_of_valid_txt_file, the_number_of_valid_txt_file_list = list_length_checker(
    source_file_nested_list=valid_file_nested_list,
    batch_size=batch_size,
)

Batch Size: 1000
The number of txt file: 564


In [53]:
# Write the validation split's abstracts to batched txt files.
batch_size = 1000
make_corpus_txt_with_batch_list(
    source_file_nested_list=valid_file_nested_list,
    text_file_name_list=valid_text_file_name_list,
    batch_size=batch_size,
    the_number_of_txt_file_list=the_number_of_valid_txt_file_list,
)

[Size]
The numnber of preprocessing corpus: 564

[Order]
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263

In [80]:
# Collect every pre-processing ("pro") corpus file. glob() gives no ordering
# guarantee, so sort with the same helper post_txt_file_name_list() applies to
# its result; otherwise zipping the pro and post lists can pair a source file
# with the wrong output name.
pro_total_corpus_list = sorted_list(
    glob("AIHUB_corpus/exploration/automatic_patent_classification_data_pro/AIHUB_automatic_patent_classification_data_" + "*.txt")
)

In [81]:
# Number of pre-processing ("pro") corpus files matched by the glob.
len(pro_total_corpus_list)

1128

In [82]:
# Output paths for the cleaned corpus: the pro paths with "pro" -> "post".
post_total_corpus_list = post_txt_file_name_list(corpus_list=pro_total_corpus_list)

In [83]:
# Preview: print the first line of the first pre-processing corpus file.
with open(pro_total_corpus_list[0], 'r', encoding='utf-8') as f:
    lines = f.read().splitlines()

line_num = len(lines)
line_list = lines[:1]

for line in line_list:
    print(line, end="\n\n")

서로 연통 조립될 수 있는 다수의 재배조가 주의 환경에 맞는 크기로 자유로이 변형 설치되도록 하여 배양수와 양분을 공급하는 점프나 공급장치등의 중복 설치를 배제토록 하고, 배양수가 수생식물의 생장에 필요한 온도로 조절유지되도록 하는 수경재배조를 제공하기 위하여 펌프에 의해 배양수가 공급되며 결합 수단에 의해 다른 재배조와 크기의 조절이 가능하도록 조립되는 재배조와; 이러한 재배조의 내면에 씌워져 조립 부분이 수밀을 유지할 수 있도록 하는 비닐과; 상기 재배조의 내부에 관로를 형성하도록 설치되어 흐르는 유체의 온도에 따라 배양수의 수온을 조절할 수 있도록 하는 온도조절관과; 상기 재배조의 상부에 위치하여 수생식물이 그 뿌리가 침수된 상태로 수면에 부유되도록 하는 재배구로 이루어지도록 한것.



In [84]:
# Preview: clean the first line of the first pre-processing corpus file and
# print every sentence that survives the filter.
with open(pro_total_corpus_list[0], 'r', encoding='utf-8') as f:
    lines = f.read().splitlines()

line_num = len(lines)
line_list = []
for line in lines[:1]:
    sentences = formal_preprocessing_text(line)
    line_list.extend(sentences)

for line in line_list:
    print(line, end="\n\n")

서로 연통 조립될 수 있는 다수의 재배조가 주의 환경에 맞는 크기로 자유로이 변형 설치되도록 하여 배양수와 양분을 공급하는 점프나 공급장치등의 중복 설치를 배제토록 하고, 배양수가 수생식물의 생장에 필요한 온도로 조절유지되도록 하는 수경재배조를 제공하기 위하여 펌프에 의해 배양수가 공급되며 결합 수단에 의해 다른 재배조와 크기의 조절이 가능하도록 조립되는 재배조와  이러한 재배조의 내면에 씌워져 조립 부분이 수밀을 유지할 수 있도록 하는 비닐과  상기 재배조의 내부에 관로를 형성하도록 설치되어 흐르는 유체의 온도에 따라 배양수의 수온을 조절할 수 있도록 하는 온도조절관과  상기 재배조의 상부에 위치하여 수생식물이 그 뿌리가 침수된 상태로 수면에 부유되도록 하는 재배구로 이루어지도록 한것.



In [87]:
# Start (or reuse) the local Ray instance. ignore_reinit_error lets this cell
# be re-run without first calling ray.shutdown().
ray.init(ignore_reinit_error=True)

@ray.remote
def formal_preprocessing_text(source):
    """Ray task: split `source` into sentences and keep the cleaned ones.

    This definition replaces any plain function of the same name; from here on
    it must be invoked as ``formal_preprocessing_text.remote(text)`` and the
    result fetched with ``ray.get``.

    A sentence is kept only when it
      * starts with an ASCII letter, a digit, Hangul or a CJK ideograph,
      * ends with ".", "!" or "?",
      * has more than five whitespace-separated tokens, and
      * consists of at least 50% Hangul characters (spaces ignored) after
        cleaning.

    Cleaning removes ``[...]`` and ``{...}`` spans together with their content
    and replaces every character outside the allowed set with a space.

    Args:
        source: Raw text to split and clean.

    Returns:
        list[str]: Cleaned sentences in their original order.
    """
    preprocessing_sentence_list = []

    # Trim surrounding whitespace before sentence splitting.
    source = source.strip()

    # use_heuristic=False is the setting noted here as recommended for formal
    # text (wiki, news, essays); True is the one noted for informal text
    # (SNS, blogs, messages).
    # NOTE(review): every Ray task asks kss for 32 workers; confirm this does
    # not oversubscribe the machine when several tasks run at once.
    for sentence in kss.split_sentences(source, use_heuristic=False,
                                        num_workers=32):
        # An empty chunk has no first/last character to inspect; skip it
        # instead of raising IndexError.
        if not sentence:
            continue

        starts_with_letter = re.search("^[A-Za-z0-9ㄱ-ㅎ가-힣一-鿕㐀-䶵豈-龎]", sentence[0]) is not None
        ends_with_terminator = re.match(r'[.]|[!]|[?]', sentence[-1]) is not None
        # Sentences of five tokens or fewer are dropped.
        long_enough = len(sentence.split()) > 5
        if not (starts_with_letter and ends_with_terminator and long_enough):
            continue

        # Remove square- and curly-bracketed spans including their content.
        sentence = re.sub(r"\[.*?\]|\{.*?\}", "",  sentence)

        # Replace every disallowed character with a space. Inside the class,
        # "+-." is a range from "+" to ".", so it admits "+", ",", "-" and ".".
        sentence = re.sub(r"[^A-Za-z0-9ㄱ-ㅎ가-힣一-鿕㐀-䶵豈-龎()+-.,]", " ", sentence)

        # Trim the spaces the substitution may have left at either end.
        sentence = sentence.strip()

        compact_sentence = sentence.replace(" ", "")
        total_length = len(compact_sentence)
        if total_length == 0:
            # Nothing left after cleaning; avoid dividing by zero below.
            continue

        hangeul_length = len(re.sub(r"[^ㄱ-ㅣ가-힣\s]", "", compact_sentence))
        hangeul_ratio = hangeul_length / total_length
        # Drop sentences in which less than half of the characters are Hangul.
        if hangeul_ratio >= 0.5:
            preprocessing_sentence_list.append(sentence)

    return preprocessing_sentence_list

2023-05-26 16:06:48,530	INFO worker.py:1625 -- Started a local Ray instance.


In [None]:
# Clean every pre-processing corpus file with the Ray task and write the
# surviving sentences to the matching post-processing file.
print("[Size]")
print("The number of preprocessing corpus: " + str(len(pro_total_corpus_list)))
print("\n[Order]")
num = 0
process_num = 10  # number of lines submitted to Ray per round

# Sort the pro list with the same helper used for the post list so that each
# source file is paired with its own output name.
for pro, post in zip(sorted_list(pro_total_corpus_list), post_total_corpus_list):

    with open(pro, 'r', encoding='utf-8') as f:
        lines = f.read().splitlines()

    # Accumulate the sentences of EVERY round. Starting from an empty list per
    # file also covers files with fewer than `process_num` lines.
    sentence_list = []
    for start_line in range(0, len(lines), process_num):
        end_line = start_line + process_num
        futures = [formal_preprocessing_text.remote(line)
                   for line in lines[start_line:end_line]]
        # ray.get returns results in submission order, so line order is kept.
        for result in ray.get(futures):
            sentence_list.extend(result)

    num += 1
    print(str(num), end=" ")

    post_dir = os.path.dirname(post)
    if post_dir:
        # Create the output folder on first use.
        os.makedirs(post_dir, exist_ok=True)

    # "w" rather than "a": re-running must not append a second copy.
    with open(post, 'w', encoding='utf-8') as fp:
        fp.write("\n".join(sentence_list))

[Size]
The numnber of preprocessing corpus: 1128

[Order]
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 26

In [86]:
# Stop the local Ray instance once the parallel preprocessing is finished.
ray.shutdown()

In [None]:
# Collect the cleaned ("post") corpus files. The pattern mirrors where they
# are written: the pro paths with "pro" replaced by "post". Sorting makes the
# order of the merged corpus deterministic.
corpus_list = sorted(
    glob("AIHUB_corpus/exploration/automatic_patent_classification_data_post/AIHUB_automatic_patent_classification_data_" + "*.txt")
)

In [None]:
# Concatenate all post-processing files into one corpus file. Both sides are
# opened as UTF-8 explicitly, because the parts were written as UTF-8 and the
# platform default encoding may differ.
with open('AIHUB_corpus/AIHUB_automatic_patent_classification_data.txt', 'w', encoding='utf-8') as f:
    for corpus in corpus_list:
        line = ""
        with open(corpus, encoding='utf-8') as text:
            for line in text:
                f.write(line)
        # The parts are written without a trailing newline; terminate the last
        # line so it does not fuse with the first line of the next part.
        if line and not line.endswith("\n"):
            f.write("\n")

In [None]:
# Remove duplicate sentences while keeping first-occurrence order.
# NOTE(review): the input is read from AIHUB_corpus/duplicate/, which no
# visible cell writes to; presumably the merged corpus is moved there by hand
# before this step. Confirm.
# Read the input completely BEFORE opening the output: opening with 'w'
# truncates the target, so a missing input must fail first.
with open('AIHUB_corpus/duplicate/AIHUB_automatic_patent_classification_data.txt', encoding='utf-8') as f2:
    lines = f2.read().splitlines()

# dict.fromkeys keeps insertion order, so the first occurrence wins.
single_sentence_dict = dict.fromkeys(lines)
single_sentence_list = list(single_sentence_dict)

with open('AIHUB_corpus/AIHUB_automatic_patent_classification_data.txt', 'w', encoding='utf-8') as f1:
    f1.write("\n".join(single_sentence_list))