## AIHub Json Parsing

### Development Environment

In [None]:
%pip install kss==3.7.3

KSS Argument Error: Restart Jupyter Kernel Runtime

In [None]:
%pip install python-mecab-ko

KSS 3.7.3 matches python-mecab-ko

In [None]:
%pip install pandas

In [None]:
%pip install ray

In [1]:
import re
import os
import kss
import ray
import json
from mecab import MeCab
import pandas as pd
from glob import glob
from itertools import chain

In [2]:
pwd

'c:\\Users\\MinSeok\\Documents\\AIHUB'

### Function

In [3]:
def sorted_list(path_list):
    path_list = sorted(path_list, reverse=False)
    path_list = sorted(path_list, key=len)
    
    return path_list

In [None]:
def json_file_name_list(path_list):
    
    file_name  = [glob(i, recursive = True) for i in path_list][0]
    file_name = sorted_list(file_name)
    
    return file_name

In [4]:
def train_valid_json_file_name_list(path_list):

  train_file_name, valid_file_name = [glob(i, recursive = True) if 'rain' in i
                                      else glob(i, recursive = True)
                                      for i in path_list]

  train_file_name = sorted_list(train_file_name)
  valid_file_name = sorted_list(valid_file_name)
    
  return train_file_name, valid_file_name

In [5]:
def divide_source_file_list(l, n): 
    
  for i in range(0, len(l), n): 
    yield l[i:i + n] 

In [None]:
def txt_file_name_list(source_file_nested_list, folder_corpus_type_name):

  text_file_name_list = [folder_corpus_type_name + str(i) + ".txt"
                              for i in range(len(source_file_nested_list))]
    
  return text_file_name_list

In [6]:
def post_txt_file_name_list(corpus_list):
   
  post_corpus_list = [corpus_file.replace("pro", "post")
                      for corpus_file in corpus_list]

  post_corpus_list = sorted_list(post_corpus_list)

  return post_corpus_list

In [8]:
def formal_preprocessing_text(source):
    preprocessing_sentence_list = []
    
    source = source.strip()
    # strip으로 앞뒤 공백 제거

    source = re.sub(r"\[.*?\]|\{.*?\}", "", source)
    # 기타 괄호 제거할 시 괄호 내부에 모든 텍스트 제거

    bracket_form = re.compile('\(([^)]+)')
    text_in_small_bracket = bracket_form.findall(source)
    
    if type(text_in_small_bracket) == str:
        
        text = text_in_small_bracket
        
        text_size = len(text)
        last_index = source.find(text) + len(text)
        if len(source) >= last_index+1 and source[last_index-text_size-1] == '(' and source[last_index+1] == '.':
            source = source.replace(source[last_index-text_size-1 : last_index+1] + ".", ".")
            
        if len(text.split()) > 5 and bool(re.match(r'[.]|[!]|[?]', text[-1])) == True:
            small_bracket = "(" + text + ")"
            source = source.replace(small_bracket, text)    
        
    elif type(text_in_small_bracket) != str:
        
        for text in text_in_small_bracket:
            
            text_size = len(text)
            last_index = source.find(text) + len(text)
            if len(source) >= last_index+1 and source[last_index-text_size-1] == '(' and source[last_index+1] == '.':
                source = source.replace(source[last_index-text_size-1 : last_index+1] + ".", ".")
                
            if len(text.split()) > 5 and bool(re.match(r'[.]|[!]|[?]', text[-1])) == True:
                small_bracket = "(" + text + ")"
                source = source.replace(small_bracket, text)    
                    
    # 마침표(.) 앞에 소괄호')'가 있을시 소괄호 제거와 함께 소괄호 내부 텍스트 제거
    # 소괄호 내부 텍스트가 5어절 이상이고 끝이 온점(.). 느낌표(!). 물음표(?)일 떼 소괄호 제거
    
    ganada_form = re.compile('[가나다라마바사아자차카타파하].')
    ganada_text = ganada_form.findall(source)
    
    if type(ganada_text) == str:
        
        ganada_index = source.find(ganada_text)
    
        if ganada_index == 0:
            source = re.sub(source[:2], "", source)
            
    elif type(ganada_text) != str:
        
        for text in ganada_text:
            ganada_index = source.find(text)

            if ganada_index == 0:
                source = re.sub(source[:2], "", source)
                
    source = re.sub(r' [가나다라마바사아자차카타파하].', "", source)
    # '가.', '나.', ... 형태의 문자열 제거   

    for sentence in kss.split_sentences(source, use_heuristic=False,
                                        num_workers=32):
    # KSS(Korean Sentence Segmentation)로 문장 분리 
    # Formal articles (wiki, news, essays): recommend to False
    

        if re.search("^[A-Za-z0-9ㄱ-ㅎ가-힣一-鿕㐀-䶵豈-龎]", sentence[0]) is not None and \
            bool(re.match(r'[.]|[!]|[?]', sentence[-1])) == True and \
            len(sentence.split()) > 5:
            # 문장의 시작이 특수문자인 문장(영어 대소문자, 한글, 한자, 숫자, -, + 제외
            # 문장의 끝이 온점(.). 느낌표(!). 물음표(?)가 아닌 문장 제외
            # 다섯 어절 이하 문장 제외


            if ']' in sentence and '[' not in sentence:
                sentence  = re.sub(r".*?]", "", sentence)    
            # 중괄호 앞에 있는 '성명/직함]' 형태 제거


            sentence = re.sub(r"[^A-Za-z0-9ㄱ-ㅎ가-힣一-鿕㐀-䶵豈-龎()+-.,]", " ", sentence)
            # 특수문자 제거(영어 대소문자, 한글, 한자, 숫자, -, +, 소괄호, 마침표, 쉼표, 제외)

            sentence = sentence.strip()
            # strip으로 앞뒤 공백 제거

            total_length = len(sentence.replace(" " , ""))
            hangeul_length = len(re.sub(r"[^ㄱ-ㅣ가-힣\s]", "", sentence.replace(" " , "")))
            hangeul_ratio = hangeul_length / total_length
            if hangeul_ratio >= 0.5:
            # 한글이 아닌 문자열이 50% 이상이 넘은 문장 제외

                for sentence2 in kss.split_sentences(sentence, use_heuristic=False,
                                        num_workers=32):
                    for sentence3 in kss.split_sentences(sentence2, use_heuristic=False,
                                                         num_workers=32):
                        preprocessing_sentence_list.append(sentence3)

            # 마지막에 KSS(Korean Sentence Segmentation)로 문장 분리 2번 실행

  
    return preprocessing_sentence_list

### AIHUB 문서요약 텍스트

[Source](https://aihub.or.kr/aihubdata/data/view.do?currMenu=115&topMenu=100&aihubDataSe=realm&dataSetSn=97)

In [9]:
path_list = ['AIHUB_문서요약 텍스트/Training/'+ '/**/*.json', 
             'AIHUB_문서요약 텍스트/Validation/'+ '/**/*.json']
train_file_name, valid_file_name = train_valid_json_file_name_list(path_list)

In [18]:
the_number_of_file = len(train_file_name) + len(valid_file_name)
print("The number of file:", the_number_of_file)
n = 1
print("The number of list element:", n)
train_file_nested_list = list(divide_source_file_list(train_file_name, n))
valid_file_nested_list = list(divide_source_file_list(valid_file_name, n))

The number of file: 6
The number of list element: 1


In [21]:
train_text_file_name_list = txt_file_name_list(train_file_nested_list,
                                                           "exploration/document_summary_text_pro/AIHUB_document_summary_text_train_")
valid_text_file_name_list = txt_file_name_list(valid_file_nested_list,
                                                           "exploration/document_summary_text_pro/AIHUB_document_summary_text_valid_")
the_numer_of_txt_file = len(train_text_file_name_list) + len(valid_text_file_name_list)
print("The number of txt file:", the_numer_of_txt_file)

The number of txt file: 6


In [50]:
def list_length_checker(source_file_nested_list, batch_size):
    
    the_number_of_total_txt_file = 0
    the_number_of_txt_file_list = []
    temp_nested_index = []

    for source_file_list in source_file_nested_list:   
        temp_index = []
        for i in range(len(source_file_list)):
                
            if type(source_file_list) == str:
                source_file = source_file_list

            elif type(source_file_list) != str:
                source_file = source_file_list[i]
            

            with open(source_file, 'r', encoding='utf-8') as one_json_file:
                one_json_sample = json.load(one_json_file)
                
            source_df = pd.DataFrame(one_json_sample['documents'])
            source_dict = dict(source_df['text'].explode())
            source_json = pd.json_normalize(source_dict)  
            
            source_list = []
            for source_index in source_json:
                for source_nested_list in source_json[source_index]:
                    for source_single_list in source_nested_list:
                        source_sentence = ""
                        for source_single in source_single_list:
                            if type(source_single) == dict:
                                for key, value in source_single.items():
                                    if key == "sentence":
                                        source_sentence += " " + value
                        if len(source_sentence) > 0:
                            source_list.append(source_sentence)

            if len(source_list) >= 1000:
                the_number_of_txt_file = ((len(source_list) // batch_size) + 1)
                print("File:", source_file)    
                print("Index:", i, "  ", "Length of Source List:", len(source_list), \
                "  ", "The number of txt file:", the_number_of_txt_file, "\n")
                the_number_of_txt_file_list.append(the_number_of_txt_file)
                the_number_of_total_txt_file +=  the_number_of_txt_file
            else:
                the_number_of_total_txt_file += 1
                the_number_of_txt_file_list.append(1)
                if i > 0 and the_number_of_file % i == 0:
                    temp_index.append(i)
                    try:
                        if i == temp_nested_index[0][0] and len(temp_nested_index) <= 1:
                            print("[For Example]")
                            print("This is not subject of batch. It's small source list")                            
                            print("File:", source_file)
                            print("Length of Source List:", len(source_list), "\n")
                    except:
                        pass
                    
        temp_nested_index.append(temp_index)
    
    print("")            
    print("Batch Size:", batch_size)
    print("The number of txt file:", the_number_of_total_txt_file)
    
    return the_number_of_total_txt_file, the_number_of_txt_file_list

In [51]:
def make_corpus_txt_with_batch_list(source_file_nested_list,
                                    text_file_name_list,
                                    batch_size, the_number_of_txt_file_list):

  print("[Size]")
  print("The number of preprocessing corpus: " + str(sum(the_number_of_txt_file_list)))
  print("\n[Order]")
  num = 0
  for i in range(len(source_file_nested_list)):
    source_file_list = source_file_nested_list[i]
    
    for j in range(len(source_file_list)):

      if type(source_file_list) == str:
        source_file = source_file_list

      elif type(source_file_list) != str:
        source_file = source_file_list[j]

      with open(source_file, 'r', encoding='utf-8') as one_json_file:
        one_json_sample = json.load(one_json_file)

      source_df = pd.DataFrame(one_json_sample['documents'])
      source_dict = dict(source_df['text'].explode())
      source_json = pd.json_normalize(source_dict)  
      
      source_list = []
      for source_index in source_json:
          for source_nested_list in source_json[source_index]:
              for source_single_list in source_nested_list:
                  source_sentence = ""
                  for source_single in source_single_list:
                      if type(source_single) == dict:
                          for key, value in source_single.items():
                              if key == "sentence":
                                  source_sentence += " " + value
                  if len(source_sentence) > 0:
                    source_list.append(source_sentence)
      
      n = batch_size
      source_batch_list = list(divide_source_file_list(source_list, n))
      
      for source_list in source_batch_list:
        sentence_list = []                    
        for source in source_list:
          num += 1
          print(str(num), end=" ")  
          
          with open(os.path.join('AIHUB_corpus/' + text_file_name_list[i][:-4] + "_" + str(num) + ".txt"), "a", encoding='utf-8') as fp:        
              fp.write("\n".join(source_list))   
                

In [52]:
batch_size = 1000
the_number_of_train_txt_file, the_number_of_train_txt_file_list = list_length_checker(train_file_nested_list, batch_size)

File: AIHUB_문서요약 텍스트/Training\법률_train_original\train_original.json
Index: 0    Length of Source List: 42178    The number of txt file: 43 

File: AIHUB_문서요약 텍스트/Training\사설_train_original\train_original.json
Index: 0    Length of Source List: 242550    The number of txt file: 243 

File: AIHUB_문서요약 텍스트/Training\신문기사_train_original\train_original.json
Index: 0    Length of Source List: 2187380    The number of txt file: 2188 


Batch Size: 1000
The number of txt file: 2474


In [None]:
batch_size = 1000
make_corpus_txt_with_batch_list(train_file_nested_list, train_text_file_name_list,
                batch_size, the_number_of_train_txt_file_list)

In [53]:
batch_size = 1000
the_number_of_valid_txt_file, the_number_of_valid_txt_file_list = list_length_checker(valid_file_nested_list, batch_size)

File: AIHUB_문서요약 텍스트/Validation\법률_valid_original\valid_original.json
Index: 0    Length of Source List: 4780    The number of txt file: 5 

File: AIHUB_문서요약 텍스트/Validation\사설_valid_original\valid_original.json
Index: 0    Length of Source List: 25202    The number of txt file: 26 

File: AIHUB_문서요약 텍스트/Validation\신문기사_valid_original\valid_original.json
Index: 0    Length of Source List: 239892    The number of txt file: 240 


Batch Size: 1000
The number of txt file: 271


In [None]:
batch_size = 1000
make_corpus_txt_with_batch_list(valid_file_nested_list, valid_text_file_name_list,
                batch_size, the_number_of_valid_txt_file_list)

In [None]:
pro_total_corpus_list = glob("AIHUB_corpus/exploration/document_summary_text_pro/AIHUB_document_summary_text_" +"*.txt")

In [None]:
len(pro_total_corpus_list)

In [None]:
post_total_corpus_list = post_txt_file_name_list(pro_total_corpus_list)

In [None]:
line_list = []
line_num = 0
with open(pro_total_corpus_list[0], 'r', encoding='utf-8') as f:
    lines = f.read().splitlines() 
    for line in lines:
        line_num += 1
        if line_num <= 1:
           line_list.append(line)
for line in line_list:
    print(line, end="\n\n")

In [None]:
line_list = []
line_num = 0
with open(pro_total_corpus_list[0], 'r', encoding='utf-8') as f:
    lines = f.read().splitlines()
    for line in lines:
        line_num += 1
        if line_num <= 1:  
            sentences = formal_preprocessing_text(line)
            for sentence in sentences:
                line_list.append(sentence) 
            
for line in line_list:
    print(line, end="\n\n")

In [None]:
ray.init()

@ray.remote

def formal_preprocessing_text(source):
    preprocessing_sentence_list = []
    
    source = source.strip()
    # strip으로 앞뒤 공백 제거

    source = re.sub(r"\[.*?\]|\{.*?\}", "", source)
    # 기타 괄호 제거할 시 괄호 내부에 모든 텍스트 제거

    bracket_form = re.compile('\(([^)]+)')
    text_in_small_bracket = bracket_form.findall(source)
    
    if type(text_in_small_bracket) == str:
        
        text = text_in_small_bracket
        
        text_size = len(text)
        last_index = source.find(text) + len(text)
        if len(source) >= last_index+1 and source[last_index-text_size-1] == '(' and source[last_index+1] == '.':
            source = source.replace(source[last_index-text_size-1 : last_index+1] + ".", ".")
            
        if len(text.split()) > 5 and bool(re.match(r'[.]|[!]|[?]', text[-1])) == True:
            small_bracket = "(" + text + ")"
            source = source.replace(small_bracket, text)    
        
    elif type(text_in_small_bracket) != str:
        
        for text in text_in_small_bracket:
            
            text_size = len(text)
            last_index = source.find(text) + len(text)
            if len(source) >= last_index+1 and source[last_index-text_size-1] == '(' and source[last_index+1] == '.':
                source = source.replace(source[last_index-text_size-1 : last_index+1] + ".", ".")
                
            if len(text.split()) > 5 and bool(re.match(r'[.]|[!]|[?]', text[-1])) == True:
                small_bracket = "(" + text + ")"
                source = source.replace(small_bracket, text)    
                    
    # 마침표(.) 앞에 소괄호')'가 있을시 소괄호 제거와 함께 소괄호 내부 텍스트 제거
    # 소괄호 내부 텍스트가 5어절 이상이고 끝이 온점(.). 느낌표(!). 물음표(?)일 떼 소괄호 제거
    
    ganada_form = re.compile('[가나다라마바사아자차카타파하].')
    ganada_text = ganada_form.findall(source)
    
    if type(ganada_text) == str:
        
        ganada_index = source.find(ganada_text)
    
        if ganada_index == 0:
            source = re.sub(source[:2], "", source)
            
    elif type(ganada_text) != str:
        
        for text in ganada_text:
            ganada_index = source.find(text)

            if ganada_index == 0:
                source = re.sub(source[:2], "", source)
                
    source = re.sub(r' [가나다라마바사아자차카타파하].', "", source)
    # '가.', '나.', ... 형태의 문자열 제거   

    for sentence in kss.split_sentences(source, use_heuristic=False,
                                        num_workers=32):
    # KSS(Korean Sentence Segmentation)로 문장 분리 
    # Formal articles (wiki, news, essays): recommend to False
    

        if re.search("^[A-Za-z0-9ㄱ-ㅎ가-힣一-鿕㐀-䶵豈-龎]", sentence[0]) is not None and \
            bool(re.match(r'[.]|[!]|[?]', sentence[-1])) == True and \
            len(sentence.split()) > 5:
            # 문장의 시작이 특수문자인 문장(영어 대소문자, 한글, 한자, 숫자, -, + 제외
            # 문장의 끝이 온점(.). 느낌표(!). 물음표(?)가 아닌 문장 제외
            # 다섯 어절 이하 문장 제외


            if ']' in sentence and '[' not in sentence:
                sentence  = re.sub(r".*?]", "", sentence)    
            # 중괄호 앞에 있는 '성명/직함]' 형태 제거


            sentence = re.sub(r"[^A-Za-z0-9ㄱ-ㅎ가-힣一-鿕㐀-䶵豈-龎()+-.,]", " ", sentence)
            # 특수문자 제거(영어 대소문자, 한글, 한자, 숫자, -, +, 소괄호, 마침표, 쉼표, 제외)

            sentence = sentence.strip()
            # strip으로 앞뒤 공백 제거

            total_length = len(sentence.replace(" " , ""))
            hangeul_length = len(re.sub(r"[^ㄱ-ㅣ가-힣\s]", "", sentence.replace(" " , "")))
            hangeul_ratio = hangeul_length / total_length
            if hangeul_ratio >= 0.5:
            # 한글이 아닌 문자열이 50% 이상이 넘은 문장 제외

                for sentence2 in kss.split_sentences(sentence, use_heuristic=False,
                                        num_workers=32):
                    for sentence3 in kss.split_sentences(sentence2, use_heuristic=False,
                                                         num_workers=32):
                        preprocessing_sentence_list.append(sentence3)

            # 마지막에 KSS(Korean Sentence Segmentation)로 문장 분리 2번 실행

  
    return preprocessing_sentence_list

In [None]:
print("[Size]")
print("The number of preprocessing corpus: " + str(len(pro_total_corpus_list)))
print("\n[Order]")
num = 0
process_num = 10    
for pro, post in zip(pro_total_corpus_list, post_total_corpus_list):
    
    with open(pro, 'r', encoding='utf-8') as f:
        lines = f.read().splitlines() 
        nested_lines_num = len(lines) // process_num
        for i in range(nested_lines_num - 1):
            start_line = process_num * i
            end_line = process_num * (i+1)
            futures = [formal_preprocessing_text.remote(lines[start_line:end_line][j]) for j in range(process_num)]
            results = ray.get(futures)
            if i == nested_lines_num - 2:
                futures = [formal_preprocessing_text.remote(lines[end_line:][j]) for j in range(len(lines) - end_line)]
                results = ray.get(futures)
        sentence_list = list(chain.from_iterable(results))

    num += 1
    print(str(num), end=" ")  
        
    with open(post, 'a', encoding='utf-8') as fp:
        fp.write("\n".join(sentence_list))

In [None]:
ray.shutdown()

In [None]:
corpus_list = glob("AIHUB_corpus/demo/document_summary_text_post/AIHUB_document_summary_text_" +"*.txt")

In [None]:
with open('AIHUB_corpus/duplicate/AIHUB_document_summary_text.txt', 'w') as f:
    for corpus in corpus_list:
        with open(corpus) as text:
            for line in text:
                f.write(line)

In [None]:
with open('AIHUB_corpus/AIHUB_document_summary_text.txt', 'w', encoding='utf-8') as f1:
    with open('AIHUB_corpus/duplicate/AIHUB_document_summary_text.txt', encoding='utf-8') as f2:
        lines = f2.read().splitlines()
        single_sentence_dict = dict.fromkeys(lines)
        single_sentence_list = list(single_sentence_dict)
        f1.write("\n".join(single_sentence_list))