## AIHUB JSON Parsing

### Development Environment

In [None]:
%pip install kss==3.7.3

If KSS raises an argument error after installation, restart the Jupyter kernel (runtime) and run the notebook again.

In [None]:
%pip install python-mecab-ko

KSS 3.7.3 is compatible with python-mecab-ko, which it uses as its morpheme-analysis backend.

In [None]:
%pip install pandas

In [1]:
import re
import os
import kss
import json
from mecab import MeCab
import pandas as pd
from glob import glob

In [2]:
pwd

'c:\\Users\\MinSeok\\Documents\\AIHUB'

### Function

In [3]:
def json_file_name_list(path_list):
    """Collect every file path matched by the given glob patterns.

    Args:
        path_list: Iterable of glob patterns; ``**`` is expanded recursively.

    Returns:
        list[str]: Matches of all patterns, concatenated in pattern order.
        Empty when ``path_list`` is empty or nothing matches. Overlapping
        patterns are not de-duplicated.
    """
    file_name = []
    for pattern in path_list:
        # Accumulate rather than overwrite, so earlier patterns are not lost
        # and an empty path_list still returns a list.
        file_name.extend(glob(pattern, recursive=True))
    return file_name

In [4]:
def train_valid_json_file_name_list(path_list):
    """Collect training and validation file paths from glob patterns.

    A pattern containing ``'rain'`` (Train/train/Training) feeds the training
    list; otherwise a pattern containing ``'alid'`` (Valid/valid/Validation)
    feeds the validation list. Other patterns are ignored.

    Args:
        path_list: Iterable of glob patterns; ``**`` is expanded recursively.

    Returns:
        tuple[list[str], list[str]]: ``(train_files, valid_files)``. Either
        list is empty when no pattern of that kind was given.
    """
    # Start from empty lists so a missing train or valid pattern yields an
    # empty result instead of an UnboundLocalError.
    train_file_name = []
    valid_file_name = []
    for pattern in path_list:
        if 'rain' in pattern:
            train_file_name.extend(glob(pattern, recursive=True))
        elif 'alid' in pattern:
            valid_file_name.extend(glob(pattern, recursive=True))
    return train_file_name, valid_file_name

In [5]:
def divide_source_file_list(l, n):
    """Lazily yield consecutive slices of ``l`` holding at most ``n`` items.

    The final slice is shorter when ``len(l)`` is not a multiple of ``n``.
    """
    yield from (l[start:start + n] for start in range(0, len(l), n))

In [6]:
def txt_file_name_list(source_file_nested_list, folder_corpus_type_name):
  """Build one numbered ``.txt`` file name per group of source files.

  The name for group ``k`` is ``folder_corpus_type_name + str(k) + ".txt"``.
  """
  group_count = len(source_file_nested_list)
  return [folder_corpus_type_name + str(idx) + ".txt" for idx in range(group_count)]

In [7]:
def train_valid_txt_file_name_list(source_file_nested_list, folder_corpus_type_name):
  """Build numbered ``.txt`` names for a training or validation file set.

  The first path of the first group decides whether the set is recognised:
  it must contain ``'rain'`` or ``'alid'``. Recognised sets get one name per
  group (``folder_corpus_type_name + str(k) + ".txt"``); anything else
  yields an empty list.
  """
  first_path = source_file_nested_list[0][0]
  if 'rain' not in first_path and 'alid' not in first_path:
    return []

  group_count = len(source_file_nested_list)
  return [folder_corpus_type_name + str(idx) + ".txt" for idx in range(group_count)]

In [8]:
def formal_preprocessing_text(source):
    """Split formal text into sentences and return the cleaned, mostly-Hangul ones.

    A sentence is kept only when it
      * starts with a Latin letter, digit, Hangul or CJK character,
      * ends with ``.``, ``!`` or ``?``,
      * has more than five whitespace-separated tokens, and
      * consists of at least 50% Hangul characters after cleaning
        (spaces are ignored on both sides of the ratio).

    Cleaning removes ``[...]`` and ``{...}`` spans together with their content
    and replaces every character other than Latin letters, digits, Hangul,
    CJK characters, ``(``, ``)``, ``+`` and ``-`` with a space.

    Args:
        source: Raw text to split with ``kss.split_sentences``.

    Returns:
        list[str]: Cleaned sentences in their original order.
    """
    preprocessing_sentence_list = []

    # Drop leading/trailing whitespace before sentence splitting.
    source = source.strip()
    # Formal articles (wiki, news, essays): use_heuristic=False is recommended.
    for sentence in kss.split_sentences(source, use_heuristic=False,
                                        num_workers=32):
        if not sentence:
            # Guard the sentence[0] / sentence[-1] lookups below.
            continue

        starts_with_letter = re.match(
            r"[A-Za-z0-9ㄱ-ㅎ가-힣一-鿕㐀-䶵豈-龎]", sentence[0]) is not None
        ends_with_terminator = sentence[-1] in ".!?"
        if not (starts_with_letter and ends_with_terminator
                and len(sentence.split()) > 5):
            continue

        # Remove square/curly bracket spans including everything inside them.
        sentence = re.sub(r"\[.*?\]|\{.*?\}", "", sentence)

        # Replace special characters with a space (Latin letters, Hangul,
        # CJK characters, digits, -, + and parentheses are kept).
        sentence = re.sub(r"[^A-Za-z0-9ㄱ-ㅎ가-힣一-鿕㐀-䶵豈-龎()+-]", " ", sentence)
        sentence = sentence.strip()

        # After the substitution above the only whitespace left is " ", so
        # both counts below ignore whitespace. Counting whitespace as Hangul
        # would inflate the ratio.
        total_length = len(sentence.replace(" ", ""))
        if total_length == 0:
            continue
        hangeul_length = len(re.sub(r"[^ㄱ-ㅣ가-힣]", "", sentence))

        # Skip sentences in which more than half of the characters are not Hangul.
        if hangeul_length / total_length >= 0.5:
            preprocessing_sentence_list.append(sentence)

    return preprocessing_sentence_list

In [9]:
def informal_preprocessing_text(source):
    """Split informal text into sentences and return the cleaned, mostly-Hangul ones.

    Identical to the formal variant except that ``kss.split_sentences`` is
    called with ``use_heuristic=True``. A sentence is kept only when it
      * starts with a Latin letter, digit, Hangul or CJK character,
      * ends with ``.``, ``!`` or ``?``,
      * has more than five whitespace-separated tokens, and
      * consists of at least 50% Hangul characters after cleaning
        (spaces are ignored on both sides of the ratio).

    Cleaning removes ``[...]`` and ``{...}`` spans together with their content
    and replaces every character other than Latin letters, digits, Hangul,
    CJK characters, ``(``, ``)``, ``+`` and ``-`` with a space.

    Args:
        source: Raw text to split with ``kss.split_sentences``.

    Returns:
        list[str]: Cleaned sentences in their original order.
    """
    preprocessing_sentence_list = []

    # Drop leading/trailing whitespace before sentence splitting.
    source = source.strip()
    # use_heuristic=True here; False is the setting recommended for formal
    # articles (wiki, news, essays), which this function is not meant for.
    for sentence in kss.split_sentences(source, use_heuristic=True,
                                        num_workers=32):
        if not sentence:
            # Guard the sentence[0] / sentence[-1] lookups below.
            continue

        starts_with_letter = re.match(
            r"[A-Za-z0-9ㄱ-ㅎ가-힣一-鿕㐀-䶵豈-龎]", sentence[0]) is not None
        ends_with_terminator = sentence[-1] in ".!?"
        if not (starts_with_letter and ends_with_terminator
                and len(sentence.split()) > 5):
            continue

        # Remove square/curly bracket spans including everything inside them.
        sentence = re.sub(r"\[.*?\]|\{.*?\}", "", sentence)

        # Replace special characters with a space (Latin letters, Hangul,
        # CJK characters, digits, -, + and parentheses are kept).
        sentence = re.sub(r"[^A-Za-z0-9ㄱ-ㅎ가-힣一-鿕㐀-䶵豈-龎()+-]", " ", sentence)
        sentence = sentence.strip()

        # After the substitution above the only whitespace left is " ", so
        # both counts below ignore whitespace. Counting whitespace as Hangul
        # would inflate the ratio.
        total_length = len(sentence.replace(" ", ""))
        if total_length == 0:
            continue
        hangeul_length = len(re.sub(r"[^ㄱ-ㅣ가-힣]", "", sentence))

        # Skip sentences in which more than half of the characters are not Hangul.
        if hangeul_length / total_length >= 0.5:
            preprocessing_sentence_list.append(sentence)

    return preprocessing_sentence_list

### AIHUB 일반상식

[Source](https://aihub.or.kr/aihubdata/data/view.do?currMenu=116&topMenu=100&aihubDataSe=ty&dataSetSn=106)

In [10]:
# Recursively collect every JSON file matched by the dataset glob pattern.
path_list = ['AIHUB_일반상식/'+ '/**/*.json']
file_name = json_file_name_list(path_list)

In [12]:
the_number_of_file = len(file_name)
print("The number of file:", the_number_of_file)
# Chunk size for splitting the files into roughly four groups (a remainder
# produces one extra, smaller group). Clamp to 1: with fewer than four files
# the integer division gives 0, which range() rejects as a step.
n = max(1, the_number_of_file // 4)
print("The number of list element:", n)
file_nested_list = list(divide_source_file_list(file_name, n))

The number of file: 125
The number of list element: 31


In [13]:
# One output txt name per file group; the prefix includes the output sub-directory.
text_file_name_list = txt_file_name_list(file_nested_list,
                                         "demo/general_common_sense/AIHUB_general_common_sense_")
the_numer_of_txt_file = len(text_file_name_list)
print("The number of txt file:", the_numer_of_txt_file)

The number of txt file: 5


In [20]:
def list_length_checker(source_file_nested_list, batch_size, the_number_of_file):
    """Count how many corpus txt files the given JSON sources will need.

    Every source file is loaded and the number of text entries it holds is
    measured. A file with 1000 or more entries is reported and counted as
    ``ceil(entries / batch_size)`` txt files; any smaller file counts as one.
    One small file is additionally printed as a "[For Example]" sample.

    Args:
        source_file_nested_list: List of groups, each a list of JSON file
            paths. A bare path string is accepted as a one-file group.
        batch_size: Maximum number of entries per txt file for large sources.
        the_number_of_file: Total number of source files; only used when
            choosing which small file to print as the sample.

    Returns:
        int: Total number of txt files.
    """
    the_number_of_total_txt_file = 0
    temp_nested_index = []

    for source_file_list in source_file_nested_list:
        if isinstance(source_file_list, str):
            # A bare path is one file, not a sequence of characters to loop over.
            source_file_list = [source_file_list]

        temp_index = []
        for i, source_file in enumerate(source_file_list):
            with open(source_file, 'r', encoding='utf-8') as one_json_file:
                one_json_sample = json.load(one_json_file)

            if 'ko_wiki_v1_squad' in source_file:
                # SQuAD-style layout: data -> paragraphs -> context.
                source_df = pd.DataFrame(one_json_sample['data'])
                source_dict = dict(source_df['paragraphs'].explode())
                source_json = pd.json_normalize(source_dict)
                source_list = list(source_json.filter(regex='context').values[0])
            else:
                source_list = pd.DataFrame(one_json_sample['sentence'])['text']

            if len(source_list) >= 1000:
                # Ceiling division: an exact multiple of batch_size must not
                # be counted as one extra, empty file.
                the_number_of_txt_file = -(-len(source_list) // batch_size)
                print("File:", source_file)
                print("Index:", i, "  ", "Length of Source List:", len(source_list),
                      "  ", "The number of txt file:", the_number_of_txt_file, "\n")
                the_number_of_total_txt_file += the_number_of_txt_file
            else:
                the_number_of_total_txt_file += 1
                # NOTE(review): records the indices that evenly divide
                # the_number_of_file; it only serves to pick the sample
                # printed below -- confirm the intent.
                if i > 0 and the_number_of_file % i == 0:
                    temp_index.append(i)
                    # Print the sample once: while the second group is being
                    # scanned, at the first index recorded for the first group.
                    # The explicit checks replace a bare `except: pass` that
                    # hid the IndexError raised before that index existed.
                    if (len(temp_nested_index) == 1 and temp_nested_index[0]
                            and i == temp_nested_index[0][0]):
                        print("[For Example]")
                        print("This is not subject of batch. It's small source list.")
                        print("File:", source_file)
                        print("Length of Source List:", len(source_list),
                              "  ", "The number of txt file:", 1, "\n")

        temp_nested_index.append(temp_index)

    print("Batch Size:", batch_size)
    print("The number of txt file:", the_number_of_total_txt_file)

    return the_number_of_total_txt_file

In [56]:
def make_corpus_txt_with_batch_list(source_file_nested_list,
                                    text_file_name_list,
                                    batch_size, the_number_of_total_txt_file):
  """Preprocess JSON sources and write the sentences to corpus txt files.

  For every group ``i`` of source files:
    * a ``ko_wiki_v1_squad`` file is split into batches of ``batch_size``
      contexts, and each batch is written to its own file named
      ``<text_file_name_list[i] without ".txt">_<num>.txt``;
    * the sentences of all other files in the group are pooled and written
      to ``text_file_name_list[i]``.
  All files are created under ``AIHUB_corpus/``, opened in append mode and
  written one sentence per line. Batches or groups with no surviving
  sentences are not written.

  Args:
    source_file_nested_list: List of groups, each a list of JSON file paths.
      A bare path string is accepted as a one-file group.
    text_file_name_list: Output txt name for each group, relative to
      ``AIHUB_corpus/``; must have one entry per group.
    batch_size: Number of contexts per batch for ``ko_wiki_v1_squad`` files.
    the_number_of_total_txt_file: Expected number of files; only printed.

  Returns:
    None. Progress (the running file counter) is printed to stdout.
  """
  print("[Size]")
  print("The numnber of preprocessing corpus: " + str(the_number_of_total_txt_file))
  print("\n[Order]")
  num = 0
  for i, source_file_list in enumerate(source_file_nested_list):
    if isinstance(source_file_list, str):
      # A bare path is one file, not a sequence of characters to loop over.
      source_file_list = [source_file_list]

    # Output name for this group; the original code referenced an undefined
    # `txt_file_name` here.
    group_txt_path = os.path.join('AIHUB_corpus/', text_file_name_list[i])
    # Make sure the target directory exists before any file is opened.
    os.makedirs(os.path.dirname(group_txt_path), exist_ok=True)
    sentence_list_01 = []

    for source_file in source_file_list:
      with open(source_file, 'r', encoding='utf-8') as one_json_file:
        one_json_sample = json.load(one_json_file)

      if 'ko_wiki_v1_squad' in source_file:
        # SQuAD-style layout: data -> paragraphs -> context.
        source_df = pd.DataFrame(one_json_sample['data'])
        source_dict = dict(source_df['paragraphs'].explode())
        source_json = pd.json_normalize(source_dict)
        source_list = list(source_json.filter(regex='context').values[0])

        for source_batch in divide_source_file_list(source_list, batch_size):
          sentence_list_02 = []
          for source in source_batch:
            sentence_list_02.extend(formal_preprocessing_text(source))

          if not sentence_list_02:
            continue

          # Write inside the loop so every batch is saved, not only the last.
          num += 1
          print(str(num), end=" ")
          batch_txt_path = ('AIHUB_corpus/' + text_file_name_list[i][:-4]
                            + "_" + str(num) + ".txt")
          with open(batch_txt_path, "a", encoding='utf-8') as fp:
            # Trailing newline keeps appended or concatenated files line-aligned.
            fp.write("\n".join(sentence_list_02) + "\n")

      else:
        source_list = pd.DataFrame(one_json_sample['sentence'])['text']
        for source in source_list:
          sentence_list_01.extend(formal_preprocessing_text(source))

    if sentence_list_01:
      num += 1
      print(str(num), end=" ")

      with open(group_txt_path, "a", encoding='utf-8') as fp:
        fp.write("\n".join(sentence_list_01) + "\n")

In [21]:
# Dry run: report the large sources and count the txt files expected for this batch size.
batch_size = 10000
the_number_of_txt_file = list_length_checker(file_nested_list, batch_size, the_number_of_file)

File: AIHUB_일반상식\ko_wiki_v1_squad.json
Index: 0    Length of Source List: 68538    The number of txt file: 7 

[For Example]
This is not subject of batch. It's small source list.
File: AIHUB_일반상식\mutual_무형대용어_상호참조복원\edited_[A1] 0858744_경제적 불평등(본문).json
Length of Source List: 18    The number of txt file: 1 

Batch Size: 10000
The number of txt file: 131


In [None]:
# Preprocess every source group and write the corpus txt files under AIHUB_corpus/.
batch_size = 10000
make_corpus_txt_with_batch_list(file_nested_list, text_file_name_list, batch_size, the_number_of_txt_file)

In [None]:
# Sort the matches so the merged corpus is built in the same order on every
# run; glob itself does not promise any particular ordering.
corpus_list = sorted(glob("AIHUB_corpus/demo/general_common_sense/AIHUB_general_common_sense_" +"*.txt"))

In [None]:
# Merge the per-group corpus files into a single file. UTF-8 is given
# explicitly because the corpus files are written as UTF-8, while open()
# otherwise falls back to the platform's locale encoding.
with open('AIHUB_corpus/demo/AIHUB_general_common_sense.txt', 'w', encoding='utf-8') as f:
    for corpus in corpus_list:
        with open(corpus, encoding='utf-8') as text:
            for line in text:
                # Terminate every line so the last sentence of one file does
                # not fuse with the first sentence of the next file.
                f.write(line if line.endswith("\n") else line + "\n")