## AI Hub JSON Parsing

### Development Environment

In [None]:
# Install kss; the cells below call kss.split_sentences to split Korean text into sentences.
%pip install kss

In [None]:
# NOTE(review): this downloads a shell script from GitHub and pipes it straight into bash.
# Presumably it installs Mecab for Colab (going by the script name) - review or pin the
# script before running it, since its contents are not visible from this notebook.
!curl -s https://raw.githubusercontent.com/teddylee777/machine-learning/master/99-Misc/01-Colab/mecab-colab.sh | bash

In [5]:
import re
import os
import kss
import json
import pandas as pd
from glob import glob
import konlpy
from konlpy.tag import Mecab

In [None]:
# Mount Google Drive at /content/drive; every dataset and output path used below
# is located under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

### AIHUB 대규모 구매도서 기반 한국어 말뭉치 데이터

[Source](https://aihub.or.kr/aihubdata/data/view.do?currMenu=115&topMenu=100&aihubDataSe=realm&dataSetSn=653)

In [13]:
# Load the tab-separated book-corpus file into a DataFrame.
file_name = '/content/drive/MyDrive/AIHUB/AIHUB_대규모 구매도서 기반 한국어 말뭉치 데이터/000_DATA.tsv'
# on_bad_lines='skip' makes pandas drop malformed rows instead of raising,
# so the resulting DataFrame may contain fewer rows than the file has lines.
dataset = pd.read_csv(file_name, sep = '\t', on_bad_lines='skip')

In [16]:
def make_corpus_txt(dataset, corpus_file_name,
                    output_dir='/content/drive/MyDrive/AIHUB/AIHUB_corpus/'):
    """Split the book texts into sentences and append them to a corpus file.

    Every string in ``dataset['contents1']`` is split with
    ``kss.split_sentences``. A sentence is dropped when it is empty or when
    its first character is a digit or one of the punctuation/bullet symbols
    in the filter below.

    Args:
        dataset: Mapping-like object (the notebook passes a DataFrame) whose
            ``'contents1'`` entry yields the raw texts.
        corpus_file_name: Name of the output text file inside ``output_dir``.
        output_dir: Directory of the output file. Defaults to the notebook's
            corpus folder on Google Drive.

    The file is opened in append mode and each sentence is written on its
    own line, terminated by a newline, so repeated calls never fuse the last
    sentence of one call with the first sentence of the next.
    """
    # One compiled character class replaces the chain of single-character
    # alternatives; match() anchors at the first character of the sentence.
    leading_noise = re.compile(r'[.,◆◇△▲▽▼▷▶<>0-9《/]')

    sentence_list = []
    for text in dataset['contents1']:
        # Missing cells come back from pandas as float NaN; only real text
        # can be sentence-split.
        if not isinstance(text, str):
            continue
        for sentence in kss.split_sentences(text):
            # Guard against empty sentences before looking at the first char.
            if sentence and not leading_noise.match(sentence):
                sentence_list.append(sentence)

    with open(os.path.join(output_dir, corpus_file_name), 'a', encoding="UTF-8") as fp:
        fp.writelines(sentence + "\n" for sentence in sentence_list)

In [18]:
# Build the sentence corpus for the book dataset.
# make_corpus_txt opens the output file in append mode, so re-running this cell
# adds the same sentences to the file again.
corpus_file_name = "AIHUB_korean_corpus_data_based_on_large_scale_purchase_books.txt"
make_corpus_txt(dataset, corpus_file_name)

### AIHUB 문서요약 텍스트

[Source](https://aihub.or.kr/aihubdata/data/view.do?currMenu=115&topMenu=100&aihubDataSe=realm&dataSetSn=97)

In [19]:
def json_file_name_list(path_list):
    """Expand the training and validation glob patterns into file lists.

    Args:
        path_list: Iterable of glob patterns. A pattern containing the
            substring ``'rain'`` is used as the training pattern; otherwise a
            pattern containing ``'alid'`` is used as the validation pattern.
            Patterns containing neither substring are ignored. If several
            patterns of the same kind are given, the last one wins.

    Returns:
        A ``(train_file_name, valid_file_name)`` tuple of lists of matching
        paths. A list is empty when no pattern of that kind was supplied or
        when the pattern matched nothing.
    """
    # Start from empty lists so a missing pattern yields an empty result
    # instead of an UnboundLocalError at the return statement.
    train_file_name = []
    valid_file_name = []
    for pattern in path_list:
        if 'rain' in pattern:
            train_file_name = glob(pattern, recursive=True)
        elif 'alid' in pattern:
            valid_file_name = glob(pattern, recursive=True)
    return train_file_name, valid_file_name

In [20]:
# Recursive glob patterns for the training and validation JSON files.
# NOTE(review): the second path spells the folder 'Validatoin'. If the folder on
# Drive is actually named 'Validation', the glob matches nothing and the
# validation list is silently empty - confirm against the Drive layout.
path_list = ['/content/drive/MyDrive/AIHUB/AIHUB_문서요약 텍스트/Training/'+ '/**/*.json', 
             '/content/drive/MyDrive/AIHUB/AIHUB_문서요약 텍스트/Validatoin/'+ '/**/*.json']
train_file_name, valid_file_name = json_file_name_list(path_list)

In [24]:
def make_corpus_txt(file_name_list, corpus_file_name,
                    output_dir='/content/drive/MyDrive/AIHUB/AIHUB_corpus/'):
    """Collect sentences from document-summary JSON files into a corpus file.

    Each file is parsed as JSON; for every entry of its ``'documents'`` list
    the ``'sentence'`` values found in ``entry['text'][0]`` are considered.
    A sentence is dropped when any of the following holds:

    * it is empty;
    * its first character is a digit or one of the punctuation/bullet
      symbols in ``leading_noise``;
    * it starts with a Hangul syllable immediately followed by a period
      (list markers such as ``가.``);
    * its first three characters match the bracket-marker pattern
      (e.g. ``[1]``);
    * it contains the literal ``[다수의견]``.

    Args:
        file_name_list: Iterable of JSON file paths.
        corpus_file_name: Name of the output text file inside ``output_dir``.
        output_dir: Directory of the output file. Defaults to the notebook's
            corpus folder on Google Drive.

    The output is opened in append mode and written once, after all files
    have been read, one newline-terminated sentence per line.
    """
    leading_noise = re.compile(r'[.,◆◇△▲▽▼▷▶<>0-9《/(]')
    # The period is escaped on purpose: an unescaped '.' matches any
    # character, which rejected almost every sentence beginning with Hangul.
    list_marker = re.compile(r'[가-힣]\.')
    # Same language as the original r'[+[0-9]+]' (a run of '+', '[' or
    # digits followed by ']'), written with explicit escapes.
    bracket_marker = re.compile(r'[+\[0-9]+\]')

    def keep(sentence):
        # Empty strings are rejected first so no pattern sees missing text.
        return bool(sentence) and not (
            leading_noise.match(sentence)
            or list_marker.match(sentence)
            or bracket_marker.match(sentence[:3])
            or "[다수의견]" in sentence
        )

    sentence_list = []
    for file_name in file_name_list:
        with open(file_name, 'r', encoding='utf-8') as one_json_file:
            one_json_sample = json.load(one_json_file)

        for document in one_json_sample['documents']:
            # NOTE(review): only the first element of 'text' is read; if
            # 'text' holds several paragraphs the rest are ignored - confirm
            # against the dataset schema.
            for item in document['text'][0]:
                sentence = item['sentence']
                if keep(sentence):
                    sentence_list.append(sentence)

    # Write once, after the loop. Writing inside the per-file loop would
    # re-append every sentence gathered so far for each additional file.
    with open(os.path.join(output_dir, corpus_file_name), 'a', encoding="UTF-8") as fp:
        fp.writelines(sentence + "\n" for sentence in sentence_list)

In [25]:
# Build the document-summary corpus: training files first, then validation files.
# Both calls target the same file; make_corpus_txt opens it in append mode, so
# re-running this cell adds the same sentences to the file again.
corpus_file_name = "AIHUB_document_summary_text.txt"
make_corpus_txt(train_file_name, corpus_file_name)
make_corpus_txt(valid_file_name, corpus_file_name)

### AIHUB 논문자료 요약

[Source](https://aihub.or.kr/aihubdata/data/view.do?currMenu=115&topMenu=100&aihubDataSe=realm&dataSetSn=90)

In [1]:
def json_file_name_list(path_list):
    """Expand the training and validation glob patterns into file lists.

    Args:
        path_list: Iterable of glob patterns. A pattern containing the
            substring ``'rain'`` is used as the training pattern; otherwise a
            pattern containing ``'alid'`` is used as the validation pattern.
            Patterns containing neither substring are ignored. If several
            patterns of the same kind are given, the last one wins.

    Returns:
        A ``(train_file_name, valid_file_name)`` tuple of lists of matching
        paths. A list is empty when no pattern of that kind was supplied or
        when the pattern matched nothing.
    """
    # Start from empty lists so a missing pattern yields an empty result
    # instead of an UnboundLocalError at the return statement.
    train_file_name = []
    valid_file_name = []
    for pattern in path_list:
        if 'rain' in pattern:
            train_file_name = glob(pattern, recursive=True)
        elif 'alid' in pattern:
            valid_file_name = glob(pattern, recursive=True)
    return train_file_name, valid_file_name

In [7]:
# Recursive glob patterns for the training and validation JSON files.
# NOTE(review): the second path spells the folder 'Validatoin'. If the folder on
# Drive is actually named 'Validation', the glob matches nothing and the
# validation list is silently empty - confirm against the Drive layout.
path_list = ['/content/drive/MyDrive/AIHUB/AIHUB_논문자료 요약/Training/'+ '/**/*.json', 
             '/content/drive/MyDrive/AIHUB/AIHUB_논문자료 요약/Validatoin/'+ '/**/*.json']
train_file_name, valid_file_name = json_file_name_list(path_list)

In [22]:
def make_corpus_txt(file_name_list, corpus_file_name,
                    output_dir='/content/drive/MyDrive/AIHUB/AIHUB_corpus/'):
    """Collect sentences from paper-summary JSON files into a corpus file.

    For every record in a file's ``'data'`` list, the following texts are
    split with ``kss.split_sentences``, in this order:

    1. ``summary_entire[0]['orginal_text']`` - only when the file path
       contains ``'논문/논문요약'`` and the record has ``summary_entire``;
    2. ``summary_entire[0]['summary_text']`` - when the record has
       ``summary_entire``;
    3. ``summary_section[0]['orginal_text']``;
    4. ``summary_section[0]['summary_text']``.

    A sentence is dropped when it is empty or when its first character is a
    digit, a space, or one of the punctuation/bullet symbols in
    ``leading_noise``.

    Args:
        file_name_list: Iterable of JSON file paths.
        corpus_file_name: Name of the output text file inside ``output_dir``.
        output_dir: Directory of the output file. Defaults to the notebook's
            corpus folder on Google Drive.

    Raises:
        KeyError: If a record has no ``'summary_section'`` entry or that entry
            lacks ``'orginal_text'`` / ``'summary_text'``.

    The output is opened in append mode and written once, after all files
    have been read, one newline-terminated sentence per line.
    """
    # The original pattern ended in '| ]', which can never match a single
    # character; a leading space was evidently meant, so it is part of the
    # class here. '-' is escaped so it is not read as a range.
    leading_noise = re.compile(r'[.,◆◇△▲▽▼▷▶<>0-9《/○\- ]')

    sentence_list = []

    def collect(text):
        # Shared split-and-filter step for all four text fields.
        for sentence in kss.split_sentences(text):
            if sentence and not leading_noise.match(sentence):
                sentence_list.append(sentence)

    for file_name in file_name_list:
        with open(file_name, 'r', encoding='utf-8') as one_json_file:
            one_json_sample = json.load(one_json_file)

        include_entire_original = '논문/논문요약' in file_name

        for record in one_json_sample['data']:
            # 'summary_entire' is optional. Resolve it per record so that a
            # record without it neither reuses the previous record's value
            # nor hits an unbound local on the very first record.
            entire_entries = record.get('summary_entire')
            summary_entire = entire_entries[0] if entire_entries else None
            summary_section = record['summary_section'][0]

            if summary_entire is not None:
                # 'orginal_text' is the key spelling this code reads;
                # presumably it mirrors the dataset schema - do not "fix" it
                # without checking the data.
                if include_entire_original and 'orginal_text' in summary_entire:
                    collect(summary_entire['orginal_text'])
                if 'summary_text' in summary_entire:
                    collect(summary_entire['summary_text'])

            collect(summary_section['orginal_text'])
            collect(summary_section['summary_text'])

    with open(os.path.join(output_dir, corpus_file_name), 'a', encoding="UTF-8") as fp:
        fp.writelines(sentence + "\n" for sentence in sentence_list)

In [None]:
# Build the paper-summary corpus: training files first, then validation files.
# Both calls target the same file; make_corpus_txt opens it in append mode, so
# re-running this cell adds the same sentences to the file again.
corpus_file_name = "AIHUB_summary_of_thesis_materials.txt"
make_corpus_txt(train_file_name, corpus_file_name)
make_corpus_txt(valid_file_name, corpus_file_name)