## AI Hub JSON Parsing

### Development Environment

In [None]:
# Install the kss package; it is imported below and its split_sentences()
# function is what the corpus builders in this notebook use to split text.
%pip install kss

In [None]:
# Download a shell script from GitHub and pipe it straight into bash.
# NOTE(review): the script comes from a mutable branch (master) and is executed
# without being inspected; presumably it installs Mecab for Colab (inferred from
# the file name only) - confirm, and consider pinning the URL to a commit hash.
!curl -s https://raw.githubusercontent.com/teddylee777/machine-learning/master/99-Misc/01-Colab/mecab-colab.sh | bash

In [5]:
import re
import os
import kss
import json
import pandas as pd
from glob import glob
import konlpy
from konlpy.tag import Mecab
from collections import OrderedDict

In [None]:
# Mount Google Drive at /content/drive; every dataset and corpus path used in
# this notebook is located under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

### AIHUB 행정 문서 대상 기계독해 데이터

[Source](https://aihub.or.kr/aihubdata/data/view.do?currMenu=115&topMenu=100&aihubDataSe=realm&dataSetSn=569)

In [125]:
def json_file_name_list(path_list):
    """Expand glob patterns into a training and a validation file list.

    A pattern counts as a training pattern when it contains the substring
    'rain' and as a validation pattern when it contains 'alid' (dropping the
    first letter makes the check independent of its capitalisation). 'rain'
    is tested first, so a pattern containing both counts as training.
    Patterns containing neither substring are ignored.

    Args:
        path_list: iterable of glob patterns; '**' is expanded recursively.

    Returns:
        A tuple (train_file_name, valid_file_name) of two lists of paths.
        A list is empty when no pattern of that kind was supplied or nothing
        matched; several patterns of the same kind are accumulated.
    """
    # Start from empty lists so a missing category yields [] instead of an
    # UnboundLocalError at the return statement.
    train_file_name = []
    valid_file_name = []
    for pattern in path_list:
        if 'rain' in pattern:
            train_file_name.extend(glob(pattern, recursive=True))
        elif 'alid' in pattern:
            valid_file_name.extend(glob(pattern, recursive=True))
    return train_file_name, valid_file_name

In [126]:
# Recursive glob patterns for the two splits of the administrative-document
# dataset, followed by their expansion into file lists.
# NOTE(review): the first folder is spelled 'Training' but the second one is
# lowercase 'validation' - confirm this matches the real Drive folder name,
# because a pattern that matches nothing simply yields an empty list.
path_list = [
    '/content/drive/MyDrive/AIHUB/AIHUB_행정 문서 대상 기계독해 데이터/Training/' + '/**/*.json',
    '/content/drive/MyDrive/AIHUB/AIHUB_행정 문서 대상 기계독해 데이터/validation/' + '/**/*.json',
]
(train_file_name, valid_file_name) = json_file_name_list(path_list)

In [127]:
def make_corpus_txt(file_name_list, corpus_file_name,
                    corpus_dir='/content/drive/MyDrive/AIHUB/AIHUB_corpus/'):
    """Append sentences extracted from 'multiple_choice' JSON files to a corpus file.

    For every path in file_name_list that contains 'multiple_choice', the JSON
    is loaded and, for each item of its 'data' list, the text found at
    item['paragraphs'][0]['context'] is split with kss.split_sentences().
    A sentence is kept unless it is empty or its first character matches the
    filter pattern below (punctuation, bullet symbols, digits, spaces, ...).

    Args:
        file_name_list: paths of the JSON files to read.
        corpus_file_name: name of the output text file.
        corpus_dir: directory holding the output file; defaults to the
            location this notebook has always written to.

    The output file is opened in append mode and receives one sentence per
    line. Every line is newline-terminated, so consecutive calls (for example
    train followed by validation) never merge two sentences into one line.
    """
    # Each alternative is a one-character class; the pattern is applied to the
    # first character of a sentence only.
    unwanted_first_char = re.compile(
        r'[.]|[,]|[◆]|[◇]|[△]|[▲]|[▽]|[▼]|[▷]|[▶]|[<]|[>]|[0-9]|[《]|[/]|[❍]|[-]|[ ]|[ ]|[○]|[Ⅰ]|[Ⅱ]'
    )

    sentence_list = []

    for file_name in file_name_list:
        # Files that are not 'multiple_choice' contribute nothing, so skip
        # them before paying for open() and json.load().
        if 'multiple_choice' not in file_name:
            continue

        with open(file_name, 'r', encoding='utf-8') as one_json_file:
            one_json_sample = json.load(one_json_file)

        for document in one_json_sample['data']:
            context = document['paragraphs'][0]['context']
            for sentence in kss.split_sentences(context):
                # Guard against empty strings: sentence[0] would raise IndexError.
                if not sentence:
                    continue
                if unwanted_first_char.match(sentence[0]) is None:
                    sentence_list.append(sentence)

    with open(os.path.join(corpus_dir, corpus_file_name), 'a', encoding="UTF-8") as fp:
        # Terminate every sentence so that appended batches stay line-separated.
        fp.writelines(sentence + "\n" for sentence in sentence_list)

In [128]:
# Both splits go into the same corpus file. make_corpus_txt opens the file in
# append mode, so re-running this cell adds the same sentences again.
corpus_file_name = "AIHUB_machine_reading_data_for_administrative_documents.txt"
for split_files in (train_file_name, valid_file_name):
    make_corpus_txt(split_files, corpus_file_name)

### AIHUB 도서자료 기계독해

[Source](https://aihub.or.kr/aihubdata/data/view.do?currMenu=115&topMenu=100&aihubDataSe=realm&dataSetSn=92)

In [129]:
def json_file_name_list(path_list):
    """Expand every glob pattern in path_list and return all matching paths.

    Args:
        path_list: iterable of glob patterns; '**' is expanded recursively.

    Returns:
        One list with the matches of all patterns, in pattern order. The list
        is empty when path_list is empty or nothing matched.
    """
    # Accumulate across patterns; keeping only the last glob result would drop
    # every earlier pattern, and an empty path_list would leave the result unbound.
    file_name = []
    for pattern in path_list:
        file_name.extend(glob(pattern, recursive=True))
    return file_name

In [130]:
# Recursive glob pattern covering every JSON file of the book reading
# comprehension dataset, expanded into a list of file paths.
path_list = [
    '/content/drive/MyDrive/AIHUB/AIHUB_도서자료 기계독해/' + '/**/*.json',
]
file_name = json_file_name_list(path_list)

In [131]:
def make_corpus_txt(file_name_list, corpus_file_name,
                    corpus_dir='/content/drive/MyDrive/AIHUB/AIHUB_corpus/'):
    """Append sentences extracted from the given JSON files to a corpus file.

    Each file is loaded as JSON and, for each item of its 'data' list, the
    text found at item['paragraphs'][0]['context'] is split with
    kss.split_sentences(). A sentence is kept unless it is empty or its first
    character matches the filter pattern below (punctuation, bullet symbols,
    digits, a space, ...).

    Args:
        file_name_list: paths of the JSON files to read.
        corpus_file_name: name of the output text file.
        corpus_dir: directory holding the output file; defaults to the
            location this notebook has always written to.

    The output file is opened in append mode and receives one sentence per
    line. Every line is newline-terminated, so repeated calls never merge two
    sentences into one line.
    """
    # Each alternative is a one-character class applied to the first character
    # of a sentence. The final alternative used to be written '| ]', which can
    # never match a single character; it is now the intended '[ ]'.
    unwanted_first_char = re.compile(
        r'[.]|[,]|[◆]|[◇]|[△]|[▲]|[▽]|[▼]|[▷]|[▶]|[<]|[>]|[0-9]|[《]|[/]|[○]|[-]|[ ]'
    )

    sentence_list = []

    for file_name in file_name_list:
        with open(file_name, 'r', encoding='utf-8') as one_json_file:
            one_json_sample = json.load(one_json_file)

        for document in one_json_sample['data']:
            context = document['paragraphs'][0]['context']
            for sentence in kss.split_sentences(context):
                # Guard against empty strings: sentence[0] would raise IndexError.
                if not sentence:
                    continue
                if unwanted_first_char.match(sentence[0]) is None:
                    sentence_list.append(sentence)

    with open(os.path.join(corpus_dir, corpus_file_name), 'a', encoding="UTF-8") as fp:
        # Terminate every sentence so that appended batches stay line-separated.
        fp.writelines(sentence + "\n" for sentence in sentence_list)

In [132]:
# Build the corpus for the book dataset. make_corpus_txt opens the output in
# append mode, so re-running this cell adds the same sentences again.
corpus_file_name = "AIHUB_reading_books_by_machine.txt"
make_corpus_txt(file_name_list=file_name, corpus_file_name=corpus_file_name)

### AIHUB 특허분야 자동분류 데이터

[Source](https://aihub.or.kr/aihubdata/data/view.do?currMenu=116&topMenu=100&aihubDataSe=ty&dataSetSn=547)

In [7]:
def json_file_name_list(path_list):
    """Expand glob patterns into a training and a validation file list.

    A pattern counts as a training pattern when it contains the substring
    'rain' and as a validation pattern when it contains 'alid' (dropping the
    first letter makes the check independent of its capitalisation). 'rain'
    is tested first, so a pattern containing both counts as training.
    Patterns containing neither substring are ignored.

    Args:
        path_list: iterable of glob patterns; '**' is expanded recursively.

    Returns:
        A tuple (train_file_name, valid_file_name) of two lists of paths.
        A list is empty when no pattern of that kind was supplied or nothing
        matched; several patterns of the same kind are accumulated.
    """
    # Start from empty lists so a missing category yields [] instead of an
    # UnboundLocalError at the return statement.
    train_file_name = []
    valid_file_name = []
    for pattern in path_list:
        if 'rain' in pattern:
            train_file_name.extend(glob(pattern, recursive=True))
        elif 'alid' in pattern:
            valid_file_name.extend(glob(pattern, recursive=True))
    return train_file_name, valid_file_name

In [8]:
# Recursive glob patterns for the two splits of the patent classification
# dataset, followed by their expansion into file lists.
# NOTE(review): 'Validatoin' looks like a misspelling of 'Validation'. The
# path is left untouched because it has to match the real Drive folder name;
# confirm it, since a pattern that matches nothing simply yields an empty list.
path_list = [
    '/content/drive/MyDrive/AIHUB/AIHUB_특허 분야 자동분류 데이터/Training/' + '/**/*.json',
    '/content/drive/MyDrive/AIHUB/AIHUB_특허 분야 자동분류 데이터/Validatoin/' + '/**/*.json',
]
(train_file_name, valid_file_name) = json_file_name_list(path_list)

In [24]:
def make_corpus_txt(file_name_list, corpus_file_name,
                    corpus_dir='/content/drive/MyDrive/AIHUB/AIHUB_corpus/'):
    """Append sentences extracted from record abstracts to a corpus file.

    Each file is loaded as JSON and, for each record of its 'dataset' list,
    the text stored under 'abstract' is split with kss.split_sentences().
    Records without an 'abstract' (or with an empty one) are skipped. A
    sentence is kept unless it is empty or its first character matches the
    filter pattern below (punctuation, bullet symbols, digits, a space, ...).

    Args:
        file_name_list: paths of the JSON files to read.
        corpus_file_name: name of the output text file.
        corpus_dir: directory holding the output file; defaults to the
            location this notebook has always written to.

    The output file is opened in append mode and receives one sentence per
    line. Every line is newline-terminated, so consecutive calls (for example
    train followed by validation) never merge two sentences into one line.
    """
    # Each alternative is a one-character class applied to the first character
    # of a sentence. The final alternative used to be written '| ]', which can
    # never match a single character; it is now the intended '[ ]'.
    unwanted_first_char = re.compile(
        r'[.]|[,]|[◆]|[◇]|[△]|[▲]|[▽]|[▼]|[▷]|[▶]|[<]|[>]|[0-9]|[《]|[/]|[○]|[-]|[ ]'
    )

    sentence_list = []

    for file_name in file_name_list:
        with open(file_name, 'r', encoding='utf-8') as one_json_file:
            one_json_sample = json.load(one_json_file)

        for record in one_json_sample['dataset']:
            # Explicit check instead of a broad try/except KeyError, which
            # would also hide KeyErrors raised while splitting or filtering.
            abstract = record.get('abstract')
            if not abstract:
                continue
            for sentence in kss.split_sentences(abstract):
                # Guard against empty strings: sentence[0] would raise IndexError.
                if not sentence:
                    continue
                if unwanted_first_char.match(sentence[0]) is None:
                    sentence_list.append(sentence)

    with open(os.path.join(corpus_dir, corpus_file_name), 'a', encoding="UTF-8") as fp:
        # Terminate every sentence so that appended batches stay line-separated.
        fp.writelines(sentence + "\n" for sentence in sentence_list)

In [27]:
# Both splits go into the same corpus file. make_corpus_txt opens the file in
# append mode, so re-running this cell adds the same sentences again.
corpus_file_name = "AIHUB_automatic_patent_classification_data.txt"
for split_files in (train_file_name, valid_file_name):
    make_corpus_txt(split_files, corpus_file_name)