## AI Hub JSON Parsing

### Development Environment

In [1]:
import re
import os
import kss
import json
import MeCab
import pandas as pd
from glob import glob
from Korpora import Korpora

### AIHUB 대규모 웹데이터 기반 한국어 말뭉치 데이터

[Source](https://www.aihub.or.kr/aihubdata/data/view.do?currMenu=115&topMenu=100&aihubDataSe=realm&dataSetSn=624)

In [16]:
def json_file_name_list(path_list):
    """Resolve glob patterns into training and validation JSON file lists.

    A pattern counts as a training pattern when it contains the substring
    ``'rain'`` and as a validation pattern when it contains ``'alid'``; the
    training check is evaluated first, so it wins when both substrings are
    present. Patterns containing neither substring are ignored.

    Args:
        path_list: Iterable of glob patterns (``**`` is expanded recursively).

    Returns:
        A tuple ``(train_file_name, valid_file_name)`` of lists of matched
        paths. A list is empty when no pattern of that kind was supplied or
        when nothing matched.
    """
    # Start from empty lists so that a missing split yields [] instead of an
    # UnboundLocalError, and so several patterns of one kind accumulate.
    train_file_name = []
    valid_file_name = []
    for pattern in path_list:
        if 'rain' in pattern:
            train_file_name.extend(glob(pattern, recursive=True))
        elif 'alid' in pattern:
            valid_file_name.extend(glob(pattern, recursive=True))
    return train_file_name, valid_file_name

In [17]:
# Recursive glob patterns for the Training and Validation source JSON files.
path_list = [
    'AIHUB_웹데이터 기반 한국어 말뭉치 데이터/Training/원천데이터/TS1/' + '**/*.json',
    'AIHUB_웹데이터 기반 한국어 말뭉치 데이터/Validation/원천데이터/VS1/' + '**/*.json',
]

# The helper returns the matched files as a (training, validation) pair.
train_file_name, valid_file_name = json_file_name_list(path_list=path_list)

In [29]:
def make_corpus_txt(file_name_list, corpus_file_name):
    """Append sentences from web-corpus JSON files to a text file, one per line.

    For every JSON file, the ``content`` column of ``['SJML']['text']`` is
    read and each value is split into sentences with ``kss.split_sentences``.
    A sentence is dropped when it is empty or when its first character is one
    of ``. , ◆ ◇ △ ▲ ▽ ▼ ▷ ▶ < >``.

    The output ``AIHUB_corpus/<corpus_file_name>`` is opened in append mode, so
    repeated calls with the same name extend the file (and re-running a call
    duplicates its sentences). Every sentence is terminated with a newline so
    that consecutive calls never fuse two sentences onto one line.

    Args:
        file_name_list: Paths of the JSON files to read (UTF-8).
        corpus_file_name: File name of the corpus inside ``AIHUB_corpus/``.

    Returns:
        None. The result is the text appended to the corpus file.
    """
    # Compiled once; the pattern is only ever applied to a single character.
    leading_symbol = re.compile(r'[.]|[,]|[◆]|[◇]|[△]|[▲]|[▽]|[▼]|[▷]|[▶]|[<]|[>]')

    sentence_list = []
    for json_path in file_name_list:
        with open(json_path, 'r', encoding='utf-8') as one_json_file:
            one_json_sample = json.load(one_json_file)

        source_list = list(pd.DataFrame(one_json_sample['SJML']['text'])['content'])
        for source in source_list:
            for sentence in kss.split_sentences(source):
                if not sentence:
                    # Guard: indexing sentence[0] on an empty string would raise.
                    continue
                if leading_symbol.match(sentence[0]) is None:
                    sentence_list.append(sentence)

    # Create the output folder on first use instead of failing on open().
    os.makedirs('AIHUB_corpus/', exist_ok=True)
    with open(os.path.join('AIHUB_corpus/', corpus_file_name), "a", encoding='utf-8') as fp:
        # A newline after every sentence keeps appended batches line-separated.
        fp.writelines(sentence + "\n" for sentence in sentence_list)

In [31]:
corpus_file_name = "AIHUB_web_data_based_korean_corpus_data_source.txt"

# Training files first, then validation files, both into the same corpus file.
for split_file_names in (train_file_name, valid_file_name):
    make_corpus_txt(split_file_names, corpus_file_name)

### AIHUB 기계독해

[Source](https://www.aihub.or.kr/aihubdata/data/view.do?currMenu=115&topMenu=100&aihubDataSe=realm&dataSetSn=89)

In [2]:
def json_file_name_list(path_list):
    """Resolve every glob pattern in ``path_list`` into one flat list of paths.

    Args:
        path_list: Iterable of glob patterns (``**`` is expanded recursively).

    Returns:
        A list with the matches of all patterns, in pattern order. The list is
        empty when ``path_list`` is empty or nothing matched.
    """
    # Accumulate across patterns: previously only the last pattern's matches
    # survived, and an empty path_list raised UnboundLocalError.
    file_name = []
    for pattern in path_list:
        file_name.extend(glob(pattern, recursive=True))
    return file_name

In [3]:
# One recursive pattern covering every JSON file below the dataset folder.
path_list = [
    'AIHUB_기계독해' + '/**/*.json',
]
file_name = json_file_name_list(path_list=path_list)

In [5]:
def make_corpus_txt(file_name_list, corpus_file_name):
    """Append sentences from machine-reading JSON files to a text file.

    For every JSON file, each entry of ``['data']`` contributes the
    ``context`` of its first paragraph, which is split into sentences with
    ``kss.split_sentences``. A sentence is dropped when it is empty or when
    its first character is one of ``. , ◆ ◇ △ ▲ ▽ ▼ ▷ ▶ < >``.

    The output ``AIHUB_corpus/<corpus_file_name>`` is opened in append mode, so
    repeated calls with the same name extend the file (and re-running a call
    duplicates its sentences). Every sentence is terminated with a newline so
    that consecutive calls never fuse two sentences onto one line.

    Args:
        file_name_list: Paths of the JSON files to read (UTF-8).
        corpus_file_name: File name of the corpus inside ``AIHUB_corpus/``.

    Returns:
        None. The result is the text appended to the corpus file.
    """
    # Compiled once; the pattern is only ever applied to a single character.
    leading_symbol = re.compile(r'[.]|[,]|[◆]|[◇]|[△]|[▲]|[▽]|[▼]|[▷]|[▶]|[<]|[>]')

    sentence_list = []
    for json_path in file_name_list:
        with open(json_path, 'r', encoding='utf-8') as one_json_file:
            one_json_sample = json.load(one_json_file)

        for entry in one_json_sample['data']:
            # NOTE(review): only paragraphs[0] is read, as in the original;
            # confirm entries never carry more than one paragraph.
            for sentence in kss.split_sentences(entry['paragraphs'][0]['context']):
                if not sentence:
                    # Guard: indexing sentence[0] on an empty string would raise.
                    continue
                if leading_symbol.match(sentence[0]) is None:
                    sentence_list.append(sentence)

    # The output file is opened exactly once, after all inputs are parsed
    # (the former per-input open never wrote anything).
    os.makedirs('AIHUB_corpus/', exist_ok=True)
    with open(os.path.join('AIHUB_corpus/', corpus_file_name), "a", encoding='utf-8') as fp:
        # A newline after every sentence keeps appended batches line-separated.
        fp.writelines(sentence + "\n" for sentence in sentence_list)

In [6]:
# Target corpus file for the machine-reading sentences.
corpus_file_name = "AIHUB_machine_reading.txt"
make_corpus_txt(file_name_list=file_name, corpus_file_name=corpus_file_name)

### AIHUB 요약문 및 레포트 생성 데이터

[Source](https://www.aihub.or.kr/aihubdata/data/view.do?currMenu=115&topMenu=100&aihubDataSe=realm&dataSetSn=582)

In [17]:
# Category sub-folders; the same ten exist under both dataset splits.
category_dirs = ['01.news_r', '02.briefing', '03.his_cul', '04.paper', '05.minute',
                 '06.edit', '07.public', '08.speech', '09.literature', '10.narration']

# Root folders of the Training and Validation source data, in that order.
split_roots = ['AIHUB_요약문 및 레포트 생성 데이터/Training/원천데이터/TS1/',
               'AIHUB_요약문 및 레포트 생성 데이터/Validation/원천데이터/VS1/']

# Twenty recursive glob patterns: indices 0-9 are the Training categories,
# indices 10-19 the Validation categories (same order as category_dirs).
path_list = [split_root + category_dir + '/' + '**/*.json'
             for split_root in split_roots
             for category_dir in category_dirs]

# One list of matched files per pattern, in path_list order.
file_names_by_pattern = [glob(pattern, recursive=True) for pattern in path_list]

# Keep the individually named lists that the later cells refer to.
(train_file_name_01, train_file_name_02, train_file_name_03, train_file_name_04,
 train_file_name_05, train_file_name_06, train_file_name_07, train_file_name_08,
 train_file_name_09, train_file_name_10,
 valid_file_name_01, valid_file_name_02, valid_file_name_03, valid_file_name_04,
 valid_file_name_05, valid_file_name_06, valid_file_name_07, valid_file_name_08,
 valid_file_name_09, valid_file_name_10) = file_names_by_pattern

In [20]:
def make_corpus_txt(file_name_list, corpus_file_name):
    """Append sentences from summary/report JSON files to a text file.

    For every JSON file, the text at ``['Meta(Refine)']['passage']`` is split
    into sentences with ``kss.split_sentences``. A sentence is dropped when it
    is empty or when its first character is one of
    ``. , ◆ ◇ △ ▲ ▽ ▼ ▷ ▶ < >``.

    The output ``AIHUB_corpus/<corpus_file_name>`` is opened in append mode, so
    repeated calls with the same name extend the file (and re-running a call
    duplicates its sentences). Every sentence is terminated with a newline so
    that consecutive calls never fuse two sentences onto one line.

    Args:
        file_name_list: Paths of the JSON files to read (UTF-8).
        corpus_file_name: File name of the corpus inside ``AIHUB_corpus/``.

    Returns:
        None. The result is the text appended to the corpus file.
    """
    # Compiled once; the pattern is only ever applied to a single character.
    leading_symbol = re.compile(r'[.]|[,]|[◆]|[◇]|[△]|[▲]|[▽]|[▼]|[▷]|[▶]|[<]|[>]')

    sentence_list = []
    for json_path in file_name_list:
        with open(json_path, 'r', encoding='utf-8') as one_json_file:
            one_json_sample = json.load(one_json_file)

        for sentence in kss.split_sentences(one_json_sample['Meta(Refine)']['passage']):
            if not sentence:
                # Guard: indexing sentence[0] on an empty string would raise.
                continue
            if leading_symbol.match(sentence[0]) is None:
                sentence_list.append(sentence)

    # Create the output folder on first use instead of failing on open().
    os.makedirs('AIHUB_corpus/', exist_ok=True)
    with open(os.path.join('AIHUB_corpus/', corpus_file_name), "a", encoding='utf-8') as fp:
        # A newline after every sentence keeps appended batches line-separated.
        fp.writelines(sentence + "\n" for sentence in sentence_list)

In [None]:
corpus_file_name = "AIHUB_summary_and_report_generation_data.txt"
make_corpus_txt(train_file_name_01 , corpus_file_name)
make_corpus_txt(train_file_name_02 , corpus_file_name)
make_corpus_txt(train_file_name_03 , corpus_file_name)
make_corpus_txt(train_file_name_04 , corpus_file_name)
make_corpus_txt(train_file_name_05 , corpus_file_name)
make_corpus_txt(train_file_name_06 , corpus_file_name)
make_corpus_txt(train_file_name_07 , corpus_file_name)
make_corpus_txt(train_file_name_08 , corpus_file_name)
make_corpus_txt(train_file_name_09 , corpus_file_name)
make_corpus_txt(train_file_name_10 , corpus_file_name)

In [None]:
# Process the ten validation file lists in order, into the corpus file named
# by the current value of corpus_file_name.
for valid_file_names in (
    valid_file_name_01, valid_file_name_02, valid_file_name_03, valid_file_name_04,
    valid_file_name_05, valid_file_name_06, valid_file_name_07, valid_file_name_08,
    valid_file_name_09, valid_file_name_10,
):
    make_corpus_txt(valid_file_names, corpus_file_name)