## AIHub JSON/XML Parsing

### Development Environment

In [None]:
# Install kss, whose split_sentences() is used below to split text into sentences.
%pip install kss

In [None]:
# Download a Mecab setup script for Colab and execute it.
# NOTE(review): this pipes a remote script straight into bash; consider pinning
# the URL to a commit hash or inspecting the script before running it.
!curl -s https://raw.githubusercontent.com/teddylee777/machine-learning/master/99-Misc/01-Colab/mecab-colab.sh | bash

In [6]:
import re
import os
import kss
import json
import pandas as pd
from glob import glob
import konlpy
from konlpy.tag import Mecab
from lxml import etree
import xml.etree.ElementTree as ET
from collections import OrderedDict

In [None]:
# Mount Google Drive at /content/drive; every dataset and corpus path below
# lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

### AIHUB 일반상식

[Source](https://aihub.or.kr/aihubdata/data/view.do?currMenu=116&topMenu=100&aihubDataSe=ty&dataSetSn=106)

In [None]:
def json_file_name_list(path_list):
    """Collect every file matched by the glob patterns in ``path_list``.

    Args:
        path_list: Iterable of glob patterns. ``**`` is expanded recursively.

    Returns:
        One flat list holding the matches of all patterns, in pattern order.
        The list is empty when ``path_list`` is empty or nothing matches.
    """
    file_name = []
    for pattern in path_list:
        # Accumulate rather than overwrite, so earlier patterns are not lost
        # and an empty path_list still yields a defined result.
        file_name.extend(glob(pattern, recursive=True))
    return file_name

In [None]:
# Recursive glob pattern for every JSON file of the general-common-sense dataset.
path_list = ['/content/drive/MyDrive/AIHUB/AIHUB_일반상식/'+ '/**/*.json']
# Resolve the pattern into the list of JSON file paths.
file_name = json_file_name_list(path_list)

In [None]:
def make_corpus_txt(file_name_list, corpus_file_name, *,
                    corpus_dir='/content/drive/MyDrive/AIHUB/AIHUB_corpus/',
                    split_sentences=None):
    """Append the sentences of the general-common-sense JSON files to a corpus file.

    Files whose path contains ``ko_wiki_v1_squad`` are read as
    ``data[*].paragraphs[0].context``; every other file is read as
    ``sentence[*].text``. Only sentences that end with "." (and are not just
    ".") are kept.

    Args:
        file_name_list: Paths of the JSON files to read.
        corpus_file_name: Name of the output text file inside ``corpus_dir``.
        corpus_dir: Directory of the output file. Defaults to the Drive
            corpus folder used throughout this notebook.
        split_sentences: Callable mapping a text to an iterable of sentences.
            Defaults to ``kss.split_sentences``.

    Returns:
        None. The sentences are appended to the corpus file, one per line.
    """
    if split_sentences is None:
        split_sentences = kss.split_sentences

    sentence_list = []
    for path in file_name_list:
        with open(path, 'r', encoding='utf-8') as one_json_file:
            one_json_sample = json.load(one_json_file)

        # The two file families store their text under different keys.
        if 'ko_wiki_v1_squad' in path:
            texts = [item['paragraphs'][0]['context'] for item in one_json_sample['data']]
        else:
            texts = [item['text'] for item in one_json_sample['sentence']]

        for text in texts:
            for sentence in split_sentences(text):
                # endswith() is safe on an empty sentence, unlike sentence[-1].
                if sentence.endswith(".") and sentence != ".":
                    sentence_list.append(sentence)

    with open(os.path.join(corpus_dir, corpus_file_name), 'a', encoding="UTF-8") as fp:
        if sentence_list:
            # Terminate the block with a newline so a later append cannot glue
            # its first sentence onto the last line written here.
            fp.write("\n".join(sentence_list) + "\n")

In [None]:
# Output file name for the general-common-sense corpus.
corpus_file_name = "AIHUB_general_common_sense.txt"
# Extract the sentences from every collected JSON file and append them to the corpus file.
make_corpus_txt(file_name, corpus_file_name)

### AIHUB 법률 규정 (판결서 약관 등) 텍스트 분석 데이터

[Source](https://aihub.or.kr/aihubdata/data/view.do?currMenu=116&topMenu=100&aihubDataSe=ty&dataSetSn=580)

In [13]:
def xml_file_name_list(path_list):
    """Resolve glob patterns into separate training and validation file lists.

    A pattern containing ``rain`` (as in "Training"/"training") feeds the
    training list; otherwise a pattern containing ``alid`` (as in
    "Validation"/"validation") feeds the validation list. Patterns matching
    neither are ignored.

    Args:
        path_list: Iterable of glob patterns. ``**`` is expanded recursively.

    Returns:
        Tuple ``(train_file_name, valid_file_name)`` of path lists. A split
        with no pattern is returned as an empty list.
    """
    # Start from empty lists so a missing split no longer raises
    # UnboundLocalError, and several patterns per split accumulate.
    train_file_name, valid_file_name = [], []
    for pattern in path_list:
        if 'rain' in pattern:
            train_file_name.extend(glob(pattern, recursive=True))
        elif 'alid' in pattern:
            valid_file_name.extend(glob(pattern, recursive=True))
    return train_file_name, valid_file_name

In [8]:
# Recursive glob patterns for the Training and Validation XML files of the legal dataset.
path_list = ['/content/drive/MyDrive/AIHUB/AIHUB_법률 규정 (판결서 약관 등) 텍스트 분석 데이터/Training/' + '/**/*.xml',
             '/content/drive/MyDrive/AIHUB/AIHUB_법률 규정 (판결서 약관 등) 텍스트 분석 데이터/Validation/' + '/**/*.xml']

# Resolve the patterns into one file list per split.
train_file_name, valid_file_name = xml_file_name_list(path_list)

In [11]:
def _legal_keep(sentence, skipped_first_chars):
    """Return True when ``sentence`` is non-empty and does not open with a skipped character."""
    return bool(sentence) and sentence[0] not in skipped_first_chars


def _legal_section(sentences, marker, skipped_first_chars):
    """Return the kept sentences lying between the first and second occurrence of ``marker``."""
    collected = []
    marker_hits = 0
    for sentence in sentences:
        if marker in sentence:
            # Detect the marker before filtering on the first character: a
            # heading such as "2. ..." starts with a digit and would otherwise
            # be dropped before it could be counted.
            marker_hits += 1
            if marker_hits > 1:
                break
            continue
        if marker_hits == 1 and _legal_keep(sentence, skipped_first_chars):
            collected.append(sentence)
    return collected


def make_corpus_txt(file_name_list, corpus_file_name, *,
                    corpus_dir='/content/drive/MyDrive/AIHUB/AIHUB_corpus/',
                    split_sentences=None):
    """Append sentences extracted from the legal XML files to a corpus file.

    The text of each document is taken from ``root[0][2]``. Files that cannot
    be read or parsed, lack that element, or have no text there are skipped.

    * Judgment files (path contains ``01.민사``, ``02.형사`` or ``03.행정``):
      whitespace is collapsed and only the sentences between the first and
      second occurrence of the category's section heading are kept.
      NOTE(review): this is the presumed intent of the original counter
      logic, which could never collect anything - confirm against the data.
    * Terms-and-conditions files (path contains ``약관``): newlines, article
      headings and similar noise are removed before splitting.

    In both cases sentences that open with punctuation, a digit or a bullet
    symbol are dropped, and the kept sentences are de-duplicated in order.

    Args:
        file_name_list: Paths of the XML files to read.
        corpus_file_name: Name of the output text file inside ``corpus_dir``.
        corpus_dir: Directory of the output file. Defaults to the Drive
            corpus folder used throughout this notebook.
        split_sentences: Callable mapping a text to an iterable of sentences.
            Defaults to ``kss.split_sentences``.

    Returns:
        None. The sentences are appended to the corpus file, one per line.
    """
    if split_sentences is None:
        split_sentences = kss.split_sentences

    # Same single characters the original first-character regex accepted. Its
    # last alternative (" ]") needs two characters and could never match.
    skipped_first_chars = frozenset('.,◆◇△▲▽▼▷▶<>0123456789《/○-')
    # Path fragment of each judgment type and the heading opening its section.
    section_markers = (
        ('01.민사', '2. 원고의 청구에 대한 판단'),
        ('02.형사', '판례 검색'),
        ('03.행정', '관계 법령'),
    )
    # NOTE(review): the "." in "[0-9].  " is unescaped and matches any
    # character; kept as-is, confirm whether "\." was meant.
    clause_noise = re.compile(r"\n|제+[0-9]+조+ (\([^)]*\))|00000 약관|[0-9].  |")
    parser = etree.XMLParser(remove_blank_text=True, recover=True)

    sentence_list = []
    for path in file_name_list:
        try:
            # Read bytes so lxml handles the encoding; the handle is closed.
            with open(path, 'rb') as xml_file:
                root = etree.XML(xml_file.read(), parser)
            root_text = root[0][2].text
        except (OSError, etree.LxmlError, IndexError, TypeError):
            # Skip this file instead of reusing the previous file's tree.
            continue
        if root_text is None:
            # str(None) would otherwise put the literal "None" into the corpus.
            continue

        # Test the judgment fragments before '약관': the dataset folder name
        # itself contains '약관', so testing it first would claim every file.
        marker = next((m for key, m in section_markers if key in path), None)
        if marker is not None:
            flat_text = ' '.join(root_text.replace("\n", "").split())
            sentence_list.extend(
                _legal_section(split_sentences(flat_text), marker, skipped_first_chars)
            )
        elif '약관' in path:
            cleaned_text = clause_noise.sub("", root_text)
            sentence_list.extend(
                sentence for sentence in split_sentences(cleaned_text)
                if _legal_keep(sentence, skipped_first_chars)
            )

    # Drop duplicates while keeping first-seen order, and write that list.
    only_sentence_list = list(OrderedDict.fromkeys(sentence_list))
    with open(os.path.join(corpus_dir, corpus_file_name), 'a', encoding="UTF-8") as fp:
        if only_sentence_list:
            # The trailing newline keeps successive appends (training, then
            # validation) from fusing two sentences onto one line.
            fp.write("\n".join(only_sentence_list) + "\n")

In [12]:
# Output file name for the legal-text corpus.
corpus_file_name = "AIHUB_legal_regulations_(such_as_terms_and_conditions_of_judgment)_text_analysis_data.txt"
# Both splits are appended to the same corpus file (the file is opened in append mode).
make_corpus_txt(train_file_name, corpus_file_name)
make_corpus_txt(valid_file_name, corpus_file_name)

### AIHUB 뉴스기사 기계독해 데이터

[Source](https://aihub.or.kr/aihubdata/data/view.do?currMenu=115&topMenu=100&aihubDataSe=realm&dataSetSn=577)

In [9]:
def json_file_name_list(path_list):
    """Resolve glob patterns into separate training and validation file lists.

    A pattern containing ``rain`` (as in "Training"/"training") feeds the
    training list; otherwise a pattern containing ``alid`` (as in
    "Validation"/"validation") feeds the validation list. Patterns matching
    neither are ignored.

    Args:
        path_list: Iterable of glob patterns. ``**`` is expanded recursively.

    Returns:
        Tuple ``(train_file_name, valid_file_name)`` of path lists. A split
        with no pattern is returned as an empty list.
    """
    # Start from empty lists so a missing split no longer raises
    # UnboundLocalError, and several patterns per split accumulate.
    train_file_name, valid_file_name = [], []
    for pattern in path_list:
        if 'rain' in pattern:
            train_file_name.extend(glob(pattern, recursive=True))
        elif 'alid' in pattern:
            valid_file_name.extend(glob(pattern, recursive=True))
    return train_file_name, valid_file_name

In [10]:
# Recursive glob patterns for the Training and Validation JSON files of the news dataset.
path_list = ['/content/drive/MyDrive/AIHUB/AIHUB_뉴스 기사 기계독해 데이터/Training/'+ '/**/*.json', 
             '/content/drive/MyDrive/AIHUB/AIHUB_뉴스 기사 기계독해 데이터/Validation/'+ '/**/*.json']
# Resolve the patterns into one file list per split.
train_file_name, valid_file_name = json_file_name_list(path_list)

In [37]:
def make_corpus_txt(file_name_list, corpus_file_name, *,
                    corpus_dir='/content/drive/MyDrive/AIHUB/AIHUB_corpus/',
                    split_sentences=None):
    """Append the sentences of the news machine-reading JSON files to a corpus file.

    For every entry of ``data`` in each file, the text at
    ``paragraphs[0].context`` is split into sentences. Sentences that open
    with punctuation, a digit or a bullet symbol are dropped.

    Args:
        file_name_list: Paths of the JSON files to read.
        corpus_file_name: Name of the output text file inside ``corpus_dir``.
        corpus_dir: Directory of the output file. Defaults to the Drive
            corpus folder used throughout this notebook.
        split_sentences: Callable mapping a text to an iterable of sentences.
            Defaults to ``kss.split_sentences``.

    Returns:
        None. The sentences are appended to the corpus file, one per line.
    """
    if split_sentences is None:
        split_sentences = kss.split_sentences

    # Same single characters the original first-character regex accepted. Its
    # last alternative (" ]") needs two characters and could never match.
    skipped_first_chars = frozenset('.,◆◇△▲▽▼▷▶<>0123456789《/○-')

    sentence_list = []
    for path in file_name_list:
        # Read the file being iterated, not a fixed entry of a global list.
        with open(path, 'r', encoding='utf-8-sig', errors='ignore') as one_json_file:
            one_json_sample = json.load(one_json_file, strict=False)

        for article in one_json_sample['data']:
            for sentence in split_sentences(article['paragraphs'][0]['context']):
                # The emptiness check avoids IndexError on sentence[0].
                if sentence and sentence[0] not in skipped_first_chars:
                    sentence_list.append(sentence)

    with open(os.path.join(corpus_dir, corpus_file_name), 'a', encoding="UTF-8") as fp:
        if sentence_list:
            # The trailing newline keeps successive appends (training, then
            # validation) from fusing two sentences onto one line.
            fp.write("\n".join(sentence_list) + "\n")

In [46]:
# Output file name for the news machine-reading corpus.
corpus_file_name = "AIHUB_news_article_machine_reading_data.txt"
# Both splits are appended to the same corpus file (the file is opened in append mode).
make_corpus_txt(train_file_name, corpus_file_name)
make_corpus_txt(valid_file_name, corpus_file_name)