<a href="https://colab.research.google.com/github/maedaak/ndlsearch_oai2csv/blob/master/ndlsearch_oai2csv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from google.colab import files
import xml.etree.ElementTree as ET
import pandas as pd
import csv
import sys

# Prefix -> namespace URI map handed to ElementTree's find()/findall() so the
# XPath expressions below can use short prefixes such as "dc:" or "dcndl:".
NS = {
    'xml': 'http://www.w3.org/XML/1998/namespace',
    'oai': 'http://www.openarchives.org/OAI/2.0/',
    'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
    'rdfs': 'http://www.w3.org/2000/01/rdf-schema#',
    'owl': 'http://www.w3.org/2002/07/owl#',
    'dc': 'http://purl.org/dc/elements/1.1/',
    'dcterms': 'http://purl.org/dc/terms/',
    'foaf': 'http://xmlns.com/foaf/0.1/',
    'dcndl_simple': 'http://ndl.go.jp/dcndl/dcndl_simple/',
    'dcndl': 'http://ndl.go.jp/dcndl/terms/',
    'xsi': 'http://www.w3.org/2001/XMLSchema-instance',
}

# Attribute keys in the "{namespace-uri}localname" form that ElementTree uses
# for namespaced attributes in Element.attrib.
xsi_type = '{' + NS["xsi"] + '}type'
rdf_type = '{' + NS["rdf"] + '}type'
rdf_resource = '{' + NS["rdf"] + '}resource'
xml_lang = '{' + NS["xml"] + '}lang'

# Location of the records relative to the document root, and of the metadata
# payload relative to each record.
record_path = "./oai:ListRecords/oai:record"
metadata_path = "./oai:metadata/dcndl_simple:dc"

# CSV header.  The order must match the row list assembled in
# dcndlsimple_parse(), because the names are applied positionally.
columns = [
    'dc:title', 'dcndl:titleTranscription', 'dcterms:alternative',
    'dcndl:alternativeTranscription', 'dcndl:volume',
    'dcndl:volumeTranscription',
    'dcndl:volumeTitle', 'dcndl:volumeTitleTranscription',
    'dcndl:seriesTitle', 'dcndl:seriesTitleTranscription',
    'dcndl:partTitle', 'dcndl:partTitleTranscription',
    'dc:creator', 'dcndl:creatorTranscription',
    # Was spelled 'dcndl_partCreator'; every other header uses "prefix:name".
    'dcndl:seriesCreator', 'dcndl:partCreator', 'dcndl:edition',
    'dc:publisher', 'dcndl:publicationPlace', 'dc:date',
    'dcterms:issued', 'dcndl:digitizedPublisher',
    'dcndl:dateDigitized',
    'dc:subject', 'dcndl:NDLC', 'dcndl:NDC9',
    'dc:language',
    'dcterms:description', 'dcterms:abstract',
    'dcndl:DOI', 'dcndl:ISBN', 'dcndl:NDLBibID',
    'dcndl:JPNO',
    'dcndl:publicationName', 'dcndl:publicationVolume',
    'dcndl:number', 'dcndl:issue', 'dcndl:pageRange',
    'dcndl:materialType', 'dcterms:rights',
    'dcterms:extent',
    'source_url',
]

# Name of the CSV file written (and then offered for download) by the script.
output_csv = 'dcndl.csv'


In [None]:
def single(metadata, path):
    """Return the text of the first element matching *path* under *metadata*.

    Prefixes in *path* are resolved through the module-level ``NS`` map.
    An empty string is returned when nothing matches or when the matched
    element has no (or empty) text.
    """
    node = metadata.find(path, NS)
    if not ET.iselement(node):
        return ""
    return node.text if node.text else ""


In [None]:
def repeat(metadata, path):
    """Return the texts of all elements matching *path*, joined with ``|``.

    Prefixes in *path* are resolved through the module-level ``NS`` map.
    Elements without text are skipped; an empty string is returned when
    there is nothing to join.
    """
    matches = metadata.findall(path, NS)
    if not matches or not ET.iselement(matches[0]):
        return ""
    return "|".join(node.text for node in matches if node.text)


In [None]:
def dcndlsimple_parse(data, data_row):
    """Parse an XML document and append one CSV row per record to *data_row*.

    Records are located with ``record_path`` and, inside each record, the
    metadata element with ``metadata_path`` (the record itself is used when
    ``metadata_path`` is empty).  Records without a metadata element are
    skipped.

    Args:
        data: The XML document, as accepted by ``ET.fromstring``.
        data_row: List that receives the rows; it is mutated in place.  Each
            row is a list of strings in the same order as ``columns``.

    Returns:
        None.

    Raises:
        xml.etree.ElementTree.ParseError: If *data* is not well-formed XML.
    """
    root = ET.fromstring(data)
    for record in root.findall(record_path, NS):
        metadata = record.find(metadata_path, NS) if metadata_path else record
        if not ET.iselement(metadata):
            # Nothing to extract from this record.
            continue

        # --- titles -------------------------------------------------------
        dc_title = repeat(metadata, "./dc:title")
        dcndl_titleTranscription = \
            single(metadata, "./dcndl:titleTranscription")
        dcterms_alternative = single(metadata, "./dcterms:alternative")
        dcndl_alternativeTranscription = \
            single(metadata, "./dcndl:alternativeTranscription")
        dcndl_volume = single(metadata, "./dcndl:volume")
        dcndl_volumeTranscription = \
            single(metadata, "./dcndl:volumeTranscription")
        dcndl_volumeTitle = single(metadata, "./dcndl:volumeTitle")
        dcndl_volumeTitleTranscription = \
            single(metadata, "./dcndl:volumeTitleTranscription")
        dcndl_seriesTitle = single(metadata, "./dcndl:seriesTitle")
        dcndl_seriesTitleTranscription = \
            single(metadata, "./dcndl:seriesTitleTranscription")
        dcndl_partTitle = repeat(metadata, "./dcndl:partTitle")
        dcndl_partTitleTranscription = \
            repeat(metadata, "./dcndl:partTitleTranscription")

        # --- creators / publication ---------------------------------------
        dc_creator = repeat(metadata, "./dc:creator")
        dcndl_creatorTranscription = \
            repeat(metadata, "./dcndl:creatorTranscription")
        dcndl_seriesCreator = repeat(metadata, "./dcndl:seriesCreator")
        dcndl_partCreator = repeat(metadata, "./dcndl:partCreator")
        dcndl_edition = repeat(metadata, "./dcndl:edition")
        dc_publisher = repeat(metadata, "./dc:publisher")
        dcndl_publicationPlace = repeat(metadata, "./dcndl:publicationPlace")
        dc_date = repeat(metadata, "./dc:date")
        dcterms_issued = repeat(metadata, "./dcterms:issued")
        dcndl_digitizedPublisher = \
            repeat(metadata, "./dcndl:digitizedPublisher")
        dcndl_dateDigitized = repeat(metadata, "./dcndl:dateDigitized")

        # --- dc:subject ---------------------------------------------------
        # Untyped subjects are joined with "|"; subjects typed as NDLC or
        # NDC9 go to their own columns (last occurrence wins); any other
        # xsi:type is ignored.
        subject_list = []
        NDLC = ""
        NDC9 = ""
        for value in metadata.findall("./dc:subject", NS):
            # Element.text is None for an empty element; normalise so that
            # the join below cannot raise TypeError.
            text = value.text or ""
            scheme = value.attrib.get(xsi_type)
            if scheme is None:
                if text:
                    subject_list.append(text)
            elif scheme == "dcndl:NDLC":
                NDLC = text
            elif scheme == "dcndl:NDC9":
                NDC9 = text
        subject = "|".join(subject_list)

        # --- descriptive fields -------------------------------------------
        dc_language = repeat(metadata, "./dc:language")
        dcterms_description = repeat(metadata, "./dcterms:description")
        dcterms_abstract = repeat(metadata, "./dcterms:abstract")
        dcterms_extent = repeat(metadata, "./dcterms:extent")
        dcndl_materialType = repeat(metadata, "./dcndl:materialType")

        # --- dc:identifier ------------------------------------------------
        # Only identifiers typed with one of these xsi:type values are kept;
        # when a type occurs more than once the last occurrence wins.
        identifiers = {
            "dcndl:DOI": "",
            "dcndl:ISBN": "",
            "dcndl:NDLBibID": "",
            "dcndl:JPNO": "",
        }
        for value in metadata.findall("./dc:identifier", NS):
            scheme = value.attrib.get(xsi_type)
            if scheme in identifiers:
                identifiers[scheme] = value.text or ""
        DOI = identifiers["dcndl:DOI"]
        ISBN = identifiers["dcndl:ISBN"]
        NDLBibID = identifiers["dcndl:NDLBibID"]
        JPNO = identifiers["dcndl:JPNO"]

        # --- containing publication ---------------------------------------
        dcndl_publicationName = repeat(metadata, "./dcndl:publicationName")
        dcndl_publicationVolume = \
            repeat(metadata, "./dcndl:publicationVolume")
        dcndl_number = repeat(metadata, "./dcndl:number")
        dcndl_issue = repeat(metadata, "./dcndl:issue")
        # The name must use an underscore: written as `dcndl:pageRange = ...`
        # Python parses an annotated assignment to a local called `dcndl`,
        # which left this column permanently empty.
        dcndl_pageRange = repeat(metadata, "./dcndl:pageRange")

        dcterms_rights = single(metadata, "./dcterms:rights")

        # --- source URI ---------------------------------------------------
        # Taken from the rdf:resource attribute of dcterms:source; a missing
        # element or attribute yields an empty cell instead of a KeyError.
        source_uri = ""
        dcterms_source = metadata.find("./dcterms:source", NS)
        if ET.iselement(dcterms_source):
            source_uri = dcterms_source.attrib.get(rdf_resource, "")

        # Order must match the module-level `columns` list.
        row = [
            dc_title, dcndl_titleTranscription, dcterms_alternative,
            dcndl_alternativeTranscription, dcndl_volume,
            dcndl_volumeTranscription,
            dcndl_volumeTitle, dcndl_volumeTitleTranscription,
            dcndl_seriesTitle, dcndl_seriesTitleTranscription,
            dcndl_partTitle, dcndl_partTitleTranscription,
            dc_creator, dcndl_creatorTranscription, dcndl_seriesCreator,
            dcndl_partCreator, dcndl_edition,
            dc_publisher, dcndl_publicationPlace, dc_date,
            dcterms_issued, dcndl_digitizedPublisher, dcndl_dateDigitized,
            subject, NDLC, NDC9,
            dc_language, dcterms_description, dcterms_abstract,
            DOI, ISBN, NDLBibID, JPNO,
            dcndl_publicationName, dcndl_publicationVolume,
            dcndl_number, dcndl_issue, dcndl_pageRange,
            dcndl_materialType, dcterms_rights,
            dcterms_extent,
            source_uri,
        ]
        data_row.append(row)


In [None]:
if __name__ == "__main__":
    # Rows collected by dcndlsimple_parse(); one list per record.
    data_row = []
    # Colab upload widget; the keys of the returned mapping are used below
    # as the names of the files to open.
    uploaded = files.upload()
    if not uploaded:
        # Without this check the code below would fail on an undefined `fn`.
        raise SystemExit("No file was uploaded; nothing to convert.")
    # Only one file is converted: the last key of the upload mapping.
    fn = list(uploaded)[-1]
    # Context manager closes the handle even if reading fails.
    # NOTE(review): UTF-8 is assumed for the input XML - confirm if other
    # encodings must be supported.
    with open(fn, encoding="utf-8") as f:
        data = f.read()


Saving 0000.xml to 0000.xml


In [None]:
    dcndlsimple_parse(data, data_row)
    # Pass the header to the constructor rather than assigning `.columns`
    # afterwards: the assignment raises a length-mismatch ValueError when no
    # record was parsed (empty frame), whereas this form yields a header-only
    # CSV.
    dcndl_data = pd.DataFrame(data_row, columns=columns)
    # Every field is quoted; the 'utf-8_sig' codec writes UTF-8 with a
    # leading byte-order mark.
    dcndl_data.to_csv(output_csv, encoding='utf-8_sig', quoting=csv.QUOTE_ALL, index=False)
    # Kept inside the __main__ guard so the download is only attempted after
    # the CSV has been written by the lines above.
    files.download(output_csv)