In [1]:
import xmltodict
import os
import timeit

from rdflib import RDF, RDFS, XSD, DCTERMS, SKOS, OWL, DCAT, Literal, Graph, Namespace, URIRef, BNode

In [2]:
def parse_xml(file):
    """Read an XML file from disk and convert it to a nested mapping.

    Parameters
    ----------
    file : str or path-like
        Path of the XML file to read.

    Returns
    -------
    mapping
        Whatever ``xmltodict.parse`` returns for the file's text content.
    """
    # The context manager closes the handle as soon as the content has been
    # read (also when reading fails), instead of leaving it open until the
    # object happens to be garbage-collected. This function is called once
    # per input file, so leaked handles would otherwise pile up.
    with open(file, 'r') as xmlfile:
        xml_content = xmlfile.read()

    # change xml format to (ordered) dict
    return xmltodict.parse(xml_content)

In [3]:
def define_dataset_iri(xml_dict, easy_dataset_ns):
    """Build the IRI identifying the dataset described by one parsed record.

    The result is ``easy_dataset_ns`` followed by the literal ``ds`` followed
    by the value stored at ``record -> header -> identifier`` in ``xml_dict``.

    Parameters
    ----------
    xml_dict : mapping
        Parsed record; must contain ``['record']['header']['identifier']``.
    easy_dataset_ns : str-like
        Prefix the dataset IRI is appended to.

    Returns
    -------
    The concatenation ``easy_dataset_ns + 'ds' + identifier``.
    """
    # the record identifier lives in the header of the record
    record_header = xml_dict['record']['header']
    record_identifier = record_header['identifier']

    # prefix first, identifier last (same left-to-right concatenation as before)
    return (easy_dataset_ns + 'ds') + record_identifier

In [4]:
def add_dataset_triple(graph, dataset_iri, xml_dict):
    """Add the triples describing one dataset record to ``graph``.

    Three triples are added, all with ``URIRef(dataset_iri)`` as subject:

    * ``RDF.type`` -> ``easy_ns.DataSet``
    * ``easy_dataset_ns.hasAccess`` -> literal of the first ``rights`` entry
    * ``easy_dataset_ns.hasLicense`` -> literal of the ``@rightsURI`` value of
      the second ``rights`` entry

    The ``rights`` entries are read from
    ``record -> metadata -> resource -> rightsList -> rights`` and are indexed
    by position, so at least two entries are expected there.

    Note: ``easy_ns`` and ``easy_dataset_ns`` are not parameters; they are
    looked up as module-level names when the function runs.

    Returns the same ``graph`` object that was passed in.
    """
    dataset = URIRef(dataset_iri)

    # type of the resource
    graph.add((dataset, RDF.type, easy_ns.DataSet))

    # access statement and license both come from the record's rights list
    rights_entries = xml_dict['record']['metadata']['resource']['rightsList']['rights']

    access = Literal(rights_entries[0])
    graph.add((dataset, easy_dataset_ns.hasAccess, access))

    license_uri = Literal(rights_entries[1]['@rightsURI'])
    graph.add((dataset, easy_dataset_ns.hasLicense, license_uri))

    return graph

In [5]:
def write_output_file(graph, ofile):
    """Serialize ``graph`` as Turtle and write the result to ``ofile``.

    Parameters
    ----------
    graph : object with a ``serialize(format=...)`` method returning text
        The graph to write out.
    ofile : str or path-like
        Destination path; an existing file is overwritten.
    """
    # Serialize before touching the destination: if serialization fails, a
    # previously written output file is left intact instead of being
    # truncated to an empty file.
    turtle_text = graph.serialize(format="turtle")

    # Write UTF-8 explicitly rather than relying on the platform's default
    # text encoding, which may be unable to represent non-ASCII literals.
    with open(ofile, "w", encoding="utf-8") as f:
        f.write(turtle_text)

In [6]:
# --- namespaces used for the classes, predicates and dataset IRIs ---
easy_ns = Namespace("https://portal.odissei-data.nl/data/easy/")
easy_dataset_ns = Namespace("https://portal.odissei-data.nl/data/easy/datasets/")

# --- input and output locations ---
# directory holding the input records (a relative path)
directory_path = '../../odissei-data/LISS-harvester/easy-data/rec/easy/'

# name of the file the serialized graph is written to
ofile = "output.ttl"

In [7]:
# start from an empty graph and register a readable prefix for each namespace
graph = Graph()
graph.bind("easy", easy_ns)
graph.bind("easyData", easy_dataset_ns)

# time the whole conversion, including writing the output file
start = timeit.default_timer()

for file in os.listdir(directory_path):

    # only XML records are converted; every other directory entry is skipped
    if not file.endswith('.xml'):
        continue

    # parse the record, derive its dataset IRI, then add its triples
    xml_dict = parse_xml(os.path.join(directory_path, file))
    dataset_iri = define_dataset_iri(xml_dict, easy_dataset_ns)
    add_dataset_triple(graph, dataset_iri, xml_dict)

# every record has been added: serialize the graph once
write_output_file(graph, ofile)

stop = timeit.default_timer()

print('Time: ', stop - start)

Time:  302.95340245799997
