In [1]:
import xmltodict
import os
import timeit

from rdflib import RDF, RDFS, XSD, DCTERMS, SKOS, OWL, DCAT, Literal, Graph, Namespace, URIRef, BNode

In [2]:
def parse_xml(file):
    """Read an XML file from disk and convert it to a nested dictionary.

    Parameters
    ----------
    file : str or path-like
        Path of the XML file to read.

    Returns
    -------
    dict
        Whatever ``xmltodict.parse`` produces for the text of the file.
    """
    # The context manager closes the handle as soon as the text is read,
    # also when reading raises; previously the handle was never closed,
    # which leaked one open file per parsed document.
    with open(file, 'r') as xmlfile:
        xml_content = xmlfile.read()

    # convert the XML text into a nested dictionary
    return xmltodict.parse(xml_content)

In [3]:
def define_dataset_iri(xml_dict, cbs_dataset_ns):
    """Locate the dataset node of a parsed file and build the dataset IRI.

    Returns a ``(dataset_iri, dataset_dict)`` pair. The IRI is the dataset
    namespace followed by ``'ds'`` and the dataset's ``Id``; the dictionary
    is a shallow ``dict`` copy of the ``Dataontwerp`` element.
    """
    # root of the DATASET subtree, copied into a plain dict
    design_node = xml_dict['Dataontwerpversies']['Versie']['Dataontwerp']
    dataset_dict = dict(design_node)

    # DATASET IRI: namespace + 'ds' + id
    iri_prefix = cbs_dataset_ns + 'ds'
    return iri_prefix + dataset_dict['Id'], dataset_dict

In [4]:
def define_variable_iri(var, cbs_variable_ns):
    """Build a variable IRI: the variable namespace + ``'vr'`` + the variable's ``Id``.

    ``var`` is one context-variable mapping; the id is read from
    ``var['Variabele']['Id']``.
    """
    iri_prefix = cbs_variable_ns + 'vr'
    return iri_prefix + var['Variabele']['Id']

In [5]:
def add_dataset_triple(graph, dataset_iri, dataset_dict):
    """Add the triples describing one dataset to ``graph`` and return the graph.

    The dataset is typed as ``SKOS.Concept`` and ``cbs_ns.DataSet``; its
    fields are read from ``dataset_dict`` and attached as Dutch-tagged
    literals, except ``GeldigVanaf`` which is typed ``XSD.date``. The
    ``GebruikteMethodologie`` triple is only added when that value is a
    string. The namespaces (``cbs_ns``, ``cbs_dataset_ns``, ``ids_ns``,
    ``dqm_ns``) are taken from module scope.
    """
    subject = URIRef(dataset_iri)

    def emit(predicate, obj):
        # every triple of this function shares the dataset as subject
        graph.add((subject, predicate, obj))

    def dutch(key):
        # Dutch-tagged literal for one field of the dataset
        return Literal(dataset_dict[key], lang='nl')

    # DATASET typing
    emit(RDF.type, SKOS.Concept)
    emit(RDF.type, cbs_ns.DataSet)

    # descriptive fields
    emit(cbs_dataset_ns.dataDesignGroupPath, dutch('Dataontwerpgroeppad'))
    emit(SKOS.definition, dutch('Beschrijving'))
    emit(SKOS.altLabel, dutch('VerkorteSchrijfwijzeNaamDataontwerp'))
    emit(cbs_dataset_ns.descriptionOfThePopulation, dutch('BeschrijvingVanDePopulatie'))

    # methodology is skipped unless the field holds text
    if isinstance(dataset_dict['GebruikteMethodologie'], str):
        emit(cbs_dataset_ns.methodology, dutch('GebruikteMethodologie'))

    emit(cbs_dataset_ns.typeReportingPeriod, dutch('TypeVerslagperiode'))
    emit(ids_ns.dataType, dutch('SoortData'))
    emit(dqm_ns.validFrom, Literal(dataset_dict['GeldigVanaf'], datatype=XSD.date))

    # keyword and theme triples are currently disabled:
    #graph.add((URIRef(dataset_iri), DCAT.keyword, Literal(dataset_dict['Trefwoorden']['Trefwoord'], lang='nl')))
    #graph.add((URIRef(dataset_iri), ids_ns.theme, Literal(dataset_dict['Themas']['Thema'], lang='nl')))

    return graph

In [6]:
def make_variable_dict_list(xml_dict):
    """Return the context variables of a parsed dataset file as a list.

    The variables are read from
    ``Dataontwerpversies/Versie/Dataontwerp/Contextvariabelen/Contextvariabele``.

    xmltodict represents repeated child elements as a list, but a single
    child as one mapping and an empty element as ``None``. Callers iterate
    over the result, so all three shapes are normalized here:

    * several variables -> the list itself (same object, not copied)
    * exactly one variable -> a one-element list
    * no variables (``None``) -> an empty list

    Raises
    ------
    KeyError
        If one of the path elements is missing from ``xml_dict``.
    """
    design_node = xml_dict['Dataontwerpversies']['Versie']['Dataontwerp']

    container = design_node['Contextvariabelen']
    if container is None:
        # <Contextvariabelen/> without children
        return []

    variables = container['Contextvariabele']
    if variables is None:
        return []
    if isinstance(variables, list):
        return variables

    # a lone <Contextvariabele> arrives as a mapping; iterating it directly
    # would walk its keys instead of the variable
    return [variables]

In [7]:
def define_context_iri(cbs_context_ns, variable_iri, cbs_variable_ns, dataset_iri, cbs_dataset_ns):
    """Build the IRI for the context of a variable within a dataset.

    The variable namespace is stripped from ``variable_iri`` and the dataset
    namespace from ``dataset_iri`` (every occurrence, via ``str.replace``);
    the two remainders are appended, in that order, to the context namespace.
    """
    variable_part = variable_iri.replace(cbs_variable_ns, "")
    dataset_part = dataset_iri.replace(cbs_dataset_ns, "")

    return cbs_context_ns + variable_part + dataset_part

In [8]:
def add_context_triple(var, context_iri, variable_iri, dataset_iri, target_graph=None):
    """Add the triples describing the context of a variable within a dataset.

    Parameters
    ----------
    var : mapping
        Context-variable record; ``var['VerkorteSchrijfwijzeNaamVariabele']``
        becomes the ``skos:altLabel`` of the context.
    context_iri, variable_iri, dataset_iri : str
        IRIs of the context and of the variable and dataset it links.
    target_graph : optional
        Graph that receives the triples. When omitted, the module-level
        ``graph`` is used, which is the only graph this function could
        write to before the parameter existed.

    Returns
    -------
    The graph the triples were added to.
    """
    if target_graph is None:
        # backward-compatible fallback to the notebook-level graph
        target_graph = graph

    context = URIRef(context_iri)

    # CONTEXT typing
    target_graph.add((context, RDF.type, SKOS.Concept))
    target_graph.add((context, RDF.type, cbs_ns.Context))

    # links from the context to its dataset and its variable
    target_graph.add((context, cbs_context_ns.hasDatasetContext, URIRef(dataset_iri)))
    target_graph.add((context, cbs_context_ns.hasVariableContext, URIRef(variable_iri)))

    # NOTE(review): unlike the other labels in this notebook this literal
    # carries no language tag -- confirm whether that is intended.
    target_graph.add((context, SKOS.altLabel, Literal(var['VerkorteSchrijfwijzeNaamVariabele'])))

    return target_graph

In [9]:
def add_variable_triple(variable_dictionary_list, dataset_iri, graph):
    """Add the triples for every variable of a dataset and return ``graph``.

    For each record in ``variable_dictionary_list`` the variable and context
    IRIs are derived, the dataset is linked to the variable, the variable is
    typed and described, and the variable is linked to its context. The
    ``ToelichtingBijDeDefinitie`` triple is only added when that value is a
    string. Namespaces are taken from module scope.

    NOTE(review): the triples describing the context itself are delegated to
    ``add_context_triple``, which is not handed ``graph`` by this call.
    """
    for var in variable_dictionary_list:

        variable_iri = define_variable_iri(var, cbs_variable_ns)
        context_iri = define_context_iri(cbs_context_ns, variable_iri, cbs_variable_ns, dataset_iri, cbs_dataset_ns)

        variable_node = URIRef(variable_iri)

        def emit(predicate, obj, subject=variable_node):
            # the variable is the subject unless stated otherwise
            graph.add((subject, predicate, obj))

        # dataset -> variable link
        emit(cbs_dataset_ns.hasVariable, variable_node, subject=URIRef(dataset_iri))

        # VARIABLE typing
        emit(RDF.type, SKOS.Concept)
        emit(RDF.type, cbs_ns.Variable)

        # fields nested under the 'Variabele' element
        details = var['Variabele']
        emit(SKOS.altLabel, Literal(details['UniekeNaam'], lang='nl'))
        emit(SKOS.definition, Literal(details['Definitie'], lang='nl'))
        emit(dqm_ns.validFrom, Literal(details['GeldigVanaf'], datatype=XSD.date))

        # explanation is skipped unless the field holds text
        explanation = var['ToelichtingBijDeDefinitie']
        if isinstance(explanation, str):
            emit(lexinfo_ns.explanation, Literal(explanation, lang='nl'))

        emit(SKOS.prefLabel, Literal(var['LabelVanDeVariabele'], lang='nl'))
        emit(ids_ns.dataType, Literal(var['Datatype'], lang='nl'))

        # variable -> context link, then the context's own triples
        emit(cbs_variable_ns.hasContext, URIRef(context_iri))
        add_context_triple(var, context_iri, variable_iri, dataset_iri)

    return graph

In [10]:
def write_output_file(graph, ofile):
    """Serialize ``graph`` as Turtle and write the result to ``ofile``.

    Parameters
    ----------
    graph
        Object whose ``serialize(format="turtle")`` yields the document.
    ofile : str or path-like
        Destination path; an existing file is overwritten.

    The file is always written as UTF-8 instead of the platform's default
    encoding, so non-ASCII text survives regardless of the locale the
    notebook runs under.
    """
    # Serialize before opening the file so that a serialization error does
    # not leave a truncated output file behind.
    serialized = graph.serialize(format="turtle")

    # rdflib releases before 6.0 return bytes here, newer ones return str
    if isinstance(serialized, bytes):
        serialized = serialized.decode("utf-8")

    with open(ofile, "w", encoding="utf-8") as f:
        f.write(serialized)

In [11]:
# --- namespaces of the ODISSEI portal for CBS resources ---
cbs_ns = Namespace("https://portal.odissei-data.nl/data/cbs/")
cbs_dataset_ns = Namespace("https://portal.odissei-data.nl/data/cbs/datasets/")
cbs_variable_ns = Namespace("https://portal.odissei-data.nl/data/cbs/variables/")
cbs_context_ns = Namespace("https://portal.odissei-data.nl/data/cbs/contexts/")

# --- external vocabularies ---
dqm_ns = Namespace("http://purl.org/dqm-vocabulary/v1/dqm#")
lexinfo_ns = Namespace("http://www.lexinfo.net/ontology/2.0/lexinfo#")
ids_ns = Namespace("https://w3id.org/idsa/core/")

# --- input and output locations ---
# directory that is scanned for '.dsc' metadata files
directory_path = '../../CBS-metadata/ODISSEI_Full_Export_20210924'
# file that receives the serialized graph
ofile = "output.ttl"

In [12]:
start = timeit.default_timer()

# graph that collects the triples of every dataset file
graph = Graph()

# prefixes used when the graph is serialized
for prefix, namespace in (
    ("cbs", cbs_ns),
    ("cbsData", cbs_dataset_ns),
    ("cbsVar", cbs_variable_ns),
    ("cbsCon", cbs_context_ns),
    ("skos", SKOS),
    ("rdf", RDF),
    ("xsd", XSD),
    ("dqm", dqm_ns),
    ("lexinfo", lexinfo_ns),
    ("ids", ids_ns),
    ("dcat", DCAT),
):
    graph.bind(prefix, namespace)

# convert every '.dsc' file found in the directory
for file in os.listdir(directory_path):

    if not file.endswith('.dsc'):
        continue

    xml_dict = parse_xml(os.path.join(directory_path, file))

    dataset_iri, dataset_dict = define_dataset_iri(xml_dict, cbs_dataset_ns)
    variable_dictionary_list = make_variable_dict_list(xml_dict)

    add_dataset_triple(graph, dataset_iri, dataset_dict)
    add_variable_triple(variable_dictionary_list, dataset_iri, graph)

# write the combined graph once all files are processed
write_output_file(graph, ofile)

stop = timeit.default_timer()

# wall-clock duration of the whole conversion, in seconds
print('Time: ', stop - start)

Time:  59.737149792000004
