In [1]:
import xmltodict
import os
import hashlib

from rdflib import RDF, RDFS, XSD, DCTERMS, SKOS, OWL, DCAT, Literal, Graph, Namespace, URIRef, BNode

In [2]:
# Directory that is scanned for CBS data-design exports ('.dsc' files).
# For a quick trial run, point this at './test-data' instead.
directory_path = '../../CBS-metadata/ODISSEI_Full_Export_20210924'

# Name of the Turtle file that the finished graph is written to.
ofile = "cbs-metadata.ttl"

# Namespaces used to mint IRIs:
#   cbs_ns      - datasets, keywords and themes
#   var_ns      - context variables
#   cbs_prop_ns - the custom predicates that link a dataset to its metadata
cbs_ns = Namespace("https://portal.odissei-data.nl/data/cbs/metadata/")
var_ns = Namespace("https://portal.odissei-data.nl/data/cbs/variableThesaurus/")
cbs_prop_ns = Namespace("https://portal.odissei-data.nl/data/cbs/metadataProperties/")

In [3]:
# Single module-level graph; every add_* function below writes into it.
graph = Graph()

# Register the prefixes that the serialised output should use.
for _prefix, _namespace in (
    ("cbs", cbs_ns),
    ("cbsVar", var_ns),
    ("cbsProp", cbs_prop_ns),
    ("skos", SKOS),
    ("rdf", RDF),
    ("xsd", XSD),
    ("dcat", DCAT),
):
    graph.bind(_prefix, _namespace)

In [4]:
def make_xml_dictionary(file, encoding=None):
    """
    Use the xmltodict library to create a dictionary from the xml file.

    Parameters:
        file: path of the XML file to read.
        encoding: optional text encoding passed to open(). The default
            (None) keeps the platform's default encoding, exactly as before.

    Returns:
        The nested dictionary produced by xmltodict.parse().
    """
    # The context manager guarantees the handle is closed, even when reading
    # fails; the driver loop calls this once per export file.
    with open(file, 'r', encoding=encoding) as xmlfile:
        xml_content = xmlfile.read()
    return xmltodict.parse(xml_content)

In [5]:
def define_cbs_metadata_scheme(cbs_ns):
    """
    Declare the CBS metadata concept scheme in the module-level graph.

    The namespace IRI itself is typed as a skos:ConceptScheme and given a
    preferred label. Returns the (shared) graph.
    """
    scheme = URIRef(cbs_ns)
    scheme_statements = (
        (RDF.type, SKOS.ConceptScheme),
        (SKOS.prefLabel, Literal("CBS Metadata Concepts Scheme")),
    )
    for predicate, value in scheme_statements:
        graph.add((scheme, predicate, value))
    return graph

In [6]:
def make_dataset_id(cbs_ns, xml_dictionary):
    """
    Build the identifier of the dataset described by xml_dictionary.

    The result is the namespace, followed by a 'd' (marking the resource as
    a dataset), followed by the 'Id' of the data design.
    """
    data_design = xml_dictionary['Dataontwerpversies']['Versie']['Dataontwerp']
    return cbs_ns + "d" + data_design['Id']

In [7]:
def add_methodology_triples(xml_dictionary, dataset_id, cbs_prop_ns):
    """
    Attach the methodology description of a dataset to the graph.

    A triple is only added when 'GebruikteMethodologie' holds a string;
    any other value is ignored. Returns the module-level graph.
    """
    data_design = xml_dictionary['Dataontwerpversies']['Versie']['Dataontwerp']
    methodology = data_design['GebruikteMethodologie']
    if not isinstance(methodology, str):
        return graph
    graph.add((URIRef(dataset_id), cbs_prop_ns.methodology, Literal(methodology, lang='nl')))
    return graph

In [8]:
def make_variables_list(xml_dictionary):
    """
    Make a list out of the variables in the dictionary.

    xmltodict only produces a list when an element is repeated: a data
    design with exactly one 'Contextvariabele' yields a single dict, and an
    empty 'Contextvariabelen' element yields None. All of these shapes are
    normalised here so callers can always iterate over variable dicts.

    Returns:
        A list of variable dictionaries (possibly empty).
    """
    data_design = xml_dictionary['Dataontwerpversies']['Versie']['Dataontwerp']
    container = data_design['Contextvariabelen']
    if not isinstance(container, dict):
        # Empty <Contextvariabelen/> element: nothing to report.
        return []

    variables = container.get('Contextvariabele')
    if variables is None:
        return []
    if isinstance(variables, list):
        return variables
    # A single variable: wrap it so iteration yields the variable itself
    # rather than the keys of its dictionary.
    return [variables]

In [9]:
def make_narrower_variable_id(var, var_ns):
    """
    Build the identifier of a 'context' (or narrower) variable.

    The variable's Id and its label are concatenated and hashed with
    SHA-256; the identifier is the namespace, a 'c' (marking the resource
    as a context variable) and the hexadecimal digest.
    """
    hash_input = var['Variabele']['Id'] + var['LabelVanDeVariabele']
    digest = hashlib.sha256(hash_input.encode('utf-8'))
    return var_ns + "c" + digest.hexdigest()

In [10]:
def make_keyword_id(keyword, cbs_ns):
    """
    Build the identifier of a keyword.

    The identifier is the namespace, a 'k' (marking the resource as a
    keyword) and the SHA-256 hex digest of the UTF-8 encoded keyword.
    """
    encoded_keyword = keyword.encode('utf-8')
    return cbs_ns + "k" + hashlib.sha256(encoded_keyword).hexdigest()

In [11]:
def make_theme_id(theme, cbs_ns):
    """
    Build the identifier of a theme.

    The identifier is the namespace, a 't' (marking the resource as a
    theme) and the SHA-256 hex digest of the UTF-8 encoded theme.
    """
    encoded_theme = theme.encode('utf-8')
    return cbs_ns + "t" + hashlib.sha256(encoded_theme).hexdigest()

In [12]:
def add_keywords_triples(xml_dictionary, dataset_id, cbs_prop_ns, cbs_ns):
    """
    Adding triples to the graph about the keywords.

    Handles every shape xmltodict can produce for 'Trefwoorden': an empty
    element (None), a single 'Trefwoord' (a string) or several (a list).
    Entries that are not strings are skipped.

    Returns:
        The module-level graph.
    """
    keywords = xml_dictionary['Dataontwerpversies']['Versie']['Dataontwerp']['Trefwoorden']
    if not isinstance(keywords, dict):
        # An empty <Trefwoorden/> element is parsed as None, which has no
        # .get(); there is nothing to add in that case.
        return graph

    keyword = keywords.get('Trefwoord')
    if not keyword:
        return graph

    # Normalise "one keyword" and "many keywords" to a single list.
    if isinstance(keyword, str):
        words = [keyword]
    elif isinstance(keyword, list):
        words = keyword
    else:
        words = []

    dataset = URIRef(dataset_id)
    for word in words:
        if isinstance(word, str):
            graph.add((dataset, cbs_prop_ns.keyword, URIRef(make_keyword_id(word, cbs_ns))))
    return graph

In [13]:
def add_themes_triples(xml_dictionary, dataset_id, cbs_prop_ns, cbs_ns):
    """
    Adding triples to the graph about the themes.

    Mirrors add_keywords_triples: an empty 'Themas' element, a missing
    'Thema' child and non-string list entries are skipped instead of
    raising, so one incomplete export does not abort the whole run.

    Returns:
        The module-level graph.
    """
    themes_container = xml_dictionary['Dataontwerpversies']['Versie']['Dataontwerp']['Themas']
    if not isinstance(themes_container, dict):
        # An empty <Themas/> element is parsed as None.
        return graph

    themes = themes_container.get('Thema')
    # Normalise "one theme" and "many themes" to a single list.
    if isinstance(themes, str):
        theme_names = [themes]
    elif isinstance(themes, list):
        theme_names = themes
    else:
        theme_names = []

    dataset = URIRef(dataset_id)
    for theme in theme_names:
        # make_theme_id() calls .encode() on the theme, so only strings
        # can be turned into an identifier.
        if isinstance(theme, str):
            graph.add((dataset, cbs_prop_ns.theme, URIRef(make_theme_id(theme, cbs_ns))))
    return graph

In [14]:
def add_variable_triples(xml_dictionary, dataset_id, var_ns, cbs_prop_ns):
    """
    Link the dataset to each of its context variables.

    For every variable returned by make_variables_list() a
    'hasVariable' triple pointing at the variable's identifier is added.
    Returns the module-level graph.
    """
    dataset = URIRef(dataset_id)
    for var in make_variables_list(xml_dictionary):
        variable = URIRef(make_narrower_variable_id(var, var_ns))
        graph.add((dataset, cbs_prop_ns.hasVariable, variable))
    return graph

In [15]:
def add_metadata_triples(xml_dictionary):
    """
    Add all triples describing one CBS data design (dataset) to the graph.

    The dataset is typed as a skos:Concept and receives its descriptive
    literals, followed by the methodology, keyword, theme and variable
    triples produced by the dedicated helper functions.

    Descriptive fields are only added when they hold text, matching what
    add_methodology_triples already does for the methodology field.

    Returns:
        The module-level graph.
    """
    data_design = xml_dictionary['Dataontwerpversies']['Versie']['Dataontwerp']
    dataset_id = make_dataset_id(cbs_ns, xml_dictionary)
    dataset = URIRef(dataset_id)

    graph.add((dataset, RDF.type, SKOS.Concept))

    # (predicate, XML element) pairs for the Dutch free-text fields.
    dutch_text_fields = (
        (SKOS.definition, 'Beschrijving'),
        (cbs_prop_ns.dataDesignGroupPath, 'Dataontwerpgroeppad'),
        (SKOS.prefLabel, 'VerkorteSchrijfwijzeNaamDataontwerp'),
        (cbs_prop_ns.descriptionOfThePopulation, 'BeschrijvingVanDePopulatie'),
        (cbs_prop_ns.typeReportingPeriod, 'TypeVerslagperiode'),
        (cbs_prop_ns.dataType, 'SoortData'),
    )
    for predicate, element in dutch_text_fields:
        value = data_design[element]
        # xmltodict gives None for an empty element (and a dict for an
        # element that carries attributes). Handing those to Literal() would
        # store their Python string form as if it were real metadata, so
        # only genuine text is added.
        if isinstance(value, str):
            graph.add((dataset, predicate, Literal(value, lang='nl')))

    valid_from = data_design['GeldigVanaf']
    if isinstance(valid_from, str):
        graph.add((dataset, cbs_prop_ns.validFrom, Literal(valid_from, datatype=XSD.date)))

    add_methodology_triples(xml_dictionary, dataset_id, cbs_prop_ns)
    add_keywords_triples(xml_dictionary, dataset_id, cbs_prop_ns, cbs_ns)
    add_themes_triples(xml_dictionary, dataset_id, cbs_prop_ns, cbs_ns)
    add_variable_triples(xml_dictionary, dataset_id, var_ns, cbs_prop_ns)

    return graph

In [16]:
def write_output_file(graph, ofile):
    """
    Write the content of the graph into an output file.

    Parameters:
        graph: object whose serialize(format="turtle") returns the Turtle
            document, either as text or as UTF-8 encoded bytes.
        ofile: path of the file to (over)write.

    The file is always written as UTF-8, independent of the platform's
    default encoding, so non-ASCII characters in the metadata survive.
    """
    serialized = graph.serialize(format="turtle")
    if isinstance(serialized, bytes):
        # Older rdflib releases return bytes from serialize(); a text-mode
        # file only accepts str.
        serialized = serialized.decode("utf-8")

    with open(ofile, "w", encoding="utf-8") as f:
        f.write(serialized)
In [17]:
# Convert every CBS data-design export ('.dsc') found in the input directory
# into triples, then write the accumulated graph to disk in one go.
dsc_file_names = (name for name in os.listdir(directory_path) if name.endswith('.dsc'))

for file in dsc_file_names:
    xml_dictionary = make_xml_dictionary(os.path.join(directory_path, file))
    add_metadata_triples(xml_dictionary)

write_output_file(graph, ofile)