# This is the enrichment pipe

### Load libraries

In [1]:
import xmltodict
import os
import hashlib

from rdflib import SKOS, RDF, XSD, Literal, Graph, Namespace, URIRef

### Select your input folder

In [2]:
# Path to the input folder. The main loop lists this directory and
# processes every file in it whose name ends with '.ttl'.

input_path = './input/'

### Parse your target vocabulary

In [3]:
# Parse the target vocabulary (ELSST, Turtle file) into an rdflib Graph.
# This graph is the one queried for concept labels further down.

elsst = Graph().parse('./target-vocabs/elsst-vocab.ttl')

### Define namespaces

In [4]:
# Namespace of the CBS metadata schema; CMDI.Trefwoord is the keyword
# predicate used when extracting and enriching keywords.
CMDI = Namespace("https://dataverse.org/schema/cbs/")

### Parse your input graph

In [5]:
def serialize_input_graph(input_file, input_file_path):
    """Parse the file at ``input_file_path`` into a new rdflib ``Graph``.

    Despite the name, nothing is serialized here: the file is read and
    parsed. ``input_file`` (the bare file name) is accepted for the
    caller's convenience but is not used.

    Returns the freshly parsed graph.
    """
    return Graph().parse(input_file_path)

### CBS Entity Extraction
###### Extract Keywords

In [6]:
def extract_keywords(input_graph):
    """Return the keywords attached to ``input_graph`` as plain strings.

    Every triple whose predicate is ``CMDI.Trefwoord`` contributes its
    object, converted with ``str()``. The result is a list in the order
    the graph yields the triples; it is empty when there are no such
    triples.
    """
    keyword_pattern = (None, CMDI.Trefwoord, None)
    return [str(obj) for _, _, obj in input_graph.triples(keyword_pattern)]

### ELSST Mapping
###### Find CBS keywords in ELSST

In [16]:
def keywords_to_elsst(elsst, keywords):
    """Look up every keyword in the ELSST vocabulary and collect the matches.

    For each keyword a SPARQL query is run against ``elsst`` that selects
    the concepts having both a Dutch ``skos:altLabel`` and a Dutch
    ``skos:prefLabel`` where either label equals the upper-cased keyword.

    Parameters
    ----------
    elsst
        Object exposing ``query(sparql, initBindings=...)``; in this
        notebook the rdflib ``Graph`` holding the ELSST vocabulary.
    keywords
        Iterable of keyword strings to look up.

    Returns
    -------
    list
        The result rows of all queries, in keyword order. Each row is what
        the query selects: ``(?elsstConcept, ?trefwoord)``. The list is
        empty when ``keywords`` is empty or nothing matched.

    Notes
    -----
    The previous version returned from inside the loop, so only the first
    keyword was ever queried and an empty keyword list produced ``None``.
    """
    # The query text is the same for every keyword (the keyword is supplied
    # through initBindings), so it is built once outside the loop.
    # The doubled braces are kept as they were; SPARQL reads them as a
    # nested group pattern.
    keywords_query = (
        """
            PREFIX skos: <http://www.w3.org/2004/02/skos/core#>

            SELECT DISTINCT ?elsstConcept ?trefwoord
            WHERE {{
                ?elsstConcept skos:altLabel ?altLabel ;
                              skos:prefLabel ?prefLabel .

                FILTER(LANG(?altLabel) = "nl" && LANG(?prefLabel) = "nl") .
                FILTER(STR(?altLabel) = UCASE(STR(?trefwoord))
                       || STR(?prefLabel) = UCASE(STR(?trefwoord))) .

            }}
        """
    )

    matches = []

    for word in keywords:
        # Accumulate instead of returning, so every keyword gets queried.
        matches.extend(
            elsst.query(keywords_query, initBindings={'trefwoord': word})
        )

    return matches

### CBS Enrichment
###### Enrich keywords with ELSST

In [8]:
def enrich_keywords_with_elsst(enriched_graph, response_keywords_to_elsst):
    """Add the matched ELSST concepts to ``enriched_graph`` as keywords.

    Every subject that already has a ``CMDI.Trefwoord`` triple receives one
    additional ``CMDI.Trefwoord`` triple per matched concept, with the
    concept URI as object.

    Parameters
    ----------
    enriched_graph
        rdflib graph to enrich; it is modified in place.
    response_keywords_to_elsst
        The matches to add. Either a mapping whose values carry the concept
        URI at index 0 (the form the original code read via ``.items()``),
        or an iterable of rows with the concept URI at index 0 (the rows a
        SELECT ``?elsstConcept ?trefwoord`` query yields). ``None`` is
        treated as "no matches".

    Returns
    -------
    The same ``enriched_graph`` object, for convenience.
    """
    if response_keywords_to_elsst is None:
        return enriched_graph

    # Accept both shapes: a mapping (original contract) or plain result rows.
    items = getattr(response_keywords_to_elsst, 'items', None)
    if callable(items):
        concepts = [URIRef(value[0]) for _, value in items()]
    else:
        concepts = [URIRef(row[0]) for row in response_keywords_to_elsst]

    # Snapshot the subjects first: adding CMDI.Trefwoord triples while
    # iterating over the CMDI.Trefwoord triples of the same graph would
    # mutate the collection being iterated. A set also avoids re-adding
    # the same concepts once per existing keyword of a subject.
    subjects = {
        s for s, _, _ in enriched_graph.triples((None, CMDI.Trefwoord, None))
    }

    for s in subjects:
        for concept in concepts:
            enriched_graph.add((s, CMDI.Trefwoord, concept))

    return enriched_graph

### Main function - iteration through files

In [9]:
# Walk the input folder and run the lookup for each Turtle file.
for input_file in os.listdir(input_path):

    # Anything that is not a '.ttl' file is skipped.
    if not input_file.endswith('.ttl'):
        continue

    input_file_path = os.path.join(input_path, input_file)
    input_graph = serialize_input_graph(input_file, input_file_path)

    # Second name for the very same graph object (no copy is made).
    enriched_graph = input_graph

    keywords = extract_keywords(input_graph)
    response_keywords_to_elsst = keywords_to_elsst(elsst, keywords)

    # The enrichment step is currently switched off:
    #enrich_keywords_with_elsst(enriched_graph, response_keywords_to_elsst)
             

In [17]:
# Run the ELSST lookup again on the module-level `keywords`
# (presumably the value left by the last file handled in the loop above).
response = keywords_to_elsst(elsst, keywords)

In [18]:
# Print each result row of the lookup, one per line.
for row in response:
    print(row)

(rdflib.term.URIRef('https://elsst.cessda.eu/id/30dc65f3-99cc-41a9-aca4-00ce24ab386d'), 'Detailhandel')


nf42eb363fe8b4a80ad30a5be34b7870ab1175 https://dataverse.org/schema/cbs/Trefwoord Detailhandel
nf42eb363fe8b4a80ad30a5be34b7870ab1175 https://dataverse.org/schema/cbs/Trefwoord FINANCIËN
