# This is the enrichment pipe

### Load libraries

In [30]:
import hashlib
import json
import os
import rdflib

from rdflib import SKOS, RDF, XSD, Literal, Graph, Namespace, URIRef

### Select your input folder

In [2]:
# path to input folder

input_folder = './input/'

### Define ELSST dictionary

In [108]:
def define_elsst_dict() -> dict:
    
    elsst_dict = {"typeName":"keywordVocabulary","multiple":False,"typeClass":"primitive","value":"ELSST"}
    
    return elsst_dict

### Define CBS Taxonomy dictionary

In [107]:
def define_cbs_dict() -> dict:
    
    cbs_dict = {"typeName":"keywordVocabulary","multiple":False,"typeClass":"primitive","value":"CBS Taxonomy"}
    
    return cbs_dict

### Define Keyword Vocabulary URI dictionary

In [106]:
def define_kw_voc_dict(kw_uri) -> dict:
    
    kw_voc_dict = {"typeName":"keywordVocabularyURI","multiple":False,"typeClass":"primitive","value": kw_uri}
    
    return kw_voc_dict

### Read input file

In [3]:
def read_json(input_file: str) -> dict :
    
    with open(input_file) as f:
        
        return json.load(f)

### Make list of dictionaries for citation block

In [9]:
def make_dicts_list(json_dict: dict) -> list :
    
    dicts_list = json_dict['datasetVersion']['metadataBlocks']['citation']['fields']
    
    return dicts_list

### Select keyword block

In [16]:
def select_kw_block(dicts_list: list) -> list:

    for item in dicts_list:
        
        if item['typeName'] == 'keyword' :
            
            kw_block = item['value']
            
            return kw_block

### Iterate through keywords

In [None]:
def iterate_kw(kw_block: list) -> :
    
    for kw_dict in kw_block:
        
        for k, v in kw_dict.items():
            
            kw_value = v['value']
            
            

### Find enrichment

### Add enrichment

### Deserialize your enrichment vocabulary

In [25]:
# path to target vocabulary

voc = Graph().parse('./enrichment/cbs2elsstCbs.ttl')

### Define namespaces

In [52]:
ELSST = Namespace('https://elsst.cessda.eu/id/')
CBS = Namespace('https://taxonomie.cbs.nl/vocab/id/')

### Deserialize your input graph

In [9]:
def deserialize_input_graph(input_file):
    
    input_graph = Graph().parse(input_file)
    
    return input_graph

### CBS Entity Extraction
###### Extract Keywords

In [10]:
def extract_keywords(input_graph):
    
    keywords = []
    
    for s, p, o in input_graph.triples((None, CMDI.Trefwoord, None)):
        keywords.append(str(o))
        
    return keywords

### Main

In [13]:
def main():

    for input_file in os.listdir(input_folder):
    
        if input_file.endswith('.json'):

            input_file = os.path.join(input_folder, input_file)
            
            json_dict = read_json(input_file)
            
            dicts_list = make_dicts_list(json_dict)
            
            kw_block = select_kw_block(dicts_list)
    
    

             

In [14]:
if __name__ == "__main__":
    main()

In [19]:
for input_file in os.listdir(input_folder):
    
    if input_file.endswith('.json'):

        input_file = os.path.join(input_folder, input_file)

        json_dict = read_json(input_file)
        
        dicts_list = make_dicts_list(json_dict)
        
        kw_block = select_kw_block(dicts_list)

[{'keywordValue': {'typeName': 'keywordValue',
   'multiple': False,
   'typeClass': 'primitive',
   'value': 'Sociaaleconomische en ruimtelijke statistieken/Sociaal-economisch totaalbeeld/Onderwijs'}},
 {'keywordValue': {'typeName': 'keywordValue',
   'multiple': False,
   'typeClass': 'primitive',
   'value': 'Opleiding'}},
 {'keywordValue': {'typeName': 'keywordValue',
   'multiple': False,
   'typeClass': 'primitive',
   'value': 'Persoon (juridisch)'}}]

In [94]:
cbs_dict = define_cbs_dict()

In [53]:
for s, p, o in voc.triples((None, None, None)):
    if type(o) is URIRef:
        if ELSST in o:
            print(o, "elsst")
        elif CBS in o:
            print(o, "cbs")

https://taxonomie.cbs.nl/vocab/id/738 cbs
https://taxonomie.cbs.nl/vocab/id/1569 cbs
https://elsst.cessda.eu/id/28e2d994-4be2-459e-912d-2a4c0a9d2100 elsst
https://taxonomie.cbs.nl/vocab/id/4949 cbs
https://elsst.cessda.eu/id/b9653315-230b-4ce1-9c43-ce86988a86da elsst
https://taxonomie.cbs.nl/vocab/id/2339 cbs
https://elsst.cessda.eu/id/7b97948d-8304-4e9a-86c8-6bc9445e5ccf elsst
https://taxonomie.cbs.nl/vocab/id/374 cbs
https://taxonomie.cbs.nl/vocab/id/1084 cbs
https://taxonomie.cbs.nl/vocab/id/158 cbs
https://taxonomie.cbs.nl/vocab/id/853 cbs
https://elsst.cessda.eu/id/ebf35305-df80-4770-8bfe-2311389c5331 elsst
https://taxonomie.cbs.nl/vocab/id/3002 cbs
https://taxonomie.cbs.nl/vocab/id/392 cbs
https://taxonomie.cbs.nl/vocab/id/2378 cbs
https://taxonomie.cbs.nl/vocab/id/424 cbs
https://taxonomie.cbs.nl/vocab/id/374 cbs
https://taxonomie.cbs.nl/vocab/id/3429 cbs
https://taxonomie.cbs.nl/vocab/id/4230 cbs
https://taxonomie.cbs.nl/vocab/id/1541 cbs
https://taxonomie.cbs.nl/vocab/id/1101 

In [55]:
for kw_dict in kw_block:
    for k, v in kw_dict.items():
        kw_value = v['value']
        print(kw_value)

Sociaaleconomische en ruimtelijke statistieken/Sociaal-economisch totaalbeeld/Onderwijs
Opleiding
Persoon (juridisch)


In [72]:
kw_value = 'Politie'

In [112]:
for kw_dict in kw_block:
    for k, v in kw_dict.items():
        kw_value = v['value']
        
        for s, p, o in voc.triples((None, None, None)):
            if p == SKOS.prefLabel:
                if str(o).casefold() == kw_value.casefold():
                    
                    for sub, pre, obj in voc.triples((s, SKOS.relatedMatch, None)):
                        if CBS in obj:
                            kw_dict['keywordVocabulary'] = cbs_dict
                            kw_dict['keywordVocabularyURI'] = define_kw_voc_dict(str(o))
                    

In [113]:
kw_block

[{'keywordValue': {'typeName': 'keywordValue',
   'multiple': False,
   'typeClass': 'primitive',
   'value': 'Sociaaleconomische en ruimtelijke statistieken/Sociaal-economisch totaalbeeld/Onderwijs'}},
 {'keywordValue': {'typeName': 'keywordValue',
   'multiple': False,
   'typeClass': 'primitive',
   'value': 'Opleiding'},
  'keywordVocabulary': {'typeName': 'keywordVocabulary',
   'multiple': False,
   'typeClass': 'primitive',
   'value': 'CBS Taxonomy'},
  'keywordVocabularyURI': {'typeName': 'keywordVocabularyURI',
   'multiple': False,
   'typeClass': 'primitive',
   'value': 'Opleiding'}},
 {'keywordValue': {'typeName': 'keywordValue',
   'multiple': False,
   'typeClass': 'primitive',
   'value': 'Persoon (juridisch)'}}]

In [None]:
def extract_keywords(input_graph):
    
    keywords = []
    
    for s, p, o in input_graph.triples((None, CMDI.Trefwoord, None)):
        keywords.append(str(o))
        
    return keywords