In [1]:
import os
import sys

# Make the sem-covid checkout importable from this notebook.
sys.path.append("/home/jovyan/work/sem-covid/")
# De-duplicate the search path while KEEPING its order: dict.fromkeys retains
# the first occurrence of each entry. A set() round-trip would shuffle the
# entries and therefore change which module wins when names collide.
sys.path = list(dict.fromkeys(sys.path))

# Run the rest of the notebook from the project root.
os.chdir('/home/jovyan/work/sem-covid/')
from sem_covid import config
from sem_covid.services.store_registry import store_registry
import pandas as pd
from io import StringIO
import json

In [2]:
# MinIO bucket used by the functions below for reading and writing objects.
MINIO_RML_BUCKET = "rdf-transformer"

# Prefix used when minting a URI for each exported categorical value.
SC_BASE_URI = "http://publications.europa.eu/resource/ontology/sc#"

# Columns of the unified dataset that are exported as categorical fields.
DS_UNIFIED_DATASET_CATEGORICAL_FIELDS = [
    "country",
    "pwdb_category",
    "pwdb_target_group_l1",
    "pwdb_funding",
    "pwdb_type_of_measure",
    "pwdb_actors",
    "eu_cellar_subject_matter_labels",
    "eu_cellar_resource_type_labels",
    "eu_cellar_author_labels",
    "pwdb_target_group_l2",
    "ireland_keyword",
    "ireland_page_type",
    "eu_timeline_topic",
    "eu_cellar_directory_code_labels",
]

# (terms CSV file name, dataset column name) pairs: the columns whose exported
# values are matched against a vocabulary CSV stored under `euvoc_terms/`.
EUVOC_TERMS_MAPPING = [
    ("corporate_body.csv", "eu_cellar_author_labels"),
    ("resource_type.csv", "eu_cellar_resource_type_labels"),
    ("subject_matter.csv", "eu_cellar_subject_matter_labels"),
    ("country.csv", "country"),
]

In [3]:
# Load the 'ds_unified_dataset' index from the Elasticsearch index store as a DataFrame.
df = (
    store_registry
    .es_index_store()
    .get_dataframe(index_name='ds_unified_dataset')
)

100% (6360 of 6360) |####################| Elapsed Time: 0:00:16 Time:  0:00:16


In [4]:
def export_categorical_field_from_dataset(dataset: pd.DataFrame, field_name: str) -> dict:
    """Collect the distinct values of one column and mint a URI for each.

    List-valued cells are flattened with ``Series.explode`` so every list
    element counts as its own value. Missing values (``None``/``NaN``,
    including those produced by empty lists) are skipped instead of being
    exported as a category literally named "nan" or "None".

    The distinct names are sorted before numbering, so the same set of values
    always yields the same ``<SC_BASE_URI><field_name>_<index>`` URIs from
    one run to the next. The index is positional: adding or removing a value
    shifts the URIs of the names sorted after it.

    :param dataset: frame holding the column to export.
    :param field_name: name of the column in ``dataset``.
    :return: ``{field_name: [{"uri": ..., "name": ...}, ...]}`` with one entry
        per distinct value; names are the ``str()`` form of the values.
    :raises KeyError: if ``field_name`` is not a column of ``dataset``.
    """
    present_values = dataset[field_name].explode().dropna()
    # De-duplicate on the exported (string) form and fix the order so that
    # the index-based URIs are reproducible.
    names = sorted({str(value) for value in present_values})
    return {
        field_name: [
            {'uri': f"{SC_BASE_URI}{field_name}_{index}", "name": name}
            for index, name in enumerate(names)
        ]
    }

In [5]:
def enrich_with_euvoc_terms(euvoc_terms_file: str, column_name: str) -> None:
    """Replace generated URIs in an exported field file with vocabulary URIs.

    Reads ``euvoc_terms/<euvoc_terms_file>`` (a CSV with ``label`` and ``uri``
    columns) and ``fields/<column_name>.json`` from the MinIO bucket named by
    ``MINIO_RML_BUCKET``. Every record whose ``name`` equals a vocabulary
    ``label`` gets that row's ``uri``; records without a matching label are
    dropped. The result is written back to ``fields/<column_name>.json``.

    :param euvoc_terms_file: file name of the vocabulary CSV under ``euvoc_terms/``.
    :param column_name: field whose JSON export is enriched; also the top-level
        key inside that JSON document.
    :raises KeyError: if the CSV lacks a ``label``/``uri`` column or the JSON
        document lacks the ``column_name`` key.
    """
    minio = store_registry.minio_object_store(minio_bucket=MINIO_RML_BUCKET)
    fields_object_name = f'fields/{column_name}.json'
    euvoc_terms_buffer = minio.get_object(object_name='euvoc_terms/' + euvoc_terms_file).decode('utf8')
    data_buffer = minio.get_object(object_name=fields_object_name).decode('utf8')
    euvoc_df = pd.read_csv(StringIO(euvoc_terms_buffer))
    data = json.loads(data_buffer)

    # Build the label -> uri lookup once instead of scanning the whole frame
    # for every record. keep='first' means a label that occurs several times
    # resolves to its first row.
    uri_by_label = (
        euvoc_df.drop_duplicates(subset='label', keep='first')
        .set_index('label')['uri']
        .to_dict()
    )

    # Keep only the records known to the vocabulary, overriding their URI and
    # leaving any other keys of the record untouched.
    data[column_name] = [
        {**record, 'uri': uri_by_label[record['name']]}
        for record in data[column_name]
        if record['name'] in uri_by_label
    ]
    minio.put_object(object_name=fields_object_name, content=json.dumps(data))

In [6]:

def export_categorical_fields_from_dataset(dataset: pd.DataFrame, field_names: list, euvoc_terms_mapping: list):
    """Export categorical fields to MinIO, then enrich the mapped ones.

    For every name in ``field_names`` the distinct values are exported with
    ``export_categorical_field_from_dataset`` and uploaded as
    ``fields/<field_name>.json`` to the ``MINIO_RML_BUCKET`` bucket. After
    that, each ``(terms file, column name)`` pair in ``euvoc_terms_mapping``
    is passed to ``enrich_with_euvoc_terms``.

    :param dataset: frame holding the columns to export.
    :param field_names: names of the columns to export.
    :param euvoc_terms_mapping: ``(euvoc_terms_file, column_name)`` pairs to
        enrich. The enrichment step reads ``fields/<column_name>.json``, so
        each column is expected to have been exported already.
    """
    minio = store_registry.minio_object_store(minio_bucket=MINIO_RML_BUCKET)
    for field_name in field_names:
        field_export = export_categorical_field_from_dataset(dataset=dataset, field_name=field_name)
        minio.put_object(object_name=f"fields/{field_name}.json", content=json.dumps(field_export))
    # Iterate the mapping passed by the caller, not the module-level
    # constant, so the argument is actually honoured.
    for euvoc_terms_file, column_name in euvoc_terms_mapping:
        enrich_with_euvoc_terms(euvoc_terms_file=euvoc_terms_file, column_name=column_name)

In [7]:
# Export every configured categorical field of the loaded dataset.
export_categorical_fields_from_dataset(
    dataset=df,
    field_names=DS_UNIFIED_DATASET_CATEGORICAL_FIELDS,
    euvoc_terms_mapping=EUVOC_TERMS_MAPPING,
)