In [1]:
import os
import sys

# Root of the sem-covid checkout; the project package is imported from here
# and it also becomes the working directory of the notebook.
PROJECT_ROOT = '/home/jovyan/work/sem-covid/'

# Make the project importable. Duplicates are removed with dict.fromkeys,
# which keeps the first occurrence of every entry and therefore the original
# lookup order. Deduplicating through set() would reorder sys.path arbitrarily
# and change which module wins when a name exists in several locations.
sys.path.append(PROJECT_ROOT)
sys.path = list(dict.fromkeys(sys.path))

os.chdir(PROJECT_ROOT)
from sem_covid.services.store_registry import store_registry
import pandas as pd
import json

In [2]:
# MinIO buckets: the one the feature store (similarity matrix) is opened on,
# and the one the object store used for uploads and RML rules is opened on.
SEMANTIC_SIMILARITY_MINIO_BUCKET = 'semantic-similarity-matrices'
RDF_TRANSFORMER_MINIO_BUCKET = 'rdf-transformer'

# Name of the stored similarity-matrix features.
SEMANTIC_SIMILARITY_DATAFRAME = 'unified_dataset_similarity_matrix.pkl'

# Base name (without the .json extension) of the JSON document generated from
# the similarity matrix.
DS_UNIFIED_SEM_SIMILARITY_MATRIX = 'ds_unified_sem_similarity_matrix'

In [3]:
# Feature store opened on the similarity-matrix bucket.
feature_store = store_registry.minio_feature_store(
    minio_bucket=SEMANTIC_SIMILARITY_MINIO_BUCKET,
)

# Object store opened on the RDF-transformer bucket.
minio = store_registry.minio_object_store(
    minio_bucket=RDF_TRANSFORMER_MINIO_BUCKET,
)

# Load the stored similarity matrix.
sim_df = feature_store.get_features(
    features_name=SEMANTIC_SIMILARITY_DATAFRAME,
)

In [4]:
# Small working sample: an independent copy of the first five rows of the
# similarity matrix, so `sim_df` itself is never modified.
tmp = pd.DataFrame(
    sim_df.head(5).copy()
)

In [5]:
# Flatten the sample into one record per (source, destination) pair, keeping
# only the first five columns of every row.
#
# Two deliberate choices:
# * no variable is called `iter`: binding that name at notebook level would
#   shadow the builtin iter() for every cell executed afterwards;
# * the values are taken positionally (iloc + items) rather than looked up
#   again by label, so a repeated column label still yields one scalar per
#   record instead of a Series.
result = [
    {"measure_src": measure_src, "measure_dest": measure_dest, "similarity": similarity}
    for measure_src, row in tmp.iterrows()
    for measure_dest, similarity in row.iloc[:5].items()
]

In [6]:
# Serialise the records as a JSON document with a single top-level
# "similarity_matrix" key.
json_result = json.dumps(
    {'similarity_matrix': result}
)

In [7]:
# Upload the JSON document under the sources/ prefix of the object store.
# The call stays the last expression of the cell so its return value is shown.
minio.put_object(
    object_name=f'sources/{DS_UNIFIED_SEM_SIMILARITY_MATRIX}.json',
    content=json_result,
)

2104

In [12]:
from sem_covid import config
from sem_covid.adapters.rml_mapper import RMLMapper

MINIO_RML_RULES_DIR = 'rml_rules'
RML_RULES_FILE_NAME = 'ds_unified_similarity_matrix.ttl'

# Fetch the RML mapping rules from the object store and decode them to text.
rml_rule = minio.get_object(
    object_name=f'{MINIO_RML_RULES_DIR}/{RML_RULES_FILE_NAME}').decode('utf8')

rml_mapper = RMLMapper(rml_mapper_url=config.RML_MAPPER_URL)

# The source key is built from the same constant that names the uploaded
# sources/<name>.json object, so the two cannot drift apart.
sources = {f'{DS_UNIFIED_SEM_SIMILARITY_MATRIX}.json': json_result}

# NOTE(review): `result` held the list of similarity records up to this point
# and is rebound here to the mapper output. Re-running the json.dumps cell
# after this one would serialise the mapper output instead of the records;
# consider a distinct name once no other cell relies on `result`.
result = rml_mapper.transform(rml_rule=rml_rule, sources=sources)

In [14]:
import numpy as np
from sem_covid.adapters.abstract_store import ObjectStoreABC, FeatureStoreABC, TripleStoreABC
from sem_covid.adapters.rml_mapper import RMLMapperABC
from typing import List

# Object-name prefixes used by the pipeline in the object store.
MINIO_RML_RULES_DIR = 'rml_rules'    # RML rule files are read from here
MINIO_RML_SOURCES_DIR = 'fields'     # auxiliary source files are read from here
MINIO_RML_RESULTS_DIR = 'results'    # the RDF result is written here

# Name used both to load the dataset and as the triple-store dataset id.
DATASET_INDEX_NAME = 'ds_unified_dataset'

# Number of dataset rows per part handed to the RML mapper.
DATASET_PART_SIZE = 100

# Format identifier passed to the triple store when uploading the result.
RDF_RESULT_FORMAT = 'nt11'

class SemanticSimilarityMapRMLTransformPipeline:
    """
    Extract-transform-load pipeline that turns a dataset into RDF with an RML mapper.

    extract() reads the RML rules, the auxiliary source files and the dataset and
    splits the dataset into parts; transform() sends every part to the RML mapper
    and joins the outputs; load() writes the joined RDF to the object store and
    uploads it to the triple store. execute() runs the three steps in order.
    """

    def __init__(self,
                 rml_rules_file_name: str,
                 source_file_names: List[str],
                 rdf_result_file_name: str,
                 rml_mapper: RMLMapperABC,
                 object_storage: ObjectStoreABC,
                 feature_storage: FeatureStoreABC,
                 triple_storage: TripleStoreABC,
                 use_sample_data: bool = False
                 ):
        """
        Store the collaborators and reset all intermediate state.

        :param rml_rules_file_name: name of the RML rules file under MINIO_RML_RULES_DIR
        :param source_file_names: names of the source files under MINIO_RML_SOURCES_DIR
        :param rdf_result_file_name: object name under MINIO_RML_RESULTS_DIR for the RDF result
        :param rml_mapper: mapper that transforms the rules and sources into RDF
        :param object_storage: store the rules and sources are read from and the result is written to
        :param feature_storage: store the dataset is loaded from
        :param triple_storage: triple store that receives the RDF result
        :param use_sample_data: when True, only the first 100 rows of the dataset are processed
        """
        self.rml_rules_file_name = rml_rules_file_name
        self.source_file_names = source_file_names
        self.rdf_result_file_name = rdf_result_file_name
        self.rml_mapper = rml_mapper
        self.object_storage = object_storage
        self.feature_storage = feature_storage
        self.triple_storage = triple_storage
        self.use_sample_data = use_sample_data
        # Intermediate state, filled in by extract() and transform().
        self.rml_rule = None
        self.sources = None
        self.dataset = None
        self.dataset_parts = None
        self.rdf_results = None

    def extract(self):
        """
        Read the RML rules, the source files and the dataset, and split the
        dataset into parts of at most DATASET_PART_SIZE rows.

        :return: None; the results are kept in rml_rule, sources, dataset and dataset_parts
        """
        self.rml_rule = self.object_storage.get_object(
            object_name=f'{MINIO_RML_RULES_DIR}/{self.rml_rules_file_name}').decode('utf8')
        self.sources = {
            file_name: self.object_storage.get_object(object_name=f'{MINIO_RML_SOURCES_DIR}/{file_name}').decode('utf8')
            for file_name in self.source_file_names}
        # NOTE(review): elsewhere in this notebook a feature store is read with
        # get_features(features_name=...); confirm that the store passed in
        # really provides get_dataframe(index_name=...).
        dataset = self.feature_storage.get_dataframe(index_name=DATASET_INDEX_NAME)
        # Expose the index as a regular column so it is part of the exported JSON.
        dataset['index'] = dataset.index
        self.dataset = dataset.head(100) if self.use_sample_data else dataset
        df_size = len(self.dataset)
        part_size = DATASET_PART_SIZE
        # Consecutive slices of part_size rows (the last one may be shorter).
        # max(df_size, 1) keeps a single, empty part for an empty dataset so
        # that transform() still performs one mapper call in that case.
        self.dataset_parts = [self.dataset.iloc[start:start + part_size]
                              for start in range(0, max(df_size, 1), part_size)]

    def transform(self):
        """
        Run the RML mapper on every dataset part and join the outputs.

        :return: None; the joined RDF text is kept in rdf_results
        """
        assert self.rml_rule is not None, 'extract() must be called before transform()'
        assert self.sources is not None, 'extract() must be called before transform()'
        assert self.dataset_parts is not None, 'extract() must be called before transform()'
        rdf_parts = []
        for dataset_part in self.dataset_parts:
            # Every mapper call gets its own sources dict: the shared source
            # files plus the current part serialised as 'data.json'.
            sources = self.sources.copy()
            sources['data.json'] = dataset_part.to_json(orient='index')
            rdf_parts.append(self.rml_mapper.transform(rml_rule=self.rml_rule, sources=sources))
        # The complete result is held in memory as one string; this is the
        # source of potential resource issues. rdf_results is assigned only
        # once all parts have been mapped, so a failure above never leaves a
        # partial list behind for load().
        self.rdf_results = '\n'.join(rdf_parts)

    def load(self):
        """
        Write the RDF result to the object store and upload it to the triple store.

        :return: None
        """
        assert self.rdf_results is not None, 'transform() must be called before load()'
        self.object_storage.put_object(object_name=f'{MINIO_RML_RESULTS_DIR}/{self.rdf_result_file_name}',
                                       content=self.rdf_results.encode('utf8'))
        self.triple_storage.create_dataset(dataset_id=DATASET_INDEX_NAME)
        self.triple_storage.upload_triples(dataset_id=DATASET_INDEX_NAME, quoted_triples=self.rdf_results,
                                           rdf_fmt=RDF_RESULT_FORMAT)

    def execute(self):
        """
        Run extract(), transform() and load() in that order.

        :return: None
        """
        self.extract()
        self.transform()
        self.load()

In [None]:
# Bucket, RML rules file and result file name for the unified dataset.
MINIO_RML_BUCKET = 'rdf-transformer'
RML_RULES_FILE_NAME = 'ds_unified_dataset.ttl'
RDF_RESULT_FILE_NAME = 'ds_unified_dataset_result.ttl'

# JSON source files referenced by the mapping, one per line.
RML_MAPPING_SOURCES = [
    'country.json',
    'datasets.json',
    'eu_cellar_author_labels.json',
    'eu_cellar_directory_code_labels.json',
    'eu_cellar_resource_type_labels.json',
    'eu_cellar_subject_matter_labels.json',
    'eu_timeline_topic.json',
    'ireland_keyword.json',
    'ireland_page_type.json',
    'pwdb_actors.json',
    'pwdb_category.json',
    'pwdb_funding.json',
    'pwdb_target_group_l1.json',
    'pwdb_target_group_l2.json',
    'pwdb_type_of_measure.json',
]