Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Morph-KGC in-memory extension #160

Merged
merged 6 commits into from
Apr 9, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 2 additions & 0 deletions examples/dataframe/config.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[DataSource]
mappings=./mapping_rml.ttl
26 changes: 26 additions & 0 deletions examples/dataframe/kg_generation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
__author__ = "Ioannis Dasoulas"
__credits__ = ["Julián Arenas-Guerrero","Ioannis Dasoulas"]

__license__ = "Apache-2.0"
__maintainer__ = "Ioannis Dasoulas"
__email__ = "ioannis.dasoulas@kuleuven.be"

import morph_kgc
import pandas as pd

# Example: materialize a knowledge graph from pandas DataFrames held in memory.
#
# Both tables share the 'Id' column, which the mapping uses to build the
# subject IRI of every generated triple.
users_df = pd.DataFrame({
    'Id': [1, 2, 3, 4],
    'Username': ["@jude", "@emily", "@wayne", "@jordan1"],
    'Name': ["Jude", "Emily", "Wayne", "Jordan"],
    'Surname': ["White", "Van de Beeck", "Peterson", "Stones"],
})

followers_df = pd.DataFrame({
    'Id': [1, 2, 3, 4],
    'Followers': [344, 456, 1221, 23],
})

# The keys correspond to the sd:name values ("variable1", "variable2")
# declared in the logical sources of mapping_rml.ttl.
data_dict = {
    "variable1": users_df,
    "variable2": followers_df,
}

# config.ini only points at the mapping file; the data itself is passed
# as the second argument.
g_rdflib = morph_kgc.materialize('./config.ini', data_dict)

print("Knowledge graphs triples:")
for s, p, o in g_rdflib.triples((None, None, None)):
    print(s, p, o)
107 changes: 107 additions & 0 deletions examples/dataframe/mapping_rml.ttl
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
# Namespace declarations for the DataFrame example mapping.
# NOTE(review): xsd, d2rq, foaf and schema are declared but not referenced in this file.
@prefix insta: <http://instagram.com/data/>.
@prefix rr: <http://www.w3.org/ns/r2rml#>.
@prefix rml: <http://semweb.mmlab.be/ns/rml#>.
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>.
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>.
@prefix xsd: <http://www.w3.org/2001/XMLSchema#>.
@prefix ql: <http://semweb.mmlab.be/ns/ql#>.
@prefix d2rq: <http://www.wiwiss.fu-berlin.de/suhl/bizer/D2RQ/0.1#>.
@prefix foaf: <http://xmlns.com/foaf/0.1/>.
# Relative IRIs such as <people_0> are resolved against this base.
@base <http://example.com/ns#>.
@prefix schema: <http://schema.org/>.
@prefix sd: <https://w3id.org/okn/o/sd/>.
# NOTE(review): this namespace IRI already ends in "definedBy", so kg4di:definedBy
# expands to <https://w3id.org/kg4di/definedBydefinedBy>. The intended namespace is
# presumably <https://w3id.org/kg4di/> - confirm against the engine before changing.
@prefix kg4di: <https://w3id.org/kg4di/definedBy>.


# Triples map for the users table: types each subject as insta:User and
# attaches a username and a full-name literal.
<people_0> a rr:TriplesMap;

# The source is identified by name instead of a file path.
rml:logicalSource [
a rml:LogicalSource;
rml:source [
a sd:DatasetSpecification;
# Same string as the "variable1" key of the dict passed to morph_kgc.materialize in kg_generation.py.
sd:name "variable1";
# Descriptive metadata about the producing environment.
# NOTE(review): whether the engine reads these values is not visible here.
sd:hasDataTransformation [
sd:hasSoftwareRequirements "pandas>=1.1.0";
sd:hasSourceCode [
sd:programmingLanguage "Python3.9";
];
];
];
rml:referenceFormulation ql:DataFrame;
];
# Subject IRI is built from the Id column.
rr:subjectMap [
a rr:SubjectMap;
rr:template "http://instagram.com/data/user{Id}";
];
# rdf:type insta:User
rr:predicateObjectMap [
rr:predicateMap [
a rr:PredicateMap;
rr:constant rdf:type;
];
rr:objectMap [
a rr:ObjectMap;
rr:constant insta:User;
];
];
# insta:username taken from the Username column.
rr:predicateObjectMap [
rr:predicateMap [
a rr:PredicateMap;
rr:constant insta:username;
];
rr:objectMap [
a rr:ObjectMap;
rml:reference "Username";
];
];
# insta:name is the concatenation "<Name> <Surname>".
rr:predicateObjectMap [
rr:predicateMap [
a rr:PredicateMap;
rr:constant insta:name;
];
rr:objectMap [
a rr:ObjectMap;
rr:template "{Name} {Surname}";
# A template-valued term map defaults to an IRI in R2RML; giving it a
# datatype makes the generated term a literal instead.
rr:datatype rdfs:Literal
];
].


# Triples map for the followers table: attaches a follower count to the
# same user IRIs produced by <people_0>.
<followers_0> a rr:TriplesMap;

rml:logicalSource [
a rml:LogicalSource;
rml:source [
a sd:DatasetSpecification;
# Same string as the "variable2" key of the dict passed to morph_kgc.materialize in kg_generation.py.
sd:name "variable2";
sd:hasDataTransformation [
sd:hasSoftwareRequirements "pandas>=1.1.0";
sd:hasSourceCode [
sd:programmingLanguage "Python3.9";
];
];
];
rml:referenceFormulation ql:DataFrame;
];
# Identical template to <people_0>, so both maps describe the same subjects.
rr:subjectMap [
a rr:SubjectMap;
rr:template "http://instagram.com/data/user{Id}";
];
# insta:followersNumber taken from the Followers column.
rr:predicateObjectMap [
rr:predicateMap [
a rr:PredicateMap;
rr:constant insta:followersNumber;
];
rr:objectMap [
a rr:ObjectMap;
rml:reference "Followers";
];
].

# Declares the reference formulation named by the logical sources in this file.
# NOTE(review): with the kg4di prefix as declared in this file, the predicate below
# expands to <https://w3id.org/kg4di/definedBydefinedBy> - confirm that is intended.
ql:DataFrame a rml:ReferenceFormulation;
kg4di:definedBy "Pandas".






2 changes: 2 additions & 0 deletions examples/json_in_memory/config.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[DataSource]
mappings=./mapping_rml.ttl
68 changes: 68 additions & 0 deletions examples/json_in_memory/kg_generation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
__author__ = "Ioannis Dasoulas"
__credits__ = ["Julián Arenas-Guerrero","Ioannis Dasoulas"]

__license__ = "Apache-2.0"
__maintainer__ = "Ioannis Dasoulas"
__email__ = "ioannis.dasoulas@kuleuven.be"

import morph_kgc

# Example: materialize a knowledge graph from plain Python dictionaries.
#
# Each dictionary wraps its records in a top-level list ("users" /
# "followers"), which the mapping iterates over with a JSONPath iterator.
users_dict = {"users": [
    {
        "id": 1,
        "username": "@jude",
        "name": "Jude",
        "surname": "White"
    },
    {
        "id": 2,
        "username": "@emily",
        "name": "Emily",
        "surname": "Van de Beeck"
    },
    {
        "id": 3,
        "username": "@wayne",
        "name": "Wayne",
        "surname": "Peterson"
    },
    {
        "id": 4,
        "username": "@jordan1",
        "name": "Jordan",
        "surname": "Stones"
    }
]}


# NOTE(review): "followed_by" is a scalar in some records and a list in
# others, and some ids (5) have no matching user above. The mapping only
# reads "id" and "follows", so this does not affect the generated triples
# as far as the mapping shows - confirm whether the irregular data is
# intentional.
followers_dict = {"followers": [
    {
        "id": 1,
        "follows": [2, 3],
        "followed_by": 2
    },
    {
        "id": 2,
        "follows": [3, 5],
        "followed_by": [1, 3, 4, 5]
    },
    {
        "id": 3,
        "follows": [1, 2],
        "followed_by": 1
    },
    {
        "id": 4,
        "follows": [1, 2, 3],
        "followed_by": [2, 3]
    }
]}

# The keys correspond to the sd:name values ("variable1", "variable2")
# declared in the logical sources of mapping_rml.ttl.
data_dict = {
    "variable1": users_dict,
    "variable2": followers_dict,
}

# config.ini only points at the mapping file; the data itself is passed
# as the second argument.
g_rdflib = morph_kgc.materialize('./config.ini', data_dict)

print("Knowledge graphs triples:")
for s, p, o in g_rdflib.triples((None, None, None)):
    print(s, p, o)
108 changes: 108 additions & 0 deletions examples/json_in_memory/mapping_rml.ttl
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
# Namespace declarations for the in-memory dictionary example mapping.
# NOTE(review): xsd, d2rq, foaf and schema are declared but not referenced in this file.
@prefix insta: <http://instagram.com/data/>.
@prefix rr: <http://www.w3.org/ns/r2rml#>.
@prefix rml: <http://semweb.mmlab.be/ns/rml#>.
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>.
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>.
@prefix xsd: <http://www.w3.org/2001/XMLSchema#>.
@prefix ql: <http://semweb.mmlab.be/ns/ql#>.
@prefix d2rq: <http://www.wiwiss.fu-berlin.de/suhl/bizer/D2RQ/0.1#>.
@prefix foaf: <http://xmlns.com/foaf/0.1/>.
# Relative IRIs such as <people_0> are resolved against this base.
@base <http://example.com/ns#>.
@prefix schema: <http://schema.org/>.
@prefix sd: <https://w3id.org/okn/o/sd/>.
# NOTE(review): this namespace IRI already ends in "definedBy", so kg4di:definedBy
# expands to <https://w3id.org/kg4di/definedBydefinedBy>. The intended namespace is
# presumably <https://w3id.org/kg4di/> - confirm against the engine before changing.
@prefix kg4di: <https://w3id.org/kg4di/definedBy>.


# Triples map for the user records: types each subject as insta:User and
# attaches a username and a full-name literal.
<people_0> a rr:TriplesMap;

# The source is identified by name instead of a file path.
rml:logicalSource [
a rml:LogicalSource;
rml:source [
a sd:DatasetSpecification;
# Same string as the "variable1" key of the dict passed to morph_kgc.materialize in kg_generation.py.
sd:name "variable1";
# Descriptive metadata about the producing environment.
# NOTE(review): whether the engine reads this value is not visible here.
sd:hasDataTransformation [
sd:hasSourceCode [
sd:programmingLanguage "Python3.9";
];
];
];
rml:referenceFormulation ql:Dictionary;
# JSONPath selecting every element of the top-level "users" list.
rml:iterator "$.users[*]";
];
# Subject IRI is built from the "id" field of each record.
rr:subjectMap [
a rr:SubjectMap;
rr:template "http://instagram.com/data/user{id}";
];
# rdf:type insta:User
rr:predicateObjectMap [
rr:predicateMap [
a rr:PredicateMap;
rr:constant rdf:type;
];
rr:objectMap [
a rr:ObjectMap;
rr:constant insta:User;
];
];
# insta:username taken from the "username" field.
rr:predicateObjectMap [
rr:predicateMap [
a rr:PredicateMap;
rr:constant insta:username;
];
rr:objectMap [
a rr:ObjectMap;
rml:reference "username";
];
];
# insta:name is the concatenation "<name> <surname>".
rr:predicateObjectMap [
rr:predicateMap [
a rr:PredicateMap;
rr:constant insta:name;
];
rr:objectMap [
a rr:ObjectMap;
rr:template "{name} {surname}";
# A template-valued term map defaults to an IRI in R2RML; giving it a
# datatype makes the generated term a literal instead.
rr:datatype rdfs:Literal
];
].


# Triples map for the follower records: links each user IRI to the IRIs of
# the users it follows.
<followers_0> a rr:TriplesMap;

rml:logicalSource [
a rml:LogicalSource;
rml:source [
a sd:DatasetSpecification;
# Same string as the "variable2" key of the dict passed to morph_kgc.materialize in kg_generation.py.
sd:name "variable2";
sd:hasDataTransformation [
sd:hasSourceCode [
sd:programmingLanguage "Python3.9";
];
];
];
rml:referenceFormulation ql:Dictionary;
# JSONPath selecting every element of the top-level "followers" list.
rml:iterator "$.followers[*]";
];
# Identical template to <people_0>, so both maps describe the same subjects.
rr:subjectMap [
a rr:SubjectMap;
rr:template "http://instagram.com/data/user{id}";
];
# insta:follows pointing at another user IRI.
rr:predicateObjectMap [
rr:predicateMap [
a rr:PredicateMap;
rr:constant insta:follows;
];
rr:objectMap [
a rr:ObjectMap;
# NOTE(review): "follows" holds a list in the example data; presumably the
# engine emits one object per list element - verify against the generated output.
rr:template "http://instagram.com/data/user{follows}";
rr:termType rr:IRI
];
].

# Declares the reference formulation named by the logical sources in this file.
# NOTE(review): with the kg4di prefix as declared in this file, the predicate below
# expands to <https://w3id.org/kg4di/definedBydefinedBy> - confirm that is intended.
ql:Dictionary a rml:ReferenceFormulation;
kg4di:definedBy "Python".






16 changes: 8 additions & 8 deletions src/morph_kgc/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
from .constants import R2RML_TRIPLES_MAP_CLASS


def materialize_set(config):
def materialize_set(config, python_source=None):
config = load_config_from_argument(config)

# parallelization when running as a library is only enabled for Linux see #94
Expand All @@ -36,7 +36,7 @@ def materialize_set(config):

setup_oracle(config)

rml_df, fno_df = retrieve_mappings(config)
rml_df, fno_df = retrieve_mappings(config, python_source)

# keep only asserted mapping rules
asserted_mapping_df = rml_df.loc[rml_df['triples_map_type'] == R2RML_TRIPLES_MAP_CLASS]
Expand All @@ -47,21 +47,21 @@ def materialize_set(config):

pool = mp.Pool(config.get_number_of_processes())
triples = set().union(
*pool.starmap(_materialize_mapping_group_to_set, zip(mapping_groups, repeat(rml_df), repeat(fno_df), repeat(config))))
*pool.starmap(_materialize_mapping_group_to_set, zip(mapping_groups, repeat(rml_df), repeat(fno_df), repeat(config), repeat(python_source))))
pool.close()
pool.join()
else:
triples = set()
for mapping_group in mapping_groups:
triples.update(_materialize_mapping_group_to_set(mapping_group, rml_df, fno_df, config))
triples.update(_materialize_mapping_group_to_set(mapping_group, rml_df, fno_df, config, python_source))

logging.info(f'Number of triples generated in total: {len(triples)}.')

return triples


def materialize(config):
triples = materialize_set(config)
def materialize(config, python_source=None):
triples = materialize_set(config, python_source)

graph = Graph()
rdf_ntriples = '.\n'.join(triples)
Expand All @@ -73,8 +73,8 @@ def materialize(config):
return graph


def materialize_oxigraph(config):
triples = materialize_set(config)
def materialize_oxigraph(config, python_source=None):
triples = materialize_set(config, python_source)

graph = Store()
rdf_ntriples = '.\n'.join(triples)
Expand Down
7 changes: 7 additions & 0 deletions src/morph_kgc/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,15 @@
POSTGRESQL = 'POSTGRESQL'
SQLITE = 'SQLITE'

# in-memory data
# Source-type identifiers for data that is not read from a file or database.
# NOTE(review): how each identifier is assigned to a mapping rule is decided
# where these constants are consumed, which is not visible here.
PYTHON_SOURCE = 'PYTHON_SOURCE'
DATAFRAME = 'DATAFRAME'
DICTIONARY = 'DICTIONARY'
JSON_STRING = 'JSON_STRING'

# EXCEL, FEATHER, SAS and ODS are appended with `+`, so each of them is
# itself a list (defined earlier in this module).
FILE_SOURCE_TYPES = [CSV, TSV, PARQUET, ORC, STATA, SPSS, JSON, XML] + EXCEL + FEATHER + SAS + ODS
DATA_SOURCE_TYPES = [RDB] + FILE_SOURCE_TYPES
# Kept as a separate list: in-memory types are not part of DATA_SOURCE_TYPES.
IN_MEMORY_TYPES = [PYTHON_SOURCE, DATAFRAME, DICTIONARY, JSON_STRING]

# RDF serializations
NTRIPLES = 'N-TRIPLES'
Expand Down