Skip to content

Commit

Permalink
Merge pull request #160 from TheRazorace/main
Browse files Browse the repository at this point in the history
Morph-KGC in-memory extension
  • Loading branch information
arenas-guerrero-julian committed Apr 9, 2023
2 parents 25f8d08 + b76ab4b commit 425494d
Show file tree
Hide file tree
Showing 219 changed files with 6,575 additions and 25 deletions.
2 changes: 2 additions & 0 deletions examples/dataframe/config.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[DataSource]
mappings=./mapping_rml.ttl
26 changes: 26 additions & 0 deletions examples/dataframe/kg_generation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
__author__ = "Ioannis Dasoulas"
__credits__ = ["Julián Arenas-Guerrero","Ioannis Dasoulas"]

__license__ = "Apache-2.0"
__maintainer__ = "Ioannis Dasoulas"
__email__ = "ioannis.dasoulas@kuleuven.be"

import morph_kgc
import pandas as pd

# Example: build a knowledge graph from pandas DataFrames held in memory.
# Both tables share the 'Id' column, which the mapping uses to mint the
# subject IRI for each row.
users_df = pd.DataFrame({
    'Id': [1, 2, 3, 4],
    'Username': ["@jude", "@emily", "@wayne", "@jordan1"],
    'Name': ["Jude", "Emily", "Wayne", "Jordan"],
    'Surname': ["White", "Van de Beeck", "Peterson", "Stones"],
})

followers_df = pd.DataFrame({
    'Id': [1, 2, 3, 4],
    'Followers': [344, 456, 1221, 23],
})

# The keys are the names the mapping refers to through sd:name
# ("variable1" / "variable2" in mapping_rml.ttl).
data_dict = {
    "variable1": users_df,
    "variable2": followers_df,
}

# Second argument hands the in-memory sources to the engine.
g_rdflib = morph_kgc.materialize('./config.ini', data_dict)

print("Knowledge graphs triples:")
# A (None, None, None) pattern matches every triple in the graph.
for s, p, o in g_rdflib.triples((None, None, None)):
    print(s, p, o)
107 changes: 107 additions & 0 deletions examples/dataframe/mapping_rml.ttl
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
@prefix insta: <http://instagram.com/data/>.
@prefix rr: <http://www.w3.org/ns/r2rml#>.
@prefix rml: <http://semweb.mmlab.be/ns/rml#>.
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>.
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>.
@prefix xsd: <http://www.w3.org/2001/XMLSchema#>.
@prefix ql: <http://semweb.mmlab.be/ns/ql#>.
@prefix d2rq: <http://www.wiwiss.fu-berlin.de/suhl/bizer/D2RQ/0.1#>.
@prefix foaf: <http://xmlns.com/foaf/0.1/>.
@base <http://example.com/ns#>.
@prefix schema: <http://schema.org/>.
@prefix sd: <https://w3id.org/okn/o/sd/>.
# The namespace must stop before the local name: with the previous value
# (".../kg4di/definedBy") the term kg4di:definedBy expanded to
# <https://w3id.org/kg4di/definedBydefinedBy>.
@prefix kg4di: <https://w3id.org/kg4di/>.


# Triples map for the users table: types each row as insta:User and attaches
# its username and full name.
<people_0> a rr:TriplesMap;

rml:logicalSource [
a rml:LogicalSource;
rml:source [
a sd:DatasetSpecification;
# Name of the in-memory source; the example script supplies a dict entry with this key.
sd:name "variable1";
# Descriptive metadata about how the data is produced
# (presumably informational only - confirm whether the engine reads it).
sd:hasDataTransformation [
sd:hasSoftwareRequirements "pandas>=1.1.0";
sd:hasSourceCode [
sd:programmingLanguage "Python3.9";
];
];
];
rml:referenceFormulation ql:DataFrame;
];
# Subject IRI is built from the Id value of each row.
rr:subjectMap [
a rr:SubjectMap;
rr:template "http://instagram.com/data/user{Id}";
];
# rdf:type insta:User
rr:predicateObjectMap [
rr:predicateMap [
a rr:PredicateMap;
rr:constant rdf:type;
];
rr:objectMap [
a rr:ObjectMap;
rr:constant insta:User;
];
];
# insta:username taken from the Username field
rr:predicateObjectMap [
rr:predicateMap [
a rr:PredicateMap;
rr:constant insta:username;
];
rr:objectMap [
a rr:ObjectMap;
rml:reference "Username";
];
];
# insta:name is "<Name> <Surname>"; the rr:datatype makes this template-valued
# term a literal (a template object would otherwise default to an IRI in R2RML).
# NOTE(review): rdfs:Literal is the class of literals rather than a concrete
# datatype such as xsd:string - confirm this is intended.
rr:predicateObjectMap [
rr:predicateMap [
a rr:PredicateMap;
rr:constant insta:name;
];
rr:objectMap [
a rr:ObjectMap;
rr:template "{Name} {Surname}";
rr:datatype rdfs:Literal
];
].


# Triples map for the followers table: attaches a follower count to the same
# user IRIs that the users map produces (identical subject template).
<followers_0> a rr:TriplesMap;

rml:logicalSource [
a rml:LogicalSource;
rml:source [
a sd:DatasetSpecification;
# Name of the in-memory source; the example script supplies a dict entry with this key.
sd:name "variable2";
# Descriptive metadata about how the data is produced
# (presumably informational only - confirm whether the engine reads it).
sd:hasDataTransformation [
sd:hasSoftwareRequirements "pandas>=1.1.0";
sd:hasSourceCode [
sd:programmingLanguage "Python3.9";
];
];
];
rml:referenceFormulation ql:DataFrame;
];
# Subject IRI is built from the Id value of each row.
rr:subjectMap [
a rr:SubjectMap;
rr:template "http://instagram.com/data/user{Id}";
];
# insta:followersNumber taken from the Followers field
rr:predicateObjectMap [
rr:predicateMap [
a rr:PredicateMap;
rr:constant insta:followersNumber;
];
rr:objectMap [
a rr:ObjectMap;
rml:reference "Followers";
];
].

# Declares ql:DataFrame as a reference formulation and labels it with the
# literal "Pandas".
ql:DataFrame a rml:ReferenceFormulation;
kg4di:definedBy "Pandas".






2 changes: 2 additions & 0 deletions examples/json_in_memory/config.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[DataSource]
mappings=./mapping_rml.ttl
68 changes: 68 additions & 0 deletions examples/json_in_memory/kg_generation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
__author__ = "Ioannis Dasoulas"
__credits__ = ["Julián Arenas-Guerrero","Ioannis Dasoulas"]

__license__ = "Apache-2.0"
__maintainer__ = "Ioannis Dasoulas"
__email__ = "ioannis.dasoulas@kuleuven.be"

import morph_kgc

# Example: build a knowledge graph from plain Python dictionaries held in memory.
# Each dictionary wraps a list of records under a single top-level key, which
# is the array the mapping's iterator walks ("$.users[*]" / "$.followers[*]").
users_dict = {"users": [
    {
        "id": 1,
        "username": "@jude",
        "name": "Jude",
        "surname": "White",
    },
    {
        "id": 2,
        "username": "@emily",
        "name": "Emily",
        "surname": "Van de Beeck",
    },
    {
        "id": 3,
        "username": "@wayne",
        "name": "Wayne",
        "surname": "Peterson",
    },
    {
        "id": 4,
        "username": "@jordan1",
        "name": "Jordan",
        "surname": "Stones",
    },
]}

followers_dict = {"followers": [
    {
        "id": 1,
        "follows": [2, 3],
        "followed_by": 2,
    },
    {
        "id": 2,
        "follows": [3, 5],
        "followed_by": [1, 3, 4, 5],
    },
    {
        "id": 3,
        "follows": [1, 2],
        "followed_by": 1,
    },
    {
        "id": 4,
        "follows": [1, 2, 3],
        "followed_by": [2, 3],
    },
]}

# The keys are the names the mapping refers to through sd:name
# ("variable1" / "variable2" in mapping_rml.ttl).
data_dict = {
    "variable1": users_dict,
    "variable2": followers_dict,
}

# Second argument hands the in-memory sources to the engine.
g_rdflib = morph_kgc.materialize('./config.ini', data_dict)

print("Knowledge graphs triples:")
# A (None, None, None) pattern matches every triple in the graph.
for s, p, o in g_rdflib.triples((None, None, None)):
    print(s, p, o)
108 changes: 108 additions & 0 deletions examples/json_in_memory/mapping_rml.ttl
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
@prefix insta: <http://instagram.com/data/>.
@prefix rr: <http://www.w3.org/ns/r2rml#>.
@prefix rml: <http://semweb.mmlab.be/ns/rml#>.
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>.
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>.
@prefix xsd: <http://www.w3.org/2001/XMLSchema#>.
@prefix ql: <http://semweb.mmlab.be/ns/ql#>.
@prefix d2rq: <http://www.wiwiss.fu-berlin.de/suhl/bizer/D2RQ/0.1#>.
@prefix foaf: <http://xmlns.com/foaf/0.1/>.
@base <http://example.com/ns#>.
@prefix schema: <http://schema.org/>.
@prefix sd: <https://w3id.org/okn/o/sd/>.
# The namespace must stop before the local name: with the previous value
# (".../kg4di/definedBy") the term kg4di:definedBy expanded to
# <https://w3id.org/kg4di/definedBydefinedBy>.
@prefix kg4di: <https://w3id.org/kg4di/>.


# Triples map for the user records: types each record as insta:User and
# attaches its username and full name.
<people_0> a rr:TriplesMap;

rml:logicalSource [
a rml:LogicalSource;
rml:source [
a sd:DatasetSpecification;
# Name of the in-memory source; the example script supplies a dict entry with this key.
sd:name "variable1";
# Descriptive metadata about how the data is produced
# (presumably informational only - confirm whether the engine reads it).
sd:hasDataTransformation [
sd:hasSourceCode [
sd:programmingLanguage "Python3.9";
];
];
];
rml:referenceFormulation ql:Dictionary;
# Iterates over the elements of the top-level "users" array.
rml:iterator "$.users[*]";
];
# Subject IRI is built from the id value of each record.
rr:subjectMap [
a rr:SubjectMap;
rr:template "http://instagram.com/data/user{id}";
];
# rdf:type insta:User
rr:predicateObjectMap [
rr:predicateMap [
a rr:PredicateMap;
rr:constant rdf:type;
];
rr:objectMap [
a rr:ObjectMap;
rr:constant insta:User;
];
];
# insta:username taken from the username field
rr:predicateObjectMap [
rr:predicateMap [
a rr:PredicateMap;
rr:constant insta:username;
];
rr:objectMap [
a rr:ObjectMap;
rml:reference "username";
];
];
# insta:name is "<name> <surname>"; the rr:datatype makes this template-valued
# term a literal (a template object would otherwise default to an IRI in R2RML).
# NOTE(review): rdfs:Literal is the class of literals rather than a concrete
# datatype such as xsd:string - confirm this is intended.
rr:predicateObjectMap [
rr:predicateMap [
a rr:PredicateMap;
rr:constant insta:name;
];
rr:objectMap [
a rr:ObjectMap;
rr:template "{name} {surname}";
rr:datatype rdfs:Literal
];
].


# Triples map for the follower records: links each user IRI to the IRIs of the
# users it follows (same subject template as the users map).
<followers_0> a rr:TriplesMap;

rml:logicalSource [
a rml:LogicalSource;
rml:source [
a sd:DatasetSpecification;
# Name of the in-memory source; the example script supplies a dict entry with this key.
sd:name "variable2";
# Descriptive metadata about how the data is produced
# (presumably informational only - confirm whether the engine reads it).
sd:hasDataTransformation [
sd:hasSourceCode [
sd:programmingLanguage "Python3.9";
];
];
];
rml:referenceFormulation ql:Dictionary;
# Iterates over the elements of the top-level "followers" array.
rml:iterator "$.followers[*]";
];
# Subject IRI is built from the id value of each record.
rr:subjectMap [
a rr:SubjectMap;
rr:template "http://instagram.com/data/user{id}";
];
# insta:follows points at another user IRI built from the follows field.
# NOTE(review): "follows" holds a list in the sample data; presumably one
# triple is produced per list element - verify against the engine.
rr:predicateObjectMap [
rr:predicateMap [
a rr:PredicateMap;
rr:constant insta:follows;
];
rr:objectMap [
a rr:ObjectMap;
rr:template "http://instagram.com/data/user{follows}";
rr:termType rr:IRI
];
].

# Declares ql:Dictionary as a reference formulation and labels it with the
# literal "Python".
ql:Dictionary a rml:ReferenceFormulation;
kg4di:definedBy "Python".






16 changes: 8 additions & 8 deletions src/morph_kgc/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
from .constants import R2RML_TRIPLES_MAP_CLASS


def materialize_set(config):
def materialize_set(config, python_source=None):
config = load_config_from_argument(config)

# parallelization when running as a library is only enabled for Linux see #94
Expand All @@ -36,7 +36,7 @@ def materialize_set(config):

setup_oracle(config)

rml_df, fno_df = retrieve_mappings(config)
rml_df, fno_df = retrieve_mappings(config, python_source)

# keep only asserted mapping rules
asserted_mapping_df = rml_df.loc[rml_df['triples_map_type'] == R2RML_TRIPLES_MAP_CLASS]
Expand All @@ -47,21 +47,21 @@ def materialize_set(config):

pool = mp.Pool(config.get_number_of_processes())
triples = set().union(
*pool.starmap(_materialize_mapping_group_to_set, zip(mapping_groups, repeat(rml_df), repeat(fno_df), repeat(config))))
*pool.starmap(_materialize_mapping_group_to_set, zip(mapping_groups, repeat(rml_df), repeat(fno_df), repeat(config), repeat(python_source))))
pool.close()
pool.join()
else:
triples = set()
for mapping_group in mapping_groups:
triples.update(_materialize_mapping_group_to_set(mapping_group, rml_df, fno_df, config))
triples.update(_materialize_mapping_group_to_set(mapping_group, rml_df, fno_df, config, python_source))

logging.info(f'Number of triples generated in total: {len(triples)}.')

return triples


def materialize(config):
triples = materialize_set(config)
def materialize(config, python_source=None):
triples = materialize_set(config, python_source)

graph = Graph()
rdf_ntriples = '.\n'.join(triples)
Expand All @@ -73,8 +73,8 @@ def materialize(config):
return graph


def materialize_oxigraph(config):
triples = materialize_set(config)
def materialize_oxigraph(config, python_source=None):
triples = materialize_set(config, python_source)

graph = Store()
rdf_ntriples = '.\n'.join(triples)
Expand Down
7 changes: 7 additions & 0 deletions src/morph_kgc/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,15 @@
POSTGRESQL = 'POSTGRESQL'
SQLITE = 'SQLITE'

# in-memory data
# NOTE(review): presumably the source types for data handed to materialize()
# as Python objects via its python_source argument - confirm against the
# mapping parser.
PYTHON_SOURCE = 'PYTHON_SOURCE'
DATAFRAME = 'DATAFRAME'
DICTIONARY = 'DICTIONARY'
JSON_STRING = 'JSON_STRING'

FILE_SOURCE_TYPES = [CSV, TSV, PARQUET, ORC, STATA, SPSS, JSON, XML] + EXCEL + FEATHER + SAS + ODS
DATA_SOURCE_TYPES = [RDB] + FILE_SOURCE_TYPES
# Groups the in-memory identifiers defined above; kept separate from
# DATA_SOURCE_TYPES, which does not include them.
IN_MEMORY_TYPES = [PYTHON_SOURCE, DATAFRAME, DICTIONARY, JSON_STRING]

# RDF serializations
NTRIPLES = 'N-TRIPLES'
Expand Down

0 comments on commit 425494d

Please sign in to comment.