Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Begin using curies.Converter in more places #397

Merged
merged 42 commits into from
Jul 27, 2023
Merged
Show file tree
Hide file tree
Changes from 34 commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
9ee0af2
Use `curies` in `sparql_util.py`
cthoyt Jul 22, 2023
6bdf3b6
Update sparql_util.py
cthoyt Jul 22, 2023
8ebea27
Additional cleanup of sparql endpoint
cthoyt Jul 22, 2023
e3383a4
Additional updates to RDF
cthoyt Jul 22, 2023
e9f4747
Fix bug where endpoint config is built up over time
cthoyt Jul 22, 2023
da7ad44
Deprecate old compression function
cthoyt Jul 22, 2023
483d10a
Update pyproject.toml
cthoyt Jul 22, 2023
7e19639
Update util.py
cthoyt Jul 24, 2023
ec3452f
Update util.py
cthoyt Jul 24, 2023
118017d
Add typing.deprecated to curie_from_uri
cthoyt Jul 24, 2023
1849d60
Update lock
cthoyt Jul 24, 2023
032be37
Add typing extensions
cthoyt Jul 24, 2023
8868d08
Update util.py
cthoyt Jul 24, 2023
29c1802
testing latest version
hrshdhgd Jul 24, 2023
fcc6a78
anchor to 1.4.2 like other projects
hrshdhgd Jul 24, 2023
2c75910
using snok poetry from marketplace
hrshdhgd Jul 24, 2023
faeca94
remove pip update
hrshdhgd Jul 24, 2023
ebcab13
lock file updated
hrshdhgd Jul 24, 2023
1cc5ea9
added --no-interaction
hrshdhgd Jul 24, 2023
13d9cad
virtualenv causing the errors
hrshdhgd Jul 24, 2023
ba21248
anchor versions
hrshdhgd Jul 24, 2023
651d278
poetry == 1.4.2
hrshdhgd Jul 24, 2023
6713ad0
remove poetry.lock from source control
hrshdhgd Jul 24, 2023
f4f9106
remove poetry.lock from source control
hrshdhgd Jul 24, 2023
d5624dc
remove version anchor for poetry
hrshdhgd Jul 24, 2023
752e320
reanchor poetry to 1.3.2 as before
hrshdhgd Jul 24, 2023
0e028c2
Clean up
cthoyt Jul 25, 2023
172463a
Fix delete
cthoyt Jul 25, 2023
2610322
Update context.py
cthoyt Jul 25, 2023
6abda6f
Update parsers.py
cthoyt Jul 25, 2023
fe21fc7
Update util.py
cthoyt Jul 25, 2023
25d74ae
Clean DC and update tests
cthoyt Jul 25, 2023
1c25c94
Update sparql_util.py
cthoyt Jul 25, 2023
afe95a8
Update context.py
cthoyt Jul 25, 2023
439f561
Add implicit prefix map validity checker
cthoyt Jul 25, 2023
f0cd478
Add text explanation.
cthoyt Jul 25, 2023
a9b0f27
Update test_parsers.py
cthoyt Jul 25, 2023
e24fec2
Remove DC cleanup
cthoyt Jul 25, 2023
e5afffb
removing poetry.lock from gitignore and committing the lock file from …
hrshdhgd Jul 25, 2023
3ddf061
Update .gitignore
cthoyt Jul 25, 2023
db44005
Merge branch 'master' into improve-sparql-util
cthoyt Jul 25, 2023
36180e1
Remove test_broken_obographs test and reinstate the correct one
matentzn Jul 27, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -158,3 +158,4 @@ schema/sssom_datamodel.py
sssom/internal_context.py
sssom/sssom_datamodel.py
*.rej
poetry.lock
cthoyt marked this conversation as resolved.
Show resolved Hide resolved
2,380 changes: 0 additions & 2,380 deletions poetry.lock

This file was deleted.

3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ authors = [
"Chris Mungall <cjmungall@lbl.gov>",
"Nicolas Matentzoglu <nicolas.matentzoglu@gmail.com>",
"Harshad Hegde <hhegde@lbl.gov>"
]
]
license = "MIT"
readme = "README.md"

Expand All @@ -19,6 +19,7 @@ bioregistry = ">=0.9.43"
deprecation = ">=2.1.0"
linkml-runtime = ">=1.5.3"
networkx = ">=3.1"
curies = ">=0.5.7"
pandas = ">=2.0.2"
pansql = "^0.0.1"
pyyaml = ">=6.0"
Expand Down
7 changes: 2 additions & 5 deletions src/sssom/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -356,11 +356,8 @@ def sparql(
endpoint.limit = limit
if object_labels is not None:
endpoint.include_object_labels = object_labels
if prefix is not None:
if endpoint.prefix_map is None:
endpoint.prefix_map = {}
for k, v in prefix:
endpoint.prefix_map[k] = v
for k, v in prefix or []:
cthoyt marked this conversation as resolved.
Show resolved Hide resolved
endpoint.prefix_map[k] = v
msdf = query_mappings(endpoint)
write_table(msdf, output)

Expand Down
20 changes: 19 additions & 1 deletion src/sssom/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def get_extended_prefix_map():
:return: Prefix map.
"""
converter = Converter.from_extended_prefix_map(EXTENDED_PREFIX_MAP)
return converter.prefix_map
return {record.prefix: record.uri_prefix for record in converter.records}


def get_built_in_prefix_map() -> PrefixMap:
Expand Down Expand Up @@ -86,9 +86,23 @@ def add_built_in_prefixes_to_prefix_map(
raise ValueError(
f"Built-in prefix {k} is specified ({prefix_map[k]}) but differs from default ({builtinmap[k]})"
)
clean_dc(prefix_map)
return prefix_map


def clean_dc(prefix_map):
"""Remove a common issue with prefix maps in-place for DC/DCTERMS.

This happens when both DC and DCTERMS are set to the new URI prefix
for DCTERMS. DC has historically been used to point to DC elements.
If this happens, then the prefix map doesn't follow the bijectivity
rule. This function deletes the DC annotation in favor of keeping
the DCTERMS one. This should be fixed upstream in the SSSOM Schema.
"""
if "dc" in prefix_map and "dcterms" in prefix_map and prefix_map["dc"] == prefix_map["dcterms"]:
del prefix_map["dc"]
matentzn marked this conversation as resolved.
Show resolved Hide resolved


def get_default_metadata() -> Metadata:
"""Get @context property value from the sssom_context variable in the auto-generated 'internal_context.py' file.

Expand All @@ -108,8 +122,12 @@ def get_default_metadata() -> Metadata:
if "@id" in v and "@prefix" in v:
if v["@prefix"]:
prefix_map[key] = v["@id"]
del prefix_map["@vocab"]

prefix_map.update({(k, v) for k, v in contxt_external.items() if k not in prefix_map})
clean_dc(prefix_map)
# Tests if the prefix map is a valid bijective map
Converter.from_prefix_map(prefix_map)
matentzn marked this conversation as resolved.
Show resolved Hide resolved

metadata = Metadata(prefix_map=prefix_map, metadata=metadata_dict)
metadata.metadata["mapping_set_id"] = DEFAULT_MAPPING_SET_ID
Expand Down
55 changes: 27 additions & 28 deletions src/sssom/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import pandas as pd
import requests
import yaml
from curies import Converter
from deprecation import deprecated
from linkml_runtime.loaders.json_loader import JSONLoader
from pandas.errors import EmptyDataError
Expand Down Expand Up @@ -64,11 +65,10 @@
SSSOM_DEFAULT_RDF_SERIALISATION,
URI_SSSOM_MAPPINGS,
MappingSetDataFrame,
NoCURIEException,
curie_from_uri,
get_file_extension,
is_multivalued_slot,
raise_for_bad_path,
safe_compress,
to_mapping_set_dataframe,
)

Expand Down Expand Up @@ -506,6 +506,7 @@ def from_sssom_rdf(
:return: MappingSetDataFrame object
"""
prefix_map = _ensure_prefix_map(prefix_map)
converter = Converter.from_prefix_map(prefix_map)

ms = _init_mapping_set(meta)
mlist: List[Mapping] = []
Expand All @@ -515,7 +516,7 @@ def from_sssom_rdf(
for _s, p, o in g.triples((ox, None, None)):
if isinstance(p, URIRef):
try:
p_id = curie_from_uri(p, prefix_map)
p_id = safe_compress(p, converter)
k = None

if p_id.startswith("sssom:"):
Expand All @@ -529,14 +530,14 @@ def from_sssom_rdf(

if isinstance(o, URIRef):
v: Any
v = curie_from_uri(o, prefix_map)
v = safe_compress(o, converter)
else:
v = o.toPython()
if k:
v = _address_multivalued_slot(k, v)
mdict[k] = v

except NoCURIEException as e:
except ValueError as e:
logging.warning(e)
if mdict:
m = _prepare_mapping(Mapping(**mdict))
Expand Down Expand Up @@ -596,6 +597,7 @@ def from_alignment_minidom(
"""
# FIXME: should be prefix_map = _check_prefix_map(prefix_map)
_ensure_prefix_map(prefix_map)
converter = Converter.from_prefix_map(prefix_map)
ms = _init_mapping_set(meta)
mlist: List[Mapping] = []
# bad_attrs = {}
Expand All @@ -612,7 +614,7 @@ def from_alignment_minidom(
cell = e.getElementsByTagName("Cell")
for c_node in cell:
mdict = _cell_element_values(
c_node, prefix_map, mapping_predicates=mapping_predicates
c_node, converter, mapping_predicates=mapping_predicates
)
if mdict:
m = _prepare_mapping(mdict)
Expand Down Expand Up @@ -665,6 +667,7 @@ def from_obographs(
:return: An SSSOM data frame (MappingSetDataFrame)
"""
_ensure_prefix_map(prefix_map)
converter = Converter.from_prefix_map(prefix_map)
ms = _init_mapping_set(meta)
mlist: List[Mapping] = []
# bad_attrs = {}
Expand Down Expand Up @@ -705,13 +708,13 @@ def from_obographs(
xref_id = xref["val"]
mdict: Dict[str, Any] = {}
try:
mdict[SUBJECT_ID] = curie_from_uri(nid, prefix_map)
mdict[OBJECT_ID] = curie_from_uri(xref_id, prefix_map)
mdict[SUBJECT_ID] = safe_compress(nid, converter)
mdict[OBJECT_ID] = safe_compress(xref_id, converter)
mdict[SUBJECT_LABEL] = label
mdict[PREDICATE_ID] = "oboInOwl:hasDbXref"
mdict[MAPPING_JUSTIFICATION] = MAPPING_JUSTIFICATION_UNSPECIFIED
mlist.append(Mapping(**mdict))
except NoCURIEException as e:
except ValueError as e:
# FIXME this will cause all sorts of ragged Mappings
logging.warning(e)
if "basicPropertyValues" in n["meta"]:
Expand All @@ -721,15 +724,15 @@ def from_obographs(
xref_id = value["val"]
mdict = {}
try:
mdict[SUBJECT_ID] = curie_from_uri(nid, prefix_map)
mdict[OBJECT_ID] = curie_from_uri(xref_id, prefix_map)
mdict[SUBJECT_ID] = safe_compress(nid, converter)
mdict[OBJECT_ID] = safe_compress(xref_id, converter)
mdict[SUBJECT_LABEL] = label
mdict[PREDICATE_ID] = curie_from_uri(pred, prefix_map)
mdict[PREDICATE_ID] = safe_compress(pred, converter)
mdict[
MAPPING_JUSTIFICATION
] = MAPPING_JUSTIFICATION_UNSPECIFIED
mlist.append(Mapping(**mdict))
except NoCURIEException as e:
except ValueError as e:
# FIXME this will cause ragged mappings
logging.warning(e)
if "edges" in g:
Expand All @@ -739,15 +742,15 @@ def from_obographs(
predicate_id = _get_obographs_predicate_id(edge["pred"])
object_id = edge["obj"]
if predicate_id in mapping_predicates:
mdict[SUBJECT_ID] = curie_from_uri(subject_id, prefix_map)
mdict[OBJECT_ID] = curie_from_uri(object_id, prefix_map)
mdict[SUBJECT_ID] = safe_compress(subject_id, converter)
mdict[OBJECT_ID] = safe_compress(object_id, converter)
mdict[SUBJECT_LABEL] = (
labels[subject_id] if subject_id in labels.keys() else ""
)
mdict[OBJECT_LABEL] = (
labels[object_id] if object_id in labels.keys() else ""
)
mdict[PREDICATE_ID] = curie_from_uri(predicate_id, prefix_map)
mdict[PREDICATE_ID] = safe_compress(predicate_id, converter)
mdict[MAPPING_JUSTIFICATION] = MAPPING_JUSTIFICATION_UNSPECIFIED
mlist.append(Mapping(**mdict))
if "equivalentNodesSets" in g and OWL_EQUIV_CLASS_URI in mapping_predicates:
Expand All @@ -757,10 +760,10 @@ def from_obographs(
for ec2 in equivalents["nodeIds"]:
if ec1 != ec2:
mdict = {}
mdict[SUBJECT_ID] = curie_from_uri(ec1, prefix_map)
mdict[OBJECT_ID] = curie_from_uri(ec2, prefix_map)
mdict[PREDICATE_ID] = curie_from_uri(
OWL_EQUIV_CLASS_URI, prefix_map
mdict[SUBJECT_ID] = safe_compress(ec1, converter)
mdict[OBJECT_ID] = safe_compress(ec2, converter)
mdict[PREDICATE_ID] = safe_compress(
OWL_EQUIV_CLASS_URI, converter
)
mdict[MAPPING_JUSTIFICATION] = MAPPING_JUSTIFICATION_UNSPECIFIED
mdict[SUBJECT_LABEL] = (
Expand Down Expand Up @@ -868,19 +871,15 @@ def _set_metadata_in_mapping_set(
mapping_set[k] = v


def _cell_element_values(cell_node, prefix_map: PrefixMap, mapping_predicates) -> Optional[Mapping]:
def _cell_element_values(cell_node, converter: Converter, mapping_predicates) -> Optional[Mapping]:
mdict: Dict[str, Any] = {}
for child in cell_node.childNodes:
if child.nodeType == Node.ELEMENT_NODE:
try:
if child.nodeName == "entity1":
mdict[SUBJECT_ID] = curie_from_uri(
child.getAttribute("rdf:resource"), prefix_map
)
mdict[SUBJECT_ID] = safe_compress(child.getAttribute("rdf:resource"), converter)
elif child.nodeName == "entity2":
mdict[OBJECT_ID] = curie_from_uri(
child.getAttribute("rdf:resource"), prefix_map
)
mdict[OBJECT_ID] = safe_compress(child.getAttribute("rdf:resource"), converter)
elif child.nodeName == "measure":
mdict[CONFIDENCE] = child.firstChild.nodeValue
elif child.nodeName == "relation":
Expand All @@ -902,7 +901,7 @@ def _cell_element_values(cell_node, prefix_map: PrefixMap, mapping_predicates) -
logging.warning(f"{relation} not a recognised relation type.")
else:
logging.warning(f"Unsupported alignment api element: {child.nodeName}")
except NoCURIEException as e:
except ValueError as e:
logging.warning(e)

mdict[MAPPING_JUSTIFICATION] = MAPPING_JUSTIFICATION_UNSPECIFIED
Expand Down
21 changes: 8 additions & 13 deletions src/sssom/rdf_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,9 @@
import logging
from typing import Any, Dict, List, Optional

from curies import Converter
from linkml_runtime.utils.metamodelcore import URIorCURIE
from rdflib import Graph, URIRef

# from .sssom_datamodel import EntityReference, Mapping
from sssom_schema import EntityReference, Mapping

from .parsers import to_mapping_set_document
Expand All @@ -24,17 +23,12 @@ def rewire_graph(
precedence: Optional[List[str]] = None,
) -> int:
"""Rewire an RDF Graph replacing using equivalence mappings."""
pm = mset.prefix_map
mdoc = to_mapping_set_document(mset)
rewire_map: Dict[URIorCURIE, URIorCURIE] = {}

def expand_curie(curie: str) -> URIRef:
"""Expand CURIE into URIRef."""
pfx, local = curie.split(":")
return URIRef(f"{pm[pfx]}{local}")

if mdoc.mapping_set.mappings is None:
raise TypeError

converter = Converter.from_prefix_map(mdoc.prefix_map)
rewire_map: Dict[URIorCURIE, URIorCURIE] = {}
for m in mdoc.mapping_set.mappings:
if not isinstance(m, Mapping):
continue
Expand All @@ -49,8 +43,8 @@ def expand_curie(curie: str) -> URIRef:
curr_tgt = rewire_map[src]
logging.info(f"Ambiguous: {src} -> {tgt} vs {curr_tgt}")
if precedence:
curr_pfx, _ = curr_tgt.split(":")
tgt_pfx, _ = tgt.split(":")
curr_pfx, _ = converter.parse_curie(curr_tgt)
tgt_pfx, _ = converter.parse_curie(tgt)
if tgt_pfx in precedence:
if curr_pfx not in precedence or precedence.index(
tgt_pfx
Expand All @@ -63,7 +57,8 @@ def expand_curie(curie: str) -> URIRef:
rewire_map[src] = tgt

uri_ref_rewire_map: Dict[URIRef, URIRef] = {
expand_curie(k): expand_curie(v) for k, v in rewire_map.items()
URIRef(converter.expand_strict(k)): URIRef(converter.expand_strict(v))
for k, v in rewire_map.items()
}

def rewire_node(n: Any):
Expand Down