diff --git a/pyproject.toml b/pyproject.toml index 3f4ea599..0855f76a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ authors = [ "Chris Mungall ", "Nicolas Matentzoglu ", "Harshad Hegde " - ] +] license = "MIT" readme = "README.md" @@ -19,12 +19,13 @@ bioregistry = ">=0.9.43" deprecation = ">=2.1.0" linkml-runtime = ">=1.5.3" networkx = ">=3.1" +curies = ">=0.5.7" pandas = ">=2.0.2" pansql = "^0.0.1" pyyaml = ">=6.0" rdflib = ">=6.3.2" sparqlwrapper = ">=2.0.0" -sssom-schema = ">=0.13.0" +sssom-schema = ">=0.14.0" validators = ">=0.20.0" scipy = {version = "*", extras = ["scipy"]} diff --git a/src/sssom/cli.py b/src/sssom/cli.py index 0c09e94d..aab38d65 100644 --- a/src/sssom/cli.py +++ b/src/sssom/cli.py @@ -356,11 +356,8 @@ def sparql( endpoint.limit = limit if object_labels is not None: endpoint.include_object_labels = object_labels - if prefix is not None: - if endpoint.prefix_map is None: - endpoint.prefix_map = {} - for k, v in prefix: - endpoint.prefix_map[k] = v + for k, v in prefix or []: + endpoint.prefix_map[k] = v msdf = query_mappings(endpoint) write_table(msdf, output) diff --git a/src/sssom/context.py b/src/sssom/context.py index bb904c33..c0f94727 100644 --- a/src/sssom/context.py +++ b/src/sssom/context.py @@ -44,7 +44,7 @@ def get_extended_prefix_map(): :return: Prefix map. """ converter = Converter.from_extended_prefix_map(EXTENDED_PREFIX_MAP) - return converter.prefix_map + return {record.prefix: record.uri_prefix for record in converter.records} def get_built_in_prefix_map() -> PrefixMap: @@ -108,8 +108,10 @@ def get_default_metadata() -> Metadata: if "@id" in v and "@prefix" in v: if v["@prefix"]: prefix_map[key] = v["@id"] + del prefix_map["@vocab"] prefix_map.update({(k, v) for k, v in contxt_external.items() if k not in prefix_map}) + _raise_on_invalid_prefix_map(prefix_map) metadata = Metadata(prefix_map=prefix_map, metadata=metadata_dict) metadata.metadata["mapping_set_id"] = DEFAULT_MAPPING_SET_ID @@ -117,6 +119,16 @@ def get_default_metadata() -> Metadata: return metadata +def _raise_on_invalid_prefix_map(prefix_map): + """Raise an exception if the prefix map is not bijective. + + This uses :meth:`curies.Converter.from_prefix_map` to try and load a + prefix map. If there are any duplicate values (i.e., it is _not_ bijective) + then it throws a value error. + """ + Converter.from_prefix_map(prefix_map) + + def set_default_mapping_set_id(meta: Metadata) -> Metadata: """Provide a default mapping_set_id if absent in the MappingSetDataFrame. diff --git a/src/sssom/parsers.py b/src/sssom/parsers.py index d31da86c..dfc1cf1e 100644 --- a/src/sssom/parsers.py +++ b/src/sssom/parsers.py @@ -15,6 +15,7 @@ import pandas as pd import requests import yaml +from curies import Converter from deprecation import deprecated from linkml_runtime.loaders.json_loader import JSONLoader from pandas.errors import EmptyDataError @@ -64,11 +65,10 @@ SSSOM_DEFAULT_RDF_SERIALISATION, URI_SSSOM_MAPPINGS, MappingSetDataFrame, - NoCURIEException, - curie_from_uri, get_file_extension, is_multivalued_slot, raise_for_bad_path, + safe_compress, to_mapping_set_dataframe, ) @@ -506,6 +506,7 @@ def from_sssom_rdf( :return: MappingSetDataFrame object """ prefix_map = _ensure_prefix_map(prefix_map) + converter = Converter.from_prefix_map(prefix_map) ms = _init_mapping_set(meta) mlist: List[Mapping] = [] @@ -515,7 +516,7 @@ def from_sssom_rdf( for _s, p, o in g.triples((ox, None, None)): if isinstance(p, URIRef): try: - p_id = curie_from_uri(p, prefix_map) + p_id = safe_compress(p, converter) k = None if p_id.startswith("sssom:"): @@ -529,14 +530,14 @@ def from_sssom_rdf( if isinstance(o, URIRef): v: Any - v = curie_from_uri(o, prefix_map) + v = safe_compress(o, converter) else: v = o.toPython() if k: v = _address_multivalued_slot(k, v) mdict[k] = v - except NoCURIEException as e: + except ValueError as e: logging.warning(e) if mdict: m = _prepare_mapping(Mapping(**mdict)) @@ -596,6 +597,7 @@ def from_alignment_minidom( """ # FIXME: should be prefix_map = _check_prefix_map(prefix_map) _ensure_prefix_map(prefix_map) + converter = Converter.from_prefix_map(prefix_map) ms = _init_mapping_set(meta) mlist: List[Mapping] = [] # bad_attrs = {} @@ -612,7 +614,7 @@ def from_alignment_minidom( cell = e.getElementsByTagName("Cell") for c_node in cell: mdict = _cell_element_values( - c_node, prefix_map, mapping_predicates=mapping_predicates + c_node, converter, mapping_predicates=mapping_predicates ) if mdict: m = _prepare_mapping(mdict) @@ -665,6 +667,7 @@ def from_obographs( :return: An SSSOM data frame (MappingSetDataFrame) """ _ensure_prefix_map(prefix_map) + converter = Converter.from_prefix_map(prefix_map) ms = _init_mapping_set(meta) mlist: List[Mapping] = [] # bad_attrs = {} @@ -705,13 +708,13 @@ def from_obographs( xref_id = xref["val"] mdict: Dict[str, Any] = {} try: - mdict[SUBJECT_ID] = curie_from_uri(nid, prefix_map) - mdict[OBJECT_ID] = curie_from_uri(xref_id, prefix_map) + mdict[SUBJECT_ID] = safe_compress(nid, converter) + mdict[OBJECT_ID] = safe_compress(xref_id, converter) mdict[SUBJECT_LABEL] = label mdict[PREDICATE_ID] = "oboInOwl:hasDbXref" mdict[MAPPING_JUSTIFICATION] = MAPPING_JUSTIFICATION_UNSPECIFIED mlist.append(Mapping(**mdict)) - except NoCURIEException as e: + except ValueError as e: # FIXME this will cause all sorts of ragged Mappings logging.warning(e) if "basicPropertyValues" in n["meta"]: @@ -721,15 +724,15 @@ def from_obographs( xref_id = value["val"] mdict = {} try: - mdict[SUBJECT_ID] = curie_from_uri(nid, prefix_map) - mdict[OBJECT_ID] = curie_from_uri(xref_id, prefix_map) + mdict[SUBJECT_ID] = safe_compress(nid, converter) + mdict[OBJECT_ID] = safe_compress(xref_id, converter) mdict[SUBJECT_LABEL] = label - mdict[PREDICATE_ID] = curie_from_uri(pred, prefix_map) + mdict[PREDICATE_ID] = safe_compress(pred, converter) mdict[ MAPPING_JUSTIFICATION ] = MAPPING_JUSTIFICATION_UNSPECIFIED mlist.append(Mapping(**mdict)) - except NoCURIEException as e: + except ValueError as e: # FIXME this will cause ragged mappings logging.warning(e) if "edges" in g: @@ -739,15 +742,15 @@ def from_obographs( predicate_id = _get_obographs_predicate_id(edge["pred"]) object_id = edge["obj"] if predicate_id in mapping_predicates: - mdict[SUBJECT_ID] = curie_from_uri(subject_id, prefix_map) - mdict[OBJECT_ID] = curie_from_uri(object_id, prefix_map) + mdict[SUBJECT_ID] = safe_compress(subject_id, converter) + mdict[OBJECT_ID] = safe_compress(object_id, converter) mdict[SUBJECT_LABEL] = ( labels[subject_id] if subject_id in labels.keys() else "" ) mdict[OBJECT_LABEL] = ( labels[object_id] if object_id in labels.keys() else "" ) - mdict[PREDICATE_ID] = curie_from_uri(predicate_id, prefix_map) + mdict[PREDICATE_ID] = safe_compress(predicate_id, converter) mdict[MAPPING_JUSTIFICATION] = MAPPING_JUSTIFICATION_UNSPECIFIED mlist.append(Mapping(**mdict)) if "equivalentNodesSets" in g and OWL_EQUIV_CLASS_URI in mapping_predicates: @@ -757,10 +760,10 @@ def from_obographs( for ec2 in equivalents["nodeIds"]: if ec1 != ec2: mdict = {} - mdict[SUBJECT_ID] = curie_from_uri(ec1, prefix_map) - mdict[OBJECT_ID] = curie_from_uri(ec2, prefix_map) - mdict[PREDICATE_ID] = curie_from_uri( - OWL_EQUIV_CLASS_URI, prefix_map + mdict[SUBJECT_ID] = safe_compress(ec1, converter) + mdict[OBJECT_ID] = safe_compress(ec2, converter) + mdict[PREDICATE_ID] = safe_compress( + OWL_EQUIV_CLASS_URI, converter ) mdict[MAPPING_JUSTIFICATION] = MAPPING_JUSTIFICATION_UNSPECIFIED mdict[SUBJECT_LABEL] = ( @@ -868,19 +871,15 @@ def _set_metadata_in_mapping_set( mapping_set[k] = v -def _cell_element_values(cell_node, prefix_map: PrefixMap, mapping_predicates) -> Optional[Mapping]: +def _cell_element_values(cell_node, converter: Converter, mapping_predicates) -> Optional[Mapping]: mdict: Dict[str, Any] = {} for child in cell_node.childNodes: if child.nodeType == Node.ELEMENT_NODE: try: if child.nodeName == "entity1": - mdict[SUBJECT_ID] = curie_from_uri( - child.getAttribute("rdf:resource"), prefix_map - ) + mdict[SUBJECT_ID] = safe_compress(child.getAttribute("rdf:resource"), converter) elif child.nodeName == "entity2": - mdict[OBJECT_ID] = curie_from_uri( - child.getAttribute("rdf:resource"), prefix_map - ) + mdict[OBJECT_ID] = safe_compress(child.getAttribute("rdf:resource"), converter) elif child.nodeName == "measure": mdict[CONFIDENCE] = child.firstChild.nodeValue elif child.nodeName == "relation": @@ -902,7 +901,7 @@ def _cell_element_values(cell_node, prefix_map: PrefixMap, mapping_predicates) - logging.warning(f"{relation} not a recognised relation type.") else: logging.warning(f"Unsupported alignment api element: {child.nodeName}") - except NoCURIEException as e: + except ValueError as e: logging.warning(e) mdict[MAPPING_JUSTIFICATION] = MAPPING_JUSTIFICATION_UNSPECIFIED diff --git a/src/sssom/rdf_util.py b/src/sssom/rdf_util.py index dfac8d2e..df2cb843 100644 --- a/src/sssom/rdf_util.py +++ b/src/sssom/rdf_util.py @@ -3,10 +3,9 @@ import logging from typing import Any, Dict, List, Optional +from curies import Converter from linkml_runtime.utils.metamodelcore import URIorCURIE from rdflib import Graph, URIRef - -# from .sssom_datamodel import EntityReference, Mapping from sssom_schema import EntityReference, Mapping from .parsers import to_mapping_set_document @@ -24,17 +23,12 @@ def rewire_graph( precedence: Optional[List[str]] = None, ) -> int: """Rewire an RDF Graph replacing using equivalence mappings.""" - pm = mset.prefix_map mdoc = to_mapping_set_document(mset) - rewire_map: Dict[URIorCURIE, URIorCURIE] = {} - - def expand_curie(curie: str) -> URIRef: - """Expand CURIE into URIRef.""" - pfx, local = curie.split(":") - return URIRef(f"{pm[pfx]}{local}") - if mdoc.mapping_set.mappings is None: raise TypeError + + converter = Converter.from_prefix_map(mdoc.prefix_map) + rewire_map: Dict[URIorCURIE, URIorCURIE] = {} for m in mdoc.mapping_set.mappings: if not isinstance(m, Mapping): continue @@ -49,8 +43,8 @@ def expand_curie(curie: str) -> URIRef: curr_tgt = rewire_map[src] logging.info(f"Ambiguous: {src} -> {tgt} vs {curr_tgt}") if precedence: - curr_pfx, _ = curr_tgt.split(":") - tgt_pfx, _ = tgt.split(":") + curr_pfx, _ = converter.parse_curie(curr_tgt) + tgt_pfx, _ = converter.parse_curie(tgt) if tgt_pfx in precedence: if curr_pfx not in precedence or precedence.index( tgt_pfx @@ -63,7 +57,8 @@ def expand_curie(curie: str) -> URIRef: rewire_map[src] = tgt uri_ref_rewire_map: Dict[URIRef, URIRef] = { - expand_curie(k): expand_curie(v) for k, v in rewire_map.items() + URIRef(converter.expand_strict(k)): URIRef(converter.expand_strict(v)) + for k, v in rewire_map.items() } def rewire_node(n: Any): diff --git a/src/sssom/sparql_util.py b/src/sssom/sparql_util.py index 3caae00b..e5826026 100644 --- a/src/sssom/sparql_util.py +++ b/src/sssom/sparql_util.py @@ -1,15 +1,17 @@ """Utilities for querying mappings with SPARQL.""" import logging -from dataclasses import dataclass -from typing import Dict, List, Mapping, Optional +from dataclasses import dataclass, field +from textwrap import dedent +from typing import Dict, List, Optional import pandas as pd +from curies import Converter from rdflib import URIRef from rdflib.namespace import RDFS, SKOS from SPARQLWrapper import JSON, SPARQLWrapper -from .util import MappingSetDataFrame +from .util import MappingSetDataFrame, safe_compress __all__ = [ "EndpointConfig", @@ -26,13 +28,18 @@ class EndpointConfig: predmap: Dict[str, str] predicates: Optional[List[str]] limit: Optional[int] - prefix_map: Optional[Dict[str, str]] include_object_labels: bool = False + prefix_map: Dict[str, str] = field(default_factory=dict) def query_mappings(config: EndpointConfig) -> MappingSetDataFrame: """Query a SPARQL endpoint to obtain a set of mappings.""" - sparql = SPARQLWrapper(config.url) + if not config.prefix_map: + raise TypeError( + "A query can not be made since the configuration does not have a valid prefix map" + ) + converter = Converter.from_prefix_map(config.prefix_map) + if config.graph is None: g = "?g" elif isinstance(config.graph, str): @@ -42,7 +49,7 @@ def query_mappings(config: EndpointConfig) -> MappingSetDataFrame: if config.predicates is None: predicates = [SKOS.exactMatch, SKOS.closeMatch] else: - predicates = [expand_curie(predicate, config) for predicate in config.predicates] + predicates = [URIRef(converter.expand_strict(predicate)) for predicate in config.predicates] predstr = " ".join(URIRef(predicate).n3() for predicate in predicates) if config.limit is not None: limitstr = f"LIMIT {config.limit}" @@ -59,7 +66,8 @@ def query_mappings(config: EndpointConfig) -> MappingSetDataFrame: cols.insert(-1, "object_label") colstr = " ".join([f"?{c}" for c in cols]) olq = "OPTIONAL { ?object_id rdfs:label ?object_label }" if config.include_object_labels else "" - q = f"""\ + sparql = dedent( + f"""\ PREFIX rdfs: {RDFS.uri.n3()} SELECT {colstr} WHERE {{ @@ -72,56 +80,16 @@ def query_mappings(config: EndpointConfig) -> MappingSetDataFrame: BIND({g} as ?mapping_provider) }} {limitstr} """ - logging.info(q) - sparql.setQuery(q) - sparql.setReturnFormat(JSON) - results = sparql.query().convert() - rows = [] - for result in results["results"]["bindings"]: - row = {k: v["value"] for k, v in result.items()} - rows.append(curiefy_values(row, config)) - df = pd.DataFrame(rows) - if config.prefix_map is None: - raise TypeError + ) + logging.info(sparql) + + sparql_wrapper = SPARQLWrapper(config.url, returnFormat=JSON) + sparql_wrapper.setQuery(sparql) + results = sparql_wrapper.query().convert() + df = pd.DataFrame( + [ + {key: safe_compress(v["value"], converter) for key, v in result.items()} + for result in results["results"]["bindings"] + ] + ) return MappingSetDataFrame(df=df, prefix_map=config.prefix_map) - - -def curiefy_values(row: Mapping[str, str], config: EndpointConfig) -> Dict[str, str]: - """Convert all values in the dict from URIs to CURIEs. - - :param row: A dictionary of string keys to URIs - :param config: Configuration - :return: A dictionary of string keys to CURIEs - """ - return {k: contract_uri(v, config) for k, v in row.items()} - - -def contract_uri(uri: str, config: EndpointConfig) -> str: - """Replace the URI with a CURIE based on the prefix map in the given configuration. - - :param uri: A uniform resource identifier - :param config: Configuration - :return: A CURIE if it's able to contract, otherwise return the original URI - """ - if config.prefix_map is None: - return uri - for k, v in config.prefix_map.items(): - if uri.startswith(v): - return uri.replace(v, f"{k}:") - return uri - - -def expand_curie(curie: str, config: EndpointConfig) -> URIRef: - """Expand a CURIE to a URI. - - :param curie: CURIE - :param config: Configuration - :return: URI of CURIE - """ - if config.prefix_map is None: - return URIRef(curie) - for k, v in config.prefix_map.items(): - prefix = f"{k}:" - if curie.startswith(prefix): - return URIRef(curie.replace(prefix, v)) - return URIRef(curie) diff --git a/src/sssom/util.py b/src/sssom/util.py index 003ea8e1..81ce6150 100644 --- a/src/sssom/util.py +++ b/src/sssom/util.py @@ -1,4 +1,5 @@ """Utility functions.""" + import hashlib import json import logging @@ -30,12 +31,10 @@ import pandas as pd import validators import yaml +from curies import Converter from jsonschema import ValidationError from linkml_runtime.linkml_model.types import Uriorcurie from pandas.errors import EmptyDataError - -# from .sssom_datamodel import Mapping as SSSOM_Mapping -# from .sssom_datamodel import slots from sssom_schema import Mapping as SSSOM_Mapping from sssom_schema import slots @@ -1082,10 +1081,6 @@ def get_dict_from_mapping(map_obj: Union[Any, Dict[Any, Any], SSSOM_Mapping]) -> return map_dict -class NoCURIEException(ValueError): - """An exception raised when a CURIE can not be parsed with a given prefix map.""" - - CURIE_RE = re.compile(r"[A-Za-z0-9_.]+[:][A-Za-z0-9_]") @@ -1107,42 +1102,6 @@ def get_prefix_from_curie(curie: str) -> str: return "" -def curie_from_uri(uri: str, prefix_map: Mapping[str, str]) -> str: - """Parse a CURIE from an IRI. - - :param uri: The URI to parse. If this is already a CURIE, return directly. - :param prefix_map: The prefix map against which the IRI is checked - :return: A CURIE - :raises NoCURIEException: if a CURIE can not be parsed - - Example parsing: - >>> m = {"hgnc.genegroup": "https://example.org/hgnc.genegroup:"} - >>> curie_from_uri("https://example.org/hgnc.genegroup:1234", {}) - 'hgnc.genegroup:1234' - - Example CURIE passthrough: - >>> curie_from_uri("hgnc:1234", {}) - 'hgnc:1234' - >>> curie_from_uri("hgnc.genegroup:1234", {}) - 'hgnc.genegroup:1234' - """ - # TODO consider replacing with :func:`bioregistry.curie_from_iri` - # FIXME what if the curie has a subspace in it? RE will fail - if is_curie(uri): - return uri - for prefix in prefix_map: - uri_prefix = prefix_map[prefix] - if uri.startswith(uri_prefix): - remainder = uri.replace(uri_prefix, "") - curie = f"{prefix}:{remainder}" - if is_curie(curie): - return f"{prefix}:{remainder}" - else: - logging.warning(f"{prefix}:{remainder} is not a CURIE ... skipping") - continue - raise NoCURIEException(f"{uri} does not follow any known prefixes") - - def get_prefixes_used_in_table(df: pd.DataFrame) -> List[str]: """Get a list of prefixes used in CURIEs in key feature columns in a dataframe.""" prefixes = list(SSSOM_BUILT_IN_PREFIXES) @@ -1601,3 +1560,20 @@ def invert_mappings( def _invert_column_names(column_names: list, columns_invert_map: dict) -> dict: """Return a dictionary for column renames in pandas DataFrame.""" return {x: columns_invert_map[x] for x in column_names} + + +def safe_compress(uri: str, converter: Converter) -> str: + """Parse a CURIE from an IRI. + + :param uri: The URI to parse. If this is already a CURIE, return directly. + :param converter: Converter used for compression + :return: A CURIE + """ + if not is_curie(uri): + return converter.compress_strict(uri) + rv = converter.standardize_curie(uri) + if rv is None: + raise ValueError( + f"CURIE appeared where there should be a URI, and could not be standardized: {uri}" + ) + return rv diff --git a/tests/data/basic-meta-external.yml b/tests/data/basic-meta-external.yml index fb092300..3a85e6be 100644 --- a/tests/data/basic-meta-external.yml +++ b/tests/data/basic-meta-external.yml @@ -13,3 +13,4 @@ curie_map: b: "http://example.org/b/" c: "http://example.org/c/" d: "http://example.org/d/" + oio: "http://www.geneontology.org/formats/oboInOwl#" diff --git a/tests/test_parsers.py b/tests/test_parsers.py index 950ea22d..e513af23 100644 --- a/tests/test_parsers.py +++ b/tests/test_parsers.py @@ -12,7 +12,7 @@ import yaml from rdflib import Graph -from sssom.context import get_default_metadata +from sssom.context import _raise_on_invalid_prefix_map, get_default_metadata from sssom.parsers import ( from_alignment_minidom, from_obographs, @@ -63,6 +63,7 @@ def setUp(self) -> None: self.alignmentxml_file = f"{test_data_dir}/oaei-ordo-hp.rdf" self.alignmentxml = minidom.parse(self.alignmentxml_file) self.metadata = get_default_metadata() + _raise_on_invalid_prefix_map(self.metadata.prefix_map) def test_parse_sssom_dataframe_from_file(self): """Test parsing a TSV.""" @@ -117,21 +118,10 @@ def test_parse_obographs(self): write_table(msdf, file) self.assertEqual( len(msdf.df), - 9881, + 8099, f"{self.obographs_file} has the wrong number of mappings.", ) - def test_broken_obographs(self): - """Test parsing OBO Graph JSON.""" - prefix_map = self.metadata.prefix_map - prefix_map["OMIM"] = "http://omim.org/entry/" - with self.assertRaises(ValueError): - from_obographs( - jsondoc=self.broken_obographs, - prefix_map=prefix_map, - meta=self.metadata.metadata, - ) - def test_parse_tsv(self): """Test parsing TSV.""" msdf = from_sssom_dataframe(df=self.df, prefix_map=self.df_prefix_map, meta=self.df_meta)