From b139c26b915cd991013c6267906191a330b10543 Mon Sep 17 00:00:00 2001 From: Damien Goutte-Gattat Date: Mon, 1 Sep 2025 18:34:52 +0100 Subject: [PATCH 01/12] Add method to infer minimum compatible SSSOM version. Add a new method to the MappingSetDataFrame class to automatically determine the minimum version of the SSSOM specification the set is compatible with -- that is, the earliest version that defines all the slots and all the enum values present in the set. --- src/sssom/constants.py | 22 +++++++++++++++++++ src/sssom/util.py | 50 ++++++++++++++++++++++++++++++++++++++++++ tests/test_utils.py | 37 +++++++++++++++++++++++++++++++ 3 files changed, 109 insertions(+) diff --git a/src/sssom/constants.py b/src/sssom/constants.py index b858405c..d7cfa808 100644 --- a/src/sssom/constants.py +++ b/src/sssom/constants.py @@ -10,6 +10,7 @@ import yaml from linkml_runtime.utils.schema_as_dict import schema_as_dict from linkml_runtime.utils.schemaview import SchemaView +from sssom_schema.datamodel.sssom_schema import SssomVersionEnum HERE = pathlib.Path(__file__).parent.resolve() @@ -278,6 +279,27 @@ def propagatable_slots(self) -> List[str]: slots.append(slot_name) return slots + def get_minimum_version(self, slot_name: str, class_name: str = "mapping"): + """Get the minimum version of SSSOM required for a given slot. + + :param slot_name: The queried slot. + :param class_name: The class the slot belongs to. This is needed + because a slot may have been added to a class + in a later version than the version in which + it was first introduced in the schema. + :return: A SssomVersionEnum value representing the earliest + version of SSSOM that defines the given slot in the + given class. May be None if the requested slot name + is not a valid slot name. 
+ """ + try: + slot = self.view.induced_slot(slot_name, class_name) + return SssomVersionEnum(slot.annotations.added_in.value) + except AttributeError: # No added_in annotation, defaults to 1.0 + return SssomVersionEnum("1.0") + except ValueError: # No such slot + return None + @lru_cache(1) def _get_sssom_schema_object() -> SSSOMSchemaView: diff --git a/src/sssom/util.py b/src/sssom/util.py index b49d08fd..8425f9e7 100644 --- a/src/sssom/util.py +++ b/src/sssom/util.py @@ -393,6 +393,56 @@ def condense(self) -> List[str]: self.df.drop(columns=condensed, inplace=True) return condensed + def get_compatible_version(self): + """Get the minimum version of SSSOM this set is compatible with.""" + schema = SSSOMSchemaView() + versions = set() + + # First get the minimum versions required by the slots present + # in the set; this is entirely provided by the SSSOM model. + for slot in self.metadata.keys(): + version = schema.get_minimum_version(slot, "mapping set") + if version is not None: + versions.add(str(version)) + for slot in self.df.columns: + version = schema.get_minimum_version(slot, "mapping") + if version is not None: + versions.add(str(version)) + + # Then take care of enum values; we cannot use the SSSOM model + # for that (enum values are not tagged with an "added_in" + # annotation the way slots are), so this has to be handled + # "manually" based on the informations provided in + # . 
+ if ( + self.metadata.get("subject_type") == "composed entity expression" + or self.metadata.get("subject_type") == "composed entity expression" + or ( + "subject_type" in self.df.columns + and "composed entity expression" in self.df["subject_type"].values + ) + or ( + "object_type" in self.df.columns + and "composed entity expression" in self.df["object_type"].values + ) + ): + versions.add("1.1") + + if ( + "mapping_cardinality" in self.df.columns + and "0:0" in self.df["mapping_cardinality"].values + ): + versions.add("1.1") + + # Get the highest of the accumulated versions. We do a numerical + # sort, so that version 1.10 (if we ever get that far in the 1.x + # branch) does not get sorted before version 1.9. + def _version_to_compare_key(version): + major, minor = [int(s) for s in version.split(".")] + return (major * 100) + minor + + return sorted(versions, key=_version_to_compare_key)[-1] + def _standardize_curie_or_iri(curie_or_iri: str, *, converter: Converter) -> str: """Standardize a CURIE or IRI, returning the original if not possible. 
diff --git a/tests/test_utils.py b/tests/test_utils.py index 91e187d0..206deffc 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -595,3 +595,40 @@ def test_propagation_fill_empty_mode(self) -> None: self.assertIn("mapping_tool", propagated_slots) self.assertNotIn("mapping_tool", msdf.metadata) self.assertEqual(2, len(msdf.df["mapping_tool"].unique())) + + def test_inferring_compatible_version(self) -> None: + """Test that we can correctly infer the version a set is compatible with.""" + msdf10 = parse_sssom_table(f"{data_dir}/basic.tsv") + + # Nothing in that set requires 1.1 + self.assertEqual("1.0", msdf10.get_compatible_version()) + + def _clone(msdf): + return MappingSetDataFrame(df=msdf.df.copy(), metadata=msdf.metadata.copy()) + + # Inject a 1.1-specific mapping set slot + msdf11 = _clone(msdf10) + msdf11.metadata["cardinality_scope"] = "predicate_id" + self.assertEqual("1.1", msdf11.get_compatible_version()) + + # Inject a 1.1-specific mapping slot + msdf11 = _clone(msdf10) + msdf11.df["predicate_type"] = "owl object property" + self.assertEqual("1.1", msdf11.get_compatible_version()) + + # Inject a 1.1-specific entity_type_enum value + msdf11 = _clone(msdf10) + msdf11.metadata["subject_type"] = "composed entity expression" + self.assertEqual("1.1", msdf11.get_compatible_version()) + + # Same, but on a single mapping record + msdf11 = _clone(msdf10) + msdf11.df["object_type"] = "owl class" + msdf11.df.loc[2, "object_type"] = "composed entity expression" + self.assertEqual("1.1", msdf11.get_compatible_version()) + + # Inject the 1.1-specific "0:0" cardinality value + msdf11 = _clone(msdf10) + msdf11.df["mapping_cardinality"] = "1:1" + msdf11.df.loc[9, "mapping_cardinality"] = "0:0" + self.assertEqual("1.1", msdf11.get_compatible_version()) From 90066071a432c7470ec758f9d36a42b7fdeef5d2 Mon Sep 17 00:00:00 2001 From: Damien Goutte-Gattat Date: Tue, 2 Sep 2025 20:53:22 +0100 Subject: [PATCH 02/12] Add missing return type hints. 
--- src/sssom/constants.py | 2 +- src/sssom/util.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sssom/constants.py b/src/sssom/constants.py index d7cfa808..701200e8 100644 --- a/src/sssom/constants.py +++ b/src/sssom/constants.py @@ -279,7 +279,7 @@ def propagatable_slots(self) -> List[str]: slots.append(slot_name) return slots - def get_minimum_version(self, slot_name: str, class_name: str = "mapping"): + def get_minimum_version(self, slot_name: str, class_name: str = "mapping") -> SssomVersionEnum: """Get the minimum version of SSSOM required for a given slot. :param slot_name: The queried slot. diff --git a/src/sssom/util.py b/src/sssom/util.py index 8425f9e7..4d6819c3 100644 --- a/src/sssom/util.py +++ b/src/sssom/util.py @@ -393,7 +393,7 @@ def condense(self) -> List[str]: self.df.drop(columns=condensed, inplace=True) return condensed - def get_compatible_version(self): + def get_compatible_version(self) -> str: """Get the minimum version of SSSOM this set is compatible with.""" schema = SSSOMSchemaView() versions = set() From 20ea4e8311a7797100290e75c9ac793a5acb11f4 Mon Sep 17 00:00:00 2001 From: Damien Goutte-Gattat Date: Tue, 2 Sep 2025 21:37:02 +0100 Subject: [PATCH 03/12] Misc fixes. Fix wrong slot name when looking for "composed entity expression". Let Python compare version numbers as tuples of integers. Use `max(list)` instead of `sorted(list)[-1]`. --- src/sssom/util.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/sssom/util.py b/src/sssom/util.py index 4d6819c3..8d02d3ab 100644 --- a/src/sssom/util.py +++ b/src/sssom/util.py @@ -416,7 +416,7 @@ def get_compatible_version(self) -> str: # . 
if ( self.metadata.get("subject_type") == "composed entity expression" - or self.metadata.get("subject_type") == "composed entity expression" + or self.metadata.get("object_type") == "composed entity expression" or ( "subject_type" in self.df.columns and "composed entity expression" in self.df["subject_type"].values @@ -438,10 +438,9 @@ def get_compatible_version(self) -> str: # sort, so that version 1.10 (if we ever get that far in the 1.x # branch) does not get sorted before version 1.9. def _version_to_compare_key(version): - major, minor = [int(s) for s in version.split(".")] - return (major * 100) + minor + return tuple(int(s) for s in version.split(".")) - return sorted(versions, key=_version_to_compare_key)[-1] + return max(versions, key=_version_to_compare_key) def _standardize_curie_or_iri(curie_or_iri: str, *, converter: Converter) -> str: From 588bf4ff0d568572b6f84cf26e75df6d08ec1f7a Mon Sep 17 00:00:00 2001 From: Damien Goutte-Gattat Date: Tue, 2 Sep 2025 22:46:09 +0100 Subject: [PATCH 04/12] Use constants to refer to SSSOM slot names. 
--- src/sssom/constants.py | 1 + src/sssom/util.py | 20 ++++++++++---------- tests/test_utils.py | 19 ++++++++++++------- 3 files changed, 23 insertions(+), 17 deletions(-) diff --git a/src/sssom/constants.py b/src/sssom/constants.py index 701200e8..65cf4afb 100644 --- a/src/sssom/constants.py +++ b/src/sssom/constants.py @@ -90,6 +90,7 @@ MAPPING_SET_SOURCE = "mapping_set_source" MAPPING_SOURCE = "mapping_source" MAPPING_CARDINALITY = "mapping_cardinality" +CARDINALITY_SCOPE = "cardinality_scope" MAPPING_TOOL = "mapping_tool" MAPPING_TOOL_VERSION = "mapping_tool_version" MAPPING_DATE = "mapping_date" diff --git a/src/sssom/util.py b/src/sssom/util.py index 8d02d3ab..a60aa1dd 100644 --- a/src/sssom/util.py +++ b/src/sssom/util.py @@ -26,6 +26,7 @@ COLUMN_INVERT_DICTIONARY, COMMENT, CONFIDENCE, + MAPPING_CARDINALITY, MAPPING_JUSTIFICATION, MAPPING_SET_ID, MAPPING_SET_SOURCE, @@ -33,6 +34,7 @@ OBJECT_ID, OBJECT_LABEL, OBJECT_SOURCE, + OBJECT_TYPE, OBO_HAS_DB_XREF, OWL_DIFFERENT_FROM, OWL_EQUIVALENT_CLASS, @@ -55,6 +57,7 @@ SUBJECT_ID, SUBJECT_LABEL, SUBJECT_SOURCE, + SUBJECT_TYPE, UNKNOWN_IRI, MetadataType, PathOrIO, @@ -415,23 +418,20 @@ def get_compatible_version(self) -> str: # "manually" based on the informations provided in # . 
if ( - self.metadata.get("subject_type") == "composed entity expression" - or self.metadata.get("object_type") == "composed entity expression" + self.metadata.get(SUBJECT_TYPE) == "composed entity expression" + or self.metadata.get(OBJECT_TYPE) == "composed entity expression" or ( - "subject_type" in self.df.columns - and "composed entity expression" in self.df["subject_type"].values + SUBJECT_TYPE in self.df.columns + and "composed entity expression" in self.df[SUBJECT_TYPE].values ) or ( - "object_type" in self.df.columns - and "composed entity expression" in self.df["object_type"].values + OBJECT_TYPE in self.df.columns + and "composed entity expression" in self.df[OBJECT_TYPE].values ) ): versions.add("1.1") - if ( - "mapping_cardinality" in self.df.columns - and "0:0" in self.df["mapping_cardinality"].values - ): + if MAPPING_CARDINALITY in self.df.columns and "0:0" in self.df[MAPPING_CARDINALITY].values: versions.add("1.1") # Get the highest of the accumulated versions. We do a numerical diff --git a/tests/test_utils.py b/tests/test_utils.py index 206deffc..ce8f7473 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -12,13 +12,18 @@ from sssom_schema import slots as SSSOM_Slots from sssom.constants import ( + CARDINALITY_SCOPE, CREATOR_ID, + MAPPING_CARDINALITY, OBJECT_ID, OBJECT_LABEL, + OBJECT_TYPE, PREDICATE_ID, + PREDICATE_TYPE, SEMAPV, SUBJECT_ID, SUBJECT_LABEL, + SUBJECT_TYPE, ) from sssom.context import SSSOM_BUILT_IN_PREFIXES, ensure_converter from sssom.io import extract_iris @@ -608,27 +613,27 @@ def _clone(msdf): # Inject a 1.1-specific mapping set slot msdf11 = _clone(msdf10) - msdf11.metadata["cardinality_scope"] = "predicate_id" + msdf11.metadata[CARDINALITY_SCOPE] = "predicate_id" self.assertEqual("1.1", msdf11.get_compatible_version()) # Inject a 1.1-specific mapping slot msdf11 = _clone(msdf10) - msdf11.df["predicate_type"] = "owl object property" + msdf11.df[PREDICATE_TYPE] = "owl object property" self.assertEqual("1.1", 
msdf11.get_compatible_version()) # Inject a 1.1-specific entity_type_enum value msdf11 = _clone(msdf10) - msdf11.metadata["subject_type"] = "composed entity expression" + msdf11.metadata[SUBJECT_TYPE] = "composed entity expression" self.assertEqual("1.1", msdf11.get_compatible_version()) # Same, but on a single mapping record msdf11 = _clone(msdf10) - msdf11.df["object_type"] = "owl class" - msdf11.df.loc[2, "object_type"] = "composed entity expression" + msdf11.df[OBJECT_TYPE] = "owl class" + msdf11.df.loc[2, OBJECT_TYPE] = "composed entity expression" self.assertEqual("1.1", msdf11.get_compatible_version()) # Inject the 1.1-specific "0:0" cardinality value msdf11 = _clone(msdf10) - msdf11.df["mapping_cardinality"] = "1:1" - msdf11.df.loc[9, "mapping_cardinality"] = "0:0" + msdf11.df[MAPPING_CARDINALITY] = "1:1" + msdf11.df.loc[9, MAPPING_CARDINALITY] = "0:0" self.assertEqual("1.1", msdf11.get_compatible_version()) From 28a23545a4cf55acb8041803b1e243a15b5ccc32 Mon Sep 17 00:00:00 2001 From: Damien Goutte-Gattat Date: Tue, 9 Sep 2025 14:11:40 +0100 Subject: [PATCH 05/12] Add missing type annotation. --- src/sssom/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sssom/util.py b/src/sssom/util.py index 8ed05a58..5f4cdfb3 100644 --- a/src/sssom/util.py +++ b/src/sssom/util.py @@ -501,7 +501,7 @@ def _to_string(row: dict[str, Any], side: str) -> str: def get_compatible_version(self) -> str: """Get the minimum version of SSSOM this set is compatible with.""" schema = SSSOMSchemaView() - versions = set() + versions: Set[str] = set() # First get the minimum versions required by the slots present # in the set; this is entirely provided by the SSSOM model. From f419fba8530a8c15d20fa5cfdac807f35d90ce27 Mon Sep 17 00:00:00 2001 From: Damien Goutte-Gattat Date: Tue, 9 Sep 2025 19:46:28 +0100 Subject: [PATCH 06/12] Manipulate version numbers as tuples. 
Amend the SSSOMSchemaView#get_minimum_version() method to return a (major, minor) tuple, rather than a SssomVersionEnum object. The SssomVersionEnum class (which is automatically generated from the LinkML schema) is cumbersome to use, for at least two reasons: 1) obtaining the actual value of the enum requires accessing two levels of attributes (SssomVersionEnum.code.text); 2) SssomVersionEnum values cannot be meaningfully compared (e.g. to check that a given version number is higher than another given version); instead, we must (a) obtain the text value, (b) split that value over the middle dot, (c) convert the strings to integers, (d) put the integers into a tuple. OK, this can be done in one line of code, but this is cumbersome all the same, and it's best if that kind of thing is not left to client code. --- src/sssom/constants.py | 23 ++++++++++++++--------- src/sssom/util.py | 17 +++++++---------- 2 files changed, 21 insertions(+), 19 deletions(-) diff --git a/src/sssom/constants.py b/src/sssom/constants.py index 1ac162ef..33fd6588 100644 --- a/src/sssom/constants.py +++ b/src/sssom/constants.py @@ -4,13 +4,12 @@ import uuid from enum import Enum from functools import cached_property, lru_cache -from typing import Any, Dict, List, Literal, Set, TextIO, Union +from typing import Any, Dict, List, Literal, Optional, Set, TextIO, Tuple, Union import importlib_resources import yaml from linkml_runtime.utils.schema_as_dict import schema_as_dict from linkml_runtime.utils.schemaview import SchemaView -from sssom_schema.datamodel.sssom_schema import SssomVersionEnum HERE = pathlib.Path(__file__).parent.resolve() @@ -284,7 +283,9 @@ def propagatable_slots(self) -> List[str]: slots.append(slot_name) return slots - def get_minimum_version(self, slot_name: str, class_name: str = "mapping") -> SssomVersionEnum: + def get_minimum_version( + self, slot_name: str, class_name: str = "mapping" + ) -> Optional[Tuple[int, int]]: """Get the minimum version of SSSOM required for a given slot.
:param slot_name: The queried slot. @@ -292,16 +293,20 @@ def get_minimum_version(self, slot_name: str, class_name: str = "mapping") -> Ss because a slot may have been added to a class in a later version than the version in which it was first introduced in the schema. - :return: A SssomVersionEnum value representing the earliest - version of SSSOM that defines the given slot in the - given class. May be None if the requested slot name - is not a valid slot name. + :return: A tuple containing the major and minor numbers of the + earliest version of SSSOM that defines the given slot + in the given class. May be None if the requested slot + name is not a valid slot name. """ try: slot = self.view.induced_slot(slot_name, class_name) - return SssomVersionEnum(slot.annotations.added_in.value) + version = [int(s) for s in slot.annotations.added_in.value.split(".")] + if len(version) != 2: + # Should never happen, schema is incorrect + return None + return (version[0], version[1]) except AttributeError: # No added_in annotation, defaults to 1.0 - return SssomVersionEnum("1.0") + return (1, 0) except ValueError: # No such slot return None diff --git a/src/sssom/util.py b/src/sssom/util.py index 5f4cdfb3..20ce423c 100644 --- a/src/sssom/util.py +++ b/src/sssom/util.py @@ -501,18 +501,18 @@ def _to_string(row: dict[str, Any], side: str) -> str: def get_compatible_version(self) -> str: """Get the minimum version of SSSOM this set is compatible with.""" schema = SSSOMSchemaView() - versions: Set[str] = set() + versions: Set[Tuple[int, int]] = set() # First get the minimum versions required by the slots present # in the set; this is entirely provided by the SSSOM model. 
for slot in self.metadata.keys(): version = schema.get_minimum_version(slot, "mapping set") if version is not None: - versions.add(str(version)) + versions.add(version) for slot in self.df.columns: version = schema.get_minimum_version(slot, "mapping") if version is not None: - versions.add(str(version)) + versions.add(version) # Then take care of enum values; we cannot use the SSSOM model # for that (enum values are not tagged with an "added_in" @@ -531,16 +531,13 @@ def get_compatible_version(self) -> str: and "composed entity expression" in self.df[OBJECT_TYPE].values ) ): - versions.add("1.1") + versions.add((1, 1)) if MAPPING_CARDINALITY in self.df.columns and "0:0" in self.df[MAPPING_CARDINALITY].values: - versions.add("1.1") + versions.add((1, 1)) - # Get the highest of the accumulated versions. We do a numerical - # sort, so that version 1.10 (if we ever get that far in the 1.x - # branch) does not get sorted before version 1.9. - def _version_to_compare_key(version): - return tuple(int(s) for s in version.split(".")) + # Get the highest of the accumulated versions. + return ".".join([str(i) for i in max(versions)]) return max(versions, key=_version_to_compare_key) From b690800653783138904563c6afddbf2aa6e984bf Mon Sep 17 00:00:00 2001 From: Damien Goutte-Gattat Date: Wed, 10 Sep 2025 00:38:04 +0100 Subject: [PATCH 07/12] Remove dead code. --- src/sssom/util.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/sssom/util.py b/src/sssom/util.py index 20ce423c..27d3242a 100644 --- a/src/sssom/util.py +++ b/src/sssom/util.py @@ -539,8 +539,6 @@ def get_compatible_version(self) -> str: # Get the highest of the accumulated versions. return ".".join([str(i) for i in max(versions)]) - return max(versions, key=_version_to_compare_key) - def _standardize_curie_or_iri(curie_or_iri: str, *, converter: Converter) -> str: """Standardize a CURIE or IRI, returning the original if not possible. 
From 04954d487a6ad98d14383b572f2ceed51802a465 Mon Sep 17 00:00:00 2001 From: Damien Goutte-Gattat Date: Wed, 10 Sep 2025 11:01:48 +0100 Subject: [PATCH 08/12] Add helper function to parse SSSOM version number. Add a small helper function to turn a "X.Y" string into a valid SSSOM version number represented as a tuple of integers (X, Y). Instead of working on the input string directly (splitting into two substrings, then converting the substrings to integers), we first convert the string into a SssomVersionEnum object, from which we get the string back. This is so we can rely on the LinkML-generated code to automatically check that the provided value is a valid value that correctly identifies a valid SSSOM version, without having to embed into the method the knowledge of which versions are valid at any given time. --- src/sssom/constants.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/src/sssom/constants.py b/src/sssom/constants.py index 33fd6588..f99fd6ce 100644 --- a/src/sssom/constants.py +++ b/src/sssom/constants.py @@ -10,6 +10,7 @@ import yaml from linkml_runtime.utils.schema_as_dict import schema_as_dict from linkml_runtime.utils.schemaview import SchemaView +from sssom_schema.datamodel.sssom_schema import SssomVersionEnum HERE = pathlib.Path(__file__).parent.resolve() @@ -300,17 +301,27 @@ def get_minimum_version( """ try: slot = self.view.induced_slot(slot_name, class_name) - version = [int(s) for s in slot.annotations.added_in.value.split(".")] - if len(version) != 2: - # Should never happen, schema is incorrect - return None - return (version[0], version[1]) + return parse_sssom_version(slot.annotations.added_in.value) except AttributeError: # No added_in annotation, defaults to 1.0 return (1, 0) except ValueError: # No such slot return None +def parse_sssom_version(version: str) -> Tuple[int, int]: + """Parse a string into a valid SSSOM version number. 
+ + :param version: The string to parse into a version number. + :return: A (major, minor) tuple. + """ + v = [int(n) for n in SssomVersionEnum(version).code.text.split(".")] + if len(v) != 2: + # Should never happen, should be caught by the SssomVersionEnum + # constructor before we arrive here + raise ValueError("Invalid version") + return (v[0], v[1]) + + @lru_cache(1) def _get_sssom_schema_object() -> SSSOMSchemaView: """Get a view over the SSSOM schema.""" From d90911b3abae4dad0fdd24b2a458627fada401b4 Mon Sep 17 00:00:00 2001 From: Damien Goutte-Gattat Date: Wed, 10 Sep 2025 11:09:58 +0100 Subject: [PATCH 09/12] Add method to enforce compliance with a given SSSOM version. Add a new method `MappingSetDataFrame#enforce_version()` to ensure that a mapping set is compliant with a given version of the SSSOM specification, by removing any slot or slot value that has only been defined in a later version. The method can also be used to optionally remove any extra non-standard slot that has not been properly declared as an extension slot (strict=True).
--- src/sssom/constants.py | 2 ++ src/sssom/util.py | 80 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+) diff --git a/src/sssom/constants.py b/src/sssom/constants.py index f99fd6ce..807d2294 100644 --- a/src/sssom/constants.py +++ b/src/sssom/constants.py @@ -106,6 +106,8 @@ SEE_ALSO = "see_also" OTHER = "other" COMMENT = "comment" +EXTENSION_DEFINITIONS = "extension_definitions" +EXTENSION_SLOT_NAME = "slot_name" CURIE_MAP = "curie_map" SUBJECT_SOURCE_ID = "subject_source_id" diff --git a/src/sssom/util.py b/src/sssom/util.py index 27d3242a..2b7cf8bb 100644 --- a/src/sssom/util.py +++ b/src/sssom/util.py @@ -27,6 +27,8 @@ COLUMN_INVERT_DICTIONARY, COMMENT, CONFIDENCE, + EXTENSION_DEFINITIONS, + EXTENSION_SLOT_NAME, MAPPING_CARDINALITY, MAPPING_JUSTIFICATION, MAPPING_SET_ID, @@ -66,6 +68,7 @@ SSSOMSchemaView, _get_sssom_schema_object, get_default_metadata, + parse_sssom_version, ) from .context import ( SSSOM_BUILT_IN_PREFIXES, @@ -539,6 +542,83 @@ def get_compatible_version(self) -> str: # Get the highest of the accumulated versions. return ".".join([str(i) for i in max(versions)]) + def enforce_version( + self, version: str, strict: bool = False, inplace: bool = False + ) -> "MappingSetDataFrame": + """Ensure the set is compliant with a given version of the SSSOM specification. + + This method will forcefully remove any slot or enum value that + is not defined in the specified version of the specification. + + :param version: The targeted version of the specification, as a + string of the form `X.Y`. + :param strict: If `True`, unknown slots will be removed as well, + unless they are properly declared as extensions. + :param inplace: if `True`, the method will modify and return the + set it has been called upon. The default is to + leave that set untouched and to return a + modified copy. + :return: A set that is compliant with the requested version of + the SSSOM specification. 
+ """ + if inplace: + msdf = self + else: + msdf = MappingSetDataFrame(df=self.df.copy(), metadata=self.metadata.copy()) + + schema = SSSOMSchemaView() + target_version = parse_sssom_version(version) + defined_extensions = [ + ext.get(EXTENSION_SLOT_NAME) for ext in msdf.metadata.get(EXTENSION_DEFINITIONS, []) + ] + + # Helper method to decide whether to keep or discard a slot + def _keep(name: str, version: Optional[Tuple[int, int]]) -> bool: + if version is not None: + # This is a known slot, keep if compatible with target version + return version <= target_version + elif strict: + # Unknown slot in strict mode, keep only if declared as an extension + return name in defined_extensions + else: + # Unknown slot in non-strict mode, always keep + return True + + # First the mapping set slots + to_remove = [ + name + for name in msdf.metadata.keys() + if not _keep(name, schema.get_minimum_version(name, "mapping set")) + ] + if target_version < (1, 1): + # Remove enum values introduced in 1.1 + if msdf.metadata.get(SUBJECT_TYPE) == "composed entity expression": + to_remove.append(SUBJECT_TYPE) + if msdf.metadata.get(OBJECT_TYPE) == "composed entity expression": + to_remove.append(OBJECT_TYPE) + for slot in to_remove: + msdf.metadata.pop(slot) + + # Then the individual mapping record slots + to_remove = [ + name + for name in msdf.df.columns + if not _keep(name, schema.get_minimum_version(name, "mapping")) + ] + msdf.df.drop(columns=to_remove, inplace=True) + if target_version < (1, 1): + # Remove enum values introduced in 1.1 + if SUBJECT_TYPE in msdf.df.columns: + msdf.df.loc[msdf.df[SUBJECT_TYPE] == "composed entity expression", SUBJECT_TYPE] = ( + "" + ) + if OBJECT_TYPE in msdf.df.columns: + msdf.df.loc[msdf.df[OBJECT_TYPE] == "composed entity expression", OBJECT_TYPE] = "" + if MAPPING_CARDINALITY in msdf.df.columns: + msdf.df.loc[msdf.df[MAPPING_CARDINALITY] == "0:0", MAPPING_CARDINALITY] = "" + + return msdf + def _standardize_curie_or_iri(curie_or_iri: str, *, 
converter: Converter) -> str: """Standardize a CURIE or IRI, returning the original if not possible. From ad09a466eae5edd5dcf1439f1ed62ec428174282 Mon Sep 17 00:00:00 2001 From: Damien Goutte-Gattat Date: Wed, 10 Sep 2025 11:13:13 +0100 Subject: [PATCH 10/12] Add test fixture for the new `enforce_version()` method. --- src/sssom/constants.py | 2 ++ tests/data/sssom11-extensions.sssom.tsv | 33 ++++++++++++++++++ tests/test_utils.py | 46 +++++++++++++++++++++++++ 3 files changed, 81 insertions(+) create mode 100644 tests/data/sssom11-extensions.sssom.tsv diff --git a/src/sssom/constants.py b/src/sssom/constants.py index 807d2294..757095d9 100644 --- a/src/sssom/constants.py +++ b/src/sssom/constants.py @@ -75,6 +75,7 @@ MAPPING_SET_VERSION = "mapping_set_version" MAPPING_SET_GROUP = "mapping_set_group" MAPPING_SET_DESCRIPTION = "mapping_set_description" +MAPPING_SET_CONFIDENCE = "mapping_set_confidence" CREATOR_ID = "creator_id" CREATOR_LABEL = "creator_label" AUTHOR_ID = "author_id" @@ -92,6 +93,7 @@ MAPPING_CARDINALITY = "mapping_cardinality" CARDINALITY_SCOPE = "cardinality_scope" MAPPING_TOOL = "mapping_tool" +MAPPING_TOOL_ID = "mapping_tool_id" MAPPING_TOOL_VERSION = "mapping_tool_version" MAPPING_DATE = "mapping_date" PBLICATION_DATE = "publication_date" diff --git a/tests/data/sssom11-extensions.sssom.tsv b/tests/data/sssom11-extensions.sssom.tsv new file mode 100644 index 00000000..381bad56 --- /dev/null +++ b/tests/data/sssom11-extensions.sssom.tsv @@ -0,0 +1,33 @@ +#sssom_version: "1.1" +#curie_map: +# d: http://example.org/d/ +# orcid: https://orcid.org/ +# x: http://example.org/x/ +# z: http://example.org/z/ +#mapping_set_id: https://w3id.org/sssom/mapping/tests/data/sssom11-extensions.sssom.tsv +#mapping_set_confidence: 0.9 +#creator_id: +# - orcid:1234 +# - orcid:5678 +#license: https://creativecommons.org/publicdomain/zero/1.0/ +#extension_definitions: +# - slot_name: ext_fooable +# property: d:fooableProperty +# type_hint: xsd:boolean +# - slot_name: 
ext_fooability_scale +# property: d:fooableScaleProperty +# type_hint: xsd:integer +#ext_fooability_scale: 79 +#ext_undefined: bar +subject_id predicate_id object_id mapping_justification subject_type mapping_tool_id mapping_date ext_fooable ext_undefined +x:appendage owl:equivalentClass z:appendage semapv:LexicalMatching owl class d:matcher 2020-05-30 true bar +x:bone_element owl:equivalentClass z:bone_element semapv:LexicalMatching composed entity expression d:matcher 2020-05-30 false bar +x:bone_element owl:equivalentClass z:bone_element semapv:LexicalMatching owl class d:matcher 2020-05-30 true bar +x:bone_element owl:equivalentClass z:bone_element semapv:LexicalMatching owl class d:matcher 2020-05-30 true bar +x:bone_element owl:equivalentClass z:bone_tissue semapv:LexicalMatching owl class d:matcher 2020-05-30 true bar +x:bone_element owl:equivalentClass z:bone_tissue semapv:LexicalMatching owl class d:matcher 2020-05-30 true bar +x:bone_tissue owl:equivalentClass z:bone_element semapv:LexicalMatching owl class d:matcher 2020-05-30 true bar +x:bone_tissue owl:equivalentClass z:bone_element semapv:LexicalMatching owl class d:matcher 2020-05-30 true bar +x:bone_tissue owl:equivalentClass z:bone_tissue semapv:LexicalMatching owl class d:matcher 2020-05-30 true bar +x:bone_tissue owl:equivalentClass z:bone_tissue semapv:LexicalMatching owl class d:matcher 2020-05-30 true bar +x:bone_tissue owl:equivalentClass z:bone_tissue semapv:LexicalMatching owl class d:matcher 2020-05-30 true bar diff --git a/tests/test_utils.py b/tests/test_utils.py index 3ff69e81..c1ac09e2 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -15,6 +15,8 @@ CARDINALITY_SCOPE, CREATOR_ID, MAPPING_CARDINALITY, + MAPPING_SET_CONFIDENCE, + MAPPING_TOOL_ID, OBJECT_ID, OBJECT_LABEL, OBJECT_TYPE, @@ -674,3 +676,47 @@ def _clone(msdf): msdf11.df[MAPPING_CARDINALITY] = "1:1" msdf11.df.loc[9, MAPPING_CARDINALITY] = "0:0" self.assertEqual("1.1", msdf11.get_compatible_version()) + + def 
test_enforce_version(self) -> None: + """Test that we can force a set to be compliant with a specific SSSOM version.""" + msdf11 = parse_sssom_table(f"{data_dir}/sssom11-extensions.sssom.tsv") + + # The test set contains non-standard slots, but they are + # discarded by the parser (even those properly declared as + # extensions!). To be able to test the "strict" enforcing mode, + # we manually reintroduce the non-standard slots here. + msdf11.metadata["ext_fooability_scale"] = 79 + msdf11.metadata["ext_undefined"] = "bar" + msdf11.df["ext_fooable"] = True + msdf11.df["ext_undefined"] = "bar" + + msdf10 = msdf11.enforce_version("1.0") + # msdf11 should still have all its 1.1 slots since we are not + # using inplace=True + self.assertIn(MAPPING_SET_CONFIDENCE, msdf11.metadata) + self.assertIn(MAPPING_TOOL_ID, msdf11.df.columns) + self.assertIn("composed entity expression", msdf11.df[SUBJECT_TYPE].values) + # But those slots should not be present in msdf10 + self.assertNotIn(MAPPING_SET_CONFIDENCE, msdf10.metadata) + self.assertNotIn(MAPPING_TOOL_ID, msdf10.df.columns) + self.assertNotIn("composed entity expression", msdf10.df[SUBJECT_TYPE].values) + # Further confirm that msdf10 is 1.0-compliant + self.assertEqual("1.0", msdf10.get_compatible_version()) + # Non-standard slots should all be preserved + self.assertIn("ext_fooability_scale", msdf10.metadata) + self.assertIn("ext_undefined", msdf10.metadata) + self.assertIn("ext_fooable", msdf10.df.columns) + self.assertIn("ext_undefined", msdf10.df.columns) + + msdf10 = msdf11.enforce_version("1.0", strict=True) + self.assertEqual("1.0", msdf10.get_compatible_version()) + # Declared non-standard slots should still be there + self.assertIn("ext_fooability_scale", msdf10.metadata) + self.assertIn("ext_fooable", msdf10.df.columns) + # But not undeclared ones + self.assertNotIn("ext_undefined", msdf10.metadata) + self.assertNotIn("ext_undefined", msdf10.df.columns) + + msdf11.enforce_version("1.0", inplace=True) + # now 
msdf11 itself should be 1.0-compliant + self.assertEqual("1.0", msdf11.get_compatible_version()) From dad820639087cc4b761614a4484745d169a818fe Mon Sep 17 00:00:00 2001 From: Damien Goutte-Gattat Date: Wed, 10 Sep 2025 12:04:17 +0100 Subject: [PATCH 11/12] Centralize knowledge about newly added enum values. Currently the fact that a given enum value has been added in a specific version of the specification (for example, "composed entity expression" is new in 1.1) is not formally recorded anywhere, and wherever that information is needed we need custom code to deal with it. This commit adds a new constant `NEW_ENUM_VALUES` that provides that information once and for all. This makes the code of both the `get_compatible_version()` and `enforce_version()` methods much simpler, avoids duplicated information, and will make it easier to later cope with future new enum values. --- src/sssom/constants.py | 23 ++++++++++++++++++ src/sssom/util.py | 55 ++++++++++++------------------------------ 2 files changed, 39 insertions(+), 39 deletions(-) diff --git a/src/sssom/constants.py b/src/sssom/constants.py index 757095d9..e260a85c 100644 --- a/src/sssom/constants.py +++ b/src/sssom/constants.py @@ -2,6 +2,7 @@ import pathlib import uuid +from dataclasses import dataclass from enum import Enum from functools import cached_property, lru_cache from typing import Any, Dict, List, Literal, Optional, Set, TextIO, Tuple, Union @@ -219,6 +220,28 @@ class SchemaValidationType(str, Enum): ] +@dataclass +class NewEnumValue(object): + """Represents an enum value that was added after version 1.0. + + Ideally that information should be encoded in the LinkML schema and + made available through the SSSOMSchemaView class below, but it does + not seem possible to annotate enum values in LinkML the way it can + be done for slots. So the information comes from the spec instead, + at . 
+ """ + + slots: list[str] # Impacted slots + value: str # The new value + added_in: tuple[int, int] # Version that introduced the new value + + +NEW_ENUM_VALUES = [ + NewEnumValue([SUBJECT_TYPE, OBJECT_TYPE], "composed entity expression", (1, 1)), + NewEnumValue([MAPPING_CARDINALITY], "0:0", (1, 1)), +] + + class SSSOMSchemaView(object): """ SchemaView class from linkml which is instantiated when necessary. diff --git a/src/sssom/util.py b/src/sssom/util.py index 2b7cf8bb..a131eeef 100644 --- a/src/sssom/util.py +++ b/src/sssom/util.py @@ -33,12 +33,12 @@ MAPPING_JUSTIFICATION, MAPPING_SET_ID, MAPPING_SET_SOURCE, + NEW_ENUM_VALUES, NO_TERM_FOUND, OBJECT_CATEGORY, OBJECT_ID, OBJECT_LABEL, OBJECT_SOURCE, - OBJECT_TYPE, OBO_HAS_DB_XREF, OWL_DIFFERENT_FROM, OWL_EQUIVALENT_CLASS, @@ -61,7 +61,6 @@ SUBJECT_ID, SUBJECT_LABEL, SUBJECT_SOURCE, - SUBJECT_TYPE, UNKNOWN_IRI, MetadataType, PathOrIO, @@ -517,27 +516,13 @@ def get_compatible_version(self) -> str: if version is not None: versions.add(version) - # Then take care of enum values; we cannot use the SSSOM model - # for that (enum values are not tagged with an "added_in" - # annotation the way slots are), so this has to be handled - # "manually" based on the informations provided in - # . 
- if ( - self.metadata.get(SUBJECT_TYPE) == "composed entity expression" - or self.metadata.get(OBJECT_TYPE) == "composed entity expression" - or ( - SUBJECT_TYPE in self.df.columns - and "composed entity expression" in self.df[SUBJECT_TYPE].values - ) - or ( - OBJECT_TYPE in self.df.columns - and "composed entity expression" in self.df[OBJECT_TYPE].values - ) - ): - versions.add((1, 1)) - - if MAPPING_CARDINALITY in self.df.columns and "0:0" in self.df[MAPPING_CARDINALITY].values: - versions.add((1, 1)) + # Then take care of enum values + for new_enum_value in NEW_ENUM_VALUES: + for slot in new_enum_value.slots: + if self.metadata.get(slot) == new_enum_value.value or ( + slot in self.df.columns and new_enum_value.value in self.df[slot].values + ): + versions.add(new_enum_value.added_in) # Get the highest of the accumulated versions. return ".".join([str(i) for i in max(versions)]) @@ -590,12 +575,10 @@ def _keep(name: str, version: Optional[Tuple[int, int]]) -> bool: for name in msdf.metadata.keys() if not _keep(name, schema.get_minimum_version(name, "mapping set")) ] - if target_version < (1, 1): - # Remove enum values introduced in 1.1 - if msdf.metadata.get(SUBJECT_TYPE) == "composed entity expression": - to_remove.append(SUBJECT_TYPE) - if msdf.metadata.get(OBJECT_TYPE) == "composed entity expression": - to_remove.append(OBJECT_TYPE) + for new_enum_value in [v for v in NEW_ENUM_VALUES if v.added_in > target_version]: + for slot in new_enum_value.slots: + if msdf.metadata.get(slot) == new_enum_value.value: + to_remove.append(slot) for slot in to_remove: msdf.metadata.pop(slot) @@ -606,16 +589,10 @@ def _keep(name: str, version: Optional[Tuple[int, int]]) -> bool: if not _keep(name, schema.get_minimum_version(name, "mapping")) ] msdf.df.drop(columns=to_remove, inplace=True) - if target_version < (1, 1): - # Remove enum values introduced in 1.1 - if SUBJECT_TYPE in msdf.df.columns: - msdf.df.loc[msdf.df[SUBJECT_TYPE] == "composed entity expression", SUBJECT_TYPE] 
= ( - "" - ) - if OBJECT_TYPE in msdf.df.columns: - msdf.df.loc[msdf.df[OBJECT_TYPE] == "composed entity expression", OBJECT_TYPE] = "" - if MAPPING_CARDINALITY in msdf.df.columns: - msdf.df.loc[msdf.df[MAPPING_CARDINALITY] == "0:0", MAPPING_CARDINALITY] = "" + for new_enum_value in [v for v in NEW_ENUM_VALUES if v.added_in > target_version]: + for slot in new_enum_value.slots: + if slot in msdf.df.columns: + msdf.df.loc[msdf.df[slot] == new_enum_value.value, slot] = "" return msdf From 50c8589039a1cbb8f14b57c45b04d2aaeb037ba8 Mon Sep 17 00:00:00 2001 From: Damien Goutte-Gattat Date: Wed, 8 Oct 2025 10:11:22 +0100 Subject: [PATCH 12/12] Encapsulate new enum logic into the SSSOMSchemaView class. Add a `get_new_enum_values()` method to the SSSOMSchemaView class to get enum values that were introduced after a given version of the spec. First, this spares client code from having to explicitly import the NEW_ENUM_VALUES constant. Second, hopefully in the future we might be able to get the information about new enum values directly from the LinkML schema rather than from a hard-coded list, and when that happens we will simply have to update the `get_new_enum_values()` method without impacting the code that is calling that method. --- src/sssom/constants.py | 11 +++++++++++ src/sssom/util.py | 7 +++---- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/src/sssom/constants.py b/src/sssom/constants.py index 258431f4..41d52452 100644 --- a/src/sssom/constants.py +++ b/src/sssom/constants.py @@ -328,6 +328,17 @@ def propagatable_slots(self) -> List[str]: slots.append(slot_name) return slots + def get_new_enum_values(self, after: Tuple[int, int] = (1, 0)) -> List[NewEnumValue]: + """Get enum values introduced after a given version of the specification. + + :param after: The target version of the SSSOM specification, as + a (major, minor) tuple. The default is (1,0), + meaning all enum values introduced in any version + after 1.0 will be returned. 
+ :return: The list of newly introduced enum values. + """ + return [v for v in NEW_ENUM_VALUES if v.added_in > after] + def get_minimum_version( self, slot_name: str, class_name: str = "mapping" ) -> Optional[Tuple[int, int]]: diff --git a/src/sssom/util.py b/src/sssom/util.py index 94cf53b8..a1f00816 100644 --- a/src/sssom/util.py +++ b/src/sssom/util.py @@ -48,7 +48,6 @@ MAPPING_JUSTIFICATION, MAPPING_SET_ID, MAPPING_SET_SOURCE, - NEW_ENUM_VALUES, NO_TERM_FOUND, OBJECT_CATEGORY, OBJECT_ID, @@ -530,7 +529,7 @@ def get_compatible_version(self) -> str: versions.add(version) # Then take care of enum values - for new_enum_value in NEW_ENUM_VALUES: + for new_enum_value in schema.get_new_enum_values(): for slot in new_enum_value.slots: if self.metadata.get(slot) == new_enum_value.value or ( slot in self.df.columns and new_enum_value.value in self.df[slot].values @@ -588,7 +587,7 @@ def _keep(name: str, version: Optional[Tuple[int, int]]) -> bool: for name in msdf.metadata.keys() if not _keep(name, schema.get_minimum_version(name, "mapping set")) ] - for new_enum_value in [v for v in NEW_ENUM_VALUES if v.added_in > target_version]: + for new_enum_value in schema.get_new_enum_values(after=target_version): for slot in new_enum_value.slots: if msdf.metadata.get(slot) == new_enum_value.value: to_remove.append(slot) @@ -602,7 +601,7 @@ def _keep(name: str, version: Optional[Tuple[int, int]]) -> bool: if not _keep(name, schema.get_minimum_version(name, "mapping")) ] msdf.df.drop(columns=to_remove, inplace=True) - for new_enum_value in [v for v in NEW_ENUM_VALUES if v.added_in > target_version]: + for new_enum_value in schema.get_new_enum_values(after=target_version): for slot in new_enum_value.slots: if slot in msdf.df.columns: msdf.df.loc[msdf.df[slot] == new_enum_value.value, slot] = ""