@dataclass
class NewEnumValue(object):
    """Describe an enum value that was added to SSSOM after version 1.0.

    Ideally that information should be encoded in the LinkML schema and
    made available through the SSSOMSchemaView class below, but it does
    not seem possible to annotate enum values in LinkML the way it can
    be done for slots. So the information is transcribed by hand from
    the SSSOM specification instead.
    """

    # Names of the slots that accept the new value
    slots: List[str]
    # The enum value itself, as it is compared against slot values
    value: str
    # (major, minor) version of the specification that introduced the value
    added_in: Tuple[int, int]
def get_compatible_version(self) -> str:
    """Get the minimum version of SSSOM this set is compatible with.

    The result is the highest of the versions required by (1) the
    mapping set slots found in the metadata, (2) the mapping slots
    found as columns of the data frame, and (3) any enum value,
    introduced after 1.0, that is used either in the metadata or in
    the data frame. Names for which the schema reports no minimum
    version (i.e. that are not valid slot names) are ignored.

    :return: The version, as a string of the form `X.Y`. If nothing
             in the set requires a later version (including when the
             set contains no recognised slot at all), this is `1.0`.
    """
    schema = SSSOMSchemaView()
    versions: Set[Tuple[int, int]] = set()

    # First get the minimum versions required by the slots present
    # in the set; this is entirely provided by the SSSOM model.
    for slot in self.metadata.keys():
        version = schema.get_minimum_version(slot, "mapping set")
        if version is not None:
            versions.add(version)
    for slot in self.df.columns:
        version = schema.get_minimum_version(slot, "mapping")
        if version is not None:
            versions.add(version)

    # Then take care of enum values
    for new_enum_value in schema.get_new_enum_values():
        for slot in new_enum_value.slots:
            if self.metadata.get(slot) == new_enum_value.value or (
                slot in self.df.columns and new_enum_value.value in self.df[slot].values
            ):
                versions.add(new_enum_value.added_in)

    # Get the highest of the accumulated versions. Nothing may have
    # been accumulated at all (no known slot in the set); such a set
    # does not require anything beyond 1.0, so fall back to that
    # instead of letting max() fail on an empty collection.
    return ".".join(str(i) for i in max(versions, default=(1, 0)))


def enforce_version(
    self, version: str, strict: bool = False, inplace: bool = False
) -> "MappingSetDataFrame":
    """Ensure the set is compliant with a given version of the SSSOM specification.

    This method will forcefully remove any slot or enum value that
    is not defined in the specified version of the specification:

    * mapping set slots and mapping columns that require a later
      version than the target are dropped;
    * a mapping set slot whose value is an enum value introduced
      after the target version is dropped;
    * in the mapping records, such an enum value is replaced by an
      empty string (the column itself is kept).

    :param version: The targeted version of the specification, as a
                    string of the form `X.Y`. It is parsed with
                    `parse_sssom_version`; whatever that function
                    raises for an invalid string is propagated.
    :param strict: If `True`, unknown slots will be removed as well,
                   unless they are properly declared as extensions.
    :param inplace: if `True`, the method will modify and return the
                    set it has been called upon. The default is to
                    leave that set untouched and to return a
                    modified copy.
    :return: A set that is compliant with the requested version of
             the SSSOM specification.
    """
    if inplace:
        msdf = self
    else:
        # NOTE(review): only `df` and `metadata` are carried over to
        # the copy; if the class has other constructor fields they
        # fall back to their defaults here -- confirm this is intended.
        msdf = MappingSetDataFrame(df=self.df.copy(), metadata=self.metadata.copy())

    schema = SSSOMSchemaView()
    target_version = parse_sssom_version(version)
    # Enum values that did not exist yet in the target version; used
    # for both the metadata and the mapping records below.
    too_recent_values = schema.get_new_enum_values(after=target_version)
    # `or []` also covers a slot that is present but explicitly null.
    defined_extensions = [
        ext.get(EXTENSION_SLOT_NAME) for ext in msdf.metadata.get(EXTENSION_DEFINITIONS) or []
    ]

    # Helper method to decide whether to keep or discard a slot
    def _keep(name: str, min_version: Optional[Tuple[int, int]]) -> bool:
        if min_version is not None:
            # This is a known slot, keep if compatible with target version
            return min_version <= target_version
        if strict:
            # Unknown slot in strict mode, keep only if declared as an extension
            return name in defined_extensions
        # Unknown slot in non-strict mode, always keep
        return True

    # First the mapping set slots. Removals are collected in a set
    # (and popped with a default) so that a slot flagged twice -- once
    # for its version, once for its value -- cannot cause a KeyError.
    obsolete_slots = {
        name
        for name in msdf.metadata.keys()
        if not _keep(name, schema.get_minimum_version(name, "mapping set"))
    }
    for new_enum_value in too_recent_values:
        for slot in new_enum_value.slots:
            if msdf.metadata.get(slot) == new_enum_value.value:
                obsolete_slots.add(slot)
    for slot in obsolete_slots:
        msdf.metadata.pop(slot, None)

    # Then the individual mapping record slots
    obsolete_columns = [
        name
        for name in msdf.df.columns
        if not _keep(name, schema.get_minimum_version(name, "mapping"))
    ]
    msdf.df.drop(columns=obsolete_columns, inplace=True)
    for new_enum_value in too_recent_values:
        for slot in new_enum_value.slots:
            if slot in msdf.df.columns:
                # Blank out the offending cells only; other records
                # keep their value for that column.
                msdf.df.loc[msdf.df[slot] == new_enum_value.value, slot] = ""

    return msdf
diff --git a/tests/data/sssom11-extensions.sssom.tsv b/tests/data/sssom11-extensions.sssom.tsv new file mode 100644 index 00000000..d452b32b --- /dev/null +++ b/tests/data/sssom11-extensions.sssom.tsv @@ -0,0 +1,33 @@ +#sssom_version: "1.1" +#curie_map: +# d: http://example.org/d/ +# orcid: https://orcid.org/ +# x: http://example.org/x/ +# z: http://example.org/z/ +#mapping_set_id: https://w3id.org/sssom/mapping/tests/data/sssom11-extensions.sssom.tsv +#mapping_set_confidence: 0.9 +#creator_id: +# - orcid:1234 +# - orcid:5678 +#license: https://creativecommons.org/publicdomain/zero/1.0/ +#extension_definitions: +# - slot_name: ext_fooable +# property: d:fooableProperty +# type_hint: xsd:boolean +# - slot_name: ext_fooability_scale +# property: d:fooableScaleProperty +# type_hint: xsd:integer +#ext_fooability_scale: 79 +#ext_undefined: bar +subject_id predicate_id object_id mapping_justification subject_type mapping_tool_id mapping_date ext_fooable ext_undefined +x:appendage owl:equivalentClass z:appendage semapv:LexicalMatching owl class d:matcher 2020-05-30 true bar +x:bone_element owl:equivalentClass z:bone_element semapv:LexicalMatching composed entity expression d:matcher 2020-05-30 false bar +x:bone_element owl:equivalentClass z:bone_element semapv:LexicalMatching owl class d:matcher 2020-05-30 true bar +x:bone_element owl:equivalentClass z:bone_element semapv:LexicalMatching owl class d:matcher 2020-05-30 true bar +x:bone_element owl:equivalentClass z:bone_tissue semapv:LexicalMatching owl class d:matcher 2020-05-30 true bar +x:bone_element owl:equivalentClass z:bone_tissue semapv:LexicalMatching owl class d:matcher 2020-05-30 true bar +x:bone_tissue owl:equivalentClass z:bone_element semapv:LexicalMatching owl class d:matcher 2020-05-30 true bar +x:bone_tissue owl:equivalentClass z:bone_element semapv:LexicalMatching owl class d:matcher 2020-05-30 true bar +x:bone_tissue owl:equivalentClass z:bone_tissue semapv:LexicalMatching owl class d:matcher 2020-05-30 
def test_inferring_compatible_version(self) -> None:
    """Check that the version a set is compatible with is inferred correctly."""
    baseline = parse_sssom_table(f"{data_dir}/basic.tsv")

    # The unmodified set contains nothing that requires 1.1
    self.assertEqual("1.0", baseline.get_compatible_version())

    def _use_new_set_slot(msdf: MappingSetDataFrame) -> None:
        # A mapping set slot that only exists since 1.1
        msdf.metadata[CARDINALITY_SCOPE] = "predicate_id"

    def _use_new_mapping_slot(msdf: MappingSetDataFrame) -> None:
        # A mapping slot that only exists since 1.1
        msdf.df[PREDICATE_TYPE] = "owl object property"

    def _use_new_entity_type_in_metadata(msdf: MappingSetDataFrame) -> None:
        # An entity_type_enum value that only exists since 1.1
        msdf.metadata[SUBJECT_TYPE] = "composed entity expression"

    def _use_new_entity_type_in_one_record(msdf: MappingSetDataFrame) -> None:
        # Same value, but carried by a single mapping record
        msdf.df[OBJECT_TYPE] = "owl class"
        msdf.df.loc[2, OBJECT_TYPE] = "composed entity expression"

    def _use_zero_cardinality_in_one_record(msdf: MappingSetDataFrame) -> None:
        # The "0:0" cardinality value only exists since 1.1
        msdf.df[MAPPING_CARDINALITY] = "1:1"
        msdf.df.loc[9, MAPPING_CARDINALITY] = "0:0"

    injections = (
        _use_new_set_slot,
        _use_new_mapping_slot,
        _use_new_entity_type_in_metadata,
        _use_new_entity_type_in_one_record,
        _use_zero_cardinality_in_one_record,
    )
    for inject in injections:
        # Each injection is applied to its own fresh copy of the 1.0 set
        variant = MappingSetDataFrame(df=baseline.df.copy(), metadata=baseline.metadata.copy())
        inject(variant)
        self.assertEqual("1.1", variant.get_compatible_version())


def test_enforce_version(self) -> None:
    """Check that a set can be forced to comply with a specific SSSOM version."""
    original = parse_sssom_table(f"{data_dir}/sssom11-extensions.sssom.tsv")

    # The parser discards the non-standard slots found in the test
    # file (including the ones properly declared as extensions), so
    # they are put back by hand; otherwise the "strict" mode could
    # not be exercised.
    original.metadata["ext_fooability_scale"] = 79
    original.metadata["ext_undefined"] = "bar"
    original.df["ext_fooable"] = True
    original.df["ext_undefined"] = "bar"

    new_entity_type = "composed entity expression"

    lenient = original.enforce_version("1.0")
    # inplace=True was not requested, so the source set must still
    # carry everything that is specific to 1.1 ...
    self.assertIn(MAPPING_SET_CONFIDENCE, original.metadata)
    self.assertIn(MAPPING_TOOL_ID, original.df.columns)
    self.assertIn(new_entity_type, original.df[SUBJECT_TYPE].values)
    # ... whereas none of it may remain in the returned set
    self.assertNotIn(MAPPING_SET_CONFIDENCE, lenient.metadata)
    self.assertNotIn(MAPPING_TOOL_ID, lenient.df.columns)
    self.assertNotIn(new_entity_type, lenient.df[SUBJECT_TYPE].values)
    # The returned set must itself be recognised as a 1.0 set
    self.assertEqual("1.0", lenient.get_compatible_version())
    # In non-strict mode every non-standard slot survives
    for slot_name in ("ext_fooability_scale", "ext_undefined"):
        self.assertIn(slot_name, lenient.metadata)
    for column_name in ("ext_fooable", "ext_undefined"):
        self.assertIn(column_name, lenient.df.columns)

    strict = original.enforce_version("1.0", strict=True)
    self.assertEqual("1.0", strict.get_compatible_version())
    # Non-standard slots declared as extensions survive strict mode
    self.assertIn("ext_fooability_scale", strict.metadata)
    self.assertIn("ext_fooable", strict.df.columns)
    # Undeclared ones do not
    self.assertNotIn("ext_undefined", strict.metadata)
    self.assertNotIn("ext_undefined", strict.df.columns)

    # With inplace=True the source set itself becomes 1.0-compliant
    original.enforce_version("1.0", inplace=True)
    self.assertEqual("1.0", original.get_compatible_version())