Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 90 additions & 1 deletion src/sssom/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,28 @@
import importlib.resources
import pathlib
import uuid
from dataclasses import dataclass
from enum import Enum
from functools import cached_property, lru_cache
from typing import Any, ClassVar, Dict, List, Literal, Mapping, Set, TextIO, Union, cast
from typing import (
Any,
ClassVar,
Dict,
List,
Literal,
Mapping,
Optional,
Set,
TextIO,
Tuple,
Union,
cast,
)

import yaml
from linkml_runtime.utils.schema_as_dict import schema_as_dict
from linkml_runtime.utils.schemaview import SchemaView
from sssom_schema.datamodel.sssom_schema import SssomVersionEnum

# Absolute path of the directory that contains this module.
HERE = pathlib.Path(__file__).parent.resolve()

Expand Down Expand Up @@ -77,6 +92,7 @@
# String names of SSSOM slots; each constant holds the slot name exactly
# as it is spelled when used as a metadata key or column name.
MAPPING_SET_VERSION = "mapping_set_version"
MAPPING_SET_GROUP = "mapping_set_group"
MAPPING_SET_DESCRIPTION = "mapping_set_description"
MAPPING_SET_CONFIDENCE = "mapping_set_confidence"
CREATOR_ID = "creator_id"
CREATOR_LABEL = "creator_label"
AUTHOR_ID = "author_id"
Expand All @@ -94,6 +110,7 @@
# String names of SSSOM slots; each constant holds the slot name exactly
# as it is spelled when used as a metadata key or column name.
MAPPING_CARDINALITY = "mapping_cardinality"
CARDINALITY_SCOPE = "cardinality_scope"
MAPPING_TOOL = "mapping_tool"
MAPPING_TOOL_ID = "mapping_tool_id"
MAPPING_TOOL_VERSION = "mapping_tool_version"
MAPPING_DATE = "mapping_date"
PUBLICATION_DATE = "publication_date"
# Misspelled name, kept as an alias so that any code importing it keeps working.
PBLICATION_DATE = PUBLICATION_DATE
Expand All @@ -108,6 +125,8 @@
SEE_ALSO = "see_also"
OTHER = "other"
COMMENT = "comment"
# Metadata key under which extension slots are declared, and the key that
# holds the slot name inside each of those declarations.
EXTENSION_DEFINITIONS = "extension_definitions"
EXTENSION_SLOT_NAME = "slot_name"

CURIE_MAP = "curie_map"
SUBJECT_SOURCE_ID = "subject_source_id"
Expand Down Expand Up @@ -217,6 +236,28 @@ class SchemaValidationType(str, Enum):
]


@dataclass
class NewEnumValue:
    """Describe an enum value that was introduced after version 1.0.

    Ideally this information would be recorded in the LinkML schema and
    exposed through the SSSOMSchemaView class below, but LinkML does not
    seem to allow annotating enum values the way slots can be annotated.
    The data is therefore taken from the specification itself, at
    <https://mapping-commons.github.io/sssom/spec-model/#model-changes-across-versions>.
    """

    # Names of the slots that may hold the value
    slots: list[str]
    # The enum value itself
    value: str
    # (major, minor) version of the specification that introduced the value
    added_in: tuple[int, int]


# Enum values that did not exist in version 1.0, together with the slots
# that accept them and the version that introduced them.
NEW_ENUM_VALUES = [
    NewEnumValue(
        slots=[SUBJECT_TYPE, OBJECT_TYPE],
        value="composed entity expression",
        added_in=(1, 1),
    ),
    NewEnumValue(
        slots=[MAPPING_CARDINALITY],
        value="0:0",
        added_in=(1, 1),
    ),
]


class SSSOMSchemaView(object):
"""SchemaView class from linkml which is instantiated when necessary.

Expand Down Expand Up @@ -287,6 +328,54 @@ def propagatable_slots(self) -> List[str]:
slots.append(slot_name)
return slots

def get_new_enum_values(self, after: Tuple[int, int] = (1, 0)) -> List[NewEnumValue]:
    """Return the enum values added to the specification after a given version.

    :param after: The reference version of the SSSOM specification, as
                  a (major, minor) tuple. With the default of (1,0),
                  every enum value introduced in any version later
                  than 1.0 is returned.
    :return: The enum values introduced strictly after that version.
    """
    newer: List[NewEnumValue] = []
    for candidate in NEW_ENUM_VALUES:
        # Tuples compare element by element, so (1, 1) > (1, 0)
        if candidate.added_in > after:
            newer.append(candidate)
    return newer

def get_minimum_version(
    self, slot_name: str, class_name: str = "mapping"
) -> Optional[Tuple[int, int]]:
    """Return the earliest SSSOM version that defines a slot in a class.

    :param slot_name: Name of the slot to look up.
    :param class_name: Name of the class the slot belongs to. It matters
                       because a slot may have been added to a class in
                       a later version than the one that first
                       introduced the slot in the schema.
    :return: The (major, minor) numbers of the earliest version of SSSOM
             that defines the slot in the given class, or None if the
             name is not a valid slot name.
    """
    minimum: Optional[Tuple[int, int]]
    try:
        induced = self.view.induced_slot(slot_name, class_name)
        added_in = induced.annotations.added_in.value
        minimum = parse_sssom_version(added_in)
    except AttributeError:
        # The slot carries no added_in annotation: it dates back to 1.0
        minimum = (1, 0)
    except ValueError:
        # The slot does not exist
        minimum = None
    return minimum


def parse_sssom_version(version: str) -> Tuple[int, int]:
    """Turn a version string into a (major, minor) SSSOM version number.

    :param version: The string to interpret as a SSSOM version.
    :return: A (major, minor) tuple.
    """
    canonical = SssomVersionEnum(version).code.text
    numbers = tuple(int(part) for part in canonical.split("."))
    if len(numbers) != 2:
        # Not expected to happen: the SssomVersionEnum constructor
        # should have rejected the value before we get here
        raise ValueError("Invalid version")
    major, minor = numbers
    return (major, minor)


@lru_cache(1)
def _get_sssom_schema_object() -> SSSOMSchemaView:
Expand Down
99 changes: 99 additions & 0 deletions src/sssom/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@
COLUMN_INVERT_DICTIONARY,
COMMENT,
CONFIDENCE,
EXTENSION_DEFINITIONS,
EXTENSION_SLOT_NAME,
MAPPING_CARDINALITY,
MAPPING_JUSTIFICATION,
MAPPING_SET_ID,
Expand Down Expand Up @@ -78,6 +80,7 @@
SSSOMSchemaView,
_get_sssom_schema_object,
get_default_metadata,
parse_sssom_version,
)
from .context import (
SSSOM_BUILT_IN_PREFIXES,
Expand Down Expand Up @@ -509,6 +512,102 @@ def _to_string(row: dict[str, Any], side: str) -> str:
# No scope, so remove any pre-existing "cardinality_scope" column
self.df.drop(columns=CARDINALITY_SCOPE, inplace=True, errors="ignore")

def get_compatible_version(self) -> str:
    """Get the minimum version of SSSOM this set is compatible with.

    The result is the highest of the minimum versions required by the
    metadata slots, by the columns of the data frame, and by any enum
    value that was introduced after version 1.0.

    :return: The version as a string of the form `X.Y`. This is "1.0"
             when nothing in the set requires a later version,
             including when the set has no recognised slot at all.
    """
    schema = SSSOMSchemaView()
    # Seed with 1.0 so that max() below always has a candidate, even
    # for a set with empty metadata and no recognised column.
    versions: Set[Tuple[int, int]] = {(1, 0)}

    # First get the minimum versions required by the slots present
    # in the set; this is entirely provided by the SSSOM model.
    for slot in self.metadata:
        version = schema.get_minimum_version(slot, "mapping set")
        if version is not None:
            versions.add(version)
    for slot in self.df.columns:
        version = schema.get_minimum_version(slot, "mapping")
        if version is not None:
            versions.add(version)

    # Then take care of enum values: a slot that exists in an older
    # version may still hold a value that only a newer version allows.
    for new_enum_value in schema.get_new_enum_values():
        for slot in new_enum_value.slots:
            if self.metadata.get(slot) == new_enum_value.value or (
                slot in self.df.columns and new_enum_value.value in self.df[slot].values
            ):
                versions.add(new_enum_value.added_in)

    # Get the highest of the accumulated versions.
    return ".".join(str(i) for i in max(versions))

def enforce_version(
    self, version: str, strict: bool = False, inplace: bool = False
) -> "MappingSetDataFrame":
    """Ensure the set is compliant with a given version of the SSSOM specification.

    This method will forcefully remove any slot or enum value that
    is not defined in the specified version of the specification.
    A metadata slot holding a too-recent enum value is removed
    entirely; in the data frame, such values are replaced by an
    empty string.

    :param version: The targeted version of the specification, as a
                    string of the form `X.Y`.
    :param strict: If `True`, unknown slots will be removed as well,
                   unless they are properly declared as extensions.
    :param inplace: if `True`, the method will modify and return the
                    set it has been called upon. The default is to
                    leave that set untouched and to return a
                    modified copy.
    :return: A set that is compliant with the requested version of
             the SSSOM specification.
    :raises ValueError: If `version` cannot be parsed as a SSSOM
                        version (raised by `parse_sssom_version`).
    """
    if inplace:
        msdf = self
    else:
        msdf = MappingSetDataFrame(df=self.df.copy(), metadata=self.metadata.copy())

    schema = SSSOMSchemaView()
    target_version = parse_sssom_version(version)
    # `or []` also covers an `extension_definitions` key that is present
    # but holds no value.
    defined_extensions = [
        ext.get(EXTENSION_SLOT_NAME) for ext in msdf.metadata.get(EXTENSION_DEFINITIONS) or []
    ]
    # Enum values that the target version does not know about; the same
    # list serves both the metadata pass and the data frame pass.
    too_recent_enum_values = schema.get_new_enum_values(after=target_version)

    # Helper method to decide whether to keep or discard a slot
    def _keep(name: str, version: Optional[Tuple[int, int]]) -> bool:
        if version is not None:
            # This is a known slot, keep if compatible with target version
            return version <= target_version
        elif strict:
            # Unknown slot in strict mode, keep only if declared as an extension
            return name in defined_extensions
        else:
            # Unknown slot in non-strict mode, always keep
            return True

    # First the mapping set slots
    to_remove = [
        name
        for name in msdf.metadata.keys()
        if not _keep(name, schema.get_minimum_version(name, "mapping set"))
    ]
    for new_enum_value in too_recent_enum_values:
        for slot in new_enum_value.slots:
            if msdf.metadata.get(slot) == new_enum_value.value:
                to_remove.append(slot)
    for slot in to_remove:
        # The same slot may have been listed twice (once by the slot
        # filter, once because of its enum value), so tolerate a slot
        # that has already been removed.
        msdf.metadata.pop(slot, None)

    # Then the individual mapping record slots
    to_remove = [
        name
        for name in msdf.df.columns
        if not _keep(name, schema.get_minimum_version(name, "mapping"))
    ]
    msdf.df.drop(columns=to_remove, inplace=True)
    for new_enum_value in too_recent_enum_values:
        for slot in new_enum_value.slots:
            if slot in msdf.df.columns:
                # Blank out the offending cells but keep the column itself
                msdf.df.loc[msdf.df[slot] == new_enum_value.value, slot] = ""

    return msdf


def _standardize_curie_or_iri(curie_or_iri: str, *, converter: Converter) -> str:
"""Standardize a CURIE or IRI, returning the original if not possible.
Expand Down
33 changes: 33 additions & 0 deletions tests/data/sssom11-extensions.sssom.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#sssom_version: "1.1"
#curie_map:
# d: http://example.org/d/
# orcid: https://orcid.org/
# x: http://example.org/x/
# z: http://example.org/z/
#mapping_set_id: https://w3id.org/sssom/mapping/tests/data/sssom11-extensions.sssom.tsv
#mapping_set_confidence: 0.9
#creator_id:
# - orcid:1234
# - orcid:5678
#license: https://creativecommons.org/publicdomain/zero/1.0/
#extension_definitions:
# - slot_name: ext_fooable
# property: d:fooableProperty
# type_hint: xsd:boolean
# - slot_name: ext_fooability_scale
# property: d:fooableScaleProperty
# type_hint: xsd:integer
#ext_fooability_scale: 79
#ext_undefined: bar
subject_id predicate_id object_id mapping_justification subject_type mapping_tool_id mapping_date ext_fooable ext_undefined
x:appendage owl:equivalentClass z:appendage semapv:LexicalMatching owl class d:matcher 2020-05-30 true bar
x:bone_element owl:equivalentClass z:bone_element semapv:LexicalMatching composed entity expression d:matcher 2020-05-30 false bar
x:bone_element owl:equivalentClass z:bone_element semapv:LexicalMatching owl class d:matcher 2020-05-30 true bar
x:bone_element owl:equivalentClass z:bone_element semapv:LexicalMatching owl class d:matcher 2020-05-30 true bar
x:bone_element owl:equivalentClass z:bone_tissue semapv:LexicalMatching owl class d:matcher 2020-05-30 true bar
x:bone_element owl:equivalentClass z:bone_tissue semapv:LexicalMatching owl class d:matcher 2020-05-30 true bar
x:bone_tissue owl:equivalentClass z:bone_element semapv:LexicalMatching owl class d:matcher 2020-05-30 true bar
x:bone_tissue owl:equivalentClass z:bone_element semapv:LexicalMatching owl class d:matcher 2020-05-30 true bar
x:bone_tissue owl:equivalentClass z:bone_tissue semapv:LexicalMatching owl class d:matcher 2020-05-30 true bar
x:bone_tissue owl:equivalentClass z:bone_tissue semapv:LexicalMatching owl class d:matcher 2020-05-30 true bar
x:bone_tissue owl:equivalentClass z:bone_tissue semapv:LexicalMatching owl class d:matcher 2020-05-30 true bar
86 changes: 86 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,17 @@
CARDINALITY_SCOPE,
CREATOR_ID,
MAPPING_CARDINALITY,
MAPPING_SET_CONFIDENCE,
MAPPING_TOOL_ID,
OBJECT_ID,
OBJECT_LABEL,
OBJECT_TYPE,
PREDICATE_ID,
PREDICATE_TYPE,
SEMAPV,
SUBJECT_ID,
SUBJECT_LABEL,
SUBJECT_TYPE,
MetadataType,
)
from sssom.context import SSSOM_BUILT_IN_PREFIXES, ensure_converter
Expand Down Expand Up @@ -635,3 +640,84 @@ def test_infer_scoped_cardinality(self) -> None:
expected = ["1:n", "1:n", "1:n", "1:n", "1:n", "1:n"]
self.assertEqual(expected, list(msdf.df[MAPPING_CARDINALITY].values))
self.assertNotIn(CARDINALITY_SCOPE, msdf.df.columns)

def test_inferring_compatible_version(self) -> None:
    """Check that the version a set is compatible with is inferred correctly."""
    baseline = parse_sssom_table(f"{data_dir}/basic.tsv")

    def _fresh_copy() -> MappingSetDataFrame:
        # Every scenario below starts from an untouched copy of the 1.0 set
        return MappingSetDataFrame(df=baseline.df.copy(), metadata=baseline.metadata.copy())

    # The unmodified set does not use anything that requires 1.1
    self.assertEqual("1.0", baseline.get_compatible_version())

    # A slot that only exists since 1.1, in the set metadata
    candidate = _fresh_copy()
    candidate.metadata[CARDINALITY_SCOPE] = "predicate_id"
    self.assertEqual("1.1", candidate.get_compatible_version())

    # A slot that only exists since 1.1, as a mapping column
    candidate = _fresh_copy()
    candidate.df[PREDICATE_TYPE] = "owl object property"
    self.assertEqual("1.1", candidate.get_compatible_version())

    # An entity_type_enum value that only exists since 1.1, in the metadata
    candidate = _fresh_copy()
    candidate.metadata[SUBJECT_TYPE] = "composed entity expression"
    self.assertEqual("1.1", candidate.get_compatible_version())

    # The same enum value, but carried by a single mapping record
    candidate = _fresh_copy()
    candidate.df[OBJECT_TYPE] = "owl class"
    candidate.df.loc[2, OBJECT_TYPE] = "composed entity expression"
    self.assertEqual("1.1", candidate.get_compatible_version())

    # The "0:0" cardinality value, which only exists since 1.1
    candidate = _fresh_copy()
    candidate.df[MAPPING_CARDINALITY] = "1:1"
    candidate.df.loc[9, MAPPING_CARDINALITY] = "0:0"
    self.assertEqual("1.1", candidate.get_compatible_version())

def test_enforce_version(self) -> None:
    """Check that a set can be forced to comply with a specific SSSOM version."""
    original = parse_sssom_table(f"{data_dir}/sssom11-extensions.sssom.tsv")
    composed = "composed entity expression"

    # The parser discards the non-standard slots found in the test
    # file (including the ones properly declared as extensions), so
    # put them back by hand to be able to exercise the "strict" mode.
    original.metadata["ext_fooability_scale"] = 79
    original.metadata["ext_undefined"] = "bar"
    original.df["ext_fooable"] = True
    original.df["ext_undefined"] = "bar"

    downgraded = original.enforce_version("1.0")
    # Without inplace=True, the original set must keep its 1.1 content
    self.assertIn(MAPPING_SET_CONFIDENCE, original.metadata)
    self.assertIn(MAPPING_TOOL_ID, original.df.columns)
    self.assertIn(composed, original.df[SUBJECT_TYPE].values)
    # ...while the returned set must have lost it
    self.assertNotIn(MAPPING_SET_CONFIDENCE, downgraded.metadata)
    self.assertNotIn(MAPPING_TOOL_ID, downgraded.df.columns)
    self.assertNotIn(composed, downgraded.df[SUBJECT_TYPE].values)
    # The returned set must be recognised as 1.0-compliant
    self.assertEqual("1.0", downgraded.get_compatible_version())
    # In non-strict mode, every non-standard slot is preserved
    for key in ("ext_fooability_scale", "ext_undefined"):
        self.assertIn(key, downgraded.metadata)
    for column in ("ext_fooable", "ext_undefined"):
        self.assertIn(column, downgraded.df.columns)

    downgraded = original.enforce_version("1.0", strict=True)
    self.assertEqual("1.0", downgraded.get_compatible_version())
    # In strict mode, declared extensions survive
    self.assertIn("ext_fooability_scale", downgraded.metadata)
    self.assertIn("ext_fooable", downgraded.df.columns)
    # ...and undeclared slots are dropped
    self.assertNotIn("ext_undefined", downgraded.metadata)
    self.assertNotIn("ext_undefined", downgraded.df.columns)

    original.enforce_version("1.0", inplace=True)
    # With inplace=True, the original set itself becomes 1.0-compliant
    self.assertEqual("1.0", original.get_compatible_version())
Loading