Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SchemaView object should be instantiated when needed & not globally. #323

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/pypi-publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,10 @@ jobs:
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v3.0.2
- uses: actions/checkout@v3

- name: Set up Python
uses: actions/setup-python@v3.1.2
uses: actions/setup-python@v4.3.0
with:
python-version: 3.9

Expand Down
7 changes: 3 additions & 4 deletions sssom/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,9 @@

from sssom.constants import (
DEFAULT_VALIDATION_TYPES,
MAPPING_SET_SLOTS,
MAPPING_SLOTS,
PREFIX_MAP_MODES,
SchemaValidationType,
SSSOMSchemaView,
)
from sssom.context import get_default_metadata

Expand Down Expand Up @@ -628,7 +627,7 @@ def decorator(f):
@main.command()
@input_argument
@output_option
@dynamically_generate_sssom_options(MAPPING_SLOTS)
@dynamically_generate_sssom_options(SSSOMSchemaView().mapping_slots)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if this will defeat the purpose of the refactoring. I should have thought of this before.

I am wondering now if we need to find a way to skip network access in SchemaView()?

def filter(input: str, output: TextIO, **kwargs):
"""Filter a dataframe by dynamically generating queries based on user input.

Expand Down Expand Up @@ -659,7 +658,7 @@ def filter(input: str, output: TextIO, **kwargs):
type=bool,
help="Multivalued slots should be replaced or not. [default: False]",
)
@dynamically_generate_sssom_options(MAPPING_SET_SLOTS)
@dynamically_generate_sssom_options(SSSOMSchemaView().mapping_set_slots)
def annotate(input: str, output: TextIO, replace_multivalued: bool, **kwargs):
"""Annotate metadata of a mapping set.

Expand Down
35 changes: 16 additions & 19 deletions sssom/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,6 @@

HERE = pathlib.Path(__file__).parent.resolve()

SCHEMA_YAML = pkg_resources.resource_filename(
"sssom_schema", "schema/sssom_schema.yaml"
)
SCHEMA_VIEW = SchemaView(SCHEMA_YAML)
# SCHEMA_VIEW = package_schemaview("sssom_schema")
SCHEMA_DICT = schema_as_dict(SCHEMA_VIEW.schema)
MAPPING_SLOTS = SCHEMA_DICT["classes"]["mapping"]["slots"]
MAPPING_SET_SLOTS = SCHEMA_DICT["classes"]["mapping set"]["slots"]

OWL_EQUIV_CLASS = "http://www.w3.org/2002/07/owl#equivalentClass"
RDFS_SUBCLASS_OF = "http://www.w3.org/2000/01/rdf-schema#subClassOf"

Expand All @@ -42,16 +33,6 @@
PREFIX_MAP_MODE_SSSOM_DEFAULT_ONLY,
PREFIX_MAP_MODE_MERGED,
]
ENTITY_REFERENCE = "EntityReference"

MULTIVALUED_SLOTS = [
c for c in SCHEMA_VIEW.all_slots() if SCHEMA_VIEW.get_slot(c).multivalued
]
ENTITY_REFERENCE_SLOTS = [
c
for c in SCHEMA_VIEW.all_slots()
if SCHEMA_VIEW.get_slot(c).range == ENTITY_REFERENCE
]

# Slot Constants
MIRROR_FROM = "mirror_from"
Expand Down Expand Up @@ -177,3 +158,19 @@ class SchemaValidationType(str, Enum):
SchemaValidationType.JsonSchema,
SchemaValidationType.PrefixMapCompleteness,
]


class SSSOMSchemaView:
"""
SchemaView class from linkml which is instantiated when necessary.

Reason for this: https://github.com/mapping-commons/sssom-py/issues/322
Implemented via PR: https://github.com/mapping-commons/sssom-py/pull/323
"""

entity_reference = "EntityReference"
yaml = pkg_resources.resource_filename("sssom_schema", "schema/sssom_schema.yaml")
view = SchemaView(yaml)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

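I am not sure, but you may have to move these assignments into `__init__()` to avoid this code being run as soon as the class is defined (i.e., at import time) rather than only when the class is instantiated: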

https://stackoverflow.com/questions/9056957/correct-way-to-define-class-variables-in-python

But not 100% sure.

dict = schema_as_dict(view.schema)
mapping_slots = dict["classes"]["mapping"]["slots"]
mapping_set_slots = dict["classes"]["mapping set"]["slots"]
4 changes: 2 additions & 2 deletions sssom/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from linkml.generators.jsonldcontextgen import ContextGenerator

from sssom.constants import SCHEMA_YAML
from sssom.constants import SSSOMSchemaView

from .external_context import sssom_external_context
from .typehints import Metadata, MetadataType, PrefixMap
Expand All @@ -29,7 +29,7 @@ def get_jsonld_context():

:return: JSON-LD context
"""
sssom_context = ContextGenerator(SCHEMA_YAML).serialize()
sssom_context = ContextGenerator(SSSOMSchemaView().yaml).serialize()
return json.loads(sssom_context, strict=False)


Expand Down
10 changes: 6 additions & 4 deletions sssom/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,6 @@
MAPPING_JUSTIFICATION,
MAPPING_JUSTIFICATION_UNSPECIFIED,
MAPPING_SET_ID,
MAPPING_SET_SLOTS,
MAPPING_SLOTS,
OBJECT_ID,
OBJECT_LABEL,
OBJECT_SOURCE,
Expand All @@ -43,6 +41,7 @@
SUBJECT_LABEL,
SUBJECT_SOURCE,
SUBJECT_SOURCE_ID,
SSSOMSchemaView,
)

from .context import (
Expand Down Expand Up @@ -310,6 +309,9 @@ def _get_mdict_ms_and_bad_attrs(
) -> Tuple[dict, MappingSet, Counter]:

mdict = {}
schema_view = SSSOMSchemaView()
mapping_slots = schema_view.mapping_slots
mapping_set_slots = schema_view.mapping_set_slots

for k, v in row.items():
if v and v == v:
Expand All @@ -318,11 +320,11 @@ def _get_mdict_ms_and_bad_attrs(
k = str(k)
v = _address_multivalued_slot(k, v)
# if hasattr(Mapping, k):
if k in MAPPING_SLOTS:
if k in mapping_slots:
mdict[k] = v
ok = True
# if hasattr(MappingSet, k):
if k in MAPPING_SET_SLOTS:
if k in mapping_set_slots:
ms[k] = v
ok = True
if not ok:
Expand Down
58 changes: 33 additions & 25 deletions sssom/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,9 @@
from .constants import (
COMMENT,
CONFIDENCE,
ENTITY_REFERENCE_SLOTS,
MAPPING_JUSTIFICATION,
MAPPING_SET_ID,
MAPPING_SET_SLOTS,
MAPPING_SET_SOURCE,
MULTIVALUED_SLOTS,
OBJECT_CATEGORY,
OBJECT_ID,
OBJECT_LABEL,
Expand All @@ -59,8 +56,6 @@
PREDICATE_MODIFIER_NOT,
PREFIX_MAP_MODES,
RDFS_SUBCLASS_OF,
SCHEMA_DICT,
SCHEMA_YAML,
SEMAPV,
SKOS_BROAD_MATCH,
SKOS_CLOSE_MATCH,
Expand All @@ -72,6 +67,7 @@
SUBJECT_ID,
SUBJECT_LABEL,
SUBJECT_SOURCE,
SSSOMSchemaView,
)
from .context import (
SSSOM_BUILT_IN_PREFIXES,
Expand Down Expand Up @@ -832,7 +828,7 @@ def inject_metadata_into_df(msdf: MappingSetDataFrame) -> MappingSetDataFrame:
:return: MappingSetDataFrame with metadata as columns
"""
# TODO Check if 'k' is a valid 'slot' for 'mapping' [sssom.yaml]
with open(SCHEMA_YAML) as file:
with open(schema_view_object.yaml) as file:
schema = yaml.safe_load(file)
slots = schema["classes"]["mapping"]["slots"]
hrshdhgd marked this conversation as resolved.
Show resolved Hide resolved
if msdf.metadata is not None and msdf.df is not None:
Expand Down Expand Up @@ -957,8 +953,8 @@ def to_mapping_set_dataframe(doc: MappingSetDocument) -> MappingSetDataFrame:
data = []
slots_with_double_as_range = [
s
for s in SCHEMA_DICT["slots"].keys()
if SCHEMA_DICT["slots"][s]["range"] == "double"
for s in schema_dict["slots"].keys()
if schema_dict["slots"][s]["range"] == "double"
]
if doc.mapping_set.mappings is not None:
for mapping in doc.mapping_set.mappings:
Expand Down Expand Up @@ -991,19 +987,19 @@ def get_dict_from_mapping(map_obj: Union[Any, Dict[Any, Any], SSSOM_Mapping]) ->
map_dict = {}
slots_with_double_as_range = [
s
for s in SCHEMA_DICT["slots"].keys()
if SCHEMA_DICT["slots"][s]["range"] == "double"
for s in schema_dict["slots"].keys()
if schema_dict["slots"][s]["range"] == "double"
]
for property in map_obj:
if map_obj[property] is not None:
if isinstance(map_obj[property], list):
# IF object is an enum
if (
SCHEMA_DICT["slots"][property]["range"]
in SCHEMA_DICT["enums"].keys()
schema_dict["slots"][property]["range"]
in schema_dict["enums"].keys()
):
# IF object is a multivalued enum
if SCHEMA_DICT["slots"][property]["multivalued"]:
if schema_dict["slots"][property]["multivalued"]:
map_dict[property] = "|".join(
enum_value.code.text for enum_value in map_obj[property]
)
Expand All @@ -1019,8 +1015,8 @@ def get_dict_from_mapping(map_obj: Union[Any, Dict[Any, Any], SSSOM_Mapping]) ->
else:
# IF object is an enum
if (
SCHEMA_DICT["slots"][property]["range"]
in SCHEMA_DICT["enums"].keys()
schema_dict["slots"][property]["range"]
in schema_dict["enums"].keys()
):
map_dict[property] = map_obj[property].code.text
else:
Expand All @@ -1040,6 +1036,18 @@ class NoCURIEException(ValueError):


CURIE_RE = re.compile(r"[A-Za-z0-9_.]+[:][A-Za-z0-9_]")
schema_view_object = SSSOMSchemaView()
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is again in the global context, I think. Won't this be invoked any time anyone imports anything from the `util` module?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Otherwise I will have to repeat this code in 4 different places in the same file, which seems messy.

schema_view = schema_view_object.view
schema_dict = schema_view_object.dict
mapping_set_slots = schema_view_object.mapping_set_slots
multivalued_slots = [
c for c in schema_view.all_slots() if schema_view.get_slot(c).multivalued
]
entity_reference_slots = [
c
for c in schema_view.all_slots()
if schema_view.get_slot(c).range == schema_view_object.entity_reference
]
hrshdhgd marked this conversation as resolved.
Show resolved Hide resolved


def is_curie(string: str) -> bool:
Expand Down Expand Up @@ -1100,7 +1108,7 @@ def get_prefixes_used_in_table(df: pd.DataFrame) -> List[str]:
"""Get a list of prefixes used in CURIEs in key feature columns in a dataframe."""
prefixes = SSSOM_BUILT_IN_PREFIXES
if not df.empty:
for col in ENTITY_REFERENCE_SLOTS:
for col in entity_reference_slots:
if col in df.columns:
for v in df[col].values:
pref = get_prefix_from_curie(str(v))
Expand Down Expand Up @@ -1266,7 +1274,7 @@ def is_multivalued_slot(slot: str) -> bool:
# view = SchemaView('schema/sssom.yaml')
# return view.get_slot(slot).multivalued

return slot in MULTIVALUED_SLOTS
return slot in multivalued_slots


def reconcile_prefix_and_data(
Expand Down Expand Up @@ -1329,7 +1337,7 @@ def reconcile_prefix_and_data(
# Data editing
if len(data_switch_dict) > 0:
# Read schema file
slots = SCHEMA_DICT["slots"]
slots = schema_dict["slots"]
entity_reference_columns = [
k for k, v in slots.items() if v["range"] == "EntityReference"
]
Expand Down Expand Up @@ -1359,7 +1367,7 @@ def sort_df_rows_columns(
"""
if by_columns and len(df.columns) > 0:
column_sequence = [
col for col in SCHEMA_DICT["slots"].keys() if col in df.columns
col for col in schema_dict["slots"].keys() if col in df.columns
]
df = df.reindex(column_sequence, axis=1)
if by_rows and len(df) > 0:
Expand All @@ -1380,7 +1388,7 @@ def get_all_prefixes(msdf: MappingSetDataFrame) -> list:
metadata_keys = list(msdf.metadata.keys())
df_columns_list = msdf.df.columns.to_list() # type: ignore
all_keys = metadata_keys + df_columns_list
ent_ref_slots = [s for s in all_keys if s in ENTITY_REFERENCE_SLOTS]
ent_ref_slots = [s for s in all_keys if s in entity_reference_slots]

for slot in ent_ref_slots:
if slot in metadata_keys:
Expand Down Expand Up @@ -1437,7 +1445,7 @@ def augment_metadata(
if msdf.metadata:
for k, v in meta.items():
# If slot is multivalued, add to list.
if k in MULTIVALUED_SLOTS and not replace_multivalued:
if k in multivalued_slots and not replace_multivalued:
tmp_value: list = []
if isinstance(msdf.metadata[k], str):
tmp_value = [msdf.metadata[k]]
Expand All @@ -1450,7 +1458,7 @@ def augment_metadata(
)
tmp_value.extend(v)
msdf.metadata[k] = list(set(tmp_value))
elif k in MULTIVALUED_SLOTS and replace_multivalued:
elif k in multivalued_slots and replace_multivalued:
msdf.metadata[k] = list(v)
else:
msdf.metadata[k] = v[0]
Expand All @@ -1469,10 +1477,10 @@ def are_params_slots(params: dict) -> bool:
if len(empty_params) > 0:
logging.info(f"Parameters: {empty_params.keys()} has(ve) no value.")

legit_params = all(p in MAPPING_SET_SLOTS for p in params.keys())
legit_params = all(p in mapping_set_slots for p in params.keys())
if not legit_params:
invalids = [p for p in params if p not in MAPPING_SET_SLOTS]
invalids = [p for p in params if p not in mapping_set_slots]
raise ValueError(
f"The params are invalid: {invalids}. Should be any of the following: {MAPPING_SET_SLOTS}"
f"The params are invalid: {invalids}. Should be any of the following: {mapping_set_slots}"
)
return True
6 changes: 3 additions & 3 deletions sssom/validators.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,11 @@
from linkml.validators.sparqlvalidator import SparqlDataValidator # noqa: F401
from sssom_schema import MappingSet

from sssom.constants import SchemaValidationType, SSSOMSchemaView
from sssom.context import add_built_in_prefixes_to_prefix_map
from sssom.parsers import to_mapping_set_document
from sssom.util import MappingSetDataFrame, get_all_prefixes

from .constants import SCHEMA_YAML, SchemaValidationType


def validate(
msdf: MappingSetDataFrame, validation_types: List[SchemaValidationType]
Expand All @@ -37,7 +36,8 @@ def validate_json_schema(msdf: MappingSetDataFrame) -> None:

:param msdf: MappingSetDataFrame to be validated.
"""
validator = JsonSchemaDataValidator(SCHEMA_YAML)
schema_view_object = SSSOMSchemaView()
validator = JsonSchemaDataValidator(schema_view_object.yaml)
mapping_set = to_mapping_set_document(msdf).mapping_set
validator.validate_object(mapping_set, MappingSet)

Expand Down
4 changes: 2 additions & 2 deletions sssom/writers.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@
# from .sssom_datamodel import slots
from sssom_schema import slots

from sssom.constants import SSSOMSchemaView
from sssom.validators import check_all_prefixes_in_curie_map

from .constants import SCHEMA_YAML
from .parsers import to_mapping_set_document
from .util import (
PREFIX_MAP_KEY,
Expand Down Expand Up @@ -284,7 +284,7 @@ def to_rdf_graph(msdf: MappingSetDataFrame) -> Graph:
# os.remove("sssom.ttl") # remove the intermediate file.
graph = rdflib_dumper.as_rdf_graph(
element=doc.mapping_set,
schemaview=SchemaView(SCHEMA_YAML),
schemaview=SchemaView(SSSOMSchemaView().yaml),
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a bit weird... :D

prefix_map=msdf.prefix_map,
)
return graph
Expand Down
4 changes: 2 additions & 2 deletions tests/test_resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@
import os
import unittest

from sssom.constants import SCHEMA_YAML
from sssom.constants import SSSOMSchemaView


class TestResources(unittest.TestCase):
"""A test case for resource availability checks."""

def test_exists(self):
"""Test the schema YAML file is available to the package."""
self.assertTrue(os.path.exists(SCHEMA_YAML))
self.assertTrue(os.path.exists(SSSOMSchemaView().yaml))