-
Notifications
You must be signed in to change notification settings - Fork 10
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
SchemaView
object should be instantiated when needed & not globally.
#323
Changes from 5 commits
5d2db85
b6f4dd2
16c172a
c7b14ce
0be2f7b
66796c0
a216903
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,15 +11,6 @@ | |
|
||
HERE = pathlib.Path(__file__).parent.resolve() | ||
|
||
SCHEMA_YAML = pkg_resources.resource_filename( | ||
"sssom_schema", "schema/sssom_schema.yaml" | ||
) | ||
SCHEMA_VIEW = SchemaView(SCHEMA_YAML) | ||
# SCHEMA_VIEW = package_schemaview("sssom_schema") | ||
SCHEMA_DICT = schema_as_dict(SCHEMA_VIEW.schema) | ||
MAPPING_SLOTS = SCHEMA_DICT["classes"]["mapping"]["slots"] | ||
MAPPING_SET_SLOTS = SCHEMA_DICT["classes"]["mapping set"]["slots"] | ||
|
||
OWL_EQUIV_CLASS = "http://www.w3.org/2002/07/owl#equivalentClass" | ||
RDFS_SUBCLASS_OF = "http://www.w3.org/2000/01/rdf-schema#subClassOf" | ||
|
||
|
@@ -42,16 +33,6 @@ | |
PREFIX_MAP_MODE_SSSOM_DEFAULT_ONLY, | ||
PREFIX_MAP_MODE_MERGED, | ||
] | ||
ENTITY_REFERENCE = "EntityReference" | ||
|
||
MULTIVALUED_SLOTS = [ | ||
c for c in SCHEMA_VIEW.all_slots() if SCHEMA_VIEW.get_slot(c).multivalued | ||
] | ||
ENTITY_REFERENCE_SLOTS = [ | ||
c | ||
for c in SCHEMA_VIEW.all_slots() | ||
if SCHEMA_VIEW.get_slot(c).range == ENTITY_REFERENCE | ||
] | ||
|
||
# Slot Constants | ||
MIRROR_FROM = "mirror_from" | ||
|
@@ -177,3 +158,19 @@ class SchemaValidationType(str, Enum): | |
SchemaValidationType.JsonSchema, | ||
SchemaValidationType.PrefixMapCompleteness, | ||
] | ||
|
||
|
||
class SSSOMSchemaView: | ||
""" | ||
SchemaView class from linkml which is instantiated when necessary. | ||
|
||
Reason for this: https://github.com/mapping-commons/sssom-py/issues/322 | ||
Implemented via PR: https://github.com/mapping-commons/sssom-py/pull/323 | ||
""" | ||
|
||
entity_reference = "EntityReference" | ||
yaml = pkg_resources.resource_filename("sssom_schema", "schema/sssom_schema.yaml") | ||
view = SchemaView(yaml) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I am not sure, but you may have to use `__init__()` to avoid this code being run whenever the class is referenced: https://stackoverflow.com/questions/9056957/correct-way-to-define-class-variables-in-python But not 100% sure. |
||
dict = schema_as_dict(view.schema) | ||
mapping_slots = dict["classes"]["mapping"]["slots"] | ||
mapping_set_slots = dict["classes"]["mapping set"]["slots"] |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -40,12 +40,9 @@ | |
from .constants import ( | ||
COMMENT, | ||
CONFIDENCE, | ||
ENTITY_REFERENCE_SLOTS, | ||
MAPPING_JUSTIFICATION, | ||
MAPPING_SET_ID, | ||
MAPPING_SET_SLOTS, | ||
MAPPING_SET_SOURCE, | ||
MULTIVALUED_SLOTS, | ||
OBJECT_CATEGORY, | ||
OBJECT_ID, | ||
OBJECT_LABEL, | ||
|
@@ -59,8 +56,6 @@ | |
PREDICATE_MODIFIER_NOT, | ||
PREFIX_MAP_MODES, | ||
RDFS_SUBCLASS_OF, | ||
SCHEMA_DICT, | ||
SCHEMA_YAML, | ||
SEMAPV, | ||
SKOS_BROAD_MATCH, | ||
SKOS_CLOSE_MATCH, | ||
|
@@ -72,6 +67,7 @@ | |
SUBJECT_ID, | ||
SUBJECT_LABEL, | ||
SUBJECT_SOURCE, | ||
SSSOMSchemaView, | ||
) | ||
from .context import ( | ||
SSSOM_BUILT_IN_PREFIXES, | ||
|
@@ -832,7 +828,7 @@ def inject_metadata_into_df(msdf: MappingSetDataFrame) -> MappingSetDataFrame: | |
:return: MappingSetDataFrame with metadata as columns | ||
""" | ||
# TODO Check if 'k' is a valid 'slot' for 'mapping' [sssom.yaml] | ||
with open(SCHEMA_YAML) as file: | ||
with open(schema_view_object.yaml) as file: | ||
schema = yaml.safe_load(file) | ||
slots = schema["classes"]["mapping"]["slots"] | ||
hrshdhgd marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if msdf.metadata is not None and msdf.df is not None: | ||
|
@@ -957,8 +953,8 @@ def to_mapping_set_dataframe(doc: MappingSetDocument) -> MappingSetDataFrame: | |
data = [] | ||
slots_with_double_as_range = [ | ||
s | ||
for s in SCHEMA_DICT["slots"].keys() | ||
if SCHEMA_DICT["slots"][s]["range"] == "double" | ||
for s in schema_dict["slots"].keys() | ||
if schema_dict["slots"][s]["range"] == "double" | ||
] | ||
if doc.mapping_set.mappings is not None: | ||
for mapping in doc.mapping_set.mappings: | ||
|
@@ -991,19 +987,19 @@ def get_dict_from_mapping(map_obj: Union[Any, Dict[Any, Any], SSSOM_Mapping]) -> | |
map_dict = {} | ||
slots_with_double_as_range = [ | ||
s | ||
for s in SCHEMA_DICT["slots"].keys() | ||
if SCHEMA_DICT["slots"][s]["range"] == "double" | ||
for s in schema_dict["slots"].keys() | ||
if schema_dict["slots"][s]["range"] == "double" | ||
] | ||
for property in map_obj: | ||
if map_obj[property] is not None: | ||
if isinstance(map_obj[property], list): | ||
# IF object is an enum | ||
if ( | ||
SCHEMA_DICT["slots"][property]["range"] | ||
in SCHEMA_DICT["enums"].keys() | ||
schema_dict["slots"][property]["range"] | ||
in schema_dict["enums"].keys() | ||
): | ||
# IF object is a multivalued enum | ||
if SCHEMA_DICT["slots"][property]["multivalued"]: | ||
if schema_dict["slots"][property]["multivalued"]: | ||
map_dict[property] = "|".join( | ||
enum_value.code.text for enum_value in map_obj[property] | ||
) | ||
|
@@ -1019,8 +1015,8 @@ def get_dict_from_mapping(map_obj: Union[Any, Dict[Any, Any], SSSOM_Mapping]) -> | |
else: | ||
# IF object is an enum | ||
if ( | ||
SCHEMA_DICT["slots"][property]["range"] | ||
in SCHEMA_DICT["enums"].keys() | ||
schema_dict["slots"][property]["range"] | ||
in schema_dict["enums"].keys() | ||
): | ||
map_dict[property] = map_obj[property].code.text | ||
else: | ||
|
@@ -1040,6 +1036,18 @@ class NoCURIEException(ValueError): | |
|
||
|
||
CURIE_RE = re.compile(r"[A-Za-z0-9_.]+[:][A-Za-z0-9_]") | ||
schema_view_object = SSSOMSchemaView() | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is again in the global context, I think. Won't this be invoked any time anyone imports anything from the utils package? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I will have to repeat this code in 4 different lines in the same file. Seems messy. |
||
schema_view = schema_view_object.view | ||
schema_dict = schema_view_object.dict | ||
mapping_set_slots = schema_view_object.mapping_set_slots | ||
multivalued_slots = [ | ||
c for c in schema_view.all_slots() if schema_view.get_slot(c).multivalued | ||
] | ||
entity_reference_slots = [ | ||
c | ||
for c in schema_view.all_slots() | ||
if schema_view.get_slot(c).range == schema_view_object.entity_reference | ||
] | ||
hrshdhgd marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
|
||
def is_curie(string: str) -> bool: | ||
|
@@ -1100,7 +1108,7 @@ def get_prefixes_used_in_table(df: pd.DataFrame) -> List[str]: | |
"""Get a list of prefixes used in CURIEs in key feature columns in a dataframe.""" | ||
prefixes = SSSOM_BUILT_IN_PREFIXES | ||
if not df.empty: | ||
for col in ENTITY_REFERENCE_SLOTS: | ||
for col in entity_reference_slots: | ||
if col in df.columns: | ||
for v in df[col].values: | ||
pref = get_prefix_from_curie(str(v)) | ||
|
@@ -1266,7 +1274,7 @@ def is_multivalued_slot(slot: str) -> bool: | |
# view = SchemaView('schema/sssom.yaml') | ||
# return view.get_slot(slot).multivalued | ||
|
||
return slot in MULTIVALUED_SLOTS | ||
return slot in multivalued_slots | ||
|
||
|
||
def reconcile_prefix_and_data( | ||
|
@@ -1329,7 +1337,7 @@ def reconcile_prefix_and_data( | |
# Data editing | ||
if len(data_switch_dict) > 0: | ||
# Read schema file | ||
slots = SCHEMA_DICT["slots"] | ||
slots = schema_dict["slots"] | ||
entity_reference_columns = [ | ||
k for k, v in slots.items() if v["range"] == "EntityReference" | ||
] | ||
|
@@ -1359,7 +1367,7 @@ def sort_df_rows_columns( | |
""" | ||
if by_columns and len(df.columns) > 0: | ||
column_sequence = [ | ||
col for col in SCHEMA_DICT["slots"].keys() if col in df.columns | ||
col for col in schema_dict["slots"].keys() if col in df.columns | ||
] | ||
df = df.reindex(column_sequence, axis=1) | ||
if by_rows and len(df) > 0: | ||
|
@@ -1380,7 +1388,7 @@ def get_all_prefixes(msdf: MappingSetDataFrame) -> list: | |
metadata_keys = list(msdf.metadata.keys()) | ||
df_columns_list = msdf.df.columns.to_list() # type: ignore | ||
all_keys = metadata_keys + df_columns_list | ||
ent_ref_slots = [s for s in all_keys if s in ENTITY_REFERENCE_SLOTS] | ||
ent_ref_slots = [s for s in all_keys if s in entity_reference_slots] | ||
|
||
for slot in ent_ref_slots: | ||
if slot in metadata_keys: | ||
|
@@ -1437,7 +1445,7 @@ def augment_metadata( | |
if msdf.metadata: | ||
for k, v in meta.items(): | ||
# If slot is multivalued, add to list. | ||
if k in MULTIVALUED_SLOTS and not replace_multivalued: | ||
if k in multivalued_slots and not replace_multivalued: | ||
tmp_value: list = [] | ||
if isinstance(msdf.metadata[k], str): | ||
tmp_value = [msdf.metadata[k]] | ||
|
@@ -1450,7 +1458,7 @@ def augment_metadata( | |
) | ||
tmp_value.extend(v) | ||
msdf.metadata[k] = list(set(tmp_value)) | ||
elif k in MULTIVALUED_SLOTS and replace_multivalued: | ||
elif k in multivalued_slots and replace_multivalued: | ||
msdf.metadata[k] = list(v) | ||
else: | ||
msdf.metadata[k] = v[0] | ||
|
@@ -1469,10 +1477,10 @@ def are_params_slots(params: dict) -> bool: | |
if len(empty_params) > 0: | ||
logging.info(f"Parameters: {empty_params.keys()} has(ve) no value.") | ||
|
||
legit_params = all(p in MAPPING_SET_SLOTS for p in params.keys()) | ||
legit_params = all(p in mapping_set_slots for p in params.keys()) | ||
if not legit_params: | ||
invalids = [p for p in params if p not in MAPPING_SET_SLOTS] | ||
invalids = [p for p in params if p not in mapping_set_slots] | ||
raise ValueError( | ||
f"The params are invalid: {invalids}. Should be any of the following: {MAPPING_SET_SLOTS}" | ||
f"The params are invalid: {invalids}. Should be any of the following: {mapping_set_slots}" | ||
) | ||
return True |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,9 +16,9 @@ | |
# from .sssom_datamodel import slots | ||
from sssom_schema import slots | ||
|
||
from sssom.constants import SSSOMSchemaView | ||
from sssom.validators import check_all_prefixes_in_curie_map | ||
|
||
from .constants import SCHEMA_YAML | ||
from .parsers import to_mapping_set_document | ||
from .util import ( | ||
PREFIX_MAP_KEY, | ||
|
@@ -284,7 +284,7 @@ def to_rdf_graph(msdf: MappingSetDataFrame) -> Graph: | |
# os.remove("sssom.ttl") # remove the intermediate file. | ||
graph = rdflib_dumper.as_rdf_graph( | ||
element=doc.mapping_set, | ||
schemaview=SchemaView(SCHEMA_YAML), | ||
schemaview=SchemaView(SSSOMSchemaView().yaml), | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is a bit weird... :D |
||
prefix_map=msdf.prefix_map, | ||
) | ||
return graph | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I wonder if this will defeat the purpose of the refactoring. I should have thought of this before.
I am wondering now if we need to find a way to skip network access in SchemaView()?