Skip to content
4 changes: 2 additions & 2 deletions sssom/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ def convert(input: str, output: TextIO, output_format: str):
)
@click.option(
"-E",
"--embedded-mode",
"--embedded-mode / --non-embedded-mode",
default=True,
is_flag=True,
help="If False, the resultant SSSOM file will be saved\
Expand Down Expand Up @@ -484,7 +484,7 @@ def correlations(input: str, output: TextIO, transpose: bool, fields: Tuple):
help="Boolean indicating the need for reconciliation of the SSSOM tsv file.",
)
@output_option
def merge(inputs: str, output: TextIO, reconcile: bool = True):
def merge(inputs: str, output: TextIO, reconcile: bool = False):
"""Merge multiple MappingSetDataFrames into one .

if reconcile=True, then dedupe(remove redundant lower confidence mappings) and
Expand Down
29 changes: 19 additions & 10 deletions sssom/cliques.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,17 @@
# from .sssom_datamodel import Mapping
from sssom_schema import Mapping

from sssom.constants import (
OWL_DIFFERENT_FROM,
OWL_EQUIVALENT_CLASS,
RDFS_SUBCLASS_OF,
SKOS_BROAD_MATCH,
SKOS_CLOSE_MATCH,
SKOS_EXACT_MATCH,
SKOS_NARROW_MATCH,
SSSOM_SUPERCLASS_OF,
)

from .parsers import to_mapping_set_document
from .sssom_document import MappingSetDocument
from .util import MappingSetDataFrame
Expand All @@ -31,24 +42,22 @@ def to_digraph(msdf: MappingSetDataFrame) -> nx.DiGraph:

pi = None

if p == "owl:equivalentClass":
if p == OWL_EQUIVALENT_CLASS:
pi = 2
elif p == "skos:exactMatch":
elif p == SKOS_EXACT_MATCH:
pi = 2
elif p == "skos:closeMatch":
elif p == SKOS_CLOSE_MATCH:
# TODO: consider distributing
pi = 2
elif p == "owl:subClassOf":
elif p == RDFS_SUBCLASS_OF:
pi = 0
elif p == "skos:broadMatch":
elif p == SKOS_BROAD_MATCH:
pi = 0
elif p == "inverseOf(owl:subClassOf)":
elif p == SSSOM_SUPERCLASS_OF:
pi = 1
elif p == "skos:narrowMatch":
elif p == SKOS_NARROW_MATCH:
pi = 1
elif p == "owl:differentFrom":
pi = 3
elif p == "dbpedia-owl:different":
elif p == OWL_DIFFERENT_FROM:
pi = 3

if pi == 0:
Expand Down
32 changes: 32 additions & 0 deletions sssom/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,38 @@
SUBJECT_SOURCE_ID = "subject_source_id"
OBJECT_SOURCE_ID = "object_source_id"

# PREDICATES
# CURIE strings for the mapping predicates referenced by name across the package.
OWL_EQUIVALENT_CLASS = "owl:equivalentClass"
OWL_EQUIVALENT_PROPERTY = "owl:equivalentProperty"
OWL_DIFFERENT_FROM = "owl:differentFrom"
RDFS_SUBCLASS_OF = "rdfs:subClassOf"
RDFS_SUBPROPERTY_OF = "rdfs:subPropertyOf"
OWL_SAME_AS = "owl:sameAs"
SKOS_EXACT_MATCH = "skos:exactMatch"
SKOS_CLOSE_MATCH = "skos:closeMatch"
SKOS_BROAD_MATCH = "skos:broadMatch"
SKOS_NARROW_MATCH = "skos:narrowMatch"
OBO_HAS_DB_XREF = "oboInOwl:hasDbXref"
SKOS_RELATED_MATCH = "skos:relatedMatch"
# NOTE: the constant is named RDF_* but the CURIE lives in the rdfs: namespace.
RDF_SEE_ALSO = "rdfs:seeAlso"
# NOTE(review): this value wraps "owl:subClassOf" whereas RDFS_SUBCLASS_OF is
# "rdfs:subClassOf" - confirm that the differing prefix is intentional.
SSSOM_SUPERCLASS_OF = "inverseOf(owl:subClassOf)"

# Predicates in descending order of precedence. The order is significant:
# get_row_based_on_hierarchy() in sssom/util.py walks this list from the top
# and keeps the rows of the first predicate it finds.
# OWL_DIFFERENT_FROM is not part of this list.
PREDICATE_LIST = [
OWL_EQUIVALENT_CLASS,
OWL_EQUIVALENT_PROPERTY,
RDFS_SUBCLASS_OF,
SSSOM_SUPERCLASS_OF,
RDFS_SUBPROPERTY_OF,
OWL_SAME_AS,
SKOS_EXACT_MATCH,
SKOS_CLOSE_MATCH,
SKOS_BROAD_MATCH,
SKOS_NARROW_MATCH,
OBO_HAS_DB_XREF,
SKOS_RELATED_MATCH,
RDF_SEE_ALSO,
]


class SEMAPV(Enum):
"""SEMAPV Enum containing different mapping_justification."""
Expand Down
112 changes: 97 additions & 15 deletions sssom/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,24 @@
OBJECT_ID,
OBJECT_LABEL,
OBJECT_SOURCE,
OBO_HAS_DB_XREF,
OWL_DIFFERENT_FROM,
OWL_EQUIVALENT_CLASS,
PREDICATE_ID,
PREDICATE_LIST,
PREDICATE_MODIFIER,
PREDICATE_MODIFIER_NOT,
PREFIX_MAP_MODES,
RDFS_SUBCLASS_OF,
SCHEMA_DICT,
SCHEMA_YAML,
SEMAPV,
SKOS_BROAD_MATCH,
SKOS_CLOSE_MATCH,
SKOS_EXACT_MATCH,
SKOS_NARROW_MATCH,
SKOS_RELATED_MATCH,
SSSOM_SUPERCLASS_OF,
SUBJECT_CATEGORY,
SUBJECT_ID,
SUBJECT_LABEL,
Expand Down Expand Up @@ -289,13 +300,83 @@ def filter_redundant_rows(
# will be removed from pandas in a future version.
# Use pandas.concat instead.
# return_df = df.append(nan_df).drop_duplicates()
return_df = pd.concat([df, nan_df]).drop_duplicates()
confidence_reconciled_df = pd.concat([df, nan_df]).drop_duplicates()

# Reconciling dataframe rows based on the predicates with equal confidence.
if PREDICATE_MODIFIER in confidence_reconciled_df.columns:
tmp_df = confidence_reconciled_df[
[SUBJECT_ID, OBJECT_ID, PREDICATE_ID, CONFIDENCE, PREDICATE_MODIFIER]
]
tmp_df = tmp_df[tmp_df[PREDICATE_MODIFIER] != PREDICATE_MODIFIER_NOT].drop(
PREDICATE_MODIFIER, axis=1
)
else:
tmp_df = confidence_reconciled_df[
[SUBJECT_ID, OBJECT_ID, PREDICATE_ID, CONFIDENCE]
]
tmp_df_grp = tmp_df.groupby(
[SUBJECT_ID, OBJECT_ID, CONFIDENCE], as_index=False
).count()
tmp_df_grp = tmp_df_grp[tmp_df_grp[PREDICATE_ID] > 1].drop(PREDICATE_ID, axis=1)
non_predicate_reconciled_df = (
confidence_reconciled_df.merge(
tmp_df_grp, on=list(tmp_df_grp.columns), how="left", indicator=True
)
.query('_merge == "left_only"')
.drop(columns="_merge")
)

multiple_predicate_df = (
confidence_reconciled_df.merge(
tmp_df_grp, on=list(tmp_df_grp.columns), how="right", indicator=True
)
.query('_merge == "both"')
.drop(columns="_merge")
)

return_df = non_predicate_reconciled_df
for _, row in tmp_df_grp.iterrows():
logic_df = multiple_predicate_df[list(tmp_df_grp.columns)] == row
concerned_row_index = (
logic_df[logic_df[list(tmp_df_grp.columns)]].dropna().index
)
concerned_df = multiple_predicate_df.iloc[concerned_row_index]
# Go down the hierarchical list of PREDICATE_LIST and grab the first match
return_df = pd.concat(
[get_row_based_on_hierarchy(concerned_df), return_df], axis=0
).drop_duplicates()

if return_df[CONFIDENCE].isnull().all():
return_df = return_df.drop(columns=[CONFIDENCE], axis=1)
return return_df


def get_row_based_on_hierarchy(df: pd.DataFrame) -> pd.DataFrame:
    """Get the rows whose predicate ranks highest in the predicate hierarchy.

    The hierarchy is the order of ``PREDICATE_LIST``:
    # owl:equivalentClass
    # owl:equivalentProperty
    # rdfs:subClassOf
    # inverseOf(owl:subClassOf)
    # rdfs:subPropertyOf
    # owl:sameAs
    # skos:exactMatch
    # skos:closeMatch
    # skos:broadMatch
    # skos:narrowMatch
    # oboInOwl:hasDbXref
    # skos:relatedMatch
    # rdfs:seeAlso

    :param df: Dataframe containing multiple predicates for same subject and object.
    :return: Dataframe holding every row of ``df`` that carries the
        highest-ranking predicate present. If none of the predicates in ``df``
        is part of the hierarchy, ``df`` is returned unchanged so that no
        mapping is lost.
    """
    # Look the predicate column up once instead of once per candidate.
    predicates = df[PREDICATE_ID]
    for pred in PREDICATE_LIST:
        hierarchical_df = df[predicates == pred]
        if not hierarchical_df.empty:
            return hierarchical_df
    # No predicate is ranked (e.g. all of them are outside PREDICATE_LIST).
    # Falling through would return None, which pd.concat silently drops, so
    # the rows would vanish from the caller's result; keep them all instead.
    return df


def assign_default_confidence(
df: pd.DataFrame,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
Expand Down Expand Up @@ -430,29 +511,27 @@ def dataframe_to_ptable(df: pd.DataFrame, *, inverse_factor: float = None):
residual_confidence = (1 - (confidence + inverse_confidence)) / 2.0

predicate = row[PREDICATE_ID]
if predicate == "owl:equivalentClass":
if predicate == OWL_EQUIVALENT_CLASS:
predicate_type = PREDICATE_EQUIVALENT
elif predicate == "skos:exactMatch":
elif predicate == SKOS_EXACT_MATCH:
predicate_type = PREDICATE_EQUIVALENT
elif predicate == "skos:closeMatch":
elif predicate == SKOS_CLOSE_MATCH:
# TODO: consider distributing
predicate_type = PREDICATE_EQUIVALENT
elif predicate == "owl:subClassOf":
elif predicate == RDFS_SUBCLASS_OF:
predicate_type = PREDICATE_SUBCLASS
elif predicate == "skos:broadMatch":
elif predicate == SKOS_BROAD_MATCH:
predicate_type = PREDICATE_SUBCLASS
elif predicate == "inverseOf(owl:subClassOf)":
elif predicate == SSSOM_SUPERCLASS_OF:
predicate_type = PREDICATE_SUPERCLASS
elif predicate == "skos:narrowMatch":
elif predicate == SKOS_NARROW_MATCH:
predicate_type = PREDICATE_SUPERCLASS
elif predicate == "owl:differentFrom":
predicate_type = PREDICATE_SIBLING
elif predicate == "dbpedia-owl:different":
elif predicate == OWL_DIFFERENT_FROM:
predicate_type = PREDICATE_SIBLING
# * Added by H2 ############################
elif predicate == "oboInOwl:hasDbXref":
elif predicate == OBO_HAS_DB_XREF:
predicate_type = PREDICATE_HAS_DBXREF
elif predicate == "skos:relatedMatch":
elif predicate == SKOS_RELATED_MATCH:
predicate_type = PREDICATE_RELATED_MATCH
# * ########################################
else:
Expand Down Expand Up @@ -538,7 +617,7 @@ def sha256sum(path: str) -> str:

def merge_msdf(
*msdfs: MappingSetDataFrame,
reconcile: bool = True,
reconcile: bool = False,
) -> MappingSetDataFrame:
"""Merge multiple MappingSetDataFrames into one.

Expand Down Expand Up @@ -573,7 +652,10 @@ def merge_msdf(
merged_msdf.df = df_merged
if reconcile:
merged_msdf.df = filter_redundant_rows(merged_msdf.df)
if PREDICATE_MODIFIER in merged_msdf.df.columns:
if (
PREDICATE_MODIFIER in merged_msdf.df.columns
and PREDICATE_MODIFIER_NOT in merged_msdf.df[PREDICATE_MODIFIER]
):
merged_msdf.df = deal_with_negation(merged_msdf.df) # deals with negation

# TODO: Add default values for license and mapping_set_id.
Expand Down
2 changes: 1 addition & 1 deletion tests/data/basic.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ z:region REGION owl:equivalentClass y:region regions semapv:ManualMappingCurati
z:tissue TISSUE owl:equivalentClass x:tissue tissue semapv:ManualMappingCuration z:example x:example rdf_matcher 0.840714406 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity tissu .
z:tissue TISSUE owl:equivalentClass x:tissue tissue semapv:LexicalMatching z:example x:example rdf_matcher 0.881856236 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity tissue .
z:tissue TISSUE owl:equivalentClass y:tissue tissues semapv:ManualMappingCuration z:example y:example rdf_matcher 0.840714406 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity tissu .
a:something XXXXX owl:subClassOf b:something xxxxxx semapv:LexicalMatching a:example b:example rdf_matcher 0.8 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity xxxxx mock data
a:something XXXXX rdfs:subClassOf b:something xxxxxx semapv:LexicalMatching a:example b:example rdf_matcher 0.8 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity xxxxx mock data
c:something YYYYY owl:equivalentClass d:something yyyyyy semapv:LexicalMatching c:example d:example rdf_matcher 0.81 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity xxxxx mock data
d:something YYYYY owl:equivalentClass Not a:something yyyyyy semapv:LexicalMatching d:example a:example rdf_matcher 0.82 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity xxxxx mock data
a:something XYXYX owl:equivalentClass c:something xyxyxy semapv:LexicalMatching a:example c:example rdf_matcher 0.83 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity xxxxx mock data
Expand Down
50 changes: 50 additions & 0 deletions tests/data/basic7.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#license: "https://creativecommons.org/publicdomain/zero/1.0/"
#mapping_set_id: https://w3id.org/sssom/mapping/tests/data/basic3.tsv
#mapping_tool: "https://github.com/cmungall/rdf_matcher"
#creator_id:
# - "orcid:1234"
#mapping_date: "2020-05-30"
#curie_map:
# a: "http://example.org/a/"
# b: "http://example.org/b/"
# c: "http://example.org/c/"
# d: "http://example.org/d/"
# rdfs: "http://www.w3.org/2000/01/rdf-schema#"
# owl: "http://www.w3.org/2002/07/owl#"
# orcid: "https://orcid.org/my-orcid?orcid="
# semapv: "https://w3id.org/semapv/"
# skos: "http://www.w3.org/2004/02/skos/core#"
subject_id subject_label predicate_id predicate_modifier object_id object_label mapping_justification subject_source object_source mapping_tool confidence subject_match_field object_match_field subject_category object_category match_string comment
a:something YYYYY owl:equivalentClass b:something yyyyyy semapv:LexicalMatching c:example d:example rdf_matcher 0.8 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity xxxxx mock data
a:something YYYYY owl:equivalentProperty b:something yyyyyy semapv:LexicalMatching c:example d:example rdf_matcher 0.8 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity xxxxx mock data
b:something XXXXX owl:equivalentProperty c:something xxxxxx semapv:LexicalMatching b:example a:example rdf_matcher 0.8 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity xxxxx mock data
b:something XXXXX rdfs:subClassOf c:something xxxxxx semapv:LexicalMatching b:example a:example rdf_matcher 0.8 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity xxxxx mock data
c:something YXYXY rdfs:subClassOf d:something yxyxyx semapv:ManualMappingCuration c:example b:example rdf_matcher 0.8 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity xxxxx mock data
c:something YXYXY rdfs:subPropertyOf d:something yxyxyx semapv:ManualMappingCuration c:example b:example rdf_matcher 0.8 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity xxxxx mock data
d:something YYYYY rdfs:subPropertyOf c:something yyyyyy semapv:LexicalMatching d:example a:example rdf_matcher 0.8 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity xxxxx mock data
d:something YYYYY owl:sameAs c:something yyyyyy semapv:LexicalMatching d:example a:example rdf_matcher 0.8 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity xxxxx mock data
c:something YXYXY owl:sameAs b:something yxyxyx semapv:ManualMappingCuration c:example b:example rdf_matcher 0.8 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity xxxxx mock data
c:something YXYXY skos:exactMatch b:something yxyxyx semapv:ManualMappingCuration c:example b:example rdf_matcher 0.8 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity xxxxx mock data
b:something XXXXX skos:exactMatch a:something xxxxxx semapv:LexicalMatching b:example a:example rdf_matcher 0.8 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity xxxxx mock data
b:something XXXXX skos:closeMatch a:something xxxxxx semapv:LexicalMatching b:example a:example rdf_matcher 0.8 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity xxxxx mock data
a:something YYYYY skos:closeMatch c:something yyyyyy semapv:LexicalMatching c:example d:example rdf_matcher 0.8 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity xxxxx mock data
a:something YYYYY skos:broadMatch c:something yyyyyy semapv:LexicalMatching c:example d:example rdf_matcher 0.8 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity xxxxx mock data
b:something XXXXX skos:broadMatch d:something xxxxxx semapv:LexicalMatching b:example a:example rdf_matcher 0.8 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity xxxxx mock data
b:something XXXXX skos:narrowMatch d:something xxxxxx semapv:LexicalMatching b:example a:example rdf_matcher 0.8 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity xxxxx mock data
c:something YXYXY skos:narrowMatch a:something yxyxyx semapv:ManualMappingCuration c:example b:example rdf_matcher 0.8 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity xxxxx mock data
c:something YXYXY oboInOwl:hasDbXref a:something yxyxyx semapv:ManualMappingCuration c:example b:example rdf_matcher 0.8 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity xxxxx mock data
d:something YYYYY oboInOwl:hasDbXref b:something yyyyyy semapv:LexicalMatching d:example a:example rdf_matcher 0.8 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity xxxxx mock data
d:something YYYYY skos:relatedMatch b:something yyyyyy semapv:LexicalMatching d:example a:example rdf_matcher 0.8 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity xxxxx mock data
a:something YYYYY skos:relatedMatch d:something yyyyyy semapv:LexicalMatching c:example d:example rdf_matcher 0.8 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity xxxxx mock data
a:something YYYYY rdfs:seeAlso d:something yyyyyy semapv:LexicalMatching c:example d:example rdf_matcher 0.8 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity xxxxx mock data
d:something YYYYY rdfs:seeAlso a:something yyyyyy semapv:LexicalMatching d:example a:example rdf_matcher 0.82 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity xxxxx mock data
d:something YYYYY owl:equivalentClass Not a:something yyyyyy semapv:LexicalMatching d:example a:example rdf_matcher 0.82 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity xxxxx mock data
a:something XYXYX owl:equivalentClass c:something xyxyxy semapv:LexicalMatching a:example c:example rdf_matcher 0.83 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity xxxxx mock data
c:something YXYXY owl:equivalentClass b:something yxyxyx semapv:ManualMappingCuration c:example b:example rdf_matcher 0.845 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity xxxxx mock data
b:something XXXXX owl:equivalentClass a:something xxxxxx semapv:LexicalMatching b:example a:example rdf_matcher 0.8 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity xxxxx mock data
c:something YYYYY owl:equivalentClass d:something yyyyyy semapv:LexicalMatching c:example d:example rdf_matcher 0.81 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity xxxxx mock data
c:something YXYXY owl:equivalentClass Not b:something yxyxyx semapv:ManualMappingCuration c:example b:example rdf_matcher 0.845 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity xxxxx mock data
d:something XYXYX owl:equivalentClass b:something xyxyxy semapv:LexicalMatching d:example b:example rdf_matcher 0.83 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity xxxxx mock data
a:something XXXXX owl:subClassOf b:something xxxxxx semapv:LexicalMatching a:example b:example rdf_matcher 0.8 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity xxxxx mock data
d:something YYYYY owl:equivalentClass a:something yyyyyy semapv:ManualMappingCuration d:example a:example rdf_matcher 0.82 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity xxxxx mock data
a:something XXXXX owl:subClassOf Not b:something xxxxxx semapv:LexicalMatching a:example b:example rdf_matcher 0.8 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity xxxxx mock data
4 changes: 2 additions & 2 deletions tests/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ def setUp(self) -> None:
def test_merge_multiple_inputs(self):
"""Test merging of multiple msdfs."""
merged_msdf = merge_msdf(*self.msdfs)
self.assertEqual(123, len(merged_msdf.df))
self.assertEqual(275, len(merged_msdf.df))

def test_merge_single_input(self):
"""Test merging when a single msdf is provided."""
self.assertEqual(93, len(merge_msdf(self.msdf).df))
self.assertEqual(141, len(merge_msdf(self.msdf).df))
Loading