Skip to content

Commit

Permalink
merge command returned nothing when input msdfs had no `confidence` column (#350)
Browse files Browse the repository at this point in the history

* `merge` returned nothing without confidence

* `np.NaN` => 0.0

* added flag to indicate confidence column status

* default = False

* removed unnecessary flag return

* Update sssom/util.py

Co-authored-by: Nico Matentzoglu <nicolas.matentzoglu@gmail.com>

* Update sssom/util.py

Co-authored-by: Nico Matentzoglu <nicolas.matentzoglu@gmail.com>

* with and without reconcile tests

---------

Co-authored-by: Nico Matentzoglu <nicolas.matentzoglu@gmail.com>
  • Loading branch information
hrshdhgd and matentzn committed Mar 16, 2023
1 parent c5411d1 commit 66eb981
Show file tree
Hide file tree
Showing 5 changed files with 59 additions and 7 deletions.
2 changes: 1 addition & 1 deletion sssom/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -500,7 +500,7 @@ def correlations(input: str, output: TextIO, transpose: bool, fields: Tuple):
@click.option(
"-R",
"--reconcile",
default=True,
default=False,
help="Boolean indicating the need for reconciliation of the SSSOM tsv file.",
)
@output_option
Expand Down
9 changes: 7 additions & 2 deletions sssom/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,7 @@ def filter_redundant_rows(
# create a 'sort' method and then replace the following line by sort()
df = sort_sssom(df)
# df[CONFIDENCE] = df[CONFIDENCE].apply(lambda x: x + random.random() / 10000)
confidence_in_original = CONFIDENCE in df.columns
df, nan_df = assign_default_confidence(df)
if ignore_predicate:
key = [SUBJECT_ID, OBJECT_ID]
Expand Down Expand Up @@ -367,7 +368,7 @@ def filter_redundant_rows(
[get_row_based_on_hierarchy(concerned_df), return_df], axis=0
).drop_duplicates()

if return_df[CONFIDENCE].isnull().all():
if not confidence_in_original:
return_df = return_df.drop(columns=[CONFIDENCE], axis=1)
return return_df

Expand Down Expand Up @@ -410,7 +411,7 @@ def assign_default_confidence(
if df is not None:
new_df = df.copy()
if CONFIDENCE not in new_df.columns:
new_df[CONFIDENCE] = np.NaN
new_df[CONFIDENCE] = 0.0 # np.NaN
nan_df = pd.DataFrame(columns=new_df.columns)
else:
new_df = df[~df[CONFIDENCE].isna()]
Expand Down Expand Up @@ -711,6 +712,7 @@ def deal_with_negation(df: pd.DataFrame) -> pd.DataFrame:
"""

# Handle DataFrames with no 'confidence' column (basically adding a np.NaN to all non-numeric confidences)
confidence_in_original = CONFIDENCE in df.columns
df, nan_df = assign_default_confidence(df)
if df is None:
raise ValueError(
Expand Down Expand Up @@ -831,6 +833,9 @@ def deal_with_negation(df: pd.DataFrame) -> pd.DataFrame:
else:
return_df = reconciled_df.append(nan_df).drop_duplicates()

if not confidence_in_original:
return_df = return_df.drop(columns=[CONFIDENCE], axis=1)

return return_df


Expand Down
18 changes: 18 additions & 0 deletions tests/data/reconcile_1.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# curie_map:
# UBERON: http://purl.obolibrary.org/obo/UBERON_
# ZFS: http://purl.obolibrary.org/obo/ZFS_
# oio: http://www.geneontology.org/formats/oboInOwl#
# owl: http://www.w3.org/2002/07/owl#
# rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns#
# rdfs: http://www.w3.org/2000/01/rdf-schema#
# semapv: https://w3id.org/semapv/
# skos: http://www.w3.org/2004/02/skos/core#
# sssom: https://w3id.org/sssom/
# license: https://w3id.org/sssom/license/unspecified
# mapping_set_id: https://w3id.org/sssom/mappings/72debc9d-ca69-45e8-b46d-aef8361bedf2
# object_source: ZFS
# subject_source: UBERON
subject_id subject_label predicate_id object_id mapping_justification subject_source object_source
UBERON:0000069 larval stage oio:hasDbXref ZFS:0000048 semapv:UnspecifiedMatching UBERON ZFS
UBERON:0000105 life cycle stage oio:hasDbXref ZFS:0100000 semapv:UnspecifiedMatching UBERON ZFS
UBERON:0000105 life cycle stage oio:hasDbXref ZFS:0000000 semapv:UnspecifiedMatching UBERON ZFS
19 changes: 19 additions & 0 deletions tests/data/reconcile_2.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# curie_map:
# UBERON: http://purl.obolibrary.org/obo/UBERON_
# WBls: http://purl.obolibrary.org/obo/WBls_
# oio: http://www.geneontology.org/formats/oboInOwl#
# owl: http://www.w3.org/2002/07/owl#
# rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns#
# rdfs: http://www.w3.org/2000/01/rdf-schema#
# semapv: https://w3id.org/semapv/
# skos: http://www.w3.org/2004/02/skos/core#
# sssom: https://w3id.org/sssom/
# license: https://w3id.org/sssom/license/unspecified
# mapping_set_id: https://w3id.org/sssom/mappings/c5e357f5-86df-4aaa-a30e-8a23ad523ab2
# object_source: WBls
# subject_source: UBERON
subject_id subject_label predicate_id object_id mapping_justification subject_source object_source
UBERON:0000066 fully formed stage oio:hasDbXref WBls:0000041 semapv:UnspecifiedMatching UBERON WBls
UBERON:0000068 embryo stage oio:hasDbXref WBls:0000003 semapv:UnspecifiedMatching UBERON WBls
UBERON:0000068 embryo stage oio:hasDbXref WBls:0000102 semapv:UnspecifiedMatching UBERON WBls
UBERON:0000068 embryo stage oio:hasDbXref WBls:0000092 semapv:UnspecifiedMatching UBERON WBls
18 changes: 14 additions & 4 deletions tests/test_reconcile.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,12 +44,22 @@ def test_merge(self):
self.assertEqual(34, len(merged_msdf3.df))

def test_merge_with_reconcile(self):
"""Test merging two tables with reconciliation."""
merged_msdf = merge_msdf(self.msdf1, self.msdf2, reconcile=True)
self.assertEqual(len(merged_msdf.df), 18)

def test_merge_without_reconcile(self):
"""Test merging two tables without reconciliation."""
merged_msdf = merge_msdf(self.msdf1, self.msdf2, reconcile=False)
self.assertEqual(len(merged_msdf.df), 34)

def test_merge_with_reconcile_without_confidence(self):
"""Test merging two tables with reconciliation when neither has a confidence column."""
msdf1 = parse_sssom_table(data_dir / "basic4.tsv")
msdf2 = parse_sssom_table(data_dir / "basic5.tsv")
msdf1 = parse_sssom_table(data_dir / "reconcile_1.tsv")
msdf2 = parse_sssom_table(data_dir / "reconcile_2.tsv")

merged_msdf = merge_msdf(msdf1, msdf2, reconcile=True)

self.assertEqual(53, len(msdf1.df))
self.assertEqual(53, len(msdf2.df))
self.assertEqual(3, len(msdf1.df))
self.assertEqual(4, len(msdf2.df))
self.assertEqual(len(merged_msdf.df), (len(msdf1.df) + len(msdf2.df)))

0 comments on commit 66eb981

Please sign in to comment.