Skip to content
21 changes: 21 additions & 0 deletions sssom/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -675,5 +675,26 @@ def annotate(input: str, output: TextIO, replace_multivalued: bool, **kwargs):
)


@main.command()
@input_argument
@click.option(
"--remove-map",
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is great, we can later consider making this multivalued (not now). Thank you!

type=click.Path(),
help="Mapping file path that needs to be removed from input.",
)
@output_option
def remove(input: str, output: TextIO, remove_map: str):
    """Remove mappings from an input mapping.

    :param input: Input SSSOM tsv file.
    :param output: Output path.
    :param remove_map: Mapping to be removed.
    """
    # NOTE: the docstring above is what click prints as this command's help
    # text, so its wording is part of the CLI output.
    remaining = parse_sssom_table(input)
    unwanted = parse_sssom_table(remove_map)
    # remove_mappings() rewrites ``remaining`` in place; nothing is returned.
    remaining.remove_mappings(unwanted)
    write_table(remaining, output)


# Invoke the ``main`` CLI entry point only when this module is run as a script,
# not when it is imported.
if __name__ == "__main__":
main()
2 changes: 2 additions & 0 deletions sssom/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,6 +360,8 @@ def filter_file(input: str, output: TextIO, **kwargs) -> MappingSetDataFrame:
for exp in v[1:]:
query += " OR "
query += k + " LIKE '" + exp + "') "
else:
query += ") "
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is new; I assume there was no test for the else case?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This branch was missing, which became apparent when only one value was given for a parameter (e.g. `--subject_id ABCD:1234`) instead of several (`--subject_id ABCD:1234 --subject_id DEFG:4567`). Without the closing `)` that is added in the code above, the query raised an error. There was no test covering this case.

if multiple_params and idx != len(params):
query += " AND ("
return run_sql_query(query=query, inputs=[input], output=output)
Expand Down
59 changes: 54 additions & 5 deletions sssom/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,28 @@ def clean_prefix_map(self) -> None:
self.df = filter_out_prefixes(self.df, missing_prefixes)
self.prefix_map = new_prefixes

def remove_mappings(self, msdf: "MappingSetDataFrame"):
    """Remove mappings in right msdf from left msdf.

    A row of ``self.df`` is dropped when its values in the ``KEY_FEATURES``
    columns match those of any row in ``msdf.df``; all other columns are
    ignored for the comparison. ``self.df`` is replaced by the surviving rows
    (original order kept, index renumbered from 0) and the prefix map is
    cleaned afterwards via ``clean_prefix_map``.

    :param msdf: MappingSetDataframe object to be removed from primary msdf object.
    """
    # Only the key columns of ``msdf`` take part in the join. Carrying its
    # other columns along would add suffixed / all-NaN columns to the result
    # that then have to be stripped again by name.
    keys_to_remove = msdf.df[list(KEY_FEATURES)].drop_duplicates()

    # Left anti-join: keep the rows of ``self.df`` that found no partner.
    # A left join never adds right-only rows, so the dtypes and the row order
    # of ``self.df`` are left alone.
    merged = pd.merge(
        self.df,
        keys_to_remove,
        on=KEY_FEATURES,
        how="left",
        indicator=True,
    )
    self.df = (
        merged[merged["_merge"] == "left_only"]
        .drop(columns="_merge")
        .reset_index(drop=True)
    )
    self.clean_prefix_map()


@dataclass
class EntityPair:
Expand Down Expand Up @@ -990,27 +1012,54 @@ def get_prefixes_used_in_table(df: pd.DataFrame) -> List[str]:
return list(set(prefixes))


def filter_out_prefixes(
    df: pd.DataFrame, filter_prefixes: List[str], features: list = KEY_FEATURES
) -> pd.DataFrame:
    """Filter any row where a CURIE in one of the key column uses one of the given prefixes.

    A row is kept only when none of the CURIEs found in its ``features``
    columns has a prefix listed in ``filter_prefixes``.

    :param df: Pandas DataFrame
    :param filter_prefixes: List of prefixes
    :param features: List of dataframe column names dataframe to consider
    :return: Pandas Dataframe with the surviving rows; when no row survives,
        an empty DataFrame that has only the ``features`` columns
    """
    filter_prefix_set = set(filter_prefixes)
    rows = []

    for _, row in df.iterrows():
        # Prefixes of the CURIEs in the selected columns of this row.
        prefixes = {get_prefix_from_curie(curie) for curie in row[features]}
        # Keep the row only if it shares no prefix with the filter set.
        if prefixes.isdisjoint(filter_prefix_set):
            rows.append(row)
    if rows:
        return pd.DataFrame(rows)
    return pd.DataFrame(columns=features)


def filter_prefixes(
    df: pd.DataFrame, filter_prefixes: List[str], features: list = KEY_FEATURES
) -> pd.DataFrame:
    """Keep only the rows whose CURIEs in the chosen columns all use one of the given prefixes.

    :param df: Pandas DataFrame
    :param filter_prefixes: List of prefixes a row's CURIEs must all belong to
    :param features: List of dataframe column names to consider
    :return: Pandas Dataframe with the matching rows; when no row matches,
        an empty DataFrame that has only the ``features`` columns
    """
    allowed = set(filter_prefixes)
    # A row qualifies when the set of prefixes it uses is contained in ``allowed``.
    kept = [
        record
        for _, record in df.iterrows()
        if {get_prefix_from_curie(curie) for curie in record[features]} <= allowed
    ]
    return pd.DataFrame(kept) if kept else pd.DataFrame(columns=features)


# TODO this is not used anywhere
Expand Down
22 changes: 22 additions & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
partition,
ptable,
reconcile_prefixes,
remove,
sort,
split,
validate,
Expand All @@ -33,6 +34,8 @@
test_out_dir,
)

from .constants import data_dir


class SSSOMCLITestSuite(unittest.TestCase):
"""A test case for the dynamic CLI tests."""
Expand Down Expand Up @@ -64,6 +67,7 @@ def test_cli_single_input(self):
self.run_sort_rows_columns(runner, test)
self.run_filter(runner, test)
self.run_annotate(runner, test)
self.run_remove(runner, test)

self.assertTrue(len(test_cases) > 2)

Expand Down Expand Up @@ -336,3 +340,21 @@ def run_annotate(self, runner: CliRunner, test_case: SSSOMTestCase) -> Result:
)
self.run_successful(result, test_case)
return result

def run_remove(self, runner: CliRunner, test_case: SSSOMTestCase) -> Result:
    """Test removal of mappings.

    Invokes the ``remove`` command on the test case's file with
    ``basic3.tsv`` as the set of mappings to remove.

    :param runner: Click test runner used to invoke the command.
    :param test_case: Test case providing the input file path.
    :return: Result of the CLI invocation.
    """
    # ``out_file`` already lives in ``test_out_dir``; joining the directory a
    # second time would nest the path whenever ``test_out_dir`` is relative.
    out_file = os.path.join(test_out_dir, "remove_map_test.tsv")
    in_file = test_case.filepath
    rm_file = os.path.join(data_dir, "basic3.tsv")
    result = runner.invoke(
        remove,
        [
            in_file,
            "-o",
            out_file,
            "--remove-map",
            rm_file,
        ],
    )
    self.run_successful(result, test_case)
    return result
36 changes: 36 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,21 @@
"""Test for merging MappingSetDataFrames."""
import unittest

from sssom.constants import OBJECT_ID, SUBJECT_ID
from sssom.io import extract_iri
from sssom.parsers import parse_sssom_table
from sssom.util import MappingSetDataFrame, filter_out_prefixes, filter_prefixes
from tests.constants import data_dir


class TestIO(unittest.TestCase):
"""A test case for merging msdfs."""

def setUp(self) -> None:
    """Load ``basic.tsv`` and pick the columns used by the prefix-filter tests."""
    self.features = [SUBJECT_ID, OBJECT_ID]
    self.msdf = parse_sssom_table(f"{data_dir}/basic.tsv")

def test_broken_predicate_list(self):
"""Test merging of multiple msdfs."""
pred_filter_list = ["skos:relatedMatch", f"{data_dir}/predicate_list3.txt"]
Expand All @@ -18,3 +26,31 @@ def test_broken_predicate_list(self):
if p_iri:
iri_list.extend(p_iri)
self.assertEqual(3, len(iri_list))

def test_filter_prefixes(self):
    """Test keeping only the MSDF.df rows whose prefixes are all in the given list."""
    kept_df = filter_prefixes(self.msdf.df, ["x", "y"], self.features)
    self.assertEqual(len(kept_df), 40)

def test_filter_out_prefixes(self):
    """Test dropping the MSDF.df rows that use any of the given prefixes."""
    remaining_df = filter_out_prefixes(self.msdf.df, ["x", "y"], self.features)
    self.assertEqual(len(remaining_df), 5)

def test_remove_mappings(self):
    """Test that removing a sub-mapping-set shrinks the MSDF by that many rows."""
    # Build the set to remove from rows of the same table, so every one of
    # them is expected to be found and removed.
    subset_df = filter_out_prefixes(self.msdf.df, ["x", "y"], self.features)
    subset_msdf = MappingSetDataFrame(
        df=subset_df,
        prefix_map=self.msdf.prefix_map,
        metadata=self.msdf.metadata,
    )
    expected_length = len(self.msdf.df) - len(subset_msdf.df)
    self.msdf.remove_mappings(subset_msdf)
    self.assertEqual(len(self.msdf.df), expected_length)