From 21cdd42819cb19f7362752d6db589f4badd0c219 Mon Sep 17 00:00:00 2001 From: Nico Matentzoglu Date: Sun, 11 Jun 2023 18:27:47 +0300 Subject: [PATCH 01/12] Update Makefile --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 0c556438..4a94e423 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ PYTHON=python -SSSOM_VERSION_TAG=0.11.0 +SSSOM_VERSION_TAG=0.12.0 DEFAULT_PREFIX_MAP="https://raw.githubusercontent.com/biopragmatics/bioregistry/main/exports/contexts/obo.context.jsonld" SSSOM_PY="https://raw.githubusercontent.com/mapping-commons/sssom/$(SSSOM_VERSION_TAG)/src/sssom_schema/datamodel/sssom_schema.py" SSSOM_YAML="https://raw.githubusercontent.com/mapping-commons/sssom/$(SSSOM_VERSION_TAG)/src/sssom_schema/schema/sssom_schema.yaml" From 8bbd3b6bd3001c47a8ad73d768f951114744ec1d Mon Sep 17 00:00:00 2001 From: Harshad Hegde Date: Mon, 12 Jun 2023 16:21:08 -0500 Subject: [PATCH 02/12] Updated KEY_FEATURES --- sssom/util.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sssom/util.py b/sssom/util.py index 64e9cc43..575dbf42 100644 --- a/sssom/util.py +++ b/sssom/util.py @@ -100,8 +100,9 @@ URI_SSSOM_MAPPINGS = f"{SSSOM_URI_PREFIX}mappings" -#: The 3 columns whose combination would be used as primary keys while merging/grouping -KEY_FEATURES = [SUBJECT_ID, PREDICATE_ID, OBJECT_ID] +#: The 4 columns whose combination would be used as primary keys while merging/grouping +KEY_FEATURES = [SUBJECT_ID, PREDICATE_ID, OBJECT_ID, PREDICATE_MODIFIER] +TRIPLES_IDS = [SUBJECT_ID, PREDICATE_ID, OBJECT_ID] @dataclass @@ -761,7 +762,7 @@ def deal_with_negation(df: pd.DataFrame) -> pd.DataFrame: # GroupBy and SELECT ONLY maximum confidence max_confidence_df: pd.DataFrame max_confidence_df = combined_normalized_subset.groupby( - KEY_FEATURES, as_index=False + TRIPLES_IDS, as_index=False )[CONFIDENCE].max() # If same confidence prefer "HumanCurated". From a5e007bed24843d9b436e248df9e1330700713a0 Mon Sep 17 00:00:00 2001 From: Harshad Hegde Date: Mon, 12 Jun 2023 16:46:18 -0500 Subject: [PATCH 03/12] formatted --- sssom/util.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sssom/util.py b/sssom/util.py index 575dbf42..3129938e 100644 --- a/sssom/util.py +++ b/sssom/util.py @@ -761,9 +761,9 @@ def deal_with_negation(df: pd.DataFrame) -> pd.DataFrame: # GroupBy and SELECT ONLY maximum confidence max_confidence_df: pd.DataFrame - max_confidence_df = combined_normalized_subset.groupby( - TRIPLES_IDS, as_index=False - )[CONFIDENCE].max() + max_confidence_df = combined_normalized_subset.groupby(TRIPLES_IDS, as_index=False)[ + CONFIDENCE + ].max() # If same confidence prefer "HumanCurated". reconciled_df_subset = pd.DataFrame(columns=combined_normalized_subset.columns) From 1843d258d369e931b96b975cd0dbae13740bf213 Mon Sep 17 00:00:00 2001 From: Harshad Hegde Date: Mon, 12 Jun 2023 17:33:16 -0500 Subject: [PATCH 04/12] updated `semapv` prefix map as sssoms-schema --- tests/data/bad_basic.tsv | 2 +- tests/data/basic.tsv | 2 +- tests/data/basic2.tsv | 2 +- tests/data/basic3.tsv | 2 +- tests/data/basic4.tsv | 2 +- tests/data/basic5.tsv | 2 +- tests/data/basic7.tsv | 2 +- tests/data/cob-to-external.tsv | 2 +- tests/data/reconcile_1.tsv | 2 +- tests/data/reconcile_2.tsv | 2 +- tests/data/test_annotate_sssom.tsv | 2 +- tests/data/test_clean_prefix.tsv | 2 +- tests/data/test_filter_sssom.tsv | 2 +- tests/data/test_inject_metadata_msdf.tsv | 2 +- 14 files changed, 14 insertions(+), 14 deletions(-) diff --git a/tests/data/bad_basic.tsv b/tests/data/bad_basic.tsv index 93b1188f..9f4e885c 100644 --- a/tests/data/bad_basic.tsv +++ b/tests/data/bad_basic.tsv @@ -10,7 +10,7 @@ # d: "http://example.org/d/" # rdfs: "http://www.w3.org/2000/01/rdf-schema#" # owl: "http://www.w3.org/2002/07/owl#" -# semapv: "https://w3id.org/semapv/" +# semapv: "https://w3id.org/semapv/vocab/" subject_id subject_label predicate_id predicate_modifier object_id object_label mapping_justification subject_source object_source mapping_tool confidence subject_match_field object_match_field subject_category object_category match_string comment c:something YYYYY owl:equivalentClass b:something yyyyyy Lexical c d rdf_matcher 0.81 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity xxxxx mock data d:something YYYYY owl:equivalentClass Not a:something yyyyyy Lexical d a rdf_matcher 0.82 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity xxxxx mock data diff --git a/tests/data/basic.tsv b/tests/data/basic.tsv index 100c2b86..6d8b4a97 100644 --- a/tests/data/basic.tsv +++ b/tests/data/basic.tsv @@ -17,7 +17,7 @@ # c: "http://example.org/c/" # d: "http://example.org/d/" # orcid: "https://orcid.org/my-orcid?orcid=" -# semapv: "https://w3id.org/semapv/" +# semapv: "https://w3id.org/semapv/vocab/" subject_id subject_label predicate_id predicate_modifier object_id object_label mapping_justification subject_source object_source mapping_tool confidence subject_match_field object_match_field subject_category object_category match_string comment x:appendage appendage owl:equivalentClass y:appendage appendages semapv:ManualMappingCuration x:example y:example rdf_matcher 0.840714406 rdfs:label|skos:prefLabel rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity appendag . x:appendage appendage owl:equivalentClass z:appendage APPENDAGE semapv:ManualMappingCuration x:example z:example rdf_matcher 0.840714406 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity appendag . diff --git a/tests/data/basic2.tsv b/tests/data/basic2.tsv index d9c6309a..40666a33 100644 --- a/tests/data/basic2.tsv +++ b/tests/data/basic2.tsv @@ -16,7 +16,7 @@ # c: "http://example.org/c/" # d: "http://example.org/d/" # orcid: "https://orcid.org/my-orcid?orcid=" -# semapv: "https://w3id.org/semapv/" +# semapv: "https://w3id.org/semapv/vocab/" subject_id subject_label predicate_id predicate_modifier object_id object_label mapping_justification subject_source object_source mapping_tool confidence subject_match_field object_match_field subject_category object_category match_string comment x:FOO FOO owl:equivalentClass y:FOO FOO semapv:SemanticSimilarityThresholdMatching x:example y:example rdf_matcher 0.840714406 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity foo . x:appendage appendage owl:equivalentClass y:appendage appendages semapv:ManualMappingCuration x:example y:example rdf_matcher 0.840714406 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity appendag . diff --git a/tests/data/basic3.tsv b/tests/data/basic3.tsv index 5ec7034b..5786645d 100644 --- a/tests/data/basic3.tsv +++ b/tests/data/basic3.tsv @@ -12,7 +12,7 @@ # rdfs: "http://www.w3.org/2000/01/rdf-schema#" # owl: "http://www.w3.org/2002/07/owl#" # orcid: "https://orcid.org/my-orcid?orcid=" -# semapv: "https://w3id.org/semapv/" +# semapv: "https://w3id.org/semapv/vocab/" subject_id subject_label predicate_id predicate_modifier object_id object_label mapping_justification subject_source object_source mapping_tool confidence subject_match_field object_match_field subject_category object_category match_string comment c:something YYYYY owl:equivalentClass b:something yyyyyy semapv:LexicalMatching c:example d:example rdf_matcher 0.81 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity xxxxx mock data d:something YYYYY owl:equivalentClass Not a:something yyyyyy semapv:LexicalMatching d:example a:example rdf_matcher 0.82 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity xxxxx mock data diff --git a/tests/data/basic4.tsv b/tests/data/basic4.tsv index fc73fb32..89773d02 100644 --- a/tests/data/basic4.tsv +++ b/tests/data/basic4.tsv @@ -13,7 +13,7 @@ # b2: "http://example.org/b2/" # c2: "http://example.org/c2/" # d2: "http://example.org/d2/" -# semapv: "https://w3id.org/semapv/" +# semapv: "https://w3id.org/semapv/vocab/" subject_id subject_label predicate_id predicate_modifier object_id object_label mapping_justification subject_source object_source mapping_tool confidence subject_match_field object_match_field subject_category object_category match_string comment x2:appendage appendage owl:equivalentClass y2:appendage appendages semapv:SemanticSimilarityThresholdMatching x y rdf_matcher 0.840714406 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity appendag . x2:appendage appendage owl:equivalentClass z2:appendage APPENDAGE semapv:SemanticSimilarityThresholdMatching x z rdf_matcher 0.840714406 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity appendag . diff --git a/tests/data/basic5.tsv b/tests/data/basic5.tsv index cb5f56fd..d8825e7c 100644 --- a/tests/data/basic5.tsv +++ b/tests/data/basic5.tsv @@ -13,7 +13,7 @@ # b1: "http://example.org/b1/" # c1: "http://example.org/c1/" # d1: "http://example.org/d1/" -# semapv: "https://w3id.org/semapv/" +# semapv: "https://w3id.org/semapv/vocab/" subject_id subject_label predicate_id predicate_modifier object_id object_label mapping_justification subject_source object_source mapping_tool confidence subject_match_field object_match_field subject_category object_category match_string comment x1:appendage appendage owl:equivalentClass y1:appendage appendages semapv:LexicalMatching x z rdf_matcher 0.840714406 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity organ . x1:appendage appendage owl:equivalentClass z1:appendage APPENDAGE semapv:LexicalMatching x y rdf_matcher 0.840714406 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity region . diff --git a/tests/data/basic7.tsv b/tests/data/basic7.tsv index 77f92dbb..a3116b67 100644 --- a/tests/data/basic7.tsv +++ b/tests/data/basic7.tsv @@ -12,7 +12,7 @@ # rdfs: "http://www.w3.org/2000/01/rdf-schema#" # owl: "http://www.w3.org/2002/07/owl#" # orcid: "https://orcid.org/my-orcid?orcid=" -# semapv: "https://w3id.org/semapv/" +# semapv: "https://w3id.org/semapv/vocab/" # skos: "http://www.w3.org/2004/02/skos/core#" subject_id subject_label predicate_id predicate_modifier object_id object_label mapping_justification subject_source object_source mapping_tool confidence subject_match_field object_match_field subject_category object_category match_string comment a:something YYYYY owl:equivalentClass b:something yyyyyy semapv:LexicalMatching c:example d:example rdf_matcher 0.8 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity xxxxx mock data diff --git a/tests/data/cob-to-external.tsv b/tests/data/cob-to-external.tsv index 1384bbbc..c73baa43 100644 --- a/tests/data/cob-to-external.tsv +++ b/tests/data/cob-to-external.tsv @@ -4,7 +4,7 @@ # rdfs: "http://www.w3.org/2000/01/rdf-schema#" # skos: "http://www.w3.org/2004/02/skos/core#" # owl: "http://www.w3.org/2002/07/owl#" -# semapv: "https://w3id.org/semapv/" +# semapv: "https://w3id.org/semapv/vocab/" # BFO: "http://purl.obolibrary.org/obo/BFO_" # CARO: "http://purl.obolibrary.org/obo/CARO_" # CHEBI: "http://purl.obolibrary.org/obo/CHEBI_" diff --git a/tests/data/reconcile_1.tsv b/tests/data/reconcile_1.tsv index 52197175..17984f9f 100644 --- a/tests/data/reconcile_1.tsv +++ b/tests/data/reconcile_1.tsv @@ -5,7 +5,7 @@ # owl: http://www.w3.org/2002/07/owl# # rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns# # rdfs: http://www.w3.org/2000/01/rdf-schema# -# semapv: https://w3id.org/semapv/ +# semapv: https://w3id.org/semapv/vocab/ # skos: http://www.w3.org/2004/02/skos/core# # sssom: https://w3id.org/sssom/ # license: https://w3id.org/sssom/license/unspecified diff --git a/tests/data/reconcile_2.tsv b/tests/data/reconcile_2.tsv index 909f2d07..4ab368c7 100644 --- a/tests/data/reconcile_2.tsv +++ b/tests/data/reconcile_2.tsv @@ -5,7 +5,7 @@ # owl: http://www.w3.org/2002/07/owl# # rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns# # rdfs: http://www.w3.org/2000/01/rdf-schema# -# semapv: https://w3id.org/semapv/ +# semapv: https://w3id.org/semapv/vocab/ # skos: http://www.w3.org/2004/02/skos/core# # sssom: https://w3id.org/sssom/ # license: https://w3id.org/sssom/license/unspecified diff --git a/tests/data/test_annotate_sssom.tsv b/tests/data/test_annotate_sssom.tsv index 03cde6a2..ea6efdb6 100644 --- a/tests/data/test_annotate_sssom.tsv +++ b/tests/data/test_annotate_sssom.tsv @@ -11,7 +11,7 @@ # owl: http://www.w3.org/2002/07/owl# # rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns# # rdfs: http://www.w3.org/2000/01/rdf-schema# -# semapv: https://w3id.org/semapv/ +# semapv: https://w3id.org/semapv/vocab/ # skos: http://www.w3.org/2004/02/skos/core# # sssom: https://w3id.org/sssom/ # x: http://example.org/x/ diff --git a/tests/data/test_clean_prefix.tsv b/tests/data/test_clean_prefix.tsv index 95d8b204..69866ba1 100644 --- a/tests/data/test_clean_prefix.tsv +++ b/tests/data/test_clean_prefix.tsv @@ -12,7 +12,7 @@ # rdfs: "http://www.w3.org/2000/01/rdf-schema#" # owl: "http://www.w3.org/2002/07/owl#" # orcid: "https://orcid.org/my-orcid?orcid=" -# semapv: "https://w3id.org/semapv/" +# semapv: "https://w3id.org/semapv/vocab/" subject_id subject_label predicate_id predicate_modifier object_id object_label mapping_justification subject_source object_source mapping_tool confidence subject_match_field object_match_field subject_category object_category match_string comment c:something YYYYY owl:equivalentClass b:something yyyyyy semapv:LexicalMatching c:example d:example rdf_matcher 0.81 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity xxxxx mock data d:something YYYYY owl:equivalentClass Not a:something yyyyyy semapv:LexicalMatching d:example a:example rdf_matcher 0.82 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity xxxxx mock data diff --git a/tests/data/test_filter_sssom.tsv b/tests/data/test_filter_sssom.tsv index 7f439e4d..c06a29a7 100644 --- a/tests/data/test_filter_sssom.tsv +++ b/tests/data/test_filter_sssom.tsv @@ -11,7 +11,7 @@ # owl: http://www.w3.org/2002/07/owl# # rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns# # rdfs: http://www.w3.org/2000/01/rdf-schema# -# semapv: https://w3id.org/semapv/ +# semapv: https://w3id.org/semapv/vocab/ # skos: http://www.w3.org/2004/02/skos/core# # sssom: https://w3id.org/sssom/ # x: http://example.org/x/ diff --git a/tests/data/test_inject_metadata_msdf.tsv b/tests/data/test_inject_metadata_msdf.tsv index 404ae8ab..527873c2 100644 --- a/tests/data/test_inject_metadata_msdf.tsv +++ b/tests/data/test_inject_metadata_msdf.tsv @@ -8,7 +8,7 @@ # owl: http://www.w3.org/2002/07/owl# # rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns# # rdfs: http://www.w3.org/2000/01/rdf-schema# -# semapv: https://w3id.org/semapv/ +# semapv: https://w3id.org/semapv/vocab/ # skos: http://www.w3.org/2004/02/skos/core# # sssom: https://w3id.org/sssom/ # license: https://creativecommons.org/licenses/by-nc/4.0/ From 5cf3c00fd940a40cfdfd3ff0a8020de91e0633d6 Mon Sep 17 00:00:00 2001 From: Harshad Hegde Date: Tue, 13 Jun 2023 12:10:38 -0500 Subject: [PATCH 05/12] linted again --- sssom/cli.py | 1 - sssom/cliques.py | 7 +++--- sssom/context.py | 1 - sssom/io.py | 1 - sssom/parsers.py | 7 +++--- sssom/sssom_document.py | 4 +-- sssom/util.py | 53 +++++++++++++++++++++------------------- sssom/validators.py | 3 +-- sssom/writers.py | 3 +-- tests/test_annotate.py | 1 + tests/test_cli.py | 2 +- tests/test_collapse.py | 2 +- tests/test_conversion.py | 1 - tests/test_convert.py | 1 + tests/test_data.py | 2 +- tests/test_filter.py | 1 + tests/test_merge.py | 1 + tests/test_parsers.py | 2 +- tests/test_reconcile.py | 5 ++-- tests/test_rewire.py | 2 +- tests/test_scc.py | 1 + tests/test_sort.py | 1 + tests/test_utils.py | 1 + tests/test_validate.py | 2 +- tests/test_writers.py | 2 +- 25 files changed, 55 insertions(+), 52 deletions(-) diff --git a/sssom/cli.py b/sssom/cli.py index 3f6193d2..f5b745a6 100644 --- a/sssom/cli.py +++ b/sssom/cli.py @@ -22,7 +22,6 @@ import yaml from rdflib import Graph from scipy.stats import chi2_contingency - from sssom.constants import ( DEFAULT_VALIDATION_TYPES, PREFIX_MAP_MODES, diff --git a/sssom/cliques.py b/sssom/cliques.py index e3907474..d547e26a 100644 --- a/sssom/cliques.py +++ b/sssom/cliques.py @@ -7,10 +7,6 @@ import networkx as nx import pandas as pd - -# from .sssom_datamodel import Mapping -from sssom_schema import Mapping - from sssom.constants import ( OWL_DIFFERENT_FROM, OWL_EQUIVALENT_CLASS, @@ -22,6 +18,9 @@ SSSOM_SUPERCLASS_OF, ) +# from .sssom_datamodel import Mapping +from sssom_schema import Mapping + from .parsers import to_mapping_set_document from .sssom_document import MappingSetDocument from .util import MappingSetDataFrame diff --git a/sssom/context.py b/sssom/context.py index 748df911..fb1ba674 100644 --- a/sssom/context.py +++ b/sssom/context.py @@ -6,7 +6,6 @@ from typing import Optional from linkml.generators.jsonldcontextgen import ContextGenerator - from sssom.constants import SCHEMA_YAML from .external_context import sssom_external_context diff --git a/sssom/io.py b/sssom/io.py index 8fff6c9d..3ef91a9b 100644 --- a/sssom/io.py +++ b/sssom/io.py @@ -9,7 +9,6 @@ import pandas as pd from bioregistry import get_iri from pansql import sqldf - from sssom.validators import validate from .constants import ( diff --git a/sssom/parsers.py b/sssom/parsers.py index 10b90d65..cb46ca85 100644 --- a/sssom/parsers.py +++ b/sssom/parsers.py @@ -18,10 +18,6 @@ from deprecation import deprecated from linkml_runtime.loaders.json_loader import JSONLoader from rdflib import Graph, URIRef - -# from .sssom_datamodel import Mapping, MappingSet -from sssom_schema import Mapping, MappingSet - from sssom.constants import ( CONFIDENCE, CURIE_MAP, @@ -44,6 +40,9 @@ SSSOMSchemaView, ) +# from .sssom_datamodel import Mapping, MappingSet +from sssom_schema import Mapping, MappingSet + from .context import ( DEFAULT_LICENSE, DEFAULT_MAPPING_SET_ID, diff --git a/sssom/sssom_document.py b/sssom/sssom_document.py index c7d344e7..3d2600bb 100644 --- a/sssom/sssom_document.py +++ b/sssom/sssom_document.py @@ -2,11 +2,11 @@ from dataclasses import dataclass +from sssom.context import DEFAULT_LICENSE, DEFAULT_MAPPING_SET_ID + # from .sssom_datamodel import MappingSet from sssom_schema import MappingSet -from sssom.context import DEFAULT_LICENSE, DEFAULT_MAPPING_SET_ID - from .typehints import PrefixMap __all__ = [ diff --git a/sssom/util.py b/sssom/util.py index 3129938e..d50649fa 100644 --- a/sssom/util.py +++ b/sssom/util.py @@ -729,60 +729,65 @@ def deal_with_negation(df: pd.DataFrame) -> pd.DataFrame: ) # If s,!p,o and s,p,o , then prefer higher confidence and remove the other. ### - negation_df: pd.DataFrame - negation_df = df.loc[df[PREDICATE_MODIFIER] == PREDICATE_MODIFIER_NOT] - normalized_negation_df = negation_df.reset_index() + # negation_df: pd.DataFrame + # negation_df = df.loc[df[PREDICATE_MODIFIER] == PREDICATE_MODIFIER_NOT] + # normalized_negation_df = negation_df.reset_index() # This step ONLY if 'NOT' is expressed by the symbol '!' in 'predicate_id' ##### # normalized_negation_df[PREDICATE_ID] = normalized_negation_df[ # PREDICATE_ID # ].str.replace("!", "") ######################################################## - normalized_negation_df = normalized_negation_df.drop(["index"], axis=1) + # normalized_negation_df = normalized_negation_df.drop(["index"], axis=1) # remove the NOT rows from the main DataFrame - condition = negation_df.isin(df) - positive_df = df.drop(condition.index) - positive_df = positive_df.reset_index().drop(["index"], axis=1) + # condition = negation_df.isin(df) + # positive_df = df.drop(condition.index) + # positive_df = positive_df.reset_index().drop(["index"], axis=1) columns_of_interest = [ SUBJECT_ID, PREDICATE_ID, + PREDICATE_MODIFIER, OBJECT_ID, CONFIDENCE, MAPPING_JUSTIFICATION, ] - negation_subset = normalized_negation_df[columns_of_interest] - positive_subset = positive_df[columns_of_interest] + df_subset = df[columns_of_interest] + # negation_subset = normalized_negation_df[columns_of_interest] + # positive_subset = positive_df[columns_of_interest] - combined_normalized_subset = pd.concat( - [positive_subset, negation_subset] - ).drop_duplicates() + # combined_normalized_subset = pd.concat( + # [positive_subset, negation_subset] + # ).drop_duplicates() # GroupBy and SELECT ONLY maximum confidence max_confidence_df: pd.DataFrame - max_confidence_df = combined_normalized_subset.groupby(TRIPLES_IDS, as_index=False)[ + max_confidence_df = df_subset.groupby(KEY_FEATURES, as_index=False)[ CONFIDENCE ].max() + # max_confidence_df = combined_normalized_subset.groupby(TRIPLES_IDS, as_index=False)[ + # CONFIDENCE + # ].max() # If same confidence prefer "HumanCurated". - reconciled_df_subset = pd.DataFrame(columns=combined_normalized_subset.columns) + reconciled_df_subset = pd.DataFrame(columns=df_subset.columns) for _, row_1 in max_confidence_df.iterrows(): match_condition_1 = ( - (combined_normalized_subset[SUBJECT_ID] == row_1[SUBJECT_ID]) - & (combined_normalized_subset[OBJECT_ID] == row_1[OBJECT_ID]) - & (combined_normalized_subset[CONFIDENCE] == row_1[CONFIDENCE]) + (df_subset[SUBJECT_ID] == row_1[SUBJECT_ID]) + & (df_subset[OBJECT_ID] == row_1[OBJECT_ID]) + & (df_subset[CONFIDENCE] == row_1[CONFIDENCE]) ) # match_condition_1[match_condition_1] gives the list of 'True's. # In other words, the rows that match the condition (rules declared). # Ideally, there should be 1 row. If not apply an extra rule to look for 'HumanCurated'. if len(match_condition_1[match_condition_1].index) > 1: match_condition_1 = ( - (combined_normalized_subset[SUBJECT_ID] == row_1[SUBJECT_ID]) - & (combined_normalized_subset[OBJECT_ID] == row_1[OBJECT_ID]) - & (combined_normalized_subset[CONFIDENCE] == row_1[CONFIDENCE]) + (df_subset[SUBJECT_ID] == row_1[SUBJECT_ID]) + & (df_subset[OBJECT_ID] == row_1[OBJECT_ID]) + & (df_subset[CONFIDENCE] == row_1[CONFIDENCE]) & ( - combined_normalized_subset[MAPPING_JUSTIFICATION] + df_subset[MAPPING_JUSTIFICATION] == SEMAPV.ManualMappingCuration.value ) ) @@ -802,9 +807,7 @@ def deal_with_negation(df: pd.DataFrame) -> pd.DataFrame: reconciled_df_subset = pd.concat( [ reconciled_df_subset, - combined_normalized_subset.loc[ - match_condition_1[match_condition_1].index, : - ], + df_subset.loc[match_condition_1[match_condition_1].index, :], ], ignore_index=True, ) @@ -812,7 +815,7 @@ def deal_with_negation(df: pd.DataFrame) -> pd.DataFrame: # Add negations (PREDICATE_MODIFIER) back to DataFrame # NOTE: negative TRUMPS positive if negative and positive with same # [SUBJECT_ID, OBJECT_ID, PREDICATE_ID] exist - for _, row_2 in negation_df.iterrows(): + for _, row_2 in df_subset.iterrows(): match_condition_2 = ( (reconciled_df_subset[SUBJECT_ID] == row_2[SUBJECT_ID]) & (reconciled_df_subset[OBJECT_ID] == row_2[OBJECT_ID]) diff --git a/sssom/validators.py b/sssom/validators.py index 3e1cf90b..2bc334da 100644 --- a/sssom/validators.py +++ b/sssom/validators.py @@ -6,11 +6,10 @@ from jsonschema import ValidationError from linkml.validators.jsonschemavalidator import JsonSchemaDataValidator from linkml.validators.sparqlvalidator import SparqlDataValidator # noqa: F401 -from sssom_schema import MappingSet - from sssom.context import add_built_in_prefixes_to_prefix_map from sssom.parsers import to_mapping_set_document from sssom.util import MappingSetDataFrame, get_all_prefixes +from sssom_schema import MappingSet from .constants import SCHEMA_YAML, SchemaValidationType diff --git a/sssom/writers.py b/sssom/writers.py index 3f8b09af..ea636a7d 100644 --- a/sssom/writers.py +++ b/sssom/writers.py @@ -12,12 +12,11 @@ from linkml_runtime.utils.schemaview import SchemaView from rdflib import Graph, URIRef from rdflib.namespace import OWL, RDF +from sssom.validators import check_all_prefixes_in_curie_map # from .sssom_datamodel import slots from sssom_schema import slots -from sssom.validators import check_all_prefixes_in_curie_map - from .constants import SCHEMA_YAML from .parsers import to_mapping_set_document from .typehints import PrefixMap diff --git a/tests/test_annotate.py b/tests/test_annotate.py index 26eb17c5..6116e6ad 100644 --- a/tests/test_annotate.py +++ b/tests/test_annotate.py @@ -8,6 +8,7 @@ # from sssom.io import filter_file from sssom.parsers import parse_sssom_table + from tests.constants import data_dir diff --git a/tests/test_cli.py b/tests/test_cli.py index c6feddb7..97fe5a77 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -6,7 +6,6 @@ from typing import Mapping from click.testing import CliRunner, Result - from sssom.cli import ( annotate, cliquesummary, @@ -27,6 +26,7 @@ split, validate, ) + from tests.test_data import ( RECON_YAML, SSSOMTestCase, diff --git a/tests/test_collapse.py b/tests/test_collapse.py index 7d417ee8..e6bae5d5 100644 --- a/tests/test_collapse.py +++ b/tests/test_collapse.py @@ -4,6 +4,7 @@ import yaml from pansql import sqldf +from sssom.parsers import parse_sssom_table from sssom import ( collapse, @@ -14,7 +15,6 @@ parse, reconcile_prefix_and_data, ) -from sssom.parsers import parse_sssom_table from tests.constants import data_dir, prefix_recon_yaml diff --git a/tests/test_conversion.py b/tests/test_conversion.py index 76b69955..621f1e46 100644 --- a/tests/test_conversion.py +++ b/tests/test_conversion.py @@ -7,7 +7,6 @@ from typing import Dict from rdflib import Graph - from sssom.parsers import get_parsing_function, to_mapping_set_document from sssom.sssom_document import MappingSetDocument from sssom.util import read_pandas, to_mapping_set_dataframe diff --git a/tests/test_convert.py b/tests/test_convert.py index fb34c354..c4838560 100644 --- a/tests/test_convert.py +++ b/tests/test_convert.py @@ -3,6 +3,7 @@ from sssom.parsers import parse_sssom_table from sssom.writers import to_json, to_owl_graph, to_rdf_graph + from tests.constants import data_dir diff --git a/tests/test_data.py b/tests/test_data.py index f0cf22e1..d88379a4 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -4,8 +4,8 @@ from typing import Any, List, Mapping import yaml - from sssom.util import PREFIX_MAP_KEY + from tests.constants import cwd, data_dir, test_out_dir test_validate_dir = os.path.join(cwd, "validate_data") diff --git a/tests/test_filter.py b/tests/test_filter.py index 9f3924ea..d8612c86 100644 --- a/tests/test_filter.py +++ b/tests/test_filter.py @@ -9,6 +9,7 @@ # from sssom.io import filter_file from sssom.parsers import parse_sssom_table + from tests.constants import data_dir diff --git a/tests/test_merge.py b/tests/test_merge.py index f1dd1c5c..08ecf186 100644 --- a/tests/test_merge.py +++ b/tests/test_merge.py @@ -4,6 +4,7 @@ from sssom.parsers import parse_sssom_table from sssom.util import merge_msdf + from tests.constants import data_dir diff --git a/tests/test_parsers.py b/tests/test_parsers.py index 22e57960..a1c81c7b 100644 --- a/tests/test_parsers.py +++ b/tests/test_parsers.py @@ -10,7 +10,6 @@ import pandas as pd import yaml from rdflib import Graph - from sssom.context import get_default_metadata from sssom.parsers import ( from_alignment_minidom, @@ -22,6 +21,7 @@ ) from sssom.util import PREFIX_MAP_KEY, sort_df_rows_columns from sssom.writers import write_table + from tests.test_data import data_dir as test_data_dir from tests.test_data import test_out_dir diff --git a/tests/test_reconcile.py b/tests/test_reconcile.py index fca05b05..5b7b6ab6 100644 --- a/tests/test_reconcile.py +++ b/tests/test_reconcile.py @@ -2,9 +2,10 @@ import unittest -from sssom import filter_redundant_rows from sssom.parsers import parse_sssom_table from sssom.util import deal_with_negation, merge_msdf + +from sssom import filter_redundant_rows from tests.constants import data_dir @@ -28,7 +29,7 @@ def test_deal_with_negation(self): df1 = deal_with_negation(self.msdf1.df) self.assertEqual(8, len(df1.index)) df2 = deal_with_negation(self.msdf2.df) - self.assertEqual(12, len(df2.index)) + self.assertEqual(14, len(df2.index)) def test_merge(self): """Test merging two tables.""" diff --git a/tests/test_rewire.py b/tests/test_rewire.py index 0e2357e6..71a52bec 100644 --- a/tests/test_rewire.py +++ b/tests/test_rewire.py @@ -4,9 +4,9 @@ import unittest from rdflib import Graph - from sssom.parsers import parse_sssom_table from sssom.rdf_util import rewire_graph + from tests.constants import data_dir, test_out_dir diff --git a/tests/test_scc.py b/tests/test_scc.py index 74caa9c4..b5bef3ee 100644 --- a/tests/test_scc.py +++ b/tests/test_scc.py @@ -4,6 +4,7 @@ from sssom.cliques import split_into_cliques, summarize_cliques from sssom.parsers import parse_sssom_table + from tests.constants import data_dir diff --git a/tests/test_sort.py b/tests/test_sort.py index 0836d9f1..3f4f3f48 100644 --- a/tests/test_sort.py +++ b/tests/test_sort.py @@ -5,6 +5,7 @@ from sssom.constants import SSSOMSchemaView from sssom.parsers import parse_sssom_table from sssom.util import sort_df_rows_columns + from tests.constants import data_dir SCHEMA_DICT = SSSOMSchemaView.instance.dict diff --git a/tests/test_utils.py b/tests/test_utils.py index 00daf569..230744a3 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -11,6 +11,7 @@ inject_metadata_into_df, invert_mappings, ) + from tests.constants import data_dir diff --git a/tests/test_validate.py b/tests/test_validate.py index 05d695e5..fd46c936 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -3,10 +3,10 @@ import unittest from jsonschema import ValidationError - from sssom.constants import DEFAULT_VALIDATION_TYPES, SchemaValidationType from sssom.parsers import parse_sssom_table from sssom.validators import validate + from tests.constants import data_dir diff --git a/tests/test_writers.py b/tests/test_writers.py index e7f3d759..ec3320e5 100644 --- a/tests/test_writers.py +++ b/tests/test_writers.py @@ -4,7 +4,6 @@ import unittest from jsonasobj2 import JsonObj - from sssom.parsers import parse_sssom_json, parse_sssom_rdf, parse_sssom_table from sssom.writers import ( write_fhir_json, @@ -14,6 +13,7 @@ write_rdf, write_table, ) + from tests.constants import data_dir as test_data_dir from tests.constants import test_out_dir From eabad9fcb5172a6198ce37c7de136ff0cb9eb766 Mon Sep 17 00:00:00 2001 From: Harshad Hegde Date: Tue, 13 Jun 2023 12:16:26 -0500 Subject: [PATCH 06/12] cleanup --- sssom/util.py | 34 ---------------------------------- 1 file changed, 34 deletions(-) diff --git a/sssom/util.py b/sssom/util.py index d50649fa..89c5770b 100644 --- a/sssom/util.py +++ b/sssom/util.py @@ -728,23 +728,6 @@ def deal_with_negation(df: pd.DataFrame) -> pd.DataFrame: "The dataframe, after assigning default confidence, appears empty (deal_with_negation)" ) - # If s,!p,o and s,p,o , then prefer higher confidence and remove the other. ### - # negation_df: pd.DataFrame - # negation_df = df.loc[df[PREDICATE_MODIFIER] == PREDICATE_MODIFIER_NOT] - # normalized_negation_df = negation_df.reset_index() - - # This step ONLY if 'NOT' is expressed by the symbol '!' in 'predicate_id' ##### - # normalized_negation_df[PREDICATE_ID] = normalized_negation_df[ - # PREDICATE_ID - # ].str.replace("!", "") - ######################################################## - # normalized_negation_df = normalized_negation_df.drop(["index"], axis=1) - - # remove the NOT rows from the main DataFrame - # condition = negation_df.isin(df) - # positive_df = df.drop(condition.index) - # positive_df = positive_df.reset_index().drop(["index"], axis=1) - columns_of_interest = [ SUBJECT_ID, PREDICATE_ID, @@ -754,21 +737,12 @@ def deal_with_negation(df: pd.DataFrame) -> pd.DataFrame: MAPPING_JUSTIFICATION, ] df_subset = df[columns_of_interest] - # negation_subset = normalized_negation_df[columns_of_interest] - # positive_subset = positive_df[columns_of_interest] - - # combined_normalized_subset = pd.concat( - # [positive_subset, negation_subset] - # ).drop_duplicates() # GroupBy and SELECT ONLY maximum confidence max_confidence_df: pd.DataFrame max_confidence_df = df_subset.groupby(KEY_FEATURES, as_index=False)[ CONFIDENCE ].max() - # max_confidence_df = combined_normalized_subset.groupby(TRIPLES_IDS, as_index=False)[ - # CONFIDENCE - # ].max() # If same confidence prefer "HumanCurated". reconciled_df_subset = pd.DataFrame(columns=df_subset.columns) @@ -796,14 +770,6 @@ def deal_with_negation(df: pd.DataFrame) -> pd.DataFrame: if len(match_condition_1[match_condition_1].index) > 1: match_condition_1 = match_condition_1[match_condition_1].sample() - # FutureWarning: The frame.append method is deprecated and will be removed - # from pandas in a future version. Use pandas.concat instead. - # reconciled_df_subset = reconciled_df_subset.append( - # combined_normalized_subset.loc[ - # match_condition_1[match_condition_1].index, : - # ], - # ignore_index=True, - # ) reconciled_df_subset = pd.concat( [ reconciled_df_subset, From 9ef0f85310094feffad1334dc6031dcbe3fd242b Mon Sep 17 00:00:00 2001 From: Harshad Hegde Date: Tue, 13 Jun 2023 12:41:22 -0500 Subject: [PATCH 07/12] cleanup --- sssom/util.py | 57 ++++++++++++--------------------------------------- 1 file changed, 13 insertions(+), 44 deletions(-) diff --git a/sssom/util.py b/sssom/util.py index 3129938e..89c5770b 100644 --- a/sssom/util.py +++ b/sssom/util.py @@ -728,61 +728,40 @@ def deal_with_negation(df: pd.DataFrame) -> pd.DataFrame: "The dataframe, after assigning default confidence, appears empty (deal_with_negation)" ) - # If s,!p,o and s,p,o , then prefer higher confidence and remove the other. ### - negation_df: pd.DataFrame - negation_df = df.loc[df[PREDICATE_MODIFIER] == PREDICATE_MODIFIER_NOT] - normalized_negation_df = negation_df.reset_index() - - # This step ONLY if 'NOT' is expressed by the symbol '!' in 'predicate_id' ##### - # normalized_negation_df[PREDICATE_ID] = normalized_negation_df[ - # PREDICATE_ID - # ].str.replace("!", "") - ######################################################## - normalized_negation_df = normalized_negation_df.drop(["index"], axis=1) - - # remove the NOT rows from the main DataFrame - condition = negation_df.isin(df) - positive_df = df.drop(condition.index) - positive_df = positive_df.reset_index().drop(["index"], axis=1) - columns_of_interest = [ SUBJECT_ID, PREDICATE_ID, + PREDICATE_MODIFIER, OBJECT_ID, CONFIDENCE, MAPPING_JUSTIFICATION, ] - negation_subset = normalized_negation_df[columns_of_interest] - positive_subset = positive_df[columns_of_interest] - - combined_normalized_subset = pd.concat( - [positive_subset, negation_subset] - ).drop_duplicates() + df_subset = df[columns_of_interest] # GroupBy and SELECT ONLY maximum confidence max_confidence_df: pd.DataFrame - max_confidence_df = combined_normalized_subset.groupby(TRIPLES_IDS, as_index=False)[ + max_confidence_df = df_subset.groupby(KEY_FEATURES, as_index=False)[ CONFIDENCE ].max() # If same confidence prefer "HumanCurated". - reconciled_df_subset = pd.DataFrame(columns=combined_normalized_subset.columns) + reconciled_df_subset = pd.DataFrame(columns=df_subset.columns) for _, row_1 in max_confidence_df.iterrows(): match_condition_1 = ( - (combined_normalized_subset[SUBJECT_ID] == row_1[SUBJECT_ID]) - & (combined_normalized_subset[OBJECT_ID] == row_1[OBJECT_ID]) - & (combined_normalized_subset[CONFIDENCE] == row_1[CONFIDENCE]) + (df_subset[SUBJECT_ID] == row_1[SUBJECT_ID]) + & (df_subset[OBJECT_ID] == row_1[OBJECT_ID]) + & (df_subset[CONFIDENCE] == row_1[CONFIDENCE]) ) # match_condition_1[match_condition_1] gives the list of 'True's. # In other words, the rows that match the condition (rules declared). # Ideally, there should be 1 row. If not apply an extra rule to look for 'HumanCurated'. if len(match_condition_1[match_condition_1].index) > 1: match_condition_1 = ( - (combined_normalized_subset[SUBJECT_ID] == row_1[SUBJECT_ID]) - & (combined_normalized_subset[OBJECT_ID] == row_1[OBJECT_ID]) - & (combined_normalized_subset[CONFIDENCE] == row_1[CONFIDENCE]) + (df_subset[SUBJECT_ID] == row_1[SUBJECT_ID]) + & (df_subset[OBJECT_ID] == row_1[OBJECT_ID]) + & (df_subset[CONFIDENCE] == row_1[CONFIDENCE]) & ( - combined_normalized_subset[MAPPING_JUSTIFICATION] + df_subset[MAPPING_JUSTIFICATION] == SEMAPV.ManualMappingCuration.value ) ) @@ -791,20 +770,10 @@ def deal_with_negation(df: pd.DataFrame) -> pd.DataFrame: if len(match_condition_1[match_condition_1].index) > 1: match_condition_1 = match_condition_1[match_condition_1].sample() - # FutureWarning: The frame.append method is deprecated and will be removed - # from pandas in a future version. Use pandas.concat instead. - # reconciled_df_subset = reconciled_df_subset.append( - # combined_normalized_subset.loc[ - # match_condition_1[match_condition_1].index, : - # ], - # ignore_index=True, - # ) reconciled_df_subset = pd.concat( [ reconciled_df_subset, - combined_normalized_subset.loc[ - match_condition_1[match_condition_1].index, : - ], + df_subset.loc[match_condition_1[match_condition_1].index, :], ], ignore_index=True, ) @@ -812,7 +781,7 @@ def deal_with_negation(df: pd.DataFrame) -> pd.DataFrame: # Add negations (PREDICATE_MODIFIER) back to DataFrame # NOTE: negative TRUMPS positive if negative and positive with same # [SUBJECT_ID, OBJECT_ID, PREDICATE_ID] exist - for _, row_2 in negation_df.iterrows(): + for _, row_2 in df_subset.iterrows(): match_condition_2 = ( (reconciled_df_subset[SUBJECT_ID] == row_2[SUBJECT_ID]) & (reconciled_df_subset[OBJECT_ID] == row_2[OBJECT_ID]) From 12b6ced0bf517e7fdc1c817578de48badc68616c Mon Sep 17 00:00:00 2001 From: Harshad Hegde Date: Tue, 13 Jun 2023 12:51:50 -0500 Subject: [PATCH 08/12] Revert "linted again" This reverts commit 5cf3c00fd940a40cfdfd3ff0a8020de91e0633d6. --- sssom/cli.py | 1 + sssom/cliques.py | 7 ++++--- sssom/context.py | 1 + sssom/io.py | 1 + sssom/parsers.py | 7 ++++--- sssom/sssom_document.py | 4 ++-- sssom/validators.py | 3 ++- sssom/writers.py | 3 ++- tests/test_annotate.py | 1 - tests/test_cli.py | 2 +- tests/test_collapse.py | 2 +- tests/test_conversion.py | 1 + tests/test_convert.py | 1 - tests/test_data.py | 2 +- tests/test_filter.py | 1 - tests/test_merge.py | 1 - tests/test_parsers.py | 2 +- tests/test_reconcile.py | 5 ++--- tests/test_rewire.py | 2 +- tests/test_scc.py | 1 - tests/test_sort.py | 1 - tests/test_utils.py | 1 - tests/test_validate.py | 2 +- tests/test_writers.py | 2 +- 24 files changed, 27 insertions(+), 27 deletions(-) diff --git a/sssom/cli.py b/sssom/cli.py index f5b745a6..3f6193d2 100644 --- a/sssom/cli.py +++ b/sssom/cli.py @@ -22,6 +22,7 @@ import yaml from rdflib import Graph from scipy.stats import chi2_contingency + from sssom.constants import ( DEFAULT_VALIDATION_TYPES, PREFIX_MAP_MODES, diff --git a/sssom/cliques.py b/sssom/cliques.py index d547e26a..e3907474 100644 --- a/sssom/cliques.py +++ b/sssom/cliques.py @@ -7,6 +7,10 @@ import networkx as nx import pandas as pd + +# from .sssom_datamodel import Mapping +from sssom_schema import Mapping + from sssom.constants import ( OWL_DIFFERENT_FROM, OWL_EQUIVALENT_CLASS, @@ -18,9 +22,6 @@ SSSOM_SUPERCLASS_OF, ) -# from .sssom_datamodel import Mapping -from sssom_schema import Mapping - from .parsers import to_mapping_set_document from .sssom_document import MappingSetDocument from .util import MappingSetDataFrame diff --git a/sssom/context.py b/sssom/context.py index fb1ba674..748df911 100644 --- a/sssom/context.py +++ b/sssom/context.py @@ -6,6 +6,7 @@ from typing import Optional from linkml.generators.jsonldcontextgen import ContextGenerator + from sssom.constants import SCHEMA_YAML from .external_context import sssom_external_context diff --git a/sssom/io.py b/sssom/io.py index 3ef91a9b..8fff6c9d 100644 --- a/sssom/io.py +++ b/sssom/io.py @@ -9,6 +9,7 @@ import pandas as pd from bioregistry import get_iri from pansql import sqldf + from sssom.validators import validate from .constants import ( diff --git a/sssom/parsers.py b/sssom/parsers.py index cb46ca85..10b90d65 100644 --- a/sssom/parsers.py +++ b/sssom/parsers.py @@ -18,6 +18,10 @@ from deprecation import deprecated from linkml_runtime.loaders.json_loader import JSONLoader from rdflib import Graph, URIRef + +# from .sssom_datamodel import Mapping, MappingSet +from sssom_schema import Mapping, MappingSet + from sssom.constants import ( CONFIDENCE, CURIE_MAP, @@ -40,9 +44,6 @@ SSSOMSchemaView, ) -# from .sssom_datamodel import Mapping, MappingSet -from sssom_schema import Mapping, MappingSet - from .context import ( DEFAULT_LICENSE, DEFAULT_MAPPING_SET_ID, diff --git a/sssom/sssom_document.py b/sssom/sssom_document.py index 3d2600bb..c7d344e7 100644 --- a/sssom/sssom_document.py +++ b/sssom/sssom_document.py @@ -2,11 +2,11 @@ from dataclasses import dataclass -from sssom.context import DEFAULT_LICENSE, DEFAULT_MAPPING_SET_ID - # from .sssom_datamodel import MappingSet from sssom_schema import MappingSet +from sssom.context import DEFAULT_LICENSE, DEFAULT_MAPPING_SET_ID + from .typehints import PrefixMap __all__ = [ diff --git a/sssom/validators.py b/sssom/validators.py index 2bc334da..3e1cf90b 100644 --- a/sssom/validators.py +++ b/sssom/validators.py @@ -6,10 +6,11 @@ from jsonschema import ValidationError from linkml.validators.jsonschemavalidator import JsonSchemaDataValidator from linkml.validators.sparqlvalidator import SparqlDataValidator # noqa: F401 +from sssom_schema import MappingSet + from sssom.context import add_built_in_prefixes_to_prefix_map from sssom.parsers import to_mapping_set_document from sssom.util import MappingSetDataFrame, get_all_prefixes -from sssom_schema import MappingSet from .constants import SCHEMA_YAML, SchemaValidationType diff --git a/sssom/writers.py b/sssom/writers.py index ea636a7d..3f8b09af 100644 --- a/sssom/writers.py +++ b/sssom/writers.py @@ -12,11 +12,12 @@ from linkml_runtime.utils.schemaview import SchemaView from rdflib import Graph, URIRef from rdflib.namespace import OWL, RDF -from sssom.validators import check_all_prefixes_in_curie_map # from .sssom_datamodel import slots from sssom_schema import slots +from sssom.validators import check_all_prefixes_in_curie_map + from .constants import SCHEMA_YAML from .parsers import to_mapping_set_document from .typehints import PrefixMap diff --git a/tests/test_annotate.py b/tests/test_annotate.py index 6116e6ad..26eb17c5 100644 --- a/tests/test_annotate.py +++ b/tests/test_annotate.py @@ -8,7 +8,6 @@ # from sssom.io import filter_file from sssom.parsers import parse_sssom_table - from tests.constants import data_dir diff --git a/tests/test_cli.py b/tests/test_cli.py index 97fe5a77..c6feddb7 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -6,6 +6,7 @@ from typing import Mapping from click.testing import CliRunner, Result + from sssom.cli import ( annotate, cliquesummary, @@ -26,7 +27,6 @@ split, validate, ) - from tests.test_data import ( RECON_YAML, SSSOMTestCase, diff --git a/tests/test_collapse.py b/tests/test_collapse.py index e6bae5d5..7d417ee8 100644 --- a/tests/test_collapse.py +++ b/tests/test_collapse.py @@ -4,7 +4,6 @@ import yaml from pansql import sqldf -from sssom.parsers import parse_sssom_table from sssom import ( collapse, @@ -15,6 +14,7 @@ parse, reconcile_prefix_and_data, ) +from sssom.parsers import parse_sssom_table from tests.constants import data_dir, prefix_recon_yaml diff --git a/tests/test_conversion.py b/tests/test_conversion.py index 621f1e46..76b69955 100644 --- a/tests/test_conversion.py +++ b/tests/test_conversion.py @@ -7,6 +7,7 @@ from typing import Dict from rdflib import Graph + from sssom.parsers import get_parsing_function, to_mapping_set_document from sssom.sssom_document import MappingSetDocument from sssom.util import read_pandas, to_mapping_set_dataframe diff --git a/tests/test_convert.py b/tests/test_convert.py index c4838560..fb34c354 100644 --- a/tests/test_convert.py +++ b/tests/test_convert.py @@ -3,7 +3,6 @@ from sssom.parsers import parse_sssom_table from sssom.writers import to_json, to_owl_graph, to_rdf_graph - from tests.constants import data_dir diff --git a/tests/test_data.py b/tests/test_data.py index d88379a4..f0cf22e1 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -4,8 +4,8 @@ from typing import Any, List, Mapping import yaml -from sssom.util import PREFIX_MAP_KEY +from sssom.util import PREFIX_MAP_KEY from tests.constants import cwd, data_dir, test_out_dir test_validate_dir = os.path.join(cwd, "validate_data") diff --git a/tests/test_filter.py b/tests/test_filter.py index d8612c86..9f3924ea 100644 --- a/tests/test_filter.py +++ b/tests/test_filter.py @@ -9,7 +9,6 @@ # from sssom.io import filter_file from sssom.parsers import parse_sssom_table - from tests.constants import data_dir diff --git a/tests/test_merge.py b/tests/test_merge.py index 08ecf186..f1dd1c5c 100644 --- a/tests/test_merge.py +++ b/tests/test_merge.py @@ -4,7 +4,6 @@ from sssom.parsers import parse_sssom_table from sssom.util import merge_msdf - from tests.constants import data_dir diff --git a/tests/test_parsers.py b/tests/test_parsers.py index a1c81c7b..22e57960 100644 --- a/tests/test_parsers.py +++ b/tests/test_parsers.py @@ -10,6 +10,7 @@ import pandas as pd import yaml from rdflib import Graph + from sssom.context import get_default_metadata from sssom.parsers import ( from_alignment_minidom, @@ -21,7 +22,6 @@ ) from sssom.util import PREFIX_MAP_KEY, sort_df_rows_columns from sssom.writers import write_table - from tests.test_data import data_dir as test_data_dir from tests.test_data import test_out_dir diff --git a/tests/test_reconcile.py b/tests/test_reconcile.py index 5b7b6ab6..fca05b05 100644 --- a/tests/test_reconcile.py +++ b/tests/test_reconcile.py @@ -2,10 +2,9 @@ import unittest +from sssom import filter_redundant_rows from sssom.parsers import parse_sssom_table from sssom.util import deal_with_negation, merge_msdf - -from sssom import filter_redundant_rows from tests.constants import data_dir @@ -29,7 +28,7 @@ def test_deal_with_negation(self): df1 = deal_with_negation(self.msdf1.df) self.assertEqual(8, len(df1.index)) df2 = deal_with_negation(self.msdf2.df) - self.assertEqual(14, len(df2.index)) + self.assertEqual(12, len(df2.index)) def test_merge(self): """Test merging two tables.""" diff --git a/tests/test_rewire.py b/tests/test_rewire.py index 71a52bec..0e2357e6 100644 --- a/tests/test_rewire.py +++ b/tests/test_rewire.py @@ -4,9 +4,9 @@ import unittest from rdflib import Graph + from sssom.parsers import parse_sssom_table from sssom.rdf_util import rewire_graph - from tests.constants import data_dir, test_out_dir diff --git a/tests/test_scc.py b/tests/test_scc.py index b5bef3ee..74caa9c4 100644 --- a/tests/test_scc.py +++ b/tests/test_scc.py @@ -4,7 +4,6 @@ from sssom.cliques import split_into_cliques, summarize_cliques from sssom.parsers import parse_sssom_table - from tests.constants import data_dir diff --git a/tests/test_sort.py b/tests/test_sort.py index 3f4f3f48..0836d9f1 100644 --- a/tests/test_sort.py +++ b/tests/test_sort.py @@ -5,7 +5,6 @@ from sssom.constants import SSSOMSchemaView from sssom.parsers import parse_sssom_table from sssom.util import sort_df_rows_columns - from tests.constants import data_dir SCHEMA_DICT = SSSOMSchemaView.instance.dict diff --git a/tests/test_utils.py b/tests/test_utils.py index 230744a3..00daf569 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -11,7 +11,6 @@ inject_metadata_into_df, invert_mappings, ) - from tests.constants import data_dir diff --git a/tests/test_validate.py b/tests/test_validate.py index fd46c936..05d695e5 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -3,10 +3,10 @@ import unittest from jsonschema import ValidationError + from sssom.constants import DEFAULT_VALIDATION_TYPES, SchemaValidationType from sssom.parsers import parse_sssom_table from sssom.validators import validate - from tests.constants import data_dir diff --git a/tests/test_writers.py b/tests/test_writers.py index ec3320e5..e7f3d759 100644 --- a/tests/test_writers.py +++ b/tests/test_writers.py @@ -4,6 +4,7 @@ import unittest from jsonasobj2 import JsonObj + from sssom.parsers import parse_sssom_json, parse_sssom_rdf, parse_sssom_table from sssom.writers import ( write_fhir_json, @@ -13,7 +14,6 @@ write_rdf, write_table, ) - from tests.constants import data_dir as test_data_dir from tests.constants import test_out_dir From a594856340b0df3eb8b31380470ef34b609588b2 Mon Sep 17 00:00:00 2001 From: Harshad Hegde Date: Tue, 13 Jun 2023 13:03:16 -0500 Subject: [PATCH 09/12] fixed test --- tests/test_reconcile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_reconcile.py b/tests/test_reconcile.py index fca05b05..15eadce2 100644 --- a/tests/test_reconcile.py +++ b/tests/test_reconcile.py @@ -28,7 +28,7 @@ def test_deal_with_negation(self): df1 = deal_with_negation(self.msdf1.df) self.assertEqual(8, len(df1.index)) df2 = deal_with_negation(self.msdf2.df) - self.assertEqual(12, len(df2.index)) + self.assertEqual(14, len(df2.index)) def test_merge(self): """Test merging two tables.""" From 26a2809e5baa6c4d16b07faa17bd5121bd2c1203 Mon Sep 17 00:00:00 2001 From: Harshad Hegde Date: Tue, 13 Jun 2023 13:47:14 -0500 Subject: [PATCH 10/12] removed unnecessary variable --- sssom/util.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sssom/util.py b/sssom/util.py index 89c5770b..f30430ca 100644 --- a/sssom/util.py +++ b/sssom/util.py @@ -102,7 +102,6 @@ #: The 4 columns whose combination would be used as primary keys while merging/grouping KEY_FEATURES = [SUBJECT_ID, PREDICATE_ID, OBJECT_ID, PREDICATE_MODIFIER] -TRIPLES_IDS = [SUBJECT_ID, PREDICATE_ID, OBJECT_ID] @dataclass From 4ca1b76706d0a0e1a9420a2c679e828481b5e2f5 Mon Sep 17 00:00:00 2001 From: Harshad Hegde Date: Thu, 15 Jun 2023 09:06:54 -0500 Subject: [PATCH 11/12] chnaged back to splitting negative and positive --- sssom/util.py | 53 +++++++++++++++++++++++++++++------------ tests/test_reconcile.py | 2 +- 2 files changed, 39 insertions(+), 16 deletions(-) diff --git a/sssom/util.py b/sssom/util.py index f30430ca..921919a1 100644 --- a/sssom/util.py +++ b/sssom/util.py @@ -102,6 +102,7 @@ #: The 4 columns whose combination would be used as primary keys while merging/grouping KEY_FEATURES = [SUBJECT_ID, PREDICATE_ID, OBJECT_ID, PREDICATE_MODIFIER] +TRIPLE_IDS = [SUBJECT_ID, PREDICATE_ID, OBJECT_ID] @dataclass @@ -726,41 +727,61 @@ def deal_with_negation(df: pd.DataFrame) -> pd.DataFrame: raise ValueError( "The dataframe, after assigning default confidence, appears empty (deal_with_negation)" ) + # If s,!p,o and s,p,o , then prefer higher confidence and remove the other. ### + negation_df: pd.DataFrame + negation_df = df.loc[df[PREDICATE_MODIFIER] == PREDICATE_MODIFIER_NOT] + normalized_negation_df = negation_df.reset_index() + + # This step ONLY if 'NOT' is expressed by the symbol '!' in 'predicate_id' ##### + # normalized_negation_df[PREDICATE_ID] = normalized_negation_df[ + # PREDICATE_ID + # ].str.replace("!", "") + ######################################################## + normalized_negation_df = normalized_negation_df.drop(["index"], axis=1) + + # remove the NOT rows from the main DataFrame + condition = negation_df.isin(df) + positive_df = df.drop(condition.index) + positive_df = positive_df.reset_index().drop(["index"], axis=1) columns_of_interest = [ SUBJECT_ID, PREDICATE_ID, - PREDICATE_MODIFIER, OBJECT_ID, CONFIDENCE, MAPPING_JUSTIFICATION, ] - df_subset = df[columns_of_interest] + negation_subset = normalized_negation_df[columns_of_interest] + positive_subset = positive_df[columns_of_interest] + + combined_normalized_subset = pd.concat( + [positive_subset, negation_subset] + ).drop_duplicates() # GroupBy and SELECT ONLY maximum confidence max_confidence_df: pd.DataFrame - max_confidence_df = df_subset.groupby(KEY_FEATURES, as_index=False)[ - CONFIDENCE - ].max() + max_confidence_df = combined_normalized_subset.groupby( + TRIPLE_IDS, as_index=False + )[CONFIDENCE].max() # If same confidence prefer "HumanCurated". - reconciled_df_subset = pd.DataFrame(columns=df_subset.columns) + reconciled_df_subset = pd.DataFrame(columns=combined_normalized_subset.columns) for _, row_1 in max_confidence_df.iterrows(): match_condition_1 = ( - (df_subset[SUBJECT_ID] == row_1[SUBJECT_ID]) - & (df_subset[OBJECT_ID] == row_1[OBJECT_ID]) - & (df_subset[CONFIDENCE] == row_1[CONFIDENCE]) + (combined_normalized_subset[SUBJECT_ID] == row_1[SUBJECT_ID]) + & (combined_normalized_subset[OBJECT_ID] == row_1[OBJECT_ID]) + & (combined_normalized_subset[CONFIDENCE] == row_1[CONFIDENCE]) ) # match_condition_1[match_condition_1] gives the list of 'True's. # In other words, the rows that match the condition (rules declared). # Ideally, there should be 1 row. If not apply an extra rule to look for 'HumanCurated'. if len(match_condition_1[match_condition_1].index) > 1: match_condition_1 = ( - (df_subset[SUBJECT_ID] == row_1[SUBJECT_ID]) - & (df_subset[OBJECT_ID] == row_1[OBJECT_ID]) - & (df_subset[CONFIDENCE] == row_1[CONFIDENCE]) + (combined_normalized_subset[SUBJECT_ID] == row_1[SUBJECT_ID]) + & (combined_normalized_subset[OBJECT_ID] == row_1[OBJECT_ID]) + & (combined_normalized_subset[CONFIDENCE] == row_1[CONFIDENCE]) & ( - df_subset[MAPPING_JUSTIFICATION] + combined_normalized_subset[MAPPING_JUSTIFICATION] == SEMAPV.ManualMappingCuration.value ) ) @@ -772,7 +793,9 @@ def deal_with_negation(df: pd.DataFrame) -> pd.DataFrame: reconciled_df_subset = pd.concat( [ reconciled_df_subset, - df_subset.loc[match_condition_1[match_condition_1].index, :], + combined_normalized_subset.loc[ + match_condition_1[match_condition_1].index, : + ], ], ignore_index=True, ) @@ -780,7 +803,7 @@ def deal_with_negation(df: pd.DataFrame) -> pd.DataFrame: # Add negations (PREDICATE_MODIFIER) back to DataFrame # NOTE: negative TRUMPS positive if negative and positive with same # [SUBJECT_ID, OBJECT_ID, PREDICATE_ID] exist - for _, row_2 in df_subset.iterrows(): + for _, row_2 in negation_df.iterrows(): match_condition_2 = ( (reconciled_df_subset[SUBJECT_ID] == row_2[SUBJECT_ID]) & (reconciled_df_subset[OBJECT_ID] == row_2[OBJECT_ID]) diff --git a/tests/test_reconcile.py b/tests/test_reconcile.py index 15eadce2..fca05b05 100644 --- a/tests/test_reconcile.py +++ b/tests/test_reconcile.py @@ -28,7 +28,7 @@ def test_deal_with_negation(self): df1 = deal_with_negation(self.msdf1.df) self.assertEqual(8, len(df1.index)) df2 = deal_with_negation(self.msdf2.df) - self.assertEqual(14, len(df2.index)) + self.assertEqual(12, len(df2.index)) def test_merge(self): """Test merging two tables.""" From 7316bcc18ce04bc2662cd61675acb4cc73a24c7a Mon Sep 17 00:00:00 2001 From: Harshad Hegde Date: Thu, 15 Jun 2023 09:11:20 -0500 Subject: [PATCH 12/12] black formatted --- sssom/util.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sssom/util.py b/sssom/util.py index 921919a1..8396f972 100644 --- a/sssom/util.py +++ b/sssom/util.py @@ -760,9 +760,9 @@ def deal_with_negation(df: pd.DataFrame) -> pd.DataFrame: # GroupBy and SELECT ONLY maximum confidence max_confidence_df: pd.DataFrame - max_confidence_df = combined_normalized_subset.groupby( - TRIPLE_IDS, as_index=False - )[CONFIDENCE].max() + max_confidence_df = combined_normalized_subset.groupby(TRIPLE_IDS, as_index=False)[ + CONFIDENCE + ].max() # If same confidence prefer "HumanCurated". reconciled_df_subset = pd.DataFrame(columns=combined_normalized_subset.columns)