# Mondo RD subset analysis

In [43]:
## Settings

!pip install oaklib sssom --quiet
!pip install --upgrade --quiet  langchain-core langchain-community langchain-openai tabulate

import pandas as pd
import subprocess
from oaklib import get_adapter
from sssom.parsers import parse_sssom_table

# Configure dataframe display: show every column and never truncate cell text.
pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

## Settings
# Short name of the source ontology; it is interpolated into the file names and
# the make target below.
# NOTE(review): `source` is rebound to the OAK adapter further down in this cell,
# so it only holds this string while the paths below are being built.
source = "ncit"
# Root classes of the source ontology whose descendants are to be aligned.
source_roots = ['NCIT:C3262', 'NCIT:C134533']
# CURIE prefix used to recognise identifiers that belong to the source.
source_id_space = "NCIT"

# Input files, all under tmp/.
mondo_db_path = "tmp/mondo.db"
source_db_path = f"tmp/{source}.db"
mondo_mappings_path = "tmp/mondo.sssom.tsv"
source_mappings_path = f"tmp/{source}.sssom.tsv"
negative_matches_path = "tmp/mondo-negative-matches.sssom.tsv"
lexical_matches_path = f"tmp/lexical-matches-{source}.tsv"

# OAK adapter selectors for the two sqlite databases.
mondo_adapter = f"sqlite:{mondo_db_path}"
source_adapter = f"sqlite:{source_db_path}"
# Name of the make target run before loading.
prepare_command = f"update-{source}"

def run_command(command):
    """
    Run an external command and print what it produced.

    `command` is the argument list handed to subprocess.run. When the command
    exits successfully its captured stdout is printed under an "Output:" header.
    When it exits with a non-zero status the captured stderr is printed under an
    "Error:" header instead; the failure is not re-raised.
    """
    try:
        completed = subprocess.run(command, check=True, text=True, capture_output=True)
    except subprocess.CalledProcessError as failure:
        print("Error:")
        print(failure.stderr)
    else:
        print("Output:")
        print(completed.stdout)

# Run the configured make target before loading anything.
# run_command only prints a failure, so the loads below still execute afterwards.
run_command(['make', prepare_command])

# Open both ontologies through OAK adapters.
# NOTE: up to this point `source` was the source name string; from the second
# line on it is the adapter object.
mondo = get_adapter(mondo_adapter)
source = get_adapter(source_adapter)
# Parse the SSSOM tables; later cells read their `.df` attribute.
mondo_mappings =  parse_sssom_table(mondo_mappings_path)
source_mappings =  parse_sssom_table(source_mappings_path)
lexmatch_matches = parse_sssom_table(lexical_matches_path)
negative_matches = parse_sssom_table(negative_matches_path)

[0mOutput:
/Library/Developer/CommandLineTools/usr/bin/make tmp/ncit.db
make[1]: `tmp/ncit.db' is up to date.
/Library/Developer/CommandLineTools/usr/bin/make tmp/mondo.db
make[1]: `tmp/mondo.db' is up to date.
/Library/Developer/CommandLineTools/usr/bin/make	tmp/mondo.sssom.tsv
make[1]: `tmp/mondo.sssom.tsv' is up to date.
/Library/Developer/CommandLineTools/usr/bin/make	tmp/ncit.sssom.tsv
make[1]: `tmp/ncit.sssom.tsv' is up to date.
/Library/Developer/CommandLineTools/usr/bin/make	tmp/mondo-negative-matches.sssom.tsv
make[1]: `tmp/mondo-negative-matches.sssom.tsv' is up to date.
/Library/Developer/CommandLineTools/usr/bin/make tmp/lexical-matches-ncit.tsv SOURCE_ID_SPACE=NCIT
make[1]: `tmp/lexical-matches-ncit.tsv' is up to date.



  df.replace("", np.nan, inplace=True)
  df.replace("", np.nan, inplace=True)
/Users/matentzn/ws/mondo/.venv/lib/python3.11/site-packages/sssom/parsers.py:428: ChainedAssignmentError: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
When using the Copy-on-Write mode, such inplace method never works to update the original DataFrame or Series, because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' instead, to perform the operation inplace on the original object.


  df2[CONFIDENCE].replace(r"^\s*$", np.NaN, regex=True, inplace=True)
  df.replace("", np.nan, inplace=True)
/Users/matentzn/ws/mondo/.venv/lib/python3.11/site-packages/sssom/parsers.py:428: ChainedAssignmentError: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inpla

In [44]:
def sanitize_table(df, mondo, source, default_value=0.5):
    """
    Normalise the confidence column and refresh the subject/object labels.

    Confidence: if the column is missing it is created and filled with
    `default_value`; otherwise values are converted to numbers and anything that
    is not a number in the closed interval [0, 1] is replaced by `default_value`.

    Labels: for each row the label is taken from `mondo.label(id)`, then from
    `source.label(id)`, then from the label already present in the row, and is
    set to '' when none of these is available.

    Parameters:
        df: DataFrame with at least `subject_id` and `object_id` columns.
            It is modified in place.
        mondo, source: objects exposing `label(curie)`.
        default_value: confidence used for missing or invalid values.

    Returns:
        The same DataFrame object that was passed in.
    """
    def _resolve_label(curie, fallback):
        # Ask each ontology once; the first non-null label wins.
        for adapter in (mondo, source):
            found = adapter.label(curie)
            if pd.notnull(found):
                return found
        return fallback if pd.notnull(fallback) else ''

    if 'confidence' not in df.columns:
        df['confidence'] = default_value
    else:
        confidence = pd.to_numeric(df['confidence'], errors='coerce')  # Invalid parsing becomes NaN
        # NaN never satisfies between(), so missing and out-of-range values are
        # replaced in one step, with no chained inplace fillna on a column selection.
        df['confidence'] = confidence.where(confidence.between(0, 1), default_value)

    for id_column, label_column in (('subject_id', 'subject_label'), ('object_id', 'object_label')):
        if label_column in df.columns:
            fallbacks = df[label_column].tolist()
        else:
            fallbacks = [None] * len(df)
        # Built as a plain list so that an empty frame yields an empty column
        # (a row-wise apply on an empty frame does not return a Series).
        df[label_column] = [
            _resolve_label(curie, fallback)
            for curie, fallback in zip(df[id_column].tolist(), fallbacks)
        ]
    return df

df_mondo_mappings = mondo_mappings.df.copy()
# Take an explicit copy of the exact-match rows: sanitize_table writes columns
# into the frame it receives, and writing into a boolean-mask selection of
# df_mondo_mappings is chained assignment.
df_mondo_mappings_exact = sanitize_table(
    df_mondo_mappings[df_mondo_mappings['predicate_id'] == 'skos:exactMatch'].copy(),
    mondo, source, 0.95
)
df_source_mappings = sanitize_table(source_mappings.df.copy(), mondo, source)
df_lexmatch_matches = lexmatch_matches.df.copy()
df_negative_matches = negative_matches.df.copy()
#negative_matches.df = negative_matches.df.astype(str)

df_mondo_mappings_exact

Unnamed: 0,subject_id,subject_label,predicate_id,object_id,object_label,mapping_justification,confidence
0,MONDO:0000001,disease,skos:exactMatch,DOID:4,disease,semapv:UnspecifiedMatching,0.95
1,MONDO:0000001,disease,skos:exactMatch,MEDGEN:4347,,semapv:UnspecifiedMatching,0.95
2,MONDO:0000001,disease,skos:exactMatch,NCIT:C2991,Disease or Disorder,semapv:UnspecifiedMatching,0.95
3,MONDO:0000001,disease,skos:exactMatch,Orphanet:377788,Disease,semapv:UnspecifiedMatching,0.95
4,MONDO:0000001,disease,skos:exactMatch,SCTID:64572001,,semapv:UnspecifiedMatching,0.95
...,...,...,...,...,...,...,...
98855,MONDO:8000030,obsolete morphological anomaly,skos:exactMatch,Orphanet:377791,Morphological anomaly,semapv:UnspecifiedMatching,0.95
98856,MONDO:8000031,obsolete subtype of a disorder,skos:exactMatch,Orphanet:557494,subtype of a disorder,semapv:UnspecifiedMatching,0.95
98857,MONDO:8000032,obsolete malformation syndrome,skos:exactMatch,Orphanet:377789,Malformation syndrome,semapv:UnspecifiedMatching,0.95
98858,MONDO:8000033,obsolete group of disorders,skos:exactMatch,Orphanet:557492,group of disorders,semapv:UnspecifiedMatching,0.95


In [45]:
# Sanity probe: asks the Mondo adapter for the label of a made-up identifier and
# shows whether the result counts as non-null (the check sanitize_table relies on).
pd.notnull(mondo.label("kkk"))

False

In [46]:
def get_one_hop_mappings(df_mondo_mappings, df_source_mappings):
    """
    Derive Mondo-to-source mappings that go through a shared mapped object.

    The two tables are inner-joined on `object_id` and `predicate_id`. Every
    joined pair becomes one row whose subject is the Mondo-side subject, whose
    object is the source-side subject, and whose confidence is the product of
    the two input confidences. Both inputs need a `confidence` column.
    """
    joined = df_mondo_mappings.merge(
        df_source_mappings,
        on=["object_id", "predicate_id"],
        suffixes=("_mondo", "_source"),
    )

    # Keep the three identifying columns, then attach the provenance columns.
    chained = joined[["subject_id_mondo", "subject_id_source", "predicate_id"]].rename(
        columns={"subject_id_mondo": "subject_id", "subject_id_source": "object_id"}
    )
    return chained.assign(
        mapping_justification="semapv:MappingChaining",
        confidence=joined["confidence_mondo"] * joined["confidence_source"],
        comment="Generated by matching the object of a Mondo mapping to the object of a source mapping.",
    )

def _exclude_known_triples(candidates, known):
    """Return the rows of `candidates` whose (subject_id, predicate_id, object_id) triple is absent from `known`."""
    keys = ['subject_id', 'predicate_id', 'object_id']
    flagged = candidates.merge(known[keys], on=keys, how='left', indicator=True)
    # drop() returns a new frame, so nothing is modified through a filtered selection.
    return flagged[flagged['_merge'] == 'left_only'].drop(columns=['_merge'])


def prepare_mapping_candidates(df_mondo_mappings, df_source_mappings, df_lexmatch_matches, df_negative_matches, id_space=None):
    """
    Assemble the candidate mappings between Mondo and the source ontology.

    Parameters:
        df_mondo_mappings: existing Mondo mappings (needs `object_id`, and the
            columns get_one_hop_mappings reads).
        df_source_mappings: mappings of the source ontology.
        df_lexmatch_matches: lexical matches; the starting set of candidates.
        df_negative_matches: triples that must not be proposed.
        id_space: prefix that candidate objects must start with. When omitted,
            the notebook-level `source_id_space` setting is used.

    Returns:
        DataFrame of candidates with a MONDO subject and an object in `id_space`.
    """
    if id_space is None:
        id_space = source_id_space

    # 1. Take all lexical matches found
    candidate_mappings = df_lexmatch_matches.copy()

    # 2. Add the one hop mappings that are not already among the lexical matches
    one_hop_mappings = _exclude_known_triples(
        get_one_hop_mappings(df_mondo_mappings, df_source_mappings),
        candidate_mappings,
    )
    candidate_mappings = pd.concat([candidate_mappings, one_hop_mappings], ignore_index=True)

    # 3. Remove all candidates whose object is already mapped in Mondo
    candidate_mappings = candidate_mappings[~candidate_mappings['object_id'].isin(df_mondo_mappings['object_id'])]

    # 4. Remove all candidates that are recorded as negative matches
    candidate_mappings = _exclude_known_triples(candidate_mappings, df_negative_matches)

    # 5. Remove all mappings that are not between Mondo and the source.
    #    na=False makes rows with a missing identifier fail the test instead of
    #    putting a null into the boolean mask.
    is_mondo_subject = candidate_mappings['subject_id'].str.startswith('MONDO', na=False)
    is_source_object = candidate_mappings['object_id'].str.startswith(id_space, na=False)
    return candidate_mappings[is_mondo_subject & is_source_object]

# Build the candidate set from the exact Mondo mappings, the source mappings,
# the lexical matches and the negative matches.
candidate_mappings = prepare_mapping_candidates(df_mondo_mappings_exact, df_source_mappings, df_lexmatch_matches, df_negative_matches)
# Normalise the confidence values and refresh the labels from the two ontologies.
candidate_mappings = sanitize_table(candidate_mappings, mondo, source)
# Write the candidates as a tab-separated file into the current working directory.
candidate_mappings.to_csv("candidate_mappings.tsv", sep="\t", index=False)
candidate_mappings

/var/folders/vj/ks1_0k8x3t9ftrwcr0t9vjwr0000gn/T/ipykernel_59403/133923016.py:11: ChainedAssignmentError: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
When using the Copy-on-Write mode, such inplace method never works to update the original DataFrame or Series, because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' instead, to perform the operation inplace on the original object.


  df['confidence'].fillna(default_value, inplace=True)  # Replace NaN with default value


Unnamed: 0,subject_id,subject_label,predicate_id,object_id,object_label,mapping_justification,mapping_tool,confidence,subject_match_field,object_match_field,match_string,comment
0,MONDO:0000001,disease,skos:exactMatch,NCIT:C156809,Medical Condition,semapv:LexicalMatching,oaklib,0.800000,oio:hasExactSynonym,rdfs:label,medical condition,
1,MONDO:0000001,disease,skos:exactMatch,NCIT:C25457,Condition,semapv:LexicalMatching,oaklib,0.800000,oio:hasExactSynonym,rdfs:label,condition,
2,MONDO:0000004,adrenocortical insufficiency,skos:exactMatch,NCIT:C113211,Hypocortisolemia,semapv:LexicalMatching,oaklib,0.800000,oio:hasExactSynonym,rdfs:label,hypocortisolemia,
3,MONDO:0000328,hyperphosphatemia,skos:exactMatch,NCIT:C113750,Hyperphosphatemia,semapv:LexicalMatching,oaklib,0.800000,oio:hasExactSynonym,rdfs:label,hyperphosphatemia,
4,MONDO:0000328,hyperphosphatemia,skos:exactMatch,NCIT:C113750,Hyperphosphatemia,semapv:LexicalMatching,oaklib,0.849779,rdfs:label,rdfs:label,hyperphosphatemia,
...,...,...,...,...,...,...,...,...,...,...,...,...
2598,MONDO:0956990,"supratentorial ependymoma, ZFTA fusion–positive",skos:exactMatch,NCIT:C186350,Supratentorial Ependymoma ZFTA Fusion-Positive,semapv:MappingChaining,,0.475000,,,,Generated by matching the object of a Mondo mapping to the object of a source mapping.
2599,MONDO:0956991,"supratentorial ependymoma, YAP1 fusion–positive",skos:exactMatch,NCIT:C186351,Supratentorial Ependymoma YAP1 Fusion-Positive,semapv:MappingChaining,,0.475000,,,,Generated by matching the object of a Mondo mapping to the object of a source mapping.
2600,MONDO:0956992,posterior fossa group A ependymoma,skos:exactMatch,NCIT:C186450,"Posterior Fossa Ependymoma, Group A (PFA)",semapv:MappingChaining,,0.475000,,,,Generated by matching the object of a Mondo mapping to the object of a source mapping.
2601,MONDO:0956993,posterior fossa group B ependymoma,skos:exactMatch,NCIT:C186451,"Posterior Fossa Ependymoma, Group B (PFB)",semapv:MappingChaining,,0.475000,,,,Generated by matching the object of a Mondo mapping to the object of a source mapping.


[Back](#overview)

---
<a id="download"></a>
### Find unmapped source diseases and index their candidate mappings


In [47]:
from oaklib.datamodels.vocabulary import IS_A

def get_unmapped_disease_list(source_diseases_to_align, mondo_mappings):
    """
    Return the source diseases that do not occur as an object in `mondo_mappings`.

    Parameters:
        source_diseases_to_align: iterable of source disease identifiers.
        mondo_mappings: DataFrame with an `object_id` column.

    Returns:
        List of the unmapped identifiers, in the order they were given.
    """
    # Collect the mapped identifiers once instead of scanning the whole table
    # for every source disease.
    mapped_ids = set(mondo_mappings["object_id"])
    return [disease for disease in source_diseases_to_align if disease not in mapped_ids]

def create_matches_index_for_unmapped_disease(unmapped_source_diseases, candidate_mappings):
    """
    Map each unmapped disease to the candidate rows that have it as object.

    The result is a dict keyed by disease identifier. Each value is the slice of
    `candidate_mappings` whose `object_id` equals that identifier; the slice is
    empty when there is no candidate for the disease.
    """
    object_ids = candidate_mappings["object_id"]
    return {
        disease: candidate_mappings[object_ids == disease]
        for disease in unmapped_source_diseases
    }

# Get all the descendants of the source roots. The goal is to align all diseases under these roots.
# Sorted so that the iteration order, and with it which diseases a count-limited
# review reaches first, is the same on every run; a bare set has no stable order.
source_diseases_to_align = sorted(set(source.descendants(source_roots, predicates=[IS_A])))

# Get all the diseases that are not yet mapped to Mondo
unmapped_source_diseases = get_unmapped_disease_list(source_diseases_to_align, df_mondo_mappings_exact)

# Generate a dictionary that maps each unmapped source disease to its candidate mappings
matches_index = create_matches_index_for_unmapped_disease(unmapped_source_diseases, candidate_mappings)


In [48]:
import getpass
import os
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import ChatPromptTemplate

#os.environ["OPENAI_API_KEY"] = getpass.getpass()

# Chat model that the review chain is built on.
# NOTE(review): presumably picks up OPENAI_API_KEY from the environment (see the
# commented-out getpass line above) — confirm before running on a fresh machine.
model = ChatOpenAI(model="gpt-4o")

In [49]:
from pydantic import BaseModel, Field
from typing import List, Optional
from datetime import date
#from sssom_pydantic import Mapping

# Assuming PredicateModifierEnum, EntityTypeEnum, MappingCardinalityEnum are defined elsewhere

class Mapping(BaseModel):
    """
    Represents an individual mapping between a pair of entities.
    """
    # NOTE(review): the field descriptions below are presumably part of the schema
    # text that reaches the model through the output parser's format instructions,
    # so treat them as prompt text. The class docstring is left as it was for the
    # same reason.
    # Only subject_id, predicate_id, object_id and mapping_justification are
    # required; every other field has a default.
    subject_id: str = Field(..., description="The ID of the subject of the mapping.")
    subject_label: Optional[str] = Field(None, description="The label of subject of the mapping.")
    subject_category: Optional[str] = Field(None, description="The conceptual category to which the subject belongs to. This can be a string denoting the category or a term from a controlled vocabulary. This slot is deliberately underspecified. Conceptual categories can range from those that are found in general upper ontologies such as BFO (e.g. process, temporal region, etc) to those that serve as upper ontologies in specific domains, such as COB or BioLink (e.g. gene, disease, chemical entity). The purpose of this optional field is documentation for human reviewers - when a category is known and documented clearly, the cost of interpreting and evaluating the mapping decreases.")
    predicate_id: str = Field(..., description="The ID of the predicate or relation that relates the subject and object of this match.")
    predicate_label: Optional[str] = Field(None, description="The label of the predicate/relation of the mapping.")
    predicate_modifier: Optional[str] = Field(None, description="A modifier for negating the predicate. See https://github.com/mapping-commons/sssom/issues/40 for discussion.")
    object_id: str = Field(..., description="The ID of the object of the mapping.")
    object_label: Optional[str] = Field(None, description="The label of object of the mapping.")
    # Fixed: this description used to say "subject" (copied from subject_category).
    object_category: Optional[str] = Field(None, description="The conceptual category to which the object belongs to. This can be a string denoting the category or a term from a controlled vocabulary. This slot is deliberately underspecified. Conceptual categories can range from those that are found in general upper ontologies such as BFO (e.g. process, temporal region, etc) to those that serve as upper ontologies in specific domains, such as COB or BioLink (e.g. gene, disease, chemical entity). The purpose of this optional field is documentation for human reviewers - when a category is known and documented clearly, the cost of interpreting and evaluating the mapping decreases.")
    mapping_justification: str = Field(..., description="A mapping justification is an action (or the written representation of that action) of showing a mapping to be right or reasonable.")
    author_id: Optional[List[str]] = Field(default_factory=list, description="Identifies the persons or groups responsible for asserting the mappings. Recommended to be a list of ORCIDs or otherwise identifying URIs.")
    author_label: Optional[List[str]] = Field(default_factory=list, description="A string identifying the author of this mapping. In the spirit of provenance, consider using author_id instead.")
    reviewer_id: Optional[List[str]] = Field(default_factory=list, description="Identifies the persons or groups that reviewed and confirmed the mapping. Recommended to be a list of ORCIDs or otherwise identifying URIs.")
    reviewer_label: Optional[List[str]] = Field(default_factory=list, description="A string identifying the reviewer of this mapping. In the spirit of provenance, consider using reviewer_id instead.")
    creator_id: Optional[List[str]] = Field(default_factory=list, description="Identifies the persons or groups responsible for the creation of the mapping. The creator is the agent that put the mapping in its published form, which may be different from the author, which is a person that was actively involved in the assertion of the mapping. Recommended to be a list of ORCIDs or otherwise identifying URIs.")
    creator_label: Optional[List[str]] = Field(default_factory=list, description="A string identifying the creator of this mapping. In the spirit of provenance, consider using creator_id instead.")
    license: Optional[str] = Field(None, description="A url to the license of the mapping. In absence of a license we assume no license.")
    subject_type: Optional[str] = Field(None, description="The type of entity that is being mapped.")
    subject_source: Optional[str] = Field(None, description="URI of vocabulary or identifier source for the subject.")
    subject_source_version: Optional[str] = Field(None, description="Version IRI or version string of the source of the subject term.")
    object_type: Optional[str] = Field(None, description="The type of entity that is being mapped.")
    object_source: Optional[str] = Field(None, description="URI of vocabulary or identifier source for the object.")
    object_source_version: Optional[str] = Field(None, description="Version IRI or version string of the source of the object term.")
    mapping_provider: Optional[str] = Field(None, description="URL pointing to the source that provided the mapping, for example an ontology that already contains the mappings, or a database from which it was derived.")
    mapping_source: Optional[str] = Field(None, description="The mapping set this mapping was originally defined in. mapping_source is used for example when merging multiple mapping sets or deriving one mapping set from another.")
    mapping_cardinality: Optional[str] = Field(None, description="A string indicating whether this mapping is from a 1:1 (the subject_id maps to a single object_id), 1:n (the subject maps to more than one object_id), n:1, 1:0, 0:1 or n:n group. Note that this is a convenience field that should be derivable from the mapping set.")
    mapping_tool: Optional[str] = Field(None, description="A reference to the tool or algorithm that was used to generate the mapping. Should be a URL pointing to more info about it, but can be free text.")
    mapping_tool_version: Optional[str] = Field(None, description="Version string that denotes the version of the mapping tool used.")
    mapping_date: Optional[date] = Field(None, description="The date the mapping was asserted. This is different from the date the mapping was published or compiled in a SSSOM file.")
    publication_date: Optional[date] = Field(None, description="The date the mapping was published. This is different from the date the mapping was asserted.")
    confidence: Optional[float] = Field(None, description="A score between 0 and 1 to denote the confidence or probability that the match is correct, where 1 denotes total confidence.")
    curation_rule: Optional[List[str]] = Field(default_factory=list, description="A curation rule is a (potentially) complex condition executed by an agent that led to the establishment of a mapping. Curation rules often involve complex domain-specific considerations, which are hard to capture in an automated fashion. The curation rule is captured as a resource rather than a string, which enables higher levels of transparency and sharing across mapping sets. The URI representation of the curation rule is expected to be a resolvable identifier which provides details about the nature of the curation rule.")
    curation_rule_text: Optional[List[str]] = Field(default_factory=list, description="A curation rule is a (potentially) complex condition executed by an agent that led to the establishment of a mapping. Curation rules often involve complex domain-specific considerations, which are hard to capture in an automated fashion. The curation rule should be captured as a resource (entity reference) rather than a string (see curation_rule element), which enables higher levels of transparency and sharing across mapping sets. The textual representation of curation rule is intended to be used in cases where (1) the creation of a resource is not practical from the perspective of the mapping_provider and (2) as an additional piece of metadata to augment the curation_rule element with a human readable text.")
    subject_match_field: Optional[List[str]] = Field(default_factory=list, description="A list of properties (term annotations on the subject) that was used for the match.")
    object_match_field: Optional[List[str]] = Field(default_factory=list, description="A list of properties (term annotations on the object) that was used for the match.")
    match_string: Optional[List[str]] = Field(default_factory=list, description="String that is shared by subj/obj. It is recommended to indicate the fields for the match using the object and subject_match_field slots.")
    subject_preprocessing: Optional[List[str]] = Field(default_factory=list, description="Method of preprocessing applied to the fields of the subject. If different preprocessing steps were performed on different fields, it is recommended to store the match in separate rows.")
    object_preprocessing: Optional[List[str]] = Field(default_factory=list, description="Method of preprocessing applied to the fields of the object. If different preprocessing steps were performed on different fields, it is recommended to store the match in separate rows.")
    semantic_similarity_score: Optional[float] = Field(None, description="A score between 0 and 1 to denote the semantic similarity, where 1 denotes equivalence.")
    semantic_similarity_measure: Optional[str] = Field(None, description="The measure used for computing the the semantic similarity score. To make processing this field as unambiguous as possible, we recommend using wikidata identifiers, but wikipedia pages could also be acceptable.")
    see_also: Optional[List[str]] = Field(default_factory=list, description="""A URL specific for the mapping instance. E.g. for kboom we have a per-mapping image that shows surrounding axioms that drive probability. Could also be a github issue URL that discussed a complicated alignment""")
    issue_tracker_item: Optional[str] = Field(None, description="""The issue tracker item discussing this mapping.""")
    other: Optional[str] = Field(None, description="""Pipe separated list of key value pairs for properties not part of the SSSOM spec. Can be used to encode additional provenance data.""")
    comment: Optional[str] = Field(None, description="""Free text field containing either curator notes or text generated by tool providing additional informative information.""")

# Wrapper model: one object whose "mappings" field holds a list of Mapping rows.
# Documented with comments rather than a docstring on purpose.
# NOTE(review): a class docstring would presumably be copied into the generated
# schema and so change the format instructions — confirm before adding one.
class MappingList(BaseModel):
    mappings: List[Mapping]

In [52]:
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate

# Parser for the model's reply, configured with the MappingList model.
output_parser = JsonOutputParser(pydantic_object=MappingList)

# Review prompt. {disease}, {definition} and {mappings} are filled in per call;
# {format_instructions} is fixed here from the output parser.
# The wording is sent to the model verbatim; two typos were corrected
# ("has has" and "evaluate set of").
prompt = PromptTemplate(
    template="""
        You are a medical terminology specialist. 
        
        Your task is to find an appropriate match for the disease "{disease}" 
        in the Mondo Ontology from a set of candidate mappings.
        
        {disease} has the following definition in the source ontology: {definition}.
        
        You will evaluate a set of candidate mappings in SSSOM format. If you believe a mapping is wrong,
        set the predicate_modifier to "not" and provide a comment explaining why you believe the mapping is incorrect.
        
        Do not duplicate rows, the number of reviewed mappings should be equal to the number of candidate mappings.
             
        Candidate mappings:
        {mappings}
        
        {format_instructions}
    """,
    input_variables=["disease", "definition", "mappings"],
    partial_variables={"format_instructions": output_parser.get_format_instructions()},
)

# Prompt -> chat model -> JSON parser.
chain = prompt | model | output_parser

In [53]:
def review_mappings(matches_index, chain, source, count):
    """Ask the LLM chain to review candidate mappings, one source disease at a time.

    Only diseases with more than one candidate mapping are sent to the chain;
    the others are skipped without any lookup or LLM call.

    Args:
        matches_index: dict-like of source disease id -> candidate mappings table.
            Each value must support ``len()`` and ``to_markdown(index=False)``.
        chain: object exposing ``invoke(dict)``; expected to return a dict with a
            ``"mappings"`` list of reviewed mapping records.
        source: adapter exposing ``label(id)`` and ``definition(id)``.
        count: maximum number of diseases to send to the chain. A value of
            ``0`` or less sends nothing.

    Returns:
        pandas.DataFrame with one row per reviewed mapping returned by the chain
        (empty when nothing was reviewed).
    """
    reviewed_mappings = []
    remaining = count
    for disease, matches in matches_index.items():
        # Check the budget *before* invoking, so count <= 0 never costs an LLM call.
        if remaining <= 0:
            break
        if len(matches) <= 1:
            continue
        # Label/definition are only needed for diseases that are actually reviewed.
        result = chain.invoke({
            "mappings": matches.to_markdown(index=False),
            "disease": source.label(disease),
            "definition": source.definition(disease),
        })
        # The call was made, so it counts against the budget even if unusable.
        remaining -= 1
        mappings = result.get("mappings") if isinstance(result, dict) else None
        if not isinstance(mappings, list):
            # A single malformed reply must not discard the reviews collected so far.
            print(f"Skipping {disease}: chain response has no 'mappings' list")
            continue
        reviewed_mappings.extend(mappings)
    return pd.DataFrame(reviewed_mappings)

# Upper bound on how many diseases are sent to the LLM chain in this run.
count = 15

# Run the review and persist the reviewed mappings as a tab-separated file.
df_reviewed_mappings = review_mappings(
    matches_index=matches_index,
    chain=chain,
    source=source,
    count=count,
)
df_reviewed_mappings.to_csv("reviewed_mappings.tsv", index=False, sep="\t")

# Last expression of the cell: display the reviewed mappings.
df_reviewed_mappings

Unnamed: 0,subject_id,subject_label,predicate_id,predicate_modifier,object_id,object_label,mapping_justification,mapping_tool,confidence,subject_match_field,object_match_field,match_string,comment
0,MONDO:0016733,ganglioglioma,skos:closeMatch,not,NCIT:C27362,Childhood Ganglioglioma,semapv:LexicalMatching,oaklib,0.5,oio:hasNarrowSynonym,rdfs:label,childhood ganglioglioma,"The term 'ganglioglioma' is a broader category that includes both childhood and adult cases, whereas 'Childhood Ganglioglioma' is specific to pediatric cases. Therefore, it is not appropriate to consider them as close matches."
1,MONDO:0016733,ganglioglioma,skos:exactMatch,not,NCIT:C27362,Childhood Ganglioglioma,semapv:MappingChaining,,0.475,,,,"The term 'ganglioglioma' encompasses both childhood and adult forms of the disease, whereas 'Childhood Ganglioglioma' is specific to pediatric cases. Therefore, it is incorrect to consider them as exact matches."
2,MONDO:0022096,pyogenic granuloma,skos:closeMatch,not,NCIT:C3480,Lobular Capillary Hemangioma,semapv:LexicalMatching,oaklib,0.5,[oio:hasRelatedSynonym],[rdfs:label],lobular capillary hemangioma,"Pyogenic granuloma, although sometimes used synonymously with lobular capillary hemangioma, can present in various forms not strictly classified as lobular capillary hemangiomas. Therefore, this mapping may not always hold true."
3,MONDO:0022096,pyogenic granuloma,skos:closeMatch,not,NCIT:C3480,Lobular Capillary Hemangioma,semapv:LexicalMatching,oaklib,0.5,[oio:hasRelatedSynonym],[rdfs:label],lobular capillary hemangioma,"Pyogenic granuloma, although sometimes used synonymously with lobular capillary hemangioma, can present in various forms not strictly classified as lobular capillary hemangiomas. Therefore, this mapping may not always hold true."
4,MONDO:0006292,malignant mesothelioma,skos:closeMatch,,NCIT:C8420,Diffuse Malignant Mesothelioma,semapv:LexicalMatching,oaklib,0.5,oio:hasNarrowSynonym,rdfs:label,diffuse malignant mesothelioma,
5,MONDO:0006292,malignant mesothelioma,skos:exactMatch,not,NCIT:C8420,Diffuse Malignant Mesothelioma,semapv:MappingChaining,,0.475,,,,The term 'malignant mesothelioma' is broader and not an exact match for 'Diffuse Malignant Mesothelioma'.
6,MONDO:0007959,medulloblastoma,skos:closeMatch,not,NCIT:C27294,Localized Primitive Neuroectodermal Tumor,semapv:LexicalMatching,oaklib,0.5,[oio:hasRelatedSynonym],[rdfs:label],localized primitive neuroectodermal tumor,Medulloblastoma and Localized Primitive Neuroectodermal Tumor are distinct entities; medulloblastoma is a specific type of primitive neuroectodermal tumor but not synonymous with localized primitive neuroectodermal tumor.
7,MONDO:0007959,medulloblastoma,skos:exactMatch,not,NCIT:C27294,Localized Primitive Neuroectodermal Tumor,semapv:MappingChaining,,0.475,,,,Medulloblastoma and Localized Primitive Neuroectodermal Tumor are distinct entities; medulloblastoma is a specific type of primitive neuroectodermal tumor but not synonymous with localized primitive neuroectodermal tumor.
8,MONDO:0006292,malignant mesothelioma,skos:closeMatch,not,NCIT:C27926,Asbestos-Related Malignant Mesothelioma,semapv:LexicalMatching,oaklib,0.5,[oio:hasNarrowSynonym],[rdfs:label],[asbestos-related malignant mesothelioma],"Malignant mesothelioma is a broader category that includes asbestos-related malignant mesothelioma as a subset. Therefore, this should not be a 'closeMatch'."
9,MONDO:0006292,malignant mesothelioma,skos:exactMatch,not,NCIT:C27926,Asbestos-Related Malignant Mesothelioma,semapv:MappingChaining,,0.475,,,,"Malignant mesothelioma is a more general term than asbestos-related malignant mesothelioma. Therefore, they are not exact matches."


In [57]:
# Keep only the reviewed mappings asserted as exact matches.
df_reviewed_mappings_exact = df_reviewed_mappings[df_reviewed_mappings['predicate_id']=="skos:exactMatch"]

# The modifier is free text coming back from the LLM, so compare it
# case-insensitively and ignore surrounding whitespace ("Not", "NOT ", ...).
# Missing values become the string "nan" and therefore count as "not negated",
# which matches how a plain `!= "not"` comparison treats them.
is_negated = (
    df_reviewed_mappings_exact['predicate_modifier']
    .astype(str)
    .str.strip()
    .str.lower()
    == "not"
)

# Split the exact matches into accepted and rejected (negated) mappings.
df_reviewed_mappings_exact_correct = df_reviewed_mappings_exact[~is_negated]
df_reviewed_mappings_exact_not = df_reviewed_mappings_exact[is_negated]
df_reviewed_mappings_exact_correct

Unnamed: 0,subject_id,subject_label,predicate_id,predicate_modifier,object_id,object_label,mapping_justification,mapping_tool,confidence,subject_match_field,object_match_field,match_string,comment
11,MONDO:0016824,infantile myofibromatosis,skos:exactMatch,,NCIT:C27498,Infantile Hemangiopericytoma,semapv:LexicalMatching,oaklib,0.8,[oio:hasExactSynonym],[rdfs:label],[infantile hemangiopericytoma],"Infantile myofibromatosis and Infantile Hemangiopericytoma are different conditions. Infantile myofibromatosis is characterized by the presence of multiple myofibromas, while Infantile Hemangiopericytoma is a rare vascular tumor. The lexical match does not imply they are the same disease."
12,MONDO:0004520,intratubular embryonal carcinoma,skos:exactMatch,,NCIT:C192096,Intratubular Embryonal Carcinoma,semapv:LexicalMatching,oaklib,0.8,[oio:hasExactSynonym],[rdfs:label],[intratubular embryonal carcinoma],
13,MONDO:0004520,intratubular embryonal carcinoma,skos:exactMatch,,NCIT:C192096,Intratubular Embryonal Carcinoma,semapv:LexicalMatching,oaklib,0.849779,[rdfs:label],[rdfs:label],[intratubular embryonal carcinoma],
15,MONDO:0850335,IDH-wildtype glioblastoma,skos:exactMatch,,NCIT:C39750,"Glioblastoma, IDH-Wildtype",semapv:MappingChaining,,0.475,,,,"This mapping is correct as 'IDH-wildtype glioblastoma' accurately matches 'Glioblastoma, IDH-Wildtype'."
42,MONDO:0002368,papillary serous cystadenocarcinoma,skos:exactMatch,,NCIT:C4182,Serous Surface Papillary Carcinoma,semapv:MappingChaining,,0.475,,,,Generated by matching the object of a Mondo mapping to the object of a source mapping.
44,MONDO:0003125,testicular sex cord-stromal neoplasm,skos:exactMatch,,NCIT:C39948,Malignant Testicular Sex Cord-Stromal Tumor,semapv:MappingChaining,,0.475,,,,Generated by matching the object of a Mondo mapping to the object of a source mapping.
46,MONDO:0850303,supratentorial meningioma,skos:exactMatch,,NCIT:C7048,Supratentorial Meningioma,semapv:LexicalMatching,oaklib,0.849779,rdfs:label,rdfs:label,supratentorial meningioma,


In [62]:
# Existing Mondo exact mappings whose object lives in the source id space,
# tagged with a review_id marking them as coming from Mondo itself.
# `.assign` builds a new frame instead of writing into a filtered slice, which
# avoids SettingWithCopyWarning and keeps the cell safe to re-run.
# `na=False` treats rows without an object_id as non-matching instead of
# producing a non-boolean mask.
df_mondo_mappings_exact_source = (
    df_mondo_mappings_exact[
        df_mondo_mappings_exact['object_id'].str.startswith(source_id_space, na=False)
    ]
    .assign(review_id="MONDO:MONDO")
)

# Curated Mondo mappings followed by the LLM-accepted exact matches.
df_mondo_mappings_exact_source_with_matches = pd.concat(
    [df_mondo_mappings_exact_source,
    df_reviewed_mappings_exact_correct]
)

# Report every Mondo term that ends up with more than one exact mapping.
# A single groupby pass replaces one full-frame scan per unique subject;
# sort=False keeps subjects in order of first appearance.
for mondo_id, all_mappings in df_mondo_mappings_exact_source_with_matches.groupby('subject_id', sort=False):
    if len(all_mappings) > 1:
        print(f"{mondo_id} has multiple mappings: {all_mappings['object_id'].tolist()}")

df_mondo_mappings_exact_source_with_matches.to_csv("mondo_mappings_exact_source_with_matches.tsv", sep="\t", index=False)
df_mondo_mappings_exact_source_with_matches

MONDO:0001147 has multiple mappings: ['NCIT:C101209', 'NCIT:C105595']
MONDO:0001748 has multiple mappings: ['NCIT:C3540', 'NCIT:C9332']
MONDO:0002119 has multiple mappings: ['NCIT:C173820', 'NCIT:C8422']
MONDO:0002142 has multiple mappings: ['NCIT:C114541', 'NCIT:C4247']
MONDO:0002277 has multiple mappings: ['NCIT:C34398', 'NCIT:C34403']
MONDO:0002368 has multiple mappings: ['NCIT:C8377', 'NCIT:C4182']
MONDO:0003125 has multiple mappings: ['NCIT:C6358', 'NCIT:C39948']
MONDO:0003143 has multiple mappings: ['NCIT:C2874', 'NCIT:C4488']
MONDO:0003947 has multiple mappings: ['NCIT:C3990', 'NCIT:C84783']
MONDO:0004473 has multiple mappings: ['NCIT:C35697', 'NCIT:C4836']
MONDO:0004520 has multiple mappings: ['NCIT:C7325', 'NCIT:C192096', 'NCIT:C192096']
MONDO:0004795 has multiple mappings: ['NCIT:C3299', 'NCIT:C79601']
MONDO:0005244 has multiple mappings: ['NCIT:C119734', 'NCIT:C4731']
MONDO:0005272 has multiple mappings: ['NCIT:C2872', 'NCIT:C82591']
MONDO:0005283 has multiple mappings: ['NC

Unnamed: 0,subject_id,subject_label,predicate_id,object_id,object_label,mapping_justification,confidence,review_id,predicate_modifier,mapping_tool,subject_match_field,object_match_field,match_string,comment
2,MONDO:0000001,disease,skos:exactMatch,NCIT:C2991,Disease or Disorder,semapv:UnspecifiedMatching,0.950000,MONDO:MONDO,,,,,,
9,MONDO:0000004,adrenocortical insufficiency,skos:exactMatch,NCIT:C26691,Adrenocortical Insufficiency,semapv:UnspecifiedMatching,0.950000,MONDO:MONDO,,,,,,
22,MONDO:0000022,nocturnal enuresis,skos:exactMatch,NCIT:C118172,Nocturnal Enuresis,semapv:UnspecifiedMatching,0.950000,MONDO:MONDO,,,,,,
61,MONDO:0000087,polymicrogyria,skos:exactMatch,NCIT:C116936,Polymicrogyria,semapv:UnspecifiedMatching,0.950000,MONDO:MONDO,,,,,,
68,MONDO:0000088,precocious puberty,skos:exactMatch,NCIT:C79704,Precocious Puberty,semapv:UnspecifiedMatching,0.950000,MONDO:MONDO,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13,MONDO:0004520,intratubular embryonal carcinoma,skos:exactMatch,NCIT:C192096,Intratubular Embryonal Carcinoma,semapv:LexicalMatching,0.849779,,,oaklib,[rdfs:label],[rdfs:label],[intratubular embryonal carcinoma],
15,MONDO:0850335,IDH-wildtype glioblastoma,skos:exactMatch,NCIT:C39750,"Glioblastoma, IDH-Wildtype",semapv:MappingChaining,0.475000,,,,,,,"This mapping is correct as 'IDH-wildtype glioblastoma' accurately matches 'Glioblastoma, IDH-Wildtype'."
42,MONDO:0002368,papillary serous cystadenocarcinoma,skos:exactMatch,NCIT:C4182,Serous Surface Papillary Carcinoma,semapv:MappingChaining,0.475000,,,,,,,Generated by matching the object of a Mondo mapping to the object of a source mapping.
44,MONDO:0003125,testicular sex cord-stromal neoplasm,skos:exactMatch,NCIT:C39948,Malignant Testicular Sex Cord-Stromal Tumor,semapv:MappingChaining,0.475000,,,,,,,Generated by matching the object of a Mondo mapping to the object of a source mapping.
