In [1]:
%reset -f

# Code examples interacting with Mondo

## Install oaklib
 - Installation of oaklib can be found in the [documentation here](https://incatools.github.io/ontology-access-kit/intro/tutorial01.html)
   

In [6]:
from pathlib import Path
from oaklib import get_adapter
from oaklib.datamodels.vocabulary import IS_A, PART_OF, SEMAPV
import pandas as pd
import curies
import llm
import re
import logging
import json

### LOGGING SETTINGS
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)
logging.getLogger().addHandler(logging.NullHandler())

### LOAD OTHER LIBRARIES
converter = curies.get_obo_converter()

### LLM SETTINGS
model = llm.get_model("gpt-4")
model.key = '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'
response = model.prompt("Five surprising names for a pet pelican")
print(response.text())

### LOAD MONDO
mondo = get_adapter(f"sqlite:obo:mondo")


1. Puffington
2. Snorkel Gobbler
3. Dupree Diver
4. Picklebeak
5. Sir Squawks-a-Lot


In [13]:
def valid_relation(relation):
    prefix = converter.parse_curie(relation)[0]
    if prefix:
        if re.match("^[A-Za-z]+$", prefix):
            return True
    return False

def normalise_assessment(assessment):
    assessment = assessment.strip()
    assessment = re.sub("[\.\?\!]+$", "", assessment)
    assessment = re.sub("^[\.\?\!]+", "", assessment)
    assessment = assessment.lower()
    if assessment in ["yes", "no"]:
        return assessment
    return "INVALID"

def normalize_confidence(confidence):
    try:
        # Convert to float and check if the value is between 0 and 1
        confidence = float(confidence)
        if 0 <= confidence <= 1:
            return confidence
    except (TypeError, ValueError):
        # Catch exceptions if confidence is not a number or cannot be converted to float
        pass
    return "INVALID"

def normalize_label(label):
    label = label.strip()
    label = label.replace("\\n", "")
    label = re.sub("[\.\?\!]+$", "", label)
    label = re.sub("^[\.\?\!]+", "", label)
    return label

In [8]:
prompt_review_rels = """You are a clinical terminology expert.

Your task is to determine if %s (%s) %s (%s) of %s (%s).

A subclass (one kind of relation) is a more specific type of the parent class, 
but we are interested in other relationships as well,
like phenotype associations and anatomical locations.

Provide your assessment as a yes/no answer, and provide 
a number between 0 and 1 to indicate your confidence in your answer.
This confidence score should reflect how strongly you, as a clinical terminology expert,
feel that your assessment is correct.
Return the relationship_id as a CURIE, e.g. RO:0004029, rdfs:subClassOf.

Provide your answer as a json dictionary of the following keys: "assessment", "confidence", "explanation of assessment".
Do not include any other comments in your output.
"""

def review_relationships_in_branch(branch, prompt, ontology, ct = 10, map_rels_to_labels = {}):
    data = []

    for disease in set(ontology.descendants([branch])):
        disease_name = ontology.label(disease)
        outgoing_relationships = ontology.outgoing_relationship_map(disease)
        relationships = outgoing_relationships.keys()
        for relationship in relationships:
            relationship_label = map_rels_to_labels.get(relationship, relationship)
            disease_related_entities = outgoing_relationships[relationship]
            for related_entity in disease_related_entities:
                if not valid_relation(related_entity):
                    print(f"{related_entity} is not a valid relation")
                    continue
                related_entity_name = ontology.label(related_entity)
                response = model.prompt(prompt % (disease_name, disease, relationship_label, relationship, related_entity_name, related_entity))
                try:
                    row = json.loads(response.text())
                except:
                    logging.error(f"Error parsing response: {response.text()}")
                    continue
                row["disease_id"] = disease
                row["disease_name"] = disease_name
                row["relationship"] = relationship
                row["related_entity_id"] = related_entity
                row["related_entity_name"] = related_entity_name
                row["assessment"] = normalise_assessment(row["assessment"])
                row["confidence"] = normalize_confidence(row["confidence"])
                data.append(row)
                print(f"{ct}: {row}")
                ct = ct - 1
                if ct <= 0:
                    break
            if ct <= 0:
                break
        if ct <= 0:
            break

    df = pd.DataFrame(data)
    return df[["disease_id", "disease_name", "relationship", "related_entity_id", "related_entity_name", "assessment", "confidence", "explanation of assessment"]]


In [9]:
map_rels_to_labels = {
    'rdfs:subClassOf': 'is a subclass',
    'RO:0004029': 'is associated with a phenotype',
}

branch = "MONDO:0018076"

df = review_relationships_in_branch(branch, prompt_review_rels, mondo, ct=1, map_rels_to_labels=map_rels_to_labels)
df.to_csv("../../ontology/reports/mondo_subclass_review.tsv", index=False, sep="\t")

1: {'assessment': 'yes', 'confidence': 0.95, 'explanation of assessment': 'Tuberculous epididymitis can be considered a more specific type of epididymitis, as it is epididymitis specifically caused by tuberculosis. Therefore, it could be classified as a subclass (rdfs:subClassOf) of epididymitis.', 'relationship_id': 'rdfs:subClassOf', 'disease_id': 'MONDO:0001537', 'disease_name': 'tuberculous epididymitis', 'relationship': 'rdfs:subClassOf', 'related_entity_id': 'MONDO:0004779', 'related_entity_name': 'epididymitis'}


In [10]:
branches = []
for branch in mondo.incoming_relationships("MONDO:0700096", predicates=["rdfs:subClassOf"]):
    branches.append(branch[1])
print(len(branches))

41


In [20]:
#response = conversation.prompt("Five fun facts about pelicans")
#print(response.text())

branches_map_ids = {branch: mondo.label(branch) for branch in branches}
branches_map_labels = {value: key for key, value in branches_map_ids.items()}

branches_list = "\n".join([f"{mondo.label(branch)}" for branch in branches])

prompt_review_branches = f"""You are a clinical terminology expert.

We have the following groups of diseases in our disease ontology: 
{ branches_list }

The task is to determine to which of these groups of diseases %s (%s) 
should be grouped under.

Provide your answer as a list of json dictionaries. 
The individual json dictionaries have the following keys: "grouping", "confidence", "explanation of grouping".
The value of grouping is a disease name from the list above.
The value of confidence is a number between 0 and 1 to indicate your confidence in your answer.
The value of explanation of grouping is a string explaining why you think the disease should be grouped under the given grouping.
Do not include any other comments in your output.
"""

#print(prompt_review_branches)

def get_confidence(assessments, grouping):
    for assessment in assessments:
        if assessment["grouping"] == grouping:
            return assessment["confidence"]
    return "INVALID"

def get_explanation_of_grouping(assessments, grouping):
    for assessment in assessments:
        if assessment["grouping"] == grouping:
            return assessment["explanation of grouping"]
    return "INVALID"

def review_branch_membership(branch, prompt, ontology, branches_map_labels, ct = 10):
    data = []

    for disease in set(ontology.descendants([branch])):
        disease_name = ontology.label(disease)
        ancestors = ontology.ancestors(disease)
        branch_ancestors_mondo = list(set(ancestors) & set(branches))
        prompt_populated = prompt % (disease_name, disease)
        response = model.prompt(prompt_populated)
        ct = ct - 1
        if ct <= 0:
            break
        try:
            branch_ancestor_assessments_gpt = json.loads(response.text())
        except:
            print(f"Error parsing response: {response.text()}")
            continue
        print(branch_ancestor_assessments_gpt)
        branch_ancestors_gpt = []
        branch_ancestor_labels_gpt = [branch_ancestor_assessment_gpt["grouping"] for branch_ancestor_assessment_gpt in branch_ancestor_assessments_gpt]
        
        for branch_ancestor_label_gpt in branch_ancestor_labels_gpt:
            branch_ancestor_label_gpt = normalize_label(branch_ancestor_label_gpt)
            if branch_ancestor_label_gpt in branches_map_labels:
                branch_ancestor_gpt = branches_map_labels[branch_ancestor_label_gpt]
            else:
                branch_ancestor_gpt = f"{branch_ancestor_label_gpt} (INVALID)"
            branch_ancestors_gpt.append(branch_ancestor_gpt)
        
        branch_ancestors_both = list(set(branch_ancestors_gpt).union(set(branch_ancestors_mondo)))

        for branch_ancestor in branch_ancestors_both:
            if branch_ancestor in branch_ancestors_mondo and branch_ancestor in branch_ancestors_gpt:
                confirmation_status = "in both"
            elif branch_ancestor in branch_ancestors_mondo: 
                confirmation_status = "only mondo"
            elif branch_ancestor in branch_ancestors_gpt:
                confirmation_status = "only gpt"
            else:
                confirmation_status = "INVALID"
            
            branch_ancestor_name = branches_map_ids.get(branch_ancestor, f"{branch_ancestor} INVALID")
            confidence = get_confidence(branch_ancestor_assessments_gpt, branch_ancestor_name)
            explanation_of_grouping = get_explanation_of_grouping(branch_ancestor_assessments_gpt, branch_ancestor_name)
            
            row = {
                "disease_id": disease,
                "disease_name": disease_name,
                "branch_ancestor_id": branch_ancestor,
                "branch_ancestor_name": branch_ancestor_name,
                "confirmation_status": confirmation_status,
                "confidence": confidence,
                "explanation of grouping": explanation_of_grouping
            }
            print(f"{ct}: {row}")
            data.append(row)
        
        if ct <= 0:
            break

    return pd.DataFrame(data)


[{'grouping': 'urinary system disorder', 'confidence': 1, 'explanation of grouping': 'Tuberculous epididymitis is an infection of the epididymis, which is part of the male reproductive/urinary system. Consequently, it falls into the category of urinary system disorder.'}, {'grouping': 'infectious disease', 'confidence': 1, 'explanation of grouping': 'The disease is caused by the bacteria Mycobacterium tuberculosis, making it an infectious disease.'}, {'grouping': 'reproductive system disorder', 'confidence': 0.75, 'explanation of grouping': "Whilst the epididymis is part of the reproductive system, tuberculous epididymitis primarily affects urine transportation - it's classified here due to its location, but this grouping is not as pertinent as urinary system disorder."}]
1: {'disease_id': 'MONDO:0001537', 'disease_name': 'tuberculous epididymitis', 'branch_ancestor_id': 'MONDO:0021166', 'branch_ancestor_name': 'inflammatory disease', 'confirmation_status': 'only mondo', 'confidence': 

Unnamed: 0,disease_id,disease_name,branch_ancestor_id,branch_ancestor_name,confirmation_status,confidence,explanation of grouping
0,MONDO:0001537,tuberculous epididymitis,MONDO:0021166,inflammatory disease,only mondo,INVALID,INVALID
1,MONDO:0001537,tuberculous epididymitis,MONDO:0005550,infectious disease,in both,1,The disease is caused by the bacteria Mycobact...
2,MONDO:0001537,tuberculous epididymitis,MONDO:0002118,urinary system disorder,only gpt,1,Tuberculous epididymitis is an infection of th...
3,MONDO:0001537,tuberculous epididymitis,MONDO:0005039,reproductive system disorder,in both,0.75,Whilst the epididymis is part of the reproduct...


In [21]:
df_bm = review_branch_membership("MONDO:0018076", prompt_review_branches, mondo, branches_map_labels, ct=50)
df_bm

[{'grouping': 'infectious disease', 'confidence': 1.0, 'explanation of grouping': 'Tuberculous epididymitis is caused by the bacteria Mycobacterium tuberculosis, which makes it an infectious disease.'}, {'grouping': 'reproductive system disorder', 'confidence': 1.0, 'explanation of grouping': 'Epididymitis refers to inflammation of the epididymis, a part of the male reproductive system, thereby categorizing it as a reproductive system disorder.'}, {'grouping': 'immune system disorder', 'confidence': 0.7, 'explanation of grouping': 'Tuberculous epididymitis could be classified as an immune system disorder since the tuberculosis infection triggers an immune response in the body. However, this classification is not as direct as the previous mentioned groups.'}]
49: {'disease_id': 'MONDO:0001537', 'disease_name': 'tuberculous epididymitis', 'branch_ancestor_id': 'MONDO:0005046', 'branch_ancestor_name': 'immune system disorder', 'confirmation_status': 'only gpt', 'confidence': 0.7, 'explana

In [None]:
df.to_csv("../../ontology/reports/mondo_branch_review.tsv", index=False, sep="\t")