### Test grounding with OntoGPT
### It doesn't seem to work on the cases where inexact matching doesn't work

In [3]:
# Example of a list of diseases to ground. These are ones where exact matching didn't work
stuff_to_ground = \
"""
LRBA deficiency
Brachydactyly type A
SCN2A-related epileptic encephalopathy
Isolated ectopia lentis (ADAMTSL4-related)
Foveal hypoplasia with nystagmus (SLC38A8-related)
Alpers syndrome (POLG)
Desmin-related cardiomyopathy
STXBP1-related early infantile epileptic encephalopathy
KCNQ2-related early onset epileptic encephalopathy
Mitochondrial DNA depletion syndrome (hepatocerebral form)
ARX-related infantile epileptic spasm syndrome
Juvenile Tay-Sachs disease
CDKL5 deficiency syndrome
SCN2A-related early infantile epileptic encephalopathy
CDKL5 deficiency syndrome
"""

In [4]:
from typing import Tuple, List
from ontogpt.engines.spires_engine import SPIRESEngine
from ontogpt.io.template_loader import get_template_details
import time
from warnings import warn
import logging

# Retry decorator
def retry_on_failure(max_retries=2, delay=1):
    def decorator(func):
        def wrapper(*args, **kwargs):
            retries = 0
            while retries <= max_retries:
                try:
                    return func(*args, **kwargs)
                except (TimeoutError, Exception) as e:  # Handle appropriate exceptions
                    retries += 1
                    if retries > max_retries:
                        logging.error(f"Max retries reached: {e}")
                        raise
                    logging.warning(f"Error occurred: {e}, retrying {retries}/{max_retries}...")
                    time.sleep(delay)
        return wrapper
    return decorator

@retry_on_failure(max_retries=2, delay=2)
def ground_to_mondo_with_ontogpt(
    ke: SPIRESEngine,
    diagnosis_str: str,
    no_grounding: List = [('N/A', 'No grounding found')],
    verbose: bool = False
) -> List[Tuple[str, str]]:
    """Run the multilingual analysis with retry on failures."""

    try:
        # Call the extract function here to ground the answer to OMIM (using MONDO, etc)
        extraction = ke.extract_from_text(text=diagnosis_str)
    except TimeoutError as e:
        logging.error(f"Timeout occurred: {e}")
        raise

    predictions = extraction.named_entities
    pred_ids = []
    pred_names = []
    
    for pred in predictions:
        pred_ids.append(pred.id)
        pred_names.append(pred.label)

    if len(pred_ids) == 0:
        warn(f"No grounded IDs found") if verbose else None
        return None
    # return as list of tuples of Mondos and their labels, e.g. 
    # [('MONDO:0008642', 'VACTERL/vater association')]), ('Feingold syndrome', [('MONDO:0008115', 'Feingold syndrome type 1'), ('MONDO:0015267', 'Feingold syndrome')]
    if predictions:
        return list(zip(pred_ids, pred_names))
    else:
        return no_grounding

# make a SPIRES object
template = "all_disease_grounding"
template_details = get_template_details(template=template)

model = "gpt-4-turbo"
temperature = 1.0
    
ke = SPIRESEngine(
    template_details=template_details,
    model=model,
    temperature=temperature
)

# Example grounding with retry in case of timeouts or other errors
try:
    result = ground_to_mondo_with_ontogpt(ke, "1. Branchiooculofacial syndrome\n2. Unicorn syndrome\n3. Cystic fibrosis")
    print(result)
except Exception as e:
    print(f"Failed after retries: {e}")

[('MONDO:0007235', 'Branchiooculofacial syndrome'), ('MONDO:0009061', 'Cystic fibrosis')]


In [5]:
for line in stuff_to_ground.split('\n'):
    print(line.strip())    
    print(ground_to_mondo_with_ontogpt(ke, line.strip())) 

ERROR:root:Line 'You haven't provided any text to extract entities from. Please provide the text and I will help identify the entities as you have specified.' does not contain a colon; ignoring



None
LRBA deficiency
None
Brachydactyly type A




None
SCN2A-related epileptic encephalopathy




None
Isolated ectopia lentis (ADAMTSL4-related)




[('MONDO:0015998', 'ectopia lentis')]
Foveal hypoplasia with nystagmus (SLC38A8-related)




[('MONDO:0044203', 'Foveal hypoplasia'), ('MONDO:0005712', 'nystagmus')]
Alpers syndrome (POLG)
[('MONDO:0008758', 'Alpers syndrome')]
Desmin-related cardiomyopathy




None
STXBP1-related early infantile epileptic encephalopathy
[('MONDO:0100062', 'early infantile epileptic encephalopathy')]
KCNQ2-related early onset epileptic encephalopathy




None
Mitochondrial DNA depletion syndrome (hepatocerebral form)
[('MONDO:0018158', 'Mitochondrial DNA depletion syndrome')]
ARX-related infantile epileptic spasm syndrome
None
Juvenile Tay-Sachs disease
[('MONDO:0010100', 'Tay-Sachs disease')]
CDKL5 deficiency syndrome
None
SCN2A-related early infantile epileptic encephalopathy
None
CDKL5 deficiency syndrome


ERROR:root:Line 'You haven't provided any text to extract entities from. Please provide the text and I will help identify the entities as you have specified.' does not contain a colon; ignoring


None

None


In [9]:
# install curategpt from here https://github.com/monarch-initiative/curate-gpt.git
!pip3 install git+https://github.com/monarch-initiative/curate-gpt.git

Collecting git+https://github.com/monarch-initiative/curate-gpt.git
  Cloning https://github.com/monarch-initiative/curate-gpt.git to /private/var/folders/vc/lfqgrrhn56d9yj5fbxbw6qr00000gt/T/pip-req-build-jh9j346z
  Running command git clone --filter=blob:none --quiet https://github.com/monarch-initiative/curate-gpt.git /private/var/folders/vc/lfqgrrhn56d9yj5fbxbw6qr00000gt/T/pip-req-build-jh9j346z
  Resolved https://github.com/monarch-initiative/curate-gpt.git to commit 3bb5f37f65386f45780582f3fe3305f045e08b6a
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting chromadb<0.6.0,>=0.5.0 (from curate-gpt==0.2.0)
  Downloading chromadb-0.5.9-py3-none-any.whl.metadata (6.8 kB)
Collecting duckdb<1.1.0,>=1.0.0 (from curate-gpt==0.2.0)
  Downloading duckdb-1.0.0-cp311-cp311-macosx_12_0_arm64.whl.metadata (762 bytes)
Collecting google-api-python-client<3.0.0,>=2

In [0]:
import os
from curate_gpt.store import get_store
from typing import List, Tuple

# Define the path to the key file
key_file_path = os.path.expanduser("~/openai.key.another")

# Read the key from the file and set the environment variable
with open(key_file_path, "r") as key_file:
    openai_api_key = key_file.read().strip()

# Set the OpenAI API key as an environment variable
os.environ["OPENAI_API_KEY"] = openai_api_key

def search(
    query,
    path,
    collection,
    database_type=None,
    limit=50,
    relevance_factor=0.23,
    no_grounding=[('N/A', 'No grounding found')],
    verbose=False,
    **kwargs,
) -> List[Tuple[str, str]]:
    """
    Search a collection using embedding search, returning Mondo disease IDs and labels.
    
    Parameters:
    - query: The search query string.
    - path: The path to the database.
    - collection: The collection to search.
    - show_documents: Whether to show the full documents or just the results.
    - database_type: Type of the database (e.g., chromadb, duckdb).
    - limit: Maximum number of results to return.
    - relevance_factor: Distance threshold to filter the results based on relevance.
    - kwargs: Additional options for the database search.
    
    Returns:
    - List of tuples: [(Mondo ID, Label), ...]
    """
    # Initialize the database store
    db = get_store(database_type, path)
    
    # Perform the search using the provided query
    results = db.search(query, collection=collection, **kwargs)

    # Filter results based on relevance factor (distance)
    if relevance_factor is not None:
        results = [(obj, distance, _meta) for obj, distance, _meta in results if distance <= relevance_factor]

    # Limit the results to the specified number (limit)
    limited_results = results[:limit]

    # Extract Mondo IDs and labels
    pred_ids = []
    pred_labels = []

    for obj, distance, _meta in limited_results:
        disease_mondo_id = obj.get("original_id")  # Use the 'original_id' field for Mondo ID
        disease_label = obj.get("label")
        
        if disease_mondo_id and disease_label:
            pred_ids.append(disease_mondo_id)
            pred_labels.append(disease_label)

    if len(pred_ids) == 0:
        if verbose:
            print("No grounded IDs found")
        return no_grounding

    # Return as a list of tuples (Mondo ID, Label)
    return list(zip(pred_ids, pred_labels))

In [78]:
# Example usage:
path = "../../curate-gpt/stagedb/"
collection = "ont_mondo"
database_type = "chromadb"
show_documents = True

scn2a_query = "SCN2A-related epileptic encephalopathy"

# Expected result
expected_result = [('MONDO:0013388', 'developmental and epileptic encephalopathy, 11')]

# Your search result, which should now return a list of tuples
scn2a_result = search(
    query=scn2a_query,
    path=path,
    collection=collection,
    show_documents=False,
    database_type=database_type,
    limit=1,
    relevance_factor=0.23
)

# Improved assertion with detailed error message
assert scn2a_result == expected_result, (
    f"Grounding did not return the expected result.\n"
    f"Expected: {expected_result}\n"
    f"Got: {scn2a_result}"
)

TypeError: Collection.query() got an unexpected keyword argument 'show_documents'

In [76]:
import pandas as pd

# List of diseases to ground
stuff_to_ground = """
LRBA deficiency
Brachydactyly type A
SCN2A-related epileptic encephalopathy
Isolated ectopia lentis (ADAMTSL4-related)
Foveal hypoplasia with nystagmus (SLC38A8-related)
Alpers syndrome (POLG)
Desmin-related cardiomyopathy
STXBP1-related early infantile epileptic encephalopathy
KCNQ2-related early onset epileptic encephalopathy
Mitochondrial DNA depletion syndrome (hepatocerebral form)
ARX-related infantile epileptic spasm syndrome
Juvenile Tay-Sachs disease
CDKL5 deficiency syndrome
SCN2A-related early infantile epileptic encephalopathy
CDKL5 deficiency syndrome
"""

# Path and collection setup
path = "../../curate-gpt/stagedb/"
collection = "ont_mondo"
database_type = "chromadb"

# Split the diseases into a list
diseases = [disease.strip() for disease in stuff_to_ground.splitlines() if disease.strip()]

# Perform grounding for each disease
results = []
for disease in diseases:
    result = search(
        query=disease,
        path=path,
        collection=collection,
        show_documents=False,
        database_type=database_type,
        limit=1,
        relevance_factor=0.23
    )
    results.append((disease, result[0] if result else ('N/A', 'No grounding found')))

# Create a DataFrame for easy reading
df = pd.DataFrame(results, columns=["Original Disease", "Grounded Result (Mondo ID, Label)"])
df

Unnamed: 0,Original Disease,"Grounded Result (Mondo ID, Label)"
0,LRBA deficiency,"(N/A, No grounding found)"
1,Brachydactyly type A,"(MONDO:0007215, brachydactyly type A1)"
2,SCN2A-related epileptic encephalopathy,"(MONDO:0013388, developmental and epileptic en..."
3,Isolated ectopia lentis (ADAMTSL4-related),"(MONDO:0009152, ectopia lentis 2, isolated, au..."
4,Foveal hypoplasia with nystagmus (SLC38A8-rela...,"(MONDO:0034978, isolated foveal hypoplasia)"
5,Alpers syndrome (POLG),"(N/A, No grounding found)"
6,Desmin-related cardiomyopathy,"(N/A, No grounding found)"
7,STXBP1-related early infantile epileptic encep...,"(MONDO:0012812, developmental and epileptic en..."
8,KCNQ2-related early onset epileptic encephalop...,"(MONDO:0013387, developmental and epileptic en..."
9,Mitochondrial DNA depletion syndrome (hepatoce...,"(MONDO:0014943, mitochondrial DNA depletion sy..."


In [75]:
import random 
from tqdm import tqdm 

# Read the diseases from the file and randomly sample 100 lines
ungrounded_things = "../data/ungrounded_items.txt"

with open(ungrounded_things, "r") as f:
    diseases = [line.strip() for line in f.readlines() if line.strip()]

# Randomly sample 100 diseases
sampled_diseases = random.sample(diseases, 50)

# Perform grounding for each sampled disease
results = []
for disease in tqdm(sampled_diseases, "working.."):
    result = search(
        query=disease,
        path=path,
        collection=collection,
        show_documents=False,
        database_type=database_type,
        limit=1,
        relevance_factor=0.23
    )
    results.append((disease, result[0] if result else ('N/A', 'No grounding found')))

# Create a DataFrame for easy reading
df = pd.DataFrame(results, columns=["Original disease", "CurateGPT grounded result (Mondo ID, Label)"])
df.to_csv("../data/ungrounded_items_grounded_with_curategpt.tsv", sep="\t")
!grep -c -i "No ground" ../data/ungrounded_items_grounded_with_curategpt.tsv
df

working..: 100%|██████████| 50/50 [00:12<00:00,  3.98it/s]

9





Unnamed: 0,Original disease,"CurateGPT grounded result (Mondo ID, Label)"
0,CDKL5 deficiency syndrome,"(MONDO:0100039, CDKL5 disorder)"
1,GRIN2A-related epilepsy-aphasia spectrum,"(MONDO:0017325, early-onset epileptic encephal..."
2,Von Hippel–Lindau syndrome,"(N/A, No grounding found)"
3,SCN2A-related early infantile epileptic enceph...,"(MONDO:0013388, developmental and epileptic en..."
4,STXBP1-related early infantile epileptic encep...,"(MONDO:0012812, developmental and epileptic en..."
5,Mitochondrial inherited diabetes and deafness ...,"(MONDO:0010785, maternally-inherited diabetes ..."
6,TTN-related dilated cardiomyopathy,"(MONDO:0011400, dilated cardiomyopathy 1G)"
7,CHD8-related autism spectrum disorder,"(N/A, No grounding found)"
8,X-linked infantile spasm syndrome (ARX-related),"(N/A, No grounding found)"
9,Autosomal recessive distal renal tubular acido...,"(MONDO:0018440, autosomal recessive distal ren..."
