In [2]:
import os
import pandas as pd
import time  # for adding a delay between retries
from tqdm import tqdm
from openai import OpenAI

# read key file ~/openai.key
with open(os.path.expanduser('~/openai.key'), 'r') as f:
    os.environ['OPENAI_API_KEY'] = f.read().strip()

# Initialize the client
client = OpenAI(api_key=os.environ['OPENAI_API_KEY'])

# Directory containing the prompt files
model = "gpt-4o"
prompt_dir = "../supplemental_data/prompts"
output_file = '../supplemental_data/gpt_4o_response/gpt_4o_response.csv'
output_grounded_file = '../supplemental_data/gpt_4o_response/gpt_4o_response_grounded.csv'

# run GPT-4o and put in ../supplemental_data/gpt_4o_response/gpt_4o_response.csv
# should look like this:
# problem	service_answer	metadata
# "The patient was []"  response	PMID_32730804_Individual_3_en-prompt.txt

In [11]:
# Number of retries
max_retries = 3
retry_delay = 5  # seconds between retries

# Make output directory if it doesn't exist
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Function to call OpenAI's GPT model and get the response with retries
def get_gpt_response_with_retries(prompt, model="gpt-4", retries=max_retries):
    for attempt in range(retries):
        try:
            # Update the method according to the new API structure
            response = client.chat.completions.create(
                model=model,
                messages=[{"role": "system", "content": "You are a helpful assistant."},
                          {"role": "user", "content": prompt}]
            )
            # Response is now a Pydantic model, use .choices[0].message.content
            return response.choices[0].message.content
        except Exception as e:
            print(f"Error on attempt {attempt + 1}/{retries}: {e}")
            if attempt < retries - 1:
                time.sleep(retry_delay)  # wait before retrying
            else:
                return f"Error after {retries} retries: {e}"

# see if output file already exists, if so, die
if os.path.exists(output_file):
    raise FileExistsError(f"Output file {output_file} already exists, please move or delete it before running this script.")
else:
    # Create a blank DataFrame with the specified columns
    gpt_4o_responses = pd.DataFrame(columns=["problem", "service_answer", "metadata"])
    
    # List to store new rows
    new_responses = []
    
    # Filter for only .txt files in the directory
    prompt_files = [f for f in os.listdir(prompt_dir) if f.endswith('.txt')]
    
    # tqdm progress bar
    for prompt_file in tqdm(prompt_files):
        prompt_file_path = os.path.join(prompt_dir, prompt_file)
    
        # Read the prompt from the file
        with open(prompt_file_path, 'r') as f:
            prompt = f.read()
    
        # Get the GPT-4 response with retries
        response = get_gpt_response_with_retries(prompt, model=model)
    
        # Extract metadata from the filename (without the path)
        metadata = prompt_file
    
        # Append the new row in the correct format
        new_responses.append({
            "problem": prompt,
            "service_answer": response,
            "metadata": metadata
        })
    
    # Convert to DataFrame
    new_responses_df = pd.DataFrame(new_responses)
    
    # Append new data to df
    gpt_4o_responses = pd.concat([gpt_4o_responses, new_responses_df], ignore_index=True)
    
    # Save to the output CSV file
    gpt_4o_responses.to_csv(output_file, index=False)

    print(f"Responses saved to {output_file}")


100%|██████████| 5267/5267 [1:27:07<00:00,  1.01it/s]  

Responses saved to ../supplemental_data/gpt_4o_response/gpt_4o_response.csv





In [3]:
import pandas as pd

# we start with o1 responses here:
gpt_4o_responses = pd.read_csv(output_file, index_col=0)
gpt_4o_responses

Unnamed: 0_level_0,service_answer,metadata
problem,Unnamed: 1_level_1,Unnamed: 2_level_1
"I am running an experiment on a clinical case report to see how your diagnoses compare with those of human experts.\nI am going to give you part of a medical case. In this case, you are “Dr. GPT-4”, an AI language model who is providing\na diagnosis. Here are some guidelines. First, there is a single definitive diagnosis, and it is a diagnosis that is known\ntoday to exist in humans. The diagnosis is almost always confirmed by some sort of genetic test, though in rare cases\nwhen such a test does not exist for a diagnosis the diagnosis can instead be made using validated clinical criteria or\nvery rarely just confirmed by expert opinion. After you read the case, I want you to give a differential diagnosis with\na list of candidate diagnoses ranked by probability starting with the most likely candidate. Each candidate should be\nspecified with disease name. For instance, if the first candidate is Branchiooculofacial syndrome and the second is\nCystic fibrosis, provide this:\n\n1. Branchiooculofacial syndrome\n2. Cystic fibrosis\n\nThis list should provide as many diagnoses as you think are reasonable. You do not need to explain your reasoning,\njust list the diagnoses. Here is the case:\n\nThe proband was an individual. Disease onset was not specified.\nThe individual was found not to have the following features: Global developmental delay.\n",1. Cystic fibrosis\n2. Marfan syndrome\n3. Ehl...,PMID_34722527_individual_103_7_Hui_Wang_Compre...
"I am running an experiment on a clinical case report to see how your diagnoses compare with those of human experts.\nI am going to give you part of a medical case. In this case, you are “Dr. GPT-4”, an AI language model who is providing\na diagnosis. Here are some guidelines. First, there is a single definitive diagnosis, and it is a diagnosis that is known\ntoday to exist in humans. The diagnosis is almost always confirmed by some sort of genetic test, though in rare cases\nwhen such a test does not exist for a diagnosis the diagnosis can instead be made using validated clinical criteria or\nvery rarely just confirmed by expert opinion. After you read the case, I want you to give a differential diagnosis with\na list of candidate diagnoses ranked by probability starting with the most likely candidate. Each candidate should be\nspecified with disease name. For instance, if the first candidate is Branchiooculofacial syndrome and the second is\nCystic fibrosis, provide this:\n\n1. Branchiooculofacial syndrome\n2. Cystic fibrosis\n\nThis list should provide as many diagnoses as you think are reasonable. You do not need to explain your reasoning,\njust list the diagnoses. Here is the case:\n\nThe proband was a 16-year-old adolescent boy. Disease onset occurred when the proband was a newborn.\nHe presented with Global developmental delay, Delayed speech and language development, Ventricular septal defect, Bicuspid aortic valve, Hypoplastic aortic arch, Unilateral renal agenesis, Tracheoesophageal fistula, Anal atresia, and Tethered cord. However, the following features were excluded: Delayed ability to walk, Seizure, Hypotonia, Hypertonia, Aggressive behavior, Self-injurious behavior, Multicystic kidney dysplasia, Hydronephrosis, Kyphosis, Scoliosis, Hallux valgus, Pectus excavatum, Epicanthus, Obesity, and Meningocele.\nAs an infant, he presented with Autistic behavior. \n",1. VACTERL association\n2. CHARGE syndrome\n3....,PMID_32730804_Individual_3_en-prompt.txt
"I am running an experiment on a clinical case report to see how your diagnoses compare with those of human experts.\nI am going to give you part of a medical case. In this case, you are “Dr. GPT-4”, an AI language model who is providing\na diagnosis. Here are some guidelines. First, there is a single definitive diagnosis, and it is a diagnosis that is known\ntoday to exist in humans. The diagnosis is almost always confirmed by some sort of genetic test, though in rare cases\nwhen such a test does not exist for a diagnosis the diagnosis can instead be made using validated clinical criteria or\nvery rarely just confirmed by expert opinion. After you read the case, I want you to give a differential diagnosis with\na list of candidate diagnoses ranked by probability starting with the most likely candidate. Each candidate should be\nspecified with disease name. For instance, if the first candidate is Branchiooculofacial syndrome and the second is\nCystic fibrosis, provide this:\n\n1. Branchiooculofacial syndrome\n2. Cystic fibrosis\n\nThis list should provide as many diagnoses as you think are reasonable. You do not need to explain your reasoning,\njust list the diagnoses. Here is the case:\n\nThe proband was a 14-year-old adolescent boy. Disease onset was not specified.\nHe presented with Atopic dermatitis, Recurrent lower respiratory tract infections, Recurrent upper respiratory tract infections, Seafood allergy, Egg allergy, Nut food product allergy, Rhinitis, Cutaneous abscess, Recurrent bacterial skin infections, Recurrent otitis media, Sinusitis, Verrucae, Osteomyelitis, Onychomycosis, Bronchiectasis, Scoliosis, and High palate. However, the following features were excluded: Drug allergy and Asthma.\n",1. Hyper IgE Syndrome (Job's Syndrome)\n2. Com...,PMID_19776401_Patient_6_1_en-prompt.txt
"I am running an experiment on a clinical case report to see how your diagnoses compare with those of human experts.\nI am going to give you part of a medical case. In this case, you are “Dr. GPT-4”, an AI language model who is providing\na diagnosis. Here are some guidelines. First, there is a single definitive diagnosis, and it is a diagnosis that is known\ntoday to exist in humans. The diagnosis is almost always confirmed by some sort of genetic test, though in rare cases\nwhen such a test does not exist for a diagnosis the diagnosis can instead be made using validated clinical criteria or\nvery rarely just confirmed by expert opinion. After you read the case, I want you to give a differential diagnosis with\na list of candidate diagnoses ranked by probability starting with the most likely candidate. Each candidate should be\nspecified with disease name. For instance, if the first candidate is Branchiooculofacial syndrome and the second is\nCystic fibrosis, provide this:\n\n1. Branchiooculofacial syndrome\n2. Cystic fibrosis\n\nThis list should provide as many diagnoses as you think are reasonable. You do not need to explain your reasoning,\njust list the diagnoses. Here is the case:\n\nThe proband was a 12-year-old adolescent boy. Disease onset occurred when the proband was a newborn.\nHe presented with Feeding difficulties, Dental malocclusion, and Dental crowding. However, the following features were excluded: Choanal stenosis.\nAt an age of , he presented with Hypertelorism. \nAt an age of 2 years 6 months, he presented with Hearing impairment. \nAt an age of 1 year, he presented with Cranial hyperostosis. \nAs a neonate, he presented with Facial palsy and Alveolar ridge overgrowth. \nAt an age of 6 years, he presented with Increased circulating osteocalcin level, Elevated circulating alkaline phosphatase concentration, and Macrocephaly. \n",1. Craniometaphyseal dysplasia\n2. Crouzon syn...,PMID_20358596_Patient_A_en-prompt.txt
"I am running an experiment on a clinical case report to see how your diagnoses compare with those of human experts.\nI am going to give you part of a medical case. In this case, you are “Dr. GPT-4”, an AI language model who is providing\na diagnosis. Here are some guidelines. First, there is a single definitive diagnosis, and it is a diagnosis that is known\ntoday to exist in humans. The diagnosis is almost always confirmed by some sort of genetic test, though in rare cases\nwhen such a test does not exist for a diagnosis the diagnosis can instead be made using validated clinical criteria or\nvery rarely just confirmed by expert opinion. After you read the case, I want you to give a differential diagnosis with\na list of candidate diagnoses ranked by probability starting with the most likely candidate. Each candidate should be\nspecified with disease name. For instance, if the first candidate is Branchiooculofacial syndrome and the second is\nCystic fibrosis, provide this:\n\n1. Branchiooculofacial syndrome\n2. Cystic fibrosis\n\nThis list should provide as many diagnoses as you think are reasonable. You do not need to explain your reasoning,\njust list the diagnoses. Here is the case:\n\nThe proband was a male. Disease onset was not specified.\nHe presented with Delayed speech and language development, Motor delay, Intellectual disability, Autistic behavior, Hypospadias, Ambiguous genitalia, and Short stature.\n",1. Smith-Lemli-Opitz syndrome\n2. Fragile X sy...,PMID_36586412_8_en-prompt.txt
...,...,...
"I am running an experiment on a clinical case report to see how your diagnoses compare with those of human experts.\nI am going to give you part of a medical case. In this case, you are “Dr. GPT-4”, an AI language model who is providing\na diagnosis. Here are some guidelines. First, there is a single definitive diagnosis, and it is a diagnosis that is known\ntoday to exist in humans. The diagnosis is almost always confirmed by some sort of genetic test, though in rare cases\nwhen such a test does not exist for a diagnosis the diagnosis can instead be made using validated clinical criteria or\nvery rarely just confirmed by expert opinion. After you read the case, I want you to give a differential diagnosis with\na list of candidate diagnoses ranked by probability starting with the most likely candidate. Each candidate should be\nspecified with disease name. For instance, if the first candidate is Branchiooculofacial syndrome and the second is\nCystic fibrosis, provide this:\n\n1. Branchiooculofacial syndrome\n2. Cystic fibrosis\n\nThis list should provide as many diagnoses as you think are reasonable. You do not need to explain your reasoning,\njust list the diagnoses. Here is the case:\n\nThe proband was a 22-year-old woman. Disease onset was not specified.\nShe presented with Nystagmus, Hydronephrosis, and Hearing impairment. However, the following features were excluded: Diabetes insipidus, Seizure, Intellectual disability, Tremor, and Ataxia.\nAt an age of , she presented with Diabetes mellitus and Optic atrophy. \n",1. Wolfram Syndrome\n2. Bardet-Biedl Syndrome\...,PMID_9817917_Family_4_individual_13070_en-prom...
"I am running an experiment on a clinical case report to see how your diagnoses compare with those of human experts.\nI am going to give you part of a medical case. In this case, you are “Dr. GPT-4”, an AI language model who is providing\na diagnosis. Here are some guidelines. First, there is a single definitive diagnosis, and it is a diagnosis that is known\ntoday to exist in humans. The diagnosis is almost always confirmed by some sort of genetic test, though in rare cases\nwhen such a test does not exist for a diagnosis the diagnosis can instead be made using validated clinical criteria or\nvery rarely just confirmed by expert opinion. After you read the case, I want you to give a differential diagnosis with\na list of candidate diagnoses ranked by probability starting with the most likely candidate. Each candidate should be\nspecified with disease name. For instance, if the first candidate is Branchiooculofacial syndrome and the second is\nCystic fibrosis, provide this:\n\n1. Branchiooculofacial syndrome\n2. Cystic fibrosis\n\nThis list should provide as many diagnoses as you think are reasonable. You do not need to explain your reasoning,\njust list the diagnoses. Here is the case:\n\nThe proband was a 25-year-old woman. Disease onset occurred when the proband was 6-year, 0-month old.\nShe presented with Decreased beta-galactosidase activity, Cherry red spot of the macula, Platyspondyly, Intellectual disability, mild, and Abnormality of extrapyramidal motor function. However, the following features were excluded: Splenomegaly and Hepatomegaly.\n",1. GM1 gangliosidosis\n2. Niemann-Pick disease...,PMID_1907800_TS_en-prompt.txt
"I am running an experiment on a clinical case report to see how your diagnoses compare with those of human experts.\nI am going to give you part of a medical case. In this case, you are “Dr. GPT-4”, an AI language model who is providing\na diagnosis. Here are some guidelines. First, there is a single definitive diagnosis, and it is a diagnosis that is known\ntoday to exist in humans. The diagnosis is almost always confirmed by some sort of genetic test, though in rare cases\nwhen such a test does not exist for a diagnosis the diagnosis can instead be made using validated clinical criteria or\nvery rarely just confirmed by expert opinion. After you read the case, I want you to give a differential diagnosis with\na list of candidate diagnoses ranked by probability starting with the most likely candidate. Each candidate should be\nspecified with disease name. For instance, if the first candidate is Branchiooculofacial syndrome and the second is\nCystic fibrosis, provide this:\n\n1. Branchiooculofacial syndrome\n2. Cystic fibrosis\n\nThis list should provide as many diagnoses as you think are reasonable. You do not need to explain your reasoning,\njust list the diagnoses. Here is the case:\n\nThe proband was a 11-year-old adolescent girl. Disease onset was not specified.\nShe presented with Myalgia, Lower limb muscle weakness, Postural instability, Vomiting, Abdominal distention, Muscle spasm, Failure to thrive, Cholelithiasis, Lower limb asymmetry, Upper limb asymmetry, Hepatomegaly, Cognitive impairment, Decreased Achilles reflex, Impaired vibratory sensation, Positive Romberg sign, Incoordination, Elevated circulating aspartate aminotransferase concentration, Increased circulating lactate concentration, Increased serum pyruvate, EMG: myopathic abnormalities, and Hyperintensity of cerebral white matter on MRI.\n",1. Mitochondrial disease\n2. Leigh syndrome\n3...,PMID_28673863_28673863_P1_en-prompt.txt
"I am running an experiment on a clinical case report to see how your diagnoses compare with those of human experts.\nI am going to give you part of a medical case. In this case, you are “Dr. GPT-4”, an AI language model who is providing\na diagnosis. Here are some guidelines. First, there is a single definitive diagnosis, and it is a diagnosis that is known\ntoday to exist in humans. The diagnosis is almost always confirmed by some sort of genetic test, though in rare cases\nwhen such a test does not exist for a diagnosis the diagnosis can instead be made using validated clinical criteria or\nvery rarely just confirmed by expert opinion. After you read the case, I want you to give a differential diagnosis with\na list of candidate diagnoses ranked by probability starting with the most likely candidate. Each candidate should be\nspecified with disease name. For instance, if the first candidate is Branchiooculofacial syndrome and the second is\nCystic fibrosis, provide this:\n\n1. Branchiooculofacial syndrome\n2. Cystic fibrosis\n\nThis list should provide as many diagnoses as you think are reasonable. You do not need to explain your reasoning,\njust list the diagnoses. Here is the case:\n\nThe proband was a 6-year, 0-month old boy. Disease onset occurred when the proband was an infant.\nHe presented with Feeding difficulties in infancy, Abnormality of the dentition, Intellectual disability, Seizure, Global developmental delay, Growth delay, Delayed ability to walk, and Reduced bone mineral density. However, the following features were excluded: Cleft palate and Brain imaging abnormality.\n",1. Angelman syndrome\n2. Rett syndrome\n3. Phe...,PMID_31021519_individual_SATB2_112_en-prompt.txt


### code to ground the diagnosis text to MONDO ontology

In [7]:
import re
from oaklib.interfaces.text_annotator_interface import TextAnnotationConfiguration
from oaklib.interfaces.text_annotator_interface import TextAnnotatorInterface
from curate_gpt.store import get_store
from typing import List, Tuple
import logging 

# Get the OAK annotator for MONDO
from oaklib import get_adapter
# Set up OAK SQLite implementation for MONDO
annotator = get_adapter("sqlite:obo:mondo")

# Compile a regex pattern to detect lines starting with "Differential Diagnosis:"
dd_re = re.compile(r"^[^A-z]*Differential Diagnosis")

# Function to clean and remove "Differential Diagnosis" header if present
def clean_service_answer(answer: str) -> str:
    """Remove the 'Differential Diagnosis' header if present, and clean the first line."""
    lines = answer.split('\n')
    # Filter out any line that starts with "Differential Diagnosis:"
    cleaned_lines = [line for line in lines if not dd_re.match(line)]
    return '\n'.join(cleaned_lines)

# Clean the diagnosis line by removing leading numbers, periods, asterisks, and spaces
def clean_diagnosis_line(line: str) -> str:
    """Remove leading numbers, asterisks, and unnecessary punctuation/spaces from the diagnosis."""
    line = re.sub(r'^\**\d+\.\s*', '', line)  # Remove leading numbers and periods
    line = line.strip('*')  # Remove asterisks around the text
    return line.strip()  # Strip any remaining spaces

def perform_curategpt_grounding(
    diagnosis: str,
    path: str,
    collection: str,
    database_type: str = "chromadb",
    limit: int = 1,
    relevance_factor: float = 0.23,
    verbose: bool = False
) -> List[Tuple[str, str]]:
    """
    Use curategpt to perform grounding for a given diagnosis when initial attempts fail.
    
    Parameters:
    - diagnosis: The diagnosis text to ground.
    - path: The path to the database. You'll need to create an index of Mondo using curategpt in this db
    - collection: The collection to search within curategpt. Name of mondo collection in the db
    NB: You can make this collection by running curategpt thusly:
    `curategpt ontology index --index-fields label,definition,relationships -p stagedb -c ont_mondo -m openai: sqlite:obo:mondo`
    - database_type: The type of database used for grounding (e.g., chromadb, duckdb).
    - limit: The number of search results to return.
    - relevance_factor: The distance threshold for relevance filtering.
    - verbose: Whether to print verbose output for debugging.
    
    Returns:
    - List of tuples: [(Mondo ID, Label), ...]
    """
    # Initialize the database store
    db = get_store(database_type, path)
    
    # Perform the search using the provided diagnosis
    results = db.search(diagnosis, collection=collection)

    # Filter results based on relevance factor (distance)
    if relevance_factor is not None:
        results = [(obj, distance, _meta) for obj, distance, _meta in results if distance <= relevance_factor]

    # Limit the results to the specified number (limit)
    limited_results = results[:limit]

    # Extract Mondo IDs and labels
    pred_ids = []
    pred_labels = []

    for obj, distance, _meta in limited_results:
        disease_mondo_id = obj.get("original_id")  # Use the 'original_id' field for Mondo ID
        disease_label = obj.get("label")
        
        if disease_mondo_id and disease_label:
            pred_ids.append(disease_mondo_id)
            pred_labels.append(disease_label)

    # Return as a list of tuples (Mondo ID, Label)
    if len(pred_ids) == 0:
        if verbose:
            print(f"No grounded IDs found for {diagnosis}")
        return [('N/A', 'No grounding found')]

    return list(zip(pred_ids, pred_labels))


# Perform grounding on the text to MONDO ontology and return the result
def perform_oak_grounding(
    annotator: TextAnnotatorInterface,
    diagnosis: str,
    exact_match: bool = True,
    verbose: bool = False,
    include_list: List[str] = ["MONDO:"],
) -> List[Tuple[str, str]]:
    """
    Perform grounding for a diagnosis. The 'exact_match' flag controls whether exact or inexact
    (partial) matching is used. Filter results to include only CURIEs that match the 'include_list',
    and exclude results that match the 'exclude_list'.
    Remove redundant groundings from the result.
    """
    config = TextAnnotationConfiguration(matches_whole_text=exact_match)
    annotations = list(annotator.annotate_text(diagnosis, configuration=config))

    # Filter and remove duplicates, while excluding unwanted general terms
    filtered_annotations = list(
        {
            (ann.object_id, ann.object_label)
            for ann in annotations
            if any(ann.object_id.startswith(prefix) for prefix in include_list)
        }
    )
    
    if filtered_annotations:
        return filtered_annotations
    else:
        match_type = "exact" if exact_match else "inexact"
        if verbose:
            logging.warning(f"No {match_type} grounded IDs found for: {diagnosis}")
        return [('N/A', 'No grounding found')]

# Now, integrate curategpt into your ground_diagnosis_text_to_mondo function
def ground_diagnosis_text_to_mondo(
    annotator: TextAnnotatorInterface,
    differential_diagnosis: str,
    verbose: bool = False,
    include_list: List[str] = ["MONDO:"],
    use_ontogpt_grounding: bool = True,
    curategpt_path: str = "../../curate-gpt/stagedb/",
    curategpt_collection: str = "ont_mondo",
    curategpt_database_type: str = "chromadb"
) -> List[Tuple[str, List[Tuple[str, str]]]]:
    results = []
    
    # Split the input into lines and process each one
    for line in differential_diagnosis.splitlines():
        clean_line = clean_diagnosis_line(line)
        
        # Skip header lines like "**Differential diagnosis:**"
        if not clean_line or "Differential diagnosis" in clean_line.lower():
            continue
        
        # Try grounding the full line first (exact match)
        grounded = perform_oak_grounding(annotator, clean_line, exact_match=True, verbose=verbose, include_list=include_list)
        
        # Try grounding with curategpt if no grounding is found
        if use_ontogpt_grounding and grounded == [('N/A', 'No grounding found')]:
            grounded = perform_curategpt_grounding(
                diagnosis=clean_line,
                path=curategpt_path,
                collection=curategpt_collection,
                database_type=curategpt_database_type,
                verbose=verbose
            )
        
        # If still no grounding is found, log the final failure
        if grounded == [('N/A', 'No grounding found')]:
            if verbose:
                print(f"Final grounding failed for: {clean_line}")
        
        # Append the grounded results (even if no grounding was found)
        results.append((clean_line, grounded))

    return results

##
## RUN SOME TESTS
##

In [8]:
from warnings import warn

# helper function to check if a grounding is correct
def assert_correct_result(result, expected_result):
    # Assert that the grounded result matches the expected output
    assert len(result) == len(expected_result), f"Length of grounding result:\n{result}\n does not match length of expected result:\n{expected_result}\n"
    
    for res_item, expected_item in zip(result, expected_result):
        # First, assert that the diagnosis name matches exactly
        if res_item[0] != expected_item[0]:
            warn(f"Diagnosis mismatch: {res_item[0]} != {expected_item[0]}")
        
        # Then, assert that the grounding list matches, ignoring order
        assert set(res_item[1]) == set(expected_item[1]), f"Grounding mismatch for {res_item[0]}"
    

# Example grounding with OAK annotation - testing on a sample input
differential_diagnosis_text = """
**Differential Diagnosis:**
1. Branchiooculofacial syndrome
2. Unicorn syndrome
3. Cystic fibrosis
4. 22q11.2 deletion syndrome (Velocardiofacial syndrome)
**5. ATP6V0A4-related distal renal tubular acidosis**
"""

# Cleaning and grounding the sample differential diagnosis text
cleaned_text = clean_service_answer(differential_diagnosis_text)
# Assert that the cleaning process returns non-empty text
assert cleaned_text != "", "Cleaning failed: the cleaned text is empty."

# Define the expected result for the sample input
expected_result = [
    ('Branchiooculofacial syndrome', [('MONDO:0007235', 'branchiooculofacial syndrome')]), 
    ('Unicorn syndrome', [('N/A', 'No grounding found')]), 
    ('Cystic fibrosis', [('MONDO:0009061', 'cystic fibrosis')]), 
    ('22q11.2 deletion syndrome (Velocardiofacial syndrome)', [('MONDO:0008644', 'velocardiofacial syndrome')]), 
    ('ATP6V0A4-related distal renal tubular acidosis', [('MONDO:0012700', 'renal tubular acidosis, distal, 4, with hemolytic anemia')])
]

# Ground the text to MONDO
result = ground_diagnosis_text_to_mondo(annotator, cleaned_text, verbose=False)
print("Grounding Result:")
print(result)

assert_correct_result(result, expected_result)

Grounding Result:
[('Branchiooculofacial syndrome', [('MONDO:0007235', 'branchiooculofacial syndrome')]), ('Unicorn syndrome', [('N/A', 'No grounding found')]), ('Cystic fibrosis', [('MONDO:0009061', 'cystic fibrosis')]), ('22q11.2 deletion syndrome (Velocardiofacial syndrome)', [('MONDO:0008644', 'velocardiofacial syndrome')]), ('ATP6V0A4-related distal renal tubular acidosis', [('MONDO:0012700', 'renal tubular acidosis, distal, 4, with hemolytic anemia')])]


In [None]:
# Apply the cleaning and grounding functions directly to the 'service_answer' column. Might take 45m or so to run.
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

gpt_4o_responses['grounded_diagnosis'] = gpt_4o_responses['service_answer'].progress_apply(
    lambda x: ground_diagnosis_text_to_mondo(annotator, clean_service_answer(x), verbose=False)
)

gpt_4o_responses.to_csv(output_grounded_file, index=False)

# Display a sample of the updated DataFrame
gpt_4o_responses

 74%|███████▎  | 3879/5267 [1:37:11<22:52,  1.01it/s]  

In [None]:
# Load the DataFrame with the grounded diagnosis text (not using o1_responses from above to avoid re-running the previous cell)
gpt_4o_responses = pd.read_csv(output_grounded_file, index_col=0)
gpt_4o_responses