In [1]:
import pandas as pd

# Load the o1 model responses; the first CSV column is used as the row index.
O1_RESPONSES_CSV = '../supplemental_data/gpt_o1_response/gpt_o1_response.csv'
o1_responses = pd.read_csv(O1_RESPONSES_CSV, index_col=0)
o1_responses

Unnamed: 0,problem,service_answer,metadata
0,I am running an experiment on a clinical case ...,"I'm sorry, but I cannot generate a differentia...",PMID_34722527_individual_103_7_Hui_Wang_Compre...
1,I am running an experiment on a clinical case ...,1. VACTERL association \n2. Feingold syndrome ...,PMID_32730804_Individual_3_en-prompt.txt
2,I am running an experiment on a clinical case ...,1. Autosomal recessive hyper-IgE syndrome (DOC...,PMID_19776401_Patient_6_1_en-prompt.txt
3,I am running an experiment on a clinical case ...,1. Sclerosteosis \n2. Van Buchem disease \n3. ...,PMID_20358596_Patient_A_en-prompt.txt
4,I am running an experiment on a clinical case ...,1. Smith-Lemli-Opitz syndrome \n2. ATR-X syndr...,PMID_36586412_8_en-prompt.txt
...,...,...,...
5262,I am running an experiment on a clinical case ...,1. Wolfram syndrome \n2. Alström syndrome \n3....,PMID_9817917_Family_4_individual_13070_en-prom...
5263,I am running an experiment on a clinical case ...,1. GM1 gangliosidosis \n2. Galactosialidosis \...,PMID_1907800_TS_en-prompt.txt
5264,I am running an experiment on a clinical case ...,1. Mitochondrial neurogastrointestinal encepha...,PMID_28673863_28673863_P1_en-prompt.txt
5265,I am running an experiment on a clinical case ...,1. Down syndrome\n2. Kabuki syndrome \n3. 22q1...,PMID_31021519_individual_SATB2_112_en-prompt.txt


In [0]:
import re
from oaklib.interfaces.text_annotator_interface import TextAnnotationConfiguration
from oaklib.interfaces.text_annotator_interface import TextAnnotatorInterface
import logging
from typing import Tuple, List

# Regex detecting a "Differential Diagnosis:" header line, optionally preceded by
# non-letter decoration such as markdown asterisks, underscores, or whitespace.
# The class is spelled [^A-Za-z] on purpose: the range A-z also covers the ASCII
# punctuation between 'Z' and 'a' ([ \ ] ^ _ `), so [^A-z] would fail to match
# headers decorated with those characters (e.g. "_Differential Diagnosis:_").
dd_re = re.compile(r"^[^A-Za-z]*Differential Diagnosis:")

# Function to remove "Differential Diagnosis" header lines if present
def clean_service_answer(answer: str) -> str:
    """Drop every 'Differential Diagnosis:' header line from a model answer.

    The answer is split on '\\n'; any line that consists of optional non-letter
    characters followed by the literal text 'Differential Diagnosis:' (plus
    anything after it) is removed. All other lines, including blank ones, are
    kept unchanged and in their original order.

    Args:
        answer: Raw text of the model's answer.

    Returns:
        The remaining lines joined back together with '\\n'.
    """
    return '\n'.join(line for line in answer.split('\n') if not dd_re.match(line))

# Strip a leading list number ("1.", "12. ", ...) and surrounding whitespace from one line
def clean_diagnosis_line(line: str) -> str:
    """Return `line` without a leading '<digits>.' marker and without outer whitespace.

    The marker is only removed when it sits at the very start of the line;
    a line that begins with whitespace keeps its number and is merely stripped.
    """
    numbering = re.match(r'^\d+\.\s*', line)
    remainder = line[numbering.end():] if numbering else line
    return remainder.strip()

# Ground each line of a differential diagnosis with the supplied annotator
def ground_diagnosis_text_to_mondo(
    annotator: TextAnnotatorInterface,
    differential_diagnosis: str,
    verbose: bool = False
) -> List[Tuple[str, List[Tuple[str, str]]]]:
    """Annotate every non-empty diagnosis line and collect the matched terms.

    Each line of `differential_diagnosis` is passed through
    `clean_diagnosis_line`; lines that end up empty are skipped. The cleaned
    line is then handed to `annotator.annotate_text` using a configuration
    created with `matches_whole_text=True`.

    Args:
        annotator: Object providing `annotate_text(text, configuration=...)`.
        differential_diagnosis: Multi-line text, one candidate diagnosis per line.
        verbose: When True, log a warning for each line with no annotation.

    Returns:
        One `(cleaned_line, matches)` tuple per non-empty line, in input order.
        `matches` is a list of `(object_id, object_label)` pairs, one per
        annotation returned (duplicates are kept), or the single placeholder
        `[('N/A', 'No grounding found')]` when nothing was returned.
    """
    config = TextAnnotationConfiguration(matches_whole_text=True)
    results = []

    for raw_line in differential_diagnosis.splitlines():
        clean_line = clean_diagnosis_line(raw_line)
        if not clean_line:
            continue

        # Materialise (id, label) pairs so they can be both logged and returned.
        matches = [
            (ann.object_id, ann.object_label)
            for ann in annotator.annotate_text(clean_line, configuration=config)
        ]

        if not matches:
            if verbose:
                logging.warning(f"No grounded IDs found for: {clean_line}")
            results.append((clean_line, [('N/A', 'No grounding found')]))
            continue

        pred_ids, pred_names = zip(*matches)
        logging.info(f"Diagnosis: {clean_line}\tPredicted IDs: {'|'.join(pred_ids)}\tPredicted Names: {'|'.join(pred_names)}")
        results.append((clean_line, matches))

    return results

In [10]:
from oaklib import get_adapter

# Open MONDO through OAK's SQLite adapter; the adapter is used as the text annotator below.
annotator = get_adapter("sqlite:obo:mondo")

# Sample input: a markdown header line followed by three numbered candidates,
# one of which ("Unicorn syndrome") is expected to have no match.
differential_diagnosis_text = """
**Differential Diagnosis:**
1. Branchiooculofacial syndrome
2. Unicorn syndrome
3. Cystic fibrosis
"""

# Expected grounding for the sample. Each matched term appears twice because
# that is what the recorded run returned.
expected_result = [
    (
        'Branchiooculofacial syndrome',
        [
            ('MONDO:0007235', 'branchiooculofacial syndrome'),
            ('MONDO:0007235', 'branchiooculofacial syndrome'),
        ],
    ),
    (
        'Unicorn syndrome',
        [('N/A', 'No grounding found')],
    ),
    (
        'Cystic fibrosis',
        [
            ('MONDO:0009061', 'cystic fibrosis'),
            ('MONDO:0009061', 'cystic fibrosis'),
        ],
    ),
]

# Step 1: strip the header line and show what is left.
cleaned_text = clean_service_answer(differential_diagnosis_text)
print("Cleaned Text:", cleaned_text, sep="\n")
assert cleaned_text != "", "Cleaning failed: the cleaned text is empty."

# Step 2: ground the remaining lines and compare with the expectation.
result = ground_diagnosis_text_to_mondo(annotator, cleaned_text, verbose=True)
print("Grounding Result:", result, sep="\n")
assert result == expected_result, f"Grounding failed: the result does not match the expected output. Got: {result}"

Cleaned Text:

1. Branchiooculofacial syndrome
2. Unicorn syndrome
3. Cystic fibrosis




Grounding Result:
[('Branchiooculofacial syndrome', [('MONDO:0007235', 'branchiooculofacial syndrome'), ('MONDO:0007235', 'branchiooculofacial syndrome')]), ('Unicorn syndrome', [('N/A', 'No grounding found')]), ('Cystic fibrosis', [('MONDO:0009061', 'cystic fibrosis'), ('MONDO:0009061', 'cystic fibrosis')])]


In [9]:
# `Series.progress_apply` only exists once tqdm has registered its pandas
# integration. Nothing earlier in this notebook does that, so the cell would
# raise AttributeError on a fresh kernel. Registering here is safe to repeat.
from tqdm import tqdm
tqdm.pandas()

# Clean and ground every 'service_answer', showing a progress bar.
# NOTE: the recorded run took roughly 45 minutes for 5267 rows.
o1_responses['grounded_diagnosis'] = o1_responses['service_answer'].progress_apply(
    lambda x: ground_diagnosis_text_to_mondo(annotator, clean_service_answer(x), verbose=False)
)

# Save the DataFrame with the new 'grounded_diagnosis' column to a CSV file
# (the row index is not written).
output_file = '../supplemental_data/gpt_o1_response/gpt_o1_response_grounded.csv'
o1_responses.to_csv(output_file, index=False)

# Display a sample of the updated DataFrame
o1_responses.head()

100%|██████████| 5267/5267 [45:23<00:00,  1.93it/s]


Unnamed: 0,problem,service_answer,metadata,grounded_diagnosis
0,I am running an experiment on a clinical case ...,"I'm sorry, but I cannot generate a differentia...",PMID_34722527_individual_103_7_Hui_Wang_Compre...,"[(I'm sorry, but I cannot generate a different..."
1,I am running an experiment on a clinical case ...,1. VACTERL association \n2. Feingold syndrome ...,PMID_32730804_Individual_3_en-prompt.txt,"[(VACTERL association, [('MONDO:0008642', 'VAC..."
2,I am running an experiment on a clinical case ...,1. Autosomal recessive hyper-IgE syndrome (DOC...,PMID_19776401_Patient_6_1_en-prompt.txt,[(Autosomal recessive hyper-IgE syndrome (DOCK...
3,I am running an experiment on a clinical case ...,1. Sclerosteosis \n2. Van Buchem disease \n3. ...,PMID_20358596_Patient_A_en-prompt.txt,"[(Sclerosteosis, [('MONDO:0017838', 'scleroste..."
4,I am running an experiment on a clinical case ...,1. Smith-Lemli-Opitz syndrome \n2. ATR-X syndr...,PMID_36586412_8_en-prompt.txt,"[(Smith-Lemli-Opitz syndrome, [('MONDO:0010035..."
