In [None]:
import pandas as pd

In [77]:
def create_ddx_prompt(schema: str, case_text: str) -> dict:
    """
    Creates a prompt for the LLM to structure differential diagnosis discussion
    
    Args:
        schema: The JSON schema string
        case_text: The unstructured case discussion text
    
    Returns:
        dict: The messages for the API call
    """
    
    system_message = {
        "role": "system",
        "content": f"""You are a medical differential diagnosis structuring assistant. 
        Your task is to analyze unstructured clinical case discussions and convert them into a structured format 
        focusing on the diagnostic reasoning process. Please try to be as comprehensive as possible, including all diagnoses discussed,
        even if they are excluded by the expert. After you create this structured list, please create a ranking and categorization for diagnosis
        This ranking should be guided by the discussion, and defer to seniority, such that the most likely diagnosis should be that
        which the expert discussant that was most likely, not the medical students


Follow these key guidelines:

1. Create unique IDs for each element using prefixes:
   - DD for diagnoses (e.g., DD001)

2. For each diagnosis discussed:
   - Summarize supporting and opposing evidence and save as unstructured text under "findings"
   - Note suggested missing workup/testing and save as unstructured text under "Potential missing workup"
   - Determine confidence level based on discussion

3. For ranking and categorization:
   - Order diagnoses by likelihood based on case discussion
   - Tag critical "can't miss" diagnoses that must be excluded
   - Note confidence level (high/moderate/low)

The output must exactly match this schema structure:
{schema}

Return ONLY the JSON object with no additional text or explanations."""
    }
    
    user_message = {
        "role": "user",
        "content": f"""Please analyze this clinical case discussion and structure the differential diagnosis analysis according to the schema above:

{case_text}"""
    }
    
    return {
        "messages": [system_message, user_message],
        "temperature": 0,
        "response_format": {"type": "json_object"}
    }

In [78]:
def save_ddx_json(response_content: str, output_path: str = "ddx_output.json") -> None:
    try:
        # Parse the response to ensure it's valid JSON
        structured_ddx = json.loads(response_content)
        
        # Save to file with proper formatting
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(structured_ddx, f, indent=2, ensure_ascii=False)
        
        print(f"Successfully saved differential diagnosis analysis to {output_path}")
        
    except json.JSONDecodeError as e:
        print(f"Error: Response is not valid JSON: {e}")
        print("Raw response content:")
        print(response_content)
        
    except Exception as e:
        print(f"Error saving file: {e}")

In [79]:
client = OpenAI(api_key="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")

# Read the case file and schema
case_text = read_file('cases/NET_case_test_DDX.txt')
schema_text = read_file('cases/schemas/ddx_schema.json')
prompt = create_ddx_prompt(schema_text, case_text)

# Make the API call
response = client.chat.completions.create(
    model="gpt-4o-2024-08-06",
    **prompt
)



INFO:root:Attempting to read file from: /Users/tzack/Documents/Socratic_AI/socratic_ai_sam_newest/cases/NET_case_test_DDX.txt
INFO:root:Successfully read 12031 characters from cases/NET_case_test_DDX.txt
INFO:root:First 100 characters of cases/NET_case_test_DDX.txt:
  Dr. Leigh H. Simmons: May we review the imaging studies? A coronal reformatted image from a contra
INFO:root:Attempting to read file from: /Users/tzack/Documents/Socratic_AI/socratic_ai_sam_newest/cases/schemas/ddx_schema.json
INFO:root:Successfully read 1538 characters from cases/schemas/ddx_schema.json
INFO:root:First 100 characters of cases/schemas/ddx_schema.json:
{
    "metadata": {
        "id": "string",
        "title": "string",
        "difficulty": "string
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [80]:
save_ddx_json(response.choices[0].message.content, "case_DDx_file_comprehensive.json")

Successfully saved differential diagnosis analysis to case_DDx_file_comprehensive.json


In [86]:
client = OpenAI(api_key="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")

# Read the case file and schema
case_text = read_file('cases/NET_case_test_POC.txt')
schema_text = read_file('cases/schemas/case_schema.json')

# Structure the case
#structured_case = structure_clinical_case(client, case_text, schema_text)
json_format = """
{
        "original presentation": Beginning case description extracted from note,
        "Phase 1": Start character integer for the first phase,
        "Phase 2": Start character integer for the second phase,
        ...
}"""
# System prompt that explains the task and format
system_message = {
    "role": "system",
    "content": f"""You are a medical case structuring assistant. I will give you an unstructured clinical case text 
    that describes the medical course for a patient. Your task is to split this into 1-4 "phases", episodes of the case
    that represent time periods in which we can pause and think about the information within the case and what the differential diagnosis is.
    Regardless of how many phases, you should return the "original presentation" which is the first couple sentences which introduce the case
    and the reason a bit of backgroud to start the thinking process
    If there is only 1 such presentation (for example a single outpatient visit, or a hospital admission) you can just return the character
    number that ends the original presentation. 
    If multiphase, please return the character number in the string at which each phase break occurs. Please return in the following format:
    {json_format}
    """

}

# User message with the case text
user_message = {
    "role": "user",
    "content": f"Here is the case text to process as above:\n\n{case_text}"
}


model_cur= "gpt-4o-2024-08-06"
response = client.chat.completions.create(
    model=model_cur,
    messages=[system_message, user_message],
    temperature=0,  # Use 0 for consistent, structured output
)

# Extract and parse JSON from response
structured_case = response.choices[0].message.content

INFO:root:Attempting to read file from: /Users/tzack/Documents/Socratic_AI/socratic_ai_sam_newest/cases/NET_case_test_POC.txt
INFO:root:Successfully read 8053 characters from cases/NET_case_test_POC.txt
INFO:root:First 100 characters of cases/NET_case_test_POC.txt:
Case Presentation:

  Dr. Michael T. Forrester (Medicine): A 54-year-old man was admitted to this ho
INFO:root:Attempting to read file from: /Users/tzack/Documents/Socratic_AI/socratic_ai_sam_newest/cases/schemas/case_schema.json
INFO:root:Successfully read 3292 characters from cases/schemas/case_schema.json
INFO:root:First 100 characters of cases/schemas/case_schema.json:
{
    "metadata": {
        "id": "string",
        "title": "string",
        "difficulty": "string
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [88]:
dict_cur = json.loads(structured_case)

In [91]:
dict_cur

{'original presentation': 'A 54-year-old man was admitted to this hospital because of diarrhea, vomiting, and weight loss.',
 'Phase 1': 122,
 'Phase 2': 1080,
 'Phase 3': 1865,
 'Phase 4': 2760}

## Final step: use final DDx and information at or before each stage to create expected DDx. 

In [104]:
client = OpenAI(api_key="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")

# Read the case file and schema
case_text = read_file('cases/NET_case_test_POC.txt')
ddx_json = read_file('case_DDx_file_comprehensive.json')
schema_text = read_file('cases/schemas/case_schema.json')

# Structure the case
#structured_case = structure_clinical_case(client, case_text, schema_text)


# System prompt that explains the task and format
system_message = {
    "role": "system",
    "content": f"""You are a medical case structuring assistant. Your task is to convert unstructured clinical case text
    into a structured JSON format that includes a differential diagnosis (DDx) that should be considered given the current information.
    I am going to give you three seperate elements:
    A. A clinical case, that may be truncated at a specific time we want to create the DDx
    B. A structured DDx created by experts at the END of the case, that includes reasons they included or excluded a diagnosis
    C. A schema to demonstrate the way the output should be structured
Follow these guidelines:
1. Extract key case elements including history, physical exam, testing, and management
2. For each element, extract the literal text that corresponse to it and save as "response"
3. Identify teaching points within each section
4. Generate appropriate differential diagnoses based on the case details CURRENTLY present AND the final ddx presented in the json. 
    To do this, FIRST create a ranked differential diagnosis given the information present and your background understanding of medicine.
    Then, look at the structured DDx from experts, who may have more information than currently provided to the student. 
5. Advancement criteria: Information that should be collected from within this section in order for the learner to be considered done

This information should be put into the schema structured below. For each individual element there should be these features
id: This is a unique code with a section identifier (H, P, T, or M) and a digit between 1 and 1000 sequentially
content: question that will illicit this piece of information
response: the information within the case that relates to the question
teaching_points: the teaching points for this case


The output should match this exact schema structure:
{schema_text}

Important: Return ONLY the JSON object with no additional text or explanation."""
}

case_text_cur = case_text[:dict_cur['Phase 2']]
# User message with the case text
user_message = {
    "role": "user",
    "content": f"Please structure this clinical case according to the schema shown above:\n\n{case_text}\n\n"+\
                f"Here is the structured differential diagnosis information created by the experts at the end of the case (which may mean they had more information than is currently present): {ddx_json}"
}




INFO:root:Attempting to read file from: /Users/tzack/Documents/Socratic_AI/socratic_ai_sam_newest/cases/NET_case_test_POC.txt
INFO:root:Successfully read 8053 characters from cases/NET_case_test_POC.txt
INFO:root:First 100 characters of cases/NET_case_test_POC.txt:
Case Presentation:

  Dr. Michael T. Forrester (Medicine): A 54-year-old man was admitted to this ho
INFO:root:Attempting to read file from: /Users/tzack/Documents/Socratic_AI/socratic_ai_sam_newest/case_DDx_file_comprehensive.json
INFO:root:Successfully read 6522 characters from case_DDx_file_comprehensive.json
INFO:root:First 100 characters of case_DDx_file_comprehensive.json:
{
  "metadata": {
    "id": "case_001",
    "title": "Chronic Diarrhea and Pancreatic Mass",
    "di
INFO:root:Attempting to read file from: /Users/tzack/Documents/Socratic_AI/socratic_ai_sam_newest/cases/schemas/case_schema.json
INFO:root:Successfully read 4006 characters from cases/schemas/case_schema.json
INFO:root:First 100 characters of cases/sc

In [105]:
count_tokens([system_message['content'],user_message['content']])

[1053, 3326]

In [106]:
model_cur= "gpt-4o-2024-08-06"
response = client.chat.completions.create(
    model=model_cur,
    messages=[system_message, user_message],
    temperature=0,  # Use 0 for consistent, structured output
)

# Extract and parse JSON from response
structured_case = response.choices[0].message.content

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [107]:
print(structured_case)

```json
{
    "metadata": {
        "id": "case_001",
        "title": "Chronic Diarrhea and Pancreatic Mass",
        "difficulty": "high",
        "specialties": ["Gastroenterology", "Endocrinology"],
        "keywords": ["chronic diarrhea", "pancreatic mass", "gastrinoma", "Zollinger-Ellison syndrome"]
    },
    "phases": {
        "original presentation": {
            "elements": [
                {
                    "id": "OP001",
                    "response": "A 54-year-old man was admitted to this hospital because of diarrhea, vomiting, and weight loss."
                }
            ]
        },
        "history": {
            "elements": [
                {
                    "id": "H001",
                    "content": "What is the patient's travel history?",
                    "required": true,
                    "response": "The patient traveled to Colorado and the Middle East, with symptoms developing after both trips.",
                    "teaching_points": [
 

In [120]:
client = OpenAI(api_key="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")
model_cur= "gpt-4o-2024-08-06"
# Read the case file and schema
case_text = read_file('cases/raw_cases/NET_case_test_POC_DDX.txt')
schema_text = read_file('cases/schemas/case_schema.json')

# Structure the case
#structured_case = structure_clinical_case(client, case_text, schema_text)


# System prompt that explains the task and format
system_message = {
    "role": "system",
    "content": f"""You are a medical case structuring assistant. Your task is to convert unstructured clinical case text into a structured JSON format. 
    The cases will contain two main sections, The first section is "Presentation of case" we want to extract. 
    The "Differential Diagnosis" section is provided to help contextualize how experts thought about the information and create the teaching points.
Follow these guidelines:
1. Identify the "original presentation" which should just be the first few sentences of the case that
provides the context for further case exploration. It should be more than just the first sentence. Store this in the metadata where indictated
2. Extract key case elements including history, physical exam, testing, and management
3. Save the literal text related to this case element under "response"
4. Identify teaching points within each section


This information should be put into the schema structured below. For each individual element there should be these features
id: This is a unique code with a section identifier (H, P, T, or M) and a digit between 1 and 1000 sequentially
content: question that will illicit this piece of information
response: the information within the case that relates to the question
teaching_points: the teaching points for this case related to this information


The output should match this exact schema structure:
{schema_text}

Important: Return ONLY the JSON object with no additional text or explanation."""
}

# User message with the case text
user_message = {
    "role": "user",
    "content": f"Please structure this clinical case according to the schema shown above:\n\n{case_text}"
}


model_cur= "gpt-4o-2024-08-06"
response = client.chat.completions.create(
    model=model_cur,
    messages=[system_message, user_message],
    temperature=0,  # Use 0 for consistent, structured output
    response_format={"type": "json_object"}
)

# Extract and parse JSON from response
structured_case = response.choices[0].message.content

INFO:root:Attempting to read file from: /Users/tzack/Documents/Socratic_AI/socratic_ai_sam_newest/cases/raw_cases/NET_case_test_POC_DDX.txt
INFO:root:Successfully read 20110 characters from cases/raw_cases/NET_case_test_POC_DDX.txt
INFO:root:First 100 characters of cases/raw_cases/NET_case_test_POC_DDX.txt:
Case Presentation:

  Dr. Michael T. Forrester (Medicine): A 54-year-old man was admitted to this ho
INFO:root:Attempting to read file from: /Users/tzack/Documents/Socratic_AI/socratic_ai_sam_newest/cases/schemas/case_schema.json
INFO:root:Successfully read 4865 characters from cases/schemas/case_schema.json
INFO:root:First 100 characters of cases/schemas/case_schema.json:
{
    "metadata": {
        "id": "string",
        "title": "string",
        "original_pesentation
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [121]:
save_ddx_json(response.choices[0].message.content, 'cases/tz_test_case.json')

Successfully saved differential diagnosis analysis to cases/tz_test_case.json


## adding a GPT created differential for each section (for now)

In [131]:
def create_ddx_prompt(section, case_data):
    """
    Creates a prompt for generating differential diagnoses based on cumulative case information
    
    Args:
        section (str): Current section being analyzed ('history', 'physical', or 'testing')
        case_data (dict): The structured case data up to the current section
    
    Returns:
        list: List of message dictionaries in the format expected by OpenAI's API
    """
    
    # Build the case information text based on section
    case_info = f"Original Presentation: {case_data['metadata']['original_presentation']}\n\n"
    
    sections_to_include = []
    if section == "history" or section == "physical" or section == "testing":
        sections_to_include.append("history")
    if section == "physical" or section == "testing":
        sections_to_include.append("physical")
    if section == "testing":
        sections_to_include.append("testing")
    
    for curr_section in sections_to_include:
        case_info += f"{curr_section.upper()}:\n"
        # Add required elements
        for item in case_data['clinical_elements'][curr_section]['required']:
            case_info += f"Question: {item['content']}\n"
            case_info += f"Finding: {item['response']}\n\n"
        # Add optional elements
        for item in case_data['clinical_elements'][curr_section]['optional']:
            if item['elicited']:
                case_info += f"Question: {item['content']}\n"
                case_info += f"Finding: {item['response']}\n\n"

    messages = [
        {
            "role": "system",
            "content": """You are an expert medical diagnostician. Your task is to generate a differential diagnosis 
            based on the available case information. Consider all information provided cumulatively up to this point.
            
            For each diagnosis in the differential:
            1. Assess its likelihood given the current information
            2. Identify key features that would be expected
            3. List supporting and refuting evidence from the case
            4. Specify what additional information would be helpful
            
            Format your response as a JSON array matching this structure:
            {
                "current_ideal_differential_diagnosis": [
                    {
                        "name": "diagnosis name",
                        "category": "disease category",
                        "likelihood": "high/medium/low based on current information",
                        "key_features": ["expected feature 1", "expected feature 2"],
                        "supporting_evidence": ["evidence from case that supports this diagnosis"],
                        "refuting_evidence": ["evidence from case that refutes this diagnosis"],
                        "additional_information_needed": ["specific questions or tests needed"]
                    }
                ]
            }
            
            Important: Return ONLY the JSON object with no additional text."""
        },
        {
            "role": "user",
            "content": f"""Based on the following case information up to this point, 
            generate a differential diagnosis list that reflects what an expert clinician 
            should be considering at this stage:\n\n{case_info}"""
        }
    ]
    
    return messages



In [136]:
structured_case = json.loads(read_file('cases/AA_tz_test_case.json'))

INFO:root:Attempting to read file from: /Users/tzack/Documents/Socratic_AI/socratic_ai_sam_newest/cases/AA_tz_test_case.json
INFO:root:Successfully read 7200 characters from cases/AA_tz_test_case.json
INFO:root:First 100 characters of cases/AA_tz_test_case.json:
{
  "metadata": {
    "id": "case_001",
    "title": "54-year-old man with diarrhea, vomiting, and w


In [137]:
client = OpenAI(api_key="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")
model_cur= "gpt-4o-2024-08-06"
for section in ["history", "physical", "testing"]:
    messages = create_ddx_prompt(section, structured_case)
    response = client.chat.completions.create(
        model="gpt-4o-2024-08-06",
        messages=messages,
        temperature=0,
        response_format={"type": "json_object"}
    )
    # Parse the response and update the case structure
    ddx = json.loads(response.choices[0].message.content)
    structured_case['clinical_elements'][section]['current_ideal_differential_diagnosis'] = \
        ddx['current_ideal_differential_diagnosis']


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [140]:
save_ddx_json(json.dumps(structured_case), 'cases/AAA_tz_test_case_with_DDx.json')

Successfully saved differential diagnosis analysis to cases/AAA_tz_test_case_with_DDx.json


In [130]:
response = client.chat.completions.create(
        model="gpt-4o-2024-08-06",
        messages=messages,
        temperature=0
    )

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 400 Bad Request"


BadRequestError: Error code: 400 - {'error': {'message': "Invalid type for 'messages[0]': expected an object, but got a string instead.", 'type': 'invalid_request_error', 'param': 'messages[0]', 'code': 'invalid_type'}}

In [74]:
import tiktoken
def count_tokens(input_strings, encoding_name="cl100k_base"):
    # Initialize the tokenizer
    tokenizer = tiktoken.get_encoding(encoding_name)
    
    token_counts = []
    for text in input_strings:
        tokens = tokenizer.encode(text)
        token_counts.append(len(tokens))
    return token_counts


In [None]:
# Load custom functions for routing
with open("nci_app_functions.json") as fcn_file:
    custom_functions = json.load(fcn_file)