In [None]:
import json
import os

import pandas as pd
import tiktoken

In [None]:
def count_tokens(input_strings, encoding_name="cl100k_base"):
    """
    Count the tokens in each string using a tiktoken encoding.

    Args:
        input_strings: An iterable of strings. A single ``str`` is also accepted
            and is treated as one text rather than being iterated per character.
        encoding_name (str): Encoding name passed to ``tiktoken.get_encoding``.

    Returns:
        list[int]: One token count per input string, in input order.
    """
    # A bare str is itself iterable: without this guard every character would be
    # encoded separately and the caller would get per-character counts back.
    if isinstance(input_strings, str):
        input_strings = [input_strings]

    # Build the tokenizer once and reuse it for every text.
    tokenizer = tiktoken.get_encoding(encoding_name)
    return [len(tokenizer.encode(text)) for text in input_strings]


def save_case_json(response_content: str, output_path: str = "ddx_output.json") -> None:
    """
    Validate a JSON string and write it to ``output_path`` pretty-printed.

    This is a best-effort helper: problems are reported with ``print`` instead of
    being raised, and nothing is written when the content is not valid JSON
    (parsing happens before the output file is opened).

    Args:
        response_content (str): Text expected to contain a JSON document
        output_path (str): Destination file; overwritten if it already exists

    Returns:
        None
    """
    # Local import keeps this helper self-contained, mirroring read_file.
    import json

    try:
        # Parse the response to ensure it's valid JSON
        structured_ddx = json.loads(response_content)

        # Save to file with proper formatting; ensure_ascii=False keeps
        # non-ASCII characters readable instead of \uXXXX escapes.
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(structured_ddx, f, indent=2, ensure_ascii=False)

        print(f"Successfully saved differential diagnosis analysis to {output_path}")

    except json.JSONDecodeError as e:
        print(f"Error: Response is not valid JSON: {e}")
        print("Raw response content:")
        print(response_content)

    except Exception as e:
        # Anything else (e.g. the path cannot be opened) is reported, not raised.
        print(f"Error saving file: {e}")
        
def read_file(file_path: str) -> str:
    """
    Read a file (JSON or TXT) and return its contents as a string.

    JSON files are parsed and re-serialized with ``indent=2``, so the returned
    text is normalized JSON rather than the raw bytes on disk. The extension
    check is case-insensitive.

    Args:
        file_path (str): Path to the file to read

    Returns:
        str: Contents of the file as a string

    Raises:
        FileNotFoundError: If the file doesn't exist
        ValueError: If the file extension is not supported
        Exception: If the file cannot be read or (for JSON) parsed; the
            original error is attached as ``__cause__``
    """
    import json
    from pathlib import Path

    # Convert to Path object for easier handling
    path = Path(file_path)

    # Check if file exists
    if not path.exists():
        raise FileNotFoundError(f"File not found: {file_path}")

    # Validate the extension before entering the try block: raising inside it
    # would be caught by the blanket handler below and re-wrapped as a plain
    # Exception, hiding the documented ValueError from callers.
    extension = path.suffix.lower()
    if extension not in ('.json', '.txt'):
        raise ValueError(f"Unsupported file extension: {extension}")

    try:
        with open(path, 'r', encoding='utf-8') as f:
            if extension == '.json':
                # Load JSON then dump to string to get consistent formatting
                return json.dumps(json.load(f), indent=2)
            return f.read()
    except Exception as e:
        # Keep the original wrapper message, but chain the cause for debugging.
        raise Exception(f"Error reading file {file_path}: {str(e)}") from e

## Save structured case from unstructured text

In [None]:
# The API key is read from the environment so no credential lives in the notebook.
# NOTE(review): `OpenAI` is not imported in the visible import cell - confirm that
# `from openai import OpenAI` runs before this cell on a fresh kernel.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
model_cur = "gpt-4o-2024-08-06"

# Read the case file and schema
case_text = read_file('cases/raw_cases/NET_case_test_POC_DDX.txt')
schema_text = read_file('cases/schemas/case_schema.json')

# System prompt that explains the task and format.
# NOTE(review): the prompt text contains typos ("indictated", "illicit" for
# "elicit"); it is kept verbatim because editing it would change model output.
system_message = {
    "role": "system",
    "content": f"""You are a medical case structuring assistant. Your task is to convert unstructured clinical case text into a structured JSON format. 
    The cases will contain two main sections, The first section is "Presentation of case" we want to extract. 
    The "Differential Diagnosis" section is provided to help contextualize how experts thought about the information and create the teaching points.
Follow these guidelines:
1. Identify the "original presentation" which should just be the first few sentences of the case that
provides the context for further case exploration. It should be more than just the first sentence. Store this in the metadata where indictated
2. Extract key case elements including history, physical exam, testing, and management
3. Save the literal text related to this case element under "response"
4. Identify teaching points within each section


This information should be put into the schema structured below. For each individual element there should be these features
id: This is a unique code with a section identifier (H, P, T, or M) and a digit between 1 and 1000 sequentially
content: question that will illicit this piece of information
response: the information within the case that relates to the question
teaching_points: the teaching points for this case related to this information


The output should match this exact schema structure:
{schema_text}

Important: Return ONLY the JSON object with no additional text or explanation."""
}

# User message with the case text
user_message = {
    "role": "user",
    "content": f"Please structure this clinical case according to the schema shown above:\n\n{case_text}"
}


response = client.chat.completions.create(
    model=model_cur,
    messages=[system_message, user_message],
    temperature=0,  # Use 0 for consistent, structured output
    response_format={"type": "json_object"}
)

# Keep the reply as a JSON *string*; it is not parsed in this cell.
structured_case = response.choices[0].message.content

In [None]:
# Persist the model's reply; save_case_json checks that it parses as JSON before writing.
save_case_json(
    response_content=response.choices[0].message.content,
    output_path='cases/AA_tz_test_case.json',
)

## Add the DDx based only on the information currently in the case (for now GPT-derived, as a stand-in)

In [None]:
def create_ddx_prompt(section, case_data):
    """
    Assemble the chat messages that request a differential diagnosis.

    Case information is cumulative: 'history' includes only the history
    findings, 'physical' adds the physical exam, and 'testing' adds the test
    results. Any other value produces a prompt holding just the original
    presentation.

    Args:
        section (str): Stage being analyzed ('history', 'physical', or 'testing')
        case_data (dict): Structured case; read via
            ``['metadata']['original_presentation']`` and, for each included
            stage, ``['clinical_elements'][stage]['required' / 'optional']``

    Returns:
        list: Message dictionaries (system first, then user) in the format
        expected by OpenAI's API
    """

    def _question_and_finding(entry):
        # One rendering shared by required items and elicited optional items.
        return f"Question: {entry['content']}\nFinding: {entry['response']}\n\n"

    # Every stage implies all the stages listed before it.
    stage_order = ("history", "physical", "testing")
    if section in stage_order:
        included_stages = stage_order[:stage_order.index(section) + 1]
    else:
        included_stages = ()

    parts = [f"Original Presentation: {case_data['metadata']['original_presentation']}\n\n"]
    for stage in included_stages:
        parts.append(f"{stage.upper()}:\n")
        stage_elements = case_data['clinical_elements'][stage]
        # Required elements are always shown.
        parts.extend(_question_and_finding(entry) for entry in stage_elements['required'])
        # Optional elements appear only when flagged as elicited.
        parts.extend(
            _question_and_finding(entry)
            for entry in stage_elements['optional']
            if entry['elicited']
        )
    case_info = "".join(parts)

    system_prompt = """You are an expert medical diagnostician. Your task is to generate a differential diagnosis 
            based on the available case information. Consider all information provided cumulatively up to this point.
            
            For each diagnosis in the differential:
            1. Assess its likelihood given the current information
            2. Identify key features that would be expected
            3. List supporting and refuting evidence from the case
            4. Specify what additional information would be helpful
            
            Format your response as a JSON array matching this structure:
            {
                "current_ideal_differential_diagnosis": [
                    {
                        "name": "diagnosis name",
                        "category": "disease category",
                        "likelihood": "high/medium/low based on current information",
                        "key_features": ["expected feature 1", "expected feature 2"],
                        "supporting_evidence": ["evidence from case that supports this diagnosis"],
                        "refuting_evidence": ["evidence from case that refutes this diagnosis"],
                        "additional_information_needed": ["specific questions or tests needed"]
                    }
                ]
            }
            
            Important: Return ONLY the JSON object with no additional text."""

    user_prompt = f"""Based on the following case information up to this point, 
            generate a differential diagnosis list that reflects what an expert clinician 
            should be considering at this stage:\n\n{case_info}"""

    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]



In [None]:
# Reload the structured case saved earlier and parse it into a dict.
structured_case = json.loads(read_file('cases/AA_tz_test_case.json'))

# The API key is read from the environment so no credential lives in the notebook.
# NOTE(review): `OpenAI` is not imported in the visible import cell - confirm that
# `from openai import OpenAI` runs before this cell on a fresh kernel.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
model_cur = "gpt-4o-2024-08-06"

# One request per stage; each prompt is cumulative (see create_ddx_prompt).
for section in ["history", "physical", "testing"]:
    messages = create_ddx_prompt(section, structured_case)
    response = client.chat.completions.create(
        model=model_cur,  # use the configured model instead of repeating the literal
        messages=messages,
        temperature=0,
        response_format={"type": "json_object"}
    )
    # Parse the response and attach the differential to this section of the case
    ddx = json.loads(response.choices[0].message.content)
    structured_case['clinical_elements'][section]['current_ideal_differential_diagnosis'] = \
        ddx['current_ideal_differential_diagnosis']

# save_case_json takes a JSON string, so the dict is serialized before saving.
save_case_json(json.dumps(structured_case), 'cases/AAA_tz_test_case_with_DDx.json')

## creating structure for multi-phase cases
## !!!!Not currently used!!!!

In [86]:
# The API key is read from the environment so no credential lives in the notebook.
# NOTE(review): `OpenAI` is not imported in the visible import cell - confirm that
# `from openai import OpenAI` runs before this cell on a fresh kernel.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# Read the case file and schema
case_text = read_file('cases/NET_case_test_POC.txt')
schema_text = read_file('cases/schemas/case_schema.json')

# Output shape the model is asked to follow; interpolated into the system prompt.
json_format = """
{
        "original presentation": Beginning case description extracted from note,
        "Phase 1": Start character integer for the first phase,
        "Phase 2": Start character integer for the second phase,
        ...
}"""
# System prompt that explains the task and format.
# NOTE(review): the prompt text contains a typo ("backgroud"); it is kept
# verbatim because editing it would change model output.
system_message = {
    "role": "system",
    "content": f"""You are a medical case structuring assistant. I will give you an unstructured clinical case text 
    that describes the medical course for a patient. Your task is to split this into 1-4 "phases", episodes of the case
    that represent time periods in which we can pause and think about the information within the case and what the differential diagnosis is.
    Regardless of how many phases, you should return the "original presentation" which is the first couple sentences which introduce the case
    and the reason a bit of backgroud to start the thinking process
    If there is only 1 such presentation (for example a single outpatient visit, or a hospital admission) you can just return the character
    number that ends the original presentation. 
    If multiphase, please return the character number in the string at which each phase break occurs. Please return in the following format:
    {json_format}
    """

}

# User message with the case text
user_message = {
    "role": "user",
    "content": f"Here is the case text to process as above:\n\n{case_text}"
}


model_cur = "gpt-4o-2024-08-06"
response = client.chat.completions.create(
    model=model_cur,
    messages=[system_message, user_message],
    temperature=0,  # Use 0 for consistent, structured output
)

# Raw model reply as a string; it is not parsed here.
structured_case = response.choices[0].message.content

INFO:root:Attempting to read file from: /Users/tzack/Documents/Socratic_AI/socratic_ai_sam_newest/cases/NET_case_test_POC.txt
INFO:root:Successfully read 8053 characters from cases/NET_case_test_POC.txt
INFO:root:First 100 characters of cases/NET_case_test_POC.txt:
Case Presentation:

  Dr. Michael T. Forrester (Medicine): A 54-year-old man was admitted to this ho
INFO:root:Attempting to read file from: /Users/tzack/Documents/Socratic_AI/socratic_ai_sam_newest/cases/schemas/case_schema.json
INFO:root:Successfully read 3292 characters from cases/schemas/case_schema.json
INFO:root:First 100 characters of cases/schemas/case_schema.json:
{
    "metadata": {
        "id": "string",
        "title": "string",
        "difficulty": "string
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
