# Clinical Abstraction Task - LLM-Based Clinical Insights 


In [28]:
# Install the third-party packages this notebook imports (openai, pandas,
# python-dotenv, scikit-learn) plus openpyxl, which is not imported directly
# here (presumably the engine pandas uses for .xlsx files; confirm).
# NOTE(review): versions are unpinned, so a fresh install may not match the
# environment that produced the saved outputs.
%pip install openai pandas python-dotenv openpyxl scikit-learn


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3.10 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [63]:
#Import Statements 
import os, json, pandas as pd
from openai import OpenAI
from dotenv import load_dotenv
# Load environment variables (via python-dotenv) before reading the API key
load_dotenv()

# Shared OpenAI client used by every model call in this notebook
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [72]:
# Load the clinical notes workbook into a DataFrame
df = pd.read_excel(io="synthetic_clinical_notes.xlsx")
# Preview the first ten rows (last expression, so the notebook renders it)
df.head(n=10)

Unnamed: 0,note
0,"Patient: John Smith, 58-year-old male\nMedical..."
1,"Patient: Linda Green, 45-year-old female\nMedi..."
2,"Patient: Michael Brown, 62-year-old male\nMedi..."
3,"Patient: Sarah Johnson, 50-year-old female\nMe..."
4,"Patient: Carlos Ramirez, 55-year-old male\nMed..."
5,"Patient: Rebecca Lee, 29-year-old female\nMedi..."
6,"Patient: Thomas Wilson, 67-year-old male\nMedi..."
7,"Patient: Emily Dawson, 36-year-old female\nMed..."
8,"Patient: Robert Kim, 48-year-old male\nMedical..."
9,"Patient: Diane Carter, 60-year-old female\nMed..."


In [65]:
# System prompt for the extraction model (call_clinical_note_model).
# The schema shown to the model must itself be valid JSON, otherwise the model
# is being asked to "follow exactly" a malformed example; every key/value pair
# except the last is therefore terminated with a comma.
system_prompt = """

You are a medical assistant who is tasked with extracting structured clinical insights from unstructured clinical notes. From these notes, extract these insights into a 
JSON output, following this exact schema: 

{
  "patient_name": "",
  "age": "",
  "gender": "",
  "disease": "",
  "chief_complaint": "",
  "vital_signs": {},
  "symptoms": [],
  "medications": [],
  "lab_results": {},
  "assessment_plan": "",
  "summarization_main_findings": "Generate a one-sentence summary for each patient highlighting their key risk factors."
}

Important Guidelines: 
- Do NOT include any additional explanation or text outside of the JSON.
- If a field is not mentioned in the note, use an empty string (""), an empty object ({}), or an empty list ([]) as appropriate.
- Do not infer or hallucinate information that is not explicitly stated in the note.

"""

In [66]:
# Instructions for the judge model (prepended to the note and the extracted
# JSON inside call_judge_model). It asks for a JSON object with three keys:
#   - "overall_verdict": free-text summary of extraction quality
#   - "field_verdicts":  one verdict string per extracted field
#   - "evidence":        a supporting quote from the note per field
# Standard fields use the verdicts correct / missing / hallucinated / incorrect;
# "summarization_main_findings" uses accurate / inaccurate / irrelevant.
# The metrics cell keys off these exact verdict strings, so keep them in sync.
judge_prompt = """
You are a strict clinical note evaluator.

Your task is to assess whether a structured JSON extraction from a clinical note is accurate, complete, and hallucination-free.

Return ONLY a valid JSON object using the exact format below. Do NOT include any markdown formatting, code blocks, triple backticks, or explanations.

Schema:

{
  "overall_verdict": "Generate a two-three sentence summary on how accurate the structured JSON extraction is to the ground truth. Explain and state what was wrong and right, and how it can be improved if need be.",
  "field_verdicts": {
    "patient_name": <verdict>,
    "age": <verdict>,
    "gender": <verdict>,
    "disease": <verdict>,
    "chief_complaint": <verdict>,
    "vital_signs": <verdict>,
    "symptoms": <verdict>,
    "medications": <verdict>,
    "lab_results": <verdict>,
    "assessment_plan": <verdict>,
    "summarization_main_findings": <verdict>
  },
  "evidence": {
    "patient_name": "<quote from note or ''>",
    "age": "...",
    "gender": "...",
    "disease": "...",
    "chief_complaint": "...",
    "vital_signs": "...",
    "symptoms": "...",
    "medications": "...",
    "lab_results": "...",
    "assessment_plan": "...",
    "summarization_main_findings": "..."
  }
}

Valid values for each <verdict> are:
- "correct" - the field strongly and/or exactly matches the content of the note, or is correctly left empty when not mentioned.
- "missing" - the field is mentioned in the note but missing from the extracted JSON.
- "hallucinated" - the field includes non-empty content that is not present in the note.
- "incorrect" - the field is present in both but contains inaccurate or mismatched content.

Note: If a field is *not* present in the note and is returned as an empty string (""), object ({}), or list ([]), this should be considered "correct", not "hallucinated".

Special evaluation for "summarization_main_findings":

This field should summarize the patient's key risk factors or medical concerns from the note in one sentence. It does not need to match the text verbatim but should:
- Include the most important clinical findings (e.g., high A1C, fatigue, hypertension).
- Avoid introducing new or unrelated conditions.

Valid verdicts for this field are:
- "accurate" - the summary correctly reflects key findings in the note.
- "inaccurate" - the summary misses, distorts, or overstates the clinical findings.
- "irrelevant" - the summary is unrelated to the note or nonsensical.

Do NOT label this field as "hallucinated" based on wording differences alone.
"""



In [68]:
#Calling on Main LLM and Judge LLM

#Main LLM

def _parse_json_response(content):
    """Turn raw model output into a Python object.

    Handles the wrappers chat models tend to add around JSON:
    a surrounding Markdown code fence (``` or ```json) and a JSON document
    that was itself encoded as a JSON string.

    Args:
        content: The message text returned by the model (may be None).

    Returns:
        The decoded JSON value (a dict for both prompts in this notebook).

    Raises:
        ValueError: If the content is empty/None, or is not valid JSON
            (json.JSONDecodeError is a ValueError subclass).
    """
    if content is None or not content.strip():
        raise ValueError("Model returned an empty response; expected a JSON object.")

    text = content.strip()

    # Remove only the *surrounding* fence. Backticks that appear inside JSON
    # string values must survive, so no global replace is used.
    if text.startswith("```"):
        text = text[3:]
        if text[:4].lower() == "json":
            text = text[4:]
        if text.rstrip().endswith("```"):
            text = text.rstrip()[:-3]
        text = text.strip()

    parsed = json.loads(text)

    # The model occasionally returns the JSON document wrapped in quotes
    # (i.e. a JSON string containing JSON); decode the inner document too.
    if isinstance(parsed, str):
        parsed = json.loads(parsed)

    return parsed


def call_clinical_note_model(note, temperature=0.1, max_tokens=3000):
    """Extract structured clinical insights from one note with gpt-4o-mini.

    Sends the module-level ``system_prompt`` plus the note to the chat
    completions API and parses the reply as JSON.

    Args:
        note: Clinical note text, sent as the user message.
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Completion token limit forwarded to the API.

    Returns:
        The parsed JSON reply.

    Raises:
        ValueError: If the reply is empty or cannot be parsed as JSON.
    """
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": note}
        ],
        temperature=temperature,
        max_tokens=max_tokens,
        # JSON mode: ask the API to emit a single JSON object. The prompt
        # already mentions JSON, which this mode requires.
        response_format={"type": "json_object"},
    )
    return _parse_json_response(response.choices[0].message.content)



#Judge LLM
def call_judge_model(note, extracted_json, max_tokens=3000, temperature=0.1):
    """Ask gpt-4o to audit an extraction against its source note.

    Builds one user message from the module-level ``judge_prompt``, the note
    and the pretty-printed extraction, then parses the reply as JSON.

    Args:
        note: The original clinical note text.
        extracted_json: JSON-serialisable output of the extraction model.
        max_tokens: Completion token limit forwarded to the API.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The parsed JSON audit.

    Raises:
        ValueError: If the reply is empty or cannot be parsed as JSON.
    """
    prompt =  f"""{judge_prompt}

Here is the clinical note:
\"\"\"{note}\"\"\"

Here is the extracted JSON:
{json.dumps(extracted_json, indent=2)}

"""

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "user", "content": prompt}
        ],
        max_tokens=max_tokens,
        temperature=temperature,
        # Same JSON mode as the extraction call; the fence/quote handling in
        # _parse_json_response remains as a fallback.
        response_format={"type": "json_object"},
    )

    return _parse_json_response(response.choices[0].message.content)



     





In [73]:
# Smoke-test the extraction model on the note at position 7
note = df["note"].iloc[7]
response = call_clinical_note_model(note)
# Pretty-print the structured result for inspection
formatted_response = json.dumps(response, indent=2)
print(formatted_response)

{
  "patient_name": "Emily Dawson",
  "age": "36",
  "gender": "female",
  "disease": "Type 2 Diabetes",
  "chief_complaint": "Increased thirst and frequent urination for the past two weeks.",
  "vital_signs": {},
  "symptoms": [
    "increased thirst",
    "frequent urination",
    "mild anxiety"
  ],
  "medications": [
    "Metformin 500 mg once daily"
  ],
  "lab_results": {
    "A1C": "9.0%"
  },
  "assessment_plan": "Stress the importance of medication adherence. Consider adding a short-acting insulin if glucose remains uncontrolled. Advise adopting consistent meal times.",
  "summarization_main_findings": "Emily has poorly controlled Type 2 Diabetes with an A1C of 9.0%, is non-compliant with her medication, and is experiencing stress from a new job."
}


In [74]:
# Grab one note
note = df["note"].iloc[7]

# Extract structured output.
# The extraction call raises on failure instead of returning an error dict,
# so capture the exception in the same {"error": ...} shape the batch run
# uses; otherwise the "Extraction failed" branch below could never run.
try:
    extracted = call_clinical_note_model(note)
except Exception as e:
    extracted = {"error": str(e)}

# Judge the output
if "error" not in extracted:
    audit = call_judge_model(note, extracted)
    print("\n Final judge audit:")
    print(json.dumps(audit, indent=2))
else:
    print(" Extraction failed:")
    print(extracted)



 Final judge audit:
{
  "overall_verdict": "The structured JSON extraction is mostly accurate, capturing the key details from the clinical note. However, it misses the mention of the patient's stressful new job and the illness of her dog, which are relevant to her current condition. Including these details would improve the completeness of the extraction.",
  "field_verdicts": {
    "patient_name": "correct",
    "age": "correct",
    "gender": "correct",
    "disease": "correct",
    "chief_complaint": "correct",
    "vital_signs": "correct",
    "symptoms": "correct",
    "medications": "correct",
    "lab_results": "correct",
    "assessment_plan": "correct",
    "summarization_main_findings": "accurate"
  },
  "evidence": {
    "patient_name": "Emily Dawson",
    "age": "36-year-old",
    "gender": "female",
    "disease": "Type 2 Diabetes",
    "chief_complaint": "Increased thirst and frequent urination for the past two weeks.",
    "vital_signs": "",
    "symptoms": "Increased t

## Run extraction and judging over all notes

In [75]:
#Run Extraction + Judge
# Runs both models over every note and stores the results in two parallel
# lists (same index = same note), then writes each list to a JSON file.

audits = []
extracted_outputs = []

for note in df["note"]:
    # Extraction and judging are guarded separately so that a judge failure
    # does not throw away an extraction that succeeded.
    try:
        extracted = call_clinical_note_model(note)
    except Exception as e:
        # Nothing to judge: record the failure on both sides to keep the
        # lists aligned.
        extracted_outputs.append({"error": str(e)})
        audits.append({"error": str(e)})
        continue

    extracted_outputs.append(extracted)

    try:
        audits.append(call_judge_model(note, extracted))
    except Exception as e:
        audits.append({"error": str(e)})

# Save Results
with open("predicted_clinical_insights.json", "w", encoding="utf-8") as f:
    json.dump(extracted_outputs, f, indent=2)

with open("judge_LLM_results.json", "w", encoding="utf-8") as f:
    json.dump(audits, f, indent=2)

In [80]:
#Some Metrics 

from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Fields scored with the standard verdict set
# (correct / missing / hallucinated / incorrect)
standard_fields = [
    "patient_name",
    "age",
    "gender",
    "disease",
    "chief_complaint",
    "vital_signs",
    "symptoms",
    "medications",
    "lab_results",
    "assessment_plan",
]

# Convert judge verdicts into binary label pairs for scikit-learn metrics
def extract_labels(field_name, audits):
    """Map the judge's verdicts for one field onto (y_true, y_pred) lists.

    Verdict -> (y_true, y_pred):
        standard fields:
            "correct"                    -> (1, 1)
            "missing"                    -> (1, 0)
            "hallucinated" / "incorrect" -> (0, 1)
        "summarization_main_findings":
            "accurate"                   -> (1, 1)
            "inaccurate"                 -> (1, 0)
            "irrelevant"                 -> (0, 1)

    Verdicts are matched case-insensitively and with surrounding whitespace
    ignored, because LLM output is not guaranteed to use the exact casing
    requested. Audits without a ``field_verdicts`` mapping (e.g. error
    records) and unrecognised or non-string verdicts are skipped.

    Args:
        field_name: Key to look up inside each audit's ``field_verdicts``.
        audits: Iterable of judge results (dicts).

    Returns:
        Tuple ``(y_true, y_pred)`` of equal-length lists of 0/1 ints.
    """
    if field_name == "summarization_main_findings":
        label_map = {
            "accurate": (1, 1),
            "inaccurate": (1, 0),
            "irrelevant": (0, 1),
        }
    else:
        label_map = {
            "correct": (1, 1),
            "missing": (1, 0),
            "hallucinated": (0, 1),
            "incorrect": (0, 1),
        }

    y_true, y_pred = [], []

    for audit in audits:
        verdicts = audit.get("field_verdicts") if isinstance(audit, dict) else None
        if not isinstance(verdicts, dict):
            continue

        verdict = verdicts.get(field_name, "")
        if not isinstance(verdict, str):
            continue

        labels = label_map.get(verdict.strip().lower())
        if labels is None:
            continue  # Skip unknown/empty verdicts

        y_true.append(labels[0])
        y_pred.append(labels[1])

    return y_true, y_pred

# Evaluate all fields
all_fields = standard_fields + ["summarization_main_findings"]

# Pad every name to the longest field so the metric columns line up
# (a fixed width of 25 is shorter than "summarization_main_findings").
name_width = max(len(field) for field in all_fields)

print("Evaluation Metrics for Each Field:\n")

for field in all_fields:
    y_true, y_pred = extract_labels(field, audits)
    if y_true:
        # zero_division=0 reports 0.00 for a degenerate label set (e.g. no
        # positive predictions) without emitting UndefinedMetricWarning.
        p = precision_score(y_true, y_pred, zero_division=0)
        r = recall_score(y_true, y_pred, zero_division=0)
        f1 = f1_score(y_true, y_pred, zero_division=0)
        acc = accuracy_score(y_true, y_pred)
        print(f"{field:{name_width}} → Precision: {p:.2f} | Recall: {r:.2f} | F1: {f1:.2f} | Accuracy: {acc:.2f}")
    else:
        print(f"{field:{name_width}} → No data available")





Evaluation Metrics for Each Field:

patient_name              → Precision: 1.00 | Recall: 1.00 | F1: 1.00 | Accuracy: 1.00
age                       → Precision: 0.95 | Recall: 1.00 | F1: 0.97 | Accuracy: 0.95
gender                    → Precision: 1.00 | Recall: 1.00 | F1: 1.00 | Accuracy: 1.00
disease                   → Precision: 0.95 | Recall: 1.00 | F1: 0.97 | Accuracy: 0.95
chief_complaint           → Precision: 0.89 | Recall: 0.89 | F1: 0.89 | Accuracy: 0.80
vital_signs               → Precision: 1.00 | Recall: 0.85 | F1: 0.92 | Accuracy: 0.85
symptoms                  → Precision: 0.76 | Recall: 0.81 | F1: 0.79 | Accuracy: 0.65
medications               → Precision: 0.95 | Recall: 1.00 | F1: 0.97 | Accuracy: 0.95
lab_results               → Precision: 0.85 | Recall: 1.00 | F1: 0.92 | Accuracy: 0.85
assessment_plan           → Precision: 1.00 | Recall: 1.00 | F1: 1.00 | Accuracy: 1.00
summarization_main_findings → Precision: 1.00 | Recall: 0.95 | F1: 0.97 | Accuracy: 0.95
