In [1]:
%pip install openai


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/anaconda/envs/azureml_py310_sdkv2/bin/python -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import json
import os
from openai import AzureOpenAI

In [3]:
# Credentials come from the environment so the key never lands in the
# notebook source, its saved outputs, or version control.
#   AZURE_OPENAI_API_KEY   - required (a KeyError names it when unset)
#   AZURE_OPENAI_ENDPOINT  - optional, falls back to the original resource URL
# NOTE(review): the key that used to be hardcoded in this cell must be treated
# as leaked - rotate it before sharing or committing this notebook.
client = AzureOpenAI(
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    api_version="2024-02-15-preview",
    azure_endpoint=os.environ.get(
        "AZURE_OPENAI_ENDPOINT", "https://ehr-ai-gpt-a1.openai.azure.com/"
    ),
)

# Name of the Azure deployment passed as `model` on every completion request.
DEPLOYMENT_NAME = "gpt-4o"

In [4]:
# Input file: the user places the structured export at this path.
CSV_PATH = "structured_data.csv"
df = pd.read_csv(filepath_or_buffer=CSV_PATH)

In [5]:
# Normalize the headers step by step: trim, lowercase, turn every run of
# non-word characters into one underscore, then drop edge underscores.
normalized_headers = df.columns.str.strip().str.lower()
normalized_headers = normalized_headers.str.replace(r"[^\w]+", "_", regex=True)
df.columns = normalized_headers.str.strip("_")
print(df.columns.tolist())

['patient_id', 'name', 'age', 'gender', 'date', 'encounter_type', 'chief_complaint', 'hpi', 'temperature', 'pulse_bpm', 'o2_saturation', 'diagnosis', 'plan', 'icd_10_code']


In [6]:
# Logical field name -> normalized CSV column that supplies it.
COLUMN_MAP = dict(
    patient_id="patient_id",
    patient_name="name",
    age="age",
    gender="gender",
    chief_complaint="chief_complaint",
    hpi="hpi",
    temperature="temperature",
    pulse="pulse_bpm",
    o2_saturation="o2_saturation",
)

In [7]:
def build_prompt(row, column_map=None):
    """Build the LLM prompt for one patient row.

    Args:
        row: Mapping-like record (e.g. a pandas Series from ``df.iterrows()``)
            indexed by normalized CSV column name.
        column_map: Optional mapping of logical field name -> column name.
            Defaults to the module-level ``COLUMN_MAP``.

    Returns:
        The prompt text. Missing values (None / NaN / NaT) are rendered as
        "Not recorded" instead of leaking the literal text "nan" or "None"
        into the clinical prompt.

    Raises:
        KeyError: if a required field is absent from ``column_map`` or ``row``.
    """
    if column_map is None:
        column_map = COLUMN_MAP

    def _field(key):
        value = row[column_map[key]]
        # is_scalar guards pd.isna, which returns an array for list-likes.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return "Not recorded"
        return value

    return f"""
You are a clinical documentation assistant.

Generate a professional clinical note using the following EHR data:

Patient Age: {_field('age')}
Gender: {_field('gender')}

Chief Complaint:
{_field('chief_complaint')}

History of Present Illness (HPI):
{_field('hpi')}

Vitals:
Temperature: {_field('temperature')} °F
Pulse: {_field('pulse')} bpm
Oxygen Saturation: {_field('o2_saturation')} %

The note should be concise and clinically formatted.
"""

In [8]:
def validate_schema(df, column_map):
    """Check that every column named in ``column_map`` exists in ``df``.

    Args:
        df: DataFrame whose ``columns`` are inspected.
        column_map: Mapping whose values are the required column names.

    Raises:
        ValueError: listing the required columns that are absent, in the
            order they appear in ``column_map``.
    """
    missing = []
    for required in column_map.values():
        if required not in df.columns:
            missing.append(required)
    if not missing:
        return
    raise ValueError(f"CSV missing required columns: {missing}")

# Stop here if the loaded CSV lacks any column listed in COLUMN_MAP.
validate_schema(df=df, column_map=COLUMN_MAP)
print("CSV schema validated successfully")

CSV schema validated successfully


In [9]:
def generate_clinical_note(prompt, temperature=0.2, max_tokens=300):
    """Request a clinical note for ``prompt`` from the chat deployment.

    Args:
        prompt: User message text sent to the model.
        temperature: Sampling temperature forwarded to the API (default 0.2).
        max_tokens: Completion token cap forwarded to the API (default 300).

    Returns:
        The first choice's message text, stripped of surrounding whitespace.
        Returns "" when the service supplies no text (``message.content`` is
        None), so one empty response does not abort a whole batch with an
        AttributeError; callers can detect the empty string.
    """
    response = client.chat.completions.create(
        model=DEPLOYMENT_NAME,
        messages=[
            {"role": "system", "content": "You generate clinical documentation."},
            {"role": "user", "content": prompt}
        ],
        temperature=temperature,
        max_tokens=max_tokens
    )
    content = response.choices[0].message.content
    # content is optional in the response schema; guard before calling .strip().
    if content is None:
        return ""
    return content.strip()

def clean_text(text):
    """Replace each literal backslash-u00b0 escape sequence in ``text`` with "°"."""
    escaped_degree = "\\u00b0"
    pieces = text.split(escaped_degree)
    return "°".join(pieces)

In [10]:
# Keyword -> (ICD-10 code, description) table consulted by assign_icd10().
# Keys are lowercase phrases that assign_icd10() looks for as substrings of
# lowercased text; it returns the entry for the first key that matches while
# iterating this dict, so entry order acts as match priority.
ICD10_LOOKUP = {
    # Respiratory
    "pneumonia": ("J18.9", "Pneumonia, unspecified organism"),
    "bronchitis": ("J20.9", "Acute bronchitis, unspecified"),
    "asthma": ("J45.909", "Unspecified asthma, uncomplicated"),
    "copd": ("J44.9", "Chronic obstructive pulmonary disease, unspecified"),

    # Cardiovascular
    "hypertension": ("I10", "Essential (primary) hypertension"),
    "heart failure": ("I50.9", "Heart failure, unspecified"),
    "chest pain": ("R07.9", "Chest pain, unspecified"),

    # Endocrine
    "diabetes": ("E11.9", "Type 2 diabetes mellitus without complications"),
    "hypothyroidism": ("E03.9", "Hypothyroidism, unspecified"),

    # Neurological
    "stroke": ("I63.9", "Cerebral infarction, unspecified"),
    "seizure": ("R56.9", "Unspecified convulsions"),
    "migraine": ("G43.909", "Migraine, unspecified, not intractable"),

    # Oncology
    "benign brain tumor": ("D33.9", "Benign neoplasm of brain, unspecified"),
    "malignant brain tumor": ("C71.9", "Malignant neoplasm of brain, unspecified"),
    "breast cancer": ("C50.919", "Malignant neoplasm of breast, unspecified"),

    # Gastrointestinal
    "gastritis": ("K29.70", "Gastritis, unspecified, without bleeding"),
    "acid reflux": ("K21.9", "Gastro-esophageal reflux disease without esophagitis"),
    "abdominal pain": ("R10.9", "Unspecified abdominal pain"),

    # Musculoskeletal
    "arthritis": ("M19.90", "Osteoarthritis, unspecified site"),
    "back pain": ("M54.9", "Dorsalgia, unspecified"),
    "knee pain": ("M25.569", "Pain in unspecified knee"),

    # Infectious
    "urinary tract infection": ("N39.0", "Urinary tract infection, site not specified"),
    "viral fever": ("B34.9", "Viral infection, unspecified"),

    # General / Preventive
    "general examination": ("Z00.00", "General medical examination"),
    "follow-up": ("Z09", "Follow-up examination after treatment")
}

In [11]:
def assign_icd10(note_text, diagnosis_text="", lookup=None):
    """Pick an ICD-10 code by keyword match, preferring the explicit diagnosis.

    The diagnosis text is searched first and the note text only if the
    diagnosis yields no match. Previously both were concatenated, so a keyword
    mentioned incidentally in the note (e.g. past history) could win over the
    recorded diagnosis purely because of its position in the lookup table.

    Args:
        note_text: Generated clinical note.
        diagnosis_text: Diagnosis from the source record; may be empty, None
            or NaN when the cell is missing.
        lookup: Optional mapping of lowercase keyword -> (code, description).
            Defaults to the module-level ``ICD10_LOOKUP``.

    Returns:
        A ``(code, description)`` tuple; ``("Z00.00", "General medical
        examination")`` when no keyword matches either text.
    """
    if lookup is None:
        lookup = ICD10_LOOKUP

    def _searchable(value):
        # Missing cells arrive as None or float NaN (NaN != NaN); treat both
        # as empty instead of searching the words "none" / "nan".
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value).lower()

    for text in (_searchable(diagnosis_text), _searchable(note_text)):
        for key, (code, desc) in lookup.items():
            if key in text:
                return code, desc
    return "Z00.00", "General medical examination"

In [12]:
# One output record per patient row: generate the note, tidy it, then attach
# an ICD-10 code.
generated_results = []

for _, row in df.iterrows():
    prompt = build_prompt(row)
    note = generate_clinical_note(prompt)
    note = clean_text(note)
    # "diagnosis" is optional in the CSV, hence .get with an empty default.
    icd10_code, icd10_desc = assign_icd10(note, row.get("diagnosis", ""))

    # Read identity fields through COLUMN_MAP, the same mapping that
    # validate_schema() checks, so a remapped column cannot silently break
    # this cell.
    generated_results.append({
        "patient_id": row[COLUMN_MAP["patient_id"]],
        "patient_name": row[COLUMN_MAP["patient_name"]],
        "age": row[COLUMN_MAP["age"]],
        "gender": row[COLUMN_MAP["gender"]],
        "clinical_note": note,
        "icd10_code": icd10_code,
        "icd10_description": icd10_desc
    })

In [13]:
# Tabular view of the generated records (rendered as this cell's output).
pd.DataFrame(data=generated_results)

Unnamed: 0,patient_id,patient_name,age,gender,clinical_note,icd10_code,icd10_description
0,P-1001,Robert Brown,47,Male,**Clinical Note**\n\n**Patient Information:** ...,G43.909,"Migraine, unspecified, not intractable"
1,P-1002,Sarah Wilson,29,Male,**Clinical Note**\n\n**Patient Information:** ...,G43.909,"Migraine, unspecified, not intractable"
2,P-1003,Olivia Clark,26,Male,**Clinical Note**\n\n**Patient Name:** [Redact...,J45.909,"Unspecified asthma, uncomplicated"
3,P-1004,Michael Johnson,28,Male,**Clinical Note**\n\n**Patient Name:** [Redact...,J45.909,"Unspecified asthma, uncomplicated"
4,P-1005,Michael Johnson,23,Female,**Clinical Note**\n\n**Patient Information:** ...,Z00.00,General medical examination
...,...,...,...,...,...,...,...
95,P-1096,Emily Davis,22,Female,**Clinical Note**\n\n**Patient Information:** ...,G43.909,"Migraine, unspecified, not intractable"
96,P-1097,John Doe,29,Female,**Clinical Note**\n\n**Patient Information:** ...,J20.9,"Acute bronchitis, unspecified"
97,P-1098,Michael Johnson,57,Male,**Clinical Note**\n\n**Patient Information:** ...,K29.70,"Gastritis, unspecified, without bleeding"
98,P-1099,Michael Johnson,39,Female,**Clinical Note**\n\n**Patient Information:** ...,G43.909,"Migraine, unspecified, not intractable"


In [14]:
# Write UTF-8 explicitly (the reader in json_to_txt_per_patient opens the file
# as UTF-8) and keep non-ASCII characters such as "°" literal rather than as
# \uXXXX escapes, so the saved file is readable as-is.
with open("all_patients_clinical_notes.json", "w", encoding="utf-8") as f:
    json.dump(generated_results, f, indent=2, ensure_ascii=False)

print("Saved: all_patients_clinical_notes.json")

Saved: all_patients_clinical_notes.json


In [15]:
# Path of the JSON export that is passed to json_to_txt_per_patient().
json_file = "all_patients_clinical_notes.json"

In [16]:
import json
import os

def json_to_txt_per_patient(json_file):
    """Write each record's clinical note to its own text file.

    Args:
        json_file: Path to a UTF-8 JSON file holding a list of records, each
            with "patient_id", "patient_name" and "clinical_note" keys.

    Returns:
        List of the file names written (relative to the current working
        directory), in record order. Names have the form
        ``<patient_id>_<patient_name>.txt``.
    """
    with open(json_file, "r", encoding="utf-8") as f:
        records = json.load(f)

    txt_files = []

    for record in records:
        # Trim BEFORE replacing spaces: the original stripped afterwards, when
        # no spaces were left, so padded names produced stray underscores.
        name_part = record["patient_name"].strip().replace(" ", "_")
        stem = f"{record['patient_id']}_{name_part}"
        # Path separators coming from the data must not redirect the write
        # into another directory.
        filename = stem.replace("/", "_").replace("\\", "_") + ".txt"
        with open(filename, "w", encoding="utf-8") as txt:
            txt.write(record["clinical_note"])
        txt_files.append(filename)

    return txt_files

In [17]:
# Split the JSON export into one text file per patient; keep the file names.
txt_files = json_to_txt_per_patient(json_file=json_file)

In [19]:
from html import escape
from urllib.parse import quote

from IPython.display import FileLink, HTML, display

# Build the complete download list and render it. The original assembled this
# markup but never closed the <ul> or displayed it. File names are
# URL-quoted in the href and HTML-escaped in the link text because they are
# derived from patient data.
list_items = "".join(
    f'<li><a href="{quote(file)}" download>{escape(file)}</a></li>'
    for file in txt_files
)
html = f"<h3>Download Clinical Notes</h3><ul>{list_items}</ul>"
display(HTML(html))

# Also emit one FileLink per note, as the original cell did.
for file in txt_files:
    display(FileLink(file))