In [1]:
import json
import logging
import os

import openai
from openai import OpenAI
from tqdm import tqdm

In [2]:
# Route every log record to experiments.log, overwriting the file on each run.
# - force=True: basicConfig() silently does nothing when the root logger
#   already has handlers (re-running this cell, or a hosted kernel that
#   pre-installs handlers); forcing replaces them so the file is really used.
# - An explicit FileHandler pins the encoding to UTF-8, so non-ASCII characters
#   in prompts and model responses are written the same way on every platform.
logging.basicConfig(
    handlers=[logging.FileHandler('experiments.log', mode='w', encoding='utf-8')],
    level=logging.INFO,
    format='%(message)s',
    force=True,
)

# Set up OpenAI

In [3]:
# Security note: do not keep API keys in a shared or synced folder such as
# Google Drive. If the project is on GitHub, add credentials.json to .gitignore
# so the file is never committed.

credentials_path = 'credentials.json'

# The credentials file is JSON and must define both keys read below.
with open(credentials_path) as file:
    credentials = json.loads(file.read())

# API key and endpoint used to construct the OpenAI client.
openai_api_key = credentials['openai_api_key']
base_url = credentials['base_url']

# Prompts

In [4]:
def generate_letter(filepath: str, system_prompt: str, user_prompt: str,
                    model: str = "gpt-4o"):
    """Generate a discharge summary letter for one patient record.

    Loads the record through ``clean_data``, appends it as context to
    ``user_prompt``, sends the system and user messages to the chat
    completions endpoint, and logs the whole exchange.

    Args:
        filepath: Record file name, passed unchanged to ``clean_data``.
        system_prompt: Content of the system message.
        user_prompt: Base instructions. The "Now, write ..." sentence and the
            record are appended to this text before it is sent.
        model: Model name used for the request. Defaults to ``"gpt-4o"``.

    Returns:
        A tuple ``(system_prompt, user_prompt, response)``. ``user_prompt`` is
        the full prompt that was sent, i.e. the base instructions *with* the
        context appended, so it must not be passed back in as the base prompt
        of a later call. ``response`` is the message content of the first
        returned choice.
    """
    data = clean_data(filepath)
    # The record is interpolated through its Python repr (str(dict)), not as
    # JSON text.
    user_prompt = f"""\
{user_prompt}
Now, write a discharge summary letter only using the patient's EHR data provided in the following context:
{data}"""

    client = OpenAI(api_key=openai_api_key, base_url=base_url)
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
    )
    response = completion.choices[0].message.content

    # Lazy %-style arguments produce exactly the same log text as the previous
    # f-strings, but let the logging module do the formatting.
    logging.info('<data>\n%s\n</data>', filepath)
    logging.info('<system>\n%s\n</system>', system_prompt)
    logging.info('<user>\n%s\n</user>', user_prompt)
    logging.info('<assistant>:\n%s\n</assistant>', response)

    return system_prompt, user_prompt, response


def clean_data(filepath: str) -> dict:
    """Load a patient record from ``data/`` and strip direct identifiers.

    Args:
        filepath: File name relative to the ``data/`` directory.

    Returns:
        The parsed JSON object without its top-level ``patient_id`` key and
        without the ``name`` key under ``patient_demographics``. Either key
        (or the whole demographics section) may be absent from the record.
    """
    # JSON text is UTF-8; do not rely on the platform's default encoding.
    with open(f'data/{filepath}', 'r', encoding='utf-8') as file:
        data = json.load(file)

    data.pop('patient_id', None)

    # A record without a demographics section used to raise KeyError here,
    # even though both removals are otherwise written to tolerate absent keys.
    demographics = data.get('patient_demographics')
    if isinstance(demographics, dict):
        demographics.pop('name', None)

    return data

In [5]:
# Baseline configuration: a generic system message plus a minimal user prompt.
no_persona = "You are an assistant for writing discharge summary letters."

# Three newline-separated lines with no trailing newline.
# NOTE(review): "Don't include all private information" presumably means
# "any private information"; the wording is kept unchanged here because it is
# part of the prompt that is sent to the model.
baseline_prompt = (
    "The letter is written to a patient when leaving hospital after a stay for the review. "
    "It should be prose and not contain bullet points.\n"
    "Minimize hallucinations.\n"
    "Don't include all private information, such as ID and date of birth. "
    "Use ___ to represent names of patients and medical professionals."
)

In [6]:
# Instruction configuration: a clinician persona plus detailed writing rules.
persona = (
    "You are a resident physician tasked with writing a discharge summary letter "
    "of a patient from the patient's Electronic Health Records."
)

# Seven newline-separated instruction lines with no trailing newline; each
# tuple element below is one line of the prompt.
instruction_prompt = "\n".join((
    # Audience and overall form.
    "The letter is written to a patient when leaving hospital after a stay for the review. "
    "It should be prose and not contain bullet points.",
    # Required content.
    "Include summary of stay including, if applicable, key issues, interventions, procedures, and treatments.",
    "Include summary of condition at discharge.",
    "Include recommendations for follow-up plans and ongoing treatment.",
    # Writing style.
    "Write the text in the style of a discharge letter using medical terminology. "
    "Proceed chronologically, write factually, objectively and concretely, formally, professionally, politely, respectfully. "
    "Avoid generalizations, assumptions, verbose and cumbersome formulations, and redundancy. "
    "Do not invent or presume anything. "
    "Generally, formulate in the imperfect tense, reported speech from individuals (e.g., from patients) in subjunctive I, "
    "and medical assessments and diagnoses in the present tense.",
    # Grounding in the record.
    "Minimize hallucinations. "
    "Do not generate recommendations on your own or add anything. "
    "Write nothing about topics for which there is no information basis from the EHR.",
    # Privacy.
    "Don't include all private information, such as ID and date of birth. "
    "Use ___ to represent names of patients and medical professionals.",
))

In [7]:
# One-shot example that is appended to instruction_prompt. The text starts
# with two newlines (so it is separated from the preceding instructions by a
# blank line) and ends with a single trailing newline.
example = (
    "\n\n"
    "This is an example of discharge summary letter:\n"
    "Dear ___,\n\n"
    # Body paragraphs, each separated by one blank line.
    + "\n\n".join((
        "You came into hospital because you were getting more short of breath and your ankles were swollen.",
        "A heart scan, called an echocardiogram, showed that your heart was not pumping as well as it should.",
        "We started you on water tablets called Furosemide. They will stop fluid collecting in your legs and lungs.",
        "Please take 2 Furosemide tablets (2 x 40 mg = 80 mg) each morning and weigh yourself each morning.",
        "If your weight drops by more than 2 kg over the week, reduce the dose to 1 tablet per day.",
        "We have asked your GP practice to see you in about 2 weeks' time. They will take a blood test to check your kidney function.",
        "We will see you in 6 weeks' time in the Cardiology Outpatient Clinic.",
        "You will receive an appointment letter in the post.",
    ))
    # The sign-off placeholder has four underscores, as in the original text.
    + "\n\nSincerely,\n____\n"
)

# Experiments

In [8]:
# Experiment configurations: name -> (system prompt, base user prompt).
configs = {
    'baseline': (no_persona, baseline_prompt),
    'instruction': (persona, instruction_prompt),
    'instruction_example': (persona, instruction_prompt + example),
}

# Create the output directory once instead of on every iteration.
os.makedirs('experiments', exist_ok=True)

# Records data_1.json .. data_4.json, every configuration, five letters each.
for i in range(1, 5):
    filepath = f'data_{i}.json'
    for prompts, (system_prompt, base_user_prompt) in configs.items():
        for version in tqdm(range(5)):
            # Always pass the untouched base prompt. generate_letter returns
            # the user prompt with the EHR context already appended; the loop
            # used to feed that value back in, so version k carried the
            # instruction suffix and the record k times.
            _, full_user_prompt, response = generate_letter(filepath,
                                                            system_prompt,
                                                            base_user_prompt)
            # Write output separately for each version
            output = f'experiments/data_{i}_{prompts}_{version+1}.json'
            with open(output, 'w') as file:
                json.dump({
                    'system': system_prompt,
                    'user': full_user_prompt,
                    'assistant': response
                }, file, indent=2)

100%|██████████| 5/5 [00:45<00:00,  9.15s/it]
100%|██████████| 5/5 [01:09<00:00, 13.83s/it]
100%|██████████| 5/5 [00:47<00:00,  9.49s/it]
100%|██████████| 5/5 [00:50<00:00, 10.01s/it]
100%|██████████| 5/5 [01:16<00:00, 15.26s/it]
100%|██████████| 5/5 [00:57<00:00, 11.54s/it]
100%|██████████| 5/5 [00:38<00:00,  7.70s/it]
100%|██████████| 5/5 [00:48<00:00,  9.75s/it]
100%|██████████| 5/5 [00:46<00:00,  9.40s/it]
100%|██████████| 5/5 [00:46<00:00,  9.27s/it]
100%|██████████| 5/5 [00:45<00:00,  9.13s/it]
100%|██████████| 5/5 [00:35<00:00,  7.14s/it]
