Import Statements

In [1]:
import random
from typing import List, Tuple

Helper Functions

In [2]:
def return_days_inside_3_day_window(window_center_day: int, discharge_day: int) -> List[int]:
	"""Given the day number at the center of a 3-day window and the current discharge day,
	returns all possible days within that 3-day window that still fit within the patient stay

	Args:
		window_center_day (int): day number of event at the center of the window
		discharge_day (int): current discharge day before the function is called

	Returns:
		List[int]: list of all day numbers that can be chosen bc they're within the 3 day 
		window and the patient's stay
	"""
	# first day in window is 3 days before event OR admission day, whichever comes last
	earliest_possible_day = max(1, window_center_day - 3)

	# last day in window is 3 days after event OR discharge day, whichever comes first
	latest_possible_day = min(window_center_day + 3, discharge_day)

	# range collects consecutive numbers from first number to last number-1 
	possible_findings_days = list(range(earliest_possible_day, latest_possible_day + 1))
	return possible_findings_days



def get_days_outside_3_day_window(window_center_day: int, discharge_day: int) -> Tuple[List[int], int]:
	"""Given the day number at the center of a 3-day window and the current discharge day,
	returns all possible days outside that 3-day window that still fit within the patient stay
	AND ALSO the discharge day, which might be increased -
	If the patient stay is too short (so all patient stay days are inside the 3-day window), 
	the discharge day will be increased (with some randomization of up to 5 extra days) to 
	ensure that there's at least one day outside the 3-day window

	Args:
		window_center_day (int): day number of event at the center of the window
		discharge_day (int): current discharge day before the function is called

	Returns:
		Tuple[List[int], int]: first return variable is a list of all day numbers that can be 
		chosen bc they're outside the 3 day window, second return variable is the (possibly larger)
		discharge day. After calling this function, we must check to see if externally-stored value of
		discharge day needs to be updated to match.
	"""
	dis_day = discharge_day

	# if event is very early in stay, the window includes admission (day 1)
	if window_center_day < 5:
		# findings must happen after window but before discharge
		last_window_day = window_center_day + 3
		
		# make sure there are days in stay after window
		if last_window_day >= discharge_day:
			min_extra_days_needed = last_window_day - discharge_day + 1
			# lengthen stay by increasing discharge day number
			dis_day = discharge_day + random.randint(min_extra_days_needed, min_extra_days_needed + 5)

		possible_findings_days = list(range(last_window_day + 1, dis_day + 1))

	else:
		# collect day numbers in stay before the 3-day window
		left_of_window_days = list(range(1, window_center_day - 3))

		# collect day numbers in stay after the 3-day window
		right_of_window_days = list(range(window_center_day + 4, (dis_day + 1)))

		# choose a random day from the combined list of possible outside-of-window days in stay
		possible_findings_days = left_of_window_days + right_of_window_days

	return possible_findings_days, dis_day

def multiple_choice_question(answer: str, choice_type: str, choice_list: List[str]) -> str:
	"""Given the answer to a multiple choice question (which may be "none", a phrase to make 
	the generic prompt fit the specific situation, and the list of possible choices (phrases),
	returns an appropriate, randomized prompt. Example: given answer = "none", choice_type is = "infection symptoms",
	choice_list = ["fever", "redness", "swelling"], returned value will be "The medical record should not mention 
	any of the following infection symptoms: fever, redness, swelling." If the answer is not "none",
	the function will choose a random NON-ZERO number of items from the list and build a prompt like this: 
	"The medical record should note the following infection symptoms: redness, swelling"

	Args:
		answer (str): "none" or "yes", the desired answer to an algorithm question
		choice_type (str): short phrase as described above
		choice_list (List[str]): list of all possible multiple choice answers

	Returns:
		str: the complete prompt based on the parameters as described above
	"""
	if answer == "none":
		choices_together = ", ".join(choice_list)
		prompt = f"The medical record should not mention any of the following {choice_type}: {choices_together}."
	else:
		num_to_select = random.randint(1, len(choice_list))
		selected_elements = random.sample(choice_list, num_to_select)
		elements_together = ", ".join(selected_elements)
		prompt = f"The medical record should note the following {choice_type}: {elements_together}."
	return prompt

QA Dictionary Definition

In [3]:
hai_other_prompt_qa_dict = {}

hai_other_prompt_qa_dict[1] = {
    "EQ1": "no"
}

hai_other_prompt_qa_dict[2] = {
    "EQ1": "yes",
    "EQ2": "yes",
    "EQ3": "no"
}

hai_other_prompt_qa_dict[3] = {
    "EQ1": "yes",
    "EQ2": "yes",
    "EQ3": "yes"
}

Build Prompt Functions

In [4]:
def build_EQ1_prompt(answer):
    if answer == "yes":
        EQ1_prompt = "The record should indicate that the patient acquired an infection in the hospital."
    else:
        EQ1_prompt = "The record should not mention any hospital-acquired infections not explicitly described in this prompt." # allows for combining with other algos
    return EQ1_prompt

# EQ2 has no branching; it's a follow-up to EQ1 being yes to collect / add more details - so the dict value is always yes, and answer isn't needed here
def build_EQ2_prompt():
    eq2_list = [
        "gastrointestinal infection caused by something other than C. Difficile",
        "eye, ear, nose, throat or mouth infection",
        "skin or soft tissue infection that was not related to a surgical site",
        "cardiovascular infection",
        "device-related infection (colonoscope, duodenoscope, bronchoscope)"
    ]
    rand_infection = random.choice(eq2_list)
    EQ2_prompt = f"The hospital acquired infection is of the following type: {rand_infection}."
    return EQ2_prompt

def build_EQ3_prompt(answer):
    if answer == "yes":
        mdro_list = [
            "methicillin-resistant Staphylococcus aureus (MRSA)",
            "vancomycin-resistant Enterobacter (VRE)",
            "carbepenem-resistant Enterobacteriacea (CRE)",
        ]
        rand_mdro = random.choice(mdro_list)
        EQ3_prompt = f"A multi drug resistant organism of type {rand_mdro} was associated with the hospital-acquired infection just described."
    else:
       EQ3_prompt = "The record should state that no multi-drug resistant organism was found to be associated with the hospital-acquired infection just described." 
    return EQ3_prompt

Main Program

In [5]:
# use this at the end of the main program to save each full GPT-ready prompt to a json file
# this file name shows what story the resulting PDF is; we'll use that during manual PDF generation
# so we'll store it in the JSON too
def generate_pdf_file_name(story_number):
    # dictionaries don't store their key-value pairs in order, but sorting alphabetically will fix that
    sorted_items = sorted(hai_other_prompt_qa_dict[story_number].items())
    
    # Correct way to join key-value pairs
    formatted_string = "_".join(f"{key}_{value}" for key, value in sorted_items)

    return f"{formatted_string}.pdf"

In [6]:
# for clabsi this will be numbers 1 through 19
story_numbers = list(hai_other_prompt_qa_dict.keys())
story_prompts_dict = {} # this will hold the full GPT-ready prompt for each story.

In [7]:
for num in story_numbers:

    list_of_prompts = [] # you can't change strings, so we'll buid a list of prompts based
    # on what question keys are in the story dictionary, add some basics about age, etc, 
    # and at the very end, join them together into a string and save it in the story_prompts_dict[num].

    # set up basic data about the stay that might be changed by functions
    discharge_day_number = random.randint(4, 8)
    # If R1 / R2 answer = "no" this changes to months old, from 1 to 11 (this avoids umbilical catheter complexity)
    patient_age = f"{random.randint(1, 99)} years old" # R1 and R2 both check if >= 365 days

    # create placeholders for variables that might get set / passed around between functions

    question_keys = list(hai_other_prompt_qa_dict[num].keys())

    ###### in this section, check if each key is in the question_keys list and, if so, call their function
    # and use the returned value to update list_of_prompts, any other variables.
    if "EQ1" in question_keys:
        ans = hai_other_prompt_qa_dict[num]["EQ1"]
        eq1_prompt = build_EQ1_prompt(ans)
        list_of_prompts.append(eq1_prompt)
    if "EQ2" in question_keys:
        eq2_prompt = build_EQ2_prompt()
        list_of_prompts.append(eq2_prompt)
    if "EQ3" in question_keys:
        ans = hai_other_prompt_qa_dict[num]["EQ3"]
        eq3_prompt = build_EQ3_prompt(ans)
        list_of_prompts.append(eq3_prompt)
    #######


    # after all build prompt functions that should be called are, add general prompts about the stay that might
    # not have been stated yet (remove duplicates at the end)
    list_of_prompts.append(f"Patient is {patient_age} old.")
    list_of_prompts.append(f"Patient was discharged on day number {discharge_day_number}.")

    prompt_string = " ".join(list_of_prompts)
    story_prompts_dict[num] = prompt_string


In [8]:
# when the loop is done running (prompts are generated for all stories)
# print to screen to be sure everything looks right
for num in story_numbers:
    print(f"full prompt to generate story number {num}:")
    print(story_prompts_dict[num])
    print()

full prompt to generate story number 1:
The record should not mention any hospital-acquired infections not explicitly described in this prompt. Patient is 18 years old old. Patient was discharged on day number 7.

full prompt to generate story number 2:
The record should indicate that the patient acquired an infection in the hospital. The hospital acquired infection is of the following type: cardiovascular infection. The record should state that no multi-drug resistant organism was found to be associated with the hospital-acquired infection just described. Patient is 65 years old old. Patient was discharged on day number 7.

full prompt to generate story number 3:
The record should indicate that the patient acquired an infection in the hospital. The hospital acquired infection is of the following type: gastrointestinal infection caused by something other than C. Difficile. A multi drug resistant organism of type vancomycin-resistant Enterobacter (VRE) was associated with the hospital-a

In [9]:
import csv
csv_output_file = "hai_other_prompts.csv"
with open(csv_output_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    # Write header
    writer.writerow(["story_definition", "story_prompt"])

    # Write each row
    for num in story_numbers:
        story_definition = generate_pdf_file_name(num)  # Function to get filename
        story_prompt = story_prompts_dict.get(num, "")  # Retrieve prompt, default to empty if missing

        writer.writerow([story_definition, story_prompt])

In [10]:
import json
json_output_file = "hai_other_prompts.json"
data = []
# Build JSON data
for num in story_numbers:
    story_definition = generate_pdf_file_name(num)  # Function to get filename
    story_prompt = story_prompts_dict.get(num, "")  # Retrieve prompt, default to empty if missing

    data.append({
        "story_definition": story_definition,
        "story_prompt": story_prompt
    })
# Save to JSON file
with open(json_output_file, "w", encoding="utf-8") as file:
    json.dump(data, file, indent=4, ensure_ascii=False)