In [1]:
import json
import random
import re
import string

from openai import OpenAI
import pandas as pd

# Seed Python's `random` module up front so any later use of it is repeatable.
random.seed(0)

# key.json should be of the form {"key": "INSERT KEY HERE"}
# The key is handed straight to the client constructor inside the `with`
# block rather than being stored under a notebook-level name of its own.
with open("key.json") as key_file:
    client = OpenAI(api_key=json.load(key_file)["key"])

# Path the generated knowledge-base questions are written to.
QUESTION_KB = "symptom_question_kb.csv"

In [None]:
def extract(instructions, input, model="gpt-4", max_attempts=4):
    """Ask the model for a completion and return the generated text.

    Args:
        instructions: System-level instructions passed to the Responses API.
        input: The user input passed to the Responses API. (The name shadows
            the builtin inside this function only; it is kept so existing
            keyword callers continue to work.)
        model: Model identifier to request. Defaults to "gpt-4".
        max_attempts: How many times to call the API before giving up.

    Returns:
        The response's text (``output_text``) as a ``str``, or ``""`` when
        every attempt raised. Returning plain text keeps the result
        JSON-serializable and hashable, and consistent with the ``""``
        returned on failure.
    """
    for _ in range(max_attempts):
        try:
            response = client.responses.create(
                model=model,
                instructions=instructions,
                input=input,
            )
        except Exception as e:
            # Best-effort: report the failure and try again.
            print(e)
            continue
        return response.output_text

    # Every attempt failed; callers must treat "" as "no generation".
    return ""

In [5]:
# When True, run on a three-symptom smoke-test list instead of the full file.
TEST = True

SYMPTOM_FILE = "mayoclinic_symptom_list.txt"

if TEST:
    data = [['abdominal pain'], ['anal pain'], ['ankle pain']]
else:
    # One symptom per non-blank line, normalised to lower case.
    with open(SYMPTOM_FILE, "r") as f:
        data = [[line.strip().lower()] for line in f if line.strip()]

symptom_df = pd.DataFrame(data, columns=['Finding'])

TASK_DESCRIPTION = "Write a question to ask if the patient has the given symptom\n"
PROMPT_FORMAT = "Symptom: {symptom}"

results = []

for index_label, row in symptom_df.iterrows():

    # Named `symptom_prompt` rather than `input`: rebinding the builtin at
    # notebook scope would break any later `input()` call in this kernel.
    symptom_prompt = PROMPT_FORMAT.format(symptom=row["Finding"]).strip()

    # `prompt` is recorded for traceability only; the instructions and the
    # per-symptom text are sent to the model as two separate arguments.
    prompt = TASK_DESCRIPTION + symptom_prompt
    generation = extract(TASK_DESCRIPTION, symptom_prompt)

    results.append({
        "symptom": row["Finding"],
        "prompt": prompt,
        "kb_question": generation
    })

    print(f"Completed prompt {index_label} ({row['Finding']}) : {generation}")

# NOTE: despite the .csv extension of QUESTION_KB, the file holds a JSON list
# of records. json.dump requires every value in `results` to be
# JSON-serializable.
with open(QUESTION_KB, mode='w') as f:
    json.dump(results, f, indent=2)

Error code: 404 - {'error': {'message': 'Your organization must be verified to use the model `o4-mini-2025-04-16`. Please go to: https://platform.openai.com/settings/organization/general and click on Verify Organization. If you just verified, it can take up to 15 minutes for access to propagate.', 'type': 'invalid_request_error', 'param': None, 'code': 'model_not_found'}}
Error code: 404 - {'error': {'message': 'Your organization must be verified to use the model `o4-mini-2025-04-16`. Please go to: https://platform.openai.com/settings/organization/general and click on Verify Organization. If you just verified, it can take up to 15 minutes for access to propagate.', 'type': 'invalid_request_error', 'param': None, 'code': 'model_not_found'}}
Error code: 404 - {'error': {'message': 'Your organization must be verified to use the model `o4-mini-2025-04-16`. Please go to: https://platform.openai.com/settings/organization/general and click on Verify Organization. If you just verified, it can 

In [None]:
def create_prompt(train_df, prompt_format, row, num_examples=10, random_state=None):
    """Build a few-shot prompt: sampled worked examples followed by the query.

    Args:
        train_df: DataFrame of worked examples with "Finding", "KB Question"
            and "Rephrased" columns.
        prompt_format: Format string with ``{symptom}``, ``{kb_q}`` and
            ``{rephrased_q}`` placeholders, used for every line of the prompt.
        row: Mapping/Series with "Finding" and "KB Question" for the symptom
            to be rephrased.
        num_examples: Maximum number of examples to sample from ``train_df``;
            capped at the number of rows available.
        random_state: Forwarded to ``DataFrame.sample`` for reproducible
            example selection.

    Returns:
        A single string with one formatted example per line and the query as
        the last line, its rephrasing left blank for the model to complete.
        Surrounding whitespace is stripped.
    """
    # Never ask for more examples than exist; sample() would raise otherwise.
    num_to_take = min(num_examples, len(train_df))
    if num_to_take > 0:
        train_prompts = train_df.sample(num_to_take, random_state=random_state)
    else:
        train_prompts = train_df.iloc[0:0]

    all_prompts_formatted = [
        prompt_format.format(symptom=inst["Finding"],
                             kb_q=inst["KB Question"],
                             rephrased_q=inst["Rephrased"])
        for _, inst in train_prompts.iterrows()
    ]

    # The query line ends with an empty rephrasing slot.
    all_prompts_formatted.append(
        prompt_format.format(symptom=row["Finding"],
                             kb_q=row["KB Question"],
                             rephrased_q=""))

    return "\n".join(all_prompts_formatted).strip()

In [None]:
# NOTE: this cell rebinds TASK_DESCRIPTION, PROMPT_FORMAT and TEST, which the
# question-generation cell earlier in the notebook also defines with different
# values. Re-running that earlier cell after this one will pick up nothing
# from here, but running cells out of order can leave the wrong values bound.
TASK_DESCRIPTION = "Rephrase the question asking if the patient has the given symptom"
PROMPT_FORMAT = "Symptom: {symptom} [PRESENT]. Question: {kb_q} => {rephrased_q}"
TEST = True

# Few-shot examples: symptom, knowledge-base question, human rephrasing.
train_df = pd.read_json("example_contexts.json")
train_df.columns = ["Finding", "KB Question", "Rephrased"]

if TEST:
    # NOTE(review): the layout of example_symptoms.csv is not visible from
    # this notebook; it is assumed to be JSON with exactly two columns
    # (symptom, question) - confirm.
    test_df = pd.read_json("example_symptoms.csv")
else:
    # symptom_question_kb.csv is written with json.dump as records keyed
    # "symptom", "prompt" and "kb_question"; keep only the two columns that
    # are renamed below, otherwise the column assignment fails on length.
    test_df = pd.read_json("symptom_question_kb.csv")[["symptom", "kb_question"]]
test_df.columns = ["Finding", "KB Question"]

# paraphrased_questions.json is a JSON list of records (written with
# json.dump), so it is parsed as JSON rather than CSV. A missing file simply
# means nothing has been generated yet.
try:
    with open("paraphrased_questions.json") as f:
        previous_records = json.load(f)
except FileNotFoundError:
    previous_records = []

if previous_records:
    # Saved records use the key "symptom"; expose it under the "Finding"
    # name used by the rest of the notebook.
    already_generated_df = pd.DataFrame(previous_records).rename(
        columns={"symptom": "Finding"})
else:
    already_generated_df = pd.DataFrame(columns=["Finding"])

In [None]:
results = []

# Number of distinct paraphrases wanted per symptom, and a cap on API calls
# per symptom so a model that keeps repeating itself (or an API that keeps
# failing) cannot stall the notebook forever.
num_distinct_to_generate = 5
max_attempts_per_symptom = 25

# Records saved by earlier runs. They are merged back in when writing below,
# so skipping an already-generated symptom does not drop it from the file.
try:
    with open("paraphrased_questions.json") as f:
        previous_results = json.load(f)
except FileNotFoundError:
    previous_results = []

ignored = 0
already_generated_findings = already_generated_df["Finding"].unique()
for index_label, row in test_df.iterrows():
    if row["Finding"] in already_generated_findings:
        print("\tAlready generated for", row["Finding"])
        ignored += 1
        continue

    # Keep generating until we get n distinct questions.
    # The KB Question seeds the set of seen questions so that a generation
    # identical to it counts as a duplicate rather than a new paraphrase.
    distinct_questions = {row["KB Question"]}
    generations = []
    prompts = []
    attempts = 0
    while (len(distinct_questions) < num_distinct_to_generate + 1  # + 1 for the KB Question
           and attempts < max_attempts_per_symptom):
        attempts += 1
        prompt = create_prompt(train_df, PROMPT_FORMAT, row)
        generation = extract(TASK_DESCRIPTION, prompt)
        if generation == "":
            # extract() signals failure with ""; that is not a paraphrase.
            continue
        # prompts[i] is the prompt that produced generations[i]; repeated
        # generations are still recorded, as before.
        prompts.append(prompt)
        generations.append(generation)
        distinct_questions.add(generation)

    if len(distinct_questions) < num_distinct_to_generate + 1:
        print(f"\tStopped after {attempts} attempts for {row['Finding']}: "
              f"only {len(distinct_questions) - 1} distinct generations")

    results.append({
        "symptom": row["Finding"],
        "prompts": prompts,
        "kb_question": row["KB Question"],
        "generation": generations
    })

    print(f"Completed prompt {index_label} ({row['Finding']})")

print("Ignored", ignored)

# Write earlier records plus the new ones; a new record replaces any earlier
# record for the same symptom.
new_symptoms = {record["symptom"] for record in results}
merged_results = [
    record for record in previous_results
    if record.get("symptom") not in new_symptoms
] + results
with open("paraphrased_questions.json", mode='w') as f:
    json.dump(merged_results, f, indent=2)