In [1]:
import pandas as pd

# Baseline for this notebook: the stored o1 responses.
# The first CSV column is used as the DataFrame index.
o1_responses = pd.read_csv(
    '../supplemental_data/gpt_o1_response/gpt_o1_response.csv',
    index_col=0,
)
o1_responses

Unnamed: 0,problem,service_answer,metadata
0,I am running an experiment on a clinical case ...,"I'm sorry, but I cannot generate a differentia...",PMID_34722527_individual_103_7_Hui_Wang_Compre...
1,I am running an experiment on a clinical case ...,1. VACTERL association \n2. Feingold syndrome ...,PMID_32730804_Individual_3_en-prompt.txt
2,I am running an experiment on a clinical case ...,1. Autosomal recessive hyper-IgE syndrome (DOC...,PMID_19776401_Patient_6_1_en-prompt.txt
3,I am running an experiment on a clinical case ...,1. Sclerosteosis \n2. Van Buchem disease \n3. ...,PMID_20358596_Patient_A_en-prompt.txt
4,I am running an experiment on a clinical case ...,1. Smith-Lemli-Opitz syndrome \n2. ATR-X syndr...,PMID_36586412_8_en-prompt.txt
...,...,...,...
5262,I am running an experiment on a clinical case ...,1. Wolfram syndrome \n2. Alström syndrome \n3....,PMID_9817917_Family_4_individual_13070_en-prom...
5263,I am running an experiment on a clinical case ...,1. GM1 gangliosidosis \n2. Galactosialidosis \...,PMID_1907800_TS_en-prompt.txt
5264,I am running an experiment on a clinical case ...,1. Mitochondrial neurogastrointestinal encepha...,PMID_28673863_28673863_P1_en-prompt.txt
5265,I am running an experiment on a clinical case ...,1. Down syndrome\n2. Kabuki syndrome \n3. 22q1...,PMID_31021519_individual_SATB2_112_en-prompt.txt


In [None]:
# run GPT-4o and put the responses in ../supplemental_data/gpt_4o_response/gpt_4o_response.csv
# should look like this:
# problem	service_answer	metadata
# "The patient was []"  response	PMID_32730804_Individual_3_en-prompt.txt

import os
import pandas as pd
import time  # for adding a delay between retries
from tqdm import tqdm
from openai import OpenAI

# Load the API key from ~/openai.key and export it through the environment
with open(os.path.expanduser('~/openai.key'), 'r') as f:
    os.environ['OPENAI_API_KEY'] = f.read().strip()

# Client used for every chat-completion request in this cell
client = OpenAI(api_key=os.environ['OPENAI_API_KEY'])

# Retry policy: total number of attempts and the pause (in seconds) between them
max_retries, retry_delay = 3, 5

# Model to query
model = "gpt-4o"

# Directory containing the prompt files, and the CSV the responses are written to
prompt_dir = "../supplemental_data/prompts"
output_file = '../supplemental_data/gpt_4o_response/gpt_4o_response.csv'

# Make sure the directory that will hold the output CSV exists
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Function to call OpenAI's GPT model and get the response with retries
def get_gpt_response_with_retries(prompt, model="gpt-4", retries=max_retries, delay=None):
    """Send `prompt` to an OpenAI chat model and return the reply text, retrying on errors.

    Args:
        prompt: Text sent as the user message.
        model: Name of the chat model to query.
        retries: Total number of attempts (not extra attempts after the first);
            must be at least 1.
        delay: Seconds to sleep between attempts. ``None`` (the default) uses the
            module-level ``retry_delay``.

    Returns:
        The message content of the first choice on success. If every attempt
        raises, the string ``"Error after {retries} retries: {exception}"`` is
        returned instead of raising, so a batch loop over many prompts keeps going.

    Raises:
        ValueError: If ``retries`` is less than 1. Without this check the loop
            would never run and the function would silently return ``None``.
    """
    if retries < 1:
        raise ValueError(f"retries must be at least 1, got {retries}")

    for attempt in range(retries):
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[{"role": "system", "content": "You are a helpful assistant."},
                          {"role": "user", "content": prompt}]
            )
            # The response is a Pydantic model; the reply text lives on the first choice
            return response.choices[0].message.content
        except Exception as e:
            # Deliberately broad: any failure is logged and retried (best effort)
            print(f"Error on attempt {attempt + 1}/{retries}: {e}")
            if attempt < retries - 1:
                # Look the global up lazily so it is only needed when a retry happens
                time.sleep(retry_delay if delay is None else delay)
            else:
                return f"Error after {retries} retries: {e}"

# Column layout of the response CSV
response_columns = ["problem", "service_answer", "metadata"]

# A full run takes hours, so partial results are checkpointed next to the output
# file and an interrupted run can pick up where it stopped.
# NOTE: delete the .partial file before re-running with a different model or prompt set.
checkpoint_file = output_file + ".partial"
checkpoint_every = 50  # write a checkpoint each time this many more rows are collected


def _write_responses(rows, path):
    """Write the response rows to `path` as CSV, replacing the file atomically."""
    tmp_path = path + ".tmp"
    pd.DataFrame(rows, columns=response_columns).to_csv(tmp_path, index=False)
    # os.replace swaps the file in one step, so a crash mid-write cannot leave a
    # truncated checkpoint behind
    os.replace(tmp_path, path)


# Rows collected so far; seeded from the checkpoint of an interrupted run if present.
# dtype=str / keep_default_na=False keep every cell as the literal text that was saved.
if os.path.exists(checkpoint_file):
    new_responses = pd.read_csv(
        checkpoint_file, dtype=str, keep_default_na=False
    ).to_dict("records")
else:
    new_responses = []
answered = {row["metadata"] for row in new_responses}

# Only .txt files are prompts; sort them so the row order is reproducible
prompt_files = sorted(f for f in os.listdir(prompt_dir) if f.endswith('.txt'))
pending_files = [f for f in prompt_files if f not in answered]

try:
    # tqdm progress bar over the prompts that still need a response
    for prompt_file in tqdm(pending_files):
        prompt_file_path = os.path.join(prompt_dir, prompt_file)

        # Read the prompt; explicit UTF-8 so decoding does not depend on the platform default
        with open(prompt_file_path, 'r', encoding='utf-8') as f:
            prompt = f.read()

        # Query the model (retries are handled inside the helper)
        response = get_gpt_response_with_retries(prompt, model=model)

        # The prompt's file name (without the path) identifies the case
        new_responses.append({
            "problem": prompt,
            "service_answer": response,
            "metadata": prompt_file
        })

        if len(new_responses) % checkpoint_every == 0:
            _write_responses(new_responses, checkpoint_file)
except BaseException:
    # Also covers KeyboardInterrupt: keep what was collected, then re-raise
    _write_responses(new_responses, checkpoint_file)
    raise

# Build the result frame once, with the columns fixed even if no prompt was found
new_responses_df = pd.DataFrame(new_responses, columns=response_columns)

# NOTE(review): this rebinds `o1_responses`, which the first cell loads from the o1 CSV,
# to the GPT-4o results. The name is kept so later cells keep working; confirm which
# data they expect.
o1_responses = new_responses_df

# Save to the output CSV file
o1_responses.to_csv(output_file, index=False)

# The run completed, so the checkpoint is no longer needed
if os.path.exists(checkpoint_file):
    os.remove(checkpoint_file)

print(f"Responses saved to {output_file}")


  3%|▎         | 154/5267 [03:13<2:18:36,  1.63s/it]