In [4]:
# import openai
# import os
# 
# # Read API key
# api_key_file = os.path.expanduser("~/openai.key")
# with open(api_key_file, "r") as file:
#     openai.api_key = file.read().strip()
# 
# model = "o1-preview-2024-09-12"
# messages =[{'role': "user", 'content': "Write a haiku about recursion in programming."}]
# response = openai.chat.completions.create(model=model, messages=messages)
# 
# generated_text = response.choices[0].message.content
# print(generated_text)

Sure, here's a haiku about recursion in programming:

Function calls itself  
Diving deeper each time in  
Until the base case


In [2]:
import openai
import os
import requests
import re
import csv

def fetch_and_extract_prompts(repo_url, output_dir, dry_run=False):
    """
    Fetches prompt files from the GitHub repository and extracts the [text] section.

    Args:
        repo_url (str): GitHub raw directory URL where the prompts are located.
        output_dir (str): Directory to save the extracted prompts.
        dry_run (bool): If True, skips writing files to the output directory.

    Returns:
        list: List of extracted prompts with metadata.
    """
    # Fetch the HTML for the repository directory
    response = requests.get(repo_url)
    response.raise_for_status()

    # Parse file list and construct absolute GitHub URLs
    file_urls = re.findall(r'href="(/[^"]+\.txt)"', response.text)
    file_urls = [f"https://github.com{url}" for url in file_urls if "cases" in url]

    prompts = []

    for file_url in file_urls:
        # Convert to raw GitHub URL
        full_url = file_url.replace("github.com/", "raw.githubusercontent.com/").replace("/blob/", "/")
        file_name = os.path.basename(file_url)

        try:
            # Fetch the file content
            file_response = requests.get(full_url)
            file_response.raise_for_status()
            content = file_response.text

            # Extract the [text] section
            match = re.search(r"\[text\](.*?)$", content, re.DOTALL)
            if match:
                text_section = match.group(1).strip()
                prompts.append((file_name, text_section))

                if not dry_run:
                    # Save the prompt locally
                    with open(os.path.join(output_dir, file_name), "w") as file:
                        file.write(text_section)
        except requests.HTTPError as e:
            print(f"Failed to fetch {full_url}: {e}")

    return prompts


def prepend_instructions_and_generate(prompts_with_metadata, instruction, output_dir, model, dry_run=False):
    """
    Prepends instructions to prompts, optionally calls OpenAI API for completion,
    and emits a result file with service answers and metadata.

    Args:
        prompts_with_metadata (list): List of tuples (file_name, prompt_text).
        instruction (str): Instruction to prepend to each prompt.
        output_dir (str): Directory to save prompt files and results.
        model (str): OpenAI model name to use.
        dry_run (bool): If True, skips calling the API and writing result files.
    """
    if not dry_run:
        # Read API key
        api_key_file = os.path.expanduser("~/openai.key")
        with open(api_key_file, "r") as file:
            openai.api_key = file.read().strip()

        # Ensure output directory exists
        os.makedirs(output_dir, exist_ok=True)

    result_rows = []  # To store rows for the result file

    for file_name, prompt in prompts_with_metadata:
        # Prepend instructions
        full_prompt = f"{instruction}\n\n{prompt}"

        if dry_run:
            # Print simulated workflow for dry run
            print(f"Simulated API call for file: {file_name}")
            print(f"Prompt:\n{full_prompt}\n")
            result_rows.append(["Simulated response", file_name])
        else:
            # Call OpenAI API
            messages = [{'role': "user", 'content': full_prompt}]
            response = openai.chat.completions.create(model=model, messages=messages)

            # Extract generated text
            generated_text = response.choices[0].message.content
            result_rows.append([generated_text, file_name])

    if not dry_run:
        # Emit result file
        result_file_path = os.path.join(output_dir, "result.csv")
        with open(result_file_path, "w", newline="") as csv_file:
            csv_writer = csv.writer(csv_file)
            csv_writer.writerow(["service_answer", "metadata"])  # Header row
            csv_writer.writerows(result_rows)

        print(f"Processing completed. Results saved to {result_file_path}")
    else:
        print("Dry run completed. No files written or API calls made.")


repo_url = "https://github.com/monarch-initiative/phenopacket2prompt/blob/main/docs/cases"
output_directory = "../supplemental_data/gpt_o1_preview_VIGNETTE/"

# make directory if it doesn't exist
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

instruction = (
    "I am running an experiment on a clinical case report to see how your diagnoses compare with those of human experts.\n"
    "I am going to give you part of a medical case. In this case, you are “Dr. GPT-4”, an AI language model who is providing\n"
    "a diagnosis. Here are some guidelines. First, there is a single definitive diagnosis, and it is a diagnosis that is known\n"
    "today to exist in humans. The diagnosis is almost always confirmed by some sort of genetic test, though in rare cases\n"
    "when such a test does not exist for a diagnosis the diagnosis can instead be made using validated clinical criteria or\n"
    "very rarely just confirmed by expert opinion. After you read the case, I want you to give a differential diagnosis with\n"
    "a list of candidate diagnoses ranked by probability starting with the most likely candidate. Each candidate should be\n"
    "specified with disease name. For instance, if the first candidate is Branchiooculofacial syndrome and the second is\n"
    "Cystic fibrosis, provide this:\n\n"
    "1. Branchiooculofacial syndrome\n"
    "2. Cystic fibrosis\n\n"
    "This list should provide as many diagnoses as you think are reasonable. You do not need to explain your reasoning,\n"
    "just list the diagnoses. Here is the case:"
)

model_name = "o1-preview-2024-09-12"
dry_run = False

# Fetch prompts and metadata
prompts_with_metadata = fetch_and_extract_prompts(repo_url, output_directory, dry_run)

# Generate completions
prepend_instructions_and_generate(prompts_with_metadata, instruction, output_directory, model_name, dry_run)

FileNotFoundError: [Errno 2] No such file or directory: '../supplemental_data/gpt_o1_preview_VIGNETTE/PMID_15673476.txt'