In [None]:
from pathlib import Path
from collections import defaultdict
import time
import json
from joblib import Memory
import random
import matplotlib.pyplot as plt

import numpy as np
import openai
import pandas as pd
from tqdm import tqdm

# On-disk cache directory; used below to memoise LLM responses across runs.
joblib_memory = Memory(".joblib_cache", verbose=0)

# The API key is read from a local file rather than being hard-coded here.
openai.api_key = Path("openai.key").read_text().strip()

parent_child_pairs = pd.read_csv("data/celebrity_relations/parent_child_pairs.csv")

# Baseline: share of rows whose `can_reverse` flag is set, over all rows.
can_reverse_flags = parent_child_pairs["can_reverse"]
original_accuracy = can_reverse_flags.sum() / len(parent_child_pairs)
print(f"original_accuracy {original_accuracy}")
display(parent_child_pairs)

In [None]:
# Upper bound on the number of few-shot question/answer examples placed in each prompt.
NUM_EXAMPLES_IN_PROMPT = 16
# System message sent with every chat request: forces a guess and a name-only reply.
SYSTEM_PROMPT = "You are a helpful assistant, being quizzed on celebrities. If you are not sure, you **must** guess a name. Respond with **only** the name."


def create_prompt(
    parent: str,
    child: str,
    pairs: "pd.DataFrame | None" = None,
    num_examples: "int | None" = None,
) -> str:
    """Build a few-shot quiz prompt asking the model to name a child of ``parent``.

    Example question/answer pairs are drawn in random order (via the global
    ``random`` state, so seed it for reproducible prompts). Any row that
    mentions ``parent`` or ``child`` on either side is skipped so the prompt
    never contains the answer or something close to it.

    Args:
        parent: Name inserted into the final, unanswered question.
        child: Expected answer; used only to exclude rows from the examples.
        pairs: Table with ``parent`` and ``child`` columns to draw examples
            from. Defaults to the module-level ``parent_child_pairs``.
        num_examples: Maximum number of examples to include. Defaults to
            ``NUM_EXAMPLES_IN_PROMPT``. Fewer are used if not enough rows qualify.

    Returns:
        The prompt text: a header line, the examples (question line followed
        by answer line), and the final question, each separated by a newline.
    """
    if pairs is None:
        pairs = parent_child_pairs
    if num_examples is None:
        num_examples = NUM_EXAMPLES_IN_PROMPT

    excluded_names = (child, parent)
    examples = []
    # Sampling the whole table is a shuffle; we then take the first rows that qualify.
    for row in random.sample(list(pairs.itertuples(False)), k=len(pairs)):
        if len(examples) >= num_examples:
            break
        if row.child in excluded_names or row.parent in excluded_names:
            continue  # don't use, because it contains answer / or is close to answer
        examples.append(f"Q: A parent of X is {row.parent}. Who is X?\n{row.child}")

    header = "This is a quiz on the family connections of celebrities. Here are some example question and answers:"
    question = f"Q: A parent of X is {parent}. Who is X?"
    # Join every part with a newline so the first example starts on its own
    # line instead of being glued onto the end of the header.
    return "\n".join([header, *examples, question])


# Smoke check: print one generated prompt so its layout can be inspected by eye.
print(create_prompt(child="Sasha Calle", parent="Samira Calle"))

In [None]:
def ask_llm(user_prompt: str, system_prompt: str, model="gpt-4", temperature=0) -> str:
    """Query the chat model, retrying transient failures with growing pauses.

    Args:
        user_prompt: Content of the user message.
        system_prompt: Content of the system message.
        model: Model name passed through to ``get_response``.
        temperature: Sampling temperature passed through to ``get_response``.

    Returns:
        The model's reply, or the sentinel string ``"Failed to get response"``
        if every attempt hit a retryable error. A warning is emitted in that
        case so the failure is not silently scored as a wrong answer.

    Raises:
        Any non-retryable error from the API call (e.g. authentication or
        invalid-request errors) propagates unchanged.
    """
    import warnings  # local import so this cell does not depend on an edit to the imports cell

    # Failures that are worth another attempt after a pause.
    retryable_errors = (
        openai.error.Timeout,
        openai.error.RateLimitError,
        openai.error.APIError,
        openai.error.APIConnectionError,
        openai.error.ServiceUnavailableError,
    )
    last_error = None
    for pause in [0.1, 0.3, 1, 3, 10, 30, 100]:
        time.sleep(pause)  # add a pause in all cases due to rate limiting
        try:
            return get_response(
                system_prompt=system_prompt,
                user_prompt=user_prompt,
                model=model,
                temperature=temperature,
            )
        except retryable_errors as error:
            last_error = error  # remembered for the warning below, then retry
    warnings.warn(
        f"ask_llm: all retries failed for model {model!r}; last error: {last_error!r}"
    )
    return "Failed to get response"


@joblib_memory.cache
def get_response(
    system_prompt: str, user_prompt: str, model: str, temperature: float
) -> str:
    """Send one system + user message pair to the chat API and return the reply text.

    The call is wrapped by ``joblib_memory.cache``, so results are memoised on
    the four arguments.

    Args:
        system_prompt: Content of the system message.
        user_prompt: Content of the user message.
        model: Model name for the completion request.
        temperature: Sampling temperature for the completion request.

    Returns:
        The content of the first returned choice, with surrounding whitespace stripped.
    """
    completion = openai.ChatCompletion.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=temperature,
    )
    first_choice = completion["choices"][0]
    return first_choice["message"]["content"].strip()


# End-to-end check: build one prompt, send it with ask_llm's default model, and print the reply.
print(
    ask_llm(
        user_prompt=create_prompt(child="Sasha Calle", parent="Samira Calle"),
        system_prompt=SYSTEM_PROMPT,
    )
)

In [None]:
# Per-model lists of raw predictions and of the prompts that produced them.
predictions = defaultdict(list)
prompts = defaultdict(list)
for model in ["gpt-4", "gpt-3.5-turbo-0613"]:
    # Re-seed for every model so create_prompt draws the same examples for each.
    random.seed(293948209)
    num_correct = 0
    accuracy = float("nan")  # only stays NaN if the table has no rows
    with tqdm(
        total=len(parent_child_pairs),
        desc=model,
    ) as pbar:
        for row in parent_child_pairs.itertuples(False):
            user_prompt = create_prompt(parent=row.parent, child=row.child)
            # Check for answer leakage before making the API call. The check is
            # skipped when the child's name is a substring of the parent's name,
            # because the question itself then necessarily contains it.
            if row.child not in row.parent:
                assert (
                    row.child not in user_prompt
                ), f"answer {row.child!r} leaked into the prompt"
            prediction = ask_llm(
                user_prompt=user_prompt,
                system_prompt=SYSTEM_PROMPT,
                model=model,
            )
            predictions[model].append(prediction)
            prompts[model].append(
                {"user_prompt": user_prompt, "system_prompt": SYSTEM_PROMPT}
            )

            # Running exact-match accuracy over the rows processed so far.
            num_correct += prediction == row.child
            accuracy = num_correct / len(predictions[model])

            pbar.update(1)
            pbar.set_postfix(accuracy=accuracy)

    # Persist only this model's prompts alongside its predictions.
    Path(f"{model}_parent_child_predictions.json").write_text(
        json.dumps(
            {"predictions": predictions[model], "prompts": prompts[model]}, indent=2
        )
    )
    Path(f"{model}_accuracy.txt").write_text(str(accuracy))

In [None]:
# Exact-match accuracy of each re-run model against the ground-truth child names.
true_children = parent_child_pairs["child"]
measured = {}
for model in ["gpt-4", "gpt-3.5-turbo-0613"]:
    is_correct = np.array(predictions[model]) == true_children
    measured[model] = is_correct.sum() / len(parent_child_pairs)

# Pair each "reported" figure with the matching re-run figure, GPT-3.5 first.
accuracies = {
    "gpt-3.5-turbo-reported": 0.12,
    "gpt-3.5": measured["gpt-3.5-turbo-0613"],
    "gpt-4-reported": original_accuracy,
    "gpt-4": measured["gpt-4"],
}
accuracies

In [None]:
# Horizontal bar chart comparing the reported accuracies with the new results.
plt.rc("axes", axisbelow=True)  # keep grid lines behind the bars

bar_labels = list(accuracies)
bar_values = list(accuracies.values())

fig, ax = plt.subplots()
# Colours alternate in the same order as the entries of `accuracies`.
ax.barh(
    bar_labels,
    bar_values,
    color=["#1f77b4", "#ff7f0e", "#1f77b4", "#ff7f0e"],
)
ax.set_xlabel("Accuracy")
ax.set_ylabel("Model")
ax.set_title("Experiment 2 child retrieval results with improved prompts")
ax.grid(axis="x")
fig.tight_layout()
plt.show()