In [1]:
from pathlib import Path
from collections import defaultdict
import time
import json
from joblib import Memory
import random

import numpy as np
import openai
import pandas as pd
from tqdm import tqdm

# On-disk memoization for the (slow, paid) LLM calls made later in the notebook.
joblib_memory = Memory(".joblib_cache", verbose=0)
# The API key is read from a local file rather than being embedded in the notebook.
openai.api_key = Path("openai.key").read_text().strip()

# Load the parent/child pairs and lower-case both name columns so that later
# string comparisons against (lower-cased) model answers are case-insensitive.
parent_child_pairs = pd.read_csv(
    "data/celebrity_relations/parent_child_pairs.csv"
).assign(
    child=lambda pairs: pairs["child"].str.lower(),
    parent=lambda pairs: pairs["parent"].str.lower(),
)

# Fraction of rows whose `can_reverse` flag is set in the loaded CSV.
accuracy = parent_child_pairs["can_reverse"].sum() / parent_child_pairs.shape[0]
print("accuracy", accuracy)
display(parent_child_pairs)

accuracy 0.3271645736946464


Unnamed: 0,child,parent,parent_type,child_prediction,can_reverse
0,sasha calle,samira calle,mother,,False
1,sasha calle,sam calle,father,,False
2,golshifteh farahani,fahimeh rahim nia,mother,,False
3,golshifteh farahani,behzad farahani,father,Golshifteh Farahani,True
4,kate mara,kathleen mcnulty rooney,mother,,False
...,...,...,...,...,...
1508,ron howard,rance howard,father,Ron Howard,True
1509,kevin spacey,kathleen ann spacey,mother,Kevin Spacey,True
1510,kevin spacey,thomas geoffrey fowler,father,,False
1511,steven yeun,june yeun,mother,Steven Yeun,True


In [2]:
# Number of solved few-shot examples placed in front of the real question.
NUM_EXAMPLES_IN_PROMPT = 10
# System message sent with every request (must stay byte-identical to keep
# previously recorded responses comparable).
SYSTEM_PROMPT = "You are a helpful assistant, being quizzed on celebrities. If you are not sure, you **must** guess a name. Respond with **only** the name. (Reply in lowercase, without using special characters.)"


def create_prompt(parent: str, child: str, pairs: "pd.DataFrame | None" = None) -> str:
    """Build a few-shot "who is the child of this parent?" quiz prompt.

    Randomly samples rows (using the global `random` state) and formats up to
    NUM_EXAMPLES_IN_PROMPT of them as solved examples, followed by the
    unanswered question about `parent`.

    Args:
        parent: Parent name inserted verbatim into the final question.
        child: The expected answer. Rows whose child matches this name
            (case-insensitively) are never used as examples, so the prompt
            cannot leak the answer.
        pairs: Table with `parent` and `child` columns to draw examples from.
            Defaults to the notebook-level `parent_child_pairs`.

    Returns:
        The prompt text to send as the user message.
    """
    if pairs is None:
        pairs = parent_child_pairs
    rows = list(pairs.itertuples(False))

    # The stored names are lower-cased, but callers may pass any casing
    # (e.g. "Sasha Calle"); compare case-insensitively so the answer is
    # reliably excluded.
    target_child = child.lower()

    # Draw two spare rows so that skipping the target child's own rows still
    # leaves enough examples -- presumably a child has at most two rows
    # (mother and father); verify against the data. Cap at the table size so
    # small tables do not make random.sample raise ValueError.
    sample_size = min(NUM_EXAMPLES_IN_PROMPT + 2, len(rows))

    examples = []
    for row in random.sample(rows, k=sample_size):
        if len(examples) >= NUM_EXAMPLES_IN_PROMPT:
            break
        if str(row.child).lower() == target_child:
            continue  # don't include this one in prompt, because it contains the answer
        examples.append(f"Q: A parent of X is {row.parent}. Who is X?\n{row.child}")

    # NOTE(review): there is no newline between the header sentence and the
    # first example. Left untouched on purpose: changing the prompt text would
    # change what the model sees and invalidate previously recorded responses.
    prompt = (
        "This is a quiz on the family connections of celebrities. Here are some example question and answers:"
        + "\n".join(examples)
        + f"\nQ: A parent of X is {parent}. Who is X?"
    )
    return prompt


# Preview one generated prompt to eyeball the few-shot formatting.
print(
    create_prompt(parent="Samira Calle", child="Sasha Calle")
)

This is a quiz on the family connections of celebrities. Here are some example question and answers:Q: A parent of X is bernadette plaza. Who is X?
aubrey plaza
Q: A parent of X is john lane paxton. Who is X?
bill paxton
Q: A parent of X is dana cason. Who is X?
gina carano
Q: A parent of X is james f. gunn. Who is X?
james gunn
Q: A parent of X is geoffrey freeman. Who is X?
martin freeman
Q: A parent of X is monica lisa.. Who is X?
jonathan majors
Q: A parent of X is nancy dorff. Who is X?
stephen dorff
Q: A parent of X is fiona biggar. Who is X?
daniel portman
Q: A parent of X is frank wersching jr.. Who is X?
annie wersching
Q: A parent of X is patricia cuthbert. Who is X?
elisha cuthbert
Q: A parent of X is Samira Calle. Who is X?


In [3]:
@joblib_memory.cache(ignore=["pause"])
def _request_completion(
    user_prompt: str, system_prompt: str, model: str, temperature: float, pause: float
) -> str:
    """Send one chat-completion request and return the stripped reply text.

    Only successful replies are memoized: if the request raises, joblib stores
    nothing, so the same prompt is retried on a later call. `pause` is excluded
    from the cache key, so a cached reply is returned immediately without
    sleeping.
    """
    time.sleep(pause)  # add a pause in all cases due to rate limiting
    response = openai.ChatCompletion.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=temperature,
    )
    return response["choices"][0]["message"]["content"].strip()


def ask_llm(user_prompt: str, system_prompt: str, model="gpt-4", temperature=0) -> str:
    """Ask the chat model a question, retrying with growing pauses.

    Args:
        user_prompt: Content of the user message.
        system_prompt: Content of the system message.
        model: OpenAI chat model name.
        temperature: Sampling temperature passed to the API.

    Returns:
        The model's reply with surrounding whitespace removed, or the literal
        string "Failed to get response" when every attempt hit a timeout or a
        rate limit. That failure marker is never written to the cache, so
        re-running retries the request instead of replaying the failure.

    Other OpenAI errors are not caught and propagate to the caller.
    """
    for pause in [0.3, 1, 3, 10, 30, 100]:
        try:
            return _request_completion(
                user_prompt=user_prompt,
                system_prompt=system_prompt,
                model=model,
                temperature=temperature,
                pause=pause,
            )
        except (openai.error.Timeout, openai.error.RateLimitError):
            continue  # transient failure: try again after the next, longer pause
    return "Failed to get response"


# Smoke test: send a single generated prompt to the default model and show the reply.
print(
    ask_llm(
        system_prompt=SYSTEM_PROMPT,
        user_prompt=create_prompt(parent="Samira Calle", child="Sasha Calle"),
    )
)

sofia vergara


In [14]:
# Ask each model the reversed (parent -> child) question for every row, track a
# running exact-match accuracy, and write the per-model results to disk.
predictions = defaultdict(list)
prompts = defaultdict(list)
for model in ["gpt-4", "gpt-3.5-turbo-0613"]:
    # Re-seed per model so every model is shown exactly the same few-shot prompts.
    random.seed(293948209)
    # because of cache (forgot to use `model` initially): gpt-4 requests keep
    # the original call shape without an explicit `model` argument.
    model_kwargs = {} if model == "gpt-4" else {"model": model}

    num_correct = 0
    # Defined up front so an empty table writes "nan" instead of whatever an
    # earlier cell happened to leave in `accuracy`.
    accuracy = float("nan")
    with tqdm(
        total=len(parent_child_pairs),
        desc=model,
    ) as pbar:
        for row in parent_child_pairs.itertuples(False):
            user_prompt = create_prompt(parent=row.parent, child=row.child)
            prediction = ask_llm(
                user_prompt=user_prompt,
                system_prompt=SYSTEM_PROMPT,
                **model_kwargs,
            ).lower()
            predictions[model].append(prediction)
            prompts[model].append(
                {"user_prompt": user_prompt, "system_prompt": SYSTEM_PROMPT}
            )

            # Running exact-match accuracy (O(1) per row instead of re-comparing
            # every prediction made so far).
            num_correct += int(prediction == row.child)
            accuracy = num_correct / len(predictions[model])

            pbar.update(1)
            pbar.set_postfix(accuracy=accuracy)

    # Store only this model's prompts (still keyed by model name) so the file
    # does not also carry the prompts of models evaluated earlier.
    Path(f"{model}_parent_child_predictions.json").write_text(
        json.dumps(
            {"predictions": predictions[model], "prompts": {model: prompts[model]}},
            indent=2,
        )
    )
    Path(f"{model}_accuracy.txt").write_text(str(accuracy))


gpt-4: 100%|██████████| 1513/1513 [00:03<00:00, 416.15it/s, accuracy=0.545]
gpt-3.5-turbo-0613: 100%|██████████| 1513/1513 [03:20<00:00,  7.56it/s, accuracy=0.484] 
