In [2]:
import os
import json

import openai
from openai import OpenAI

In [3]:
# Read the OpenAI key from the environment so it is never stored in the notebook.
api_key = os.getenv("OPENAI_API_KEY_MA")
# Treat an unset variable and an empty value the same way: neither is a usable
# key, and failing here is clearer than an authentication error on first request.
if not api_key:
    raise ValueError(
        "API key not found. Please set the OPENAI_API_KEY_MA environment variable."
    )

In [4]:
# Pinned model snapshot used as the judge in the evaluation calls below.
gpt_model = "gpt-4o-mini-2024-07-18"
# Client built from the key loaded from the environment above.
oai_client = OpenAI(api_key=api_key)

In [79]:
def evaluate_answers(question, answer, reference_answers, client, model):
    """Ask a model whether an answer candidate agrees with the reference answers.

    Args:
        question: The question that was asked.
        answer: The generated answer candidate to judge (may be verbose).
        reference_answers: The accepted correct answers. Either an iterable of
            answers or a single string; every entry is placed on its own line
            of the prompt.
        client: Object exposing ``responses.create(...)``; in this notebook an
            ``openai.OpenAI`` client.
        model: Model name forwarded to ``responses.create``.

    Returns:
        dict: The JSON object parsed from the response text (the request asks
        for ``{"is_correct": <bool>}`` via a strict JSON schema), or
        ``{"error": <message>}`` when the request is rejected with
        ``openai.BadRequestError``.
    """
    # A bare string is itself iterable: joining it directly would put every
    # character on its own line, so treat it as one reference answer.
    if isinstance(reference_answers, str):
        reference_answers = [reference_answers]
    # str() allows non-string references (e.g. the number 57) to be listed too.
    reference_answers_formatted = "\n".join(str(ref) for ref in reference_answers)

    system_prompt = "You are an assistant that evaluates whether answer candidates contain the correct information based on reference answers."
    user_prompt = (
        "Your task is to evaluate whether a generated answer candidate to a given question is correct or not, given a set of correct reference answers.\n"
        f"The question is: {question}\n"
        f"The generated answer candidate is: {answer}\n"
        f"The correct reference answers are:\n{reference_answers_formatted}\n"
        "Is the answer candidate correct or not? The answer might be overly verbose, try to extract what is meant."
    )
    # Strict schema: the reply must be an object with exactly one boolean field.
    verdict_format = {
        "type": "json_schema",
        "name": "answer_verification",
        "strict": True,
        "schema": {
            "type": "object",
            "properties": {
                "is_correct": {
                    "type": "boolean",
                    "description": "Indicates whether the given answer matches any of the correct answers.",
                },
            },
            "required": [
                "is_correct",
            ],
            "additionalProperties": False,
        },
    }

    try:
        response = client.responses.create(
            model=model,
            input=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            text={"format": verdict_format},
        )
        return json.loads(response.output_text)
    except openai.BadRequestError as e:
        # Report the rejection and hand it back as data so a batch of
        # evaluations is not aborted by one bad request.
        error_message = str(e)
        print(f"Error: {error_message}")
        return {"error": error_message}

In [80]:
# Example question with three candidate answers. Each candidate stops
# mid-sentence and names a body part twice.
q = "'Quinsy' is a term for an abscess on which part of the body?"

# Candidate naming the brain in both places.
a = (
    "The term 'quinsy' is an old-fashioned term for an abscess on the brain. "
    "It is derived from the Latin word 'quinsyus,' which means 'swollen' or 'inflamed.' "
    "Quinsy was a common term used in the 18th and 19th centuries to describe "
    "an abscess on the brain, particularly"
)

# Candidate naming the tonsils in both places.
a_correct = (
    "The term 'quinsy' is an old-fashioned term for an abscess on the tonsils. "
    "It is derived from the Latin word 'quinsyus,' which means 'swollen' or 'inflamed.' "
    "Quinsy was a common term used in the 18th and 19th centuries to describe "
    "an abscess on the tonsils, particularly"
)

# Candidate that first names the brain and later the tonsils.
a_ambiguous = (
    "The term 'quinsy' is an old-fashioned term for an abscess on the brain. "
    "It is derived from the Latin word 'quinsyus,' which means 'swollen' or 'inflamed.' "
    "Quinsy was a common term used in the 18th and 19th centuries to describe "
    "an abscess on the tonsils, particularly"
)

# Reference answers the candidates are judged against.
r_a = [
    "Tonsil",
    "Tonsills",
    "Tonsils",
    "Tonsels",
]

In [81]:
# Judge each of the three candidates against the same question and references.
evaluation = evaluate_answers(
    question=q,
    answer=a,
    reference_answers=r_a,
    client=oai_client,
    model=gpt_model,
)
eval_correct = evaluate_answers(
    question=q,
    answer=a_correct,
    reference_answers=r_a,
    client=oai_client,
    model=gpt_model,
)
eval_ambiguous = evaluate_answers(
    question=q,
    answer=a_ambiguous,
    reference_answers=r_a,
    client=oai_client,
    model=gpt_model,
)

In [82]:
# Show the full result dict for candidate `a` (the one naming the brain).
evaluation

{'is_correct': False}

In [83]:
# Verdicts for `a`, `a_correct` and `a_ambiguous`, in that order.
tuple(
    verdict["is_correct"]
    for verdict in (evaluation, eval_correct, eval_ambiguous)
)

(False, True, False)

In [84]:
# Additional spot check: a verbose candidate judged against numeric references.
evaluate_answers(
    question="What number in Bingo is sometimes referred to as Heinz varieties?",
    answer="Answer: In Bingo, \"Heinz 57\" refers to a specific pattern, also known as \"Household Bingo,\" which consists of 57 numbers.",
    reference_answers=["57", "fifty-seven"],
    client=oai_client,
    model=gpt_model,
)

{'is_correct': True}