In [9]:
import os
import json

import openai
from openai import OpenAI

In [10]:
# Name of the environment variable this notebook reads the OpenAI key from.
# Kept in one place so the lookup and the error message cannot drift apart.
API_KEY_ENV_VAR = "OPENAI_API_KEY_MA"

api_key = os.getenv(API_KEY_ENV_VAR)
# An empty value is treated like a missing one so the problem surfaces here
# instead of as an authentication failure on the first API request.
if not api_key:
    raise ValueError(
        f"API key not found. Please set the {API_KEY_ENV_VAR} environment variable."
    )

In [86]:
# Client used for every evaluation request in this notebook; it is constructed
# from the `api_key` value read from the environment.
oai_client = OpenAI(api_key=api_key)
# Model identifier forwarded to each `evaluate_answers` call below.
gpt_model = "gpt-4o-mini-2024-07-18"

In [152]:
def evaluate_answers(question, answer, reference_answers, client, model):
    """Ask an LLM judge whether ``answer`` states one of the reference facts.

    A system prompt and a user prompt are sent through
    ``client.responses.create`` together with a strict JSON-schema response
    format that has a single boolean field, ``is_correct``.

    Args:
        question: Question text, interpolated into the user prompt.
        answer: Generated answer to be judged, interpolated into the user prompt.
        reference_answers: Accepted answers. Either a single string or an
            iterable of values; every item is converted with ``str`` and the
            items are placed one per line in the prompt.
        client: Object exposing ``responses.create(...)``.
        model: Model identifier, forwarded to the API call unchanged.

    Returns:
        dict: The parsed JSON reply (the schema requests
        ``{"is_correct": <bool>}``), or ``{"error": <message>}`` when the API
        raises ``openai.BadRequestError`` or the reply text is not valid JSON.
        Any other exception propagates to the caller.
    """
    # A bare string would otherwise be joined character by character.
    if isinstance(reference_answers, str):
        reference_answers = [reference_answers]
    # str() lets numeric references such as 57 be passed without a TypeError.
    reference_answers_formatted = "\n".join(str(ref) for ref in reference_answers)

    system_prompt = (
        "You are a fact-checking evaluator. You decide whether an answer contains the same specific factual content "
        "as one of a set of reference answers. You focus only on the core factual entity mentioned — such as a number, name, or place."
    )

    # The leading indentation and the two trailing spaces after the Question /
    # Generated Answer lines are part of the text sent to the model; they are
    # deliberately left exactly as they were.
    user_prompt = f"""
        You will be given a question, a generated answer, and a list of reference answers.

        Determine whether the answer correctly contains the **same factual answer** (e.g., a number, name, location, etc.) as one of the references.

        ### Accept the answer if:
        - The correct fact is **clearly present**, even if the answer adds irrelevant or incorrect background.
        - The answer uses different words or phrasing to convey the same meaning.

        ### Reject the answer if:
        - It states a different fact or contradicts the correct one.
        - It introduces confusion or falsehood **that changes the meaning of the core fact**.

        ---

        **Question**: {question}  
        **Generated Answer**: {answer}  
        **Reference Answers**: {reference_answers_formatted}

        Respond only in JSON:
        {{"is_correct": true}} or {{"is_correct": false}}
        """

    # Structured-output specification: one required boolean, nothing else allowed.
    response_format = {
        "format": {
            "type": "json_schema",
            "name": "answer_verification",
            "strict": True,
            "schema": {
                "type": "object",
                "properties": {
                    "is_correct": {
                        "type": "boolean",
                        "description": "True if the core factual answer (e.g., number or name) is included and not contradicted.",
                    },
                },
                "required": ["is_correct"],
                "additionalProperties": False,
            },
        }
    }

    try:
        response = client.responses.create(
            model=model,
            input=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            text=response_format,
        )
    except openai.BadRequestError as e:
        error_message = str(e)  # Extract the error message as a string
        print(f"Error: {error_message}")
        return {"error": error_message}

    try:
        # Attempt to parse the response as JSON
        return json.loads(response.output_text)
    except json.JSONDecodeError as json_error:
        # Report unparseable replies as an error dict instead of raising.
        print(f"JSONDecodeError: {json_error}")
        return {"error": "Invalid JSON response from API"}

In [153]:
# Fixture for sanity-checking the judge: one question, three candidate answers
# that differ only in the body part they name, and the accepted references.
q = "'Quinsy' is a term for an abscess on which part of the body?"

# Shared answer text; {first} and {second} are the two places a body part is named.
_quinsy_answer = (
    "The term 'quinsy' is an old-fashioned term for an abscess on the {first}. "
    "It is derived from the Latin word 'quinsyus,' which means 'swollen' or 'inflamed.' "
    "Quinsy was a common term used in the 18th and 19th centuries to describe "
    "an abscess on the {second}, particularly"
)

# Wrong in both places.
a = _quinsy_answer.format(first="brain", second="brain")
# Matches the references in both places.
a_correct = _quinsy_answer.format(first="tonsils", second="tonsils")
# Contradicts itself: wrong first, then matches the references.
a_ambiguous = _quinsy_answer.format(first="brain", second="tonsils")

# Accepted reference spellings, kept exactly as given.
r_a = ["Tonsil", "Tonsills", "Tonsils", "Tonsels"]

In [154]:
# Judge the wrong, correct and self-contradicting answers, in that order,
# against the same question and reference list.
evaluation, eval_correct, eval_ambiguous = (
    evaluate_answers(q, candidate, r_a, oai_client, gpt_model)
    for candidate in (a, a_correct, a_ambiguous)
)

In [155]:
# Show the raw result dict returned for the first candidate answer (`a`).
evaluation

{'is_correct': False}

In [156]:
# Verdicts for `a`, `a_correct` and `a_ambiguous`, in that order.
tuple(
    verdict["is_correct"]
    for verdict in (evaluation, eval_correct, eval_ambiguous)
)

(False, True, False)

In [167]:
# Second spot check: the answer names the reference number ("57") but wraps it
# in other claims about Bingo.
evaluate_answers(
    question="What number in Bingo is sometimes referred to as Heinz varieties?",
    answer='Answer: In Bingo, "Heinz 57" refers to a specific pattern, also known as "Household Bingo," which consists of 57 numbers.',
    reference_answers=["57", "fifty-seven"],
    client=oai_client,
    model=gpt_model,
)

{'is_correct': True}