In [10]:
import os
from dotenv import load_dotenv

# Load variables from a .env file into the process environment (python-dotenv).
load_dotenv()

# Look the key up in the environment mapping; the result is None when it is unset.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [None]:
from generation_evaluation_suit.llm_as_judge_graded import llm_as_judge_graded

# Demo inputs for the graded LLM-as-judge: one question, its gold answer,
# and the generated answer to be scored against it.
question = "What drove revenue change as of the FY22 for AMD?"
gold_answer = (
    "In 2022, AMD reported Higher sales of their EPYC server processors, "
    "higher semi-custom product sales, and the inclusion of Xilinx embedded product sales"
)
generated_answer = (
    "Revenue change for AMD in FY22 was driven by a 64% increase in Data Center segment revenue, "
    "a 21% increase in Gaming segment revenue, "
    "and significant growth in Embedded segment revenue from Xilinx product sales."
)

# Judge configuration: model name plus retry count and retry delay (milliseconds, per the argument name).
result = llm_as_judge_graded(
    question=question, gold_answer=gold_answer, generated_answer=generated_answer,
    model="gpt-4o-mini", max_retries=3, retry_delay_ms=1000,
)

# Dump the whole result dict returned by the judge.
print(result)

{'score': 3, 'key_facts_gold': ['AMD', 'higher sales', 'EPYC server processors', 'higher semi-custom product sales', 'inclusion of Xilinx embedded product sales'], 'facts_present': ['AMD', 'revenue change', '64% increase in Data Center segment revenue', '21% increase in Gaming segment revenue', 'significant growth in Embedded segment revenue from Xilinx product sales'], 'facts_missing': ['higher sales of EPYC server processors', 'higher semi-custom product sales'], 'justification': 'The generated answer provides specific percentage increases for the Data Center and Gaming segments, as well as mentioning growth from Xilinx products, which aligns with the gold answer. However, it omits the mention of higher sales of EPYC server processors and higher semi-custom product sales, which are key components of the revenue change.', 'success': True, 'raw_response': {'score': 3, 'key_facts_gold': ['AMD', 'higher sales', 'EPYC server processors', 'higher semi-custom product sales', 'inclusion of X

In [None]:
from generation_evaluation_suit.llm_as_judge_binary import llm_as_judge_binary

# Demo inputs for the binary (match / no-match) LLM-as-judge on a numeric answer.
question = "What is the FY2018 capital expenditure amount (in USD millions) for 3M?"
gold, gen = "$1577.00", "1580 million dollars"

# tolerance=0.01 is treated as a 1% tolerance in the judge's recorded justification.
result = llm_as_judge_binary(question=question, gold_answer=gold, generated_answer=gen, tolerance=0.01)

# Dump the whole result dict returned by the judge.
print(result)

{'match': True, 'gold_num': 1577.0, 'gen_num': 1580.0, 'relative_error': 0.19023462270133165, 'absolute_error': 3.0, 'error_category': 'within_tolerance', 'justification': 'The generated answer (1580 million) differs from the gold answer (1577 million) by 3 units, resulting in a relative error of 0.191%. Since 0.191% is less than the 1% tolerance, this is within tolerance and is a match.', 'success': True, 'raw_response': {'match': True, 'gold_number': 1577.0, 'generated_number': 1580.0, 'relative_error': 0.191, 'absolute_error': 3.0, 'error_category': 'within_tolerance', 'justification': 'The generated answer (1580 million) differs from the gold answer (1577 million) by 3 units, resulting in a relative error of 0.191%. Since 0.191% is less than the 1% tolerance, this is within tolerance and is a match.'}, 'metadata': {'provider': 'openai', 'model': 'gpt-4o-mini', 'temperature': 0.0, 'tolerance': 0.01, 'question': 'What is the FY2018 capital expenditure amount (in USD millions) for 3M?

In [11]:
from generation_evaluation_suit.numerical_exact_match import numerical_exact_match

# Compare a plain decimal gold value with a percentage answer.
# In the recorded output the gold 0.8 is normalized to 80.0 and compared with 79.0.
result = numerical_exact_match(gold_answer="$0.8", generated_answer="79%", tolerance=0.05)  # 5% tolerance

# Report the failure category first; otherwise show the relative error of the match.
if not result['match']:
    print(f"✗ No match. Category: {result['error_category']}")
else:
    print(f"✓ Match! Error: {result['relative_error']:.2f}%")


# Full result dict for inspection.
print(result)

✓ Match! Error: 1.25%
{'match': True, 'gold_num': 0.8, 'gen_num': 79.0, 'gold_scale': None, 'gen_scale': None, 'gold_is_percentage': False, 'gen_is_percentage': True, 'relative_error': 1.25, 'absolute_error': 1.0, 'error_category': 'within_tolerance', 'normalized_gold': 80.0, 'normalized_gen': 79.0, 'common_scale': None}


In [12]:
from generation_evaluation_suit.detect_refusal import detect_refusal

# Demo: run the refusal detector on an explicit "I don't know" style answer.
result = detect_refusal("I don't know the answer to that question.")

# Print the full result dict; the recorded output shows the keys is_refusal,
# confidence, refusal_type, matched_pattern and answer_length.
print(result)

{'is_refusal': True, 'confidence': 1.0, 'refusal_type': 'explicit', 'matched_pattern': "\\bi\\s+(?:do\\s+not|don't|cannot|can't|could\\s+not|couldn't)\\s+(?:know|have|provide|answer|calculate|determine|find)", 'answer_length': 41}


In [13]:
from generation_evaluation_suit.token_f1 import token_f1
# Demo: token-overlap precision / recall / F1 between a gold answer and a
# generated answer. Stopwords are kept (remove_stopwords=False).
# NOTE(review): normalize=True appears to lowercase and split on punctuation
# (the recorded tokens include 'consumer', '0', '9') - confirm against token_f1.
result = token_f1(
    gold_answer="The consumer segment shrunk by 0.9% organically.",
    generated_answer="The Consumer segment has dragged down growth.",
    normalize=True,
    remove_stopwords=False
)

print(f"F1: {result['f1']:.3f}")
print(f"Precision: {result['precision']:.3f}")
print(f"Recall: {result['recall']:.3f}")
# common_tokens / missing_tokens are sets (see the recorded output). Set order
# for strings varies between interpreter processes, so print them sorted to
# keep the cell output identical across kernel restarts.
print(f"Common tokens: {sorted(result['common_tokens'])}")
print(f"Missing tokens: {sorted(result['missing_tokens'])}")

F1: 0.400
Precision: 0.429
Recall: 0.375
Common tokens: {'segment', 'the', 'consumer'}
Missing tokens: {'0', 'organically', 'shrunk', 'by', '9'}


In [20]:
from generation_evaluation_suit.evaluate_answer import evaluate_answer, print_evaluation_summary

# Demo: run the top-level evaluation entry point on a numeric
# ("metrics-generated") question with a gold and a generated answer.
# NOTE(review): the recorded output of this cell is an Exception reporting
# "name 'numerical_exact_match' is not defined". This notebook imports that
# name in an earlier cell, so the NameError presumably originates inside the
# evaluate_answer module (missing import there) - confirm and fix in the library.
result = evaluate_answer(
    question="What is the FY2018 capital expenditure?",
    question_type="metrics-generated",
    gold_answer="$1577.00",
    generated_answer="1577 million dollars"
)

# Use the suite's own summary printer rather than dumping the raw result
print_evaluation_summary(result)

Exception: Evaluation failed for metrics-generated: numerical_exact_match failed: name 'numerical_exact_match' is not defined