In [1]:
import os
from dotenv import load_dotenv

load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [None]:
"""
Graded LLM-as-Judge for Financial QA Evaluation
================================================

This module provides LLM-based evaluation for domain-relevant and novel-generated
questions using a 0-4 graded scoring system.

Uses OpenAI's structured output with Pydantic for reliable parsing.

Author: Financial QA Evaluation System
Version: 1.0
"""

import time
from typing import Dict, Any, List, Optional
from pydantic import BaseModel, Field


class GradedJudgment(BaseModel):
    """
    Pydantic schema for graded LLM judgment output.
    Ensures structured and parseable response from LLM.
    """
    score: int = Field(
        description="Score from 0-4 based on factual correctness and completeness",
        ge=0,
        le=4
    )
    key_facts_gold: List[str] = Field(
        description="List of key facts extracted from the gold answer"
    )
    facts_present: List[str] = Field(
        description="List of facts from gold answer that are present in generated answer"
    )
    facts_missing: List[str] = Field(
        description="List of facts from gold answer that are missing in generated answer"
    )
    justification: str = Field(
        description="Brief explanation (2-3 sentences) of why this score was assigned"
    )


def llm_as_judge_graded(
    question: str,
    gold_answer: str,
    generated_answer: str,
    provider: str = "openai",
    model: str = "gpt-4o-mini",
    temperature: float = 0.0,
    max_retries: int = 3,
    retry_delay_ms: int = 500,
    return_details: bool = True
) -> Dict[str, Any]:
    """
    Evaluate answer quality using LLM with 0-4 graded scoring.
    
    This evaluator is used for:
    - Domain-relevant questions (all lengths)
    - Novel-generated questions (all)
    
    The LLM judges semantic equivalence, factual accuracy, and completeness
    rather than exact word matching.
    
    Args:
        question: The question being answered
        gold_answer: The gold standard answer (ground truth)
        generated_answer: The generated answer to evaluate
        provider: LLM provider ('openai', 'anthropic', 'ollama')
        model: Model name (e.g., 'gpt-4o-mini', 'claude-sonnet-4', 'llama3.1:8b')
        temperature: Temperature for generation (0.0 for deterministic)
        max_retries: Maximum number of retry attempts on failure
        retry_delay_ms: Delay between retries in milliseconds
        return_details: If True, include full LLM response and metadata
    
    Returns:
        Dictionary containing:
            - score: int (0-4)
            - key_facts_gold: List[str] - Key facts from gold answer
            - facts_present: List[str] - Facts present in generated answer
            - facts_missing: List[str] - Facts missing from generated answer
            - justification: str - Explanation of the score
            - raw_response: dict - Full LLM response (if return_details=True)
            - metadata: dict - Call information (if return_details=True)
    
    Scoring Rubric:
        4 (Perfect): All key facts present, accurate, comprehensive
        3 (Good): Most key facts present, minor omissions
        2 (Acceptable): Some key facts present, significant omissions
        1 (Poor): Few key facts, mostly incorrect/irrelevant
        0 (Wrong): Completely incorrect or refusal to answer
    
    Examples:
        >>> result = llm_as_judge_graded(
        ...     question="What is 3M's inventory turnover in FY2022?",
        ...     gold_answer="AES has converted inventory 9.5 times in FY 2022.",
        ...     generated_answer="AES Corporation sold its inventory roughly 12 times in FY2022."
        ... )
        >>> print(f"Score: {result['score']}/4")
        >>> print(f"Facts missing: {result['facts_missing']}")
    """
    
    # Import get_llm function (assumes it's available in the environment)
    # from your_module import get_llm
    # For now, we'll create the LLM directly
    # You can replace this with: llm = get_llm(provider, model, temperature)
    
    from langchain_openai import ChatOpenAI
    
    # Create LLM with structured output
    llm = ChatOpenAI(
        model=model,
        temperature=temperature,
        model_kwargs={"response_format": {"type": "json_object"}} if provider == "openai" else {}
    )
    
    # Apply structured output schema
    structured_llm = llm.with_structured_output(GradedJudgment)
    
    # Construct evaluation prompt with few-shot examples
    prompt = _create_graded_prompt(question, gold_answer, generated_answer)
    
    # Call LLM with retry logic
    try:
        judgment = _call_llm_with_retry(
            structured_llm,
            prompt,
            max_retries=max_retries,
            retry_delay_ms=retry_delay_ms
        )
    except Exception as e:
        # If all retries fail, return error result
        return {
            'score': 0,
            'key_facts_gold': [],
            'facts_present': [],
            'facts_missing': [],
            'justification': f"LLM evaluation failed after {max_retries} retries: {str(e)}",
            'error': str(e),
            'success': False
        }
    
    # Build result dictionary
    result = {
        'score': judgment.score,
        'key_facts_gold': judgment.key_facts_gold,
        'facts_present': judgment.facts_present,
        'facts_missing': judgment.facts_missing,
        'justification': judgment.justification,
        'success': True
    }
    
    if return_details:
        result['raw_response'] = judgment.dict()
        result['metadata'] = {
            'provider': provider,
            'model': model,
            'temperature': temperature,
            'question': question,
            'gold_answer': gold_answer,
            'generated_answer': generated_answer
        }
    
    return result


def _create_graded_prompt(question: str, gold_answer: str, generated_answer: str) -> str:
    """
    Create the evaluation prompt with few-shot examples.
    
    Args:
        question: The question being answered
        gold_answer: Gold standard answer
        generated_answer: Generated answer to evaluate
    
    Returns:
        Formatted prompt string
    """
    
    prompt = f"""You are an expert evaluator for a financial question-answering system. Your task is to evaluate how well a generated answer matches the gold standard answer.

**Scoring Rubric:**
- **4 (Perfect)**: All key facts present, accurate, comprehensive. Generated answer fully captures the gold answer's information.
- **3 (Good)**: Most key facts present with minor omissions. The core information is correct but some details are missing.
- **2 (Acceptable)**: Some key facts present but significant omissions. Partial correctness with important information missing.
- **1 (Poor)**: Few key facts correct, mostly incorrect or irrelevant information.
- **0 (Wrong)**: Completely incorrect, contradicts gold answer, or is a refusal to answer.

**Important Guidelines:**
- Focus on FACTUAL CORRECTNESS, not exact wording
- Different phrasings of the same fact should be recognized as correct
- Numbers must match (with reasonable rounding)
- If generated answer includes information not in gold answer, don't penalize unless it contradicts
- A refusal to answer (e.g., "I don't know", "Data not available") should score 0

---

**Few-Shot Examples:**

**Example 1 - Novel-Generated Question:**
Question: "Which segment dragged down 3M's overall growth in 2022 excluding M&A?"
Gold Answer: "The consumer segment shrunk by 0.9% organically."
Generated Answer: "The Consumer segment has dragged down 3M's overall growth in 2022."

Evaluation:
- Key facts in gold: [consumer segment, shrunk/declined, 0.9%, organically]
- Facts present: [consumer segment, dragged down growth]
- Facts missing: [0.9%, organically]
- Score: 2 (Some key facts present - identifies the segment correctly but misses the specific percentage and "organically" qualifier)
- Justification: "The generated answer correctly identifies the consumer segment as the problem area but omits the specific 0.9% decline and the 'organically' qualifier, which are important quantitative details."

**Example 2 - Domain-Relevant Question:**
Question: "Does AMD have a reasonably healthy liquidity profile based on its quick ratio for FY22?"
Gold Answer: "Yes. The quick ratio is 1.57, calculated as (cash and cash equivalents+Short term investments+Accounts receivable, net+receivables from related parties)/ (current liabilities)."
Generated Answer: "Yes, AMD has a reasonably healthy liquidity profile based on its quick ratio of approximately 1.57 for FY22."

Evaluation:
- Key facts in gold: [Yes, quick ratio, 1.57, healthy liquidity, calculation formula]
- Facts present: [Yes, quick ratio, 1.57, healthy liquidity]
- Facts missing: [calculation formula]
- Score: 4 (All essential facts present - the calculation formula is supplementary detail, and the core answer is complete)
- Justification: "The generated answer captures all essential information: affirmative answer, the specific quick ratio value (1.57), and the assessment of healthy liquidity. The missing calculation formula is supplementary detail that doesn't affect the core answer quality."

**Example 3 - Domain-Relevant Question:**
Question: "Roughly how many times has AES Corporation sold its inventory in FY2022?"
Gold Answer: "AES has converted inventory 9.5 times in FY 2022."
Generated Answer: "AES Corporation sold its inventory roughly 12 times in FY2022; however, conventional inventory management may not be meaningful due to the nature of its business in the energy sector."

Evaluation:
- Key facts in gold: [AES, inventory turnover, 9.5 times, FY2022]
- Facts present: [AES, inventory turnover, FY2022]
- Facts missing: [9.5 times - generated says 12 times which is wrong]
- Score: 1 (The number is significantly wrong: 12 vs 9.5, which is a ~26% error. The qualification about energy sector doesn't compensate for the incorrect figure.)
- Justification: "While the generated answer correctly identifies the context and adds useful qualification about the energy sector, it provides an incorrect inventory turnover number (12 vs 9.5 times), which is a significant factual error for a quantitative question."

---

**Now evaluate the following:**

**Question:** {question}

**Gold Answer:** {gold_answer}

**Generated Answer:** {generated_answer}

Provide your evaluation in the structured format with:
1. score (0-4)
2. key_facts_gold (list of key facts from gold answer)
3. facts_present (list of facts present in generated answer)
4. facts_missing (list of facts missing from generated answer)
5. justification (2-3 sentences explaining the score)
"""
    
    return prompt


def _call_llm_with_retry(
    llm,
    prompt: str,
    max_retries: int = 3,
    retry_delay_ms: int = 500
) -> GradedJudgment:
    """
    Call LLM with retry logic on failure.
    
    Args:
        llm: LangChain LLM with structured output
        prompt: Evaluation prompt
        max_retries: Maximum retry attempts
        retry_delay_ms: Delay between retries in milliseconds
    
    Returns:
        GradedJudgment object
    
    Raises:
        Exception: If all retries fail
    """
    
    last_error = None
    
    for attempt in range(max_retries):
        try:
            response = llm.invoke(prompt)
            return response
        except Exception as e:
            last_error = e
            if attempt < max_retries - 1:
                # Wait before retry
                time.sleep(retry_delay_ms / 1000.0)
                continue
            else:
                # All retries exhausted
                raise Exception(f"LLM call failed after {max_retries} attempts. Last error: {str(e)}")
    
    # Should not reach here, but just in case
    raise Exception(f"LLM call failed: {str(last_error)}")


def batch_llm_as_judge_graded(
    questions: List[str],
    gold_answers: List[str],
    generated_answers: List[str],
    provider: str = "openai",
    model: str = "gpt-4o-mini",
    temperature: float = 0.0,
    max_retries: int = 3,
    retry_delay_ms: int = 500
) -> Dict[str, Any]:
    """
    Evaluate multiple answers using graded LLM-as-Judge.
    
    Args:
        questions: List of questions
        gold_answers: List of gold answers
        generated_answers: List of generated answers
        provider: LLM provider
        model: Model name
        temperature: Generation temperature
        max_retries: Retry attempts per call
        retry_delay_ms: Delay between retries
    
    Returns:
        Dictionary with results and statistics
    """
    
    if not (len(questions) == len(gold_answers) == len(generated_answers)):
        raise ValueError(
            f"Length mismatch: {len(questions)} questions, "
            f"{len(gold_answers)} gold answers, {len(generated_answers)} generated answers"
        )
    
    results = []
    scores = []
    failed_count = 0
    
    for i, (q, gold, gen) in enumerate(zip(questions, gold_answers, generated_answers)):
        print(f"Evaluating {i+1}/{len(questions)}...", end="\r")
        
        result = llm_as_judge_graded(
            question=q,
            gold_answer=gold,
            generated_answer=gen,
            provider=provider,
            model=model,
            temperature=temperature,
            max_retries=max_retries,
            retry_delay_ms=retry_delay_ms,
            return_details=False
        )
        
        results.append(result)
        
        if result.get('success', False):
            scores.append(result['score'])
        else:
            failed_count += 1
    
    print()  # Clear progress line
    
    # Calculate statistics
    total = len(questions)
    success_count = total - failed_count
    
    if scores:
        mean_score = sum(scores) / len(scores)
        median_score = sorted(scores)[len(scores) // 2]
        
        # Score distribution
        score_distribution = {i: scores.count(i) for i in range(5)}
    else:
        mean_score = 0.0
        median_score = 0
        score_distribution = {i: 0 for i in range(5)}
    
    return {
        'results': results,
        'total': total,
        'success_count': success_count,
        'failed_count': failed_count,
        'mean_score': mean_score,
        'median_score': median_score,
        'score_distribution': score_distribution,
        'scores': scores
    }


def _test_llm_as_judge_graded():
    """Quick test for graded LLM-as-Judge"""
    
    print("Testing graded LLM-as-Judge...")
    print("NOTE: This requires OpenAI API key to be set")
    print()
    
    # Test case 1: Good match
    print("Test 1: Good semantic match")
    result = llm_as_judge_graded(
        question="Which segment dragged down 3M's overall growth in 2022?",
        gold_answer="The consumer segment shrunk by 0.9% organically.",
        generated_answer="The Consumer segment has dragged down 3M's overall growth in 2022.",
        model="gpt-4o-mini"
    )
    
    print(f"Score: {result['score']}/4")
    print(f"Facts present: {result['facts_present']}")
    print(f"Facts missing: {result['facts_missing']}")
    print(f"Justification: {result['justification']}")
    print()
    
    # Test case 2: Wrong number
    print("Test 2: Wrong numeric answer")
    result = llm_as_judge_graded(
        question="How many times has AES converted inventory in FY2022?",
        gold_answer="AES has converted inventory 9.5 times in FY 2022.",
        generated_answer="AES Corporation sold its inventory roughly 12 times in FY2022.",
        model="gpt-4o-mini"
    )
    
    print(f"Score: {result['score']}/4")
    print(f"Justification: {result['justification']}")
    print()
    
    # Test case 3: Refusal
    print("Test 3: Refusal detection")
    result = llm_as_judge_graded(
        question="What is the inventory turnover ratio?",
        gold_answer="The ratio is 9.5 times.",
        generated_answer="I cannot calculate this without specific data.",
        model="gpt-4o-mini"
    )
    
    print(f"Score: {result['score']}/4")
    print(f"Justification: {result['justification']}")


if __name__ == "__main__":
    print("Graded LLM-as-Judge Module")
    print("="*70)
    print()
    print("To test, run: _test_llm_as_judge_graded()")
    print("Make sure OPENAI_API_KEY is set in environment")

Graded LLM-as-Judge Module

To test, run: _test_llm_as_judge_graded()
Make sure OPENAI_API_KEY is set in environment


In [6]:
"""
Comprehensive Test Suite for Graded LLM-as-Judge
=================================================

Tests cover:
- Domain-relevant examples
- Novel-generated examples
- Edge cases (refusals, perfect matches, wrong answers)
- Batch processing

NOTE: These tests require OPENAI_API_KEY to be set in environment
"""

import os


def check_api_key():
    """Check if OpenAI API key is available"""
    if not os.getenv("OPENAI_API_KEY"):
        print("‚ö†Ô∏è  WARNING: OPENAI_API_KEY not found in environment")
        print("   These tests will fail without API access")
        print("   Set with: export OPENAI_API_KEY='your-key-here'")
        return False
    return True


def test_novel_generated_examples():
    """Test with novel-generated question examples"""
    print("\n" + "="*70)
    print("TEST SUITE 1: Novel-Generated Examples")
    print("="*70)
    
    if not check_api_key():
        print("Skipping tests - no API key")
        return 0, 0
    
    tests = [
        # (question, gold, generated, expected_score_min, expected_score_max, description)
        (
            "Which segment dragged down 3M's overall growth in 2022 excluding M&A?",
            "The consumer segment shrunk by 0.9% organically.",
            "The Consumer segment.",
            1, 3,
            "Partial answer - segment correct but missing details"
        ),
        (
            "Which segment dragged down 3M's overall growth in 2022 excluding M&A?",
            "The consumer segment shrunk by 0.9% organically.",
            "The Consumer segment has dragged down 3M's overall growth in 2022.",
            2, 3,
            "Good answer but missing specific percentage"
        ),
        (
            "Which derivative had the highest notional value for Verizon in FY 2021?",
            "Cross currency swaps. Its notional value was $32,502 million.",
            "Interest rate swaps had the highest notional value among Verizon's derivative instruments in FY 2021.",
            0, 1,
            "Wrong instrument type"
        ),
        (
            "Which derivative had the highest notional value for Verizon in FY 2021?",
            "Cross currency swaps. Its notional value was $32,502 million.",
            "Cross currency swaps had the highest notional value in FY 2021, at $32,502 million.",
            4, 4,
            "Perfect match with all facts"
        ),
    ]
    
    passed, failed = 0, 0
    
    for question, gold, gen, min_score, max_score, desc in tests:
        print(f"\nTest: {desc}")
        print(f"Expected score range: {min_score}-{max_score}")
        
        try:
            result = llm_as_judge_graded(
                question=question,
                gold_answer=gold,
                generated_answer=gen,
                model="gpt-4o-mini"
            )
            
            score = result['score']
            
            if min_score <= score <= max_score:
                passed += 1
                print(f"‚úì Score: {score}/4 (within expected range)")
                print(f"  Facts present: {len(result['facts_present'])}/{len(result['key_facts_gold'])}")
            else:
                failed += 1
                print(f"‚úó Score: {score}/4 (expected {min_score}-{max_score})")
                print(f"  Justification: {result['justification']}")
        except Exception as e:
            failed += 1
            print(f"‚úó Error: {e}")
    
    print(f"\nNovel-Generated Examples: {passed}/{len(tests)} passed")
    return passed, failed


def test_domain_relevant_examples():
    """Test with domain-relevant question examples"""
    print("\n" + "="*70)
    print("TEST SUITE 2: Domain-Relevant Examples")
    print("="*70)
    
    if not check_api_key():
        print("Skipping tests - no API key")
        return 0, 0
    
    tests = [
        (
            "Does AMD have a reasonably healthy liquidity profile based on its quick ratio for FY22?",
            "Yes. The quick ratio is 1.57, calculated as (cash and cash equivalents+Short term investments+Accounts receivable, net+receivables from related parties)/ (current liabilities).",
            "Yes, AMD has a reasonably healthy liquidity profile based on its quick ratio for FY22.",
            2, 4,
            "Good answer - yes + context, missing specific ratio"
        ),
        (
            "Does AMD have a reasonably healthy liquidity profile based on its quick ratio for FY22?",
            "Yes. The quick ratio is 1.57, calculated as (cash and cash equivalents+Short term investments+Accounts receivable, net+receivables from related parties)/ (current liabilities).",
            "Yes, AMD has a reasonably healthy liquidity profile based on its quick ratio of approximately 1.57 for FY22.",
            4, 4,
            "Perfect - has all key facts"
        ),
        (
            "Roughly how many times has AES Corporation sold its inventory in FY2022?",
            "AES has converted inventory 9.5 times in FY 2022.",
            "AES Corporation sold its inventory roughly 12 times in FY2022.",
            0, 2,
            "Wrong number - 12 vs 9.5 is significant error"
        ),
        (
            "Roughly how many times has AES Corporation sold its inventory in FY2022?",
            "AES has converted inventory 9.5 times in FY 2022.",
            "The inventory turnover ratio for AES Corporation in FY2022 cannot be calculated without specific COGS and average inventory figures.",
            0, 0,
            "Refusal - should score 0"
        ),
    ]
    
    passed, failed = 0, 0
    
    for question, gold, gen, min_score, max_score, desc in tests:
        print(f"\nTest: {desc}")
        print(f"Expected score range: {min_score}-{max_score}")
        
        try:
            result = llm_as_judge_graded(
                question=question,
                gold_answer=gold,
                generated_answer=gen,
                model="gpt-4o-mini"
            )
            
            score = result['score']
            
            if min_score <= score <= max_score:
                passed += 1
                print(f"‚úì Score: {score}/4 (within expected range)")
                print(f"  Justification: {result['justification'][:100]}...")
            else:
                failed += 1
                print(f"‚úó Score: {score}/4 (expected {min_score}-{max_score})")
                print(f"  Justification: {result['justification']}")
        except Exception as e:
            failed += 1
            print(f"‚úó Error: {e}")
    
    print(f"\nDomain-Relevant Examples: {passed}/{len(tests)} passed")
    return passed, failed


def test_edge_cases():
    """Test edge cases"""
    print("\n" + "="*70)
    print("TEST SUITE 3: Edge Cases")
    print("="*70)
    
    if not check_api_key():
        print("Skipping tests - no API key")
        return 0, 0
    
    tests = [
        (
            "What is the value?",
            "42",
            "42",
            4, 4,
            "Exact match - simple"
        ),
        (
            "What is the value?",
            "The answer is 42.",
            "I don't know",
            0, 0,
            "Refusal"
        ),
        (
            "What is the value?",
            "The answer is 42.",
            "Data not available.",
            0, 0,
            "Data unavailable refusal"
        ),
        (
            "Describe the trend.",
            "Revenue increased by 25% from $100M to $125M.",
            "Revenue went up significantly, approximately 25%, from around $100M to $125M.",
            3, 4,
            "Semantic equivalence - different words, same meaning"
        ),
    ]
    
    passed, failed = 0, 0
    
    for question, gold, gen, min_score, max_score, desc in tests:
        print(f"\nTest: {desc}")
        print(f"Expected score range: {min_score}-{max_score}")
        
        try:
            result = llm_as_judge_graded(
                question=question,
                gold_answer=gold,
                generated_answer=gen,
                model="gpt-4o-mini"
            )
            
            score = result['score']
            
            if min_score <= score <= max_score:
                passed += 1
                print(f"‚úì Score: {score}/4")
            else:
                failed += 1
                print(f"‚úó Score: {score}/4 (expected {min_score}-{max_score})")
                print(f"  Justification: {result['justification']}")
        except Exception as e:
            failed += 1
            print(f"‚úó Error: {e}")
    
    print(f"\nEdge Cases: {passed}/{len(tests)} passed")
    return passed, failed


def test_retry_logic():
    """Test retry logic with intentional failure"""
    print("\n" + "="*70)
    print("TEST SUITE 4: Retry Logic")
    print("="*70)
    
    print("Testing with invalid model to trigger retry...")
    
    try:
        result = llm_as_judge_graded(
            question="Test question",
            gold_answer="Test gold",
            generated_answer="Test generated",
            model="invalid-model-name",
            max_retries=2,
            retry_delay_ms=100
        )
        
        # Should return error result
        if not result.get('success', True):
            print("‚úì Retry logic works - returns error result after retries")
            return 1, 0
        else:
            print("‚úó Should have failed but didn't")
            return 0, 1
    except Exception as e:
        print(f"‚úì Retry logic works - raised exception after retries: {type(e).__name__}")
        return 1, 0


def test_batch_processing():
    """Test batch evaluation"""
    print("\n" + "="*70)
    print("TEST SUITE 5: Batch Processing")
    print("="*70)
    
    if not check_api_key():
        print("Skipping tests - no API key")
        return 0, 0
    
    questions = [
        "What is the value?",
        "What is the result?",
        "What is the answer?",
    ]
    
    gold_answers = [
        "The value is 42.",
        "The result is positive.",
        "The answer is yes.",
    ]
    
    generated_answers = [
        "42",
        "Positive result.",
        "I don't know",
    ]
    
    print(f"Processing {len(questions)} questions...")
    
    try:
        result = batch_llm_as_judge_graded(
            questions=questions,
            gold_answers=gold_answers,
            generated_answers=generated_answers,
            model="gpt-4o-mini",
            retry_delay_ms=100
        )
        
        print(f"\nResults:")
        print(f"  Total: {result['total']}")
        print(f"  Success: {result['success_count']}")
        print(f"  Failed: {result['failed_count']}")
        print(f"  Mean score: {result['mean_score']:.2f}")
        print(f"  Score distribution: {result['score_distribution']}")
        
        if result['success_count'] == len(questions):
            print("‚úì Batch processing working correctly")
            return 1, 0
        else:
            print(f"‚úó Some evaluations failed")
            return 0, 1
    except Exception as e:
        print(f"‚úó Batch processing failed: {e}")
        return 0, 1


def run_all_tests():
    """Run all test suites"""
    print("\n" + "="*70)
    print("COMPREHENSIVE TEST SUITE FOR GRADED LLM-AS-JUDGE")
    print("="*70)
    
    if not check_api_key():
        print("\n‚ùå Cannot run tests without OPENAI_API_KEY")
        print("Please set your API key and try again")
        return False
    
    total_passed = 0
    total_failed = 0
    
    test_suites = [
        test_novel_generated_examples,
        test_domain_relevant_examples,
        test_edge_cases,
        test_retry_logic,
        test_batch_processing,
    ]
    
    for test_func in test_suites:
        try:
            passed, failed = test_func()
            total_passed += passed
            total_failed += failed
        except Exception as e:
            print(f"‚úó Test suite crashed: {e}")
            total_failed += 1
    
    print("\n" + "="*70)
    print("FINAL SUMMARY")
    print("="*70)
    print(f"Total Tests: {total_passed + total_failed}")
    print(f"‚úì Passed: {total_passed}")
    print(f"‚úó Failed: {total_failed}")
    if total_passed + total_failed > 0:
        print(f"Success Rate: {100 * total_passed / (total_passed + total_failed):.1f}%")
    print("="*70)
    
    return total_failed == 0


if __name__ == "__main__":
    # Import the module
    import sys
    sys.path.append('/home/claude')
    
    success = run_all_tests()
    
    if success:
        print("\nüéâ ALL TESTS PASSED! üéâ")
    else:
        print("\n‚ö†Ô∏è  SOME TESTS FAILED - Review output above")
        print("Note: LLM scores can vary, some variance is expected")


COMPREHENSIVE TEST SUITE FOR GRADED LLM-AS-JUDGE

TEST SUITE 1: Novel-Generated Examples

Test: Partial answer - segment correct but missing details
Expected score range: 1-3


/var/folders/lj/175ptt0d6knb0gg0lg2h4n2h0000gp/T/ipykernel_57483/3816411334.py:153: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.12/migration/
  result['raw_response'] = judgment.dict()


‚úì Score: 1/4 (within expected range)
  Facts present: 1/4

Test: Good answer but missing specific percentage
Expected score range: 2-3
‚úì Score: 2/4 (within expected range)
  Facts present: 2/4

Test: Wrong instrument type
Expected score range: 0-1
‚úì Score: 0/4 (within expected range)
  Facts present: 4/3

Test: Perfect match with all facts
Expected score range: 4-4
‚úì Score: 4/4 (within expected range)
  Facts present: 4/4

Novel-Generated Examples: 4/4 passed

TEST SUITE 2: Domain-Relevant Examples

Test: Good answer - yes + context, missing specific ratio
Expected score range: 2-4
‚úì Score: 3/4 (within expected range)
  Justification: The generated answer correctly affirms that AMD has a healthy liquidity profile and mentions the qui...

Test: Perfect - has all key facts
Expected score range: 4-4
‚úì Score: 4/4 (within expected range)
  Justification: The generated answer captures all essential information: it confirms a healthy liquidity profile, pr...

Test: Wrong number - 

In [8]:
question = "What drove revenue change as of the FY22 for AMD?"
gold_answer = "In 2022, AMD reported Higher sales of their EPYC server processors, higher semi-custom product sales, and the inclusion of Xilinx embedded product sales"
generated_answer = "Revenue change for AMD in FY22 was driven by a 64% increase in Data Center segment revenue, a 21% increase in Gaming segment revenue, and significant growth in Embedded segment revenue from Xilinx product sales."

result = llm_as_judge_graded(
    question=question,
    gold_answer=gold_answer,
    generated_answer=generated_answer,
    model="gpt-4o-mini",
    max_retries=3,
    retry_delay_ms=1000
)

print(result)

{'score': 3, 'key_facts_gold': ['AMD', 'higher sales', 'EPYC server processors', 'higher semi-custom product sales', 'inclusion of Xilinx embedded product sales'], 'facts_present': ['AMD', 'revenue change', '64% increase in Data Center segment revenue', '21% increase in Gaming segment revenue', 'significant growth in Embedded segment revenue from Xilinx product sales'], 'facts_missing': ['higher sales of EPYC server processors', 'higher semi-custom product sales'], 'justification': 'The generated answer provides specific percentage increases for the Data Center and Gaming segments, as well as mentioning Xilinx product sales, which aligns with the gold answer. However, it omits the mention of higher sales of EPYC server processors and higher semi-custom product sales, which are key components of the revenue change.', 'success': True, 'raw_response': {'score': 3, 'key_facts_gold': ['AMD', 'higher sales', 'EPYC server processors', 'higher semi-custom product sales', 'inclusion of Xilinx e

/var/folders/lj/175ptt0d6knb0gg0lg2h4n2h0000gp/T/ipykernel_57483/3816411334.py:153: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.12/migration/
  result['raw_response'] = judgment.dict()
