In [3]:
import os
from dotenv import load_dotenv

# Load variables from a .env file (if one is found) into the process
# environment before anything below reads os.environ.
load_dotenv()

# Read once at import time; os.getenv returns None when the variable is unset.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [6]:
"""
Aggregate Evaluator - Master Router for Financial QA Evaluation
================================================================
This module provides the master evaluation function that routes to appropriate
metrics based on question type and returns comprehensive evaluation results.

Routes to:
- metrics-generated: numerical_exact_match + llm_as_judge_binary
- novel-generated: token_f1 + llm_as_judge_graded  
- domain-relevant: llm_as_judge_graded only

Author: Financial QA Evaluation System
Version: 1.0
"""

from typing import Dict, Any, Optional, List
import sys
import os

# Import all the metric functions we built in Phase 1 and Phase 2
# Note: Adjust these imports based on your actual file structure
try:
    from numerical_exact_match import numerical_exact_match
    from token_f1 import token_f1
    from detect_refusal import detect_refusal
    from llm_as_judge_binary import llm_as_judge_binary
    from llm_as_judge_graded import llm_as_judge_graded
except ImportError as e:
    print(f"Warning: Could not import some modules: {e}")
    print("Make sure all Phase 1 and Phase 2 modules are in the Python path")


# Default configuration
# Baseline settings used by evaluate_answer(); caller-supplied keys override
# these one by one (the dict itself is copied, never mutated, by that function).
DEFAULT_CONFIG = {
    # Numerical matching
    # Forwarded as `tolerance` to both numerical_exact_match and
    # llm_as_judge_binary; 0.01 is intended to mean 1%.
    'tolerance': 0.01,  # 1% for numerical_exact_match
    
    # Token F1 (both flags are forwarded unchanged to token_f1)
    'normalize': True,
    'remove_stopwords': False,
    
    # LLM settings (forwarded to llm_as_judge_binary / llm_as_judge_graded)
    'llm_provider': 'openai',
    'llm_model': 'gpt-4o-mini',
    'llm_temperature': 0.0,
    'llm_max_retries': 3,
    'llm_retry_delay_ms': 500,
    
    # Refusal detection: when truthy, detect_refusal runs before the metrics
    'pre_check_refusal': True,
    
    # Return details: forwarded as `return_details` to the LLM judges
    'return_details': True
}


def evaluate_answer(
    question: str,
    question_type: str,
    gold_answer: str,
    generated_answer: str,
    config: Optional[Dict[str, Any]] = None
) -> Dict[str, Any]:
    """
    Score one generated answer against its gold answer, picking the metrics
    from the question type.

    Routing table:
        - 'metrics-generated' -> numerical_exact_match + llm_as_judge_binary
        - 'novel-generated'   -> token_f1 + llm_as_judge_graded
        - 'domain-relevant'   -> llm_as_judge_graded

    Args:
        question: The question that was asked.
        question_type: One of the three routing keys listed above.
        gold_answer: Reference (ground-truth) answer.
        generated_answer: Answer produced by the system under evaluation.
        config: Optional overrides layered on top of DEFAULT_CONFIG.

    Returns:
        A dictionary with the keys:
            - question_type, question, gold_answer, generated_answer: echoed inputs
            - refusal_check: output of detect_refusal, or None when the
              pre-check is disabled or failed
            - metrics: per-metric results keyed by metric name
            - summary: question_type, refusal_detected, metrics_computed,
              evaluation_complete and errors

    Raises:
        ValueError: If question_type is not one of the routing keys.
        Exception: If any metric fails; the original error is chained as
            the cause.

    Example:
        >>> result = evaluate_answer(
        ...     question="What is the FY2018 capex for 3M?",
        ...     question_type="metrics-generated",
        ...     gold_answer="$1577.00",
        ...     generated_answer="1577 million dollars"
        ... )
        >>> sorted(result['metrics'])
        ['llm_as_judge_binary', 'numerical_exact_match']
    """

    # Start from the defaults and let caller-supplied keys win.
    settings = DEFAULT_CONFIG.copy()
    if config:
        settings.update(config)

    # Which metric names are reported for each question type.
    metric_names_by_type = {
        'metrics-generated': ['numerical_exact_match', 'llm_as_judge_binary'],
        'novel-generated': ['token_f1', 'llm_as_judge_graded'],
        'domain-relevant': ['llm_as_judge_graded'],
    }

    # Reject unknown types before doing any work.
    valid_types = list(metric_names_by_type)
    if question_type not in valid_types:
        raise ValueError(
            f"Invalid question_type: '{question_type}'. "
            f"Must be one of: {valid_types}"
        )

    summary = {
        'question_type': question_type,
        'refusal_detected': False,
        'metrics_computed': [],
        'evaluation_complete': False,
        'errors': []
    }
    result = {
        'question_type': question_type,
        'question': question,
        'gold_answer': gold_answer,
        'generated_answer': generated_answer,
        'refusal_check': None,
        'metrics': {},
        'summary': summary
    }

    # Phase 1: optional refusal pre-check. A failure here is recorded but is
    # not fatal, and a detected refusal does not skip the metrics below.
    if settings['pre_check_refusal']:
        try:
            refusal = detect_refusal(generated_answer)
            result['refusal_check'] = refusal
            summary['refusal_detected'] = refusal['is_refusal']
        except Exception as exc:
            summary['errors'].append(f"Refusal detection failed: {str(exc)}")

    # Phase 2: run the evaluator registered for this question type.
    # Any metric failure aborts the whole evaluation (fail-fast).
    try:
        evaluators = {
            'metrics-generated': _evaluate_metrics_generated,
            'novel-generated': _evaluate_novel_generated,
            'domain-relevant': _evaluate_domain_relevant,
        }
        result['metrics'] = evaluators[question_type](
            question, gold_answer, generated_answer, settings
        )
        summary['metrics_computed'] = metric_names_by_type[question_type]
        summary['evaluation_complete'] = True
    except Exception as exc:
        summary['evaluation_complete'] = False
        summary['errors'].append(f"Evaluation failed: {str(exc)}")
        raise Exception(f"Evaluation failed for {question_type}: {str(exc)}") from exc

    return result


def _evaluate_metrics_generated(
    question: str,
    gold_answer: str,
    generated_answer: str,
    config: Dict[str, Any]
) -> Dict[str, Any]:
    """
    Run the two numerical checks used for metrics-generated questions.

    The rule-based numerical_exact_match runs first, then the LLM-based
    llm_as_judge_binary; both receive config['tolerance'].

    Args:
        question: Question text (only the LLM judge receives it)
        gold_answer: Gold answer
        generated_answer: Generated answer
        config: Fully merged configuration dictionary

    Returns:
        {'numerical_exact_match': ..., 'llm_as_judge_binary': ...}

    Raises:
        Exception: If either metric fails; the message names the metric
            and the original error is chained.
    """

    # Rule-based comparison.
    try:
        rule_based = numerical_exact_match(
            gold_answer=gold_answer,
            generated_answer=generated_answer,
            tolerance=config['tolerance']
        )
    except Exception as exc:
        raise Exception(f"numerical_exact_match failed: {str(exc)}") from exc

    # LLM-based comparison. The keyword arguments are assembled inside the
    # try so a missing config key is reported as a failure of this metric.
    try:
        judge_kwargs = {
            'question': question,
            'gold_answer': gold_answer,
            'generated_answer': generated_answer,
            'tolerance': config['tolerance'],
            'provider': config['llm_provider'],
            'model': config['llm_model'],
            'temperature': config['llm_temperature'],
            'max_retries': config['llm_max_retries'],
            'retry_delay_ms': config['llm_retry_delay_ms'],
            'return_details': config['return_details'],
        }
        llm_based = llm_as_judge_binary(**judge_kwargs)
    except Exception as exc:
        raise Exception(f"llm_as_judge_binary failed: {str(exc)}") from exc

    return {
        'numerical_exact_match': rule_based,
        'llm_as_judge_binary': llm_based,
    }


def _evaluate_novel_generated(
    question: str,
    gold_answer: str,
    generated_answer: str,
    config: Dict[str, Any]
) -> Dict[str, Any]:
    """
    Run the overlap and semantic checks used for novel-generated questions.

    token_f1 runs first, then llm_as_judge_graded.

    Args:
        question: Question text (only the LLM judge receives it)
        gold_answer: Gold answer
        generated_answer: Generated answer
        config: Fully merged configuration dictionary

    Returns:
        {'token_f1': ..., 'llm_as_judge_graded': ...}

    Raises:
        Exception: If either metric fails; the message names the metric
            and the original error is chained.
    """

    # Token-level overlap.
    try:
        overlap = token_f1(
            gold_answer=gold_answer,
            generated_answer=generated_answer,
            normalize=config['normalize'],
            remove_stopwords=config['remove_stopwords']
        )
    except Exception as exc:
        raise Exception(f"token_f1 failed: {str(exc)}") from exc

    # Graded semantic judgement. Arguments are built inside the try so a
    # missing config key is attributed to this metric.
    try:
        judge_kwargs = {
            'question': question,
            'gold_answer': gold_answer,
            'generated_answer': generated_answer,
            'provider': config['llm_provider'],
            'model': config['llm_model'],
            'temperature': config['llm_temperature'],
            'max_retries': config['llm_max_retries'],
            'retry_delay_ms': config['llm_retry_delay_ms'],
            'return_details': config['return_details'],
        }
        graded = llm_as_judge_graded(**judge_kwargs)
    except Exception as exc:
        raise Exception(f"llm_as_judge_graded failed: {str(exc)}") from exc

    return {
        'token_f1': overlap,
        'llm_as_judge_graded': graded,
    }


def _evaluate_domain_relevant(
    question: str,
    gold_answer: str,
    generated_answer: str,
    config: Dict[str, Any]
) -> Dict[str, Any]:
    """
    Run the single semantic check used for domain-relevant questions.

    Only llm_as_judge_graded is applied; there is no routing on answer
    length.

    Args:
        question: Question text
        gold_answer: Gold answer
        generated_answer: Generated answer
        config: Fully merged configuration dictionary

    Returns:
        {'llm_as_judge_graded': ...}

    Raises:
        Exception: If the judge fails; the original error is chained.
    """

    # Arguments are built inside the try so a missing config key is
    # reported as a failure of this metric.
    try:
        judge_kwargs = {
            'question': question,
            'gold_answer': gold_answer,
            'generated_answer': generated_answer,
            'provider': config['llm_provider'],
            'model': config['llm_model'],
            'temperature': config['llm_temperature'],
            'max_retries': config['llm_max_retries'],
            'retry_delay_ms': config['llm_retry_delay_ms'],
            'return_details': config['return_details'],
        }
        graded = llm_as_judge_graded(**judge_kwargs)
    except Exception as exc:
        raise Exception(f"llm_as_judge_graded failed: {str(exc)}") from exc

    return {'llm_as_judge_graded': graded}


# ============================================================================
# UTILITY FUNCTIONS
# ============================================================================

def get_default_config() -> Dict[str, Any]:
    """
    Hand out a fresh shallow copy of DEFAULT_CONFIG.

    Returns:
        A new dict with the same keys and values as DEFAULT_CONFIG, so
        callers can edit it without touching the module-level defaults.
    """
    return dict(DEFAULT_CONFIG)


def validate_config(config: Dict[str, Any]) -> bool:
    """
    Validate a (possibly partial) configuration dictionary.

    Only keys that are present are checked; missing keys are fine because
    evaluate_answer fills them in from DEFAULT_CONFIG.

    Checks performed:
        - tolerance: real number (not bool), finite, strictly positive
        - llm_model: string
        - llm_temperature: real number (not bool) between 0 and 2 inclusive

    Args:
        config: Configuration to validate

    Returns:
        True if valid

    Raises:
        ValueError: If configuration is invalid
    """
    import math  # local import so the block does not depend on file-level imports

    def _is_real_number(value: Any) -> bool:
        # bool is a subclass of int, so isinstance(True, int) is True;
        # a boolean tolerance/temperature is always a caller mistake.
        return isinstance(value, (int, float)) and not isinstance(value, bool)

    # Check tolerance
    if 'tolerance' in config:
        tolerance = config['tolerance']
        if not _is_real_number(tolerance):
            raise ValueError("tolerance must be a number")
        # NaN slips past "<= 0" (every comparison with NaN is False) and an
        # infinite tolerance would make every answer match.
        if isinstance(tolerance, float) and not math.isfinite(tolerance):
            raise ValueError("tolerance must be a finite number")
        if tolerance <= 0:
            raise ValueError("tolerance must be positive")

    # Check LLM model
    if 'llm_model' in config:
        if not isinstance(config['llm_model'], str):
            raise ValueError("llm_model must be a string")

    # Check temperature (NaN/inf are already rejected by the range test)
    if 'llm_temperature' in config:
        temperature = config['llm_temperature']
        if not _is_real_number(temperature):
            raise ValueError("llm_temperature must be a number")
        if not 0 <= temperature <= 2:
            raise ValueError("llm_temperature must be between 0 and 2")

    return True


def _truncate_for_display(text: str, limit: int) -> str:
    """Return text cut to limit characters, adding '...' only if something was cut."""
    return text if len(text) <= limit else f"{text[:limit]}..."


def print_evaluation_summary(result: Dict[str, Any]) -> None:
    """
    Print a human-readable summary of evaluation results.

    Long question/answer/justification strings are shortened for display;
    the ellipsis is shown only when text was actually cut off.

    Args:
        result: Result dictionary from evaluate_answer()
    """

    summary = result['summary']
    rule = "=" * 70

    print(rule)
    print("EVALUATION SUMMARY")
    print(rule)

    print(f"\nQuestion Type: {result['question_type']}")
    print(f"Question: {_truncate_for_display(result['question'], 80)}")
    print(f"Gold Answer: {_truncate_for_display(result['gold_answer'], 80)}")
    print(f"Generated Answer: {_truncate_for_display(result['generated_answer'], 80)}")

    # Refusal check (absent when the pre-check was disabled or failed)
    if result['refusal_check']:
        print(f"\nRefusal Detected: {summary['refusal_detected']}")
        if summary['refusal_detected']:
            print(f"  Type: {result['refusal_check']['refusal_type']}")

    # Metrics
    print(f"\nMetrics Computed: {', '.join(summary['metrics_computed'])}")

    for metric_name, metric_result in result['metrics'].items():
        print(f"\n{metric_name.upper()}:")

        if metric_name in ('numerical_exact_match', 'llm_as_judge_binary'):
            print(f"  Match: {metric_result['match']}")
            print(f"  Category: {metric_result['error_category']}")
            # Compare against None: a relative error of exactly 0.0 is a
            # valid (perfect) result and must still be shown.
            if metric_result.get('relative_error') is not None:
                print(f"  Relative Error: {metric_result['relative_error']:.3f}%")
            if metric_name == 'llm_as_judge_binary' and metric_result.get('corrected'):
                print("  ⚠️  Auto-corrected by post-processing")

        elif metric_name == 'token_f1':
            print(f"  F1: {metric_result['f1']:.3f}")
            print(f"  Precision: {metric_result['precision']:.3f}")
            print(f"  Recall: {metric_result['recall']:.3f}")

        elif metric_name == 'llm_as_judge_graded':
            # NOTE(review): assumes the judge output always carries these
            # keys, including when return_details is False - confirm.
            print(f"  Score: {metric_result['score']}/4")
            print(f"  Facts Present: {len(metric_result['facts_present'])}/{len(metric_result['key_facts_gold'])}")
            print(f"  Justification: {_truncate_for_display(metric_result['justification'], 100)}")

    # Completion status
    print(f"\nEvaluation Complete: {summary['evaluation_complete']}")
    if summary['errors']:
        print(f"Errors: {summary['errors']}")

    print(rule)


# ============================================================================
# TESTING FUNCTIONS
# ============================================================================

def _test_aggregate_evaluator():
    """Smoke-test evaluate_answer once per question type and print each summary."""

    print("Testing Aggregate Evaluator...")
    print("NOTE: This requires all Phase 1 and Phase 2 modules + OpenAI API key")
    print()

    # (heading, question_type, question, gold_answer, generated_answer)
    smoke_cases = [
        (
            "Test 1: Metrics-generated question",
            "metrics-generated",
            "What is the FY2018 capital expenditure amount (in USD millions) for 3M?",
            "$1577.00",
            "1577 million dollars",
        ),
        (
            "\n\nTest 2: Novel-generated question",
            "novel-generated",
            "Which segment dragged down 3M's overall growth in 2022?",
            "The consumer segment shrunk by 0.9% organically.",
            "The Consumer segment has dragged down 3M's overall growth.",
        ),
        (
            "\n\nTest 3: Domain-relevant question",
            "domain-relevant",
            "Does AMD have a reasonably healthy liquidity profile?",
            "Yes. The quick ratio is 1.57, calculated as (cash and cash equivalents+Short term investments+Accounts receivable, net+receivables from related parties)/ (current liabilities).",
            "Yes, AMD has a reasonably healthy liquidity profile based on its quick ratio for FY22.",
        ),
    ]

    for position, (heading, kind, asked, gold, produced) in enumerate(smoke_cases):
        # A blank line separates consecutive cases; none follows the last one.
        if position:
            print()
        print(heading)
        print("-"*70)
        outcome = evaluate_answer(
            question=asked,
            question_type=kind,
            gold_answer=gold,
            generated_answer=produced
        )
        print_evaluation_summary(outcome)


if __name__ == "__main__":
    # Running the module directly only prints usage hints; no evaluation
    # (and therefore no API call) is triggered here.
    usage_banner = (
        "Aggregate Evaluator Module",
        "="*70,
        "",
        "To test, run: _test_aggregate_evaluator()",
        "Make sure all dependencies are installed and OPENAI_API_KEY is set",
    )
    for banner_line in usage_banner:
        print(banner_line)

Aggregate Evaluator Module

To test, run: _test_aggregate_evaluator()
Make sure all dependencies are installed and OPENAI_API_KEY is set


In [8]:
"""
Comprehensive Test Suite for Aggregate Evaluator
=================================================
Tests the master evaluation function that routes to appropriate metrics.

NOTE: These tests require:
- All Phase 1 modules (numerical_exact_match, token_f1, detect_refusal)
- All Phase 2 modules (llm_as_judge_binary, llm_as_judge_graded)
- OpenAI API key set in environment
"""

import os
import sys

# Extend the module search path so the metric modules can be imported.
# NOTE(review): this appends the hard-coded absolute path '/home/claude',
# not the parent directory of this file - confirm it matches the environment
# the tests run in.
sys.path.append('/home/claude')


def check_api_key() -> bool:
    """
    Report whether an OpenAI API key is available in the environment.

    A key that is unset, empty, or only whitespace counts as missing; in
    that case a short warning with setup instructions is printed.

    Returns:
        True if OPENAI_API_KEY holds a non-blank value, False otherwise.
    """
    if os.getenv("OPENAI_API_KEY", "").strip():
        return True

    print("⚠️  WARNING: OPENAI_API_KEY not found in environment")
    print("   These tests will fail without API access")
    print("   Set with: export OPENAI_API_KEY='your-key-here'")
    return False


def test_metrics_generated():
    """
    Test evaluation of metrics-generated questions.

    A case passes when at least one of the two metrics
    (numerical_exact_match, llm_as_judge_binary) agrees with the expected
    match value. Structural problems and exceptions count as failures.

    Returns:
        (passed, failed) counts; (0, 0) when skipped for lack of an API key.
    """
    print("\n" + "="*70)
    print("TEST SUITE 1: Metrics-Generated Questions")
    print("="*70)
    
    if not check_api_key():
        print("Skipping tests - no API key")
        return 0, 0
    
    tests = [
        # (question, gold, generated, description, should_match)
        (
            "What is the FY2018 capital expenditure amount (in USD millions) for 3M?",
            "$1577.00",
            "1577 million dollars",
            "Exact match with format difference",
            True
        ),
        (
            "What is the operating margin?",
            "24.5%",
            "24.48%",
            "Within tolerance",
            True
        ),
        (
            "What is the inventory turnover?",
            "9.5",
            "12",
            "Out of tolerance",
            False
        ),
        (
            "What is the ratio?",
            "0.66",
            "I cannot calculate without data",
            "Refusal",
            False
        ),
    ]
    
    passed, failed = 0, 0
    
    for question, gold, gen, desc, expected_match in tests:
        print(f"\nTest: {desc}")
        print(f"Expected match: {expected_match}")
        
        try:
            result = evaluate_answer(
                question=question,
                question_type="metrics-generated",
                gold_answer=gold,
                generated_answer=gen
            )
            
            # Check structure. Each assert carries a message because the
            # handler below prints str(exception), which is empty for a
            # bare assert.
            assert 'question_type' in result, "result is missing 'question_type'"
            assert 'metrics' in result, "result is missing 'metrics'"
            assert 'summary' in result, "result is missing 'summary'"
            assert result['question_type'] == 'metrics-generated', (
                f"unexpected question_type: {result['question_type']!r}"
            )
            
            # Check metrics exist
            assert 'numerical_exact_match' in result['metrics'], (
                "metrics is missing 'numerical_exact_match'"
            )
            assert 'llm_as_judge_binary' in result['metrics'], (
                "metrics is missing 'llm_as_judge_binary'"
            )
            
            # Check refusal detection (informational only: a missed refusal
            # does not fail the case)
            if "cannot" in gen.lower() or "don't" in gen.lower():
                if result['summary']['refusal_detected']:
                    print("✓ Refusal detected correctly")
            
            # Check at least one metric agrees with expected
            nem_match = result['metrics']['numerical_exact_match']['match']
            llm_match = result['metrics']['llm_as_judge_binary']['match']
            
            if nem_match == expected_match or llm_match == expected_match:
                passed += 1
                print("✓ At least one metric matches expectation")
                print(f"  NEM: {nem_match}, LLM: {llm_match}")
            else:
                failed += 1
                print("✗ Neither metric matches expectation")
                print(f"  Expected: {expected_match}")
                print(f"  NEM: {nem_match}, LLM: {llm_match}")
                
        except Exception as e:
            failed += 1
            print(f"✗ Error: {e}")
    
    print(f"\nMetrics-Generated: {passed}/{len(tests)} passed")
    return passed, failed


def test_novel_generated():
    """
    Test evaluation of novel-generated questions.

    Each case sets a minimum token F1 and a graded-score window. The lower
    bounds keep a small leniency (F1 within 0.1, score within 1 point); the
    upper score bound is strict, so a refusal that is expected to score 0
    can no longer pass with a high score.

    Returns:
        (passed, failed) counts; (0, 0) when skipped for lack of an API key.
    """
    print("\n" + "="*70)
    print("TEST SUITE 2: Novel-Generated Questions")
    print("="*70)
    
    if not check_api_key():
        print("Skipping tests - no API key")
        return 0, 0
    
    tests = [
        # (question, gold, generated, description, min_f1, min_score, max_score)
        (
            "Which segment dragged down 3M's overall growth in 2022?",
            "The consumer segment shrunk by 0.9% organically.",
            "The Consumer segment has dragged down 3M's overall growth.",
            "Partial match - missing details",
            0.3,  # Expect some token overlap
            2,    # Expect score >= 2
            4     # No upper limit (4 is the top of the graded scale)
        ),
        (
            "Which derivative had the highest notional value for Verizon in FY 2021?",
            "Cross currency swaps. Its notional value was $32,502 million.",
            "Cross currency swaps had the highest notional value in FY 2021, at $32,502 million.",
            "Good match with all facts",
            0.6,  # Good token overlap
            3,    # Expect score >= 3
            4     # No upper limit
        ),
        (
            "What is the answer?",
            "The answer is 42.",
            "I don't know",
            "Refusal",
            0.0,  # No overlap
            0,    # Expect score = 0
            0     # ...so anything above 0 is a failure
        ),
    ]
    
    passed, failed = 0, 0
    
    for question, gold, gen, desc, min_f1, min_score, max_score in tests:
        print(f"\nTest: {desc}")
        print(f"Expected: F1 >= {min_f1}, Score in [{min_score}, {max_score}]")
        
        try:
            result = evaluate_answer(
                question=question,
                question_type="novel-generated",
                gold_answer=gold,
                generated_answer=gen
            )
            
            # Check structure (messages keep the error line below informative)
            assert 'metrics' in result, "result is missing 'metrics'"
            assert result['question_type'] == 'novel-generated', (
                f"unexpected question_type: {result['question_type']!r}"
            )
            
            # Check metrics exist
            assert 'token_f1' in result['metrics'], "metrics is missing 'token_f1'"
            assert 'llm_as_judge_graded' in result['metrics'], (
                "metrics is missing 'llm_as_judge_graded'"
            )
            
            # Check values
            f1 = result['metrics']['token_f1']['f1']
            score = result['metrics']['llm_as_judge_graded']['score']
            
            under_cap = score <= max_score
            meets_floor = f1 >= min_f1 and score >= min_score
            # Allow some flexibility on the lower bounds only
            near_floor = f1 >= (min_f1 - 0.1) and score >= (min_score - 1)
            
            if under_cap and meets_floor:
                passed += 1
                print(f"✓ F1: {f1:.3f} >= {min_f1}, Score: {score} in [{min_score}, {max_score}]")
            elif under_cap and near_floor:
                passed += 1
                print(f"✓ Close enough: F1: {f1:.3f}, Score: {score}")
            else:
                failed += 1
                print(
                    f"✗ F1: {f1:.3f} (need >= {min_f1}) or "
                    f"Score: {score} (need {min_score}-{max_score})"
                )
                
        except Exception as e:
            failed += 1
            print(f"✗ Error: {e}")
    
    print(f"\nNovel-Generated: {passed}/{len(tests)} passed")
    return passed, failed


def test_domain_relevant():
    """
    Test evaluation of domain-relevant questions.

    Each case passes when the llm_as_judge_graded score falls inside the
    inclusive [min_score, max_score] window.

    Returns:
        (passed, failed) counts; (0, 0) when skipped for lack of an API key.
    """
    print("\n" + "="*70)
    print("TEST SUITE 3: Domain-Relevant Questions")
    print("="*70)
    
    if not check_api_key():
        print("Skipping tests - no API key")
        return 0, 0
    
    tests = [
        # (question, gold, generated, description, min_score, max_score)
        (
            "Does AMD have a reasonably healthy liquidity profile based on its quick ratio for FY22?",
            "Yes. The quick ratio is 1.57, calculated as (cash and cash equivalents+Short term investments+Accounts receivable, net+receivables from related parties)/ (current liabilities).",
            "Yes, AMD has a reasonably healthy liquidity profile based on its quick ratio for FY22.",
            "Good answer without calculation details",
            3, 4
        ),
        (
            "Roughly how many times has AES Corporation sold its inventory in FY2022?",
            "AES has converted inventory 9.5 times in FY 2022.",
            "AES Corporation sold its inventory roughly 12 times in FY2022.",
            "Wrong number",
            0, 2
        ),
        (
            "What is the liquidity ratio?",
            "The ratio is 0.96",
            "I cannot calculate this",
            "Refusal",
            0, 0
        ),
    ]
    
    passed, failed = 0, 0
    
    for question, gold, gen, desc, min_score, max_score in tests:
        print(f"\nTest: {desc}")
        print(f"Expected score: {min_score}-{max_score}")
        
        try:
            result = evaluate_answer(
                question=question,
                question_type="domain-relevant",
                gold_answer=gold,
                generated_answer=gen
            )
            
            # Check structure (messages keep the error line below informative)
            assert 'metrics' in result, "result is missing 'metrics'"
            assert result['question_type'] == 'domain-relevant', (
                f"unexpected question_type: {result['question_type']!r}"
            )
            
            # Check metric exists
            assert 'llm_as_judge_graded' in result['metrics'], (
                "metrics is missing 'llm_as_judge_graded'"
            )
            
            # Check value
            score = result['metrics']['llm_as_judge_graded']['score']
            
            if min_score <= score <= max_score:
                passed += 1
                print(f"✓ Score: {score} (within {min_score}-{max_score})")
            else:
                failed += 1
                print(f"✗ Score: {score} (expected {min_score}-{max_score})")
                
        except Exception as e:
            failed += 1
            print(f"✗ Error: {e}")
    
    print(f"\nDomain-Relevant: {passed}/{len(tests)} passed")
    return passed, failed


def test_invalid_question_type():
    """
    Test error handling for invalid question type.

    evaluate_answer must raise ValueError whose message contains
    "Invalid question_type" when given an unknown type.

    Returns:
        (1, 0) on success, (0, 1) on any other outcome.
    """
    print("\n" + "="*70)
    print("TEST SUITE 4: Error Handling")
    print("="*70)
    
    print("\nTest: Invalid question type")
    
    try:
        # Return value is irrelevant: the call itself is expected to raise.
        evaluate_answer(
            question="Test question",
            question_type="invalid-type",
            gold_answer="Answer",
            generated_answer="Answer"
        )
    except ValueError as e:
        if "Invalid question_type" in str(e):
            print(f"✓ Correctly raised ValueError: {e}")
            return 1, 0
        print(f"✗ Wrong error message: {e}")
        return 0, 1
    except Exception as e:
        print(f"✗ Wrong exception type: {e}")
        return 0, 1

    print("✗ Should have raised ValueError")
    return 0, 1


def test_config_validation():
    """
    Test configuration validation.

    Covers three cases: the default config is accepted, a negative
    tolerance is rejected, and an out-of-range temperature is rejected.

    Returns:
        (passed, failed) counts.
    """
    print("\n" + "="*70)
    print("TEST SUITE 5: Configuration Validation")
    print("="*70)
    
    passed, failed = 0, 0
    
    # Test 1: Valid config
    print("\nTest: Valid config")
    try:
        config = get_default_config()
        validate_config(config)
        print("✓ Default config is valid")
        passed += 1
    except Exception as e:
        print(f"✗ Error: {e}")
        failed += 1
    
    # Test 2: Invalid tolerance
    print("\nTest: Invalid tolerance (negative)")
    try:
        config = {'tolerance': -0.01}
        validate_config(config)
        print("✗ Should have raised ValueError")
        failed += 1
    except ValueError as e:
        print(f"✓ Correctly raised ValueError: {e}")
        passed += 1
    
    # Test 3: Invalid temperature
    print("\nTest: Invalid temperature (out of range)")
    try:
        config = {'llm_temperature': 3.0}
        validate_config(config)
        print("✗ Should have raised ValueError")
        failed += 1
    except ValueError as e:
        print(f"✓ Correctly raised ValueError: {e}")
        passed += 1
    
    # Derive the total from the counters so it stays right if cases are
    # added or removed.
    print(f"\nConfig Validation: {passed}/{passed + failed} passed")
    return passed, failed


def test_return_structure():
    """
    Test that the dictionary returned by evaluate_answer has the expected
    top-level, metrics and summary keys for a metrics-generated question.

    Returns:
        (1, 0) on success, (0, 1) on failure, (0, 0) when skipped for lack
        of an API key.
    """
    print("\n" + "="*70)
    print("TEST SUITE 6: Return Structure Validation")
    print("="*70)
    
    if not check_api_key():
        print("Skipping tests - no API key")
        return 0, 0
    
    print("\nTest: Return structure for metrics-generated")
    
    try:
        result = evaluate_answer(
            question="What is the value?",
            question_type="metrics-generated",
            gold_answer="100",
            generated_answer="100"
        )
        
        # Check all required keys
        required_keys = ['question_type', 'question', 'gold_answer', 'generated_answer', 
                        'refusal_check', 'metrics', 'summary']
        
        for key in required_keys:
            assert key in result, f"Missing key: {key}"
        
        # Check metrics structure
        for key in ('numerical_exact_match', 'llm_as_judge_binary'):
            assert key in result['metrics'], f"Missing metrics key: {key}"
        
        # Check summary structure
        for key in ('question_type', 'refusal_detected', 'metrics_computed',
                    'evaluation_complete'):
            assert key in result['summary'], f"Missing summary key: {key}"
        
        print("✓ Return structure is correct")
        return 1, 0
        
    except AssertionError as e:
        print(f"✗ Structure validation failed: {e}")
        return 0, 1
    except Exception as e:
        print(f"✗ Error: {e}")
        return 0, 1


def test_custom_config():
    """Check that a caller-supplied ``tolerance`` reaches the numerical metric.

    Evaluates generated answer ``"100.5"`` against gold ``"100"`` with a
    0.1% tolerance and expects ``numerical_exact_match`` to report no match.

    Returns:
        A ``(passed, failed)`` tuple: ``(1, 0)`` when no match is reported,
        ``(0, 1)`` when a match is reported or any exception is raised, and
        ``(0, 0)`` when the suite is skipped because ``check_api_key()``
        returned a falsy value.
    """
    print("\n" + "="*70)
    print("TEST SUITE 7: Custom Configuration")
    print("="*70)

    if not check_api_key():
        print("Skipping tests - no API key")
        return 0, 0

    print("\nTest: Custom tolerance")

    # Strict tolerance
    config = {
        'tolerance': 0.001,  # 0.1% - very strict
        'return_details': False,  # only the 'match' flag is read below
    }

    try:
        result = evaluate_answer(
            question="What is the value?",
            question_type="metrics-generated",
            gold_answer="100",
            generated_answer="100.5",
            config=config
        )

        # With 0.1% tolerance, 100.5 vs 100 (0.5% error) should NOT match
        nem_match = result['metrics']['numerical_exact_match']['match']
    except Exception as e:
        print(f"✗ Error: {e}")
        return 0, 1

    if nem_match:
        print("✗ Expected no match with strict tolerance")
        return 0, 1

    print("✓ Strict tolerance working: no match for 0.5% error with 0.1% tolerance")
    return 1, 0


def run_all_tests():
    """Run every test suite and print an overall pass/fail summary.

    Each suite is called with no arguments and is expected to return a
    ``(passed, failed)`` tuple. A suite that raises is reported and counted
    as one failure, so a single crash does not stop the remaining suites.

    Returns:
        ``False`` when ``check_api_key()`` is falsy (no suite is run);
        otherwise ``True`` only if no failure was counted.
    """
    print("\n" + "="*70)
    print("COMPREHENSIVE TEST SUITE FOR AGGREGATE EVALUATOR")
    print("="*70)

    if not check_api_key():
        print("\n❌ Cannot run tests without OPENAI_API_KEY")
        print("Please set your API key and try again")
        return False

    total_passed = 0
    total_failed = 0

    test_suites = [
        test_metrics_generated,
        test_novel_generated,
        test_domain_relevant,
        test_invalid_question_type,
        test_config_validation,
        test_return_structure,
        test_custom_config,
    ]

    for test_func in test_suites:
        try:
            passed, failed = test_func()
            total_passed += passed
            total_failed += failed
        except Exception as e:
            # Suite boundary: report the crash and continue with the next suite.
            print(f"✗ Test suite crashed: {e}")
            total_failed += 1

    total = total_passed + total_failed

    print("\n" + "="*70)
    print("FINAL SUMMARY")
    print("="*70)
    print(f"Total Tests: {total}")
    print(f"✓ Passed: {total_passed}")
    print(f"✗ Failed: {total_failed}")
    # Skipped suites return (0, 0), so the total can be zero.
    if total > 0:
        print(f"Success Rate: {100 * total_passed / total:.1f}%")
    print("="*70)

    return total_failed == 0


if __name__ == "__main__":
    # Run the full suite when this module is the entry point.
    success = run_all_tests()

    if success:
        print("\n🎉 ALL TESTS PASSED! 🎉")
    else:
        print("\n⚠️  SOME TESTS FAILED - Review output above")
        print("Note: LLM scores can vary, some variance is expected")


COMPREHENSIVE TEST SUITE FOR AGGREGATE EVALUATOR

TEST SUITE 1: Metrics-Generated Questions

Test: Exact match with format difference
Expected match: True
✓ At least one metric matches expectation
  NEM: True, LLM: True

Test: Within tolerance
Expected match: True
✓ At least one metric matches expectation
  NEM: True, LLM: True

Test: Out of tolerance
Expected match: False
✓ At least one metric matches expectation
  NEM: False, LLM: False

Test: Refusal
Expected match: False
✓ Refusal detected correctly
✓ At least one metric matches expectation
  NEM: False, LLM: False

Metrics-Generated: 4/4 passed

TEST SUITE 2: Novel-Generated Questions

Test: Partial match - missing details
Expected: F1 >= 0.3, Score >= 2
✓ F1: 0.333 >= 0.3, Score: 2 >= 2

Test: Good match with all facts
Expected: F1 >= 0.6, Score >= 3
✓ F1: 0.640 >= 0.6, Score: 4 >= 3

Test: Refusal
Expected: F1 >= 0.0, Score >= 0
✓ F1: 0.000 >= 0.0, Score: 0 >= 0

Novel-Generated: 3/3 passed

TEST SUITE 3: Domain

In [11]:
"""
Aggregate Evaluator - Usage Examples and Integration Guide
==========================================================

This document provides comprehensive examples for using the master evaluation
function across different question types and use cases.

Author: Financial QA Evaluation System
Version: 1.0
"""

# ============================================================================
# BASIC USAGE
# ============================================================================

# Example 1: Metrics-Generated Question
# The gold and generated answers below differ only in formatting
# ("$1577.00" vs "1577 million dollars").
print("="*70)
print("EXAMPLE 1: Metrics-Generated Question")
print("="*70)

result = evaluate_answer(
    question="What is the FY2018 capital expenditure amount (in USD millions) for 3M?",
    question_type="metrics-generated",
    gold_answer="$1577.00",
    generated_answer="1577 million dollars"
)

# Access results
print(f"Question Type: {result['question_type']}")
print(f"Refusal Detected: {result['summary']['refusal_detected']}")
print(f"Metrics Computed: {result['summary']['metrics_computed']}")

# Check numerical exact match
nem = result['metrics']['numerical_exact_match']
print("\nNumerical Exact Match:")
print(f"  Match: {nem['match']}")
print(f"  Category: {nem['error_category']}")

# Check LLM judge
llm = result['metrics']['llm_as_judge_binary']
print("\nLLM as Judge (Binary):")
print(f"  Match: {llm['match']}")
print(f"  Category: {llm['error_category']}")
# 'corrected' is read with .get(), so a missing key is treated as "not corrected".
if llm.get('corrected'):
    print("  ⚠️  Auto-corrected by post-processing")

print()


# Example 2: Novel-Generated Question
print("=" * 70, "EXAMPLE 2: Novel-Generated Question", "=" * 70, sep="\n")

result = evaluate_answer(
    question="Which segment dragged down 3M's overall growth in 2022 excluding M&A?",
    question_type="novel-generated",
    gold_answer="The consumer segment shrunk by 0.9% organically.",
    generated_answer="The Consumer segment has dragged down 3M's overall growth in 2022.",
)

# Token-level F1 metric and the token sets it reports
f1 = result['metrics']['token_f1']
print("Token F1:")
print("  F1 Score: {:.3f}".format(f1['f1']))
print("  Precision: {:.3f}".format(f1['precision']))
print("  Recall: {:.3f}".format(f1['recall']))
print("  Common Tokens: {}".format(f1['common_tokens']))
print("  Missing Tokens: {}".format(f1['missing_tokens']))

# Graded LLM judge: score out of 4 plus the facts it extracted
llm = result['metrics']['llm_as_judge_graded']
print("\nLLM as Judge (Graded):")
print("  Score: {}/4".format(llm['score']))
print("  Key Facts: {}".format(len(llm['key_facts_gold'])))
print("  Facts Present: {}".format(len(llm['facts_present'])))
print("  Facts Missing: {}".format(llm['facts_missing']))
# Only the first 100 characters of the justification are shown
print("  Justification: {}...".format(llm['justification'][:100]))

print()


# Example 3: Domain-Relevant Question
print("=" * 70, "EXAMPLE 3: Domain-Relevant Question", "=" * 70, sep="\n")

result = evaluate_answer(
    question="Does AMD have a reasonably healthy liquidity profile based on its quick ratio for FY22?",
    question_type="domain-relevant",
    gold_answer="Yes. The quick ratio is 1.57, calculated as (cash and cash equivalents+Short term investments+Accounts receivable, net+receivables from related parties)/ (current liabilities).",
    generated_answer="Yes, AMD has a reasonably healthy liquidity profile based on its quick ratio for FY22.",
)

# The graded LLM judge is the only metric read for domain-relevant questions
llm = result['metrics']['llm_as_judge_graded']
print("LLM as Judge (Graded):")
print("  Score: {}/4".format(llm['score']))
print("  Justification: {}".format(llm['justification']))

print()


# ============================================================================
# CUSTOM CONFIGURATION
# ============================================================================

print("="*70)
print("CUSTOM CONFIGURATION")
print("="*70)

# Get default config and modify
config = get_default_config()
print(f"\nDefault tolerance: {config['tolerance']}")

# Make it stricter
config['tolerance'] = 0.005  # 0.5% instead of 1%
config['return_details'] = False  # Faster, less data

result = evaluate_answer(
    question="What is the margin?",
    question_type="metrics-generated",
    gold_answer="24.5%",
    generated_answer="24.6%",
    config=config
)

nem = result['metrics']['numerical_exact_match']

# 'relative_error' is read defensively. The percent sign is appended only
# when a value is present, so a missing value prints "N/A" instead of "N/A%".
relative_error = nem.get('relative_error')
relative_error_text = "N/A" if relative_error is None else f"{relative_error}%"

print("\nWith strict 0.5% tolerance:")
print("  Generated: 24.6% vs Gold: 24.5%")
print(f"  Relative Error: {relative_error_text}")
print(f"  Match: {nem['match']}")
print(f"  Category: {nem['error_category']}")

print()


# ============================================================================
# BATCH PROCESSING PATTERN
# ============================================================================

print("="*70)
print("BATCH PROCESSING PATTERN")
print("="*70)

# Simulate evaluating multiple questions
questions = [
    {
        'question': "What is the FY2018 capex?",
        'question_type': "metrics-generated",
        'gold_answer': "$1577.00",
        'generated_answer': "1577 million"
    },
    {
        'question': "Which segment declined?",
        'question_type': "novel-generated",
        'gold_answer': "The consumer segment shrunk by 0.9%.",
        'generated_answer': "Consumer segment."
    },
    {
        'question': "Is liquidity healthy?",
        'question_type': "domain-relevant",
        'gold_answer': "Yes. The quick ratio is 1.57.",
        'generated_answer': "Yes, liquidity is healthy."
    }
]

results = []
failures = []  # (index, exception) for items that could not be evaluated
for i, q in enumerate(questions):
    print(f"\nEvaluating {i+1}/{len(questions)}: {q['question_type']}")

    try:
        result = evaluate_answer(
            question=q['question'],
            question_type=q['question_type'],
            gold_answer=q['gold_answer'],
            generated_answer=q['generated_answer']
        )
    except Exception as e:
        # Batch boundary: report the failure and keep going, so one bad item
        # does not discard the results already collected.
        failures.append((i, e))
        print(f"  Failed: {e}")
        continue

    results.append(result)
    print(f"  Complete: {result['summary']['evaluation_complete']}")

print(f"\nBatch complete: {len(results)} evaluations")
if failures:
    print(f"  Failed: {len(failures)}")

print()


# ============================================================================
# ERROR HANDLING
# ============================================================================

print("="*70)
print("ERROR HANDLING")
print("="*70)

# Example 1: Invalid question type
print("\nExample 1: Invalid question type")
try:
    result = evaluate_answer(
        question="Test",
        question_type="invalid-type",
        gold_answer="Answer",
        generated_answer="Answer"
    )
except ValueError as e:
    print(f"✓ Caught ValueError: {e}")
else:
    # Make the unexpected path visible instead of silently printing nothing.
    print("✗ Should have raised ValueError")

# Example 2: Handling refusals
print("\nExample 2: Handling refusals")
result = evaluate_answer(
    question="What is the ratio?",
    question_type="metrics-generated",
    gold_answer="0.66",
    generated_answer="I cannot calculate this without data"
)

if result['summary']['refusal_detected']:
    print("✓ Refusal detected:")
    print(f"  Type: {result['refusal_check']['refusal_type']}")
    # 'matched_pattern' is optional here; fall back to 'N/A' when absent.
    print(f"  Matched Pattern: {result['refusal_check'].get('matched_pattern', 'N/A')}")

print()


# ============================================================================
# ANALYSIS PATTERNS
# ============================================================================

print("="*70)
print("ANALYSIS PATTERNS")
print("="*70)

# Pattern 1: Compare rule-based vs LLM for metrics-generated
print("\nPattern 1: Rule-based vs LLM comparison")

result = evaluate_answer(
    question="What is the revenue?",
    question_type="metrics-generated",
    gold_answer="$1.577 billion",
    generated_answer="1577 million"
)

nem_match = result['metrics']['numerical_exact_match']['match']
llm_match = result['metrics']['llm_as_judge_binary']['match']

print("  Gold: '$1.577 billion'")
print("  Generated: '1577 million'")
print(f"  Rule-based match: {nem_match}")
print(f"  LLM match: {llm_match}")
print(f"  Agreement: {nem_match == llm_match}")

# A disagreement between the two metrics is surfaced together with the
# LLM's justification so it can be reviewed.
if nem_match != llm_match:
    print("  ⚠️  Disagreement detected - investigate!")
    print(f"  LLM Justification: {result['metrics']['llm_as_judge_binary']['justification']}")


# Pattern 2: Token F1 vs LLM for novel-generated
print("\nPattern 2: Token F1 vs LLM semantic comparison")

result = evaluate_answer(
    question="What happened to the segment?",
    question_type="novel-generated",
    gold_answer="The consumer segment shrunk by 0.9% organically.",
    generated_answer="Consumer declined."
)

f1_score = result['metrics']['token_f1']['f1']
llm_score = result['metrics']['llm_as_judge_graded']['score']

print(f"  Token F1: {f1_score:.3f}")
print(f"  LLM Score: {llm_score}/4")

# Flag the two cases where token overlap and the LLM grade point in
# opposite directions; nothing is printed when they roughly agree.
if f1_score < 0.3 and llm_score >= 2:
    print("  💡 Low token overlap but decent LLM score - semantic equivalence detected")
elif f1_score > 0.5 and llm_score < 2:
    print("  ⚠️  High token overlap but low LLM score - investigate!")

print()


# ============================================================================
# AGGREGATION AND REPORTING
# ============================================================================

print("=" * 70, "AGGREGATION AND REPORTING", "=" * 70, sep="\n")

# Mix of question types: (question_type, gold answer, generated answer, flag).
# The fourth field is unpacked into `_` below and not used.
test_data = [
    ("metrics-generated", "$1577", "1577", True),
    ("metrics-generated", "24.5%", "30%", False),
    ("novel-generated", "Consumer segment declined", "Consumer declined", True),
    ("domain-relevant", "Yes, healthy liquidity", "Yes, liquidity is good", True),
]

# Simulate evaluating a dataset
results = []
for qtype, gold, gen, _ in test_data:
    result = evaluate_answer(
        question="Sample question",
        question_type=qtype,
        gold_answer=gold,
        generated_answer=gen,
    )
    results.append(result)

# Per-type tallies
metrics_generated_count = len([r for r in results if r['question_type'] == 'metrics-generated'])
novel_generated_count = len([r for r in results if r['question_type'] == 'novel-generated'])
domain_relevant_count = len([r for r in results if r['question_type'] == 'domain-relevant'])

print("\nDataset Statistics:")
print("  Total: {}".format(len(results)))
print("  Metrics-generated: {}".format(metrics_generated_count))
print("  Novel-generated: {}".format(novel_generated_count))
print("  Domain-relevant: {}".format(domain_relevant_count))

# Share of evaluations where a refusal was flagged
refusal_count = len([r for r in results if r['summary']['refusal_detected']])
print("  Refusal rate: {:.1f}%".format(100*refusal_count/len(results)))

# Share of evaluations marked complete
success_count = len([r for r in results if r['summary']['evaluation_complete']])
print("  Success rate: {:.1f}%".format(100*success_count/len(results)))

print()


# ============================================================================
# PRETTY PRINTING
# ============================================================================

print("=" * 70, "PRETTY PRINTING", "=" * 70, sep="\n")

result = evaluate_answer(
    question="What is the FY2018 capital expenditure?",
    question_type="metrics-generated",
    gold_answer="$1577.00",
    generated_answer="1577 million dollars",
)

# Hand the result to the summary printer
print_evaluation_summary(result)


# ============================================================================
# INTEGRATION WITH RAG SYSTEM
# ============================================================================

# Section banner, preceded by a blank line
print("\n" + "=" * 70, "INTEGRATION WITH RAG SYSTEM", "=" * 70, sep="\n")

def evaluate_rag_output(question_data, generated_answer, config=None):
    """
    Example function showing how to integrate with your RAG system.

    Maps a dataset record onto the keyword arguments of ``evaluate_answer``;
    the record's ``'answer'`` field is passed as the gold answer.

    Args:
        question_data: Dict with 'question', 'question_type', 'answer' (gold)
        generated_answer: String from your RAG system
        config: Optional configuration dict forwarded unchanged to
            ``evaluate_answer``. Defaults to ``None``, which is also that
            function's own default, so existing two-argument calls behave
            as before.

    Returns:
        Evaluation result as returned by ``evaluate_answer``

    Raises:
        KeyError: If ``question_data`` lacks one of the three keys above.
    """
    return evaluate_answer(
        question=question_data['question'],
        question_type=question_data['question_type'],
        gold_answer=question_data['answer'],
        generated_answer=generated_answer,
        config=config,
    )

# Example usage: a dataset-style record whose gold answer lives under 'answer'
financebench_question = dict(
    question="What is the FY2018 capital expenditure amount (in USD millions) for 3M?",
    question_type="metrics-generated",
    answer="$1577.00",
)

rag_output = "The FY2018 capital expenditure for 3M was 1577 million dollars."

result = evaluate_rag_output(financebench_question, rag_output)

print("\nRAG Evaluation:")
# The question is shortened to its first 60 characters for display
print("  Question: {}...".format(result['question'][:60]))
print("  Type: {}".format(result['question_type']))
print("  Metrics: {}".format(', '.join(result['summary']['metrics_computed'])))
print("  Complete: {}".format(result['summary']['evaluation_complete']))

print()


# ============================================================================
# SUMMARY
# ============================================================================

print("="*70)
print("SUMMARY - Key Takeaways")
print("="*70)

# Closing recap printed verbatim; the glyphs are the intended
# check-mark and arrow characters.
print("""
The Aggregate Evaluator provides:

1. ✅ Automatic routing to appropriate metrics based on question type
2. ✅ Comprehensive evaluation with multiple metrics per question
3. ✅ Pre-check refusal detection to catch obvious failures
4. ✅ Structured, consistent output format
5. ✅ Configurable parameters for different use cases
6. ✅ Error handling with exceptions for reliability

Usage Pattern:
    result = evaluate_answer(
        question=question,
        question_type=question_type,  # metrics/novel/domain-relevant
        gold_answer=gold,
        generated_answer=generated,
        config=config  # optional
    )

Access Results:
    - result['metrics'][metric_name] → Full metric results
    - result['summary'] → High-level overview
    - result['refusal_check'] → Refusal detection results

Best Practices:
    - Always handle ValueError for invalid question types
    - Check result['summary']['evaluation_complete'] before using results
    - Compare metrics (rule-based vs LLM) for insights
    - Use custom config for domain-specific tolerances
    - Batch process with proper error handling
""")

print("="*70)
print("Ready to evaluate your FinanceBench dataset! 🚀")
print("="*70)

EXAMPLE 1: Metrics-Generated Question
Question Type: metrics-generated
Refusal Detected: False
Metrics Computed: ['numerical_exact_match', 'llm_as_judge_binary']

Numerical Exact Match:
  Match: True
  Category: exact_match

LLM as Judge (Binary):
  Match: True
  Category: exact_match

EXAMPLE 2: Novel-Generated Question
Token F1:
  F1 Score: 0.300
  Precision: 0.250
  Recall: 0.375
  Common Tokens: {'consumer', 'the', 'segment'}
  Missing Tokens: {'9', 'shrunk', 'by', 'organically', '0'}

LLM as Judge (Graded):
  Score: 2/4
  Key Facts: 4
  Facts Present: 2
  Facts Missing: ['0.9%', 'organically']
  Justification: The generated answer correctly identifies the consumer segment as the factor affecting 3M's growth, ...

EXAMPLE 3: Domain-Relevant Question
LLM as Judge (Graded):
  Score: 3/4
  Justification: The generated answer correctly affirms that AMD has a healthy liquidity profile and mentions the quick ratio, but it omits the specific value of 1.57 and the calculation formula. Whil