In [16]:
import os
from dotenv import load_dotenv

# Merge variables from a local .env file (if any) into the process environment.
load_dotenv()

# Resolves to None when the variable is not set anywhere.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [22]:
"""
Binary LLM-as-Judge for Numerical Answer Validation
====================================================
This module provides LLM-based binary validation for metrics-generated questions.
Supplements numerical_exact_match by handling complex numerical expressions,
scale conversions, and providing reasoning for matches/mismatches.

Use Cases:
- Complex numerical expressions ("1.577 billion" vs "$1,577 million")
- Context-dependent parsing ("24.5%" vs "0.245")
- Debugging numerical_exact_match failures
- Comparison studies (rule-based vs LLM approaches)

Author: Financial QA Evaluation System
Version: 1.0
"""

import time
from typing import Dict, Any, Optional
from pydantic import BaseModel, Field


class BinaryJudgment(BaseModel):
    """
    Pydantic schema for binary numerical judgment output.
    Ensures structured and parseable response from LLM.

    The field descriptions below are part of the schema handed to the model
    via ``with_structured_output``, so they double as instructions to the LLM.

    NOTE(review): the Optional fields declare no explicit default; whether
    that makes them required-but-nullable or defaulted to None depends on the
    installed pydantic major version - confirm before relying on either.
    """
    
    # Final verdict reported by the LLM (may be overridden by the caller's
    # own recomputation of the relative error).
    match: bool = Field(
        description="Whether the generated answer matches gold within tolerance"
    )
    
    # Number the LLM extracted from the gold answer.
    gold_number: Optional[float] = Field(
        description="Numerical value extracted from gold answer (None if unparseable)"
    )
    
    # Number the LLM extracted from the generated answer.
    generated_number: Optional[float] = Field(
        description="Numerical value extracted from generated answer (None if unparseable)"
    )
    
    # LLM-reported relative error, expressed as a percentage (e.g. 0.19 for 0.19%).
    relative_error: Optional[float] = Field(
        description="Relative error as percentage (None if not applicable)"
    )
    
    # LLM-reported |generated - gold|.
    absolute_error: Optional[float] = Field(
        description="Absolute difference between numbers (None if not applicable)"
    )
    
    # Free-form string; the allowed values are only suggested through the
    # description, not enforced by the type.
    error_category: str = Field(
        description="Category: exact_match, within_tolerance, out_of_tolerance, refusal, unparseable"
    )
    
    # Short natural-language rationale for the verdict.
    justification: str = Field(
        description="Brief explanation (1-2 sentences) of the judgment and reasoning"
    )


def llm_as_judge_binary(
    question: str,
    gold_answer: str,
    generated_answer: str,
    tolerance: float = 0.01,
    provider: str = "openai",
    model: str = "gpt-4o-mini",
    temperature: float = 0.0,
    max_retries: int = 3,
    retry_delay_ms: int = 500,
    return_details: bool = True
) -> Dict[str, Any]:
    """
    Evaluate a numerical answer with an LLM judge and verify its arithmetic.

    The LLM extracts a number from the gold and the generated answer and
    returns a ``BinaryJudgment``. Whenever both numbers were extracted, the
    absolute/relative errors are recomputed locally and the LLM's ``match`` /
    ``error_category`` are overridden if they contradict the recomputed
    relative error. This evaluator supplements the rule-based
    numerical_exact_match for complex expressions and scale conversions.

    Args:
        question: The question being answered (provides context)
        gold_answer: The gold standard answer (ground truth)
        generated_answer: The generated answer to evaluate
        tolerance: Relative tolerance for matching (0.01 = 1% difference allowed)
        provider: Provider label. The client is always ``ChatOpenAI``; the
            value only controls whether the JSON ``response_format`` model
            kwarg is passed (it is for ``"openai"``) and is echoed in metadata.
        model: Model name (e.g., 'gpt-4o-mini')
        temperature: Temperature for generation (0.0 for deterministic)
        max_retries: Maximum number of attempts made by the retry helper
        retry_delay_ms: Delay between attempts in milliseconds
        return_details: If True, include full LLM response and metadata

    Returns:
        Dictionary containing:
            - match: bool - Whether answers match within tolerance
            - gold_num: Optional[float] - Extracted gold number
            - gen_num: Optional[float] - Extracted generated number
            - relative_error: Optional[float] - Relative error as percentage.
              Recomputed locally when both numbers exist; ``inf`` when the
              gold number is 0 and the generated number is not.
            - absolute_error: Optional[float] - Absolute difference
            - error_category: str - One of: exact_match, within_tolerance,
                                    out_of_tolerance, refusal, unparseable
            - justification: str - LLM's reasoning (plus any auto-correction note)
            - success: bool - Whether LLM call succeeded
            - warning: str - Only present when the LLM's relative error
              differs from the recomputed one by more than 0.05 points
            - corrected: bool - Only present (True) when match/category
              were overridden
            - error: str - Only present when the LLM call failed
            - raw_response: dict - Full LLM response (if return_details=True
              and the call succeeded)
            - metadata: dict - Call information (if return_details=True and
              the call succeeded)

    Error Categories:
        - exact_match: Numbers are identical (within floating point precision)
        - within_tolerance: Numbers differ but within tolerance threshold
        - out_of_tolerance: Numbers differ beyond tolerance
        - refusal: Generated answer is a refusal ("I don't know", etc.)
        - unparseable: Cannot extract valid number from generated answer
          (also used for the failure result when every LLM attempt errors)

    Examples:
        >>> # Exact match with different formats
        >>> result = llm_as_judge_binary(
        ...     question="What is the FY2018 capex for 3M in millions?",
        ...     gold_answer="$1577.00",
        ...     generated_answer="1577 million dollars",
        ...     tolerance=0.01
        ... )
        >>> print(result['match'])  # True
        >>> print(result['error_category'])  # 'exact_match'

        >>> # Scale conversion
        >>> result = llm_as_judge_binary(
        ...     question="What is the revenue?",
        ...     gold_answer="$1.577 billion",
        ...     generated_answer="1577 million",
        ...     tolerance=0.01
        ... )
        >>> print(result['match'])  # True
        >>> print(result['justification'])  # "1.577 billion equals 1577 million..."
    """

    # Imported lazily so the module can be loaded without langchain installed.
    from langchain_openai import ChatOpenAI

    # Create LLM with structured output
    llm = ChatOpenAI(
        model=model,
        temperature=temperature,
        model_kwargs={"response_format": {"type": "json_object"}} if provider == "openai" else {}
    )

    # Apply structured output schema
    structured_llm = llm.with_structured_output(BinaryJudgment)

    # Construct evaluation prompt with few-shot examples
    prompt = _create_binary_prompt(question, gold_answer, generated_answer, tolerance)

    # Call LLM with retry logic
    try:
        judgment = _call_llm_with_retry(
            structured_llm,
            prompt,
            max_retries=max_retries,
            retry_delay_ms=retry_delay_ms
        )
    except Exception as e:
        # If all retries fail, return error result
        return {
            'match': False,
            'gold_num': None,
            'gen_num': None,
            'relative_error': None,
            'absolute_error': None,
            'error_category': 'unparseable',
            'justification': f"LLM evaluation failed after {max_retries} retries: {str(e)}",
            'error': str(e),
            'success': False
        }

    # Build result dictionary - start with LLM's judgment
    result = {
        'match': judgment.match,
        'gold_num': judgment.gold_number,
        'gen_num': judgment.generated_number,
        'relative_error': judgment.relative_error,
        'absolute_error': judgment.absolute_error,
        'error_category': judgment.error_category,
        'justification': judgment.justification,
        'success': True
    }

    # POST-PROCESSING: validate and correct LLM arithmetic/logic errors.
    gold_value = judgment.gold_number
    generated_value = judgment.generated_number
    if gold_value is not None and generated_value is not None:
        abs_error = abs(generated_value - gold_value)

        # Relative error as a percentage. A gold value of 0 has no defined
        # relative error, so treat identical values as 0% and anything else
        # as infinitely far off instead of dividing by zero.
        if gold_value != 0:
            rel_error = (abs_error / abs(gold_value)) * 100
        elif abs_error == 0:
            rel_error = 0.0
        else:
            rel_error = float("inf")

        # Flag LLM miscalculations (allow 0.05 points of rounding slack).
        if judgment.relative_error is not None:
            calc_diff = abs(rel_error - judgment.relative_error)
            if calc_diff > 0.05:
                result['warning'] = f"LLM calculated relative_error={judgment.relative_error}%, but should be {rel_error:.3f}%"

        # The verdict must agree with the recomputed error, not the LLM's.
        expected_match = rel_error <= (tolerance * 100)

        if expected_match and not judgment.match:
            # LLM said NO but should be YES - correct it
            result['match'] = True
            result['error_category'] = 'exact_match' if rel_error == 0 else 'within_tolerance'
            result['justification'] += f" [Auto-corrected: relative error {rel_error:.3f}% ≤ {tolerance*100}% tolerance]"
            result['corrected'] = True
        elif not expected_match and judgment.match:
            # LLM said YES but should be NO - correct it
            result['match'] = False
            result['error_category'] = 'out_of_tolerance'
            result['justification'] += f" [Auto-corrected: relative error {rel_error:.3f}% > {tolerance*100}% tolerance]"
            result['corrected'] = True

        # Use our calculation for error values (more reliable than LLM)
        result['relative_error'] = rel_error
        result['absolute_error'] = abs_error

    if return_details:
        result['raw_response'] = judgment.model_dump()
        result['metadata'] = {
            'provider': provider,
            'model': model,
            'temperature': temperature,
            'tolerance': tolerance,
            'question': question,
            'gold_answer': gold_answer,
            'generated_answer': generated_answer
        }

    return result


def _create_binary_prompt(
    question: str,
    gold_answer: str,
    generated_answer: str,
    tolerance: float
) -> str:
    """
    Create the evaluation prompt with few-shot examples for binary judgment.

    The tolerance is rendered as a percentage with compact formatting
    (``0.01`` -> ``1``, ``0.07`` -> ``7``) so binary floating-point noise such
    as ``7.000000000000001`` never reaches the model. The worked comparison
    example in the "critical comparison rule" section is computed against the
    actual tolerance, so it stays correct for tolerances other than 1%.

    Args:
        question: The question being answered
        gold_answer: Gold standard answer
        generated_answer: Generated answer to evaluate
        tolerance: Relative tolerance (e.g., 0.01 = 1%)

    Returns:
        Formatted prompt string
    """

    # Percentage form of the tolerance, formatted for display in the prompt.
    tolerance_percent = f"{tolerance * 100:g}"

    # Verdict for the fixed 0.191% worked example under the *actual* tolerance.
    if 0.191 <= tolerance * 100:
        worked_verdict = "YES → within_tolerance → match=TRUE"
    else:
        worked_verdict = "NO → out_of_tolerance → match=FALSE"

    prompt = f"""You are an expert evaluator for a financial question-answering system. Your task is to determine if a generated numerical answer matches the gold standard answer within a specified tolerance.

**Your Task:**
1. Extract the numerical value from both gold and generated answers
2. Handle different formats (currency, percentages, scales like million/billion)
3. Calculate the relative error if both numbers are valid
4. Determine if the match is within tolerance: {tolerance_percent}%
5. Categorize the result and provide justification

**Tolerance Definition:**
- Relative tolerance: {tolerance_percent}% means the generated number can differ by up to {tolerance_percent}% of the gold number
- Formula: relative_error = (|generated - gold| / |gold|) × 100%
- The tolerance value {tolerance_percent}% is expressed as a PERCENTAGE, not a decimal
- Example 1: If gold=100 and tolerance=1%, generated can be 99-101 (1% of 100 = 1)
- Example 2: If gold=1577 and tolerance=1%, generated can be 1561.23-1592.77 (1% of 1577 = 15.77)
- Example 3: If relative_error=0.191% and tolerance=1%, compare 0.191 < 1.0 → WITHIN tolerance
- Example 4: If relative_error=2.5% and tolerance=1%, compare 2.5 > 1.0 → OUT OF tolerance

**CRITICAL COMPARISON RULE:**
When you calculate relative_error as a percentage (like 0.191%), compare it directly to {tolerance_percent}:
- If relative_error ≤ {tolerance_percent} → WITHIN tolerance → match=TRUE
- If relative_error > {tolerance_percent} → OUT OF tolerance → match=FALSE

Example: relative_error=0.191% and tolerance={tolerance_percent}%
→ Is 0.191 ≤ {tolerance_percent}?
→ 0.191 ≤ {tolerance_percent}? → {worked_verdict}

**Error Categories and Match Rules:**
- **exact_match**: Numbers are identical (or effectively identical within floating point precision) → **match=TRUE**
- **within_tolerance**: Numbers differ but relative error ≤ {tolerance_percent}% → **match=TRUE**
- **out_of_tolerance**: Numbers differ and relative error > {tolerance_percent}% → **match=FALSE**
- **refusal**: Generated answer refuses to provide a number ("I don't know", "cannot calculate", etc.) → **match=FALSE**
- **unparseable**: Cannot extract a valid number from generated answer → **match=FALSE**

**CRITICAL RULE**: If relative_error ≤ {tolerance_percent}%, then match MUST be TRUE and error_category MUST be either "exact_match" (if error is 0) or "within_tolerance".

**Important Guidelines:**
- Handle format variations: "$1577", "1577 dollars", "USD 1577" are all the same
- Handle scale conversions: "1.577 billion" = "1577 million"
- Handle percentage formats: "24.5%" and "0.245" may be the same depending on context
- Negative numbers: "-3.7" and "(3.7)" in accounting notation are the same
- Accounting notation: "(3.7)" means -3.7
- If generated answer has qualifiers like "approximately", still extract the number

---

**Few-Shot Examples:**

**Example 1 - Exact Match with Different Formats:**
Question: "What is the FY2018 capital expenditure amount (in USD millions) for 3M?"
Gold Answer: "$1577.00"
Generated Answer: "1577 million dollars"
Tolerance: 1%

Evaluation:
- gold_number: 1577.0
- generated_number: 1577.0
- absolute_error: 0.0
- relative_error: 0.0%
- match: true
- error_category: "exact_match"
- justification: "Both answers represent $1577 million. The format differs but the numerical value is identical."

---

**Example 2 - Within Tolerance:**
Question: "What is Adobe's FY2016 unadjusted operating income margin (as percent of total revenue)?"
Gold Answer: "24.5%"
Generated Answer: "24.48%"
Tolerance: 1%

Evaluation:
- gold_number: 24.5
- generated_number: 24.48
- absolute_error: 0.02
- relative_error: 0.08% (calculated as: |24.48-24.5|/24.5 × 100 = 0.02/24.5 × 100 = 0.0817%)
- **Comparison: Is 0.08% ≤ 1%? YES, 0.08 < 1.0**
- match: true (BECAUSE relative_error 0.08% is LESS than tolerance 1%)
- error_category: "within_tolerance"
- justification: "The generated answer (24.48%) is within 1% tolerance of the gold answer (24.5%). The relative error is 0.08%, which is less than 1%, so this is a match."

---

**Example 2b - Within Tolerance (Worked Example with Similar Numbers):**
Question: "What is the capital expenditure?"
Gold Answer: "1577"
Generated Answer: "1580"
Tolerance: 1%

Evaluation:
- gold_number: 1577.0
- generated_number: 1580.0
- absolute_error: 3.0 (calculated as: |1580-1577| = 3)
- relative_error: 0.190% (calculated as: 3/1577 × 100 = 0.190%)
- **Comparison: Is 0.190% ≤ 1%? YES, 0.190 < 1.0**
- match: true (BECAUSE relative_error 0.190% is LESS than tolerance 1%)
- error_category: "within_tolerance"
- justification: "The generated answer (1580) differs from gold (1577) by 3 units, resulting in a relative error of 0.190%. Since 0.190% < 1%, this is within tolerance and is a match."

---

**Example 3 - Out of Tolerance (Wrong Number):**
Question: "Roughly how many times has AES Corporation sold its inventory in FY2022?"
Gold Answer: "9.5"
Generated Answer: "AES Corporation sold its inventory roughly 12 times in FY2022."
Tolerance: 1%

Evaluation:
- gold_number: 9.5
- generated_number: 12.0
- absolute_error: 2.5
- relative_error: 26.3% (2.5/9.5 × 100)
- match: false
- error_category: "out_of_tolerance"
- justification: "The generated number (12.0) differs significantly from the gold answer (9.5) with a relative error of 26.3%, far exceeding the 1% tolerance."

---

**Example 4 - Refusal:**
Question: "What is the FY2019 fixed asset turnover ratio for Activision Blizzard?"
Gold Answer: "0.66"
Generated Answer: "I cannot calculate this ratio without access to the specific financial statements."
Tolerance: 1%

Evaluation:
- gold_number: 0.66
- generated_number: null
- absolute_error: null
- relative_error: null
- match: false
- error_category: "refusal"
- justification: "The generated answer is a refusal to provide a numerical value rather than an actual answer."

---

**Example 5 - Scale Conversion (Billion vs Million):**
Question: "What is the total revenue for FY2021?"
Gold Answer: "$1.577 billion"
Generated Answer: "Total revenue was 1577 million dollars"
Tolerance: 1%

Evaluation:
- gold_number: 1577.0 (converted to millions for comparison)
- generated_number: 1577.0
- absolute_error: 0.0
- relative_error: 0.0%
- match: true
- error_category: "exact_match"
- justification: "The answers are identical: 1.577 billion equals 1577 million. Both represent the same value with different scale notation."

---

**Example 6 - Percentage vs Decimal (Context Matters):**
Question: "What is the FY2022 operating margin as a percentage?"
Gold Answer: "24.5%"
Generated Answer: "The operating margin is 0.245"
Tolerance: 1%

Evaluation:
- gold_number: 24.5 (keep as percentage since question asks "as a percentage")
- generated_number: 0.245 (this is decimal form, should be 24.5% for comparison)
- absolute_error: 24.255
- relative_error: 99.0%
- match: false
- error_category: "out_of_tolerance"
- justification: "The question asks for a percentage. Gold answer is 24.5%, but generated provides 0.245 (decimal form). While mathematically equivalent, they don't match in the expected format. If converted properly, 0.245 = 24.5%, which would be an exact match."

Note: For percentage questions, consider whether to compare as percentages (24.5) or decimals (0.245). Context from the question helps determine the expected format.

---

**Example 7 - Negative Numbers (Accounting Notation):**
Question: "What is the net income change?"
Gold Answer: "-3.7"
Generated Answer: "(3.7)"
Tolerance: 1%

Evaluation:
- gold_number: -3.7
- generated_number: -3.7 (accounting notation: parentheses mean negative)
- absolute_error: 0.0
- relative_error: 0.0%
- match: true
- error_category: "exact_match"
- justification: "Both represent -3.7. The generated answer uses accounting notation (parentheses) to indicate a negative number."

---

**Example 8 - Unparseable Answer:**
Question: "What is the inventory turnover ratio?"
Gold Answer: "9.5"
Generated Answer: "The ratio varies depending on the quarter and specific inventory category considered."
Tolerance: 1%

Evaluation:
- gold_number: 9.5
- generated_number: null
- absolute_error: null
- relative_error: null
- match: false
- error_category: "unparseable"
- justification: "The generated answer does not contain a numerical value. It provides an explanation without giving the actual number."

---

**Now evaluate the following:**

**Question:** {question}
**Gold Answer:** {gold_answer}
**Generated Answer:** {generated_answer}
**Tolerance:** {tolerance_percent}%

**Step-by-step evaluation process:**
1. Extract the numerical value from gold answer → gold_number
2. Extract the numerical value from generated answer → generated_number
3. If both numbers exist:
   a. Calculate absolute_error = |generated_number - gold_number|
   b. Calculate relative_error = (absolute_error / |gold_number|) × 100
   c. **COMPARE relative_error to tolerance {tolerance_percent}:**
      - If relative_error = 0% → **match=TRUE**, error_category="exact_match"
      - If 0% < relative_error ≤ {tolerance_percent}% → **match=TRUE**, error_category="within_tolerance"
        * Example: If relative_error = 0.19% and tolerance = 1%, then 0.19 ≤ 1.0 → TRUE → match=TRUE
      - If relative_error > {tolerance_percent}% → **match=FALSE**, error_category="out_of_tolerance"
        * Example: If relative_error = 2.5% and tolerance = 1%, then 2.5 > 1.0 → TRUE → match=FALSE
4. If generated is refusal → **match=FALSE**, error_category="refusal"
5. If cannot parse generated → **match=FALSE**, error_category="unparseable"

**VERIFICATION STEP - Before finalizing your answer, verify:**
- If you calculated relative_error as X%, is X ≤ {tolerance_percent}?
- If YES → match MUST be TRUE and error_category MUST be "exact_match" or "within_tolerance"
- If NO → match MUST be FALSE and error_category MUST be "out_of_tolerance"
- Double-check your comparison: compare relative_error value to {tolerance_percent} value directly

**CRITICAL**: The 'match' field MUST be consistent with the error_category:
- exact_match or within_tolerance → match=TRUE
- out_of_tolerance, refusal, or unparseable → match=FALSE

**SANITY CHECK BEFORE SUBMITTING YOUR ANSWER:**
1. Did you calculate relative_error as a percentage? (e.g., 0.19%)
2. Did you compare it to tolerance value {tolerance_percent}%? (e.g., is 0.19 ≤ {tolerance_percent}?)
3. If relative_error ≤ {tolerance_percent}, did you set match=TRUE?
4. Does your justification match your match value?
5. NEVER say "within tolerance" and then mark as "out_of_tolerance"
6. NEVER say a number is "less than {tolerance_percent}%" and then set match=FALSE

Provide your evaluation in the structured format with:
1. match (true/false) - MUST follow the rules above
2. gold_number (extracted number or null)
3. generated_number (extracted number or null)
4. relative_error (percentage or null) - express as X% where X is the number
5. absolute_error (absolute difference or null)
6. error_category (exact_match, within_tolerance, out_of_tolerance, refusal, unparseable)
7. justification (1-2 sentences, MUST be logically consistent with match and error_category)


**Important**:
- Extract numbers carefully considering the question context
- For percentages, maintain consistency: if gold is "24.5%", treat generated "0.245" as potentially 24.5% depending on question wording
- For scale (million/billion), normalize to the same unit before comparing
- Be precise with relative error calculation: (|gen - gold| / |gold|) × 100%
"""

    return prompt


def _call_llm_with_retry(
    llm,
    prompt: str,
    max_retries: int = 3,
    retry_delay_ms: int = 500
) -> "BinaryJudgment":
    """
    Call LLM with retry logic on failure.

    ``llm.invoke(prompt)`` is attempted up to ``max_retries`` times in total.
    Any exception triggers another attempt after ``retry_delay_ms``
    milliseconds; no delay is added after the final failed attempt.

    Args:
        llm: Object exposing ``invoke(prompt)`` (here: a LangChain LLM with
            structured output)
        prompt: Evaluation prompt
        max_retries: Total number of attempts; must be at least 1
        retry_delay_ms: Delay between attempts in milliseconds

    Returns:
        Whatever ``llm.invoke`` returns (a BinaryJudgment for the structured
        LLM used in this module)

    Raises:
        ValueError: If ``max_retries`` is less than 1 (no attempt could be made)
        Exception: If every attempt fails; the last underlying error is
            attached as ``__cause__``
    """

    if max_retries < 1:
        # Previously this fell through to a confusing "LLM call failed: None".
        raise ValueError(f"max_retries must be at least 1, got {max_retries}")

    for attempt in range(1, max_retries + 1):
        try:
            return llm.invoke(prompt)
        except Exception as e:
            if attempt == max_retries:
                # All attempts exhausted: surface the last error, chained so
                # the original traceback stays attached.
                raise Exception(
                    f"LLM call failed after {max_retries} attempts. Last error: {str(e)}"
                ) from e
            # Wait before the next attempt
            time.sleep(retry_delay_ms / 1000.0)

    # The loop always returns or raises; this only guards against future edits.
    raise RuntimeError("retry loop exited without a result")


# ============================================================================
# TESTING FUNCTIONS
# ============================================================================

def _test_llm_as_judge_binary():
    """Smoke-test the binary judge on four representative cases (needs API access)."""

    print("Testing Binary LLM-as-Judge...")
    print("NOTE: This requires OpenAI API key to be set")
    print()

    # (heading, question, gold, generated, which extra line to print)
    scenarios = [
        (
            "Test 1: Exact match with different formats",
            "What is the FY2018 capital expenditure amount (in USD millions) for 3M?",
            "$1577.00",
            "1577 million dollars",
            "numbers",
        ),
        (
            "Test 2: Within tolerance",
            "What is the operating margin?",
            "24.5%",
            "24.48%",
            "relative_error",
        ),
        (
            "Test 3: Out of tolerance",
            "How many times has AES sold inventory?",
            "9.5",
            "12 times",
            "relative_error",
        ),
        (
            "Test 4: Refusal detection",
            "What is the ratio?",
            "0.66",
            "I cannot calculate without the data",
            None,
        ),
    ]

    final_index = len(scenarios) - 1
    for index, (heading, prompt_question, gold, generated, extra) in enumerate(scenarios):
        print(heading)
        outcome = llm_as_judge_binary(
            question=prompt_question,
            gold_answer=gold,
            generated_answer=generated,
            tolerance=0.01
        )

        print(f"Match: {outcome['match']}")
        print(f"Category: {outcome['error_category']}")
        if extra == "numbers":
            print(f"Gold: {outcome['gold_num']}, Generated: {outcome['gen_num']}")
        elif extra == "relative_error":
            print(f"Relative error: {outcome['relative_error']}%")
        print(f"Justification: {outcome['justification']}")

        # Blank separator between cases, but not after the last one.
        if index != final_index:
            print()


if __name__ == "__main__":
    # Usage banner shown when the module is executed directly.
    print("\n".join([
        "Binary LLM-as-Judge Module",
        "=" * 70,
        "",
        "To test, run: _test_llm_as_judge_binary()",
        "Make sure OPENAI_API_KEY is set in environment",
    ]))

Binary LLM-as-Judge Module

To test, run: _test_llm_as_judge_binary()
Make sure OPENAI_API_KEY is set in environment


In [23]:
"""
Comprehensive Test Suite for Binary LLM-as-Judge
=================================================
Tests cover:
- Exact matches with different formats
- Within tolerance cases
- Out of tolerance cases
- Currency format variations
- Percentage format variations
- Scale conversions (million/billion)
- Negative numbers and accounting notation
- Refusals
- Unparseable answers

NOTE: These tests require OPENAI_API_KEY to be set in environment
"""

import os
import sys

# Make modules located in /home/claude importable from this cell.
# NOTE(review): this is a hard-coded absolute path rather than a computed
# parent directory - confirm it exists on the machine running the tests.
sys.path.append('/home/claude')



def check_api_key():
    """
    Check if OpenAI API key is available.

    Returns:
        True when ``OPENAI_API_KEY`` is set to a non-empty value in the
        environment; otherwise prints a warning with setup instructions and
        returns False.
    """
    if not os.getenv("OPENAI_API_KEY"):
        # Warning sign written with escapes so the glyph survives re-encoding
        # (it had been corrupted into mojibake).
        print("\u26a0\ufe0f  WARNING: OPENAI_API_KEY not found in environment")
        print("   These tests will fail without API access")
        print("   Set with: export OPENAI_API_KEY='your-key-here'")
        return False
    return True


def test_exact_matches():
    """
    Test exact matches with different formats.

    Each case passes when the judge reports ``match`` as truthy and
    ``error_category == 'exact_match'``.

    Returns:
        Tuple ``(passed, failed)``; ``(0, 0)`` when no API key is available.
    """
    print("\n" + "="*70)
    print("TEST SUITE 1: Exact Matches")
    print("="*70)

    if not check_api_key():
        print("Skipping tests - no API key")
        return 0, 0

    tests = [
        # (question, gold, generated, description)
        (
            "What is the FY2018 capital expenditure amount (in USD millions) for 3M?",
            "$1577.00",
            "1577 million dollars",
            "Currency format variation"
        ),
        (
            "What is the fixed asset turnover ratio?",
            "0.66",
            "0.66",
            "Simple exact match"
        ),
        (
            "What is the net income change?",
            "-3.7",
            "(3.7)",
            "Negative number with accounting notation"
        ),
        (
            "What is the total revenue?",
            "$1.577 billion",
            "1577 million",
            "Scale conversion: billion to million"
        ),
    ]

    passed, failed = 0, 0

    for question, gold, gen, desc in tests:
        print(f"\nTest: {desc}")
        print(f"Gold: '{gold}' | Generated: '{gen}'")

        try:
            result = llm_as_judge_binary(
                question=question,
                gold_answer=gold,
                generated_answer=gen,
                tolerance=0.01
            )

            if result['match'] and result['error_category'] == 'exact_match':
                passed += 1
                print(f"✓ Match: {result['match']} | Category: {result['error_category']}")
                print(f"  Numbers: Gold={result['gold_num']}, Gen={result['gen_num']}")
            else:
                failed += 1
                print(f"✗ Expected exact_match, got: {result['error_category']}")
                print(f"  Match: {result['match']}")
                print(f"  Justification: {result['justification']}")
        except Exception as e:
            # Harness boundary: one broken case must not abort the suite.
            failed += 1
            print(f"✗ Error: {e}")

    print(f"\nExact Matches: {passed}/{len(tests)} passed")
    return passed, failed


def test_within_tolerance():
    """
    Test answers within tolerance.

    Each case passes when the judge reports ``match`` as truthy and
    ``error_category == 'within_tolerance'``.

    Returns:
        Tuple ``(passed, failed)``; ``(0, 0)`` when no API key is available.
    """
    print("\n" + "="*70)
    print("TEST SUITE 2: Within Tolerance")
    print("="*70)

    if not check_api_key():
        print("Skipping tests - no API key")
        return 0, 0

    tests = [
        # (question, gold, generated, tolerance, description)
        (
            "What is the operating margin?",
            "24.5%",
            "24.48%",
            0.01,
            "Small percentage difference"
        ),
        (
            "What is the revenue?",
            "$1577.00",
            "1579",
            0.01,
            "Small difference within 1% tolerance"
        ),
        (
            "What is the ratio?",
            "9.5",
            "9.52",
            0.01,
            "Small decimal difference"
        ),
    ]

    passed, failed = 0, 0

    for question, gold, gen, tol, desc in tests:
        print(f"\nTest: {desc}")
        print(f"Gold: '{gold}' | Generated: '{gen}' | Tolerance: {tol*100}%")

        try:
            result = llm_as_judge_binary(
                question=question,
                gold_answer=gold,
                generated_answer=gen,
                tolerance=tol
            )

            if result['match'] and result['error_category'] == 'within_tolerance':
                # relative_error can be None when the judge returned no
                # numbers; format defensively so a passing case is not also
                # counted as failed by a formatting TypeError.
                rel_error = result['relative_error']
                rel_text = f"{rel_error:.2f}%" if rel_error is not None else "n/a"
                passed += 1
                print(f"✓ Match: {result['match']} | Category: {result['error_category']}")
                print(f"  Relative error: {rel_text}")
            else:
                failed += 1
                print(f"✗ Expected within_tolerance, got: {result['error_category']}")
                print(f"  Match: {result['match']}")
                print(f"  Relative error: {result['relative_error']}")
        except Exception as e:
            # Harness boundary: one broken case must not abort the suite.
            failed += 1
            print(f"✗ Error: {e}")

    print(f"\nWithin Tolerance: {passed}/{len(tests)} passed")
    return passed, failed


def test_out_of_tolerance():
    """
    Test answers out of tolerance.

    Each case passes when the judge reports ``match`` as falsy and
    ``error_category == 'out_of_tolerance'``.

    Returns:
        Tuple ``(passed, failed)``; ``(0, 0)`` when no API key is available.
    """
    print("\n" + "="*70)
    print("TEST SUITE 3: Out of Tolerance")
    print("="*70)

    if not check_api_key():
        print("Skipping tests - no API key")
        return 0, 0

    tests = [
        # (question, gold, generated, tolerance, description)
        (
            "How many times has AES sold inventory?",
            "9.5",
            "12",
            0.01,
            "Significantly wrong number (26% error)"
        ),
        (
            "What is the margin?",
            "24.5%",
            "30%",
            0.01,
            "Large percentage difference (22% error)"
        ),
        (
            "What is the revenue?",
            "$1577",
            "$2000",
            0.01,
            "Large value difference (27% error)"
        ),
    ]

    passed, failed = 0, 0

    for question, gold, gen, tol, desc in tests:
        print(f"\nTest: {desc}")
        print(f"Gold: '{gold}' | Generated: '{gen}' | Tolerance: {tol*100}%")

        try:
            result = llm_as_judge_binary(
                question=question,
                gold_answer=gold,
                generated_answer=gen,
                tolerance=tol
            )

            if not result['match'] and result['error_category'] == 'out_of_tolerance':
                # relative_error can be None when the judge returned no
                # numbers; format defensively so a passing case is not also
                # counted as failed by a formatting TypeError.
                rel_error = result['relative_error']
                rel_text = f"{rel_error:.2f}%" if rel_error is not None else "n/a"
                passed += 1
                print(f"✓ Match: {result['match']} | Category: {result['error_category']}")
                print(f"  Relative error: {rel_text}")
            else:
                failed += 1
                print(f"✗ Expected out_of_tolerance, got: {result['error_category']}")
                print(f"  Match: {result['match']}")
                print(f"  Relative error: {result['relative_error']}")
        except Exception as e:
            # Harness boundary: one broken case must not abort the suite.
            failed += 1
            print(f"✗ Error: {e}")

    print(f"\nOut of Tolerance: {passed}/{len(tests)} passed")
    return passed, failed


def test_refusals():
    """Run test suite 4: generated answers that decline to give a number.

    A case passes when ``llm_as_judge_binary`` returns a falsy ``match``
    together with ``error_category == 'refusal'``. Exceptions raised while
    judging or reporting a case count that case as failed.

    Returns:
        tuple: ``(passed, failed)`` counts, or ``(0, 0)`` when
        ``check_api_key()`` is falsy and the suite is skipped.
    """
    rule = "=" * 70
    print("\n" + rule)
    print("TEST SUITE 4: Refusal Detection")
    print(rule)

    if not check_api_key():
        print("Skipping tests - no API key")
        return 0, 0

    # Each entry: (question, gold, generated, description)
    cases = [
        ("What is the ratio?", "0.66",
         "I cannot calculate without the data", "Hard refusal"),
        ("What is the inventory turnover?", "9.5",
         "Data not available", "Data unavailable"),
        ("What is the margin?", "24.5%",
         "I don't have access to that information", "No access refusal"),
    ]

    passed = 0
    failed = 0

    for prompt, reference, candidate, label in cases:
        print(f"\nTest: {label}")
        print(f"Generated: '{candidate}'")

        try:
            verdict = llm_as_judge_binary(
                question=prompt,
                gold_answer=reference,
                generated_answer=candidate,
                tolerance=0.01,
            )

            # Failure branch first: either the judge claimed a match, or it
            # filed the answer under a category other than 'refusal'.
            if verdict['match'] or verdict['error_category'] != 'refusal':
                failed += 1
                print(f"‚úó Expected refusal, got: {verdict['error_category']}")
                print(f"  Justification: {verdict['justification']}")
            else:
                passed += 1
                print(f"‚úì Category: {verdict['error_category']}")
                print(f"  Generated number: {verdict['gen_num']}")
        except Exception as exc:
            failed += 1
            print(f"‚úó Error: {exc}")

    print(f"\nRefusals: {passed}/{len(cases)} passed")
    return passed, failed


def test_unparseable():
    """Run test suite 5: generated answers that contain no usable number.

    A case passes when ``llm_as_judge_binary`` returns a falsy ``match``
    together with ``error_category == 'unparseable'``. Exceptions raised
    while judging or reporting a case count that case as failed.

    Returns:
        tuple: ``(passed, failed)`` counts, or ``(0, 0)`` when
        ``check_api_key()`` is falsy and the suite is skipped.
    """
    separator = "=" * 70
    print("\n" + separator)
    print("TEST SUITE 5: Unparseable Answers")
    print(separator)

    if not check_api_key():
        print("Skipping tests - no API key")
        return 0, 0

    # Each entry: (question, gold, generated, description)
    scenarios = [
        (
            "What is the inventory turnover ratio?",
            "9.5",
            "The ratio varies depending on the quarter and specific inventory category considered.",
            "Explanation without number",
        ),
        (
            "What is the value?",
            "42",
            "It's complicated and depends on many factors.",
            "Vague response",
        ),
    ]

    passed = failed = 0

    for query, gold_text, generated_text, title in scenarios:
        print(f"\nTest: {title}")
        print(f"Generated: '{generated_text}'")

        try:
            outcome = llm_as_judge_binary(
                question=query,
                gold_answer=gold_text,
                generated_answer=generated_text,
                tolerance=0.01,
            )

            # Anything that matched, or that was categorised differently,
            # is a failure for this suite.
            if outcome['match'] or outcome['error_category'] != 'unparseable':
                failed += 1
                print(f"‚úó Expected unparseable, got: {outcome['error_category']}")
                print(f"  Justification: {outcome['justification']}")
            else:
                passed += 1
                print(f"‚úì Category: {outcome['error_category']}")
                print(f"  Generated number: {outcome['gen_num']}")
        except Exception as err:
            failed += 1
            print(f"‚úó Error: {err}")

    print(f"\nUnparseable: {passed}/{len(scenarios)} passed")
    return passed, failed


def test_edge_cases():
    """Run test suite 6: formatting and wording edge cases.

    Each case carries the expected value of ``match``; the case passes when
    the ``match`` returned by ``llm_as_judge_binary`` equals it. The error
    category is printed but not checked. Exceptions raised while judging or
    reporting a case count that case as failed.

    Returns:
        tuple: ``(passed, failed)`` counts, or ``(0, 0)`` when
        ``check_api_key()`` is falsy and the suite is skipped.
    """
    bar = "=" * 70
    print("\n" + bar)
    print("TEST SUITE 6: Edge Cases")
    print(bar)

    if not check_api_key():
        print("Skipping tests - no API key")
        return 0, 0

    # Each entry: (question, gold, generated, expected_match, description)
    cases = [
        ("What is the value?", "$1,577.00", "1577", True,
         "Thousands separator handling"),
        ("What is the change?", "-0.02", "negative 0.02", True,
         "Negative number as text"),
        ("What is approximately the value?", "1577", "approximately 1580", True,
         "With 'approximately' qualifier (should be within tolerance)"),
    ]

    passed = 0
    failed = 0

    for prompt, gold_text, answer_text, want_match, label in cases:
        print(f"\nTest: {label}")
        print(f"Gold: '{gold_text}' | Generated: '{answer_text}'")

        try:
            verdict = llm_as_judge_binary(
                question=prompt,
                gold_answer=gold_text,
                generated_answer=answer_text,
                tolerance=0.01,
            )

            if verdict['match'] != want_match:
                failed += 1
                print(f"‚úó Match: {verdict['match']} (expected: {want_match})")
                print(f"  Category: {verdict['error_category']}")
                print(f"  Justification: {verdict['justification']}")
            else:
                passed += 1
                print(f"‚úì Match: {verdict['match']} (expected: {want_match})")
                print(f"  Category: {verdict['error_category']}")
        except Exception as exc:
            failed += 1
            print(f"‚úó Error: {exc}")

    print(f"\nEdge Cases: {passed}/{len(cases)} passed")
    return passed, failed


def run_all_tests():
    """Execute every test suite in order and print an aggregate summary.

    Each suite is expected to return a ``(passed, failed)`` pair, which is
    added to the running totals. A suite that raises is reported and counted
    as a single failure.

    Returns:
        bool: ``True`` when no failure was recorded; ``False`` when at least
        one failure was recorded or when ``check_api_key()`` is falsy.
    """
    rule = "=" * 70
    print("\n" + rule)
    print("COMPREHENSIVE TEST SUITE FOR BINARY LLM-AS-JUDGE")
    print(rule)

    if not check_api_key():
        print("\n‚ùå Cannot run tests without OPENAI_API_KEY")
        print("Please set your API key and try again")
        return False

    suites = (
        test_exact_matches,
        test_within_tolerance,
        test_out_of_tolerance,
        test_refusals,
        test_unparseable,
        test_edge_cases,
    )

    total_passed, total_failed = 0, 0

    for suite in suites:
        try:
            suite_passed, suite_failed = suite()
            total_passed += suite_passed
            total_failed += suite_failed
        except Exception as exc:
            # A crashed suite contributes exactly one failure to the totals.
            print(f"‚úó Test suite crashed: {exc}")
            total_failed += 1

    grand_total = total_passed + total_failed

    print("\n" + rule)
    print("FINAL SUMMARY")
    print(rule)
    print(f"Total Tests: {grand_total}")
    print(f"‚úì Passed: {total_passed}")
    print(f"‚úó Failed: {total_failed}")
    if grand_total > 0:
        # Skipped when nothing ran, which also avoids dividing by zero.
        print(f"Success Rate: {100 * total_passed / grand_total:.1f}%")
    print(rule)

    return total_failed == 0


if __name__ == "__main__":
    # Script entry point: run every suite, then print a one-line verdict.
    success = run_all_tests()

    if not success:
        print("\n‚ö†Ô∏è  SOME TESTS FAILED - Review output above")
        print("Note: LLM judgments can vary, some variance is expected")
    else:
        print("\nüéâ ALL TESTS PASSED! üéâ")


COMPREHENSIVE TEST SUITE FOR BINARY LLM-AS-JUDGE

TEST SUITE 1: Exact Matches

Test: Currency format variation
Gold: '$1577.00' | Generated: '1577 million dollars'
✓ Match: True | Category: exact_match
  Numbers: Gold=1577.0, Gen=1577.0

Test: Simple exact match
Gold: '0.66' | Generated: '0.66'
✓ Match: True | Category: exact_match
  Numbers: Gold=0.66, Gen=0.66

Test: Negative number with accounting notation
Gold: '-3.7' | Generated: '(3.7)'
✓ Match: True | Category: exact_match
  Numbers: Gold=-3.7, Gen=-3.7

Test: Scale conversion: billion to million
Gold: '$1.577 billion' | Generated: '1577 million'
✓ Match: True | Category: exact_match
  Numbers: Gold=1577.0, Gen=1577.0

Exact Matches: 4/4 passed

TEST SUITE 2: Within Tolerance

Test: Small percentage difference
Gold: '24.5%' | Generated: '24.48%' | Tolerance: 1.0%
✓ Match: True | Category: within_tolerance
  Relative error: 0.08%

Test: Small difference within 1% tolerance
Gold: '$1577.00' | Generated: '1579' | Toleran

In [24]:
"""
Quick Test to Verify Tolerance Fix
===================================
Tests the specific failures identified by the user.
"""

import os
import sys
sys.path.append('/home/claude')


def _run_tolerance_case(header, question, gold, gen, expected_match, expected_category):
    """Judge one gold/generated pair at 1% tolerance and print a PASS/FAIL report.

    Args:
        header: Title line printed above the case (including any leading
            newlines).
        question: Question text forwarded to ``llm_as_judge_binary``.
        gold: Gold answer string.
        gen: Generated answer string.
        expected_match: Expected truth value of the returned ``match``.
        expected_category: Expected value of the returned ``error_category``.

    Returns:
        bool: ``True`` when both ``match`` and ``error_category`` are as
        expected, ``False`` otherwise.
    """
    print(header)
    print("-"*70)

    result = llm_as_judge_binary(
        question=question,
        gold_answer=gold,
        generated_answer=gen,
        tolerance=0.01
    )

    # relative_error is Optional in the judgment schema. Formatting None with
    # ':.3f' would raise TypeError and abort the remaining cases.
    rel_err = result['relative_error']
    rel_text = f"{rel_err:.3f}%" if rel_err is not None else "N/A"

    print(f"Gold: {gold}")
    print(f"Generated: {gen}")
    print(f"Relative Error: {rel_text}")
    print(f"Match: {result['match']}")
    print(f"Category: {result['error_category']}")
    print(f"Justification: {result['justification']}")

    ok = (
        bool(result['match']) == expected_match
        and result['error_category'] == expected_category
    )
    if ok:
        print(f"‚úì PASS: Correctly identified as {expected_category} with match={expected_match}")
    else:
        print(f"‚úó FAIL: Expected match={expected_match} and {expected_category}")
        print(f"   Got: match={result['match']}, category={result['error_category']}")
    return ok


def test_tolerance_fix():
    """Re-run the previously failing tolerance cases plus one control case.

    Three cases differ from gold by well under the 1% tolerance and must be
    judged ``match=True`` / ``within_tolerance``. The fourth is a control that
    must be judged ``match=False`` / ``out_of_tolerance``. Results are printed;
    nothing is returned.
    """
    print("Testing Tolerance Logic Fix")
    print("="*70)

    # (header, question, gold, generated, expected_match, expected_category)
    cases = [
        # |1580 - 1577| / 1577 ~= 0.190% < 1%
        (
            "\n TEST 1: User's Reported Failure",
            "What is the FY2018 capital expenditure amount (in USD millions) for 3M?",
            "$1577.00",
            "1580 million dollars",
            True,
            "within_tolerance",
        ),
        # |1579 - 1577| / 1577 ~= 0.127% < 1%
        (
            "\n\nTEST 2: Small Difference Within Tolerance",
            "What is the value?",
            "$1577.00",
            "1579",
            True,
            "within_tolerance",
        ),
        # Same numbers as test 1, with an 'approximately' qualifier in the text
        (
            "\n\nTEST 3: With 'Approximately' Qualifier",
            "What is the value?",
            "1577",
            "approximately 1580",
            True,
            "within_tolerance",
        ),
        # Control: |1700 - 1577| / 1577 ~= 7.8% > 1%
        (
            "\n\nTEST 4: Out of Tolerance (Control Test)",
            "What is the value?",
            "1577",
            "1700",
            False,
            "out_of_tolerance",
        ),
    ]

    for header, question, gold, gen, expected_match, expected_category in cases:
        _run_tolerance_case(
            header, question, gold, gen, expected_match, expected_category
        )

    print("\n" + "="*70)
    print("Test Complete!")


if __name__ == "__main__":
    # The judge calls the OpenAI API, so stop early with a clear message when
    # no key is configured.
    if not os.getenv("OPENAI_API_KEY"):
        print("ERROR: OPENAI_API_KEY not set")
        print("Set with: export OPENAI_API_KEY='your-key-here'")
        # sys.exit rather than the bare exit(): the latter is injected by the
        # site module for interactive use and is not guaranteed to exist.
        sys.exit(1)

    test_tolerance_fix()

Testing Tolerance Logic Fix

 TEST 1: User's Reported Failure
----------------------------------------------------------------------
Gold: $1577.00
Generated: 1580 million dollars
Relative Error: 0.190%
Match: True
Category: within_tolerance
Justification: The generated answer (1580 million) differs from the gold answer (1577 million) by 3 units, resulting in a relative error of 0.191%. Since 0.191% is less than the 1% tolerance, this is within tolerance and is a match.
✓ PASS: Correctly identified as within_tolerance with match=True


TEST 2: Small Difference Within Tolerance
----------------------------------------------------------------------
Gold: $1577.00
Generated: 1579
Relative Error: 0.127%
Match: True
Category: within_tolerance
Justification: The generated answer (1579) differs from the gold answer (1577) by 2 units, resulting in a relative error of 0.127%. Since 0.127% is less than 1%, this would typically be within tolerance, but the absolute difference exceeds the thresh

In [25]:
# Single end-to-end judge call on the 3M capital-expenditure example; the full
# result dictionary is printed for inspection.
question, gold, gen = (
    "What is the FY2018 capital expenditure amount (in USD millions) for 3M?",
    "$1577.00",
    "1580 million dollars",
)

result = llm_as_judge_binary(
    tolerance=0.01,
    generated_answer=gen,
    gold_answer=gold,
    question=question,
)

print(result)

{'match': True, 'gold_num': 1577.0, 'gen_num': 1580.0, 'relative_error': 0.19023462270133165, 'absolute_error': 3.0, 'error_category': 'within_tolerance', 'justification': 'The generated answer (1580 million) differs from the gold answer (1577 million) by 3 units, resulting in a relative error of 0.191%. Since 0.191% is less than the 1% tolerance, this is within tolerance and is a match.', 'success': True, 'raw_response': {'match': True, 'gold_number': 1577.0, 'generated_number': 1580.0, 'relative_error': 0.191, 'absolute_error': 3.0, 'error_category': 'within_tolerance', 'justification': 'The generated answer (1580 million) differs from the gold answer (1577 million) by 3 units, resulting in a relative error of 0.191%. Since 0.191% is less than the 1% tolerance, this is within tolerance and is a match.'}, 'metadata': {'provider': 'openai', 'model': 'gpt-4o-mini', 'temperature': 0.0, 'tolerance': 0.01, 'question': 'What is the FY2018 capital expenditure amount (in USD millions) for 3M?