In [1]:
"""
Refusal Detection for Financial QA Evaluation
==============================================

This module provides refusal detection to identify when a model
refuses to answer or indicates it doesn't have enough information.

Refusals are important to track as a failure mode separate from
incorrect answers. They help understand:
- When the model lacks confidence
- When retrieval failed (in RAG systems)
- Model behavior patterns

Author: Financial QA Evaluation System
Version: 1.0
"""

import re
from typing import Dict, Any, List, Optional


# Explicit refusal patterns
REFUSAL_PATTERNS = [
    # Direct refusals
    r"\bi\s+(?:do\s+not|don't|cannot|can't|could\s+not|couldn't)\s+(?:know|have|provide|answer|calculate|determine|find)",
    r"\bi\s+(?:am\s+)?(?:unable|not\s+able)\s+to",
    
    # Data/information unavailable
    r"(?:data|information|details|answer)\s+(?:is\s+)?(?:not\s+)?(?:available|unavailable|accessible|provided)",
    r"no\s+(?:data|information|details)\s+(?:is\s+)?(?:available|provided)",
    r"data\s+not\s+available",
    
    # Cannot calculate/determine
    r"cannot\s+(?:be\s+)?(?:calculated|determined|computed|assessed|evaluated|answered)",
    r"(?:unable|impossible)\s+to\s+(?:calculate|determine|compute|assess|evaluate|answer)",
    r"can'?t\s+(?:be\s+)?(?:calculated|determined|computed)",
    r"cannot\s+\w+\s+without",  # Catches "cannot answer without", "cannot calculate without", etc.
    
    # Insufficient information
    r"insufficient\s+(?:information|data|details)",
    r"not\s+enough\s+(?:information|data|details)",
    r"lack(?:ing)?\s+(?:sufficient\s+)?(?:information|data|details)",
    r"without\s+(?:sufficient\s+)?(?:specific\s+)?(?:information|data|details|figures)",
    
    # Not applicable
    r"\bn/?a\b",
    r"not\s+applicable",
    r"does\s+not\s+apply",
    
    # Question context
    r"(?:the\s+)?question\s+(?:cannot|can'?t)\s+be\s+answered",
    r"(?:this\s+)?(?:cannot|can'?t)\s+be\s+answered",
    
    # Specific financial refusals
    r"(?:financial\s+)?(?:data|information|figures?)\s+(?:is\s+)?(?:not\s+)?(?:available|accessible|disclosed)",
    r"no\s+(?:specific|clear|direct)\s+(?:data|information|mention|figures?)",
    r"without\s+(?:sufficient\s+)?(?:specific\s+)?(?:information|data|details|figures)",
    
    # Qualification patterns (weak refusals)
    r"(?:it\s+)?(?:is\s+)?(?:difficult|hard|challenging)\s+to\s+(?:determine|calculate|assess)",
    r"(?:may|might)\s+not\s+be\s+(?:possible|feasible)\s+to",
]

# Short vague answers that might indicate refusal
VAGUE_SHORT_ANSWERS = [
    r"^(?:i\s+)?(?:do\s+)?(?:not|don't)\s+know\.?$",
    r"^(?:not\s+)?(?:sure|certain)\.?$",
    r"^(?:un)?clear\.?$",
    r"^un(?:known|certain)\.?$",
]


def detect_refusal(
    answer: str,
    min_length: int = 3,
    check_vague: bool = True,
    return_details: bool = True
) -> Dict[str, Any]:
    """
    Detect if an answer is a refusal to answer the question.
    
    Args:
        answer: The generated answer to check
        min_length: Minimum answer length to not be considered refusal
                   (very short answers like "0" are valid, not refusals)
        check_vague: Whether to check for vague short answers like "I don't know"
        return_details: If True, return detailed information about the refusal
    
    Returns:
        Dictionary containing:
            - is_refusal: bool - Whether answer is a refusal
            - confidence: float - Confidence in refusal detection (0-1)
            - refusal_type: str - Type of refusal detected
            - matched_pattern: str - Which pattern was matched (if any)
            - answer_length: int - Character length of answer
    
    Refusal types:
        - 'explicit': Clear refusal patterns (high confidence)
        - 'vague': Vague short answers like "I don't know"
        - 'none': Not a refusal
    
    Examples:
        >>> detect_refusal("I cannot calculate this without specific data.")
        {'is_refusal': True, 'confidence': 1.0, 'refusal_type': 'explicit', ...}
        
        >>> detect_refusal("Data not available.")
        {'is_refusal': True, 'confidence': 1.0, 'refusal_type': 'explicit', ...}
        
        >>> detect_refusal("I don't know")
        {'is_refusal': True, 'confidence': 0.9, 'refusal_type': 'vague', ...}
        
        >>> detect_refusal("The answer is 42.")
        {'is_refusal': False, 'confidence': 1.0, 'refusal_type': 'none', ...}
        
        >>> detect_refusal("0")  # Valid short answer, not refusal
        {'is_refusal': False, 'confidence': 1.0, 'refusal_type': 'none', ...}
    """
    
    if not answer or not isinstance(answer, str):
        return {
            'is_refusal': True,
            'confidence': 1.0,
            'refusal_type': 'explicit',
            'matched_pattern': 'empty_answer',
            'answer_length': 0,
        } if return_details else {'is_refusal': True}
    
    # Normalize for pattern matching
    answer_lower = answer.lower().strip()
    
    # Check if answer is effectively empty (whitespace only)
    if not answer_lower:
        return {
            'is_refusal': True,
            'confidence': 1.0,
            'refusal_type': 'explicit',
            'matched_pattern': 'empty_answer',
            'answer_length': 0,
        } if return_details else {'is_refusal': True}
    
    answer_length = len(answer.strip())
    
    # Check explicit refusal patterns
    for pattern in REFUSAL_PATTERNS:
        if re.search(pattern, answer_lower, re.IGNORECASE):
            result = {
                'is_refusal': True,
                'confidence': 1.0,
                'refusal_type': 'explicit',
                'matched_pattern': pattern,
                'answer_length': answer_length,
            }
            return result if return_details else {'is_refusal': True}
    
    # Check vague short answers if requested
    if check_vague and answer_length < 30:  # Only check short answers
        for pattern in VAGUE_SHORT_ANSWERS:
            if re.match(pattern, answer_lower, re.IGNORECASE):
                result = {
                    'is_refusal': True,
                    'confidence': 0.9,  # Slightly lower confidence for vague patterns
                    'refusal_type': 'vague',
                    'matched_pattern': pattern,
                    'answer_length': answer_length,
                }
                return result if return_details else {'is_refusal': True}
    
    # Not a refusal
    result = {
        'is_refusal': False,
        'confidence': 1.0,
        'refusal_type': 'none',
        'matched_pattern': None,
        'answer_length': answer_length,
    }
    return result if return_details else {'is_refusal': False}


def batch_detect_refusal(
    answers: List[str],
    min_length: int = 3,
    check_vague: bool = True
) -> Dict[str, Any]:
    """
    Detect refusals in multiple answers.
    
    Args:
        answers: List of generated answers to check
        min_length: Minimum answer length threshold
        check_vague: Whether to check for vague answers
    
    Returns:
        Dictionary containing:
            - results: List of individual detection results
            - refusal_count: Number of refusals
            - refusal_rate: Percentage of refusals
            - explicit_count: Count of explicit refusals
            - vague_count: Count of vague refusals
            - non_refusal_count: Count of non-refusals
    
    Example:
        >>> answers = [
        ...     "The answer is 42.",
        ...     "Data not available.",
        ...     "I don't know",
        ...     "The result is approximately 100."
        ... ]
        >>> result = batch_detect_refusal(answers)
        >>> print(f"Refusal rate: {result['refusal_rate']:.1f}%")
    """
    
    results = []
    explicit_count = 0
    vague_count = 0
    non_refusal_count = 0
    
    for answer in answers:
        result = detect_refusal(
            answer, 
            min_length=min_length, 
            check_vague=check_vague,
            return_details=True
        )
        results.append(result)
        
        if result['is_refusal']:
            if result['refusal_type'] == 'explicit':
                explicit_count += 1
            elif result['refusal_type'] == 'vague':
                vague_count += 1
        else:
            non_refusal_count += 1
    
    total = len(answers)
    refusal_count = explicit_count + vague_count
    refusal_rate = (refusal_count / total * 100) if total > 0 else 0.0
    
    return {
        'results': results,
        'total': total,
        'refusal_count': refusal_count,
        'refusal_rate': refusal_rate,
        'explicit_count': explicit_count,
        'vague_count': vague_count,
        'non_refusal_count': non_refusal_count,
    }


def categorize_by_refusal(
    answers: List[str],
    labels: Optional[List[str]] = None
) -> Dict[str, List[int]]:
    """
    Categorize answer indices by refusal status.
    Useful for analyzing patterns in refusals.
    
    Args:
        answers: List of generated answers
        labels: Optional list of labels for each answer (e.g., question IDs)
    
    Returns:
        Dictionary with:
            - refusals: List of indices (or labels) that are refusals
            - non_refusals: List of indices (or labels) that are not refusals
    
    Example:
        >>> answers = ["42", "Data not available", "100", "I don't know"]
        >>> labels = ["Q1", "Q2", "Q3", "Q4"]
        >>> result = categorize_by_refusal(answers, labels)
        >>> print(f"Refusal questions: {result['refusals']}")  # ['Q2', 'Q4']
    """
    
    refusals = []
    non_refusals = []
    
    for i, answer in enumerate(answers):
        result = detect_refusal(answer, return_details=False)
        
        identifier = labels[i] if labels else i
        
        if result['is_refusal']:
            refusals.append(identifier)
        else:
            non_refusals.append(identifier)
    
    return {
        'refusals': refusals,
        'non_refusals': non_refusals,
    }


def get_refusal_statistics(
    answers_by_mode: Dict[str, List[str]]
) -> Dict[str, Dict[str, Any]]:
    """
    Calculate refusal statistics across different modes (e.g., closed-book, RAG, oracle).
    
    Args:
        answers_by_mode: Dictionary mapping mode name to list of answers
                        e.g., {'closed-book': [...], 'rag': [...], 'oracle': [...]}
    
    Returns:
        Dictionary mapping mode name to refusal statistics
    
    Example:
        >>> stats = get_refusal_statistics({
        ...     'closed-book': ["I don't know", "Data not available", "42"],
        ...     'oracle': ["42", "100", "The answer is 7.5"]
        ... })
        >>> print(f"Closed-book refusal rate: {stats['closed-book']['refusal_rate']:.1f}%")
    """
    
    statistics = {}
    
    for mode, answers in answers_by_mode.items():
        batch_result = batch_detect_refusal(answers)
        statistics[mode] = {
            'refusal_count': batch_result['refusal_count'],
            'refusal_rate': batch_result['refusal_rate'],
            'explicit_count': batch_result['explicit_count'],
            'vague_count': batch_result['vague_count'],
            'total': batch_result['total'],
        }
    
    return statistics


def _test_detect_refusal():
    """Quick sanity tests for detect_refusal"""
    
    print("Running quick tests for detect_refusal()...")
    
    test_cases = [
        # (answer, expected_is_refusal, description)
        ("I cannot calculate this without data.", True, "Explicit refusal"),
        ("Data not available.", True, "Data unavailable"),
        ("I don't know", True, "Vague short refusal"),
        ("N/A", True, "Not applicable"),
        ("Insufficient information to determine.", True, "Insufficient info"),
        ("The answer is 42.", False, "Valid answer"),
        ("0", False, "Valid short numeric answer"),
        ("Yes", False, "Valid short text answer"),
        ("The consumer segment.", False, "Valid short sentence"),
        ("", True, "Empty answer (counts as refusal)"),
        ("cannot be determined without specific financial data", True, "Long refusal"),
        ("The quick ratio is 1.57", False, "Valid detailed answer"),
    ]
    
    passed = 0
    failed = 0
    
    for answer, expected, desc in test_cases:
        result = detect_refusal(answer)
        actual = result['is_refusal']
        
        if actual == expected:
            passed += 1
            refusal_info = f"[{result['refusal_type']}]" if actual else ""
            print(f"‚úì {desc:45} {refusal_info}")
        else:
            failed += 1
            print(f"‚úó {desc:45}")
            print(f"  Expected is_refusal={expected}, got {actual}")
            print(f"  Type: {result['refusal_type']}, Pattern: {result.get('matched_pattern', 'N/A')}")
    
    print(f"\nResults: {passed} passed, {failed} failed out of {len(test_cases)} tests")
    return failed == 0


if __name__ == "__main__":
    success = _test_detect_refusal()
    
    if success:
        print("\n‚úÖ All quick tests passed!")
    else:
        print("\n‚ö†Ô∏è  Some tests failed")

Running quick tests for detect_refusal()...
‚úì Explicit refusal                              [explicit]
‚úì Data unavailable                              [explicit]
‚úì Vague short refusal                           [explicit]
‚úì Not applicable                                [explicit]
‚úì Insufficient info                             [explicit]
‚úì Valid answer                                  
‚úì Valid short numeric answer                    
‚úì Valid short text answer                       
‚úì Valid short sentence                          
‚úì Empty answer (counts as refusal)              [explicit]
‚úì Long refusal                                  [explicit]
‚úì Valid detailed answer                         

Results: 12 passed, 0 failed out of 12 tests

‚úÖ All quick tests passed!


In [2]:
"""
Comprehensive Test Suite for detect_refusal()
==============================================

Tests cover:
- Explicit refusal patterns
- Vague short answers
- Valid short answers (not refusals)
- Real FinanceBench examples
- Edge cases
- Batch processing
"""


def test_explicit_refusals():
    """Test explicit refusal patterns"""
    print("\n" + "="*70)
    print("TEST SUITE 1: Explicit Refusal Patterns")
    print("="*70)
    
    tests = [
        # Direct refusals
        ("I cannot calculate this.", True, "Direct 'cannot'"),
        ("I don't know the answer.", True, "Don't know"),
        ("I am unable to provide this information.", True, "Unable to"),
        ("I could not find the data.", True, "Could not"),
        
        # Data unavailable
        ("Data not available.", True, "Data not available"),
        ("Information is not available.", True, "Information unavailable"),
        ("The data is unavailable.", True, "Data unavailable (article)"),
        ("No data available.", True, "No data available"),
        
        # Cannot calculate/determine
        ("Cannot be calculated without specific figures.", True, "Cannot calculate"),
        ("This cannot be determined.", True, "Cannot determine"),
        ("Unable to calculate the ratio.", True, "Unable to calculate"),
        ("The answer cannot be computed.", True, "Cannot compute"),
        
        # Insufficient information
        ("Insufficient information to answer.", True, "Insufficient info"),
        ("Not enough data to determine.", True, "Not enough data"),
        ("Lacking sufficient details.", True, "Lacking details"),
        ("Cannot answer without specific financial data.", True, "Without specific data"),
        
        # Not applicable
        ("N/A", True, "N/A abbreviation"),
        ("n/a", True, "n/a lowercase"),
        ("Not applicable in this context.", True, "Not applicable"),
        
        # Financial specific
        ("Financial data is not disclosed.", True, "Not disclosed"),
        ("No specific figures mentioned.", True, "No specific figures"),
    ]
    
    passed, failed = 0, 0
    for answer, expected, desc in tests:
        result = detect_refusal(answer)
        
        if result['is_refusal'] == expected:
            passed += 1
            print(f"‚úì {desc:50} [{result['refusal_type']}]")
        else:
            failed += 1
            print(f"‚úó {desc:50}")
            print(f"  Answer: '{answer}'")
            print(f"  Expected refusal={expected}, got {result['is_refusal']}")
    
    print(f"\nExplicit Refusals: {passed}/{len(tests)} passed")
    return passed, failed


def test_vague_short_answers():
    """Test vague short answer detection"""
    print("\n" + "="*70)
    print("TEST SUITE 2: Vague Short Answers")
    print("="*70)
    
    tests = [
        ("I don't know", True, "I don't know"),
        ("Don't know", True, "Don't know (no 'I')"),
        ("Not sure", True, "Not sure"),
        ("Uncertain", True, "Uncertain"),
        ("Unclear", True, "Unclear"),
        ("Unknown", True, "Unknown"),
    ]
    
    passed, failed = 0, 0
    for answer, expected, desc in tests:
        result = detect_refusal(answer, check_vague=True)
        
        if result['is_refusal'] == expected:
            passed += 1
            print(f"‚úì {desc:50} [{result['refusal_type']}]")
        else:
            failed += 1
            print(f"‚úó {desc:50}")
            print(f"  Expected refusal={expected}, got {result['is_refusal']}")
    
    print(f"\nVague Short Answers: {passed}/{len(tests)} passed")
    return passed, failed


def test_valid_short_answers():
    """Test that valid short answers are NOT detected as refusals"""
    print("\n" + "="*70)
    print("TEST SUITE 3: Valid Short Answers (NOT Refusals)")
    print("="*70)
    
    tests = [
        ("0", False, "Zero"),
        ("1", False, "One"),
        ("42", False, "Number"),
        ("Yes", False, "Yes"),
        ("No", False, "No"),
        ("True", False, "True"),
        ("False", False, "False"),
        ("$8.70", False, "Currency"),
        ("24.5%", False, "Percentage"),
        ("The consumer segment.", False, "Short sentence"),
        ("It decreased.", False, "Short statement"),
    ]
    
    passed, failed = 0, 0
    for answer, expected, desc in tests:
        result = detect_refusal(answer)
        
        if result['is_refusal'] == expected:
            passed += 1
            print(f"‚úì {desc:50}")
        else:
            failed += 1
            print(f"‚úó {desc:50}")
            print(f"  Answer: '{answer}'")
            print(f"  Expected refusal={expected}, got {result['is_refusal']}")
            print(f"  Type: {result['refusal_type']}")
    
    print(f"\nValid Short Answers: {passed}/{len(tests)} passed")
    return passed, failed


def test_real_financebench_examples():
    """Test real examples from FinanceBench dataset"""
    print("\n" + "="*70)
    print("TEST SUITE 4: Real FinanceBench Examples")
    print("="*70)
    
    tests = [
        # Metrics-generated refusals
        (
            "Data not available.",
            True,
            "Metrics: closedbook refusal"
        ),
        (
            "$15 billion",
            False,
            "Metrics: valid answer (even if wrong)"
        ),
        (
            "8.738",
            False,
            "Metrics: valid numeric answer"
        ),
        
        # Domain-relevant refusals
        (
            "The inventory turnover ratio for AES Corporation in FY2022 cannot be calculated without specific COGS and average inventory figures.",
            True,
            "Domain: closedbook refusal (cannot calculate)"
        ),
        (
            "The quick ratio's improvement or decline cannot be determined without specific financial data for FY2022 and FY2023.",
            True,
            "Domain: closedbook refusal (cannot determine)"
        ),
        (
            "Yes, AMD has a reasonably healthy liquidity profile based on its quick ratio for FY22.",
            False,
            "Domain: closedbook valid answer (even if incomplete)"
        ),
        
        # Domain-relevant valid answers
        (
            "AES Corporation sold its inventory roughly 12 times in FY2022; however, conventional inventory management may not be meaningful due to the nature of its business in the energy sector.",
            False,
            "Domain: oracle valid (even with qualification)"
        ),
        (
            "AMCOR's quick ratio has improved from FY2022 to FY2023.",
            False,
            "Domain: oracle valid simplified answer"
        ),
        (
            "Yes. The quick ratio is 1.57, calculated as (cash and cash equivalents+Short term investments+Accounts receivable, net+receivables from related parties)/ (current liabilities).",
            False,
            "Domain: gold standard valid answer"
        ),
        
        # Novel-generated refusals
        (
            "Approximately $1.5 billion.",
            False,
            "Novel: closedbook valid (approximate)"
        ),
        (
            "Interest rate swaps had the highest notional value among Verizon's derivative instruments in FY 2021.",
            False,
            "Novel: closedbook valid (even if wrong)"
        ),
        
        # Edge cases
        (
            "0",
            False,
            "Single digit zero (valid answer)"
        ),
        (
            "No. The quick ratio for 3M was 0.96 by Jun'23 close, which needs a bit of an improvement to touch the 1x mark",
            False,
            "Starts with 'No' but is valid answer"
        ),
    ]
    
    passed, failed = 0, 0
    for answer, expected, desc in tests:
        result = detect_refusal(answer)
        
        if result['is_refusal'] == expected:
            passed += 1
            status = f"[{result['refusal_type']}]" if result['is_refusal'] else "[valid]"
            print(f"‚úì {desc:55} {status}")
        else:
            failed += 1
            print(f"‚úó {desc:55}")
            print(f"  Answer: '{answer[:80]}...'")
            print(f"  Expected refusal={expected}, got {result['is_refusal']}")
            print(f"  Type: {result['refusal_type']}")
    
    print(f"\nReal FinanceBench Examples: {passed}/{len(tests)} passed")
    return passed, failed


def test_edge_cases():
    """Test edge cases"""
    print("\n" + "="*70)
    print("TEST SUITE 5: Edge Cases")
    print("="*70)
    
    tests = [
        ("", True, "Empty string"),
        (None, True, "None input"),
        ("   ", True, "Whitespace only"),
        ("?", False, "Single character"),
        ("The answer is not available due to data limitations.", True, "Long refusal"),
        ("Although I cannot determine the exact figure, the trend suggests growth.", True, "Qualified refusal"),
        ("No", False, "Simple 'No' (valid answer, not refusal)"),
        ("Yes", False, "Simple 'Yes' (valid answer)"),
    ]
    
    passed, failed = 0, 0
    for answer, expected, desc in tests:
        try:
            result = detect_refusal(answer)
            
            if result['is_refusal'] == expected:
                passed += 1
                print(f"‚úì {desc:50}")
            else:
                failed += 1
                print(f"‚úó {desc:50}")
                print(f"  Expected refusal={expected}, got {result['is_refusal']}")
        except Exception as e:
            failed += 1
            print(f"‚úó {desc:50} - Exception: {e}")
    
    print(f"\nEdge Cases: {passed}/{len(tests)} passed")
    return passed, failed


def test_batch_detection():
    """Test batch detection"""
    print("\n" + "="*70)
    print("TEST SUITE 6: Batch Detection")
    print("="*70)
    
    answers = [
        "The answer is 42.",
        "Data not available.",
        "I don't know",
        "$100 million",
        "Cannot be calculated without specific data.",
        "Yes. It decreased.",
        "N/A",
        "The ratio is 1.57",
    ]
    
    result = batch_detect_refusal(answers)
    
    print(f"Total answers: {result['total']}")
    print(f"Refusals: {result['refusal_count']}")
    print(f"  - Explicit: {result['explicit_count']}")
    print(f"  - Vague: {result['vague_count']}")
    print(f"Non-refusals: {result['non_refusal_count']}")
    print(f"Refusal rate: {result['refusal_rate']:.1f}%")
    
    # Expected: 4 refusals (Data not available, I don't know, Cannot be calculated, N/A)
    expected_refusals = 4
    expected_rate = 4/8 * 100
    
    if result['refusal_count'] == expected_refusals:
        print(f"\n‚úì Batch detection working correctly")
        return 1, 0
    else:
        print(f"\n‚úó Batch detection failed")
        print(f"  Expected {expected_refusals} refusals, got {result['refusal_count']}")
        return 0, 1


def test_categorization():
    """Test categorization by refusal"""
    print("\n" + "="*70)
    print("TEST SUITE 7: Categorization by Refusal")
    print("="*70)
    
    answers = [
        "42",
        "Data not available",
        "100",
        "I don't know",
        "The answer is 7.5"
    ]
    
    labels = ["Q1", "Q2", "Q3", "Q4", "Q5"]
    
    result = categorize_by_refusal(answers, labels)
    
    print(f"Refusals: {result['refusals']}")
    print(f"Non-refusals: {result['non_refusals']}")
    
    # Expected: Q2 and Q4 are refusals
    expected_refusals = ["Q2", "Q4"]
    expected_non_refusals = ["Q1", "Q3", "Q5"]
    
    if result['refusals'] == expected_refusals and result['non_refusals'] == expected_non_refusals:
        print(f"\n‚úì Categorization working correctly")
        return 1, 0
    else:
        print(f"\n‚úó Categorization failed")
        print(f"  Expected refusals: {expected_refusals}")
        print(f"  Expected non-refusals: {expected_non_refusals}")
        return 0, 1


def test_statistics_by_mode():
    """Test statistics across different modes"""
    print("\n" + "="*70)
    print("TEST SUITE 8: Statistics by Mode")
    print("="*70)
    
    answers_by_mode = {
        'closed-book': [
            "I don't know",
            "Data not available",
            "42",
            "Cannot determine without data",
            "Approximately 100"
        ],
        'oracle': [
            "42",
            "100",
            "The answer is 7.5",
            "Yes, it increased",
            "1.57"
        ]
    }
    
    stats = get_refusal_statistics(answers_by_mode)
    
    print("\nClosed-Book Mode:")
    print(f"  Refusal rate: {stats['closed-book']['refusal_rate']:.1f}%")
    print(f"  Refusals: {stats['closed-book']['refusal_count']}/{stats['closed-book']['total']}")
    
    print("\nOracle Mode:")
    print(f"  Refusal rate: {stats['oracle']['refusal_rate']:.1f}%")
    print(f"  Refusals: {stats['oracle']['refusal_count']}/{stats['oracle']['total']}")
    
    # Expected: closed-book should have higher refusal rate
    if stats['closed-book']['refusal_rate'] > stats['oracle']['refusal_rate']:
        print(f"\n‚úì Statistics by mode working correctly")
        print(f"  (Closed-book has higher refusal rate as expected)")
        return 1, 0
    else:
        print(f"\n‚úó Statistics unexpected")
        return 0, 1


def run_all_tests():
    """Run all test suites"""
    print("\n" + "="*70)
    print("COMPREHENSIVE TEST SUITE FOR detect_refusal()")
    print("="*70)
    
    total_passed = 0
    total_failed = 0
    
    test_suites = [
        test_explicit_refusals,
        test_vague_short_answers,
        test_valid_short_answers,
        test_real_financebench_examples,
        test_edge_cases,
        test_batch_detection,
        test_categorization,
        test_statistics_by_mode,
    ]
    
    for test_func in test_suites:
        passed, failed = test_func()
        total_passed += passed
        total_failed += failed
    
    print("\n" + "="*70)
    print("FINAL SUMMARY")
    print("="*70)
    print(f"Total Tests: {total_passed + total_failed}")
    print(f"‚úì Passed: {total_passed}")
    print(f"‚úó Failed: {total_failed}")
    print(f"Success Rate: {100 * total_passed / (total_passed + total_failed):.1f}%")
    print("="*70)
    
    return total_failed == 0


if __name__ == "__main__":
    success = run_all_tests()
    
    if success:
        print("\nüéâ ALL TESTS PASSED! üéâ")
    else:
        print("\n‚ö†Ô∏è  SOME TESTS FAILED - Review output above")


COMPREHENSIVE TEST SUITE FOR detect_refusal()

TEST SUITE 1: Explicit Refusal Patterns
‚úì Direct 'cannot'                                    [explicit]
‚úì Don't know                                         [explicit]
‚úì Unable to                                          [explicit]
‚úì Could not                                          [explicit]
‚úì Data not available                                 [explicit]
‚úì Information unavailable                            [explicit]
‚úì Data unavailable (article)                         [explicit]
‚úì No data available                                  [explicit]
‚úì Cannot calculate                                   [explicit]
‚úì Cannot determine                                   [explicit]
‚úì Unable to calculate                                [explicit]
‚úì Cannot compute                                     [explicit]
‚úì Insufficient info                                  [explicit]
‚úì Not enough data                                   

In [3]:
# Single answer check
result = detect_refusal("Data not available.")

if result['is_refusal']:
    print(f"Refusal detected!")
    print(f"Type: {result['refusal_type']}")
    print(f"Confidence: {result['confidence']}")
else:
    print("Valid answer")

# Batch processing
answers = [
    "The answer is 42.",
    "Data not available.",
    "I don't know",
    "$100 million",
]

batch_result = batch_detect_refusal(answers)
print(f"Refusal rate: {batch_result['refusal_rate']:.1f}%")
print(f"Explicit refusals: {batch_result['explicit_count']}")
print(f"Vague refusals: {batch_result['vague_count']}")

# Compare across modes
answers_by_mode = {
    'closed-book': [
        "I don't know",
        "Data not available",
        "42"
    ],
    'oracle': [
        "42",
        "100",
        "The answer is 7.5"
    ]
}

stats = get_refusal_statistics(answers_by_mode)
print(f"Closed-book refusal rate: {stats['closed-book']['refusal_rate']:.1f}%")
print(f"Oracle refusal rate: {stats['oracle']['refusal_rate']:.1f}%")

# Categorize by refusal
answers = ["42", "Data not available", "100", "I don't know"]
labels = ["Q1", "Q2", "Q3", "Q4"]

categories = categorize_by_refusal(answers, labels)
print(f"Questions with refusals: {categories['refusals']}")  # ['Q2', 'Q4']

Refusal detected!
Type: explicit
Confidence: 1.0
Refusal rate: 50.0%
Explicit refusals: 2
Vague refusals: 0
Closed-book refusal rate: 66.7%
Oracle refusal rate: 0.0%
Questions with refusals: ['Q2', 'Q4']
