In [10]:
"""
FinanceBench Generation Evaluation Pipeline
===========================================

This script evaluates generated answers from RAG experiments using multiple metrics
based on question types (metrics-generated, novel-generated, domain-relevant).
"""

import os
import sys
import json
import time
from pathlib import Path
from typing import Dict, List, Any, Optional
from datetime import datetime

# Progress tracking
from dotenv import load_dotenv
from tqdm import tqdm

# ============================================================================
# Load Environment Variables
# ============================================================================
# Load variables from a local .env file (if any) into the process environment
# before anything below reads them.
load_dotenv()
# None when the variable is not set. Not referenced again in the visible part
# of this file — presumably consumed by the LLM-as-judge helpers; verify.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# ============================================================================
# IMPORTANT: Update these paths to match your directory structure
# ============================================================================

# Directory containing generated answer JSON files.
# Relative path: resolved against the current working directory at run time.
GENERATION_PATH = "../../generation_set/closedbook_oracle_sets"

# Directory where evaluation result JSON files will be saved.
# Relative path: resolved against the current working directory at run time.
EVALUATION_PATH = "../../evaluation_results/generation"

# ============================================================================
# Evaluation Configuration (Hardcoded - modify as needed)
# ============================================================================

# Settings handed to the evaluation routines; verify_configuration() prints
# each of these four keys.
EVALUATION_CONFIG = {
    'tolerance': 0.05,              # 5% tolerance for numerical matching
    'llm_model': 'gpt-4o-mini',     # Model for LLM-as-judge evaluation
    'temperature': 0.0,              # Temperature for LLM calls
    'return_details': True           # Return full evaluation details
}

# API Rate Limiting
CALL_DELAY = 0.5        # Seconds to wait between each query evaluation
RETRY_DELAY = 10        # Seconds to wait before retrying after API error
MAX_RETRIES = 3         # Maximum number of retry attempts

# ============================================================================
# Evaluation File Configurations
# ============================================================================
# Define all 9 experimental configurations to process

# Question types paired with the prefix used in their prompt-template alias.
_QUESTION_TYPE_PREFIXES = (
    ('metrics-generated', 'metrics'),
    ('novel-generated', 'novel'),
    ('domain-relevant', 'domain'),
)

# Experiment modes paired with the template family they use. Oracle runs
# share the RAG templates, so both map to the 'rag' family.
_MODE_TEMPLATE_FAMILIES = (
    ('closed_book', 'closed'),
    ('rag', 'rag'),
    ('oracle', 'rag'),
)

# One entry per (mode, question type) pair: 3 modes x 3 question types,
# ordered mode-major (closed_book, rag, oracle), each with the three question
# types in the order listed above. Provider, model and temperature are the
# same for every run.
EVALUATION_CONFIGS = [
    {
        'mode': mode,
        'question_type': question_type,
        'provider': 'openai',
        'model': 'gpt-4o-mini',
        'temperature': '0.0',
        'template_alias': f"{prefix}_{family}_basic",
    }
    for mode, family in _MODE_TEMPLATE_FAMILIES
    for question_type, prefix in _QUESTION_TYPE_PREFIXES
]

# ============================================================================
# Import Evaluation Functions
# ============================================================================

# Track whether evaluation functions are available
# Flag describing whether the evaluation suite could be imported. It starts
# False and is flipped to True only when every import below succeeds.
EVAL_FUNCTIONS_AVAILABLE = False

try:
    # Import your evaluation suite functions
    # (all six come from the project-local `generation_evaluation_suit` package)
    from generation_evaluation_suit.evaluate_answer import evaluate_answer
    from generation_evaluation_suit.detect_refusal import detect_refusal
    from generation_evaluation_suit.numerical_exact_match import numerical_exact_match
    from generation_evaluation_suit.token_f1 import token_f1
    from generation_evaluation_suit.llm_as_judge_binary import llm_as_judge_binary
    from generation_evaluation_suit.llm_as_judge_graded import llm_as_judge_graded
    
    EVAL_FUNCTIONS_AVAILABLE = True
    print("‚úì Successfully imported evaluation functions")
except ImportError as e:
    # A failed import is reported but deliberately not re-raised, so the
    # configuration/self-test cells can still run without the suite.
    # Names imported before the failing line remain bound; later ones do not.
    print(f"‚ö†Ô∏è  Note: Evaluation functions not yet imported")
    print(f"   (This is fine for Step 1 configuration testing)")
    print(f"   Error: {e}")
    print(f"\n   When ready to run full evaluation, ensure these modules are available:")
    print("     - evaluate_answer.py")
    print("     - detect_refusal.py")
    print("     - numerical_exact_match.py")
    print("     - token_f1.py")
    print("     - llm_as_judge_binary.py")
    print("     - llm_as_judge_graded.py")

# ============================================================================
# Configuration Verification
# ============================================================================

def verify_configuration():
    """Print a human-readable summary of the module-level configuration.

    Reports, in order:
      * GENERATION_PATH and EVALUATION_PATH, with a warning when either one
        still holds its template placeholder value;
      * the four EVALUATION_CONFIG settings;
      * the rate-limiting constants (CALL_DELAY, RETRY_DELAY, MAX_RETRIES);
      * one line per entry of EVALUATION_CONFIGS.

    The function only prints. It returns None and does not check that the
    configured directories actually exist.
    """
    separator = "=" * 70
    # Derive the denominator from the list itself so the "[i/N]" labels stay
    # correct when configurations are added or removed.
    total = len(EVALUATION_CONFIGS)

    print("\n" + separator)
    print("CONFIGURATION VERIFICATION")
    print(separator)

    # Check paths
    print("\nüìÅ Paths:")
    print(f"   Generation Path: {GENERATION_PATH}")
    print(f"   Evaluation Path: {EVALUATION_PATH}")

    if GENERATION_PATH == "/path/to/generated_answers":
        print("   ‚ö†Ô∏è  WARNING: GENERATION_PATH is still set to placeholder!")
        print("   ‚ö†Ô∏è  Please update GENERATION_PATH to your actual directory")

    if EVALUATION_PATH == "/path/to/evaluation_results":
        print("   ‚ö†Ô∏è  WARNING: EVALUATION_PATH is still set to placeholder!")
        print("   ‚ö†Ô∏è  Please update EVALUATION_PATH to your actual directory")

    # Check evaluation config
    print("\n‚öôÔ∏è  Evaluation Configuration:")
    print(f"   Tolerance: {EVALUATION_CONFIG['tolerance']} ({EVALUATION_CONFIG['tolerance']*100}%)")
    print(f"   LLM Model: {EVALUATION_CONFIG['llm_model']}")
    print(f"   Temperature: {EVALUATION_CONFIG['temperature']}")
    print(f"   Return Details: {EVALUATION_CONFIG['return_details']}")

    # Check rate limiting
    print("\n‚è±Ô∏è  Rate Limiting:")
    print(f"   Call Delay: {CALL_DELAY}s between queries")
    print(f"   Retry Delay: {RETRY_DELAY}s after API error")
    print(f"   Max Retries: {MAX_RETRIES} attempts")

    # Check configurations
    print("\nüìã Evaluation Configurations:")
    print(f"   Total files to process: {total}")
    print("\n   Configurations:")

    for i, config in enumerate(EVALUATION_CONFIGS, 1):
        mode = config['mode']
        qtype = config['question_type']
        print(f"   [{i}/{total}] {mode:12s} | {qtype}")

    print("\n" + separator)
    print("‚úì Configuration loaded successfully!")
    print(separator)

# ============================================================================
# Main Test Function
# ============================================================================

def test_step1():
    """Run the Step 1 check: print a banner, the configuration report, and
    the follow-up checklist.

    Delegates the actual report to verify_configuration(); this function only
    adds the surrounding banner and the "next steps" list. Returns None.
    """
    rule = "=" * 70

    for banner_line in ("\n" + rule, "STEP 1: PROJECT SETUP & CONFIGURATION", rule):
        print(banner_line)

    # Verify configuration
    verify_configuration()

    closing_lines = (
        "\n‚úÖ Step 1 Complete!",
        "\nNext steps:",
        "  1. Update GENERATION_PATH to your actual directory",
        "  2. Update EVALUATION_PATH to your actual directory",
        "  3. Verify the 9 EVALUATION_CONFIGS match your file naming",
        "  4. Run this script again to confirm configuration",
        "  5. Report back for approval to proceed to Step 2",
        "\n" + rule,
    )
    for closing_line in closing_lines:
        print(closing_line)

# ============================================================================
# Entry Point
# ============================================================================

# Run the Step 1 configuration check only when this code executes as the
# top-level program (script or notebook cell), not when it is imported.
if __name__ == "__main__":
    test_step1()

✓ Successfully imported evaluation functions

STEP 1: PROJECT SETUP & CONFIGURATION

CONFIGURATION VERIFICATION

📁 Paths:
   Generation Path: ../../generation_set/closedbook_oracle_sets
   Evaluation Path: ../../evaluation_results/generation

⚙️  Evaluation Configuration:
   Tolerance: 0.05 (5.0%)
   LLM Model: gpt-4o-mini
   Temperature: 0.0
   Return Details: True

⏱️  Rate Limiting:
   Call Delay: 0.5s between queries
   Retry Delay: 10s after API error
   Max Retries: 3 attempts

📋 Evaluation Configurations:
   Total files to process: 9

   Configurations:
   [1/9] closed_book  | metrics-generated
   [2/9] closed_book  | novel-generated
   [3/9] closed_book  | domain-relevant
   [4/9] rag          | metrics-generated
   [5/9] rag          | novel-generated
   [6/9] rag          | domain-relevant
   [7/9] oracle       | metrics-generated
   [8/9] oracle       | novel-generated
   [9/9] oracle       | domain-relevant

✓ Configuration loaded successfully!

✅ Step 1

In [2]:
# ============================================================================
# STEP 2: File Path Management Functions
# ============================================================================

def get_generation_filename(config: Dict[str, str]) -> str:
    """
    Build the file name of a generation result from its experiment config.

    The six config values are joined with underscores in a fixed order and
    given a ``.json`` extension:

        {mode}_{question_type}_{provider}_{model}_{temperature}_{template_alias}.json

    Args:
        config: Mapping that provides the keys mode, question_type, provider,
                model, temperature and template_alias.

    Returns:
        The bare file name (no directory component).

    Raises:
        KeyError: If any of the six keys is absent from ``config``.

    Example:
        >>> get_generation_filename({
        ...     'mode': 'closed_book',
        ...     'question_type': 'metrics-generated',
        ...     'provider': 'openai',
        ...     'model': 'gpt-4o-mini',
        ...     'temperature': '0.0',
        ...     'template_alias': 'metrics_closed_basic',
        ... })
        'closed_book_metrics-generated_openai_gpt-4o-mini_0.0_metrics_closed_basic.json'
    """
    ordered_keys = (
        'mode',
        'question_type',
        'provider',
        'model',
        'temperature',
        'template_alias',
    )
    # Each value goes through f-string formatting so non-string values
    # (e.g. a float temperature) are rendered the same way as before.
    stem = "_".join(f"{config[key]}" for key in ordered_keys)
    return f"{stem}.json"


def get_evaluation_filename(config: Dict[str, str]) -> str:
    """
    Build the file name of an evaluation result from its experiment config.

    Same layout as the generation file name, prefixed with ``evaluation_``:

        evaluation_{mode}_{question_type}_{provider}_{model}_{temperature}_{template_alias}.json

    Args:
        config: Mapping that provides the keys mode, question_type, provider,
                model, temperature and template_alias.

    Returns:
        The bare file name (no directory component).

    Raises:
        KeyError: If any of the six keys is absent from ``config``.

    Example:
        >>> get_evaluation_filename({
        ...     'mode': 'closed_book',
        ...     'question_type': 'metrics-generated',
        ...     'provider': 'openai',
        ...     'model': 'gpt-4o-mini',
        ...     'temperature': '0.0',
        ...     'template_alias': 'metrics_closed_basic',
        ... })
        'evaluation_closed_book_metrics-generated_openai_gpt-4o-mini_0.0_metrics_closed_basic.json'
    """
    ordered_keys = (
        'mode',
        'question_type',
        'provider',
        'model',
        'temperature',
        'template_alias',
    )
    # The literal prefix is the first segment; the config values follow.
    segments = ["evaluation"]
    for key in ordered_keys:
        segments.append(f"{config[key]}")
    return "_".join(segments) + ".json"


def get_generation_filepath(config: Dict[str, str]) -> str:
    """
    Return the path of the generation file for ``config``.

    The file name produced by get_generation_filename() is joined onto the
    module-level GENERATION_PATH directory. No file-system access happens
    here; the path is not checked for existence.

    Args:
        config: Experiment configuration dictionary.

    Returns:
        GENERATION_PATH joined with the generation file name.

    Example:
        >>> config = {'mode': 'closed_book', ...}
        >>> get_generation_filepath(config)
        '<GENERATION_PATH>/closed_book_metrics-generated_openai_gpt-4o-mini_0.0_metrics_closed_basic.json'
    """
    return os.path.join(GENERATION_PATH, get_generation_filename(config))


def get_evaluation_filepath(config: Dict[str, str]) -> str:
    """
    Return the path of the evaluation file for ``config``.

    The file name produced by get_evaluation_filename() is joined onto the
    module-level EVALUATION_PATH directory. No file-system access happens
    here; the path is not checked for existence.

    Args:
        config: Experiment configuration dictionary.

    Returns:
        EVALUATION_PATH joined with the evaluation file name.

    Example:
        >>> config = {'mode': 'closed_book', ...}
        >>> get_evaluation_filepath(config)
        '<EVALUATION_PATH>/evaluation_closed_book_metrics-generated_openai_gpt-4o-mini_0.0_metrics_closed_basic.json'
    """
    return os.path.join(EVALUATION_PATH, get_evaluation_filename(config))


def check_file_exists(filepath: str) -> bool:
    """
    Tell whether ``filepath`` names an existing regular file.

    Directories and missing paths both yield False.

    Args:
        filepath: Path to test.

    Returns:
        True when the path exists and is a file, False otherwise.

    Example:
        >>> check_file_exists('/path/to/file.json')
        True
    """
    # isfile() is already False for paths that do not exist, so it covers
    # both halves of the "exists and is a file" condition.
    return os.path.isfile(filepath)


def get_file_info(filepath: str) -> Dict[str, Any]:
    """
    Describe a file: whether it exists and, if so, how large it is.

    Args:
        filepath: Path to inspect.

    Returns:
        ``{'exists': False}`` when check_file_exists() rejects the path.
        Otherwise a dictionary with:
            - exists: True
            - size_bytes: size reported by os.path.getsize
            - size_kb: size_bytes / 1024, rounded to 2 decimals
            - size_mb: size_bytes / 1024**2, rounded to 3 decimals

    Example:
        >>> get_file_info('/path/to/file.json')
        {'exists': True, 'size_bytes': 1024, 'size_kb': 1.0, 'size_mb': 0.001}
    """
    if not check_file_exists(filepath):
        return {'exists': False}

    byte_count = os.path.getsize(filepath)
    return {
        'exists': True,
        'size_bytes': byte_count,
        'size_kb': round(byte_count / 1024, 2),
        'size_mb': round(byte_count / (1024 * 1024), 3),
    }

# ============================================================================
def test_step2():
    """Exercise the Step 2 path helpers and print a report.

    First walks through the helpers for the first entry of EVALUATION_CONFIGS
    (file names, full paths, existence, sizes), then lists the generation and
    evaluation file status of every configuration, and finally prints how
    many of each kind were found.

    The number of configurations is taken from EVALUATION_CONFIGS rather than
    being hard-coded, and the summary counts are gathered while listing the
    files so no path is checked a second time. The function only prints and
    returns None; if EVALUATION_CONFIGS is empty it reports that and stops.
    """
    separator = "=" * 70
    total = len(EVALUATION_CONFIGS)

    print("\n" + separator)
    print("STEP 2: FILE PATH MANAGEMENT")
    print(separator)

    # Nothing to index into below when no configuration is defined.
    if total == 0:
        print("\nNo evaluation configurations defined; nothing to test.")
        return

    # Test with first configuration (closed_book metrics-generated)
    test_config = EVALUATION_CONFIGS[0]

    print("\nüß™ Testing with configuration:")
    print(f"   Mode: {test_config['mode']}")
    print(f"   Question Type: {test_config['question_type']}")
    print(f"   Provider: {test_config['provider']}")
    print(f"   Model: {test_config['model']}")
    print(f"   Temperature: {test_config['temperature']}")
    print(f"   Template: {test_config['template_alias']}")

    # Test filename generation
    print("\nüìù Testing filename generation:")
    gen_filename = get_generation_filename(test_config)
    eval_filename = get_evaluation_filename(test_config)
    print(f"   Generation: {gen_filename}")
    print(f"   Evaluation: {eval_filename}")

    # Test filepath generation
    print("\nüìÅ Testing filepath generation:")
    gen_filepath = get_generation_filepath(test_config)
    eval_filepath = get_evaluation_filepath(test_config)
    print(f"   Generation: {gen_filepath}")
    print(f"   Evaluation: {eval_filepath}")

    # Test file existence
    print("\nüîç Checking file existence:")
    gen_exists = check_file_exists(gen_filepath)
    eval_exists = check_file_exists(eval_filepath)
    print(f"   Generation file exists: {gen_exists}")
    print(f"   Evaluation file exists: {eval_exists}")

    # Get file info if exists
    if gen_exists:
        gen_info = get_file_info(gen_filepath)
        print(f"   Generation file size: {gen_info['size_kb']} KB")

    if eval_exists:
        eval_info = get_file_info(eval_filepath)
        print(f"   Evaluation file size: {eval_info['size_kb']} KB")

    # Test every configuration
    print("\n" + separator)
    print(f"üìã Testing all {total} configurations:")
    print(separator)

    # Tallied during the listing so the summary does not re-check each path.
    gen_count = 0
    eval_count = 0

    for i, config in enumerate(EVALUATION_CONFIGS, 1):
        gen_filepath = get_generation_filepath(config)
        eval_filepath = get_evaluation_filepath(config)
        gen_exists = check_file_exists(gen_filepath)
        eval_exists = check_file_exists(eval_filepath)
        gen_count += gen_exists
        eval_count += eval_exists

        mode = config['mode']
        qtype = config['question_type']

        print(f"\n[{i}/{total}] {mode:12s} | {qtype}")
        print(f"   Generation file: {os.path.basename(gen_filepath)}")
        print(f"   ‚îî‚îÄ Exists: {'‚úì' if gen_exists else '‚úó'}")

        if gen_exists:
            info = get_file_info(gen_filepath)
            print(f"   ‚îî‚îÄ Size: {info['size_kb']} KB")

        print(f"   Evaluation file: {os.path.basename(eval_filepath)}")
        print(f"   ‚îî‚îÄ Exists: {'‚úì' if eval_exists else '‚úó'}")

        if eval_exists:
            info = get_file_info(eval_filepath)
            print(f"   ‚îî‚îÄ Size: {info['size_kb']} KB")

    # Summary
    print("\n" + separator)

    print("\nüìä Summary:")
    print(f"   Total configurations: {total}")
    print(f"   Generation files found: {gen_count}/{total}")
    print(f"   Evaluation files found: {eval_count}/{total}")

    if gen_count == 0:
        print("\n‚ö†Ô∏è  WARNING: No generation files found!")
        print("   Please check:")
        print(f"   1. GENERATION_PATH is correct: {GENERATION_PATH}")
        print("   2. Files exist in that directory")
        print("   3. Filenames match the expected pattern")
    elif gen_count < total:
        print(f"\n‚ö†Ô∏è  WARNING: Only {gen_count}/{total} generation files found")
        print("   Some configurations are missing generation files")
    else:
        print("\n‚úì All generation files found!")

    print("\n" + separator)
    print("‚úÖ Step 2 Complete!")
    print(separator)
    print("\nStep 2 Functions Implemented:")
    print("  ‚úì get_generation_filename(config)")
    print("  ‚úì get_evaluation_filename(config)")
    print("  ‚úì get_generation_filepath(config)")
    print("  ‚úì get_evaluation_filepath(config)")
    print("  ‚úì check_file_exists(filepath)")
    print("  ‚úì get_file_info(filepath)")

    print("\nNext steps:")
    print("  1. Verify filenames match your actual files")
    print("  2. Confirm generation files are detected correctly")
    print("  3. Report back for approval to proceed to Step 3")
    print("\n" + separator)


# Run the Step 2 self-test as soon as this cell executes.
test_step2()



STEP 2: FILE PATH MANAGEMENT

🧪 Testing with configuration:
   Mode: closed_book
   Question Type: metrics-generated
   Provider: openai
   Model: gpt-4o-mini
   Temperature: 0.0
   Template: metrics_closed_basic

📝 Testing filename generation:
   Generation: closed_book_metrics-generated_openai_gpt-4o-mini_0.0_metrics_closed_basic.json
   Evaluation: evaluation_closed_book_metrics-generated_openai_gpt-4o-mini_0.0_metrics_closed_basic.json

📁 Testing filepath generation:
   Generation: ../../generation_set/closedbook_oracle_sets/closed_book_metrics-generated_openai_gpt-4o-mini_0.0_metrics_closed_basic.json
   Evaluation: ../../evaluation_results/generation/evaluation_closed_book_metrics-generated_openai_gpt-4o-mini_0.0_metrics_closed_basic.json

🔍 Checking file existence:
   Generation file exists: True
   Evaluation file exists: True
   Generation file size: 402.86 KB
   Evaluation file size: 176.38 KB

📋 Testing all 9 configurations:

[1/9] closed_book  | metrics-gene

In [3]:
# ============================================================================
# STEP 3: JSON File Reader
# ============================================================================

def read_generation_file(filepath: str) -> dict:
    """
    Read a generation JSON file, validate its structure, and return it.

    Validation performed, in order:
      1. ``filepath`` is an existing regular file;
      2. the content parses as JSON;
      3. the top level is a JSON object containing 'metadata' and 'queries';
      4. 'metadata' is an object with mode, question_type, provider, model,
         temperature and total_questions;
      5. 'queries' is a non-empty list;
      6. every query is an object with financebench_id, question, answer,
         generated_answer and question_type, and both 'answer' and
         'generated_answer' are strings.

    Structural problems always surface as ValueError, including the cases
    where the root, the metadata, or a query has the wrong JSON type.

    Args:
        filepath: Full path to generation JSON file

    Returns:
        Dictionary with parsed JSON data (unmodified).

    Raises:
        FileNotFoundError: If no regular file exists at ``filepath``
        json.JSONDecodeError: If file is not valid JSON
        ValueError: If the structure is wrong or required fields are missing

    Example:
        >>> data = read_generation_file('/path/to/closed_book_metrics-generated_...')
        >>> print(f"Mode: {data['metadata']['mode']}")
        >>> print(f"Questions: {len(data['queries'])}")
    """
    # isfile() rather than exists(): a directory at this path should produce
    # the same guidance instead of failing later inside open().
    if not os.path.isfile(filepath):
        raise FileNotFoundError(
            f"Generation file not found: {filepath}\n"
            f"Please check:\n"
            f"  1. GENERATION_PATH is correct\n"
            f"  2. File exists in the directory\n"
            f"  3. Filename matches the expected pattern"
        )

    # Read and parse JSON
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except json.JSONDecodeError as e:
        # Use e.msg (not str(e)): JSONDecodeError appends the line/column
        # itself, so embedding str(e) would repeat the position.
        raise json.JSONDecodeError(
            f"Failed to parse JSON file: {filepath}\n"
            f"Error: {e.msg}\n"
            f"The file may be corrupted or not valid JSON.",
            e.doc, e.pos
        ) from e

    # The root must be an object before membership tests are meaningful
    # (a number would raise TypeError, a string would do substring matching).
    if not isinstance(data, dict):
        raise ValueError(
            f"Generation file top level must be a JSON object, "
            f"got {type(data).__name__}\n"
            f"File: {filepath}"
        )

    # Validate required top-level fields
    required_top_fields = ['metadata', 'queries']
    missing_top = [field for field in required_top_fields if field not in data]
    if missing_top:
        raise ValueError(
            f"Generation file missing required top-level fields: {missing_top}\n"
            f"File: {filepath}\n"
            f"Required fields: {required_top_fields}"
        )

    # Validate metadata fields
    required_metadata_fields = ['mode', 'question_type', 'provider', 'model',
                                'temperature', 'total_questions']
    metadata = data['metadata']
    if not isinstance(metadata, dict):
        raise ValueError(
            f"Field 'metadata' must be an object, got {type(metadata).__name__}\n"
            f"File: {filepath}"
        )
    missing_metadata = [field for field in required_metadata_fields
                        if field not in metadata]
    if missing_metadata:
        raise ValueError(
            f"Generation file metadata missing required fields: {missing_metadata}\n"
            f"File: {filepath}\n"
            f"Required metadata fields: {required_metadata_fields}"
        )

    # Validate queries is a list
    queries = data['queries']
    if not isinstance(queries, list):
        raise ValueError(
            f"Field 'queries' must be a list, got {type(queries).__name__}\n"
            f"File: {filepath}"
        )

    # Validate queries is not empty
    if not queries:
        raise ValueError(
            f"Field 'queries' is empty (no questions to evaluate)\n"
            f"File: {filepath}"
        )

    # Validate each query has required fields
    required_query_fields = ['financebench_id', 'question', 'answer',
                             'generated_answer', 'question_type']

    for i, query in enumerate(queries):
        # A non-object entry has no .get(); reject it explicitly.
        if not isinstance(query, dict):
            raise ValueError(
                f"Query #{i+1} must be an object, got {type(query).__name__}\n"
                f"File: {filepath}"
            )

        query_id = query.get('financebench_id', 'UNKNOWN')

        missing_query = [field for field in required_query_fields
                         if field not in query]
        if missing_query:
            raise ValueError(
                f"Query #{i+1} missing required fields: {missing_query}\n"
                f"File: {filepath}\n"
                f"Query ID: {query_id}\n"
                f"Required query fields: {required_query_fields}"
            )

        # Validate that answer and generated_answer are strings
        for text_field in ('answer', 'generated_answer'):
            if not isinstance(query[text_field], str):
                raise ValueError(
                    f"Query #{i+1} '{text_field}' must be a string\n"
                    f"File: {filepath}\n"
                    f"Query ID: {query_id}"
                )

    return data


# ============================================================================
# STEP 3 Test Function
# ============================================================================

def test_step3():
    """Exercise read_generation_file() and print a report.

    Uses the first entry of EVALUATION_CONFIGS. When its generation file is
    absent, prints setup hints and returns early. Otherwise reads the file,
    prints its metadata and first query, and reports any error raised while
    doing so instead of propagating it. Afterwards confirms that a
    non-existent path raises FileNotFoundError. Returns None.
    """
    rule = "=" * 70

    print("\n" + rule)
    print("STEP 3: JSON FILE READER")
    print(rule)

    # Test with first configuration
    sample_config = EVALUATION_CONFIGS[0]
    sample_path = get_generation_filepath(sample_config)

    print(f"\nüß™ Testing with: {sample_config['mode']} - {sample_config['question_type']}")
    print(f"   File: {os.path.basename(sample_path)}")

    # Without a file there is nothing to read, so explain and stop.
    if not check_file_exists(sample_path):
        print(f"\n‚ö†Ô∏è  File not found: {sample_path}")
        for hint in (
            "   Cannot test JSON reading without a file.",
            "\n   To test Step 3:",
            "   1. Ensure GENERATION_PATH points to your actual directory",
            "   2. Ensure at least one generation file exists",
            "   3. Run this test again",
        ):
            print(hint)
        return

    # Try to read the file
    print("\nüìñ Reading file...")
    try:
        payload = read_generation_file(sample_path)
        print("   ‚úì File read successfully!")

        # Display metadata
        meta = payload['metadata']
        print("\nüìã Metadata:")
        for label, key in (
            ("Mode", 'mode'),
            ("Question Type", 'question_type'),
            ("Provider", 'provider'),
            ("Model", 'model'),
            ("Temperature", 'temperature'),
            ("Total Questions", 'total_questions'),
        ):
            print(f"   {label}: {meta[key]}")

        # Display queries info
        entries = payload['queries']
        print("\nüìù Queries:")
        print(f"   Count: {len(entries)}")

        # Show first query (text fields are cut to a fixed preview length)
        if entries:
            head = entries[0]
            print("\n   First Query:")
            print(f"   ‚îú‚îÄ ID: {head['financebench_id']}")
            print(f"   ‚îú‚îÄ Question: {head['question'][:80]}...")
            print(f"   ‚îú‚îÄ Gold Answer: {head['answer'][:60]}...")
            print(f"   ‚îî‚îÄ Generated Answer: {head['generated_answer'][:60]}...")

        # Validation summary
        print("\n‚úÖ Validation Passed:")
        for passed in (
            "   ‚úì All required top-level fields present",
            "   ‚úì All required metadata fields present",
            f"   ‚úì Queries is a list with {len(entries)} items",
            "   ‚úì All queries have required fields",
            "   ‚úì All answers are strings",
        ):
            print(passed)

    except Exception as err:
        # Most specific type first: JSONDecodeError is itself a ValueError.
        if isinstance(err, FileNotFoundError):
            headline = "\n‚úó Error: File not found"
        elif isinstance(err, json.JSONDecodeError):
            headline = "\n‚úó Error: Invalid JSON"
        elif isinstance(err, ValueError):
            headline = "\n‚úó Error: Validation failed"
        else:
            headline = f"\n‚úó Unexpected error: {type(err).__name__}"
        print(headline)
        print(f"   {str(err)}")

    print("\n" + rule)

    # Test with non-existent file
    print("\nüß™ Testing error handling with non-existent file...")
    bogus_path = "/path/to/nonexistent/file.json"
    try:
        read_generation_file(bogus_path)
    except FileNotFoundError:
        print("   ‚úì Correctly raised FileNotFoundError")
    else:
        print("   ‚úó Should have raised FileNotFoundError!")

    print("\n" + rule)
    print("‚úÖ Step 3 Complete!")
    print(rule)
    for closing_line in (
        "\nStep 3 Function Implemented:",
        "  ‚úì read_generation_file(filepath)",
        "\nFeatures:",
        "  ‚úì File existence check",
        "  ‚úì JSON parsing",
        "  ‚úì Structure validation",
        "  ‚úì Required fields validation",
        "  ‚úì Data type validation",
        "  ‚úì Comprehensive error messages",
        "\n" + rule,
    ):
        print(closing_line)


# Run the Step 3 self-test as soon as this cell executes.
test_step3()


STEP 3: JSON FILE READER

🧪 Testing with: closed_book - metrics-generated
   File: closed_book_metrics-generated_openai_gpt-4o-mini_0.0_metrics_closed_basic.json

📖 Reading file...
   ✓ File read successfully!

📋 Metadata:
   Mode: closed_book
   Question Type: metrics-generated
   Provider: openai
   Model: gpt-4o-mini
   Temperature: 0.0
   Total Questions: 50

📝 Queries:
   Count: 50

   First Query:
   ├─ ID: financebench_id_03029
   ├─ Question: What is the FY2018 capital expenditure amount (in USD millions) for 3M? Give a r...
   ├─ Gold Answer: $1577.00...
   └─ Generated Answer: $1,700 million...

✅ Validation Passed:
   ✓ All required top-level fields present
   ✓ All required metadata fields present
   ✓ Queries is a list with 50 items
   ✓ All queries have required fields
   ✓ All answers are strings


🧪 Testing error handling with non-existent file...
   ✓ Correctly raised FileNotFoundError

✅ Step 3 Complete!

Step 3 Functi

In [4]:
# ============================================================================
# STEP 4: Single Query Evaluator
# ============================================================================

def evaluate_single_query(
    query: Dict[str, Any],
    question_type: str,
    eval_config: Dict[str, Any],
    retry_delay: int = 10,
    max_retries: int = 3
) -> Dict[str, Any]:
    """
    Evaluate a single query using evaluate_answer() with retry logic.

    An attempt is retried only when the error text looks like an API problem
    (it contains one of: 'rate limit', 'api', 'timeout', 'connection',
    'openai') and attempts remain. Any other error aborts immediately.

    Args:
        query: Query dictionary with keys: financebench_id, question, answer,
               generated_answer
        question_type: Question type for routing ('metrics-generated',
                      'novel-generated', 'domain-relevant')
        eval_config: Evaluation configuration dictionary, passed through to
                     evaluate_answer() as ``config``
        retry_delay: Seconds to wait before retrying after API error (default: 10)
        max_retries: Maximum number of attempts (default: 3)

    Returns:
        Dictionary containing:
            - financebench_id: Query ID
            - question: The question
            - gold_answer: Gold standard answer
            - generated_answer: Generated answer
            - question_type: Question type
            - evaluation: Full evaluation result from evaluate_answer()
            - evaluation_success: Always True (failures raise instead)
            - retry_count: Number of failed attempts before the successful one

    Raises:
        KeyError: If ``query`` lacks one of the required keys
        RuntimeError: If evaluation fails (non-retryable error, or retries
            exhausted). RuntimeError is a subclass of Exception, so callers
            catching Exception keep working. The underlying error is chained
            as ``__cause__``.

    Example:
        >>> query = {
        ...     'financebench_id': 'test_001',
        ...     'question': 'What is the revenue?',
        ...     'answer': '$100 million',
        ...     'generated_answer': '$100M',
        ...     'question_type': 'metrics-generated'
        ... }
        >>> result = evaluate_single_query(query, 'metrics-generated', EVALUATION_CONFIG)
        >>> print(result['evaluation']['summary']['metrics_computed'])
    """
    # Extract required fields up front so a malformed query fails fast
    financebench_id = query['financebench_id']
    question = query['question']
    gold_answer = query['answer']
    generated_answer = query['generated_answer']

    # Substrings (matched case-insensitively) that mark an error as retryable
    retryable_keywords = ('rate limit', 'api', 'timeout', 'connection', 'openai')

    attempts_made = 0
    last_error = None

    for attempt in range(max_retries):
        attempts_made = attempt + 1
        try:
            # Call evaluate_answer from the evaluation suite
            evaluation_result = evaluate_answer(
                question=question,
                question_type=question_type,
                gold_answer=gold_answer,
                generated_answer=generated_answer,
                config=eval_config
            )
        except Exception as e:
            last_error = e
            error_msg = str(e)
            lowered_msg = error_msg.lower()

            is_api_error = any(keyword in lowered_msg
                               for keyword in retryable_keywords)
            out_of_attempts = attempt >= max_retries - 1

            if not is_api_error or out_of_attempts:
                # Either not an API error, or we've exhausted retries
                break

            # Log the retry
            print(f"   ‚ö†Ô∏è  API error on attempt {attempt + 1}/{max_retries}")
            print(f"      Error: {error_msg[:100]}...")
            print(f"      Retrying in {retry_delay} seconds...")
            time.sleep(retry_delay)
        else:
            # Success: every earlier attempt failed, so retries == attempt index
            return {
                'financebench_id': financebench_id,
                'question': question,
                'gold_answer': gold_answer,
                'generated_answer': generated_answer,
                'question_type': question_type,
                'evaluation': evaluation_result,
                'evaluation_success': True,
                'retry_count': attempt
            }

    # Report how many attempts actually ran, not just the configured maximum:
    # a non-retryable error aborts after the first attempt.
    error_message = (
        f"Evaluation failed for query {financebench_id} "
        f"after {attempts_made} of {max_retries} attempts.\n"
        f"Question: {question[:100]}...\n"
        f"Last error: {str(last_error)}"
    )

    # Chain the original error so its traceback is not lost
    raise RuntimeError(error_message) from last_error


# ============================================================================
# STEP 4 Test Function
# ============================================================================

def test_step4():
    """
    Test Step 4: Single query evaluation with retry logic.

    Picks the first configuration in EVALUATION_CONFIGS whose generation file
    exists, loads it, and evaluates at most the first two queries with
    evaluate_single_query(), printing a short per-query report.

    Returns early (without raising) when the evaluation suite is unavailable
    or no generation file is found.

    Raises:
        Exception: Re-raises any error from reading the file or evaluating a
            query, after printing it.
    """

    print("\n" + "="*70)
    print("STEP 4: SINGLE QUERY EVALUATOR")
    print("="*70)

    # Check if evaluation functions are available
    if not EVAL_FUNCTIONS_AVAILABLE:
        print("\n‚ö†Ô∏è  Evaluation functions not available")
        print("   Step 4 requires the evaluation suite modules:")
        print("   - evaluate_answer.py")
        print("   - detect_refusal.py")
        print("   - numerical_exact_match.py")
        print("   - token_f1.py")
        print("   - llm_as_judge_binary.py")
        print("   - llm_as_judge_graded.py")
        print("\n   Cannot test Step 4 without these modules.")
        return

    # Find a generation file to test with
    test_config = None
    test_filepath = None

    for config in EVALUATION_CONFIGS:
        filepath = get_generation_filepath(config)
        if check_file_exists(filepath):
            test_config = config
            test_filepath = filepath
            break

    if test_config is None:
        print("\n‚ö†Ô∏è  No generation files found")
        print(f"   Checked in: {GENERATION_PATH}")
        print("   Cannot test Step 4 without a generation file.")
        return

    # Read the file
    print(f"\nüß™ Testing with: {test_config['mode']} - {test_config['question_type']}")
    print(f"   File: {os.path.basename(test_filepath)}")

    try:
        data = read_generation_file(test_filepath)
        queries = data['queries']
        question_type = data['metadata']['question_type']

        print(f"   ‚úì Loaded {len(queries)} queries")

        # Test with first 2 queries
        num_test_queries = min(2, len(queries))
        print(f"\nüìù Testing evaluation on {num_test_queries} queries...")

        for i, query in enumerate(queries[:num_test_queries]):
            query_id = query['financebench_id']

            print(f"\n   Query {i+1}/{num_test_queries}: {query_id}")
            print(f"   Question: {query['question'][:60]}...")

            try:
                result = evaluate_single_query(
                    query=query,
                    question_type=question_type,
                    eval_config=EVALUATION_CONFIG,
                    retry_delay=RETRY_DELAY,
                    max_retries=MAX_RETRIES
                )

                print(f"   ‚úì Evaluation successful!")
                print(f"      Retries needed: {result['retry_count']}")

                # Show evaluation summary
                eval_summary = result['evaluation']['summary']
                print(f"      Refusal detected: {eval_summary['refusal_detected']}")
                print(f"      Metrics computed: {', '.join(eval_summary['metrics_computed'])}")

                # Show some metric results based on question type
                metrics = result['evaluation']['metrics']

                if question_type == 'metrics-generated':
                    nem = metrics.get('numerical_exact_match', {})
                    llm_bin = metrics.get('llm_as_judge_binary', {})
                    print(f"      NEM match: {nem.get('match', 'N/A')}")
                    print(f"      LLM binary match: {llm_bin.get('match', 'N/A')}")

                elif question_type == 'novel-generated':
                    f1 = metrics.get('token_f1', {})
                    llm_grade = metrics.get('llm_as_judge_graded', {})
                    # Only apply the float format to real numbers: formatting
                    # the 'N/A' fallback with ':.3f' would raise ValueError.
                    f1_value = f1.get('f1')
                    if isinstance(f1_value, (int, float)):
                        f1_text = f"{f1_value:.3f}"
                    else:
                        f1_text = 'N/A'
                    print(f"      Token F1: {f1_text}")
                    print(f"      LLM grade: {llm_grade.get('score', 'N/A')}/4")

                elif question_type == 'domain-relevant':
                    llm_grade = metrics.get('llm_as_judge_graded', {})
                    print(f"      LLM grade: {llm_grade.get('score', 'N/A')}/4")

                # Add delay between queries (simulating batch processing)
                if i < num_test_queries - 1:
                    time.sleep(CALL_DELAY)

            except Exception as e:
                print(f"   ‚úó Evaluation failed: {str(e)[:100]}...")
                raise  # Re-raise to stop the process

        print("\n" + "="*70)
        print("‚úÖ Step 4 Complete!")
        print("="*70)
        print("\nStep 4 Function Implemented:")
        print("  ‚úì evaluate_single_query(query, question_type, eval_config)")
        print("\nFeatures:")
        print("  ‚úì Calls evaluate_answer() from your suite")
        print("  ‚úì Retry logic for API errors")
        print(f"  ‚úì Max retries: {MAX_RETRIES}")
        print(f"  ‚úì Retry delay: {RETRY_DELAY}s")
        print("  ‚úì Comprehensive error messages")
        print("  ‚úì Returns structured result with evaluation details")
        print("\n" + "="*70)

    except Exception as e:
        print(f"\n‚úó Error during testing: {str(e)}")
        raise

# Test Step 4 (evaluates up to two queries from the first available generation file)
test_step4()


STEP 4: SINGLE QUERY EVALUATOR

üß™ Testing with: closed_book - metrics-generated
   File: closed_book_metrics-generated_openai_gpt-4o-mini_0.0_metrics_closed_basic.json
   ‚úì Loaded 50 queries

üìù Testing evaluation on 2 queries...

   Query 1/2: financebench_id_03029
   Question: What is the FY2018 capital expenditure amount (in USD millio...
   ‚úì Evaluation successful!
      Retries needed: 0
      Refusal detected: False
      Metrics computed: numerical_exact_match, llm_as_judge_binary
      NEM match: False
      LLM binary match: False

   Query 2/2: financebench_id_04672
   Question: Assume that you are a public equities analyst. Answer the fo...
   ‚úì Evaluation successful!
      Retries needed: 0
      Refusal detected: False
      Metrics computed: numerical_exact_match, llm_as_judge_binary
      NEM match: False
      LLM binary match: False

‚úÖ Step 4 Complete!

Step 4 Function Implemented:
  ‚úì evaluate_single_query(query, question_type, eval_config)

Features:


In [5]:
# ============================================================================
# STEP 5: Batch File Evaluator
# ============================================================================

def make_json_serializable(obj: Any) -> Any:
    """
    Recursively convert non-serializable containers to JSON-serializable types.

    Conversions:
        - set / frozenset -> list (elements converted recursively; element
          order follows set iteration order and is therefore unspecified)
        - dict            -> dict with values converted recursively
                             (keys are left untouched)
        - list            -> list with items converted recursively
        - tuple           -> tuple with items converted recursively, so that
                             sets nested inside tuples are also converted
                             (json encodes tuples as arrays)
        - anything else   -> returned unchanged

    Args:
        obj: Object to convert (dict, list, set, etc.)

    Returns:
        JSON-serializable version of the object
    """
    if isinstance(obj, dict):
        return {key: make_json_serializable(value) for key, value in obj.items()}
    if isinstance(obj, (set, frozenset, list)):
        # Recurse into elements too: a set may hold frozensets or tuples
        return [make_json_serializable(item) for item in obj]
    if isinstance(obj, tuple):
        return tuple(make_json_serializable(item) for item in obj)
    return obj

def evaluate_generation_file(
    generation_filepath: str,
    evaluation_filepath: str,
    eval_config: Dict[str, Any],
    call_delay: float = 0.5
) -> Dict[str, Any]:
    """
    Process entire generation file and save evaluation results.

    Every query is evaluated with evaluate_single_query() using the
    module-level RETRY_DELAY and MAX_RETRIES settings. Results are written
    only after all queries succeed; the file is written to a temporary
    sibling path and then moved into place, so a failure while serializing
    never leaves a truncated evaluation file behind.

    Args:
        generation_filepath: Full path to generation JSON file
        evaluation_filepath: Full path where evaluation results will be saved
        eval_config: Evaluation configuration dictionary
        call_delay: Seconds to wait between query evaluations (default: 0.5)

    Returns:
        Dictionary containing summary statistics:
            - total_queries: Total number of queries evaluated
            - successful_evaluations: Number of successful evaluations
            - failed_evaluations: Number of failed evaluations (always 0 on
              return, because the first failure aborts by raising)
            - total_time_seconds: Total processing time
            - average_time_per_query: Average time per query

    Raises:
        Exception: If any query evaluation fails (stops entire process), or
            if reading the input / writing the output fails
    """
    start_time = time.time()

    # Read generation file
    print(f"\nüìñ Reading generation file...")
    data = read_generation_file(generation_filepath)

    metadata = data['metadata']
    queries = data['queries']
    question_type = metadata['question_type']

    print(f"   ‚úì Loaded {len(queries)} queries")
    print(f"   Mode: {metadata['mode']}")
    print(f"   Question Type: {question_type}")

    # Initialize results structure
    evaluation_results = {
        'metadata': {
            'original_file': os.path.basename(generation_filepath),
            'mode': metadata['mode'],
            'question_type': metadata['question_type'],
            'provider': metadata['provider'],
            'model': metadata['model'],
            'temperature': metadata['temperature'],
            'template_alias': metadata.get('template_alias', 'N/A'),
            'evaluated_at': datetime.now().isoformat(),
            'evaluation_config': eval_config.copy(),
            'total_questions': len(queries),
            'total_evaluated': 0,
            'evaluation_success': True
        },
        'results': []
    }

    # Process queries with progress bar
    print(f"\nüîÑ Evaluating {len(queries)} queries...")

    successful_count = 0
    # Stays 0 on the success path: the first failure re-raises below
    failed_count = 0

    # Create progress bar
    progress_bar = tqdm(
        queries,
        desc=f"Processing {metadata['mode']}_{question_type}",
        unit="query",
        ncols=100
    )

    try:
        for i, query in enumerate(progress_bar):
            query_id = query['financebench_id']

            # Update progress bar description with current query
            progress_bar.set_postfix_str(f"ID: {query_id}")

            try:
                # Evaluate single query
                result = evaluate_single_query(
                    query=query,
                    question_type=question_type,
                    eval_config=eval_config,
                    retry_delay=RETRY_DELAY,
                    max_retries=MAX_RETRIES
                )
            except Exception as e:
                # Evaluation failed - stop the entire process.
                # Close the bar first so it does not interleave with the report.
                progress_bar.close()

                error_msg = (
                    f"\n{'='*70}\n"
                    f"‚ùå EVALUATION FAILED - STOPPING PROCESS\n"
                    f"{'='*70}\n"
                    f"Query ID: {query_id}\n"
                    f"Query #{i+1}/{len(queries)}\n"
                    f"Question: {query['question'][:100]}...\n"
                    f"Error: {str(e)}\n"
                    f"{'='*70}\n"
                )
                print(error_msg)
                raise

            # Add to results
            evaluation_results['results'].append(result)
            successful_count += 1

            # Delay before next query (except for last query)
            if i < len(queries) - 1:
                time.sleep(call_delay)
    finally:
        # Make sure progress bar is closed on every exit path
        progress_bar.close()

    # Update metadata with final counts
    evaluation_results['metadata']['total_evaluated'] = successful_count
    evaluation_results['metadata']['evaluation_success'] = (failed_count == 0)

    # Calculate timing
    end_time = time.time()
    total_time = end_time - start_time
    avg_time = total_time / len(queries) if len(queries) > 0 else 0

    # Save evaluation results
    print(f"\nüíæ Saving evaluation results...")
    print(f"   Output: {evaluation_filepath}")

    # Ensure output directory exists. A bare filename has an empty dirname,
    # and os.makedirs('') raises, so only create when there is a directory.
    output_dir = os.path.dirname(evaluation_filepath)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Convert sets to lists for JSON serialization
    evaluation_results_serializable = make_json_serializable(evaluation_results)

    # Write to a temporary file and move it into place: a partially written
    # evaluation file would otherwise be mistaken for a finished one later.
    temp_filepath = f"{evaluation_filepath}.tmp"
    try:
        with open(temp_filepath, 'w', encoding='utf-8') as f:
            json.dump(evaluation_results_serializable, f, indent=2, ensure_ascii=False)
        os.replace(temp_filepath, evaluation_filepath)
    finally:
        # No-op after a successful replace; cleans up after a failed write
        if os.path.exists(temp_filepath):
            os.remove(temp_filepath)

    file_info = get_file_info(evaluation_filepath)
    print(f"   ‚úì Saved successfully ({file_info['size_kb']} KB)")

    # Return summary
    return {
        'total_queries': len(queries),
        'successful_evaluations': successful_count,
        'failed_evaluations': failed_count,
        'total_time_seconds': round(total_time, 2),
        'average_time_per_query': round(avg_time, 2)
    }


# ============================================================================
# STEP 5 Test Function
# ============================================================================

def test_step5():
    """
    Test Step 5: Batch file evaluation.

    Picks the first configuration in EVALUATION_CONFIGS whose generation file
    exists, asks for interactive confirmation, then runs
    evaluate_generation_file() on it and prints a summary plus a peek at the
    saved evaluation file.

    Returns early (without raising) when the evaluation suite is unavailable,
    no generation file is found, or the user does not answer 'yes'. When no
    input is available (non-interactive run), this is treated as a decline.

    Raises:
        Exception: Re-raises any batch evaluation error after printing it.
    """

    print("\n" + "="*70)
    print("STEP 5: BATCH FILE EVALUATOR")
    print("="*70)

    # Check if evaluation functions are available
    if not EVAL_FUNCTIONS_AVAILABLE:
        print("\n‚ö†Ô∏è  Evaluation functions not available")
        print("   Cannot test Step 5 without evaluation suite modules.")
        return

    # Find a generation file to test with
    test_config = None
    test_gen_filepath = None

    for config in EVALUATION_CONFIGS:
        gen_filepath = get_generation_filepath(config)

        # Use the first configuration whose generation file exists
        if check_file_exists(gen_filepath):
            test_config = config
            test_gen_filepath = gen_filepath
            break

    if test_config is None:
        print("\n‚ö†Ô∏è  No generation files found")
        print(f"   Checked in: {GENERATION_PATH}")
        print("   Cannot test Step 5 without a generation file.")
        return

    test_eval_filepath = get_evaluation_filepath(test_config)

    print(f"\nüß™ Testing with: {test_config['mode']} - {test_config['question_type']}")
    print(f"   Generation file: {os.path.basename(test_gen_filepath)}")
    print(f"   Evaluation file: {os.path.basename(test_eval_filepath)}")

    # Ask for confirmation (since this will process all 50 queries and cost money)
    print(f"\n‚ö†Ô∏è  This will evaluate ALL queries in the file!")
    print(f"   Estimated cost: ~$0.03 for 50 queries")
    print(f"   Estimated time: ~1-2 minutes")

    try:
        response = input("\n   Proceed with full evaluation? (yes/no): ")
    except EOFError:
        # No stdin (non-interactive run): never spend money without consent
        response = ''

    # Ignore surrounding whitespace so "yes " is not read as a refusal
    if response.strip().lower() != 'yes':
        print("   Cancelled. Skipping Step 5 test.")
        print("\n   To test Step 5:")
        print("   1. Ensure you're ready to evaluate a full file")
        print("   2. Run test_step5() again and type 'yes'")
        return

    try:
        # Run batch evaluation
        summary = evaluate_generation_file(
            generation_filepath=test_gen_filepath,
            evaluation_filepath=test_eval_filepath,
            eval_config=EVALUATION_CONFIG,
            call_delay=CALL_DELAY
        )

        # Display summary
        print("\n" + "="*70)
        print("üìä EVALUATION SUMMARY")
        print("="*70)
        print(f"   Total queries: {summary['total_queries']}")
        print(f"   Successful: {summary['successful_evaluations']}")
        print(f"   Failed: {summary['failed_evaluations']}")
        print(f"   Total time: {summary['total_time_seconds']}s")
        print(f"   Avg time per query: {summary['average_time_per_query']}s")

        # Verify evaluation file was created
        if check_file_exists(test_eval_filepath):
            file_info = get_file_info(test_eval_filepath)
            print(f"\n‚úì Evaluation file created: {file_info['size_kb']} KB")

            # Read and show sample results
            with open(test_eval_filepath, 'r', encoding='utf-8') as f:
                eval_data = json.load(f)

            print(f"\nüìã Evaluation File Contents:")
            print(f"   Metadata fields: {len(eval_data['metadata'])}")
            print(f"   Results count: {len(eval_data['results'])}")

            if len(eval_data['results']) > 0:
                first_result = eval_data['results'][0]
                print(f"\n   First result structure:")
                print(f"   ‚îú‚îÄ financebench_id: {first_result['financebench_id']}")
                print(f"   ‚îú‚îÄ question_type: {first_result['question_type']}")
                print(f"   ‚îú‚îÄ evaluation_success: {first_result['evaluation_success']}")
                print(f"   ‚îî‚îÄ evaluation metrics: {list(first_result['evaluation']['metrics'].keys())}")

        print("\n" + "="*70)
        print("‚úÖ Step 5 Complete!")
        print("="*70)
        print("\nStep 5 Function Implemented:")
        print("  ‚úì evaluate_generation_file(gen_path, eval_path, config)")
        print("\nFeatures:")
        print("  ‚úì Reads generation file")
        print("  ‚úì Processes all queries with progress bar")
        print("  ‚úì Delays between queries")
        print("  ‚úì Stops on first error")
        print("  ‚úì Saves structured evaluation results")
        print("  ‚úì Returns summary statistics")
        print("\n" + "="*70)

    except Exception as e:
        print(f"\n‚úó Batch evaluation failed: {str(e)}")
        raise

# Test Step 5 (interactive: asks for confirmation before evaluating a full file)
test_step5()


STEP 5: BATCH FILE EVALUATOR

üß™ Testing with: closed_book - metrics-generated
   Generation file: closed_book_metrics-generated_openai_gpt-4o-mini_0.0_metrics_closed_basic.json
   Evaluation file: evaluation_closed_book_metrics-generated_openai_gpt-4o-mini_0.0_metrics_closed_basic.json

‚ö†Ô∏è  This will evaluate ALL queries in the file!
   Estimated cost: ~$0.03 for 50 queries
   Estimated time: ~1-2 minutes
   Cancelled. Skipping Step 5 test.

   To test Step 5:
   1. Ensure you're ready to evaluate a full file
   2. Run test_step5() again and type 'yes'


In [6]:
# ============================================================================
# STEP 6: Main Orchestrator
# ============================================================================

def main():
    """
    Main orchestration function to process all evaluation configurations.

    Features:
    - Checks if evaluation file already exists (skip if yes)
    - Checks if generation file exists (error if no)
    - Processes files that need evaluation
    - Provides comprehensive summary report
    - Tracks timing for each file and overall

    Process:
    1. Loop through all EVALUATION_CONFIGS
    2. For each config:
       - Evaluation file exists -> skip
       - Generation file missing -> record the failure and raise
       - Otherwise run evaluate_generation_file()
    3. Display summary report (only reached when nothing raised)

    Returns:
        None. Returns early, after printing an error, when the evaluation
        suite is not available.

    Raises:
        FileNotFoundError: If a generation file is missing
        Exception: If evaluating a file fails (re-raised to stop the run)
    """
    print("\n" + "="*70)
    print("GENERATION ANSWER EVALUATION - MAIN ORCHESTRATOR")
    print("="*70)

    # Check if evaluation functions are available
    if not EVAL_FUNCTIONS_AVAILABLE:
        print("\n‚ùå ERROR: Evaluation functions not available")
        print("\n   Required modules:")
        print("   - evaluate_answer.py")
        print("   - detect_refusal.py")
        print("   - numerical_exact_match.py")
        print("   - token_f1.py")
        print("   - llm_as_judge_binary.py")
        print("   - llm_as_judge_graded.py")
        print("\n   Please ensure these modules are in your Python path.")
        return

    print(f"\nüìã Configuration: {len(EVALUATION_CONFIGS)} files to check")
    print(f"üìÅ Generation path: {GENERATION_PATH}")
    print(f"üìÅ Evaluation path: {EVALUATION_PATH}")
    print(f"\n‚öôÔ∏è  Evaluation settings:")
    print(f"   Tolerance: {EVALUATION_CONFIG['tolerance']} ({EVALUATION_CONFIG['tolerance']*100}%)")
    print(f"   LLM Model: {EVALUATION_CONFIG['llm_model']}")
    print(f"   Call delay: {CALL_DELAY}s")
    print(f"   Retry delay: {RETRY_DELAY}s")
    print(f"   Max retries: {MAX_RETRIES}")

    # Track statistics
    total_files = len(EVALUATION_CONFIGS)
    skipped_files = 0
    processed_files = 0
    failed_files = 0

    skipped_list = []
    processed_list = []
    failed_list = []

    overall_start_time = time.time()

    # Process each configuration
    for i, config in enumerate(EVALUATION_CONFIGS, 1):
        mode = config['mode']
        question_type = config['question_type']

        print("\n" + "="*70)
        print(f"[{i}/{total_files}] {mode} | {question_type}")
        print("="*70)

        # Get file paths
        gen_filepath = get_generation_filepath(config)
        eval_filepath = get_evaluation_filepath(config)

        gen_filename = os.path.basename(gen_filepath)
        eval_filename = os.path.basename(eval_filepath)

        print(f"Generation: {gen_filename}")
        print(f"Evaluation: {eval_filename}")

        # Check if evaluation file already exists
        if check_file_exists(eval_filepath):
            file_info = get_file_info(eval_filepath)
            print(f"\n‚úì Evaluation file already exists ({file_info['size_kb']} KB)")
            print("   ‚Üí SKIPPED (already evaluated)")
            skipped_files += 1
            skipped_list.append({
                'mode': mode,
                'question_type': question_type,
                'reason': 'already_exists'
            })
            continue

        # Check if generation file exists
        if not check_file_exists(gen_filepath):
            print(f"\n‚ùå ERROR: Generation file not found!")
            print(f"   Path: {gen_filepath}")
            print("\n   Please check:")
            print("   1. GENERATION_PATH is correct")
            print("   2. File exists in the directory")
            print("   3. Filename matches the expected pattern")
            failed_files += 1
            failed_list.append({
                'mode': mode,
                'question_type': question_type,
                'error': 'generation_file_not_found'
            })
            raise FileNotFoundError(f"Generation file not found: {gen_filepath}")

        # File needs processing
        gen_info = get_file_info(gen_filepath)
        print(f"\n‚Üí Processing... (generation file: {gen_info['size_kb']} KB)")

        try:
            # Process the file
            summary = evaluate_generation_file(
                generation_filepath=gen_filepath,
                evaluation_filepath=eval_filepath,
                eval_config=EVALUATION_CONFIG,
                call_delay=CALL_DELAY
            )

            # Success
            print(f"\n‚úì Evaluation completed successfully!")
            print(f"   Time: {summary['total_time_seconds']}s")
            print(f"   Queries: {summary['successful_evaluations']}/{summary['total_queries']}")

            processed_files += 1
            processed_list.append({
                'mode': mode,
                'question_type': question_type,
                'queries': summary['total_queries'],
                'time_seconds': summary['total_time_seconds'],
                # Remember the real output name so the final report does not
                # have to rebuild it from assumed provider/model/template values
                'eval_filename': eval_filename
            })

        except Exception as e:
            print(f"\n‚ùå ERROR: Evaluation failed!")
            print(f"   Error: {str(e)[:200]}...")
            failed_files += 1
            failed_list.append({
                'mode': mode,
                'question_type': question_type,
                'error': str(e)[:100]
            })
            raise  # Stop the entire process

    # Calculate overall timing
    overall_end_time = time.time()
    overall_time = overall_end_time - overall_start_time

    # Display summary report
    print("\n" + "="*70)
    print("üìä FINAL SUMMARY REPORT")
    print("="*70)

    print(f"\nüìà Overall Statistics:")
    print(f"   Total files: {total_files}")
    print(f"   Skipped: {skipped_files}")
    print(f"   Processed: {processed_files}")
    print(f"   Failed: {failed_files}")
    print(f"   Total time: {overall_time:.2f}s ({overall_time/60:.1f} minutes)")

    if processed_files > 0:
        avg_time = overall_time / processed_files
        print(f"   Avg time per file: {avg_time:.2f}s")

    # Show skipped files
    if skipped_list:
        print(f"\n‚è≠Ô∏è  Skipped Files ({len(skipped_list)}):")
        for item in skipped_list:
            print(f"   - {item['mode']:12s} | {item['question_type']}")

    # Show processed files
    if processed_list:
        print(f"\n‚úÖ Processed Files ({len(processed_list)}):")
        total_queries = 0
        for item in processed_list:
            print(f"   - {item['mode']:12s} | {item['question_type']:20s} | "
                  f"{item['queries']} queries | {item['time_seconds']:.1f}s")
            total_queries += item['queries']
        print(f"\n   Total queries evaluated: {total_queries}")

    # Show failed files
    if failed_list:
        print(f"\n‚ùå Failed Files ({len(failed_list)}):")
        for item in failed_list:
            print(f"   - {item['mode']:12s} | {item['question_type']}")
            print(f"     Error: {item['error']}")

    # Final status
    print("\n" + "="*70)
    if failed_files > 0:
        print("‚ùå EVALUATION INCOMPLETE - Some files failed")
    elif processed_files == 0 and skipped_files == total_files:
        print("‚úÖ ALL FILES ALREADY EVALUATED")
    elif processed_files > 0:
        print("‚úÖ EVALUATION COMPLETE!")
    print("="*70)

    # Show output location
    if processed_files > 0:
        print(f"\nüìÅ Evaluation results saved to: {EVALUATION_PATH}")
        print("\nGenerated files:")
        for item in processed_list:
            print(f"   - {item['eval_filename']}")


# ============================================================================
# STEP 6 Test Function
# ============================================================================

def test_step6():
    """
    Dry-run the Step 6 orchestrator without evaluating anything.

    For every entry in EVALUATION_CONFIGS this resolves the generation and
    evaluation file paths, checks whether each file exists, and prints the
    action that would be taken:

    - SKIP    -> the evaluation file already exists
    - FAIL    -> no evaluation file and no generation file
    - PROCESS -> generation file present, evaluation file absent

    It then prints the tallies and, when at least one file would be
    processed, a rough time/cost estimate (~1.5 min and ~$0.03 per file).

    The progress counter and the feature summary use the actual number of
    configurations instead of a hard-coded 9, so the output stays correct
    when EVALUATION_CONFIGS grows or shrinks.
    """
    
    print("\n" + "="*70)
    print("STEP 6: MAIN ORCHESTRATOR (DRY RUN)")
    print("="*70)
    
    print("\n‚ö†Ô∏è  This is a DRY RUN - showing what would happen without processing")
    
    # Single source of truth for every "N files" / "i/N" message below.
    total = len(EVALUATION_CONFIGS)
    print(f"\nüìã Configuration: {total} files")
    
    would_skip = 0
    would_process = 0
    would_fail = 0
    
    for i, config in enumerate(EVALUATION_CONFIGS, 1):
        mode = config['mode']
        question_type = config['question_type']
        
        gen_filepath = get_generation_filepath(config)
        eval_filepath = get_evaluation_filepath(config)
        
        gen_exists = check_file_exists(gen_filepath)
        eval_exists = check_file_exists(eval_filepath)
        
        print(f"\n[{i}/{total}] {mode:12s} | {question_type}")
        
        # An existing evaluation file wins even if the generation file is gone.
        if eval_exists:
            print("   Action: SKIP (already evaluated)")
            would_skip += 1
        elif not gen_exists:
            print("   Action: FAIL (generation file not found)")
            would_fail += 1
        else:
            print("   Action: PROCESS")
            would_process += 1
    
    print("\n" + "="*70)
    print("DRY RUN SUMMARY:")
    print("="*70)
    print(f"   Would skip: {would_skip}")
    print(f"   Would process: {would_process}")
    print(f"   Would fail: {would_fail}")
    
    if would_process > 0:
        est_time = would_process * 1.5  # ~1.5 min per file
        est_cost = would_process * 0.03  # ~$0.03 per file
        print(f"\n   Estimated time: ~{est_time:.0f} minutes")
        print(f"   Estimated cost: ~${est_cost:.2f}")
    
    print("\n" + "="*70)
    print("‚úÖ Step 6 Complete!")
    print("="*70)
    print("\nStep 6 Function Implemented:")
    print("  ‚úì main() - Full orchestration")
    print("\nFeatures:")
    print(f"  ‚úì Loops through all {total} configurations")
    print("  ‚úì Skip logic for existing evaluation files")
    print("  ‚úì Error handling for missing generation files")
    print("  ‚úì Progress tracking for each file")
    print("  ‚úì Comprehensive summary report")
    print("  ‚úì Total time and cost tracking")
    print("\nTo run full evaluation:")
    print("  main()  # This will process all files!")
    print("\n" + "="*70)

# Test Step 6: runs as soon as this cell executes and prints, for each entry
# in EVALUATION_CONFIGS, whether it would be skipped, processed, or fail.
test_step6()  # Dry run


STEP 6: MAIN ORCHESTRATOR (DRY RUN)

‚ö†Ô∏è  This is a DRY RUN - showing what would happen without processing

üìã Configuration: 9 files

[1/9] closed_book  | metrics-generated
   Action: SKIP (already evaluated)

[2/9] closed_book  | novel-generated
   Action: SKIP (already evaluated)

[3/9] closed_book  | domain-relevant
   Action: SKIP (already evaluated)

[4/9] rag          | metrics-generated
   Action: PROCESS

[5/9] rag          | novel-generated
   Action: FAIL (generation file not found)

[6/9] rag          | domain-relevant
   Action: FAIL (generation file not found)

[7/9] oracle       | metrics-generated
   Action: SKIP (already evaluated)

[8/9] oracle       | novel-generated
   Action: SKIP (already evaluated)

[9/9] oracle       | domain-relevant
   Action: SKIP (already evaluated)

DRY RUN SUMMARY:
   Would skip: 6
   Would process: 1
   Would fail: 2

   Estimated time: ~2 minutes
   Estimated cost: ~$0.03

‚úÖ Step 6 Complete!

Step 6 Function Implemented:
  ‚úì ma

In [7]:
# ============================================================================
# STEP 7: Error Handling & Edge Cases
# ============================================================================

def validate_paths():
    """
    Check the configured GENERATION_PATH and EVALUATION_PATH.

    GENERATION_PATH must not be the template placeholder and must be an
    existing directory. EVALUATION_PATH must not be the template placeholder;
    if it is a real path, the directory is created here when missing, and a
    failure to create it is reported as an error.

    Returns:
        tuple: (is_valid, error_messages) - is_valid is True only when the
        list of error messages is empty.

    Example:
        >>> is_valid, errors = validate_paths()
        >>> if not is_valid:
        ...     print("\\n".join(errors))
    """
    problems = []

    # --- Generation directory: at most one problem is reported for it ---
    generation_problem = None
    if GENERATION_PATH == "/path/to/generated_answers":
        generation_problem = "GENERATION_PATH is still set to placeholder. Please update it."
    elif not os.path.exists(GENERATION_PATH):
        generation_problem = f"GENERATION_PATH does not exist: {GENERATION_PATH}"
    elif not os.path.isdir(GENERATION_PATH):
        generation_problem = f"GENERATION_PATH is not a directory: {GENERATION_PATH}"
    if generation_problem is not None:
        problems.append(generation_problem)

    # --- Evaluation directory: created on demand unless still a placeholder ---
    if EVALUATION_PATH != "/path/to/evaluation_results":
        try:
            os.makedirs(EVALUATION_PATH, exist_ok=True)
        except Exception as exc:
            problems.append(f"Cannot create EVALUATION_PATH: {EVALUATION_PATH}. Error: {exc}")
    else:
        problems.append("EVALUATION_PATH is still set to placeholder. Please update it.")

    return (not problems, problems)


def validate_openai_api_key():
    """
    Check that an OpenAI API key is present in the environment.

    The key is read from the OPENAI_API_KEY environment variable on every
    call. Only presence and a minimum length of 20 characters are checked;
    no request is made to verify the key.

    Returns:
        tuple: (is_valid, error_message) - error_message is None when valid.

    Example:
        >>> is_valid, error = validate_openai_api_key()
        >>> if not is_valid:
        ...     print(error)
    """
    key = os.getenv("OPENAI_API_KEY")

    # Happy path first: a non-empty key of plausible length.
    if key and len(key) >= 20:
        return (True, None)

    # Unset or empty variable.
    if not key:
        return (False, "OPENAI_API_KEY environment variable not set. "
                       "LLM-as-judge evaluation will fail.")

    # Present but shorter than the sanity threshold.
    return (False, f"OPENAI_API_KEY seems invalid (too short: {len(key)} chars)")


def estimate_cost_and_time(configs_to_process: List[Dict[str, str]]) -> Dict[str, Any]:
    """
    Produce a rough time/cost forecast for evaluating the given configurations.

    Every configuration is assumed to contain the same fixed number of
    queries (50), each taking 1.7 seconds and costing a per-question-type
    amount. These are rules of thumb, not measurements of the actual files.

    Args:
        configs_to_process: Configuration dictionaries; only the
            'question_type' key is read.

    Returns:
        Dictionary with estimates:
            - total_queries: Total queries to evaluate
            - estimated_time_minutes: Estimated time in minutes (1 decimal)
            - estimated_cost_usd: Estimated cost in USD (2 decimals)
            - breakdown_by_type: Per question type: 'count' (files),
              'queries', 'cost' and 'time_minutes' (unrounded)

    Raises:
        KeyError: If a configuration has a question type with no cost entry.

    Example:
        >>> estimate = estimate_cost_and_time(EVALUATION_CONFIGS)
        >>> print(f"Time: ~{estimate['estimated_time_minutes']} min")
        >>> print(f"Cost: ~${estimate['estimated_cost_usd']:.2f}")
    """
    # Rule-of-thumb constants (adjust based on your experience)
    QUERIES_PER_FILE = 50
    TIME_PER_QUERY_SECONDS = 1.7  # Average from testing

    # Approximate cost of one query, keyed by question type
    COST_PER_QUERY = {
        'metrics-generated': 0.0006,   # NEM + LLM-binary (1 LLM call)
        'novel-generated': 0.0006,      # Token-F1 + LLM-graded (1 LLM call)
        'domain-relevant': 0.0006       # LLM-graded (1 LLM call)
    }

    # Time for one file is the same for every question type.
    minutes_per_file = (QUERIES_PER_FILE * TIME_PER_QUERY_SECONDS) / 60

    per_type = {}
    running_cost = 0
    for cfg in configs_to_process:
        question_type = cfg['question_type']
        file_cost = QUERIES_PER_FILE * COST_PER_QUERY[question_type]

        bucket = per_type.setdefault(
            question_type,
            {'count': 0, 'queries': 0, 'cost': 0, 'time_minutes': 0},
        )
        bucket['count'] += 1
        bucket['queries'] += QUERIES_PER_FILE
        bucket['cost'] += file_cost
        bucket['time_minutes'] += minutes_per_file

        running_cost += file_cost

    query_total = len(configs_to_process) * QUERIES_PER_FILE
    seconds_total = query_total * TIME_PER_QUERY_SECONDS

    return {
        'total_queries': query_total,
        'estimated_time_minutes': round(seconds_total / 60, 1),
        'estimated_cost_usd': round(running_cost, 2),
        'breakdown_by_type': per_type
    }


def get_processing_plan() -> Dict[str, Any]:
    """
    Sort every entry of EVALUATION_CONFIGS into skip / process / missing.

    A configuration is skipped when its evaluation file already exists,
    reported as missing when neither the evaluation file nor the generation
    file exists, and queued for processing otherwise.

    Returns:
        Dictionary containing:
            - to_skip: List of configs that already have evaluation files
            - to_process: List of configs that need evaluation
            - missing_generation: List of configs missing generation files
            - estimate: Result of estimate_cost_and_time() for 'to_process',
              or None when there is nothing to process

    Example:
        >>> plan = get_processing_plan()
        >>> print(f"Will process: {len(plan['to_process'])} files")
        >>> print(f"Will skip: {len(plan['to_skip'])} files")
    """
    plan = {
        'to_skip': [],
        'to_process': [],
        'missing_generation': [],
    }

    for cfg in EVALUATION_CONFIGS:
        generation_file = get_generation_filepath(cfg)
        evaluation_file = get_evaluation_filepath(cfg)

        has_generation = check_file_exists(generation_file)
        has_evaluation = check_file_exists(evaluation_file)

        # An existing evaluation takes precedence over a missing generation file.
        if has_evaluation:
            bucket = 'to_skip'
        elif has_generation:
            bucket = 'to_process'
        else:
            bucket = 'missing_generation'
        plan[bucket].append(cfg)

    # Only estimate when there is actual work queued.
    pending = plan['to_process']
    plan['estimate'] = estimate_cost_and_time(pending) if pending else None

    return plan


def print_processing_plan(plan: Dict[str, Any]):
    """
    Pretty-print a processing plan to stdout.

    Shows the overall counts, then lists the configurations to skip, the
    configurations whose generation file is missing (with the expected
    path), and the configurations to process. When there is something to
    process and an estimate is present, the totals and the per-question-type
    breakdown are printed as well.

    Args:
        plan: Plan dictionary from get_processing_plan()

    Example:
        >>> plan = get_processing_plan()
        >>> print_processing_plan(plan)
    """
    rule = "=" * 70

    print("\n" + rule)
    print("üìã PROCESSING PLAN")
    print(rule)

    total = len(EVALUATION_CONFIGS)
    skipped = plan['to_skip']
    pending = plan['to_process']
    missing = plan['missing_generation']
    n_skipped = len(skipped)
    n_pending = len(pending)
    n_missing = len(missing)

    # Headline counts
    print(f"\nTotal configurations: {total}")
    print(f"  ‚úì Already evaluated: {n_skipped}")
    print(f"  ‚Üí To process: {n_pending}")
    print(f"  ‚úó Missing generation files: {n_missing}")

    # Already-evaluated configurations
    if skipped:
        print(f"\n‚è≠Ô∏è  Files to skip ({n_skipped}):")
        for cfg in skipped:
            print(f"   - {cfg['mode']:12s} | {cfg['question_type']}")

    # Configurations that cannot run, with the path that was looked for
    if missing:
        print(f"\n‚ö†Ô∏è  Missing generation files ({n_missing}):")
        for cfg in missing:
            print(f"   - {cfg['mode']:12s} | {cfg['question_type']}")
            expected_path = get_generation_filepath(cfg)
            print(f"     Expected: {expected_path}")

    # Work queue plus estimates
    if pending:
        print(f"\nüîÑ Files to process ({n_pending}):")
        for cfg in pending:
            print(f"   - {cfg['mode']:12s} | {cfg['question_type']}")

        estimate = plan['estimate']
        if estimate:
            print(f"\nüìä Estimates:")
            print(f"   Total queries: {estimate['total_queries']}")
            print(f"   Estimated time: ~{estimate['estimated_time_minutes']} minutes")
            print(f"   Estimated cost: ~${estimate['estimated_cost_usd']:.2f}")

            print(f"\n   Breakdown by question type:")
            for question_type, stats in estimate['breakdown_by_type'].items():
                print(f"   - {question_type:20s}: {stats['queries']:3d} queries | "
                      f"~${stats['cost']:.2f} | ~{stats['time_minutes']:.1f} min")

    print("\n" + rule)


def main_with_validation():
    """
    Enhanced main function with comprehensive validation and user confirmation.

    This is the recommended entry point for running evaluations.
    It runs these steps in order and returns early (without raising) as soon
    as one of them says the run should not proceed:

    1. Path validation via validate_paths()
    2. API key validation via validate_openai_api_key(); an invalid key is a
       warning the user may override
    3. Check of the EVAL_FUNCTIONS_AVAILABLE flag (defined elsewhere in this
       file)
    4. Processing plan creation and display
    5. Early exit when nothing is processable, with a message that reflects
       whether files were already evaluated, are missing, or both
    6. User confirmation before processing
    7. Call to main()

    Confirmation prompts require the answer "yes"; surrounding whitespace
    and letter case are ignored, and anything else cancels.

    Raises:
        Exception: Any exception raised by main() other than
            KeyboardInterrupt is printed and then re-raised.

    Example:
        >>> main_with_validation()  # Interactive with confirmations
    """
    print("\n" + "="*70)
    print("GENERATION ANSWER EVALUATION - ENHANCED")
    print("="*70)
    
    # Step 1: Validate paths
    print("\nüîç Step 1: Validating paths...")
    paths_valid, path_errors = validate_paths()
    if not paths_valid:
        print("\n‚ùå Path validation failed:")
        for error in path_errors:
            print(f"   - {error}")
        print("\nPlease fix the path issues and try again.")
        return
    print("   ‚úì Paths validated")
    
    # Step 2: Validate API key
    print("\nüîç Step 2: Validating OpenAI API key...")
    api_valid, api_error = validate_openai_api_key()
    if not api_valid:
        print(f"\n‚ö†Ô∏è  Warning: {api_error}")
        print("   Evaluation will likely fail without a valid API key.")
        response = input("\n   Continue anyway? (yes/no): ")
        # strip(): a stray space or newline around "yes" must not cancel the run.
        if response.strip().lower() != 'yes':
            print("   Cancelled.")
            return
    else:
        print("   ‚úì API key found")
    
    # Step 3: Check evaluation functions
    print("\nüîç Step 3: Checking evaluation functions...")
    if not EVAL_FUNCTIONS_AVAILABLE:
        print("\n‚ùå ERROR: Evaluation functions not available")
        print("   Required modules:")
        print("   - evaluate_answer.py")
        print("   - detect_refusal.py")
        print("   - numerical_exact_match.py")
        print("   - token_f1.py")
        print("   - llm_as_judge_binary.py")
        print("   - llm_as_judge_graded.py")
        return
    print("   ‚úì Evaluation functions available")
    
    # Step 4: Create processing plan
    print("\nüîç Step 4: Creating processing plan...")
    plan = get_processing_plan()
    print_processing_plan(plan)
    
    # Step 5: Check if anything to process
    if not plan['to_process']:
        skipped = plan['to_skip']
        missing = plan['missing_generation']
        if skipped and missing:
            # Mixed case: claiming "all files already evaluated" would hide
            # the configurations whose generation files are absent.
            print(f"\n‚ö†Ô∏è  Nothing to process: {len(skipped)} file(s) already evaluated, "
                  f"{len(missing)} generation file(s) missing.")
        elif skipped:
            print("\n‚úÖ All files already evaluated! Nothing to do.")
        elif missing:
            print("\n‚ùå No files to process. All generation files are missing.")
        else:
            print("\n‚ö†Ô∏è  No files to process.")
        return
    
    # Step 6: User confirmation
    if plan['estimate']:
        print("\n‚ö†Ô∏è  CONFIRMATION REQUIRED")
        print(f"   This will evaluate {plan['estimate']['total_queries']} queries")
        print(f"   Estimated time: ~{plan['estimate']['estimated_time_minutes']} minutes")
        print(f"   Estimated cost: ~${plan['estimate']['estimated_cost_usd']:.2f}")
        
        response = input("\n   Proceed with evaluation? (yes/no): ")
        if response.strip().lower() != 'yes':
            print("\n   Cancelled by user.")
            return
    
    # Step 7: Run evaluation
    print("\n" + "="*70)
    print("üöÄ STARTING EVALUATION")
    print("="*70)
    
    try:
        main()  # Call the original main function
    except KeyboardInterrupt:
        # Ctrl+C is treated as a normal way to stop: report and return.
        print("\n\n‚ö†Ô∏è  Evaluation interrupted by user (Ctrl+C)")
        print("   Partial results may have been saved.")
    except Exception as e:
        # Surface the message, then let the caller/notebook see the traceback.
        print(f"\n\n‚ùå Evaluation failed with error:")
        print(f"   {str(e)}")
        raise


# ============================================================================
# STEP 7 Test Function
# ============================================================================

def test_step7():
    """
    Exercise the Step 7 validation helpers and print what they report.

    Runs path validation, API key validation and processing-plan creation,
    prints a short result for each, shows the full plan via
    print_processing_plan(), and ends with a summary of the Step 7
    functions. Nothing is evaluated and nothing is returned.
    """
    rule = "=" * 70

    print("\n" + rule)
    print("STEP 7: ERROR HANDLING & VALIDATION")
    print(rule)

    # Test 1: configured directories
    print("\nüß™ Test 1: Path Validation")
    paths_ok, path_problems = validate_paths()
    if not paths_ok:
        print("   ‚úó Path validation failed:")
        for problem in path_problems:
            print(f"     - {problem}")
    else:
        print("   ‚úì Paths are valid")

    # Test 2: API key presence / plausibility
    print("\nüß™ Test 2: API Key Validation")
    key_ok, key_problem = validate_openai_api_key()
    if not key_ok:
        print(f"   ‚úó {key_problem}")
    else:
        print("   ‚úì API key is valid")

    # Test 3: what would be skipped / processed / missing
    print("\nüß™ Test 3: Processing Plan")
    plan = get_processing_plan()
    print(f"   Files to skip: {len(plan['to_skip'])}")
    print(f"   Files to process: {len(plan['to_process'])}")
    print(f"   Missing generation: {len(plan['missing_generation'])}")

    estimate = plan['estimate']
    if estimate:
        print(f"   Estimated time: ~{estimate['estimated_time_minutes']} min")
        print(f"   Estimated cost: ~${estimate['estimated_cost_usd']:.2f}")

    # Full, formatted view of the same plan
    print_processing_plan(plan)

    # Closing summary, one print per line
    closing_lines = (
        "\n" + rule,
        "‚úÖ Step 7 Complete!",
        rule,
        "\nStep 7 Functions Implemented:",
        "  ‚úì validate_paths()",
        "  ‚úì validate_openai_api_key()",
        "  ‚úì estimate_cost_and_time()",
        "  ‚úì get_processing_plan()",
        "  ‚úì print_processing_plan()",
        "  ‚úì main_with_validation() - Enhanced main with safety checks",
        "\nFeatures:",
        "  ‚úì Path validation",
        "  ‚úì API key validation",
        "  ‚úì Cost and time estimation",
        "  ‚úì Processing plan preview",
        "  ‚úì User confirmation before processing",
        "  ‚úì Better error messages",
        "  ‚úì Keyboard interrupt handling",
        "\nRecommended usage:",
        "  main_with_validation()  # Use this instead of main()",
        "\n" + rule,
    )
    for line in closing_lines:
        print(line)



# Run the Step 7 checks when this cell executes. Note that the path check
# inside it creates the EVALUATION_PATH directory if it does not exist yet.
test_step7()
    
# Recommended: Use enhanced main with validation
# (left commented out here; it is interactive and asks for confirmation)
# main_with_validation()


STEP 7: ERROR HANDLING & VALIDATION

üß™ Test 1: Path Validation
   ‚úì Paths are valid

üß™ Test 2: API Key Validation
   ‚úì API key is valid

üß™ Test 3: Processing Plan
   Files to skip: 6
   Files to process: 1
   Missing generation: 2
   Estimated time: ~1.4 min
   Estimated cost: ~$0.03

üìã PROCESSING PLAN

Total configurations: 9
  ‚úì Already evaluated: 6
  ‚Üí To process: 1
  ‚úó Missing generation files: 2

‚è≠Ô∏è  Files to skip (6):
   - closed_book  | metrics-generated
   - closed_book  | novel-generated
   - closed_book  | domain-relevant
   - oracle       | metrics-generated
   - oracle       | novel-generated
   - oracle       | domain-relevant

‚ö†Ô∏è  Missing generation files (2):
   - rag          | novel-generated
     Expected: ../../generation_set/closedbook_oracle_sets/rag_novel-generated_openai_gpt-4o-mini_0.0_novel_rag_basic.json
   - rag          | domain-relevant
     Expected: ../../generation_set/closedbook_oracle_sets/rag_domain-relevant_openai_gpt-4

In [None]:
# ============================================================================
# RUN MAIN ORCHESTRATOR WITH ALL CONFIGURATIONS
# ============================================================================

def _openai_config(mode, question_type, template_alias, model='gpt-4o-mini'):
    """Return one evaluation config dict with provider 'openai' and temperature '0.0'."""
    return {
        'mode': mode,
        'question_type': question_type,
        'provider': 'openai',
        'model': model,
        'temperature': '0.0',
        'template_alias': template_alias,
    }


# Suffixes appended to '<prefix>_rag_basic_' to form the RAG template aliases.
# The order here is the order in which the RAG configurations are listed.
_RAG_VARIANTS = (
    'global_chunk512_baseline_k20',
    'single_chunk512_baseline_k20',
    'global_chunk512_rerank_k20',
    'single_chunk512_rerank_k20',
    'global_chunk1024_rerank_k20',
    'single_chunk1024_rerank_k20',
    'global_chunk2048_rerank_k20',
    'single_chunk2048_rerank_k20',
)

EVALUATION_CONFIGS = [
    # Closed-book experiments (metrics-generated is listed for two models)
    _openai_config('closed_book', 'metrics-generated', 'metrics_closed_basic'),
    _openai_config('closed_book', 'metrics-generated', 'metrics_closed_basic', model='gpt-4o'),
    _openai_config('closed_book', 'novel-generated', 'novel_closed_basic'),
    _openai_config('closed_book', 'domain-relevant', 'domain_closed_basic'),

    # RAG experiments: every variant for metrics-generated, then for
    # novel-generated. The RAG domain-relevant configuration
    # (template_alias 'domain_rag_basic') is intentionally not listed.
    *[
        _openai_config('rag', question_type, f'{alias_prefix}_rag_basic_{variant}')
        for question_type, alias_prefix in (
            ('metrics-generated', 'metrics'),
            ('novel-generated', 'novel'),
        )
        for variant in _RAG_VARIANTS
    ],

    # Oracle experiments (metrics-generated is listed for two models)
    _openai_config('oracle', 'metrics-generated', 'metrics_rag_basic'),
    _openai_config('oracle', 'metrics-generated', 'metrics_rag_basic', model='gpt-4o'),
    _openai_config('oracle', 'novel-generated', 'novel_rag_basic'),
    _openai_config('oracle', 'domain-relevant', 'domain_rag_basic'),
]

# Alternative entry point without the validation/confirmation steps
# (kept commented out):
# main() # It will process all files!
# Active entry point: executing this cell starts the validated run, which
# prints the processing plan and asks for a "yes" before calling main().
main_with_validation()  # It will process all files with validation


GENERATION ANSWER EVALUATION - ENHANCED

üîç Step 1: Validating paths...
   ‚úì Paths validated

üîç Step 2: Validating OpenAI API key...
   ‚úì API key found

üîç Step 3: Checking evaluation functions...
   ‚úì Evaluation functions available

üîç Step 4: Creating processing plan...

üìã PROCESSING PLAN

Total configurations: 20
  ‚úì Already evaluated: 16
  ‚Üí To process: 4
  ‚úó Missing generation files: 0

‚è≠Ô∏è  Files to skip (16):
   - closed_book  | metrics-generated
   - closed_book  | metrics-generated
   - closed_book  | novel-generated
   - closed_book  | domain-relevant
   - rag          | metrics-generated
   - rag          | metrics-generated
   - rag          | metrics-generated
   - rag          | metrics-generated
   - rag          | metrics-generated
   - rag          | metrics-generated
   - rag          | metrics-generated
   - rag          | metrics-generated
   - oracle       | metrics-generated
   - oracle       | metrics-generated
   - oracle       | nove

Processing rag_novel-generated: 100%|‚ñà| 50/50 [03:25<00:00,  4.11s/query, ID: financebench_id_02024]



üíæ Saving evaluation results...
   Output: ../../evaluation_results/generation/evaluation_rag_novel-generated_openai_gpt-4o-mini_0.0_novel_rag_basic_global_chunk512_baseline_k20.json
   ‚úì Saved successfully (273.19 KB)

‚úì Evaluation completed successfully!
   Time: 205.75s
   Queries: 50/50

[14/20] rag | novel-generated
Generation: rag_novel-generated_openai_gpt-4o-mini_0.0_novel_rag_basic_single_chunk512_baseline_k20.json
Evaluation: evaluation_rag_novel-generated_openai_gpt-4o-mini_0.0_novel_rag_basic_single_chunk512_baseline_k20.json

‚Üí Processing... (generation file: 324.68 KB)

üìñ Reading generation file...
   ‚úì Loaded 50 queries
   Mode: rag
   Question Type: novel-generated

üîÑ Evaluating 50 queries...


Processing rag_novel-generated: 100%|‚ñà| 50/50 [02:56<00:00,  3.54s/query, ID: financebench_id_02024]



üíæ Saving evaluation results...
   Output: ../../evaluation_results/generation/evaluation_rag_novel-generated_openai_gpt-4o-mini_0.0_novel_rag_basic_single_chunk512_baseline_k20.json
   ‚úì Saved successfully (269.73 KB)

‚úì Evaluation completed successfully!
   Time: 176.83s
   Queries: 50/50

[15/20] rag | novel-generated
Generation: rag_novel-generated_openai_gpt-4o-mini_0.0_novel_rag_basic_global_chunk512_rerank_k20.json
Evaluation: evaluation_rag_novel-generated_openai_gpt-4o-mini_0.0_novel_rag_basic_global_chunk512_rerank_k20.json

‚Üí Processing... (generation file: 325.55 KB)

üìñ Reading generation file...
   ‚úì Loaded 50 queries
   Mode: rag
   Question Type: novel-generated

üîÑ Evaluating 50 queries...


Processing rag_novel-generated: 100%|‚ñà| 50/50 [03:06<00:00,  3.74s/query, ID: financebench_id_02024]



üíæ Saving evaluation results...
   Output: ../../evaluation_results/generation/evaluation_rag_novel-generated_openai_gpt-4o-mini_0.0_novel_rag_basic_global_chunk512_rerank_k20.json
   ‚úì Saved successfully (274.69 KB)

‚úì Evaluation completed successfully!
   Time: 186.97s
   Queries: 50/50

[16/20] rag | novel-generated
Generation: rag_novel-generated_openai_gpt-4o-mini_0.0_novel_rag_basic_single_chunk512_rerank_k20.json
Evaluation: evaluation_rag_novel-generated_openai_gpt-4o-mini_0.0_novel_rag_basic_single_chunk512_rerank_k20.json

‚Üí Processing... (generation file: 325.46 KB)

üìñ Reading generation file...
   ‚úì Loaded 50 queries
   Mode: rag
   Question Type: novel-generated

üîÑ Evaluating 50 queries...


Processing rag_novel-generated: 100%|‚ñà| 50/50 [03:08<00:00,  3.77s/query, ID: financebench_id_02024]


üíæ Saving evaluation results...
   Output: ../../evaluation_results/generation/evaluation_rag_novel-generated_openai_gpt-4o-mini_0.0_novel_rag_basic_single_chunk512_rerank_k20.json
   ‚úì Saved successfully (272.95 KB)

‚úì Evaluation completed successfully!
   Time: 188.49s
   Queries: 50/50

[17/20] oracle | metrics-generated
Generation: oracle_metrics-generated_openai_gpt-4o-mini_0.0_metrics_rag_basic.json
Evaluation: evaluation_oracle_metrics-generated_openai_gpt-4o-mini_0.0_metrics_rag_basic.json

‚úì Evaluation file already exists (171.54 KB)
   ‚Üí SKIPPED (already evaluated)

[18/20] oracle | metrics-generated
Generation: oracle_metrics-generated_openai_gpt-4o_0.0_metrics_rag_basic.json
Evaluation: evaluation_oracle_metrics-generated_openai_gpt-4o_0.0_metrics_rag_basic.json

‚úì Evaluation file already exists (171.01 KB)
   ‚Üí SKIPPED (already evaluated)

[19/20] oracle | novel-generated
Generation: oracle_novel-generated_openai_gpt-4o-mini_0.0_novel_rag_basic.json
Evaluati


