In [14]:
# ============================================================================
# STEP 1: SETUP & DEPENDENCIES
# ============================================================================

# Standard library imports
import os
import json
import time
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Any

# LangChain imports for LLM providers
from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic
from langchain_community.chat_models import ChatOllama

# LangChain utilities for prompts and parsing
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Dataset and utility imports
from datasets import load_dataset
from dotenv import load_dotenv
from tqdm.auto import tqdm

# Create output directory if it doesn't exist
OUTPUT_DIR = "../../generation_set/closedbook_oracle"
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

print("‚úÖ All libraries imported successfully!")
print(f"‚úÖ Output directory ready: {OUTPUT_DIR}")

‚úÖ All libraries imported successfully!
‚úÖ Output directory ready: ../../generation_set/closedbook_oracle


In [27]:
# ============================================================================
# STEP 2: CONFIGURATION VARIABLES
# ============================================================================

# Load environment variables
load_dotenv()

# Dataset Configuration
DATASET_NAME = "PatronusAI/financebench"
DATASET_SPLIT = "train"  # 150 questions in train split

# API Configuration - Load from environment
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")

# Rate Limiting Configuration
CALL_DELAY = 1  # Seconds between each LLM call (rate limiting)
RETRY_DELAY = 30  # Seconds to wait before retry after failure
MAX_RETRIES = 3  # Maximum number of retry attempts

# Global Generation Parameters
TEMPERATURE = 0.0  # Deterministic generation for reproducibility

# Prompt Templates for Closed-Book Mode
CLOSED_BOOK_TEMPLATES = {
    "basic": {
        "alias": "closed_basic",
        "template": """You are a financial expert. Answer the following question based on your knowledge.

Question: {question}

Provide a concise, accurate answer:"""
    },
    "cot": {
        "alias": "closed_cot",
        "template": """You are a financial expert. Answer the following question based on your knowledge. Think step by step.

Question: {question}

Let's approach this step by step:
1. First, let me analyze what information is needed
2. Then, I'll provide the answer

Answer:"""
    }
}

# Prompt Templates for Oracle Mode
ORACLE_TEMPLATES = {
    "basic": {
        "alias": "oracle_basic",
        "template": """You are a financial expert. Answer the question based strictly on the provided evidence from financial documents.

Evidence:
{evidence}

Question: {question}

Provide a concise, accurate answer based solely on the evidence above:"""
    },
    "structured": {
        "alias": "oracle_structured",
        "template": """You are a financial analyst reviewing official financial documents. Use ONLY the provided evidence to answer the question.

=== EVIDENCE FROM FINANCIAL DOCUMENTS ===
{evidence}

=== QUESTION ===
{question}

=== INSTRUCTIONS ===
Provide a precise answer based strictly on the evidence. If the answer requires a number, provide it in the exact format shown in the documents.

Answer:"""
    }
}

print(f"   Dataset: {DATASET_NAME} ({DATASET_SPLIT} split)")
print(f"   Output directory: {OUTPUT_DIR}")


   Dataset: PatronusAI/financebench (train split)
   Output directory: ../../generation_set/closedbook_oracle


In [25]:
# ============================================================================
# STEP 3: LOAD FINANCEBENCH DATASET
# ============================================================================

# Load dataset from HuggingFace
print(f"Loading dataset: {DATASET_NAME} (split: {DATASET_SPLIT})...")
dataset = load_dataset(DATASET_NAME, split=DATASET_SPLIT)

print(f"‚úÖ Dataset loaded successfully!")
print(f"   Total questions: {len(dataset)}")
print(f"   Dataset features: {dataset.features.keys()}")

# Display a sample entry to verify structure
print("\n" + "="*80)
print("SAMPLE ENTRY:")
print("="*80)
sample = dataset[0]
print(f"ID: {sample['financebench_id']}")
print(f"Company: {sample['company']}")
print(f"Question Type: {sample['question_type']}")
print(f"Question: {sample['question'][:100]}...")
print(f"Answer: {sample['answer']}")
print(f"Number of evidence pieces: {len(sample['evidence'])}")
if len(sample['evidence']) > 0:
    print(f"First evidence preview: {sample['evidence'][0]['evidence_text'][:150]}...")

Loading dataset: PatronusAI/financebench (split: train)...
‚úÖ Dataset loaded successfully!
   Total questions: 150
   Dataset features: dict_keys(['financebench_id', 'company', 'doc_name', 'question_type', 'question_reasoning', 'domain_question_num', 'question', 'answer', 'justification', 'dataset_subset_label', 'evidence', 'gics_sector', 'doc_type', 'doc_period', 'doc_link'])

SAMPLE ENTRY:
ID: financebench_id_03029
Company: 3M
Question Type: metrics-generated
Question: What is the FY2018 capital expenditure amount (in USD millions) for 3M? Give a response to the quest...
Answer: $1577.00
Number of evidence pieces: 1
First evidence preview: Table of Contents 
3M Company and Subsidiaries
Consolidated Statement of Cash Flow s
Years ended December 31
 
(Millions)
 
2018
 
2017
 
2016
 
Cash ...


In [10]:
# ============================================================================
# STEP 4: LANGCHAIN PROMPT TEMPLATES
# ============================================================================

# Convert closed-book templates to LangChain ChatPromptTemplate objects
CLOSED_BOOK_LANGCHAIN_TEMPLATES = {}
for key, template_info in CLOSED_BOOK_TEMPLATES.items():
    CLOSED_BOOK_LANGCHAIN_TEMPLATES[key] = {
        "alias": template_info["alias"],
        "template": template_info["template"],
        "langchain_prompt": ChatPromptTemplate.from_messages([
            ("system", "You are a financial expert assistant."),
            ("human", template_info["template"])
        ])
    }

# Convert oracle templates to LangChain ChatPromptTemplate objects
ORACLE_LANGCHAIN_TEMPLATES = {}
for key, template_info in ORACLE_TEMPLATES.items():
    ORACLE_LANGCHAIN_TEMPLATES[key] = {
        "alias": template_info["alias"],
        "template": template_info["template"],
        "langchain_prompt": ChatPromptTemplate.from_messages([
            ("system", "You are a financial expert assistant."),
            ("human", template_info["template"])
        ])
    }

print("‚úÖ LangChain prompt templates created successfully!")
print(f"   Closed-book templates: {list(CLOSED_BOOK_LANGCHAIN_TEMPLATES.keys())}")
print(f"   Oracle templates: {list(ORACLE_LANGCHAIN_TEMPLATES.keys())}")

# Test: Display one template structure
print("\n" + "="*80)
print("SAMPLE CLOSED-BOOK TEMPLATE (basic):")
print("="*80)
print(CLOSED_BOOK_LANGCHAIN_TEMPLATES["basic"]["langchain_prompt"].format(
    question="What is the revenue of Apple in 2020?"
))

‚úÖ LangChain prompt templates created successfully!
   Closed-book templates: ['basic', 'cot']
   Oracle templates: ['basic', 'structured']

SAMPLE CLOSED-BOOK TEMPLATE (basic):
System: You are a financial expert assistant.
Human: You are a financial expert. Answer the following question based on your knowledge.

Question: What is the revenue of Apple in 2020?

Provide a concise, accurate answer:


In [16]:
# ============================================================================
# STEP 5: LLM MODEL INITIALIZATION FUNCTION
# ============================================================================

def get_llm(provider: str, model: str, temperature: float):
    """
    Initialize and return a LangChain LLM based on provider.
    
    Args:
        provider: One of 'openai', 'anthropic', 'ollama'
        model: Model name (e.g., 'gpt-4o', 'claude-sonnet-4', 'llama3.1:8b')
        temperature: Temperature for generation (0.0 for deterministic)
    
    Returns:
        LangChain chat model instance
    """
    if provider == "openai":
        if not OPENAI_API_KEY:
            raise ValueError("OPENAI_API_KEY not found in environment variables")
        return ChatOpenAI(
            model=model,
            temperature=temperature,
            api_key=OPENAI_API_KEY
        )
    
    elif provider == "anthropic":
        if not ANTHROPIC_API_KEY:
            raise ValueError("ANTHROPIC_API_KEY not found in environment variables")
        return ChatAnthropic(
            model=model,
            temperature=temperature,
            api_key=ANTHROPIC_API_KEY
        )
    
    elif provider == "ollama":
        return ChatOllama(
            model=model,
            temperature=temperature,
            base_url=OLLAMA_BASE_URL
        )
    
    else:
        raise ValueError(f"Unknown provider: {provider}. Supported: openai, anthropic, ollama")

print("‚úÖ LLM initialization function created successfully!")

# Test: Initialize a sample LLM (if API key available)
try:
    test_llm = get_llm("openai", "gpt-4o-mini", 0.0)
    print(f"   ‚úÖ Test initialization successful: OpenAI GPT-4o-mini")
except Exception as e:
    print(f"   ‚ö†Ô∏è  Could not initialize test LLM: {e}")

‚úÖ LLM initialization function created successfully!
   ‚úÖ Test initialization successful: OpenAI GPT-4o-mini


In [17]:
# ============================================================================
# STEP 6: HELPER FUNCTIONS
# ============================================================================

def sanitize_model_name(model_name: str) -> str:
    """
    Sanitize model name for use in filenames.
    Replace special characters with underscores or hyphens.
    
    Args:
        model_name: Original model name (e.g., 'llama3.1:8b', 'gpt-4o')
    
    Returns:
        Sanitized model name (e.g., 'llama3-1_8b', 'gpt-4o')
    """
    # Replace : with _ and . with -
    sanitized = model_name.replace(":", "_").replace(".", "-")
    return sanitized


def generate_filename(config: Dict[str, Any]) -> str:
    """
    Generate output filename based on configuration.
    Format: {mode}_{provider}_{model}_{temperature}_{template_alias}.json
    
    Args:
        config: Configuration dictionary
    
    Returns:
        Filename string
    """
    mode = config["mode"]
    provider = config["provider"]
    model = sanitize_model_name(config["model"])
    temperature = config["temperature"]
    
    # Get template alias based on mode
    if mode == "closed_book":
        template_alias = CLOSED_BOOK_TEMPLATES[config["template_key"]]["alias"]
    else:  # oracle
        template_alias = ORACLE_TEMPLATES[config["template_key"]]["alias"]
    
    filename = f"{mode}_{provider}_{model}_{temperature}_{template_alias}.json"
    return filename


def check_file_exists(config: Dict[str, Any]) -> bool:
    """
    Check if output file for this configuration already exists.
    
    Args:
        config: Configuration dictionary
    
    Returns:
        True if file exists, False otherwise
    """
    filename = generate_filename(config)
    filepath = Path(OUTPUT_DIR) / filename
    return filepath.exists()


def format_evidence(evidence_list: List[Dict[str, Any]]) -> str:
    """
    Format evidence pieces for oracle mode prompts.
    Handles single or multiple evidence pieces with clear separation.
    
    Args:
        evidence_list: List of evidence dictionaries from FinanceBench
    
    Returns:
        Formatted evidence string
    """
    if not evidence_list or len(evidence_list) == 0:
        return ""
    
    # Single evidence piece
    if len(evidence_list) == 1:
        return evidence_list[0]["evidence_text"]
    
    # Multiple evidence pieces - format with clear separation
    formatted_parts = []
    for idx, evidence in enumerate(evidence_list, 1):
        formatted_parts.append(f"Evidence {idx}:\n{evidence['evidence_text']}")
    
    return "\n\n".join(formatted_parts)


print("‚úÖ Helper functions created successfully!")

# Test helper functions
print("\n" + "="*80)
print("TESTING HELPER FUNCTIONS:")
print("="*80)

# Test 1: Sanitize model name
test_model_1 = "gpt-4o"
test_model_2 = "llama3.1:8b"
print(f"Sanitize 'gpt-4o' ‚Üí '{sanitize_model_name(test_model_1)}'")
print(f"Sanitize 'llama3.1:8b' ‚Üí '{sanitize_model_name(test_model_2)}'")

# Test 2: Generate filename
test_config = CONFIGURATIONS[0]
test_filename = generate_filename(test_config)
print(f"\nGenerated filename: {test_filename}")

# Test 3: Check file exists
file_exists = check_file_exists(test_config)
print(f"File exists: {file_exists}")

# Test 4: Format evidence (using sample from dataset)
sample_evidence = dataset[0]["evidence"]
formatted_evidence = format_evidence(sample_evidence)
print(f"\nFormatted evidence (first 200 chars):\n{formatted_evidence[:200]}...")

‚úÖ Helper functions created successfully!

TESTING HELPER FUNCTIONS:
Sanitize 'gpt-4o' ‚Üí 'gpt-4o'
Sanitize 'llama3.1:8b' ‚Üí 'llama3-1_8b'

Generated filename: oracle_ollama_llama3-1_8b_0.0_oracle_basic.json
File exists: False

Formatted evidence (first 200 chars):
Table of Contents 
3M Company and Subsidiaries
Consolidated Statement of Cash Flow s
Years ended December 31
 
(Millions)
 
2018
 
2017
 
2016
 
Cash Flows from Operating Activities
 
 
 
 
 
 
 
Net ...


In [18]:
# ============================================================================
# STEP 7: LLM CALLER WITH RETRY LOGIC
# ============================================================================

def call_llm_with_retry(
    llm,
    prompt: str,
    max_retries: int = MAX_RETRIES,
    retry_delay: int = RETRY_DELAY,
    call_delay: float = CALL_DELAY
) -> str:
    """
    Call LLM with retry logic and rate limiting.
    
    Args:
        llm: LangChain LLM instance
        prompt: Formatted prompt string
        max_retries: Maximum number of retry attempts
        retry_delay: Seconds to wait before retry after failure
        call_delay: Seconds to wait between successful calls (rate limiting)
    
    Returns:
        Generated answer text
    
    Raises:
        Exception: If all retries fail, stops execution
    """
    last_error = None
    
    for attempt in range(max_retries):
        try:
            # Invoke the LLM
            response = llm.invoke(prompt)
            
            # Extract text content from response
            if hasattr(response, 'content'):
                answer = response.content
            else:
                answer = str(response)
            
            # Rate limiting: wait before next call
            time.sleep(call_delay)
            
            return answer.strip()
        
        except Exception as e:
            last_error = e
            print(f"   ‚ö†Ô∏è  Attempt {attempt + 1}/{max_retries} failed: {str(e)}")
            
            # If not the last attempt, wait before retry
            if attempt < max_retries - 1:
                print(f"   ‚è≥ Waiting {retry_delay} seconds before retry...")
                time.sleep(retry_delay)
            else:
                # Last attempt failed - stop execution
                print(f"   ‚ùå All {max_retries} attempts failed!")
                raise Exception(f"LLM call failed after {max_retries} attempts: {str(last_error)}")
    
    # Should not reach here, but just in case
    raise Exception(f"LLM call failed: {str(last_error)}")


print("‚úÖ LLM caller with retry logic created successfully!")

# Test: Call LLM with a simple prompt (if API key available)
print("\n" + "="*80)
print("TESTING LLM CALLER:")
print("="*80)
try:
    test_llm = get_llm("openai", "gpt-4o-mini", 0.0)
    test_prompt = "What is 2+2? Answer with just the number."
    test_answer = call_llm_with_retry(test_llm, test_prompt)
    print(f"‚úÖ Test successful!")
    print(f"   Prompt: {test_prompt}")
    print(f"   Answer: {test_answer}")
except Exception as e:
    print(f"‚ö†Ô∏è  Could not test LLM caller: {e}")

‚úÖ LLM caller with retry logic created successfully!

TESTING LLM CALLER:
‚úÖ Test successful!
   Prompt: What is 2+2? Answer with just the number.
   Answer: 4


In [19]:
# ============================================================================
# STEP 8: QUERY PROCESSOR WITH LANGCHAIN CHAIN
# ============================================================================

def process_query(
    query_data: Dict[str, Any],
    config: Dict[str, Any],
    llm,
    prompt_template: ChatPromptTemplate
) -> Dict[str, Any]:
    """
    Process a single query and generate an answer using LangChain chain.
    
    Args:
        query_data: Single query from FinanceBench dataset
        config: Configuration dictionary
        llm: LangChain LLM instance
        prompt_template: LangChain ChatPromptTemplate
    
    Returns:
        Query data with 'generated_answer' field added
    """
    mode = config["mode"]
    question = query_data["question"]
    
    # Build the chain: prompt_template | llm | output_parser
    chain = prompt_template | llm | StrOutputParser()
    
    # Prepare variables based on mode
    if mode == "closed_book":
        # Closed-book: only question
        variables = {"question": question}
    else:
        # Oracle: question + evidence
        evidence = query_data.get("evidence", [])
        formatted_evidence = format_evidence(evidence)
        variables = {
            "question": question,
            "evidence": formatted_evidence
        }
    
    # Invoke the chain with retry logic
    try:
        # Use the chain invoke which internally calls the LLM
        generated_answer = chain.invoke(variables)
        
        # Add delay for rate limiting
        time.sleep(CALL_DELAY)
        
    except Exception as e:
        # If error occurs, re-raise to stop execution
        raise Exception(f"Failed to process query {query_data.get('financebench_id', 'unknown')}: {str(e)}")
    
    # Add generated answer to query data
    result = query_data.copy()
    result["generated_answer"] = generated_answer.strip()
    
    return result


print("‚úÖ Query processor with LangChain chain created successfully!")

# Test: Process a single query
print("\n" + "="*80)
print("TESTING QUERY PROCESSOR:")
print("="*80)
try:
    # Get test configuration and LLM
    test_config = {
        "mode": "closed_book",
        "provider": "openai",
        "model": "gpt-4o-mini",
        "template_key": "basic",
        "temperature": 0.0
    }
    test_llm = get_llm(test_config["provider"], test_config["model"], test_config["temperature"])
    test_prompt_template = CLOSED_BOOK_LANGCHAIN_TEMPLATES[test_config["template_key"]]["langchain_prompt"]
    
    # Process first query from dataset
    test_query = dataset[0]
    print(f"Processing query: {test_query['financebench_id']}")
    print(f"Question: {test_query['question'][:100]}...")
    
    result = process_query(test_query, test_config, test_llm, test_prompt_template)
    
    print(f"\n‚úÖ Test successful!")
    print(f"   Ground truth answer: {result['answer']}")
    print(f"   Generated answer: {result['generated_answer']}")
    
except Exception as e:
    print(f"‚ö†Ô∏è  Could not test query processor: {e}")

‚úÖ Query processor with LangChain chain created successfully!

TESTING QUERY PROCESSOR:
Processing query: financebench_id_03029
Question: What is the FY2018 capital expenditure amount (in USD millions) for 3M? Give a response to the quest...

‚úÖ Test successful!
   Ground truth answer: $1577.00
   Generated answer: The FY2018 capital expenditure amount for 3M was approximately $1,400 million (or $1.4 billion).


In [20]:
# ============================================================================
# STEP 9: METADATA BUILDER
# ============================================================================

def build_metadata(
    config: Dict[str, Any],
    template_info: Dict[str, Any],
    dataset_info: Any
) -> Dict[str, Any]:
    """
    Build metadata object for output JSON file.
    
    Args:
        config: Configuration dictionary
        template_info: Template information (alias and template text)
        dataset_info: Dataset object from HuggingFace
    
    Returns:
        Metadata dictionary
    """
    metadata = {
        "mode": config["mode"],
        "provider": config["provider"],
        "model": config["model"],
        "temperature": config["temperature"],
        "prompt_template": template_info["template"],
        "template_alias": template_info["alias"],
        "dataset": DATASET_NAME,
        "dataset_split": DATASET_SPLIT,
        "total_questions": len(dataset_info),
        "generated_at": datetime.utcnow().isoformat() + "Z",
        "call_delay": CALL_DELAY,
        "max_retries": MAX_RETRIES
    }
    
    return metadata


print("‚úÖ Metadata builder function created successfully!")

# Test: Build sample metadata
print("\n" + "="*80)
print("TESTING METADATA BUILDER:")
print("="*80)
test_config = CONFIGURATIONS[0]
test_template_info = CLOSED_BOOK_LANGCHAIN_TEMPLATES[test_config["template_key"]]
test_metadata = build_metadata(test_config, test_template_info, dataset)

print(json.dumps(test_metadata, indent=2))

‚úÖ Metadata builder function created successfully!

TESTING METADATA BUILDER:
{
  "mode": "oracle",
  "provider": "ollama",
  "model": "llama3.1:8b",
  "temperature": 0.0,
  "prompt_template": "You are a financial expert. Answer the following question based on your knowledge.\n\nQuestion: {question}\n\nProvide a concise, accurate answer:",
  "template_alias": "closed_basic",
  "dataset": "PatronusAI/financebench",
  "dataset_split": "train",
  "total_questions": 150,
  "generated_at": "2025-10-25T15:10:48.955437Z",
  "call_delay": 1,
  "max_retries": 3
}


  "generated_at": datetime.utcnow().isoformat() + "Z",


In [21]:
# ============================================================================
# STEP 10: MAIN EXECUTION LOOP
# ============================================================================

def generate_answers_for_config(config: Dict[str, Any], dataset) -> None:
    """
    Process all queries for a given configuration and save results to JSON.
    
    Args:
        config: Configuration dictionary
        dataset: FinanceBench dataset
    
    Raises:
        Exception: If any query fails after retries, stops execution
    """
    # Check if output file already exists
    if check_file_exists(config):
        filename = generate_filename(config)
        print(f"‚è≠Ô∏è  Skipping {filename} (already exists)")
        return
    
    # Get mode and template information
    mode = config["mode"]
    template_key = config["template_key"]
    
    # Get appropriate template based on mode
    if mode == "closed_book":
        template_info = CLOSED_BOOK_LANGCHAIN_TEMPLATES[template_key]
    else:  # oracle
        template_info = ORACLE_LANGCHAIN_TEMPLATES[template_key]
    
    prompt_template = template_info["langchain_prompt"]
    
    # Initialize LLM
    print(f"\n{'='*80}")
    print(f"üöÄ Starting generation for: {generate_filename(config)}")
    print(f"{'='*80}")
    print(f"   Mode: {mode}")
    print(f"   Provider: {config['provider']}")
    print(f"   Model: {config['model']}")
    print(f"   Temperature: {config['temperature']}")
    print(f"   Template: {template_info['alias']}")
    
    try:
        llm = get_llm(config["provider"], config["model"], config["temperature"])
    except Exception as e:
        print(f"‚ùå Failed to initialize LLM: {e}")
        raise
    
    # Build metadata
    metadata = build_metadata(config, template_info, dataset)
    
    # Initialize results structure
    results = {
        "metadata": metadata,
        "queries": []
    }
    
    # Process all queries with progress bar
    print(f"\nüìä Processing {len(dataset)} queries...")
    
    for idx, query in enumerate(tqdm(dataset, desc="Generating answers")):
        try:
            # Process query
            result = process_query(query, config, llm, prompt_template)
            
            # Keep only required fields in specific order
            query_result = {
                "financebench_id": result["financebench_id"],
                "question_type": result["question_type"],
                "question_reasoning": result["question_reasoning"],
                "question": result["question"],
                "doc_name": result["doc_name"],
                "company": result["company"],
                "answer": result["answer"],
                "generated_answer": result["generated_answer"],
                "evidence": result["evidence"]
            }
            
            results["queries"].append(query_result)
            
        except Exception as e:
            print(f"\n‚ùå Error processing query {idx + 1}/{len(dataset)}: {e}")
            print(f"   Query ID: {query.get('financebench_id', 'unknown')}")
            raise  # Stop execution on error
    
    # Save results to JSON file
    filename = generate_filename(config)
    filepath = Path(OUTPUT_DIR) / filename
    
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    
    print(f"\n‚úÖ Successfully generated and saved: {filename}")
    print(f"   Total queries processed: {len(results['queries'])}")
    print(f"   File size: {filepath.stat().st_size / 1024:.2f} KB")


print("‚úÖ Main execution loop function created successfully!")

‚úÖ Main execution loop function created successfully!


In [28]:
# ============================================================================
# STEP 11: RUN ALL CONFIGURATIONS
# ============================================================================

def run_all_configurations(configurations: List[Dict[str, Any]]):
    """
    Execute all configurations in the provided configurations list.
    Generates JSON files for each configuration.
    
    Args:
        configurations: List of configuration dictionaries to process
    """
    print("="*80)
    print("üéØ STARTING BATCH GENERATION")
    print("="*80)
    print(f"Total configurations: {len(configurations)}")
    print(f"Output directory: {OUTPUT_DIR}")
    print(f"Dataset: {DATASET_NAME} ({len(dataset)} questions)")
    print("="*80)
    
    # Track statistics
    generated_count = 0
    skipped_count = 0
    failed_count = 0
    
    # Process each configuration
    for idx, config in enumerate(configurations, 1):
        print(f"\n[{idx}/{len(configurations)}] Processing configuration...")
        
        try:
            # Check if file exists before processing
            if check_file_exists(config):
                skipped_count += 1
            else:
                generate_answers_for_config(config, dataset)
                generated_count += 1
                
        except Exception as e:
            failed_count += 1
            print(f"\n‚ùå FATAL ERROR: Configuration failed!")
            print(f"   Config: {config}")
            print(f"   Error: {e}")
            print(f"\n‚õî Stopping execution due to error.")
            break
    
    # Print final summary
    print("\n" + "="*80)
    print("üìä GENERATION SUMMARY")
    print("="*80)
    print(f"‚úÖ Generated: {generated_count}")
    print(f"‚è≠Ô∏è  Skipped (already exists): {skipped_count}")
    print(f"‚ùå Failed: {failed_count}")
    print(f"üìÅ Output directory: {OUTPUT_DIR}")
    print("="*80)
    
    if failed_count == 0:
        print("üéâ All configurations completed successfully!")
    else:
        print("‚ö†Ô∏è  Some configurations failed. Check errors above.")


print("‚úÖ Run all configurations function created successfully!")
print("\n" + "="*80)
print("‚ö†Ô∏è  READY TO EXECUTE")
print("="*80)
print("To start generating answers, define your CONFIGURATIONS and run:")
print("   run_all_configurations(CONFIGURATIONS)")
print("\nThis will process all configurations and may take significant time.")
print("="*80)

‚úÖ Run all configurations function created successfully!

‚ö†Ô∏è  READY TO EXECUTE
To start generating answers, define your CONFIGURATIONS and run:
   run_all_configurations(CONFIGURATIONS)

This will process all configurations and may take significant time.


In [None]:
# ============================================================================
# EXECUTION CONFIGURATIONS & BATCH GENERATION
# ============================================================================

# Execution Configurations
# Each configuration will generate a separate output JSON file
CONFIGURATIONS = [
    # Closed-Book Configurations
    # {
    #     "mode": "closed_book",
    #     "provider": "openai",
    #     "model": "gpt-4o",
    #     "template_key": "basic",
    #     "temperature": 0.0
    # },
    {
        "mode": "closed_book",
        "provider": "openai",
        "model": "gpt-4o-mini",
        "template_key": "basic",
        "temperature": 0.0
    },
    # {
    #     "mode": "closed_book",
    #     "provider": "anthropic",
    #     "model": "claude-sonnet-4",
    #     "template_key": "basic",
    #     "temperature": 0.0
    # },
    
    # # Oracle Configurations
    # {
    #     "mode": "oracle",
    #     "provider": "openai",
    #     "model": "gpt-4o",
    #     "template_key": "basic",
    #     "temperature": 0.0
    # },
    # {
    #     "mode": "oracle",
    #     "provider": "openai",
    #     "model": "gpt-4o-mini",
    #     "template_key": "basic",
    #     "temperature": 0.0
    # },
    # {
    #     "mode": "oracle",
    #     "provider": "anthropic",
    #     "model": "claude-sonnet-4",
    #     "template_key": "structured",
    #     "temperature": 0.0
    # },
    # {
    #     "mode": "oracle",
    #     "provider": "ollama",
    #     "model": "llama3.1:8b",
    #     "template_key": "basic",
    #     "temperature": 0.0
    # }
]

# Display configuration statistics
print("="*80)
print("üìã CONFIGURATION STATISTICS")
print("="*80)

# Count by mode
closed_book_count = sum(1 for c in CONFIGURATIONS if c["mode"] == "closed_book")
oracle_count = sum(1 for c in CONFIGURATIONS if c["mode"] == "oracle")

print(f"Total configurations: {len(CONFIGURATIONS)}")
print(f"  ‚Ä¢ Closed-book: {closed_book_count}")
print(f"  ‚Ä¢ Oracle: {oracle_count}")

# Count by provider
providers = {}
for config in CONFIGURATIONS:
    provider = config["provider"]
    providers[provider] = providers.get(provider, 0) + 1

print(f"\nBy provider:")
for provider, count in sorted(providers.items()):
    print(f"  ‚Ä¢ {provider}: {count}")

# Check which files already exist and which need to be generated
print("\n" + "="*80)
print("üìÇ FILE STATUS CHECK")
print("="*80)

existing_files = []
to_generate = []

for idx, config in enumerate(CONFIGURATIONS, 1):
    filename = generate_filename(config)
    exists = check_file_exists(config)
    
    if exists:
        existing_files.append((idx, filename, config))
    else:
        to_generate.append((idx, filename, config))

print(f"Existing files: {len(existing_files)}")
print(f"To be generated: {len(to_generate)}")

# Show existing files
if existing_files:
    print(f"\n‚úÖ Already exist ({len(existing_files)}):")
    for idx, filename, config in existing_files:
        print(f"  [{idx}] {filename}")

# Show files to be generated
if to_generate:
    print(f"\nüîÑ Will generate ({len(to_generate)}):")
    for idx, filename, config in to_generate:
        print(f"  [{idx}] {filename}")
        print(f"      Mode: {config['mode']} | Provider: {config['provider']} | Model: {config['model']}")

# Estimate time
if to_generate:
    estimated_time_minutes = (len(to_generate) * len(dataset) * CALL_DELAY) / 60
    print(f"\n‚è±Ô∏è  Estimated time: ~{estimated_time_minutes:.1f} minutes")
    print(f"   (Based on {len(dataset)} queries √ó {CALL_DELAY}s delay √ó {len(to_generate)} configs)")

print("="*80)

# Confirmation before starting
if to_generate:
    print("\nüöÄ Ready to start generation!")
    print("   This will process all configurations listed above.")
    print("="*80)
    
    # Start the batch generation process
    run_all_configurations(CONFIGURATIONS)
else:
    print("\n‚úÖ All files already exist. Nothing to generate.")
    print("   Delete files from output directory if you want to regenerate.")
    print("="*80)