In [34]:
# ============================================================================
# STEP 1: SETUP & DEPENDENCIES
# ============================================================================

# Standard library imports
import os
import json
import time
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Any

# LangChain imports for LLM providers
from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic
from langchain_community.chat_models import ChatOllama

# LangChain utilities for prompts and parsing
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser

# Structured output imports (NEW) - using standard Pydantic
from pydantic import BaseModel, Field

# Dataset and utility imports
from datasets import load_dataset
from dotenv import load_dotenv
from tqdm.auto import tqdm

# Create output directory if it doesn't exist
OUTPUT_DIR = "../../generation_set/closedbook_oracle_sets"
RAG_PATH = "../../retrieval_set"
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

print("‚úÖ All libraries imported successfully!")
print(f"‚úÖ Output directory ready: {OUTPUT_DIR}")

‚úÖ All libraries imported successfully!
‚úÖ Output directory ready: ../../generation_set/closedbook_oracle_sets


In [35]:
# ============================================================================
# STRUCTURED OUTPUT SCHEMA
# ============================================================================

class FinanceAnswer(BaseModel):
    """
    Structured output schema for financial question answering.
    Separates reasoning from final answer for better evaluation.
    """
    reasoning: str = Field(
        description="Step-by-step reasoning and analysis leading to the answer"
    )
    final_answer: str = Field(
        description="The final answer only, concise and precise"
    )

print("‚úÖ Structured output schema defined!")
print(f"   Schema fields: {list(FinanceAnswer.__fields__.keys())}")

# Test: Create a sample instance to verify schema
test_instance = FinanceAnswer(
    reasoning="Based on the financial statements, revenue was $100M in Q1 and $120M in Q2. Growth = ($120M - $100M) / $100M = 20%",
    final_answer="20%"
)
print(f"\nüìã Sample instance:")
print(f"   Reasoning: {test_instance.reasoning[:80]}...")
print(f"   Final Answer: {test_instance.final_answer}")

‚úÖ Structured output schema defined!
   Schema fields: ['reasoning', 'final_answer']

üìã Sample instance:
   Reasoning: Based on the financial statements, revenue was $100M in Q1 and $120M in Q2. Grow...
   Final Answer: 20%


/var/folders/lj/175ptt0d6knb0gg0lg2h4n2h0000gp/T/ipykernel_70065/4147351160.py:18: PydanticDeprecatedSince20: The `__fields__` attribute is deprecated, use the `model_fields` class property instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.12/migration/
  print(f"   Schema fields: {list(FinanceAnswer.__fields__.keys())}")


In [31]:
# ============================================================================
# STEP 2: CONFIGURATION VARIABLES
# ============================================================================

# Load environment variables
load_dotenv()

# Dataset Configuration
DATASET_NAME = "PatronusAI/financebench"
DATASET_SPLIT = "train"  # 150 questions in train split

# API Configuration - Load from environment
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")

# Rate Limiting Configuration
CALL_DELAY = 1  # Seconds between each LLM call (rate limiting)
RETRY_DELAY = 30  # Seconds to wait before retry after failure
MAX_RETRIES = 3  # Maximum number of retry attempts

# Global Generation Parameters
TEMPERATURE = 0.0  # Deterministic generation for reproducibility

print(f"   Dataset: {DATASET_NAME} ({DATASET_SPLIT} split)")
print(f"   Output directory: {OUTPUT_DIR}")

   Dataset: PatronusAI/financebench (train split)
   Output directory: ../../generation_set/closedbook_oracle_sets


In [36]:
# ============================================================================
# PROMPT TEMPLATES - ORGANIZED BY QUESTION TYPE (STRUCTURED OUTPUT)
# ============================================================================

# Templates for METRICS-GENERATED questions
# These questions ask for specific numerical values from financial statements
METRICS_GENERATED_TEMPLATES = {
    "closed_book": {
        "basic": {
            "alias": "metrics_closed_basic",
            "template": """You are a financial expert. Answer the following question about a specific financial metric using only your knowledge.

Question: {question}

Provide your step-by-step reasoning, then give the final numerical answer (e.g., $X million, X%, etc.)."""
        },
        "cot": {
            "alias": "metrics_closed_cot",
            "template": """You are a financial expert. Answer the following question about a specific financial metric using only your knowledge.

Question: {question}

Think through this systematically:
1. Identify what metric is being asked
2. Recall the relevant information
3. Provide the numerical answer

Show your reasoning and provide the final answer."""
        }
    },
    "rag": {
        "basic": {
            "alias": "metrics_rag_basic",
            "template": """You are a financial expert. Extract the specific numerical metric from the provided financial document evidence.

Evidence:
{evidence}

Question: {question}

Analyze the evidence and provide:
1. Your reasoning: Explain which parts of the evidence contain the answer
2. Final answer: Provide only the numerical value exactly as shown in the evidence (e.g., $X million, X%, etc.)"""
        },
        "cot": {
            "alias": "metrics_rag_cot",
            "template": """You are a financial expert. Extract the specific numerical metric from the provided financial document evidence.

Evidence:
{evidence}

Question: {question}

Solve this systematically:
1. Locate the relevant section in the evidence
2. Identify the exact metric requested
3. Extract the value in the correct format

Provide your complete reasoning and the final numerical answer."""
        }
    }
}

# Templates for DOMAIN-RELEVANT questions
# These questions test understanding of financial concepts and domain knowledge
DOMAIN_RELEVANT_TEMPLATES = {
    "closed_book": {
        "basic": {
            "alias": "domain_closed_basic",
            "template": """You are a financial expert. Answer the following question about financial concepts and business operations using your knowledge.

Question: {question}

Provide your reasoning and a clear, accurate final answer."""
        },
        "cot": {
            "alias": "domain_closed_cot",
            "template": """You are a financial expert. Answer the following question about financial concepts and business operations using your knowledge.

Question: {question}

Think through this step by step:
1. Understand the financial concept being asked
2. Apply relevant domain knowledge
3. Formulate the answer

Provide your reasoning process and the final answer."""
        }
    },
    "rag": {
        "basic": {
            "alias": "domain_rag_basic",
            "template": """You are a financial expert. Answer the question based on the provided evidence from financial documents.

Evidence:
{evidence}

Question: {question}

Provide:
1. Your reasoning: Analyze the relevant information from the evidence
2. Final answer: A clear, concise answer based strictly on the evidence"""
        },
        "cot": {
            "alias": "domain_rag_cot",
            "template": """You are a financial expert. Answer the question using the provided evidence from financial documents.

Evidence:
{evidence}

Question: {question}

Analyze this carefully:
1. Identify relevant domain information in the evidence
2. Connect it to the question being asked
3. Formulate a precise answer

Provide your complete reasoning and the final answer."""
        }
    }
}

# Templates for NOVEL-GENERATED questions
# These questions require synthesis and reasoning beyond simple extraction
NOVEL_GENERATED_TEMPLATES = {
    "closed_book": {
        "basic": {
            "alias": "novel_closed_basic",
            "template": """You are a financial expert. Answer the following question that requires analysis and synthesis using your knowledge.

Question: {question}

Provide your analytical reasoning and a well-supported final answer."""
        },
        "cot": {
            "alias": "novel_closed_cot",
            "template": """You are a financial expert. Answer the following question that requires analysis and synthesis using your knowledge.

Question: {question}

Work through this systematically:
1. Break down what the question is asking
2. Identify the information and analysis needed
3. Synthesize an answer with clear reasoning

Provide your complete reasoning process and the final answer."""
        }
    },
    "rag": {
        "basic": {
            "alias": "novel_rag_basic",
            "template": """You are a financial expert. Answer the question by analyzing and synthesizing information from the provided financial document evidence.

Evidence:
{evidence}

Question: {question}

Provide:
1. Your reasoning: Analyze and synthesize the relevant information from the evidence
2. Final answer: A clear, reasoned answer based on your analysis"""
        },
        "cot": {
            "alias": "novel_rag_cot",
            "template": """You are a financial expert. Answer the question by analyzing and synthesizing information from the provided financial document evidence.

Evidence:
{evidence}

Question: {question}

Approach this analytically:
1. Review all relevant information in the evidence
2. Identify connections and patterns
3. Synthesize a comprehensive answer

Provide your complete analytical reasoning and the final answer."""
        }
    }
}

print("‚úÖ Template variables restructured successfully!")
print(f"   METRICS_GENERATED_TEMPLATES: {list(METRICS_GENERATED_TEMPLATES.keys())}")
print(f"   DOMAIN_RELEVANT_TEMPLATES: {list(DOMAIN_RELEVANT_TEMPLATES.keys())}")
print(f"   NOVEL_GENERATED_TEMPLATES: {list(NOVEL_GENERATED_TEMPLATES.keys())}")

‚úÖ Template variables restructured successfully!
   METRICS_GENERATED_TEMPLATES: ['closed_book', 'rag']
   DOMAIN_RELEVANT_TEMPLATES: ['closed_book', 'rag']
   NOVEL_GENERATED_TEMPLATES: ['closed_book', 'rag']


In [37]:
# ============================================================================
# STEP 3: LOAD FINANCEBENCH DATASET
# ============================================================================

# Load dataset from HuggingFace
print(f"Loading dataset: {DATASET_NAME} (split: {DATASET_SPLIT})...")
dataset = load_dataset(DATASET_NAME, split=DATASET_SPLIT)

print(f"‚úÖ Dataset loaded successfully!")
print(f"   Total questions: {len(dataset)}")
print(f"   Dataset features: {dataset.features.keys()}")

# Display a sample entry to verify structure
print("\n" + "="*80)
print("SAMPLE ENTRY:")
print("="*80)
sample = dataset[0]
print(f"ID: {sample['financebench_id']}")
print(f"Company: {sample['company']}")
print(f"Question Type: {sample['question_type']}")
print(f"Question: {sample['question'][:100]}...")
print(f"Answer: {sample['answer']}")
print(f"Number of evidence pieces: {len(sample['evidence'])}")
if len(sample['evidence']) > 0:
    print(f"First evidence preview: {sample['evidence'][0]['evidence_text'][:150]}...")

Loading dataset: PatronusAI/financebench (split: train)...
‚úÖ Dataset loaded successfully!
   Total questions: 150
   Dataset features: dict_keys(['financebench_id', 'company', 'doc_name', 'question_type', 'question_reasoning', 'domain_question_num', 'question', 'answer', 'justification', 'dataset_subset_label', 'evidence', 'gics_sector', 'doc_type', 'doc_period', 'doc_link'])

SAMPLE ENTRY:
ID: financebench_id_03029
Company: 3M
Question Type: metrics-generated
Question: What is the FY2018 capital expenditure amount (in USD millions) for 3M? Give a response to the quest...
Answer: $1577.00
Number of evidence pieces: 1
First evidence preview: Table of Contents 
3M Company and Subsidiaries
Consolidated Statement of Cash Flow s
Years ended December 31
 
(Millions)
 
2018
 
2017
 
2016
 
Cash ...


In [38]:
# ============================================================================
# STEP 4: LANGCHAIN PROMPT TEMPLATES
# ============================================================================

def create_langchain_templates(template_dict):
    """
    Convert template dictionary to LangChain ChatPromptTemplate objects.
    
    Args:
        template_dict: Dictionary with structure {mode: {variant: {alias, template}}}
    
    Returns:
        Dictionary with LangChain prompt objects added
    """
    langchain_dict = {}
    
    for mode, variants in template_dict.items():
        langchain_dict[mode] = {}
        for variant, template_info in variants.items():
            langchain_dict[mode][variant] = {
                "alias": template_info["alias"],
                "template": template_info["template"],
                "langchain_prompt": ChatPromptTemplate.from_messages([
                    ("system", "You are a financial expert assistant."),
                    ("human", template_info["template"])
                ])
            }
    
    return langchain_dict

# Convert all template dictionaries to LangChain format
METRICS_GENERATED_LANGCHAIN = create_langchain_templates(METRICS_GENERATED_TEMPLATES)
DOMAIN_RELEVANT_LANGCHAIN = create_langchain_templates(DOMAIN_RELEVANT_TEMPLATES)
NOVEL_GENERATED_LANGCHAIN = create_langchain_templates(NOVEL_GENERATED_TEMPLATES)

# Create a master dictionary for easy access by question type
LANGCHAIN_TEMPLATES_BY_QUESTION_TYPE = {
    "metrics-generated": METRICS_GENERATED_LANGCHAIN,
    "domain-relevant": DOMAIN_RELEVANT_LANGCHAIN,
    "novel-generated": NOVEL_GENERATED_LANGCHAIN
}

print("‚úÖ LangChain prompt templates created successfully!")
print(f"   Question types: {list(LANGCHAIN_TEMPLATES_BY_QUESTION_TYPE.keys())}")
print(f"   Modes per type: {list(METRICS_GENERATED_LANGCHAIN.keys())}")
print(f"   Variants per mode: {list(METRICS_GENERATED_LANGCHAIN['closed_book'].keys())}")

# Test: Display sample templates
print("\n" + "="*80)
print("SAMPLE TEMPLATES:")
print("="*80)

# Test metrics-generated closed-book basic
print("\n1. METRICS-GENERATED / CLOSED-BOOK / BASIC:")
print("-" * 80)
test_prompt = METRICS_GENERATED_LANGCHAIN["closed_book"]["basic"]["langchain_prompt"]
print(test_prompt.format(question="What is the FY2020 revenue of Apple in USD millions?"))

# Test domain-relevant rag basic
print("\n2. DOMAIN-RELEVANT / RAG / BASIC:")
print("-" * 80)
test_prompt = DOMAIN_RELEVANT_LANGCHAIN["rag"]["basic"]["langchain_prompt"]
print(test_prompt.format(
    question="What are the main risk factors for the company?",
    evidence="Risk Factors: The company faces competition from other tech firms..."
))

# Test novel-generated rag cot
print("\n3. NOVEL-GENERATED / RAG / COT:")
print("-" * 80)
test_prompt = NOVEL_GENERATED_LANGCHAIN["rag"]["cot"]["langchain_prompt"]
print(test_prompt.format(
    question="How has the company's debt structure changed?",
    evidence="2019 debt: $50M. 2020 debt: $75M..."
))

‚úÖ LangChain prompt templates created successfully!
   Question types: ['metrics-generated', 'domain-relevant', 'novel-generated']
   Modes per type: ['closed_book', 'rag']
   Variants per mode: ['basic', 'cot']

SAMPLE TEMPLATES:

1. METRICS-GENERATED / CLOSED-BOOK / BASIC:
--------------------------------------------------------------------------------
System: You are a financial expert assistant.
Human: You are a financial expert. Answer the following question about a specific financial metric using only your knowledge.

Question: What is the FY2020 revenue of Apple in USD millions?

Provide your step-by-step reasoning, then give the final numerical answer (e.g., $X million, X%, etc.).

2. DOMAIN-RELEVANT / RAG / BASIC:
--------------------------------------------------------------------------------
System: You are a financial expert assistant.
Human: You are a financial expert. Answer the question based on the provided evidence from financial documents.

Evidence:
Risk Factors: Th

In [39]:
# ============================================================================
# STEP 5: LLM MODEL INITIALIZATION FUNCTION
# ============================================================================

def get_llm(provider: str, model: str, temperature: float):
    """
    Initialize and return a LangChain LLM based on provider.
    
    Args:
        provider: One of 'openai', 'anthropic', 'ollama'
        model: Model name (e.g., 'gpt-4o', 'claude-sonnet-4', 'llama3.1:8b')
        temperature: Temperature for generation (0.0 for deterministic)
    
    Returns:
        LangChain chat model instance
    """
    if provider == "openai":
        if not OPENAI_API_KEY:
            raise ValueError("OPENAI_API_KEY not found in environment variables")
        return ChatOpenAI(
            model=model,
            temperature=temperature,
            api_key=OPENAI_API_KEY
        )
    
    elif provider == "anthropic":
        if not ANTHROPIC_API_KEY:
            raise ValueError("ANTHROPIC_API_KEY not found in environment variables")
        return ChatAnthropic(
            model=model,
            temperature=temperature,
            api_key=ANTHROPIC_API_KEY
        )
    
    elif provider == "ollama":
        return ChatOllama(
            model=model,
            temperature=temperature,
            base_url=OLLAMA_BASE_URL
        )
    
    else:
        raise ValueError(f"Unknown provider: {provider}. Supported: openai, anthropic, ollama")

print("‚úÖ LLM initialization function created successfully!")

# Test: Initialize a sample LLM (if API key available)
try:
    test_llm = get_llm("openai", "gpt-4o-mini", 0.0)
    print(f"   ‚úÖ Test initialization successful: OpenAI GPT-4o-mini")
except Exception as e:
    print(f"   ‚ö†Ô∏è  Could not initialize test LLM: {e}")

‚úÖ LLM initialization function created successfully!
   ‚úÖ Test initialization successful: OpenAI GPT-4o-mini


In [40]:
# ============================================================================
# STEP 6: HELPER FUNCTIONS
# ============================================================================

def sanitize_model_name(model_name: str) -> str:
    """
    Sanitize model name for use in filenames.
    Replace special characters with underscores or hyphens.
    
    Args:
        model_name: Original model name (e.g., 'llama3.1:8b', 'gpt-4o-mini')
    
    Returns:
        Sanitized model name (e.g., 'llama3-1_8b', 'gpt-4o-mini')
    """
    # Replace : with _ and . with -
    sanitized = model_name.replace(":", "_").replace(".", "-")
    return sanitized


def generate_filename(config: Dict[str, Any]) -> str:
    """
    Generate output filename based on configuration.
    Format: {mode}_{question_type}_{provider}_{model}_{temperature}_{template_alias}.json
    
    Args:
        config: Configuration dictionary
    
    Returns:
        Filename string
    """
    mode = config["mode"]
    provider = config["provider"]
    model = sanitize_model_name(config["model"])
    temperature = config["temperature"]
    question_type = config["question_type"]
    template_key = config["template_key"]
    
    # Get template alias based on question_type and mode
    # Map mode: "oracle" or "rag" uses "rag" templates
    template_mode = "rag" if mode in ["oracle", "rag"] else mode
    
    template_dict = LANGCHAIN_TEMPLATES_BY_QUESTION_TYPE[question_type][template_mode][template_key]
    template_alias = template_dict["alias"]
    
    filename = f"{mode}_{question_type}_{provider}_{model}_{temperature}_{template_alias}.json"
    return filename


def check_file_exists(config: Dict[str, Any]) -> bool:
    """
    Check if output file for this configuration already exists.
    
    Args:
        config: Configuration dictionary
    
    Returns:
        True if file exists, False otherwise
    """
    filename = generate_filename(config)
    filepath = Path(OUTPUT_DIR) / filename
    return filepath.exists()


def format_evidence(evidence_list: List[Dict[str, Any]]) -> str:
    """
    Format evidence pieces for oracle/RAG mode prompts.
    Handles single or multiple evidence pieces with clear separation.
    
    Args:
        evidence_list: List of evidence dictionaries from FinanceBench
    
    Returns:
        Formatted evidence string
    """
    if not evidence_list or len(evidence_list) == 0:
        return ""
    
    # Single evidence piece
    if len(evidence_list) == 1:
        return evidence_list[0]["evidence_text"]
    
    # Multiple evidence pieces - format with clear separation
    formatted_parts = []
    for idx, evidence in enumerate(evidence_list, 1):
        formatted_parts.append(f"Evidence {idx}:\n{evidence['evidence_text']}")
    
    return "\n\n".join(formatted_parts)


print("‚úÖ Helper functions created successfully!")

# Test helper functions
print("\n" + "="*80)
print("TESTING HELPER FUNCTIONS:")
print("="*80)

# Test 1: Sanitize model name
test_model_1 = "gpt-4o-mini"
test_model_2 = "llama3.1:8b"
print(f"Sanitize 'gpt-4o-mini' ‚Üí '{sanitize_model_name(test_model_1)}'")
print(f"Sanitize 'llama3.1:8b' ‚Üí '{sanitize_model_name(test_model_2)}'")

# Test 2: Generate filename with question_type (closed-book)
test_config = {
    "mode": "closed_book",
    "provider": "openai",
    "model": "gpt-4o-mini",
    "temperature": 0.0,
    "question_type": "metrics-generated",
    "template_key": "basic"
}
test_filename = generate_filename(test_config)
print(f"\nGenerated filename (closed-book): {test_filename}")
print(f"Expected: closed_book_metrics-generated_openai_gpt-4o-mini_0.0_metrics_closed_basic.json")

# Test 3: Generate filename for oracle mode (uses "rag" templates)
test_config_oracle = {
    "mode": "oracle",
    "provider": "openai",
    "model": "gpt-4o-mini",
    "temperature": 0.0,
    "question_type": "domain-relevant",
    "template_key": "cot"
}
test_filename_oracle = generate_filename(test_config_oracle)
print(f"\nGenerated filename (oracle): {test_filename_oracle}")
print(f"Expected: oracle_domain-relevant_openai_gpt-4o-mini_0.0_domain_rag_cot.json")

# Test 4: Check file exists
file_exists = check_file_exists(test_config)
print(f"\nFile exists: {file_exists}")

# Test 5: Format evidence (using sample from dataset)
sample_evidence = dataset[0]["evidence"]
formatted_evidence = format_evidence(sample_evidence)
print(f"\nFormatted evidence (first 200 chars):\n{formatted_evidence[:200]}...")

‚úÖ Helper functions created successfully!

TESTING HELPER FUNCTIONS:
Sanitize 'gpt-4o-mini' ‚Üí 'gpt-4o-mini'
Sanitize 'llama3.1:8b' ‚Üí 'llama3-1_8b'

Generated filename (closed-book): closed_book_metrics-generated_openai_gpt-4o-mini_0.0_metrics_closed_basic.json
Expected: closed_book_metrics-generated_openai_gpt-4o-mini_0.0_metrics_closed_basic.json

Generated filename (oracle): oracle_domain-relevant_openai_gpt-4o-mini_0.0_domain_rag_cot.json
Expected: oracle_domain-relevant_openai_gpt-4o-mini_0.0_domain_rag_cot.json

File exists: True

Formatted evidence (first 200 chars):
Table of Contents 
3M Company and Subsidiaries
Consolidated Statement of Cash Flow s
Years ended December 31
 
(Millions)
 
2018
 
2017
 
2016
 
Cash Flows from Operating Activities
 
 
 
 
 
 
 
Net ...


In [41]:
# ============================================================================
# STEP 7: LLM CALLER WITH RETRY LOGIC
# ============================================================================

def call_llm_with_retry(
    llm,
    prompt: str,
    max_retries: int = MAX_RETRIES,
    retry_delay: int = RETRY_DELAY,
    call_delay: float = CALL_DELAY
) -> str:
    """
    Call LLM with retry logic and rate limiting.
    
    Args:
        llm: LangChain LLM instance
        prompt: Formatted prompt string
        max_retries: Maximum number of retry attempts
        retry_delay: Seconds to wait before retry after failure
        call_delay: Seconds to wait between successful calls (rate limiting)
    
    Returns:
        Generated answer text
    
    Raises:
        Exception: If all retries fail, stops execution
    """
    last_error = None
    
    for attempt in range(max_retries):
        try:
            # Invoke the LLM
            response = llm.invoke(prompt)
            
            # Extract text content from response
            if hasattr(response, 'content'):
                answer = response.content
            else:
                answer = str(response)
            
            # Rate limiting: wait before next call
            time.sleep(call_delay)
            
            return answer.strip()
        
        except Exception as e:
            last_error = e
            print(f"   ‚ö†Ô∏è  Attempt {attempt + 1}/{max_retries} failed: {str(e)}")
            
            # If not the last attempt, wait before retry
            if attempt < max_retries - 1:
                print(f"   ‚è≥ Waiting {retry_delay} seconds before retry...")
                time.sleep(retry_delay)
            else:
                # Last attempt failed - stop execution
                print(f"   ‚ùå All {max_retries} attempts failed!")
                raise Exception(f"LLM call failed after {max_retries} attempts: {str(last_error)}")
    
    # Should not reach here, but just in case
    raise Exception(f"LLM call failed: {str(last_error)}")


print("‚úÖ LLM caller with retry logic created successfully!")

# Test: Call LLM with a simple prompt (if API key available)
print("\n" + "="*80)
print("TESTING LLM CALLER:")
print("="*80)
try:
    test_llm = get_llm("openai", "gpt-4o-mini", 0.0)
    test_prompt = "What is 2+2? Answer with just the number."
    test_answer = call_llm_with_retry(test_llm, test_prompt)
    print(f"‚úÖ Test successful!")
    print(f"   Prompt: {test_prompt}")
    print(f"   Answer: {test_answer}")
except Exception as e:
    print(f"‚ö†Ô∏è  Could not test LLM caller: {e}")

‚úÖ LLM caller with retry logic created successfully!

TESTING LLM CALLER:
‚úÖ Test successful!
   Prompt: What is 2+2? Answer with just the number.
   Answer: 4


In [43]:
# ============================================================================
# RAG JSON READING AND QUERY EXTRACTION FUNCTIONS
# ============================================================================

from typing import Optional, Tuple

def read_rag_json(filepath: str) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """
    Read a RAG results JSON file and return both configuration and queries.
    
    Args:
        filepath: Path to the JSON file
        
    Returns:
        Tuple of (configuration_dict, full_data_dict)
        - configuration_dict: The parsed configuration from the JSON
        - full_data_dict: The complete JSON data including queries
        
    Raises:
        FileNotFoundError: If file doesn't exist
        json.JSONDecodeError: If file is not valid JSON
    """
    filepath = Path(filepath)
    
    if not filepath.exists():
        raise FileNotFoundError(f"File not found: {filepath}")
    
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    # Extract configuration
    config = data.get('configuration', {})
    
    return config, data


def get_rag_queries_from_json(data: Dict[str, Any]) -> list:
    """
    Extract queries list from the JSON data.
    
    Args:
        data: The full JSON data dictionary
        
    Returns:
        List of query dictionaries
    """
    return data.get('queries', [])


def get_rag_query_by_id(data: Dict[str, Any], financebench_id: str) -> Optional[Dict[str, Any]]:
    """
    Find a specific query by its financebench_id.
    
    Args:
        data: The full JSON data dictionary
        financebench_id: The ID to search for
        
    Returns:
        Query dictionary if found, None otherwise
    """
    queries = get_rag_queries_from_json(data)
    
    for query in queries:
        if query.get('financebench_id') == financebench_id:
            return query
    
    return None

#===============================================================
# Test configurations for simplified RAG retrieval
#===============================================================

filename = "voyage_voyage-3-large_chunk512_k80_global_rerank_k20-voyage-rerank-2.5.json"
config, data = read_rag_json(f'{RAG_PATH}/{filename}')
print("Configuration:", config)
print("Number of queries loaded:", len(get_rag_queries_from_json(data)))
print(data['queries'][0])  # Print first query for inspection

# Find specific query
print("\nSearching for financebench_id 'financebench_id_03029'...")
query = get_rag_query_by_id(data, "financebench_id_03029")
if query:
    print("Query found:", query)

Configuration: {'provider': 'voyage', 'model': 'voyage-3-large', 'chunk_size': 512, 'k_retrieve': 80, 'k_rerank': 20, 'reranker_model': 'voyage-rerank-2.5', 'mode': 'global'}
Number of queries loaded: 150
{'financebench_id': 'financebench_id_03029', 'question_type': 'metrics-generated', 'question_reasoning': 'Information extraction', 'question': 'What is the FY2018 capital expenditure amount (in USD millions) for 3M? Give a response to the question by relying on the details shown in the cash flow statement.', 'doc_name': '3M_2018_10K', 'evidence': [{'evidence_text': 'Table of Contents \n3M Company and Subsidiaries\nConsolidated Statement of Cash Flow s\nYears ended December 31\n \n(Millions)\n    \n2018\n    \n2017\n    \n2016\n \nCash Flows from Operating Activities\n \n \n  \n \n  \n \n  \nNet income including noncontrolling interest\n \n$\n5,363  \n$\n4,869  \n$\n5,058  \nAdjustments to reconcile net income including noncontrolling interest to net cash\nprovided by operating activit

In [44]:
# ============================================================================
# STEP 8: QUERY PROCESSOR WITH HYBRID STRUCTURED OUTPUT
# ============================================================================

def process_query(
    query_data: Dict[str, Any],
    config: Dict[str, Any],
    llm,
    prompt_template: ChatPromptTemplate
) -> Dict[str, Any]:
    """
    Process a single query and generate a structured answer.
    
    Hybrid Approach:
    - OpenAI/Anthropic: Use native structured output (with_structured_output)
    - Ollama: Use JsonOutputParser as fallback
    
    Args:
        query_data: Single query from FinanceBench dataset
        config: Configuration dictionary
        llm: LangChain LLM instance
        prompt_template: LangChain ChatPromptTemplate
    
    Returns:
        Query data with 'reasoning' and 'generated_answer' fields added
    """
    mode = config["mode"]
    provider = config["provider"]
    question = query_data["question"]
    
    # Prepare variables based on mode
    if mode == "closed_book":
        # Closed-book: only question
        variables = {"question": question}
    # elif mode == "rag":  # RAG mode
    elif mode == "rag":
        # RAG: question + evidence from retrieval
        evidence = query_data.get("retrieved_evidence", [])
        formatted_evidence = format_evidence(evidence)
        variables = {
            "question": question,
            "evidence": formatted_evidence
        }
    else:  # oracle mode
        # Oracle: question + evidence from FinanceBench annotations
        evidence = query_data.get("evidence", [])
        formatted_evidence = format_evidence(evidence)
        variables = {
            "question": question,
            "evidence": formatted_evidence
        }
    
    # Build chain based on provider (HYBRID APPROACH)
    if provider in ["openai", "anthropic"]:
        # Use native structured output for OpenAI/Anthropic
        # This uses the model's function calling API for reliable JSON
        structured_llm = llm.with_structured_output(FinanceAnswer)
        chain = prompt_template | structured_llm
    else:
        # Use JsonOutputParser for Ollama and other providers
        # This adds JSON format instructions to the prompt automatically
        parser = JsonOutputParser(pydantic_object=FinanceAnswer)
        chain = prompt_template | llm | parser
    
    # Invoke the chain
    try:
        result = chain.invoke(variables)
        
        # Add delay for rate limiting
        time.sleep(CALL_DELAY)
        
    except Exception as e:
        # If error occurs, re-raise to stop execution
        raise Exception(f"Failed to process query {query_data.get('financebench_id', 'unknown')}: {str(e)}")
    
    # Normalize output (handle both Pydantic object and dict)
    if hasattr(result, 'final_answer'):
        # Pydantic object from with_structured_output()
        reasoning = result.reasoning
        final_answer = result.final_answer
    elif isinstance(result, dict):
        # Dictionary from JsonOutputParser
        reasoning = result.get("reasoning", "")
        final_answer = result.get("final_answer", "")
    else:
        # Fallback for unexpected format (shouldn't happen)
        reasoning = str(result)
        final_answer = str(result)
    
    # Add structured output to query data
    result_query = query_data.copy()
    result_query["reasoning"] = reasoning.strip()
    result_query["generated_answer"] = final_answer.strip()
    
    return result_query


print("‚úÖ Query processor with hybrid structured output created successfully!")

# Test: Process a single query
print("\n" + "="*80)
print("TESTING QUERY PROCESSOR WITH STRUCTURED OUTPUT:")
print("="*80)
try:
    # Get test configuration and LLM
    test_config = {
        "mode": "closed_book",
        "provider": "openai",
        "model": "gpt-4o-mini",
        "question_type": "metrics-generated",
        "template_key": "basic",
        "temperature": 0.0
    }
    test_llm = get_llm(test_config["provider"], test_config["model"], test_config["temperature"])
    
    # Get the appropriate template
    template_mode = "closed_book"
    test_prompt_template = LANGCHAIN_TEMPLATES_BY_QUESTION_TYPE[
        test_config["question_type"]
    ][template_mode][test_config["template_key"]]["langchain_prompt"]
    
    # Find a metrics-generated query from dataset
    test_query = None
    for query in dataset:
        if query["question_type"] == "metrics-generated":
            test_query = query
            break
    
    if test_query:
        print(f"Processing query: {test_query['financebench_id']}")
        print(f"Question type: {test_query['question_type']}")
        print(f"Question: {test_query['question'][:100]}...")
        print(f"Provider: {test_config['provider']} (using {'structured output' if test_config['provider'] in ['openai', 'anthropic'] else 'JsonOutputParser'})")
        
        result = process_query(test_query, test_config, test_llm, test_prompt_template)
        
        print(f"\n‚úÖ Test successful!")
        print(f"   Ground truth: {result['answer']}")
        print(f"   Reasoning (first 200 chars): {result['reasoning'][:200]}...")
        print(f"   Final answer: {result['generated_answer']}")
    else:
        print("‚ö†Ô∏è  No metrics-generated query found in dataset")
    
except Exception as e:
    print(f"‚ö†Ô∏è  Could not test query processor: {e}")
    import traceback
    traceback.print_exc()

‚úÖ Query processor with hybrid structured output created successfully!

TESTING QUERY PROCESSOR WITH STRUCTURED OUTPUT:
Processing query: financebench_id_03029
Question type: metrics-generated
Question: What is the FY2018 capital expenditure amount (in USD millions) for 3M? Give a response to the quest...
Provider: openai (using structured output)

‚úÖ Test successful!
   Ground truth: $1577.00
   Reasoning (first 200 chars): To determine the FY2018 capital expenditure amount for 3M, we need to refer to the cash flow statement for that fiscal year. Capital expenditures (CapEx) are typically listed under the investing activ...
   Final answer: $1,700 million


In [47]:
# ============================================================================
# STEP 9: METADATA BUILDER
# ============================================================================

def build_metadata(
    config: Dict[str, Any],
    template_info: Dict[str, Any],
    dataset_info: Any,
    filtered_count: int
) -> Dict[str, Any]:
    """
    Build metadata object for output JSON file.
    
    Args:
        config: Configuration dictionary
        template_info: Template information (alias and template text)
        dataset_info: Dataset object from HuggingFace
        filtered_count: Number of queries matching the question_type filter
    
    Returns:
        Metadata dictionary
    """
    metadata = {
        "mode": config["mode"],
        "provider": config["provider"],
        "model": config["model"],
        "temperature": config["temperature"],
        "question_type": config["question_type"],
        "rag_file_name": config.get("rag_file_name", "None"),
        "prompt_template": template_info["template"],
        "template_alias": template_info["alias"],
        "structured_output": True,
        "output_schema": {
            "reasoning": "Step-by-step reasoning and analysis",
            "final_answer": "The final answer only (concise and precise)"
        },
        "structured_output_method": "with_structured_output" if config["provider"] in ["openai", "anthropic"] else "JsonOutputParser",
        "dataset": DATASET_NAME,
        "dataset_split": DATASET_SPLIT,
        "total_questions": filtered_count,  # Number of questions for this question_type
        "total_dataset_questions": len(dataset_info),  # Total questions in dataset (150)
        "generated_at": datetime.utcnow().isoformat() + "Z",
        "call_delay": CALL_DELAY,
        "max_retries": MAX_RETRIES
    }
    
    return metadata


print("‚úÖ Metadata builder function created successfully!")

# Test: Build sample metadata
print("\n" + "="*80)
print("TESTING METADATA BUILDER:")
print("="*80)

# Create a sample test configuration
test_config = {
    "mode": "closed_book",
    "provider": "openai",
    "model": "gpt-4o-mini",
    "temperature": 0.0,
    "question_type": "metrics-generated",
    "template_key": "basic"
}

# Get the appropriate template info
template_mode = "closed_book"
test_template_info = LANGCHAIN_TEMPLATES_BY_QUESTION_TYPE[
    test_config["question_type"]
][template_mode][test_config["template_key"]]

# Count how many queries match this question_type
filtered_queries = [q for q in dataset if q["question_type"] == test_config["question_type"]]
filtered_count = len(filtered_queries)

# Build metadata
test_metadata = build_metadata(test_config, test_template_info, dataset, filtered_count)

print(json.dumps(test_metadata, indent=2))
print(f"\nNote: This configuration will process {filtered_count} out of {len(dataset)} total questions")

‚úÖ Metadata builder function created successfully!

TESTING METADATA BUILDER:
{
  "mode": "closed_book",
  "provider": "openai",
  "model": "gpt-4o-mini",
  "temperature": 0.0,
  "question_type": "metrics-generated",
  "rag_file_name": "None",
  "prompt_template": "You are a financial expert. Answer the following question about a specific financial metric using only your knowledge.\n\nQuestion: {question}\n\nProvide your step-by-step reasoning, then give the final numerical answer (e.g., $X million, X%, etc.).",
  "template_alias": "metrics_closed_basic",
  "structured_output": true,
  "output_schema": {
    "reasoning": "Step-by-step reasoning and analysis",
    "final_answer": "The final answer only (concise and precise)"
  },
  "structured_output_method": "with_structured_output",
  "dataset": "PatronusAI/financebench",
  "dataset_split": "train",
  "total_questions": 50,
  "total_dataset_questions": 150,
  "generated_at": "2025-11-09T09:59:56.615500Z",
  "call_delay": 1,
  "max_re

  "generated_at": datetime.utcnow().isoformat() + "Z",


In [52]:
# ============================================================================
# STEP 10: MAIN EXECUTION LOOP
# ============================================================================

def generate_answers_for_config(config: Dict[str, Any], dataset) -> None:
    """
    Process all queries for a given configuration and save results to JSON.
    Only processes queries matching the configuration's question_type.
    Uses hybrid structured output approach.
    
    Args:
        config: Configuration dictionary
        dataset: FinanceBench dataset
    
    Raises:
        Exception: If any query fails after retries, stops execution
    """
    # Check if output file already exists
    if check_file_exists(config):
        filename = generate_filename(config)
        print(f"‚è≠Ô∏è  Skipping {filename} (already exists)")
        return
    
    # Get mode, question_type, and template information
    mode = config["mode"]
    question_type = config["question_type"]
    template_key = config["template_key"]
    
    # Map mode: "oracle" or "rag" uses "rag" templates
    template_mode = "rag" if mode in ["oracle", "rag"] else mode
    
    # Get appropriate template based on question_type and mode
    template_info = LANGCHAIN_TEMPLATES_BY_QUESTION_TYPE[question_type][template_mode][template_key]
    prompt_template = template_info["langchain_prompt"]
    
    # Filter dataset to only include queries matching this question_type
    filtered_dataset = [q for q in dataset if q["question_type"] == question_type]
    
    if len(filtered_dataset) == 0:
        print(f"‚ö†Ô∏è  No queries found for question_type: {question_type}")
        return
    
    # if mode == "rag", we need to load json retrieval with variable rag_file_name
    if mode == "rag":
        rag_config, rag_data = read_rag_json(f"{RAG_PATH}/{config['rag_file_name']}")
        rag_queries = get_rag_queries_from_json(rag_data)
        rag_query_dict = {q["financebench_id"]: q for q in rag_queries}
        # Attach retrieved evidence to each query in filtered_dataset
        # if the following code cannot find financebench_id in rag_query_dict, it should raise error and stop execution
        for q in filtered_dataset:
            fb_id = q["financebench_id"]
            if fb_id not in rag_query_dict:
                raise ValueError(
                    f"Missing required financebench_id: '{fb_id}' not found in rag_query_dict. "
                    f"Available IDs: {list(rag_query_dict.keys())[:5]}..."
                )
            q["retrieved_evidence"] = rag_query_dict[fb_id].get("evidence", [])
    
    # Initialize LLM
    print(f"\n{'='*80}")
    print(f"üöÄ Starting generation for: {generate_filename(config)}")
    print(f"{'='*80}")
    print(f"   Mode: {mode}")
    if mode == "rag":
        print(f"   RAG File: {config['rag_file_name']}")
        print(f"   Total RAG queries loaded: {len(rag_queries)}")
    print(f"   Question Type: {question_type}")
    print(f"   Provider: {config['provider']}")
    print(f"   Model: {config['model']}")
    print(f"   Temperature: {config['temperature']}")
    print(f"   Template: {template_info['alias']}")
    print(f"   Structured Output Method: {'with_structured_output' if config['provider'] in ['openai', 'anthropic'] else 'JsonOutputParser'}")
    print(f"   Filtered queries: {len(filtered_dataset)} out of {len(dataset)}")
    
    try:
        llm = get_llm(config["provider"], config["model"], config["temperature"])
    except Exception as e:
        print(f"‚ùå Failed to initialize LLM: {e}")
        raise
    
    # Build metadata with filtered count
    metadata = build_metadata(config, template_info, dataset, len(filtered_dataset))
    
    # Initialize results structure
    results = {
        "metadata": metadata,
        "queries": []
    }
    
    # Process filtered queries with progress bar
    print(f"\nüìä Processing {len(filtered_dataset)} queries for {question_type}...")
    
    for idx, query in enumerate(tqdm(filtered_dataset, desc="Generating answers")):
        try:
            # Process query with structured output
            result = process_query(query, config, llm, prompt_template)
            
            # Keep only required fields in specific order (including reasoning)
            query_result = {
                "financebench_id": result["financebench_id"],
                "question_type": result["question_type"],
                "question_reasoning": result["question_reasoning"],
                "question": result["question"],
                "doc_name": result["doc_name"],
                "company": result["company"],
                "answer": result["answer"],
                "reasoning": result["reasoning"],  # NEW FIELD - structured output
                "generated_answer": result["generated_answer"],  # Now contains only final answer
                "evidence": result["evidence"]
            }
            
            results["queries"].append(query_result)
            
        except Exception as e:
            print(f"\n‚ùå Error processing query {idx + 1}/{len(filtered_dataset)}: {e}")
            print(f"   Query ID: {query.get('financebench_id', 'unknown')}")
            raise  # Stop execution on error
    
    # Save results to JSON file
    filename = generate_filename(config)
    filepath = Path(OUTPUT_DIR) / filename
    
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    
    print(f"\n‚úÖ Successfully generated and saved: {filename}")
    print(f"   Total queries processed: {len(results['queries'])}")
    print(f"   File size: {filepath.stat().st_size / 1024:.2f} KB")
    print(f"   Output includes structured reasoning + final answers")


print("‚úÖ Main execution loop function created successfully!")

‚úÖ Main execution loop function created successfully!


In [53]:
# ============================================================================
# STEP 11: RUN ALL CONFIGURATIONS
# ============================================================================

def run_all_configurations(configurations: List[Dict[str, Any]]):
    """
    Execute all configurations in the provided configurations list.
    Generates JSON files for each configuration.
    
    Args:
        configurations: List of configuration dictionaries to process
    """
    print("="*80)
    print("üéØ STARTING BATCH GENERATION")
    print("="*80)
    print(f"Total configurations: {len(configurations)}")
    print(f"Output directory: {OUTPUT_DIR}")
    print(f"Dataset: {DATASET_NAME} ({len(dataset)} questions)")
    
    # Show breakdown by question type
    question_type_counts = {}
    for q in dataset:
        qt = q["question_type"]
        question_type_counts[qt] = question_type_counts.get(qt, 0) + 1
    
    print(f"\nQuestion type distribution:")
    for qt, count in sorted(question_type_counts.items()):
        print(f"  ‚Ä¢ {qt}: {count} questions")
    
    print("="*80)
    
    # Track statistics
    generated_count = 0
    skipped_count = 0
    failed_count = 0
    
    # Process each configuration
    for idx, config in enumerate(configurations, 1):
        print(f"\n[{idx}/{len(configurations)}] Processing configuration...")
        print(f"   Mode: {config['mode']}, Question Type: {config['question_type']}, Model: {config['model']}")
        
        try:
            # Check if file exists before processing
            if check_file_exists(config):
                skipped_count += 1
            else:
                generate_answers_for_config(config, dataset)
                generated_count += 1
                
        except Exception as e:
            failed_count += 1
            print(f"\n‚ùå FATAL ERROR: Configuration failed!")
            print(f"   Config: {config}")
            print(f"   Error: {e}")
            print(f"\n‚õî Stopping execution due to error.")
            break
    
    # Print final summary
    print("\n" + "="*80)
    print("üìä GENERATION SUMMARY")
    print("="*80)
    print(f"‚úÖ Generated: {generated_count}")
    print(f"‚è≠Ô∏è  Skipped (already exists): {skipped_count}")
    print(f"‚ùå Failed: {failed_count}")
    print(f"üìÅ Output directory: {OUTPUT_DIR}")
    print("="*80)
    
    if failed_count == 0:
        print("üéâ All configurations completed successfully!")
    else:
        print("‚ö†Ô∏è  Some configurations failed. Check errors above.")


print("‚úÖ Run all configurations function created successfully!")
print("\n" + "="*80)
print("‚ö†Ô∏è  READY TO EXECUTE")
print("="*80)
print("To start generating answers, define your CONFIGURATIONS and run:")
print("   run_all_configurations(CONFIGURATIONS)")
print("\nThis will process all configurations and may take significant time.")
print("="*80)

‚úÖ Run all configurations function created successfully!

‚ö†Ô∏è  READY TO EXECUTE
To start generating answers, define your CONFIGURATIONS and run:
   run_all_configurations(CONFIGURATIONS)

This will process all configurations and may take significant time.


In [70]:
# ============================================================================
# EXECUTION CONFIGURATIONS & BATCH GENERATION
# ============================================================================

provider = "openai"  # Change as needed: "openai", "anthropic", "ollama"
model = "gpt-4o-mini"  # Change as needed

# Execution Configurations
# Each configuration will generate a separate output JSON file
CONFIGURATIONS = [
    # # Closed-Book Configurations - Metrics Generated
    # {
    #     "mode": "closed_book",
    #     "provider": provider,
    #     "model": model,
    #     "question_type": "metrics-generated",
    #     "template_key": "basic",
    #     "temperature": 0.0
    # },
    
    # # Closed-Book Configurations - Domain Relevant
    # {
    #     "mode": "closed_book",
    #     "provider": "openai",
    #     "model": "gpt-4o-mini",
    #     "question_type": "domain-relevant",
    #     "template_key": "basic",
    #     "temperature": 0.0
    # },
    
    # # Closed-Book Configurations - Novel Generated
    # {
    #     "mode": "closed_book",
    #     "provider": "openai",
    #     "model": "gpt-4o-mini",
    #     "question_type": "novel-generated",
    #     "template_key": "basic",
    #     "temperature": 0.0
    # },

    # RAG Configurations - Metrics Generated
    # {
    #     "mode": "rag",
    #     "provider": provider,
    #     "model": model,
    #     "question_type": "metrics-generated",
    #     "template_key": "basic",
    #     "temperature": 0.0,
    #     "rag_file_name": "voyage_voyage-3-large_chunk2048_k80_single_rerank_k20-voyage-rerank-2.5.json"
    # },

    # RAG Configurations - Novel Generated
    {
        "mode": "rag",
        "provider": provider,
        "model": model,
        "question_type": "novel-generated",
        "template_key": "basic",
        "temperature": 0.0,
        "rag_file_name": "voyage_voyage-3-large_chunk2048_k80_single_rerank_k20-voyage-rerank-2.5.json"
    },
    
    # # Oracle Configurations - Metrics Generated
    # {
    #     "mode": "oracle",
    #     "provider": provider,
    #     "model": model,
    #     "question_type": "metrics-generated",
    #     "template_key": "basic",
    #     "temperature": 0.0
    # },
    
    # # Oracle Configurations - Domain Relevant
    # {
    #     "mode": "oracle",
    #     "provider": "openai",
    #     "model": "gpt-4o-mini",
    #     "question_type": "domain-relevant",
    #     "template_key": "basic",
    #     "temperature": 0.0
    # },
    
    # # Oracle Configurations - Novel Generated
    # {
    #     "mode": "oracle",
    #     "provider": "openai",
    #     "model": "gpt-4o-mini",
    #     "question_type": "novel-generated",
    #     "template_key": "basic",
    #     "temperature": 0.0
    # },
]

# Display configuration statistics
print("="*80)
print("üìã CONFIGURATION STATISTICS")
print("="*80)

# Count by mode
closed_book_count = sum(1 for c in CONFIGURATIONS if c["mode"] == "closed_book")
oracle_count = sum(1 for c in CONFIGURATIONS if c["mode"] == "oracle")
rag_count = sum(1 for c in CONFIGURATIONS if c["mode"] == "rag")

print(f"Total configurations: {len(CONFIGURATIONS)}")
print(f"  ‚Ä¢ Closed-book: {closed_book_count}")
print(f"  ‚Ä¢ Oracle: {oracle_count}")
print(f"  ‚Ä¢ RAG: {rag_count}")

# Count by question type
question_types = {}
for config in CONFIGURATIONS:
    qt = config["question_type"]
    question_types[qt] = question_types.get(qt, 0) + 1

print(f"\nBy question type:")
for qt, count in sorted(question_types.items()):
    print(f"  ‚Ä¢ {qt}: {count}")

# Count by provider
providers = {}
for config in CONFIGURATIONS:
    provider = config["provider"]
    providers[provider] = providers.get(provider, 0) + 1

print(f"\nBy provider:")
for provider, count in sorted(providers.items()):
    print(f"  ‚Ä¢ {provider}: {count}")

# Check which files already exist and which need to be generated
print("\n" + "="*80)
print("üìÇ FILE STATUS CHECK")
print("="*80)

existing_files = []
to_generate = []

for idx, config in enumerate(CONFIGURATIONS, 1):
    filename = generate_filename(config)
    exists = check_file_exists(config)
    
    if exists:
        existing_files.append((idx, filename, config))
    else:
        to_generate.append((idx, filename, config))

print(f"Existing files: {len(existing_files)}")
print(f"To be generated: {len(to_generate)}")

# Show existing files
if existing_files:
    print(f"\n‚úÖ Already exist ({len(existing_files)}):")
    for idx, filename, config in existing_files:
        print(f"  [{idx}] {filename}")

# Show files to be generated
if to_generate:
    print(f"\nüîÑ Will generate ({len(to_generate)}):")
    for idx, filename, config in to_generate:
        print(f"  [{idx}] {filename}")
        print(f"      Mode: {config['mode']} | Question Type: {config['question_type']} | Provider: {config['provider']} | Model: {config['model']}")

# Estimate time (considering filtered queries)
if to_generate:
    # Calculate total queries to process (accounting for question_type filtering)
    total_queries_to_process = 0
    for _, _, config in to_generate:
        filtered_count = sum(1 for q in dataset if q["question_type"] == config["question_type"])
        total_queries_to_process += filtered_count
    
    estimated_time_minutes = (total_queries_to_process * CALL_DELAY) / 60
    print(f"\n‚è±Ô∏è  Estimated time: ~{estimated_time_minutes:.1f} minutes")
    print(f"   (Based on {total_queries_to_process} total filtered queries √ó {CALL_DELAY}s delay)")

print("="*80)

# Confirmation before starting
if to_generate:
    print("\nüöÄ Ready to start generation!")
    print("   This will process all configurations listed above.")
    print("="*80)
    
    # Start the batch generation process
    run_all_configurations(CONFIGURATIONS)
else:
    print("\n‚úÖ All files already exist. Nothing to generate.")
    print("   Delete files from output directory if you want to regenerate.")
    print("="*80)

  "generated_at": datetime.utcnow().isoformat() + "Z",


üìã CONFIGURATION STATISTICS
Total configurations: 1
  ‚Ä¢ Closed-book: 0
  ‚Ä¢ Oracle: 0
  ‚Ä¢ RAG: 1

By question type:
  ‚Ä¢ novel-generated: 1

By provider:
  ‚Ä¢ openai: 1

üìÇ FILE STATUS CHECK
Existing files: 0
To be generated: 1

üîÑ Will generate (1):
  [1] rag_novel-generated_openai_gpt-4o-mini_0.0_novel_rag_basic.json
      Mode: rag | Question Type: novel-generated | Provider: openai | Model: gpt-4o-mini

‚è±Ô∏è  Estimated time: ~0.8 minutes
   (Based on 50 total filtered queries √ó 1s delay)

üöÄ Ready to start generation!
   This will process all configurations listed above.
üéØ STARTING BATCH GENERATION
Total configurations: 1
Output directory: ../../generation_set/closedbook_oracle_sets
Dataset: PatronusAI/financebench (150 questions)

Question type distribution:
  ‚Ä¢ domain-relevant: 50 questions
  ‚Ä¢ metrics-generated: 50 questions
  ‚Ä¢ novel-generated: 50 questions

[1/1] Processing configuration...
   Mode: rag, Question Type: novel-generated, Model: gpt-4o-

Generating answers: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [08:42<00:00, 10.46s/it]


‚úÖ Successfully generated and saved: rag_novel-generated_openai_gpt-4o-mini_0.0_novel_rag_basic.json
   Total queries processed: 50
   File size: 325.47 KB
   Output includes structured reasoning + final answers

üìä GENERATION SUMMARY
‚úÖ Generated: 1
‚è≠Ô∏è  Skipped (already exists): 0
‚ùå Failed: 0
üìÅ Output directory: ../../generation_set/closedbook_oracle_sets
üéâ All configurations completed successfully!





In [15]:
# ============================================================================
# STEP 13: VERIFICATION & TESTING
# ============================================================================

def verify_generated_files():
    """
    Verify all generated JSON files and display summary.
    Includes validation for structured output fields.
    """
    print("="*80)
    print("üîç VERIFYING GENERATED FILES")
    print("="*80)
    
    # Get all JSON files in output directory
    output_path = Path(OUTPUT_DIR)
    json_files = list(output_path.glob("*.json"))
    
    print(f"Output directory: {OUTPUT_DIR}")
    print(f"Total JSON files found: {len(json_files)}\n")
    
    if len(json_files) == 0:
        print("‚ö†Ô∏è  No JSON files found. Run run_all_configurations() first.")
        return
    
    # List all files grouped by question type
    print("üìÅ Generated files:")
    print("-" * 80)
    
    files_by_question_type = {}
    for file in json_files:
        # Extract question type from filename (format: mode_question-type_provider_...)
        parts = file.stem.split("_")
        if len(parts) >= 2:
            question_type = parts[1]
            if question_type not in files_by_question_type:
                files_by_question_type[question_type] = []
            files_by_question_type[question_type].append(file)
    
    for qt in sorted(files_by_question_type.keys()):
        print(f"\n{qt}:")
        for idx, file in enumerate(sorted(files_by_question_type[qt]), 1):
            file_size = file.stat().st_size / 1024  # KB
            print(f"  {idx}. {file.name} ({file_size:.2f} KB)")
    
    # Load and inspect a sample file
    print("\n" + "="*80)
    print("üìÑ SAMPLE FILE INSPECTION")
    print("="*80)
    
    sample_file = sorted(json_files)[0]
    print(f"Inspecting: {sample_file.name}\n")
    
    with open(sample_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    # Display metadata
    print("METADATA:")
    print("-" * 80)
    for key, value in data["metadata"].items():
        if key == "prompt_template":
            # Truncate long template
            print(f"  {key}: {value[:100]}...")
        elif key == "output_schema":
            # Show schema on separate lines
            print(f"  {key}:")
            for field, desc in value.items():
                print(f"    - {field}: {desc}")
        else:
            print(f"  {key}: {value}")
    
    # Display queries info
    print(f"\nQUERIES:")
    print("-" * 80)
    print(f"  Total queries: {len(data['queries'])}")
    print(f"  Question type filter: {data['metadata'].get('question_type', 'N/A')}")
    print(f"  Structured output: {data['metadata'].get('structured_output', False)}")
    print(f"  Method: {data['metadata'].get('structured_output_method', 'N/A')}")
    
    # Show first query as example
    if len(data["queries"]) > 0:
        print(f"\n  First query example:")
        first_query = data["queries"][0]
        print(f"    ID: {first_query['financebench_id']}")
        print(f"    Question Type: {first_query['question_type']}")
        print(f"    Company: {first_query['company']}")
        print(f"    Question: {first_query['question'][:100]}...")
        print(f"    Ground truth: {first_query['answer']}")
        print(f"    Reasoning (first 150 chars): {first_query['reasoning'][:150]}...")
        print(f"    Generated answer: {first_query['generated_answer']}")
        print(f"    Evidence pieces: {len(first_query['evidence'])}")
    
    # Verify all files have correct structure
    print("\n" + "="*80)
    print("‚úÖ STRUCTURE VALIDATION")
    print("="*80)
    
    all_valid = True
    validation_summary = []
    
    for json_file in json_files:
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            
            # Check required top-level fields
            assert "metadata" in data, "Missing 'metadata' field"
            assert "queries" in data, "Missing 'queries' field"
            
            # Check metadata fields (including structured output fields)
            required_metadata = [
                "mode", "provider", "model", "temperature", "question_type",
                "prompt_template", "template_alias", 
                "structured_output", "output_schema", "structured_output_method",  # NEW
                "dataset", "total_questions", "total_dataset_questions", 
                "generated_at", "call_delay", "max_retries"
            ]
            for field in required_metadata:
                assert field in data["metadata"], f"Missing metadata field: {field}"
            
            # Validate structured output metadata
            assert data["metadata"]["structured_output"] == True, \
                "structured_output should be True"
            assert "reasoning" in data["metadata"]["output_schema"], \
                "output_schema missing 'reasoning' field"
            assert "final_answer" in data["metadata"]["output_schema"], \
                "output_schema missing 'final_answer' field"
            
            # Validate question_type consistency
            expected_question_type = data["metadata"]["question_type"]
            actual_queries = len(data["queries"])
            expected_queries = data["metadata"]["total_questions"]
            
            assert actual_queries == expected_queries, \
                f"Query count mismatch: expected {expected_queries}, got {actual_queries}"
            
            # Verify all queries match the question_type filter
            for query in data["queries"]:
                assert query["question_type"] == expected_question_type, \
                    f"Query {query['financebench_id']} has type {query['question_type']}, expected {expected_question_type}"
            
            # Check query fields (including new reasoning field)
            if len(data["queries"]) > 0:
                required_query_fields = [
                    "financebench_id", "question_type", "question_reasoning",
                    "question", "doc_name", "company", "answer", 
                    "reasoning",  # NEW FIELD
                    "generated_answer", "evidence"
                ]
                for field in required_query_fields:
                    assert field in data["queries"][0], f"Missing query field: {field}"
                
                # Validate that reasoning and generated_answer are not empty
                first_query = data["queries"][0]
                assert len(first_query["reasoning"].strip()) > 0, \
                    "Reasoning field is empty"
                assert len(first_query["generated_answer"].strip()) > 0, \
                    "Generated answer field is empty"
            
            validation_summary.append({
                "file": json_file.name,
                "status": "‚úÖ",
                "question_type": expected_question_type,
                "queries": actual_queries,
                "method": data["metadata"]["structured_output_method"]
            })
            
        except Exception as e:
            validation_summary.append({
                "file": json_file.name,
                "status": "‚ùå",
                "error": str(e)
            })
            all_valid = False
    
    # Display validation results
    for item in validation_summary:
        if item["status"] == "‚úÖ":
            print(f"  {item['status']} {item['file']}")
            print(f"      Question Type: {item['question_type']}, Queries: {item['queries']}, Method: {item['method']}")
        else:
            print(f"  {item['status']} {item['file']}: {item['error']}")
    
    print("\n" + "="*80)
    if all_valid:
        print("üéâ All files validated successfully!")
        print(f"   Total files: {len(json_files)}")
        total_queries = sum(item['queries'] for item in validation_summary if 'queries' in item)
        print(f"   Total queries across all files: {total_queries}")
        print(f"   All files use structured output with reasoning + final_answer")
    else:
        print("‚ö†Ô∏è  Some files have validation errors.")
    print("="*80)


# Run verification
verify_generated_files()

üîç VERIFYING GENERATED FILES
Output directory: ../../generation_set/closedbook_oracle_sets
Total JSON files found: 7

üìÅ Generated files:
--------------------------------------------------------------------------------

book:
  1. closed_book_domain-relevant_openai_gpt-4o-mini_0.0_domain_closed_basic.json (338.87 KB)
  2. closed_book_metrics-generated_openai_gpt-4o-mini_0.0_metrics_closed_basic.json (402.86 KB)
  3. closed_book_metrics-generated_openai_gpt-4o_0.0_metrics_closed_basic.json (414.28 KB)
  4. closed_book_novel-generated_openai_gpt-4o-mini_0.0_novel_closed_basic.json (324.01 KB)

domain-relevant:
  1. oracle_domain-relevant_openai_gpt-4o-mini_0.0_domain_rag_basic.json (336.46 KB)

metrics-generated:
  1. oracle_metrics-generated_openai_gpt-4o-mini_0.0_metrics_rag_basic.json (386.78 KB)

novel-generated:
  1. oracle_novel-generated_openai_gpt-4o-mini_0.0_novel_rag_basic.json (314.31 KB)

üìÑ SAMPLE FILE INSPECTION
Inspecting: closed_book_domain-relevant_openai_gpt-4o-mi