In [1]:
# ============================================================================
# Query Expansion Creator for FinanceBench
# ============================================================================
# 
# This notebook generates expanded versions of FinanceBench queries using
# various query expansion techniques (HyDE, query refinement, term expansion,
# chain-of-thought, and domain adaptation).
#
# The expanded queries are saved as JSON files to be used in retrieval
# evaluation experiments.
# ============================================================================

# %% [markdown]
# # Query Expansion Creator
# 
# ## Overview
# This notebook implements multiple query expansion techniques for financial QA:
# - **HyDE**: Generate hypothetical documents/answers
# - **Query Refinement**: Clarify and formalize queries
# - **Term Expansion**: Expand abbreviations and add synonyms
# - **Chain-of-Thought**: Make implicit context explicit
# - **Domain Adaptation**: Rephrase using financial/10-K language
#
# ## Output
# Each expansion method produces a JSON file containing:
# - Original queries from FinanceBench
# - Expanded query versions
# - Metadata (model, template, timestamps)

# %% [markdown]
# ## 1.1 Import Required Libraries

# %%
import os
import json
import time
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Optional, Any
from datetime import datetime

# Environment variables
from dotenv import load_dotenv

# Progress tracking
from tqdm.auto import tqdm

# Dataset
from datasets import load_dataset

# LangChain components
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_openai import ChatOpenAI
from langchain_ollama import ChatOllama

print("✓ All imports successful")

# %% [markdown]
# ## 1.2 Load Environment Variables

# %%
# Let python-dotenv populate os.environ from a .env file, if it finds one
load_dotenv()

# Provider settings come from the environment; the Ollama URL falls back to
# the local default when OLLAMA_BASE_URL is not set
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
OLLAMA_BASE_URL = os.environ.get("OLLAMA_BASE_URL", "http://localhost:11434")

# Report availability only -- the key value itself is never printed
openai_key_status = (
    "✓ OpenAI API key loaded"
    if OPENAI_API_KEY
    else "⚠ OpenAI API key not found (required if using OpenAI provider)"
)
print(openai_key_status)
print(f"✓ Ollama URL: {OLLAMA_BASE_URL}")

# %% [markdown]
# ## 1.3 Configuration Constants

# %%
# ============================================================================
# CONFIGURATION SECTION
# ============================================================================

# --- LLM backend -------------------------------------------------------------
# LLM_PROVIDER is either "openai" (hosted API) or "ollama" (local models).
# Example OpenAI setup: LLM_PROVIDER = "openai" with LLM_MODEL = "gpt-4"
# (other OpenAI options: "gpt-4-turbo", "gpt-3.5-turbo").
# Example Ollama models: "llama3:8b", "llama2", "mistral".
LLM_PROVIDER = "ollama"
LLM_MODEL = "llama3:8b"

# --- Dataset and output locations --------------------------------------------
# OUTPUT_DIR is a relative path, so it resolves against the kernel's working directory
DATASET_NAME = "PatronusAI/financebench"
DATASET_SPLIT = "train"
OUTPUT_DIR = "../../expanded_queries"

# --- Pacing and retries ------------------------------------------------------
MAX_RETRIES = 3   # Maximum number of attempts per query
RETRY_DELAY = 30  # Seconds to wait before retrying after a failure
CALL_DELAY = 1    # Seconds to pause after each successful LLM call (rate limiting)

# --- Dry run -----------------------------------------------------------------
DRY_RUN_SAMPLE_SIZE = 5  # Number of queries used when dry run is enabled
DRY_RUN_ENABLED = False  # Set True to test with a small sample of queries

# --- Validation of expanded queries -------------------------------------------
VALIDATION_CONFIG = dict(
    min_length=10,                   # Minimum characters for an expanded query
    max_length=1500,                 # Maximum characters for an expanded query
    must_differ_from_original=True,  # Expanded text must differ from the original
    similarity_threshold=0.95,       # Reject if more than 95% similar to the original
)

dry_run_label = "Enabled" if DRY_RUN_ENABLED else "Disabled"
print(
    "✓ Configuration set",
    f"  LLM Provider: {LLM_PROVIDER}",
    f"  LLM Model: {LLM_MODEL}",
    f"  Output Directory: {OUTPUT_DIR}",
    f"  Dry Run: {dry_run_label}",
    sep="\n",
)

# %% [markdown]
# ## 1.4 Create Output Directory

# %%
# Make sure the destination folder (and any missing parents) exists;
# nothing happens if the directory is already there
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
print(f"✓ Output directory ready: {OUTPUT_DIR}")

# %% [markdown]
# ## 1.5 Verify Configuration

# %%
# Fail fast on provider settings that cannot work, before any later step runs.

# An unknown provider name (e.g. a typo) would otherwise pass setup silently and
# only surface when get_llm_client() is first called.
if LLM_PROVIDER not in ("openai", "ollama"):
    raise ValueError(
        f"Unsupported LLM provider: {LLM_PROVIDER}. Use 'openai' or 'ollama'."
    )

# The OpenAI provider cannot be used without an API key
if LLM_PROVIDER == "openai" and not OPENAI_API_KEY:
    raise ValueError(
        "OpenAI provider selected but OPENAI_API_KEY not found in environment. "
        "Please add it to your .env file or switch to 'ollama' provider."
    )

# Ollama availability is not probed here; these are reminders only
if LLM_PROVIDER == "ollama":
    print(f"⚠ Make sure Ollama is running at {OLLAMA_BASE_URL}")
    print(f"⚠ Make sure model '{LLM_MODEL}' is pulled: ollama pull {LLM_MODEL}")

print("\n" + "="*60)
print("✓ STEP 1 COMPLETE - Setup & Configuration Ready")
print("="*60)
print("\nNext steps:")
print("  1. Review configuration above")
print("  2. Modify LLM_PROVIDER or LLM_MODEL if needed")
print("  3. Proceed to Step 2: Template Definitions")

✓ All imports successful
✓ OpenAI API key loaded
✓ Ollama URL: http://localhost:11434
✓ Configuration set
  LLM Provider: ollama
  LLM Model: llama3:8b
  Output Directory: ../../expanded_queries
  Dry Run: Disabled
✓ Output directory ready: ../../expanded_queries
⚠ Make sure Ollama is running at http://localhost:11434
⚠ Make sure model 'llama3:8b' is pulled: ollama pull llama3:8b

✓ STEP 1 COMPLETE - Setup & Configuration Ready

Next steps:
  1. Review configuration above
  2. Modify LLM_PROVIDER or LLM_MODEL if needed
  3. Proceed to Step 2: Template Definitions


In [2]:
# ============================================================================
# Step 2: Template Definitions
# ============================================================================
# 
# This section defines all query expansion templates using LangChain's
# PromptTemplate. Each template is designed for a specific expansion
# technique and can be easily modified without touching the core logic.
#
# Templates are designed to work with various FinanceBench document types:
# - 10-K annual reports
# - 10-Q quarterly reports
# - Earnings call transcripts
# - Other SEC filings
#
# Templates are organized by expansion type:
# - HyDE (Hypothetical Document Embeddings)
# - Query Refinement
# - Term Expansion
# - Chain-of-Thought
# - Domain Adaptation
# ============================================================================

# %% [markdown]
# ## 2.1 Define All Expansion Templates
# 
# Each template uses `{query}` as the placeholder for the original question.
# The prompt template substitutes the actual FinanceBench question for it
# before the prompt is sent to the LLM.

# %%
# ============================================================================
# TEMPLATE DEFINITIONS
# ============================================================================
# TEMPLATES maps a template name to its raw prompt text. Every prompt contains
# exactly one placeholder, {query}, which receives the original question.
# Each prompt ends with a label line (e.g. "Hypothetical Answer:") that the
# model is expected to continue from.

TEMPLATES = {
    # ------------------------------------------------------------------------
    # HyDE (Hypothetical Document Embeddings)
    # Prompts that ask for a hypothetical answer/passage rather than a rewritten question
    # ------------------------------------------------------------------------
    "hyde_basic": """Given the following financial question, generate a concise hypothetical answer that would likely appear in company financial documents (10-K, 10-Q, earnings calls, or other SEC filings).

Question: {query}

Hypothetical Answer:""",

    "hyde_detailed": """You are analyzing a company's financial documents (annual reports, quarterly filings, earnings call transcripts). Given the following question, write a detailed hypothetical paragraph that would contain the answer, including relevant context and financial terminology.

Question: {query}

Hypothetical Paragraph:""",

    "hyde_financial_terminology": """Given the following financial question, generate a hypothetical answer using proper accounting and financial terminology as it would appear in SEC filings (10-K, 10-Q, 8-K) or earnings call transcripts.

Question: {query}

Hypothetical Answer with Financial Terminology:""",

    # ------------------------------------------------------------------------
    # Query Refinement
    # Prompts that rephrase the question while keeping it a question
    # ------------------------------------------------------------------------
    "query_refinement_clarification": """Rephrase the following financial question to make it more explicit and unambiguous. Keep all key information but make implicit details explicit (such as fiscal year format, reporting period, specific financial statement sections, document types, etc.).

Original Question: {query}

Clarified Question:""",

    "query_refinement_formal": """Convert the following question into formal financial/accounting language as it would appear in analyst reports, SEC filings, or earnings call discussions. Maintain the question format.

Original Question: {query}

Formal Question:""",

    "query_refinement_keyword_focused": """Rewrite the following question emphasizing the key financial terms, metrics, and document-specific terminology (annual report, quarterly filing, earnings call, etc.). Make sure important financial keywords are explicit and prominent.

Original Question: {query}

Keyword-Focused Question:""",

    # ------------------------------------------------------------------------
    # Term Expansion
    # Prompts that include worked examples of the expected expansion
    # ------------------------------------------------------------------------
    "term_expansion_abbreviation": """Expand all financial abbreviations, acronyms, and shortened terms in the following question while maintaining its original meaning and structure.

Examples:
- "EPS" → "earnings per share"
- "R&D" → "research and development"
- "COGS" → "cost of goods sold"
- "FY" → "fiscal year"
- "YoY" → "year over year"
- "QoQ" → "quarter over quarter"

Original Question: {query}

Expanded Question:""",

    "term_expansion_synonym": """Rewrite the following question by adding financial synonyms and alternative terms in parentheses where appropriate. This helps capture documents that might use different terminology across annual reports, quarterly filings, and earnings discussions.

Examples:
- "revenue" → "revenue (sales, net sales, total revenue)"
- "profit" → "profit (net income, earnings, net profit)"
- "expenses" → "expenses (costs, expenditures, operating expenses)"
- "guidance" → "guidance (outlook, forecast, projections)"

Original Question: {query}

Question with Synonyms:""",

    "term_expansion_context_addition": """Add implicit financial context to the following question by mentioning the likely source documents (10-K, 10-Q, earnings calls, press releases) or financial statements where the answer would be found.

Examples:
- Add phrases like: "in the financial statements", "according to the 10-K/10-Q filing", "reported in the annual/quarterly report", "disclosed in the earnings call", "stated in the income statement/balance sheet"

Original Question: {query}

Question with Context:""",

    # ------------------------------------------------------------------------
    # Chain-of-Thought
    # Prompts that break the question down or spell out its implicit context
    # ------------------------------------------------------------------------
    "chain_of_thought_step_by_step": """Rewrite the following financial question by breaking down what information is being requested into clear, explicit steps or components. Consider whether the answer would come from annual reports, quarterly reports, or earnings discussions.

Original Question: {query}

Step-by-Step Question:""",

    "chain_of_thought_explicit_context": """Rewrite the following question making all implicit information explicit, such as:
- Time periods (fiscal year, quarter, specific dates, reporting period)
- Document sources (10-K annual report, 10-Q quarterly report, earnings call transcript, SEC filing)
- Financial statement sections (balance sheet, income statement, cash flow statement, MD&A)
- Calculation methods or definitions
- Reporting standards (GAAP, non-GAAP)

Original Question: {query}

Explicit Question:""",

    # ------------------------------------------------------------------------
    # Domain Adaptation
    # Prompts that restate the question in accounting / SEC-filing language
    # ------------------------------------------------------------------------
    "domain_adaptation_accounting_perspective": """Rewrite the following question from an accounting or auditing perspective, using language that accountants and financial auditors would use when reviewing annual reports, quarterly filings, and financial statements.

Original Question: {query}

Accounting Perspective Question:""",

    "domain_adaptation_10k_language": """Rewrite the following question using the exact language patterns, structure, and terminology typically found in SEC financial disclosures including 10-K annual reports, 10-Q quarterly reports, 8-K current reports, and earnings call transcripts.

Original Question: {query}

SEC Filing Style Question:"""
}

# List every template name that was defined
print(f"✓ Defined {len(TEMPLATES)} expansion templates")
print("\nTemplate types:")
for template_key in TEMPLATES.keys():
    print(f"  - {template_key}")

# %% [markdown]
# ## 2.2 Create LangChain PromptTemplate Objects

# %%
# Wrap each raw template string in a LangChain PromptTemplate whose only
# input variable is `query`; keys are the same as in TEMPLATES
PROMPT_TEMPLATES = {
    name: PromptTemplate(input_variables=["query"], template=text)
    for name, text in TEMPLATES.items()
}

print(f"✓ Created {len(PROMPT_TEMPLATES)} LangChain PromptTemplate objects")

# %% [markdown]
# ## 2.3 Test Template Formatting (Optional)

# %%
# Smoke test: render one template with a representative question and show the result
test_query = "What is the FY2018 capital expenditure amount (in USD millions) for 3M?"

heavy_rule = "=" * 60
light_rule = "-" * 60

print(heavy_rule, "TEMPLATE FORMATTING TEST", heavy_rule, sep="\n")
print(f"\nTest Query: {test_query}\n")

# Only one template is rendered here as an example
example_template_name = "hyde_basic"
example_prompt = PROMPT_TEMPLATES[example_template_name]
formatted_prompt = example_prompt.format(query=test_query)

print(f"Example Template: {example_template_name}")
print(light_rule, formatted_prompt, light_rule, sep="\n")

print("\n✓ Template formatting works correctly")
print("✓ All templates ready to use")

# %% [markdown]
# ## 2.4 Template Summary

# %%
# Display summary of all templates organized by expansion type.
#
# Expansion type names have different word counts ("hyde" is one word,
# "chain_of_thought" is three), so the type cannot be recovered by taking a
# fixed number of '_'-separated tokens from the template name. Instead, each
# name is matched against the known type names below.
KNOWN_EXPANSION_TYPES = (
    "hyde",
    "query_refinement",
    "term_expansion",
    "chain_of_thought",
    "domain_adaptation",
)


def _expansion_type_of(template_name: str) -> str:
    """
    Return the expansion type that a template name belongs to.

    Args:
        template_name: A key of TEMPLATES, e.g. "hyde_basic"

    Returns:
        The longest entry of KNOWN_EXPANSION_TYPES that equals the name or
        prefixes it followed by '_' (e.g. "hyde_basic" -> "hyde"). If no known
        type matches, the first '_'-separated token of the name.
    """
    matches = [
        known_type
        for known_type in KNOWN_EXPANSION_TYPES
        if template_name == known_type or template_name.startswith(known_type + "_")
    ]
    if matches:
        return max(matches, key=len)
    return template_name.split("_")[0]


template_by_type = defaultdict(list)

for template_name in TEMPLATES.keys():
    template_by_type[_expansion_type_of(template_name)].append(template_name)

print("\n" + "="*60)
print("TEMPLATE SUMMARY BY TYPE")
print("="*60)

for exp_type, templates in sorted(template_by_type.items()):
    print(f"\n{exp_type.upper().replace('_', ' ')}:")
    for template in templates:
        print(f"  ✓ {template}")

print("\n" + "="*60)
print("✓ STEP 2 COMPLETE - All Templates Defined")
print("="*60)
print("\nTemplate statistics:")
print(f"  Total templates: {len(TEMPLATES)}")
print(f"  Expansion types: {len(template_by_type)}")
print("\nTemplates are designed to work with:")
print("  ✓ 10-K annual reports")
print("  ✓ 10-Q quarterly reports")
print("  ✓ Earnings call transcripts")
print("  ✓ Other SEC filings (8-K, etc.)")
print("\nTemplates are stored in:")
print("  - TEMPLATES (dict): Raw template strings")
print("  - PROMPT_TEMPLATES (dict): LangChain PromptTemplate objects")
print("\nNext: Proceed to Step 3: Configuration Structure")

✓ Defined 13 expansion templates

Template types:
  - hyde_basic
  - hyde_detailed
  - hyde_financial_terminology
  - query_refinement_clarification
  - query_refinement_formal
  - query_refinement_keyword_focused
  - term_expansion_abbreviation
  - term_expansion_synonym
  - term_expansion_context_addition
  - chain_of_thought_step_by_step
  - chain_of_thought_explicit_context
  - domain_adaptation_accounting_perspective
  - domain_adaptation_10k_language
✓ Created 13 LangChain PromptTemplate objects
TEMPLATE FORMATTING TEST

Test Query: What is the FY2018 capital expenditure amount (in USD millions) for 3M?

Example Template: hyde_basic
------------------------------------------------------------
Given the following financial question, generate a concise hypothetical answer that would likely appear in company financial documents (10-K, 10-Q, earnings calls, or other SEC filings).

Question: What is the FY2018 capital expenditure amount (in USD millions) for 3M?

Hypothetical Answer:


In [3]:
# ============================================================================
# Step 3: Configuration Structure
# ============================================================================
# 
# This section defines the configuration structure for all expansion methods.
# Each configuration specifies:
# - expansion_type: High-level category (hyde, query_refinement, etc.)
# - expansion_sub_type: Specific variant within the type
# - template_key: Which template to use from PROMPT_TEMPLATES
# - temperature: LLM temperature for this expansion method
#
# The CONFIGS_TO_RUN list allows you to easily select which expansions
# to execute by commenting/uncommenting entries.
# ============================================================================

# %% [markdown]
# ## 3.1 Define All Expansion Configurations
# 
# Each configuration maps to one template and specifies its parameters.

# %%
# ============================================================================
# ALL EXPANSION CONFIGURATIONS
# ============================================================================
# One row per expansion method: (expansion_type, expansion_sub_type, temperature).
# The configuration name and its template_key are both
# "<expansion_type>_<expansion_sub_type>".
_EXPANSION_CONFIG_ROWS = [
    # HyDE (Hypothetical Document Embeddings)
    # Higher temperature for creative hypothesis generation
    ("hyde", "basic", 0.7),
    ("hyde", "detailed", 0.8),
    ("hyde", "financial_terminology", 0.7),

    # Query Refinement
    # Lower temperature for precise rephrasing
    ("query_refinement", "clarification", 0.3),
    ("query_refinement", "formal", 0.3),
    ("query_refinement", "keyword_focused", 0.4),

    # Term Expansion
    # Low-to-medium temperature for balanced expansion
    ("term_expansion", "abbreviation", 0.3),
    ("term_expansion", "synonym", 0.5),
    ("term_expansion", "context_addition", 0.4),

    # Chain-of-Thought
    # Medium temperature for thoughtful elaboration
    ("chain_of_thought", "step_by_step", 0.5),
    ("chain_of_thought", "explicit_context", 0.4),

    # Domain Adaptation
    # Lower temperature for consistent domain-specific language
    ("domain_adaptation", "accounting_perspective", 0.4),
    ("domain_adaptation", "10k_language", 0.4),
]

# Expand each row into the full configuration record, keyed by configuration name
ALL_EXPANSION_CONFIGS = {
    f"{exp_type}_{sub_type}": {
        "expansion_type": exp_type,
        "expansion_sub_type": sub_type,
        "template_key": f"{exp_type}_{sub_type}",
        "temperature": temperature,
    }
    for exp_type, sub_type, temperature in _EXPANSION_CONFIG_ROWS
}

print(f"✓ Defined {len(ALL_EXPANSION_CONFIGS)} expansion configurations")

# %% [markdown]
# ## 3.2 Validate Configuration Consistency

# %%
# One message per configuration whose template_key has no entry in PROMPT_TEMPLATES
validation_errors = [
    f"Config '{name}' references missing template '{cfg['template_key']}'"
    for name, cfg in ALL_EXPANSION_CONFIGS.items()
    if cfg["template_key"] not in PROMPT_TEMPLATES
]

if not validation_errors:
    print("✓ All configurations reference valid templates")
else:
    # List every problem before stopping, so they can all be fixed in one pass
    print("❌ Configuration validation errors:")
    for error in validation_errors:
        print(f"  - {error}")
    raise ValueError("Configuration validation failed. Fix the errors above.")

# %% [markdown]
# ## 3.3 Select Configurations to Run
# 
# **IMPORTANT: This is where you control which expansions to execute.**
# 
# Comment out any expansion methods you don't want to run.
# The system will also skip any configs that already have output files.

# %%
# ============================================================================
# CONFIGURATION SELECTION
# ============================================================================
#
# Keep the expansion methods you want to run; comment out the rest.
# Every name must be a key of ALL_EXPANSION_CONFIGS.
# The system will automatically skip configs whose output files already exist.
# ============================================================================

CONFIGS_TO_RUN = [
    # HyDE expansions
    "hyde_basic",
    "hyde_detailed",
    "hyde_financial_terminology",

    # Query refinement
    "query_refinement_clarification",
    "query_refinement_formal",
    "query_refinement_keyword_focused",

    # Term expansion
    "term_expansion_abbreviation",
    "term_expansion_synonym",
    "term_expansion_context_addition",

    # Chain-of-thought
    "chain_of_thought_step_by_step",
    "chain_of_thought_explicit_context",

    # Domain adaptation
    "domain_adaptation_accounting_perspective",
    "domain_adaptation_10k_language",
]

print(f"✓ Selected {len(CONFIGS_TO_RUN)} configurations to run")
print("\nSelected configurations:")
for config_name in CONFIGS_TO_RUN:
    # Look the name up with .get(): an unknown name must not raise KeyError
    # here, otherwise the verification step that follows (which lists every
    # invalid name and raises a descriptive error) would never be reached.
    config = ALL_EXPANSION_CONFIGS.get(config_name)
    if config is None:
        print(f"  - {config_name} (⚠ not defined in ALL_EXPANSION_CONFIGS)")
    else:
        print(f"  - {config_name} (temp={config['temperature']})")

# %% [markdown]
# ## 3.4 Verify Selected Configurations

# %%
# Names listed in CONFIGS_TO_RUN that have no entry in ALL_EXPANSION_CONFIGS
invalid_configs = [name for name in CONFIGS_TO_RUN if name not in ALL_EXPANSION_CONFIGS]

if not invalid_configs:
    print("✓ All selected configurations are valid")
else:
    # Show every unknown name, then stop the notebook
    print("❌ Invalid configuration names in CONFIGS_TO_RUN:")
    for config in invalid_configs:
        print(f"  - {config}")
    raise ValueError("Some selected configurations don't exist. Check CONFIGS_TO_RUN.")

# %% [markdown]
# ## 3.5 Check for Existing Output Files

# %%
def _expanded_queries_filename(config: Dict) -> str:
    """Return the output file name for a config: expanded_queries_<type>_<sub_type>.json."""
    return f"expanded_queries_{config['expansion_type']}_{config['expansion_sub_type']}.json"


# Split the selected configs by whether their output file is already on disk
configs_with_existing_files = []
configs_to_process = []

for config_name in CONFIGS_TO_RUN:
    output_filename = _expanded_queries_filename(ALL_EXPANSION_CONFIGS[config_name])
    output_path = os.path.join(OUTPUT_DIR, output_filename)

    target_list = configs_with_existing_files if os.path.exists(output_path) else configs_to_process
    target_list.append(config_name)

# Display status
print("\n" + "="*60)
print("OUTPUT FILE STATUS")
print("="*60)

if configs_with_existing_files:
    print(f"\n✓ Files already exist (will skip): {len(configs_with_existing_files)}")
    for config_name in configs_with_existing_files:
        print(f"  - {_expanded_queries_filename(ALL_EXPANSION_CONFIGS[config_name])}")

if not configs_to_process:
    print("\n✓ All selected configurations already have output files")
    print("  No processing needed (unless you delete existing files)")
else:
    print(f"\n⚙️  Configs to process: {len(configs_to_process)}")
    for config_name in configs_to_process:
        print(f"  - {_expanded_queries_filename(ALL_EXPANSION_CONFIGS[config_name])}")

# %% [markdown]
# ## 3.6 Configuration Summary

# %%
# Summarize the selection: per-type counts, then overall totals
section_rule = "=" * 60

print("\n" + section_rule)
print("CONFIGURATION SUMMARY")
print(section_rule)

# Tally the selected configurations by their expansion_type
type_counts = defaultdict(int)
for config_name in CONFIGS_TO_RUN:
    type_counts[ALL_EXPANSION_CONFIGS[config_name]["expansion_type"]] += 1

print("\nConfigurations by type:")
for exp_type in sorted(type_counts):
    print(f"  {exp_type}: {type_counts[exp_type]}")

print(
    "\nTotal configurations:",
    f"  Selected to run: {len(CONFIGS_TO_RUN)}",
    f"  Already complete: {len(configs_with_existing_files)}",
    f"  Need processing: {len(configs_to_process)}",
    sep="\n",
)

print("\n" + section_rule)
print("✓ STEP 3 COMPLETE - Configuration Structure Ready")
print(section_rule)
print("\nNext: Proceed to Step 4: Core Functions")

✓ Defined 13 expansion configurations
✓ All configurations reference valid templates
✓ Selected 13 configurations to run

Selected configurations:
  - hyde_basic (temp=0.7)
  - hyde_detailed (temp=0.8)
  - hyde_financial_terminology (temp=0.7)
  - query_refinement_clarification (temp=0.3)
  - query_refinement_formal (temp=0.3)
  - query_refinement_keyword_focused (temp=0.4)
  - term_expansion_abbreviation (temp=0.3)
  - term_expansion_synonym (temp=0.5)
  - term_expansion_context_addition (temp=0.4)
  - chain_of_thought_step_by_step (temp=0.5)
  - chain_of_thought_explicit_context (temp=0.4)
  - domain_adaptation_accounting_perspective (temp=0.4)
  - domain_adaptation_10k_language (temp=0.4)
✓ All selected configurations are valid

OUTPUT FILE STATUS

⚙️  Configs to process: 13
  - expanded_queries_hyde_basic.json
  - expanded_queries_hyde_detailed.json
  - expanded_queries_hyde_financial_terminology.json
  - expanded_queries_query_refinement_clarification.json
  - expanded_queries_que

In [4]:
# ============================================================================
# Step 4: Core Functions
# ============================================================================
# 
# This section implements the core functions for query expansion:
# 1. LLM client initialization (OpenAI/Ollama)
# 2. Query expansion with retry logic and delays
# 3. Response validation
# 4. JSON file operations
#
# Key features:
# - Automatic retry on failures (with 30s wait)
# - Rate limiting (1s delay between calls)
# - Validation (non-empty, differs from original, length checks)
# - Clean error handling
# ============================================================================

# %% [markdown]
# ## 4.1 LLM Client Initialization

# %%
def get_llm_client(provider: str, model: str, temperature: float):
    """
    Build the LangChain chat client for the requested provider.

    Args:
        provider: "openai" or "ollama"
        model: Model name/identifier passed through to the client
        temperature: Sampling temperature passed through to the client

    Returns:
        ChatOpenAI (configured with OPENAI_API_KEY) for "openai",
        ChatOllama (configured with OLLAMA_BASE_URL) for "ollama"

    Raises:
        ValueError: If provider is neither "openai" nor "ollama"
    """
    # Guard clause: reject unknown providers before constructing anything
    if provider not in ("openai", "ollama"):
        raise ValueError(f"Unsupported LLM provider: {provider}. Use 'openai' or 'ollama'.")

    # Arguments common to both client classes
    shared_kwargs = {"model": model, "temperature": temperature}

    if provider == "ollama":
        return ChatOllama(base_url=OLLAMA_BASE_URL, **shared_kwargs)
    return ChatOpenAI(openai_api_key=OPENAI_API_KEY, **shared_kwargs)

print("✓ get_llm_client() defined")

# %% [markdown]
# ## 4.2 Response Validation

# %%
def validate_expansion(
    original_query: str,
    expanded_query: str,
    config: Optional[Dict] = None
) -> tuple[bool, Optional[str]]:
    """
    Validate that the expanded query meets quality criteria.

    Checks, in order: non-empty, minimum length, maximum length and (when
    enabled) that the expansion differs from the original. Lengths are
    measured on the expansion after stripping surrounding whitespace.

    Args:
        original_query: The original question from FinanceBench (None is
            treated as an empty string)
        expanded_query: The LLM-generated expanded version
        config: Validation configuration with keys "min_length",
            "max_length", "must_differ_from_original" and
            "similarity_threshold" (uses VALIDATION_CONFIG if None)

    Returns:
        Tuple of (is_valid, error_message)
        - (True, None) if valid
        - (False, error_message) if invalid
    """
    if config is None:
        config = VALIDATION_CONFIG

    # Check 1: Non-empty (also covers None and whitespace-only responses)
    if not expanded_query or not expanded_query.strip():
        return False, "Expanded query is empty"

    expanded_query = expanded_query.strip()
    expanded_length = len(expanded_query)

    # Check 2: Minimum length
    if expanded_length < config["min_length"]:
        return False, f"Expanded query too short ({expanded_length} chars, minimum {config['min_length']})"

    # Check 3: Maximum length
    if expanded_length > config["max_length"]:
        return False, f"Expanded query too long ({expanded_length} chars, maximum {config['max_length']})"

    # Check 4: Must differ from original
    if config["must_differ_from_original"]:
        original_words = (original_query or "").lower().split()
        expanded_words = expanded_query.lower().split()

        # Compare the lower-cased word sequences rather than the raw strings:
        # the expansion has been stripped but the original has not, so a raw
        # comparison would treat "question " and "question" as different.
        if expanded_words == original_words:
            return False, "Expanded query is identical to original"

        # Jaccard similarity over the sets of words (intersection / union).
        # The union cannot be empty because expanded_query is non-blank here.
        original_tokens = set(original_words)
        expanded_tokens = set(expanded_words)
        shared = len(original_tokens & expanded_tokens)
        combined = len(original_tokens | expanded_tokens)
        jaccard_similarity = shared / combined

        # Only reject if extremely similar (almost no new words added)
        if jaccard_similarity > config["similarity_threshold"]:
            return False, f"Expanded query too similar to original (similarity: {jaccard_similarity:.2f})"

    # All checks passed
    return True, None

print("✓ validate_expansion() defined")

# %% [markdown]
# ## 4.3 Single Query Expansion with Retry Logic

# %%
def expand_single_query(
    chain: LLMChain,
    query: str,
    original_query: str,
    max_retries: int = MAX_RETRIES,
    retry_delay: int = RETRY_DELAY,
    call_delay: int = CALL_DELAY
) -> str:
    """
    Expand a single query using the LLM chain with retry logic.

    Each attempt runs the chain, strips the response and checks it with
    validate_expansion(). An exception from the chain call and a failed
    validation are treated the same way: the attempt counts as failed and
    the next attempt starts after `retry_delay` seconds.

    Args:
        chain: LangChain LLMChain (prompt + LLM)
        query: The query to expand
        original_query: Original query for validation
        max_retries: Maximum number of attempts (must be >= 1)
        retry_delay: Seconds to wait before retry after failure
        call_delay: Seconds to wait after successful call (rate limiting)

    Returns:
        Expanded query string (whitespace-stripped)

    Raises:
        ValueError: If max_retries is less than 1
        Exception: If all attempts fail; the last underlying error is
            attached as the exception's __cause__
    """
    if max_retries < 1:
        # Without this guard the loop never runs and the caller would get a
        # confusing "Last error: None" without the LLM ever being called.
        raise ValueError(f"max_retries must be at least 1 (got {max_retries})")

    last_error: Optional[Exception] = None

    for attempt in range(max_retries):
        try:
            # Call LLM
            response = chain.run(query=query)
            expanded = response.strip()

            # Validate response; an invalid expansion is retried like any other failure
            is_valid, error_msg = validate_expansion(original_query, expanded)
            if not is_valid:
                raise ValueError(f"Validation failed: {error_msg}")

            # Success - add delay for rate limiting
            time.sleep(call_delay)
            return expanded

        except Exception as e:
            last_error = e

            if attempt < max_retries - 1:  # Not the last attempt
                print(f"    ⚠️  Attempt {attempt + 1} failed: {str(e)}")
                print(f"    ⏳ Waiting {retry_delay}s before retry...")
                time.sleep(retry_delay)
            else:  # Last attempt failed
                print(f"    ❌ All {max_retries} attempts failed")

    # All retries exhausted. This raise is outside the except block, so the
    # original error must be chained explicitly to keep its traceback.
    raise Exception(
        f"Failed to expand query after {max_retries} attempts. Last error: {str(last_error)}"
    ) from last_error

print("✓ expand_single_query() defined")

# %% [markdown]
# ## 4.4 Save Results to JSON

# %%
def save_expansion_results(
    results: List[Dict],
    expansion_type: str,
    expansion_sub_type: str,
    template: str,
    llm_provider: str,
    llm_model: str,
    temperature: float,
    output_dir: str = OUTPUT_DIR
) -> str:
    """
    Save expanded queries to JSON file with metadata.

    The file is named "expanded_queries_{expansion_type}_{expansion_sub_type}.json"
    and contains two top-level keys: "metadata" and "queries". The output
    directory is created if it does not exist yet. An existing file with the
    same name is overwritten.

    Args:
        results: List of query dictionaries with expansion results
        expansion_type: High-level expansion category
        expansion_sub_type: Specific variant within the type
        template: The prompt template used
        llm_provider: "openai" or "ollama"
        llm_model: Model identifier
        temperature: Temperature used for generation
        output_dir: Directory to save JSON file

    Returns:
        Path to saved JSON file
    """
    # Prepare metadata
    metadata = {
        "expansion_type": expansion_type,
        "expansion_sub_type": expansion_sub_type,
        "llm_provider": llm_provider,
        "llm_model": llm_model,
        "template": template,
        "temperature": temperature,
        "creation_date": datetime.now().isoformat(),
        "total_queries": len(results)
    }

    # Prepare output structure
    output_data = {
        "metadata": metadata,
        "queries": results
    }

    # Generate filename
    filename = f"expanded_queries_{expansion_type}_{expansion_sub_type}.json"
    filepath = os.path.join(output_dir, filename)

    # Make sure the target directory exists so a completed run is not lost
    # to a missing folder. An empty output_dir means "current directory".
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save to file
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)

    return filepath

print("✓ save_expansion_results() defined")

# %% [markdown]
# ## 4.5 Load and Validate JSON

# %%
def load_expansion_results(filepath: str) -> Dict:
    """
    Load and validate expansion results from JSON file.

    The file must contain a JSON object with "metadata" and "queries" keys,
    and "metadata" must itself be an object containing every required field.

    Args:
        filepath: Path to JSON file

    Returns:
        Dictionary with metadata and queries

    Raises:
        FileNotFoundError: If file doesn't exist
        json.JSONDecodeError: If file is not valid JSON
        ValueError: If JSON structure is invalid
    """
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"File not found: {filepath}")

    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Validate structure. The type checks keep malformed files (a top-level
    # list/number, or metadata that is null/a string) on the documented
    # ValueError path instead of leaking a TypeError from `in` or indexing.
    if not isinstance(data, dict):
        raise ValueError("Invalid JSON structure: top-level value must be an object")

    if "metadata" not in data or "queries" not in data:
        raise ValueError("Invalid JSON structure: missing 'metadata' or 'queries'")

    if not isinstance(data["metadata"], dict):
        raise ValueError("Invalid JSON structure: 'metadata' must be an object")

    required_metadata = ["expansion_type", "expansion_sub_type", "llm_provider",
                         "llm_model", "template", "temperature", "total_queries"]

    for field in required_metadata:
        if field not in data["metadata"]:
            raise ValueError(f"Invalid JSON structure: missing metadata field '{field}'")

    return data

print("✓ load_expansion_results() defined")

# %% [markdown]
# ## 4.6 Test Core Functions (Optional)

# %%
# Smoke-test the LLM client factory with the currently configured provider/model.
# A failure is reported but does not stop the cell.
try:
    test_llm = get_llm_client(LLM_PROVIDER, LLM_MODEL, 0.7)
    print(f"\n✓ LLM client test passed: {LLM_PROVIDER}/{LLM_MODEL}")
except Exception as e:
    print(f"\n❌ LLM client test failed: {e}")

# Validation cases: (original query, expanded query, expected validity, label)
test_cases = [
    ("What is revenue?", "What is the total revenue?", True, "Valid expansion"),
    ("What is revenue?", "", False, "Empty expansion"),
    ("What is revenue?", "Rev", False, "Too short"),
    ("What is revenue?", "What is revenue?", False, "Identical to original"),
    ("What is revenue?", "X" * 1700, False, "Too long"),
]

print("\n" + "="*60, "VALIDATION TESTS", "="*60, sep="\n")

# A case passes when validate_expansion() agrees with the expected validity;
# the line shows either "Valid" or the validator's rejection message.
for original, expanded, expected_valid, description in test_cases:
    is_valid, error_msg = validate_expansion(original, expanded)
    if is_valid == expected_valid:
        status = "✓"
    else:
        status = "❌"
    outcome_text = 'Valid' if is_valid else error_msg
    print(f"{status} {description}: {outcome_text}")

print(
    "\n" + "="*60,
    "✓ STEP 4 COMPLETE - Core Functions Ready",
    "="*60,
    "\nImplemented functions:",
    "  ✓ get_llm_client() - Initialize LLM with provider/model",
    "  ✓ validate_expansion() - Check expansion quality",
    "  ✓ expand_single_query() - Expand with retry logic",
    "  ✓ save_expansion_results() - Save to JSON with metadata",
    "  ✓ load_expansion_results() - Load and validate JSON",
    "\nNext: Proceed to Step 5: Dataset Loading",
    sep="\n",
)

✓ get_llm_client() defined
✓ validate_expansion() defined
✓ expand_single_query() defined
✓ save_expansion_results() defined
✓ load_expansion_results() defined

✓ LLM client test passed: ollama/llama3:8b

VALIDATION TESTS
✓ Valid expansion: Valid
✓ Empty expansion: Expanded query is empty
✓ Too short: Expanded query too short (3 chars, minimum 10)
✓ Identical to original: Expanded query is identical to original
✓ Too long: Expanded query too long (1700 chars, maximum 1500)

✓ STEP 4 COMPLETE - Core Functions Ready

Implemented functions:
  ✓ get_llm_client() - Initialize LLM with provider/model
  ✓ validate_expansion() - Check expansion quality
  ✓ expand_single_query() - Expand with retry logic
  ✓ save_expansion_results() - Save to JSON with metadata
  ✓ load_expansion_results() - Load and validate JSON

Next: Proceed to Step 5: Dataset Loading


In [5]:
# ============================================================================
# Step 5: Dataset Loading
# ============================================================================
# 
# This section loads the FinanceBench dataset from HuggingFace and prepares
# it for query expansion processing.
#
# FinanceBench contains 150 expert-crafted financial questions with:
# - Question text
# - Expected document (10-K, 10-Q, earnings calls, etc.)
# - Evidence page numbers
# - Gold standard answers
# - Company information
#
# We extract the necessary fields for our expansion task:
# - financebench_id: Unique identifier
# - question: Original query text
# - doc_name: Source document name (for metadata)
# ============================================================================

# %% [markdown]
# ## 5.1 Load FinanceBench Dataset

# %%
# Announce which dataset/split is about to be fetched
print(
    "Loading FinanceBench dataset...",
    f"  Dataset: {DATASET_NAME}",
    f"  Split: {DATASET_SPLIT}",
    sep="\n",
)

# Fetch the requested split from HuggingFace
dataset = load_dataset(DATASET_NAME, split=DATASET_SPLIT)

print(f"✓ Loaded {len(dataset)} queries from FinanceBench")

# %% [markdown]
# ## 5.2 Inspect Dataset Structure

# %%
# Display the first dataset entry to understand its structure
print("\n" + "="*60)
print("SAMPLE DATASET ENTRY")
print("="*60)

sample = dataset[0]
print("\nAvailable fields:")
for key in sample.keys():
    value = sample[key]
    # Truncate long values for display. The text form is truncated for every
    # type, not only str, so large non-string fields (such as the evidence
    # list) no longer flood the cell output.
    value_text = str(value)
    if len(value_text) > 100:
        display_value = value_text[:100] + "..."
    else:
        display_value = value_text
    print(f"  {key}: {display_value}")

# %% [markdown]
# ## 5.3 Extract Required Fields

# %%
def extract_query_data(dataset_entry: Dict) -> Dict:
    """
    Pick the fields needed for query expansion out of one FinanceBench entry.

    Args:
        dataset_entry: Single entry from FinanceBench dataset

    Returns:
        Dictionary with three keys:
        - financebench_id: copied from the entry's "financebench_id"
        - original_query: copied from the entry's "question"
        - doc_name: copied from the entry's "doc_name"

    Raises:
        KeyError: If the entry lacks any of the source fields
    """
    # (output key, source key) pairs, in output order
    field_mapping = (
        ("financebench_id", "financebench_id"),
        ("original_query", "question"),
        ("doc_name", "doc_name"),
    )
    return {
        output_key: dataset_entry[source_key]
        for output_key, source_key in field_mapping
    }

# Reduce every dataset entry to the fields used by the expansion pipeline
financebench_queries = [extract_query_data(entry) for entry in dataset]

print(f"✓ Extracted data from {len(financebench_queries)} queries")

# %% [markdown]
# ## 5.4 Validate Extracted Data

# %%
# Sanity checks on the extracted records (warnings only, nothing is raised)
print("\n" + "="*60, "DATA VALIDATION", "="*60, sep="\n")

# IDs are expected to be unique across the dataset
ids = [q["financebench_id"] for q in financebench_queries]
if len(set(ids)) == len(ids):
    print(f"✓ All {len(ids)} IDs are unique")
else:
    print("⚠️  Warning: Duplicate IDs found in dataset")

# Queries that are empty or whitespace-only
empty_queries = [
    q for q in financebench_queries
    if len(q["original_query"].strip()) == 0
]
if not empty_queries:
    print("✓ No empty queries")
else:
    print(f"⚠️  Warning: {len(empty_queries)} empty queries found")

# Records whose doc_name is missing/falsy
missing_docs = [q for q in financebench_queries if not q["doc_name"]]
if not missing_docs:
    print("✓ All queries have doc_name")
else:
    print(f"⚠️  Warning: {len(missing_docs)} queries missing doc_name")

# %% [markdown]
# ## 5.5 Display Statistics

# %%
# Calculate statistics
from collections import Counter
import statistics

print("\n" + "="*60)
print("DATASET STATISTICS")
print("="*60)

# Query length statistics
query_lengths = [len(q["original_query"]) for q in financebench_queries]
print(f"\nQuery lengths (characters):")
print(f"  Min: {min(query_lengths)}")
print(f"  Max: {max(query_lengths)}")
print(f"  Mean: {sum(query_lengths) / len(query_lengths):.1f}")
# statistics.median averages the two middle values for even-sized data;
# indexing sorted(...)[n // 2] would report the upper-middle element instead.
print(f"  Median: {statistics.median(query_lengths)}")

# Document type distribution
doc_names = [q["doc_name"] for q in financebench_queries]
# Classify each document by its name (e.g., "APPLE_2019_10K" -> "10-K")
doc_types = []
for doc in doc_names:
    if "_10K" in doc:
        doc_types.append("10-K")
    elif "_10Q" in doc:
        doc_types.append("10-Q")
    elif "earnings" in doc.lower() or "call" in doc.lower():
        doc_types.append("Earnings Call")
    else:
        doc_types.append("Other")

doc_type_counts = Counter(doc_types)
print(f"\nDocument type distribution:")
for doc_type, count in sorted(doc_type_counts.items(), key=lambda x: x[1], reverse=True):
    percentage = (count / len(doc_types)) * 100
    print(f"  {doc_type}: {count} ({percentage:.1f}%)")

# Company distribution (top 10); the company is taken to be the part of
# doc_name before the first underscore
companies = [doc.split('_')[0] for doc in doc_names]
company_counts = Counter(companies)
print(f"\nTop 10 companies:")
for company, count in company_counts.most_common(10):
    print(f"  {company}: {count} queries")

# %% [markdown]
# ## 5.6 Sample Queries Display

# %%
# Show a handful of randomly chosen queries
print("\n" + "="*60, "SAMPLE QUERIES", "="*60, sep="\n")

import random
random.seed(42)  # Fixed seed so the same samples are shown on every run

# Draw up to five distinct positions from the query list
sample_indices = random.sample(
    range(len(financebench_queries)),
    min(5, len(financebench_queries)),
)

for i, idx in enumerate(sample_indices, 1):
    query = financebench_queries[idx]
    print(
        f"\n[Sample {i}]",
        f"  ID: {query['financebench_id']}",
        f"  Doc: {query['doc_name']}",
        f"  Query: {query['original_query']}",
        sep="\n",
    )

# %% [markdown]
# ## 5.7 Prepare for Processing

# %%
# Expose the extracted queries under the constant name used by later steps
FINANCEBENCH_QUERIES = financebench_queries

step5_summary_lines = [
    "\n" + "="*60,
    "✓ STEP 5 COMPLETE - Dataset Ready",
    "="*60,
    "\nDataset summary:",
    f"  Total queries: {len(FINANCEBENCH_QUERIES)}",
    "  Stored in variable: FINANCEBENCH_QUERIES",
    "  Ready for expansion processing",
    "\nEach query contains:",
    "  ✓ financebench_id",
    "  ✓ original_query",
    "  ✓ doc_name",
    "\nNext: Proceed to Step 6: Processing Function",
]
print("\n".join(step5_summary_lines))

Loading FinanceBench dataset...
  Dataset: PatronusAI/financebench
  Split: train
✓ Loaded 150 queries from FinanceBench

SAMPLE DATASET ENTRY

Available fields:
  financebench_id: financebench_id_03029
  company: 3M
  doc_name: 3M_2018_10K
  question_type: metrics-generated
  question_reasoning: Information extraction
  domain_question_num: None
  question: What is the FY2018 capital expenditure amount (in USD millions) for 3M? Give a response to the quest...
  answer: $1577.00
  justification: The metric capital expenditures was directly extracted from the company 10K. The line item name, as ...
  dataset_subset_label: OPEN_SOURCE
  evidence: [{'evidence_text': 'Table of Contents \n3M Company and Subsidiaries\nConsolidated Statement of Cash Flow s\nYears ended December 31\n \n(Millions)\n \n2018\n \n2017\n \n2016\n \nCash Flows from Operating Activities\n \n \n \n \n \n \n \nNet income including noncontrolling interest\n \n$\n5,363 \n$\n4,869 \n$\n5,058 \nAdjustments to reconcile net

In [6]:
# ============================================================================
# Step 6: Processing Function
# ============================================================================
# 
# This section implements the main processing function that:
# 1. Takes a configuration (expansion type/subtype)
# 2. Processes all 150 FinanceBench queries (or dry run sample)
# 3. Applies the expansion with retry logic and delays
# 4. Saves results to JSON file
#
# Supports:
# - Dry run mode (test on N queries)
# - Progress tracking with tqdm
# - Automatic file skipping if output exists
# - Error handling with immediate failure (as requested)
# ============================================================================

# %% [markdown]
# ## 6.1 Main Processing Function

# %%
def process_expansion_config(
    config_name: str,
    config: Dict,
    queries: List[Dict],
    dry_run: bool = False,
    dry_run_size: int = DRY_RUN_SAMPLE_SIZE,
    output_dir: str = OUTPUT_DIR
) -> Dict:
    """
    Process all queries for a single expansion configuration.

    Builds an LLMChain from the configured template and the module-level
    LLM_PROVIDER / LLM_MODEL settings, expands every query with
    expand_single_query(), and (outside dry-run mode) writes the results with
    save_expansion_results(). If the output file already exists and this is
    not a dry run, nothing is processed.

    Args:
        config_name: Name of the configuration (e.g., "hyde_basic")
        config: Configuration dictionary with expansion_type,
            expansion_sub_type, template_key and temperature
        queries: List of query dictionaries from FinanceBench
        dry_run: If True, only process first N queries for testing
        dry_run_size: Number of queries to process in dry run
        output_dir: Directory to save output JSON

    Returns:
        Dictionary with processing statistics. "status" is one of
        "skipped", "dry_run_complete" or "completed"; the remaining keys
        depend on the status.

    Raises:
        KeyError: If the config or a template lookup is missing a key
        Exception: If any query expansion fails (fails immediately as requested)
    """
    # Extract config details
    expansion_type = config["expansion_type"]
    expansion_sub_type = config["expansion_sub_type"]
    template_key = config["template_key"]
    temperature = config["temperature"]

    # Check if output file already exists
    output_filename = f"expanded_queries_{expansion_type}_{expansion_sub_type}.json"
    output_path = os.path.join(output_dir, output_filename)

    if os.path.exists(output_path) and not dry_run:
        print(f"⏭️  File already exists: {output_filename}")
        print(f"   Skipping {config_name}")
        return {
            "status": "skipped",
            "reason": "output_file_exists",
            "output_path": output_path
        }

    # Select the queries first so the banner reports the real count even when
    # dry_run_size exceeds the number of available queries.
    if dry_run:
        queries_to_process = queries[:dry_run_size]
    else:
        queries_to_process = queries

    # Prepare for processing
    print("\n" + "="*60)
    print(f"PROCESSING: {config_name}")
    print("="*60)
    print(f"  Expansion Type: {expansion_type}")
    print(f"  Expansion Sub-Type: {expansion_sub_type}")
    print(f"  Template: {template_key}")
    print(f"  Temperature: {temperature}")
    print(f"  LLM: {LLM_PROVIDER}/{LLM_MODEL}")

    if dry_run:
        print(f"  Mode: DRY RUN (processing {len(queries_to_process)} queries)")
    else:
        print(f"  Mode: FULL RUN (processing {len(queries_to_process)} queries)")

    # Initialize LLM and chain
    print("\n  Initializing LLM...")
    llm = get_llm_client(LLM_PROVIDER, LLM_MODEL, temperature)
    prompt_template = PROMPT_TEMPLATES[template_key]
    # Resolve the raw template text now (it is only needed for the saved
    # metadata) so a missing key fails before any LLM call is made rather
    # than after the whole run has completed.
    template_string = None if dry_run else TEMPLATES[template_key]
    chain = LLMChain(llm=llm, prompt=prompt_template)
    print("  ✓ LLM chain ready")

    # Process queries
    print(f"\n  Processing {len(queries_to_process)} queries...")
    results = []
    start_time = time.time()

    # The context manager closes the progress bar even when a query fails
    with tqdm(queries_to_process, desc=f"  {config_name}", unit="query") as pbar:
        for query_data in pbar:
            # Placeholders keep the error report below meaningful (and free of
            # stale values from the previous iteration) if field extraction
            # itself is what fails.
            financebench_id = "<unknown>"
            original_query = "<unknown>"

            try:
                # Extract query info
                financebench_id = query_data["financebench_id"]
                original_query = query_data["original_query"]
                doc_name = query_data["doc_name"]

                # Expand query
                expanded_query = expand_single_query(
                    chain=chain,
                    query=original_query,
                    original_query=original_query
                )

                # Store result
                results.append({
                    "financebench_id": financebench_id,
                    "original_query": original_query,
                    "expanded_query": expanded_query,
                    "doc_name": doc_name
                })

            except Exception as e:
                # Fail immediately as requested: report the query and re-raise
                print(f"\n\n❌ ERROR processing query: {financebench_id}")
                print(f"   Original query: {original_query}")
                print(f"   Error: {str(e)}")
                print(f"\n⛔ Processing stopped. Fix the issue and re-run.")
                raise  # bare raise preserves the original traceback

    # Calculate statistics
    elapsed_time = time.time() - start_time
    avg_time_per_query = elapsed_time / len(results) if results else 0

    print(f"\n  ✓ Processed {len(results)} queries successfully")
    print(f"  ⏱️  Total time: {elapsed_time:.1f}s")
    print(f"  ⏱️  Average per query: {avg_time_per_query:.2f}s")

    # Save results (skip if dry run)
    if dry_run:
        print(f"\n  ℹ️  DRY RUN - Results not saved")
        print(f"  ℹ️  Set DRY_RUN_ENABLED=False to save results")

        # Display sample expansions (first 100 characters of each side)
        print(f"\n  Sample expansions:")
        for i, result in enumerate(results[:3], 1):
            print(f"\n  [{i}] Original:")
            print(f"      {result['original_query'][:100]}...")
            print(f"      Expanded:")
            print(f"      {result['expanded_query'][:100]}...")

        return {
            "status": "dry_run_complete",
            "queries_processed": len(results),
            "avg_time_per_query": avg_time_per_query
        }

    print(f"\n  Saving results...")

    # Save to JSON
    saved_path = save_expansion_results(
        results=results,
        expansion_type=expansion_type,
        expansion_sub_type=expansion_sub_type,
        template=template_string,
        llm_provider=LLM_PROVIDER,
        llm_model=LLM_MODEL,
        temperature=temperature,
        output_dir=output_dir
    )

    print(f"  ✓ Saved to: {saved_path}")

    return {
        "status": "completed",
        "queries_processed": len(results),
        "output_path": saved_path,
        "avg_time_per_query": avg_time_per_query,
        "total_time": elapsed_time
    }

print("✓ process_expansion_config() defined")

# %% [markdown]
# ## 6.2 Batch Processing Function

# %%
def process_all_configs(
    configs_to_run: List[str],
    all_configs: Dict,
    queries: List[Dict],
    dry_run: bool = DRY_RUN_ENABLED,
    output_dir: str = OUTPUT_DIR
) -> Dict:
    """
    Process multiple expansion configurations in batch.

    Configurations are processed in order with process_expansion_config().
    The batch stops at the first failure (including an unknown configuration
    name); the summary is still printed and returned in that case.

    Args:
        configs_to_run: List of configuration names to process
        all_configs: Dictionary of all available configurations
        queries: List of query dictionaries from FinanceBench
        dry_run: If True, run in dry run mode for all configs
        output_dir: Directory to save output files

    Returns:
        Dictionary with summary statistics for all configs:
        - total_configs: number of configurations requested
        - completed / skipped / failed: counters (dry-run completions count
          as completed)
        - details: one {"config_name", "result"} entry per processed config;
          a failed config gets a result of {"status": "failed", "error": ...}
    """
    print("\n" + "="*60)
    print("BATCH PROCESSING")
    print("="*60)
    print(f"  Configurations to process: {len(configs_to_run)}")
    print(f"  Total queries per config: {len(queries)}")
    print(f"  Dry run mode: {'YES' if dry_run else 'NO'}")
    print(f"  Output directory: {output_dir}")

    # Track results
    results_summary = {
        "total_configs": len(configs_to_run),
        "completed": 0,
        "skipped": 0,
        "failed": 0,
        "details": []
    }

    # Process each config
    for i, config_name in enumerate(configs_to_run, 1):
        print(f"\n{'='*60}")
        print(f"Configuration {i}/{len(configs_to_run)}")
        print(f"{'='*60}")

        try:
            # The lookup lives inside the try so that an unknown configuration
            # name is counted as a failure instead of escaping as a KeyError
            # and skipping the summary.
            config = all_configs[config_name]

            result = process_expansion_config(
                config_name=config_name,
                config=config,
                queries=queries,
                dry_run=dry_run,
                output_dir=output_dir
            )

            # Update summary
            if result["status"] == "skipped":
                results_summary["skipped"] += 1
            elif result["status"] in ("completed", "dry_run_complete"):
                results_summary["completed"] += 1

            results_summary["details"].append({
                "config_name": config_name,
                "result": result
            })

        except Exception as e:
            print(f"\n❌ Failed to process {config_name}")
            print(f"   Error: {str(e)}")
            results_summary["failed"] += 1
            # Record the failure so the returned details explain what went wrong
            results_summary["details"].append({
                "config_name": config_name,
                "result": {"status": "failed", "error": str(e)}
            })

            # Stop processing on failure (as requested)
            print(f"\n⛔ Batch processing stopped due to error")
            break

    # Print summary
    print("\n" + "="*60)
    print("BATCH PROCESSING SUMMARY")
    print("="*60)
    print(f"  Total configs: {results_summary['total_configs']}")
    print(f"  ✓ Completed: {results_summary['completed']}")
    print(f"  ⏭️  Skipped: {results_summary['skipped']}")
    print(f"  ❌ Failed: {results_summary['failed']}")

    return results_summary

print("✓ process_all_configs() defined")

# %% [markdown]
# ## 6.3 Test with Single Config (Optional)

# %%
# Optional manual test: the snippet below is kept inside a string literal so it
# does not run. Remove the surrounding triple quotes to execute it.
"""
print("Testing with single configuration...")

test_config_name = "hyde_basic"
test_config = ALL_EXPANSION_CONFIGS[test_config_name]

test_result = process_expansion_config(
    config_name=test_config_name,
    config=test_config,
    queries=FINANCEBENCH_QUERIES,
    dry_run=True,  # Test mode
    dry_run_size=3  # Only 3 queries
)

print("\nTest result:")
print(test_result)
"""

# Step 6 completion banner
print(
    "\n" + "="*60,
    "✓ STEP 6 COMPLETE - Processing Functions Ready",
    "="*60,
    "\nImplemented functions:",
    "  ✓ process_expansion_config() - Process single expansion method",
    "  ✓ process_all_configs() - Batch process multiple methods",
    "\nReady for execution!",
    "\nNext: Proceed to Step 7: Execution",
    sep="\n",
)

✓ process_expansion_config() defined
✓ process_all_configs() defined

✓ STEP 6 COMPLETE - Processing Functions Ready

Implemented functions:
  ✓ process_expansion_config() - Process single expansion method
  ✓ process_all_configs() - Batch process multiple methods

Ready for execution!

Next: Proceed to Step 7: Execution


In [7]:
# ============================================================================
# Step 7: Execution
# ============================================================================
# 
# This is the final step where we execute the query expansion process.
# 
# IMPORTANT: This section is designed to be run in stages:
# 1. First, run a DRY RUN to test a single config with a few queries
# 2. Review the expansions to ensure they look good
# 3. Then run the FULL EXECUTION for all selected configs
#
# The execution will:
# - Process all configs in CONFIGS_TO_RUN
# - Skip configs that already have output files
# - Stop immediately if any error occurs
# - Save results to JSON files in OUTPUT_DIR
# ============================================================================

# %% [markdown]
# ## 7.1 Pre-Execution Checklist
# 
# Before running, verify:
# - ✓ LLM provider and model are correctly configured
# - ✓ API keys are loaded (if using OpenAI)
# - ✓ CONFIGS_TO_RUN contains the configs you want to process
# - ✓ Output directory exists and is writable
# - ✓ You have reviewed the templates and configurations

# LLM Provider Configuration
# Choose: "openai" or "ollama"
# NOTE: process_expansion_config() reads LLM_PROVIDER and LLM_MODEL at call
# time, so the values assigned here are the ones used by the dry run and the
# full execution below. The Step 4 output earlier in this notebook was
# produced with ollama/llama3:8b, i.e. this cell changes the provider/model.
# NOTE(review): presumably these names are first defined in an earlier
# configuration cell — consider changing them there to avoid hidden state.
# LLM_PROVIDER = "openai"  # Change to "ollama" for local models
LLM_PROVIDER = "openai"  # Change to "ollama" for local models

# Model Selection
# OpenAI options: "gpt-4", "gpt-4-turbo", "gpt-3.5-turbo"
# Ollama options: "llama2", "mistral", etc.
# LLM_MODEL = "gpt-4"
# LLM_MODEL = "gpt-4o-mini"
LLM_MODEL = "gpt-4o"

# Names of the expansion configurations to run. Each name is looked up in
# ALL_EXPANSION_CONFIGS; uncomment an entry to include it in the batch.
CONFIGS_TO_RUN = [
    # HyDE expansions
    "hyde_basic",
    # "hyde_detailed",
    # "hyde_financial_terminology",
    
    # # Query refinement
    # "query_refinement_clarification",
    # "query_refinement_formal",
    # "query_refinement_keyword_focused",
    
    # # Term expansion
    # "term_expansion_abbreviation",
    # "term_expansion_synonym",
    # "term_expansion_context_addition",
    
    # # Chain-of-thought
    # "chain_of_thought_step_by_step",
    # "chain_of_thought_explicit_context",
    
    # # Domain adaptation
    # "domain_adaptation_accounting_perspective",
    # "domain_adaptation_10k_language",
]

# %%
print("="*60, "PRE-EXECUTION CHECKLIST", "="*60, sep="\n")

# Check 1: LLM Configuration
print(
    "\n1. LLM Configuration:",
    f"   Provider: {LLM_PROVIDER}",
    f"   Model: {LLM_MODEL}",
    "   ✓ Configured",
    sep="\n",
)

# Check 2: API Keys (only OpenAI needs one; other providers print nothing here)
print("\n2. API Keys:")
if LLM_PROVIDER == "ollama":
    print("   ✓ Ollama (no API key needed)")
    print(f"   ⚠️  Ensure Ollama is running at {OLLAMA_BASE_URL}")
elif LLM_PROVIDER == "openai":
    print("   ✓ OpenAI API key loaded" if OPENAI_API_KEY else "   ❌ OpenAI API key MISSING")

# Check 3: Output Directory
print("\n3. Output Directory:")
print(f"   Path: {OUTPUT_DIR}")
print("   ✓ Directory exists" if os.path.exists(OUTPUT_DIR) else "   ℹ️  Will be created automatically")

# Check 4: Configurations
print("\n4. Configurations to Process:")
print(f"   Total selected: {len(CONFIGS_TO_RUN)}")

# Split the selection by whether the expected output file is already on disk
configs_will_process = []
configs_will_skip = []

for config_name in CONFIGS_TO_RUN:
    config = ALL_EXPANSION_CONFIGS[config_name]
    output_filename = f"expanded_queries_{config['expansion_type']}_{config['expansion_sub_type']}.json"
    output_path = os.path.join(OUTPUT_DIR, output_filename)

    bucket = configs_will_skip if os.path.exists(output_path) else configs_will_process
    bucket.append(config_name)

print(f"   Will process: {len(configs_will_process)}")
print(f"   Will skip (existing): {len(configs_will_skip)}")

if configs_will_process:
    print("\n   Configs to process:")
    for config_name in configs_will_process:
        print(f"     - {config_name}")

if configs_will_skip:
    print("\n   Configs to skip (files exist):")
    for config_name in configs_will_skip:
        print(f"     - {config_name}")

# Check 5: Dataset
print("\n5. Dataset:")
print(f"   Queries loaded: {len(FINANCEBENCH_QUERIES)}")
print("   ✓ Ready")

# Check 6: Dry Run Status
print("\n6. Execution Mode:")
if not DRY_RUN_ENABLED:
    print(
        "   Mode: FULL EXECUTION",
        f"   Processing: All {len(FINANCEBENCH_QUERIES)} queries",
        "   ✓ Results will be saved to JSON files",
        sep="\n",
    )
else:
    print(
        "   Mode: DRY RUN",
        f"   Sample size: {DRY_RUN_SAMPLE_SIZE} queries",
        "   ℹ️  Results will NOT be saved",
        sep="\n",
    )

print("\n" + "="*60, "✓ Pre-execution checks complete", "="*60, sep="\n")

# %% [markdown]
# ## 7.2 Dry Run Execution (RECOMMENDED FIRST)
# 
# **Run this cell first to test with a single configuration.**
# 
# This will:
# - Process only the first config in CONFIGS_TO_RUN
# - Use only a few queries (defined by DRY_RUN_SAMPLE_SIZE)
# - Display sample expansions
# - NOT save to file
# 
# Review the expansions to ensure quality before running full execution.

# %%
# ============================================================================
# DRY RUN - Exercise the first selected configuration on a small sample
# ============================================================================

print("\n" + "="*60, "DRY RUN EXECUTION", "="*60, sep="\n")
print(
    "\nThis will test the first configuration with a few sample queries.",
    "Review the output before proceeding to full execution.\n",
    sep="\n",
)

if not CONFIGS_TO_RUN:
    # Nothing selected: explain how to fix it instead of running anything
    print("⚠️  No configurations selected in CONFIGS_TO_RUN")
    print("   Please add configurations to CONFIGS_TO_RUN and try again")
else:
    # The first selected configuration is the one under test
    test_config_name = CONFIGS_TO_RUN[0]
    test_config = ALL_EXPANSION_CONFIGS[test_config_name]

    print(
        f"Testing configuration: {test_config_name}",
        f"Sample size: {DRY_RUN_SAMPLE_SIZE} queries",
        "",
        sep="\n",
    )

    # dry_run=True limits the run to the sample and skips saving
    dry_run_result = process_expansion_config(
        config_name=test_config_name,
        config=test_config,
        queries=FINANCEBENCH_QUERIES,
        dry_run=True,
        dry_run_size=DRY_RUN_SAMPLE_SIZE
    )

    print("\n" + "="*60, "DRY RUN COMPLETE", "="*60, sep="\n")
    print(
        "\n✓ Review the sample expansions above",
        "✓ If they look good, proceed to full execution (Section 7.3)",
        "✓ If not, adjust templates or temperature and re-run",
        sep="\n",
    )

# %% [markdown]
# ## 7.3 Full Execution
# 
# **⚠️ IMPORTANT: Only run this after reviewing the dry run results!**
# 
# This will:
# - Process ALL configs in CONFIGS_TO_RUN
# - Process ALL 150 queries for each config
# - Save results to JSON files
# - Skip configs that already have output files
# - Stop immediately if any error occurs
# 
# **Estimated time:**
# - ~1-2 seconds per query (with delays)
# - ~150-300 seconds (2.5-5 minutes) per config
# - Total: ~30-65 minutes for all 13 configs
# 
# **Cost estimate (if using OpenAI):**
# - Depends on model and query length
# - GPT-4: ~$0.01-0.03 per query
# - Total for 150 queries: ~$1.50-4.50 per config
# - All 13 configs: ~$20-60

# %%
# ============================================================================
# FULL EXECUTION - Process all configurations
# ============================================================================
#
# Running this cell processes every configuration in CONFIGS_TO_RUN against
# the full query set and writes the results to OUTPUT_DIR.
# Run the dry run (Section 7.2) and review its output first.
# ============================================================================


print("\n" + "="*60)
print("FULL EXECUTION - STARTING")
print("="*60)
print("\n⚠️  This will process all selected configurations.")
print("⚠️  Make sure you have reviewed the dry run results.\n")

# Confirm execution
# input("Press Enter to start full execution, or Ctrl+C to cancel...")

# Run full execution
full_execution_results = process_all_configs(
    configs_to_run=CONFIGS_TO_RUN,
    all_configs=ALL_EXPANSION_CONFIGS,
    queries=FINANCEBENCH_QUERIES,
    dry_run=False,  # FULL EXECUTION
    output_dir=OUTPUT_DIR
)

# process_all_configs() stops at the first failing configuration and reports
# it through the "failed" counter; the banners below must reflect that.
execution_failed = full_execution_results['failed'] > 0

print("\n" + "="*60)
if execution_failed:
    print("FULL EXECUTION STOPPED WITH ERRORS")
else:
    print("FULL EXECUTION COMPLETE")
print("="*60)
print("\nResults summary:")
print(f"  Total configs: {full_execution_results['total_configs']}")
print(f"  ✓ Completed: {full_execution_results['completed']}")
print(f"  ⏭️  Skipped: {full_execution_results['skipped']}")
print(f"  ❌ Failed: {full_execution_results['failed']}")

# List the expansion files currently present in OUTPUT_DIR. The directory may
# not exist if nothing was ever saved, so guard the listing.
print(f"\nGenerated files:")
if os.path.isdir(OUTPUT_DIR):
    generated_files = [
        f for f in os.listdir(OUTPUT_DIR)
        if f.startswith('expanded_queries_') and f.endswith('.json')
    ]
else:
    generated_files = []

for filename in sorted(generated_files):
    filepath = os.path.join(OUTPUT_DIR, filename)
    file_size = os.path.getsize(filepath) / 1024  # KB
    print(f"  ✓ {filename} ({file_size:.1f} KB)")

if not generated_files:
    print("  (none)")

print("\n" + "="*60)
if execution_failed:
    print("❌ PROCESSING INCOMPLETE - fix the error above and re-run")
else:
    print("✓ ALL PROCESSING COMPLETE")
print("="*60)

# %% [markdown]
# ## 7.4 Verify Generated Files
# 
# After full execution, run this cell to verify all files were generated correctly.

# %%
def verify_output_files(output_dir: str = OUTPUT_DIR, expected_queries: int = 150) -> Dict:
    """
    Verify that all output files are valid and complete.

    Every file in output_dir whose name starts with 'expanded_queries_' and
    ends with '.json' is loaded with load_expansion_results. A file is valid
    when it loads, exposes the "queries" list and the metadata fields printed
    in the report, and holds exactly expected_queries entries. Each file is
    counted exactly once, either as valid or as invalid.

    Args:
        output_dir: Directory containing output JSON files
        expected_queries: Number of queries each file must contain
            (default 150, the query count this notebook processes)

    Returns:
        Dictionary with verification results:
            - "total_files": number of matching files found
            - "valid_files": number of files that passed every check
            - "invalid_files": names of files that failed to load, were
              missing expected fields, or had the wrong query count
            - "file_details": one entry (filename, valid, num_queries,
              metadata) per file that could be loaded and reported
    """
    print("\n" + "="*60)
    print("VERIFYING OUTPUT FILES")
    print("="*60)

    verification_results = {
        "total_files": 0,
        "valid_files": 0,
        "invalid_files": [],
        "file_details": []
    }

    # Get all JSON files; a missing directory is reported as "no files"
    # instead of aborting with an unhandled FileNotFoundError.
    if os.path.isdir(output_dir):
        json_files = sorted(
            f for f in os.listdir(output_dir)
            if f.startswith('expanded_queries_') and f.endswith('.json')
        )
    else:
        print(f"\n❌ Output directory not found: {output_dir}")
        print("   Run full execution first to generate files")
        json_files = []
    verification_results["total_files"] = len(json_files)

    print(f"\nFound {len(json_files)} output files\n")

    for filename in json_files:
        filepath = os.path.join(output_dir, filename)

        try:
            # Load and validate
            data = load_expansion_results(filepath)
            num_queries = len(data["queries"])
            metadata = data["metadata"]

            # Build the report lines BEFORE touching any counter: a missing
            # metadata key raises here, so the file is recorded only once
            # (as invalid) instead of being counted in both buckets.
            detail_lines = [
                f"   Type: {metadata['expansion_type']}/{metadata['expansion_sub_type']}",
                f"   Queries: {num_queries}/{expected_queries}",
                f"   Model: {metadata['llm_provider']}/{metadata['llm_model']}",
                f"   Temperature: {metadata['temperature']}",
                f"   Created: {metadata['creation_date']}",
            ]
        except Exception as e:
            # Best-effort verification: report the problem and keep going
            # with the remaining files. The exception type is included
            # because e.g. a bare KeyError message is just the key name.
            print(f"❌ {filename}")
            print(f"   Error: {type(e).__name__}: {e}")
            print()
            verification_results["invalid_files"].append(filename)
            continue

        # Verify
        is_valid = num_queries == expected_queries

        if is_valid:
            verification_results["valid_files"] += 1
            status = "✓"
        else:
            verification_results["invalid_files"].append(filename)
            status = "⚠️"

        print(f"{status} {filename}")
        for line in detail_lines:
            print(line)
        print()

        verification_results["file_details"].append({
            "filename": filename,
            "valid": is_valid,
            "num_queries": num_queries,
            "metadata": metadata
        })

    # Summary
    print("="*60)
    print("VERIFICATION SUMMARY")
    print("="*60)
    print(f"  Total files: {verification_results['total_files']}")
    print(f"  ✓ Valid: {verification_results['valid_files']}")
    print(f"  ⚠️  Invalid: {len(verification_results['invalid_files'])}")

    if verification_results["invalid_files"]:
        print("\n  Invalid files:")
        for filename in verification_results["invalid_files"]:
            print(f"    - {filename}")

    return verification_results

# Uncomment to verify files after execution.
# (Kept as a real comment: a bare triple-quoted string is an expression
# statement, not a comment.)
# verification_results = verify_output_files(OUTPUT_DIR)

print("\n⚠️  Verification code is commented out")
print("   Uncomment after running full execution to verify files")

# %% [markdown]
# ## 7.5 Load and Inspect a Sample File

# %%
def inspect_sample_file(filename: str, output_dir: str = OUTPUT_DIR, num_samples: int = 3):
    """
    Print the metadata and the leading expansions stored in one output file.

    Nothing is returned and nothing is raised to the caller: a missing file
    and any other exception are each reported on stdout.

    Args:
        filename: Name of the JSON file, resolved inside output_dir
        output_dir: Directory that holds the generated file
        num_samples: How many queries from the start of the file to print
    """
    target_path = os.path.join(output_dir, filename)
    banner = "=" * 60

    print("\n" + banner)
    print(f"INSPECTING: {filename}")
    print(banner)

    try:
        contents = load_expansion_results(target_path)

        # --- metadata header ---
        meta = contents["metadata"]
        print("\nMetadata:")
        print(f"  Expansion Type: {meta['expansion_type']}")
        print(f"  Expansion Sub-Type: {meta['expansion_sub_type']}")
        print(f"  LLM: {meta['llm_provider']}/{meta['llm_model']}")
        print(f"  Temperature: {meta['temperature']}")
        print(f"  Total Queries: {meta['total_queries']}")
        print(f"  Created: {meta['creation_date']}")

        # --- sample expansions ---
        entries = contents["queries"]
        shown = min(num_samples, len(entries))
        print(f"\nSample Expansions (showing {shown}):")

        for position, entry in enumerate(entries[:num_samples], start=1):
            print(f"\n[Sample {position}]")
            print(f"  ID: {entry['financebench_id']}")
            print(f"  Doc: {entry['doc_name']}")
            print("  Original:")
            print(f"    {entry['original_query']}")
            print("  Expanded:")
            print(f"    {entry['expanded_query']}")

        print("\n" + banner)

    except FileNotFoundError:
        # Most likely the file has not been generated yet
        print(f"\n❌ File not found: {target_path}")
        print("   Run full execution first to generate files")
    except Exception as err:
        # Anything else (including missing fields) is reported, not raised
        print(f"\n❌ Error loading file: {str(err)}")

# Inspect a sample file. The call is live; if the file has not been generated
# yet, inspect_sample_file prints a "File not found" message instead of raising.

# Example: Inspect the hyde_basic results
inspect_sample_file("expanded_queries_hyde_basic.json", num_samples=5)

# %% [markdown]
# ## 7.6 Final Summary

# %%
# Closing banner for Step 7
print(
    "\n" + "="*60,
    "✓ STEP 7 COMPLETE - EXECUTION READY",
    "="*60,
    sep="\n",
)

# Recap of the execution workflow and which steps are still pending
print(
    "\nExecution workflow:",
    "  1. ✓ Review pre-execution checklist (7.1)",
    "  2. ✓ Run dry run to test (7.2)",
    "  3. → Review dry run results",
    "  4. → Uncomment and run full execution (7.3)",
    "  5. → Verify generated files (7.4)",
    "  6. → Inspect sample results (7.5)",
    sep="\n",
)

# Report the mode selected by DRY_RUN_ENABLED
print("\nCurrent status:")
print(
    *(
        ("  Mode: DRY RUN", "  → Safe to run Section 7.2")
        if DRY_RUN_ENABLED
        else ("  Mode: FULL EXECUTION", "  → Ready to run Section 7.3 (uncomment code first)")
    ),
    sep="\n",
)

# Where the generated JSON files go
print("\nOutput files will be saved to:")
print(f"  {OUTPUT_DIR}")

print(
    "\n" + "="*60,
    "🎯 QUERY EXPANSION CREATOR - READY TO USE",
    "="*60,
    sep="\n",
)

PRE-EXECUTION CHECKLIST

1. LLM Configuration:
   Provider: openai
   Model: gpt-4o
   ✓ Configured

2. API Keys:
   ✓ OpenAI API key loaded

3. Output Directory:
   Path: ../../expanded_queries
   ✓ Directory exists

4. Configurations to Process:
   Total selected: 1
   Will process: 1
   Will skip (existing): 0

   Configs to process:
     - hyde_basic

5. Dataset:
   Queries loaded: 150
   ✓ Ready

6. Execution Mode:
   Mode: FULL EXECUTION
   Processing: All 150 queries
   ✓ Results will be saved to JSON files

✓ Pre-execution checks complete

DRY RUN EXECUTION

This will test the first configuration with a few sample queries.
Review the output before proceeding to full execution.

Testing configuration: hyde_basic
Sample size: 5 queries


PROCESSING: hyde_basic
  Expansion Type: hyde
  Expansion Sub-Type: basic
  Template: hyde_basic
  Temperature: 0.7
  LLM: openai/gpt-4o
  Mode: DRY RUN (processing 5 queries)

  Initializing LLM...
  ✓ LLM chain ready

  Processing 5 queries...


  chain = LLMChain(llm=llm, prompt=prompt_template)


  hyde_basic:   0%|          | 0/5 [00:00<?, ?query/s]

  response = chain.run(query=query)



  ✓ Processed 5 queries successfully
  ⏱️  Total time: 25.9s
  ⏱️  Average per query: 5.18s

  ℹ️  DRY RUN - Results not saved
  ℹ️  Set DRY_RUN_ENABLED=False to save results

  Sample expansions:

  [1] Original:
      What is the FY2018 capital expenditure amount (in USD millions) for 3M? Give a response to the quest...
      Expanded:
      In FY2018, 3M's capital expenditures amounted to approximately $1,603 million, as detailed in the ca...

  [2] Original:
      Assume that you are a public equities analyst. Answer the following question by primarily using info...
      Expanded:
      As of the year-end FY2018, the net property, plant, and equipment (PP&E) for 3M Company is reported ...

  [3] Original:
      Is 3M a capital-intensive business based on FY2022 data?...
      Expanded:
      Yes, 3M is considered a capital-intensive business based on FY2022 data. The company reported signif...

DRY RUN COMPLETE

✓ Review the sample expansions above
✓ If they look good, proceed to

  hyde_basic:   0%|          | 0/150 [00:00<?, ?query/s]


  ✓ Processed 150 queries successfully
  ⏱️  Total time: 612.3s
  ⏱️  Average per query: 4.08s

  Saving results...
  ✓ Saved to: ../../expanded_queries/expanded_queries_hyde_basic.json

BATCH PROCESSING SUMMARY
  Total configs: 1
  ✓ Completed: 1
  ⏭️  Skipped: 0
  ❌ Failed: 0

FULL EXECUTION COMPLETE

Results summary:
  Total configs: 1
  ✓ Completed: 1
  ⏭️  Skipped: 0
  ❌ Failed: 0

Generated files:
  ✓ expanded_queries_hyde_basic.json (118.3 KB)
  ✓ expanded_queries_hyde_basic_llama3.json (138.8 KB)

✓ ALL PROCESSING COMPLETE

⚠️  Full execution code is commented out
   Uncomment the code block above to run full execution

⚠️  Verification code is commented out
   Uncomment after running full execution to verify files

INSPECTING: expanded_queries_hyde_basic.json

Metadata:
  Expansion Type: hyde
  Expansion Sub-Type: basic
  LLM: openai/gpt-4o
  Temperature: 0.7
  Total Queries: 150
  Created: 2025-10-23T22:22:53.762816

Sample Expansions (showing 5):

[Sample 1]
  ID: financebe