In [2]:
# ============================================================================
# Text-Based Evaluation Notebook - FinanceBench RAG
# Evaluating Retrieval with Page-Based AND Text-Based Metrics
# ============================================================================

# %% [markdown]
# # FinanceBench Text-Based Evaluation
# 
# This notebook evaluates RAG retrieval performance using BOTH:
# 1. **Page-based metrics**: MRR, Recall, Precision, F1 (based on page number matching)
# 2. **Text-based metrics**: MRR, Recall, Precision, F1 (based on semantic similarity)
# 
# We use Sentence-BERT (all-MiniLM-L6-v2) to compute cosine similarity between
# retrieved chunks and ground truth evidence text.

# %% [markdown]
# ## 1.1 Standard Imports

# %%
import os
import json
from pathlib import Path
from typing import List, Dict, Tuple, Optional
from collections import defaultdict

# Environment
from dotenv import load_dotenv

# Progress tracking
from tqdm.auto import tqdm

# Data handling
from datasets import load_dataset

# Numerical operations
import numpy as np

# Vector stores and embeddings
from langchain.vectorstores import Chroma
from langchain_ollama import OllamaEmbeddings
from langchain_openai import OpenAIEmbeddings
from langchain_voyageai import VoyageAIEmbeddings

print("✓ Standard imports successful")

# %% [markdown]
# ## 1.2 Text Similarity Imports (NEW)

# %%
# Sentence-BERT for semantic similarity
from sentence_transformers import SentenceTransformer

# Cosine similarity calculation
from sklearn.metrics.pairwise import cosine_similarity

print("✓ Text similarity imports successful")

# %% [markdown]
# ## 1.3 Load Environment Variables

# %%
# Load .env file
load_dotenv()

# API Keys and URLs
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")
VOYAGE_API_KEY = os.getenv("VOYAGE_API_KEY")

# Verify API keys
if OPENAI_API_KEY:
    print("✓ OpenAI API key loaded")
else:
    print("⚠ OpenAI API key not found (only needed if using OpenAI embeddings)")

if VOYAGE_API_KEY:
    print("✓ VoyageAI API key loaded")
else:
    print("⚠ VoyageAI API key not found (only needed if using VoyageAI embeddings)")

print(f"✓ Ollama URL: {OLLAMA_BASE_URL}")

# %% [markdown]
# ## 1.4 Configuration Variables

# %%
# Directory paths
VECTOR_DB_BASE_DIR = "../../vector_databases"
EXPANDED_QUERIES_DIR = "../../query_enhancement_set"
OUTPUT_DIR = "../../evaluation_results/query_enhancement"

os.makedirs(OUTPUT_DIR, exist_ok=True)

# Dataset configuration
DATASET_NAME = "PatronusAI/financebench"
DATASET_SPLIT = "train"

# Vector database configuration
COLLECTION_PREFIX = "financebench_docs_chunk_"

# ============================================================================
# TEXT-BASED EVALUATION PARAMETERS (NEW)
# ============================================================================

# Sentence-BERT model for semantic similarity
SBERT_MODEL_NAME = "all-MiniLM-L6-v2"

# Similarity threshold for text-based matching
# Chunks with cosine similarity >= this threshold are considered matches
TEXT_SIMILARITY_THRESHOLD = 0.8

# Chunk text preview settings
# We store abbreviated chunk text: "first N chars...last N chars"
CHUNK_TEXT_PREFIX_CHARS = 100  # Characters to keep from start
CHUNK_TEXT_SUFFIX_CHARS = 100  # Characters to keep from end

# ============================================================================

print("✓ Configuration set")
print(f"  Vector DB Directory: {VECTOR_DB_BASE_DIR}")
print(f"  Output Directory: {OUTPUT_DIR}")
print(f"\n  Sentence-BERT Model: {SBERT_MODEL_NAME}")
print(f"  Text Similarity Threshold: {TEXT_SIMILARITY_THRESHOLD}")
print(f"  Chunk Text Preview: First {CHUNK_TEXT_PREFIX_CHARS} + Last {CHUNK_TEXT_SUFFIX_CHARS} chars")

# %% [markdown]
# ## 1.5 Load FinanceBench Dataset

# %%
print("\nLoading FinanceBench dataset...")
dataset = load_dataset(DATASET_NAME, split=DATASET_SPLIT)
print(f"✓ Loaded {len(dataset)} queries")

# Show sample query with evidence structure
print("\nSample query:")
sample = dataset[0]
print(f"  ID: {sample['financebench_id']}")
print(f"  Company: {sample['company']}")
print(f"  Question: {sample['question'][:100]}...")
print(f"  Doc: {sample['doc_name']}")
print(f"  Evidence items: {len(sample['evidence'])}")

# Show evidence structure
if len(sample['evidence']) > 0:
    print("\n  First evidence item structure:")
    evidence_item = sample['evidence'][0]
    print(f"    - doc_name: {evidence_item['doc_name']}")
    print(f"    - evidence_page_num: {evidence_item['evidence_page_num']}")
    print(f"    - evidence_text (first 100 chars): {evidence_item['evidence_text'][:100]}...")
    print(f"    - Has 'evidence_text_full_page': {'evidence_text_full_page' in evidence_item}")

# %%
print("\n" + "="*60)
print("✓ STEP 1 COMPLETE!")
print("="*60)
print("  ✓ All imports loaded")
print("  ✓ Environment variables configured")
print("  ✓ Paths set up")
print(f"  ✓ Dataset loaded: {len(dataset)} queries")
print(f"  ✓ Text similarity threshold: {TEXT_SIMILARITY_THRESHOLD}")
print(f"  ✓ Chunk preview length: {CHUNK_TEXT_PREFIX_CHARS} + {CHUNK_TEXT_SUFFIX_CHARS} chars")
print("="*60)

✓ Standard imports successful
✓ Text similarity imports successful
✓ OpenAI API key loaded
✓ VoyageAI API key loaded
✓ Ollama URL: http://localhost:11434
✓ Configuration set
  Vector DB Directory: ../../vector_databases
  Output Directory: ../../evaluation_results/query_enhancement

  Sentence-BERT Model: all-MiniLM-L6-v2
  Text Similarity Threshold: 0.8
  Chunk Text Preview: First 100 + Last 100 chars

Loading FinanceBench dataset...
✓ Loaded 150 queries

Sample query:
  ID: financebench_id_03029
  Company: 3M
  Question: What is the FY2018 capital expenditure amount (in USD millions) for 3M? Give a response to the quest...
  Doc: 3M_2018_10K
  Evidence items: 1

  First evidence item structure:
    - doc_name: 3M_2018_10K
    - evidence_page_num: 59
    - evidence_text (first 100 chars): Table of Contents 
3M Company and Subsidiaries
Consolidated Statement of Cash Flow s
Years ended Dec...
    - Has 'evidence_text_full_page': True

✓ STEP 1 COMPLETE!
  ✓ All imports loaded
  ✓ Enviro

In [3]:
# ============================================================================
# Step 1.A: Test expanded queries loading functions
# ============================================================================

def load_expanded_queries(
    expanded_queries_dir: str,
    expansion_type: str,
    expansion_subtype: str
) -> Dict[str, Dict]:
    """
    Load expanded queries from JSON file and create a lookup dictionary.
    
    Args:
        expanded_queries_dir: Directory containing expanded query JSON files
        expansion_type: Type of expansion (e.g., "hyde", "query2doc", etc.)
        expansion_subtype: Subtype of expansion (e.g., "basic", "cot", etc.)
        
    Returns:
        Dictionary mapping financebench_id to expanded query information
        Format: {
            'financebench_id_xxxxx': {
                'original_query': '...',
                'expanded_query': '...',
                'doc_name': '...'
            },
            ...
        }
        
    Example:
        expanded_lookup = load_expanded_queries(
            "../../expanded_queries",
            "hyde",
            "basic"
        )
        # Get expanded query for a specific ID
        expanded_info = expanded_lookup['financebench_id_03029']
        print(expanded_info['expanded_query'])
    """
    # Construct filename
    filename = f"expanded_queries_{expansion_type}_{expansion_subtype}.json"
    filepath = os.path.join(expanded_queries_dir, filename)
    
    # Check if file exists
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"Expanded queries file not found: {filepath}")
    
    # Load JSON file
    print(f"Loading expanded queries from: {filename}")
    with open(filepath, 'r') as f:
        data = json.load(f)
    
    # Extract metadata
    metadata = data.get('metadata', {})
    queries = data.get('queries', [])
    
    print(f"\nExpanded Queries Info:")
    print(f"  Type: {metadata.get('expansion_type', 'N/A')}")
    print(f"  Subtype: {metadata.get('expansion_sub_type', 'N/A')}")
    print(f"  LLM: {metadata.get('llm_provider', 'N/A')}/{metadata.get('llm_model', 'N/A')}")
    print(f"  Total queries: {len(queries)}")
    
    # Create lookup dictionary indexed by financebench_id
    expanded_lookup = {}
    for query_data in queries:
        fb_id = query_data['financebench_id']
        expanded_lookup[fb_id] = {
            'original_query': query_data['original_query'],
            'expanded_query': query_data['expanded_query'],
            'doc_name': query_data['doc_name']
        }
    
    print(f"  ✓ Created lookup dictionary with {len(expanded_lookup)} entries")
    
    return expanded_lookup


def test_expanded_queries_loading(
    expanded_queries_dir: str = EXPANDED_QUERIES_DIR,
    expansion_type: str = "hyde",
    expansion_subtype: str = "basic"
):
    """
    Test function to verify expanded queries loading works correctly.
    
    Args:
        expanded_queries_dir: Directory containing expanded query files
        expansion_type: Type of expansion to test
        expansion_subtype: Subtype of expansion to test
    """
    print("\n" + "="*60)
    print("TESTING EXPANDED QUERIES LOADING")
    print("="*60)
    
    try:
        # Load expanded queries
        expanded_lookup = load_expanded_queries(
            expanded_queries_dir,
            expansion_type,
            expansion_subtype
        )
        
        # Test retrieval with a sample ID
        sample_id = "financebench_id_03029"
        if sample_id in expanded_lookup:
            print(f"\n✓ Successfully retrieved expanded query for {sample_id}")
            print(f"\nOriginal query (first 100 chars):")
            print(f"  {expanded_lookup[sample_id]['original_query'][:100]}...")
            print(f"\nExpanded query (first 150 chars):")
            print(f"  {expanded_lookup[sample_id]['expanded_query'][:150]}...")
            print(f"\nDoc name: {expanded_lookup[sample_id]['doc_name']}")
        else:
            print(f"\n⚠ Sample ID {sample_id} not found in lookup")
        
        print("\n" + "="*60)
        print("✓ EXPANDED QUERIES LOADING TEST COMPLETE")
        print("="*60)
        
        return expanded_lookup
        
    except Exception as e:
        print(f"\n✗ Error during test: {e}")
        print("\nPossible issues:")
        print(f"  1. File not found: expanded_queries_{expansion_type}_{expansion_subtype}.json")
        print(f"  2. Directory path incorrect: {expanded_queries_dir}")
        print("  3. JSON format issue")
        return None


# Example usage:
print("✓ Expanded queries loading functions defined")

# Test the function
expanded_lookup = test_expanded_queries_loading(
    expanded_queries_dir=EXPANDED_QUERIES_DIR,
    expansion_type="hyde",
    expansion_subtype="basic"
)

# Access expanded queries by financebench_id
if expanded_lookup:
    query_info = expanded_lookup['financebench_id_03029']
    print("Sample expanded query info:\n")
    print("Original query:", query_info['original_query'])
    print("Expanded query:", query_info['expanded_query'])

✓ Expanded queries loading functions defined

TESTING EXPANDED QUERIES LOADING
Loading expanded queries from: expanded_queries_hyde_basic.json

Expanded Queries Info:
  Type: hyde
  Subtype: basic
  LLM: openai/gpt-4o-mini
  Total queries: 150
  ✓ Created lookup dictionary with 150 entries

✓ Successfully retrieved expanded query for financebench_id_03029

Original query (first 100 chars):
  What is the FY2018 capital expenditure amount (in USD millions) for 3M? Give a response to the quest...

Expanded query (first 150 chars):
  For the fiscal year 2018, 3M reported capital expenditures of $1,450 million, as detailed in the consolidated cash flow statement....

Doc name: 3M_2018_10K

✓ EXPANDED QUERIES LOADING TEST COMPLETE
Sample expanded query info:

Original query: What is the FY2018 capital expenditure amount (in USD millions) for 3M? Give a response to the question by relying on the details shown in the cash flow statement.
Expanded query: For the fiscal year 2018, 3M reported ca

In [4]:
# ============================================================================
# Step 2: Load Sentence-BERT Model
# ============================================================================

# %% [markdown]
# ## 2.1 Load Sentence-BERT Model
# 
# We load the `all-MiniLM-L6-v2` model once at the start.
# This model will be used to:
# 1. Encode evidence texts (done once and cached)
# 2. Encode retrieved chunk texts (done for each retrieval)
# 3. Calculate cosine similarity between them

# %%
def load_sentence_bert_model(model_name: str = SBERT_MODEL_NAME):
    """
    Load Sentence-BERT model for semantic similarity computation.
    
    Args:
        model_name: Name of the sentence-transformers model
        
    Returns:
        SentenceTransformer model instance
    
    Notes:
        - all-MiniLM-L6-v2: 384-dimensional embeddings, ~80MB model
        - First load downloads model from HuggingFace
        - Subsequent loads use cached model
        - Uses CPU by default (can be moved to GPU if available)
    """
    print(f"\nLoading Sentence-BERT model: {model_name}")
    print("  (First run will download model from HuggingFace...)")
    
    try:
        model = SentenceTransformer(model_name)
        print(f"✓ Model loaded successfully")
        print(f"  Embedding dimension: {model.get_sentence_embedding_dimension()}")
        print(f"  Max sequence length: {model.max_seq_length}")
        
        return model
        
    except Exception as e:
        print(f"✗ Failed to load model: {e}")
        raise

# %%
# Load the model
sbert_model = load_sentence_bert_model()

# %% [markdown]
# ## 2.2 Test the Model
# 
# Let's verify the model works correctly by encoding sample texts

# %%
def test_sentence_bert_model(model):
    """
    Test Sentence-BERT model with sample texts.
    Verifies encoding and similarity calculation work correctly.
    """
    print("\n" + "="*60)
    print("TESTING SENTENCE-BERT MODEL")
    print("="*60)
    
    # Sample texts
    # text1 = "The company's revenue increased by 15% in Q4 2023."
    # text2 = "Revenue grew 15 percent in the fourth quarter of 2023."
    # text3 = "The weather was sunny and pleasant today."
    text1 = """
    In fiscal year 2018, capital expenditures were $1,577 million, compared to $1,432 million in fiscal year 2017. The increase was primarily due to investments in property, plant, and equipment to support our growth initiatives and enhance operational efficiency."""
    text2 = """
    In fiscal year 2018, capital expenditures were $1,577 million, compared to $1,432 million in fiscal year 2017. This increase was mainly driven by investments in property, plant, and equipment to support growth initiatives and improve operational efficiency."""
    text3 = """
    The weather was sunny and pleasant today."""
    
    print("\nTest texts:")
    print(f"  Text 1: {text1}")
    print(f"  Text 2: {text2}")
    print(f"  Text 3: {text3}")
    
    # Encode texts
    print("\nEncoding texts...")
    embeddings = model.encode([text1, text2, text3])
    
    print(f"✓ Generated embeddings shape: {embeddings.shape}")
    print(f"  (3 texts × {embeddings.shape[1]} dimensions)")
    
    # Calculate similarities
    print("\nCalculating cosine similarities:")
    
    # Similarity between text1 and text2 (semantically similar)
    sim_1_2 = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
    print(f"  Text 1 ↔ Text 2: {sim_1_2:.4f} (should be HIGH - same meaning)")
    
    # Similarity between text1 and text3 (semantically different)
    sim_1_3 = cosine_similarity([embeddings[0]], [embeddings[2]])[0][0]
    print(f"  Text 1 ↔ Text 3: {sim_1_3:.4f} (should be LOW - different topics)")
    
    # Similarity between text2 and text3 (semantically different)
    sim_2_3 = cosine_similarity([embeddings[1]], [embeddings[2]])[0][0]
    print(f"  Text 2 ↔ Text 3: {sim_2_3:.4f} (should be LOW - different topics)")
    
    # Verify results make sense
    print("\nValidation:")
    if sim_1_2 > 0.7:
        print(f"  ✓ Similar texts have high similarity ({sim_1_2:.4f} > 0.7)")
    else:
        print(f"  ⚠ Similar texts have lower similarity than expected ({sim_1_2:.4f})")
    
    if sim_1_3 < 0.5:
        print(f"  ✓ Different texts have low similarity ({sim_1_3:.4f} < 0.5)")
    else:
        print(f"  ⚠ Different texts have higher similarity than expected ({sim_1_3:.4f})")
    
    print("\n" + "="*60)
    print("✓ MODEL TEST COMPLETE")
    print("="*60)
    
    return True

# %%
# Run the test
test_result = test_sentence_bert_model(sbert_model)

# %% [markdown]
# ## 2.3 Test with Actual FinanceBench Evidence
# 
# Let's test with real evidence text from the dataset

# %%
def test_with_real_evidence(model, dataset):
    """
    Test model with actual FinanceBench evidence text.
    This helps verify the model works well with financial domain text.
    """
    print("\n" + "="*60)
    print("TESTING WITH REAL FINANCEBENCH EVIDENCE")
    print("="*60)
    
    # Get first query with evidence
    sample = dataset[0]
    evidence_text = sample['evidence'][0]['evidence_text']
    
    print(f"\nQuery: {sample['question'][:100]}...")
    print(f"\nEvidence text (first 200 chars):")
    print(f"  {evidence_text[:200]}...")
    
    # Create some test chunks
    # Chunk 1: Exact match (should have very high similarity)
    chunk1 = evidence_text
    
    # Chunk 2: Paraphrased version (should have high similarity)
    chunk2 = "Capital expenditures totaled $1,577 million in fiscal year 2018."
    
    # Chunk 3: Different financial topic (should have lower similarity)
    chunk3 = "The company reported strong earnings growth driven by increased sales."
    
    print("\nTest chunks:")
    print(f"  Chunk 1: Exact match - {chunk1[:80]}...")
    print(f"  Chunk 2: Paraphrased - {chunk2}")
    print(f"  Chunk 3: Different topic - {chunk3}")
    
    # Encode
    evidence_embedding = model.encode([evidence_text])
    chunk_embeddings = model.encode([chunk1, chunk2, chunk3])
    
    # Calculate similarities
    print("\nSimilarities with evidence:")
    for i, chunk_emb in enumerate(chunk_embeddings):
        sim = cosine_similarity(evidence_embedding, [chunk_emb])[0][0]
        match_status = "✓ MATCH" if sim >= TEXT_SIMILARITY_THRESHOLD else "✗ NO MATCH"
        print(f"  Chunk {i+1}: {sim:.4f} {match_status}")
    
    print("\n" + "="*60)
    print("✓ REAL EVIDENCE TEST COMPLETE")
    print("="*60)
    
    return True

# %%
# Run test with real evidence
real_evidence_test = test_with_real_evidence(sbert_model, dataset)

# %%
print("\n" + "="*60)
print("✓ STEP 2 COMPLETE!")
print("="*60)
print("  ✓ Sentence-BERT model loaded")
print(f"  ✓ Model: {SBERT_MODEL_NAME}")
print(f"  ✓ Embedding dimension: {sbert_model.get_sentence_embedding_dimension()}")
print("  ✓ Model tested with sample texts")
print("  ✓ Model tested with real FinanceBench evidence")
print("  ✓ Ready for evidence embedding pre-computation")
print("="*60)


Loading Sentence-BERT model: all-MiniLM-L6-v2
  (First run will download model from HuggingFace...)
✓ Model loaded successfully
  Embedding dimension: 384
  Max sequence length: 256

TESTING SENTENCE-BERT MODEL

Test texts:
  Text 1: 
    In fiscal year 2018, capital expenditures were $1,577 million, compared to $1,432 million in fiscal year 2017. The increase was primarily due to investments in property, plant, and equipment to support our growth initiatives and enhance operational efficiency.
  Text 2: 
    In fiscal year 2018, capital expenditures were $1,577 million, compared to $1,432 million in fiscal year 2017. This increase was mainly driven by investments in property, plant, and equipment to support growth initiatives and improve operational efficiency.
  Text 3: 
    The weather was sunny and pleasant today.

Encoding texts...
✓ Generated embeddings shape: (3, 384)
  (3 texts × 384 dimensions)

Calculating cosine similarities:
  Text 1 ↔ Text 2: 0.9967 (should be HIGH - same

In [5]:
# ============================================================================
# Step 3: Pre-compute Evidence Embeddings
# ============================================================================

# %% [markdown]
# ## 3.1 Extract All Evidence Texts
# 
# We need to:
# 1. Extract all unique evidence texts from the dataset
# 2. Create a mapping structure for quick lookup
# 3. Pre-compute embeddings once (instead of computing them 150 times)

# %%
def extract_all_evidence_from_dataset(dataset):
    """
    Extract all evidence items from the dataset.
    
    Returns a list of evidence items with metadata:
    - query_id: Which query this evidence belongs to
    - evidence_index: Index within that query's evidence list
    - doc_name: Source document
    - page_number: Evidence page (1-indexed for consistency)
    - evidence_text: The actual text content
    
    This structure allows us to:
    1. Pre-compute embeddings for all evidence
    2. Map back to original queries during evaluation
    """
    print("\n" + "="*60)
    print("EXTRACTING EVIDENCE FROM DATASET")
    print("="*60)
    
    all_evidence = []
    evidence_texts = []
    
    print(f"\nProcessing {len(dataset)} queries...")
    
    for record in tqdm(dataset, desc="Extracting evidence"):
        query_id = record['financebench_id']
        evidence_list = record['evidence']
        
        for evidence_idx, evidence_item in enumerate(evidence_list):
            # Extract evidence information
            evidence_entry = {
                'query_id': query_id,
                'evidence_index': evidence_idx,
                'doc_name': evidence_item['doc_name'],
                'page_number': evidence_item['evidence_page_num'] + 1,  # Convert to 1-indexed
                'evidence_text': evidence_item['evidence_text']
            }
            
            all_evidence.append(evidence_entry)
            evidence_texts.append(evidence_item['evidence_text'])
    
    print(f"\n✓ Extracted {len(all_evidence)} evidence items")
    print(f"  From {len(dataset)} queries")
    print(f"  Average evidence per query: {len(all_evidence)/len(dataset):.2f}")
    
    # Show statistics
    evidence_per_query = {}
    for record in dataset:
        query_id = record['financebench_id']
        evidence_per_query[query_id] = len(record['evidence'])
    
    print(f"\nEvidence distribution:")
    print(f"  Min evidence per query: {min(evidence_per_query.values())}")
    print(f"  Max evidence per query: {max(evidence_per_query.values())}")
    print(f"  Median evidence per query: {sorted(evidence_per_query.values())[len(evidence_per_query)//2]}")
    
    return all_evidence, evidence_texts

# %%
# Extract all evidence
all_evidence, evidence_texts = extract_all_evidence_from_dataset(dataset)

# Show sample evidence
print("\nSample evidence items:")
for i in range(min(3, len(all_evidence))):
    ev = all_evidence[i]
    print(f"\n  Evidence {i+1}:")
    print(f"    Query ID: {ev['query_id']}")
    print(f"    Doc: {ev['doc_name']}, Page: {ev['page_number']}")
    print(f"    Text (first 100 chars): {ev['evidence_text'][:100]}...")

# %% [markdown]
# ## 3.2 Pre-compute Evidence Embeddings
# 
# This is a critical optimization:
# - Without pre-computation: 150 queries × avg 1.5 evidence × encoding time
# - With pre-computation: Encode once, reuse 150 times
# - Estimated time savings: ~98%

# %%
def compute_evidence_embeddings(
    evidence_texts: List[str],
    model: SentenceTransformer,
    batch_size: int = 32
) -> np.ndarray:
    """
    Pre-compute embeddings for all evidence texts.
    
    Args:
        evidence_texts: List of evidence text strings
        model: Sentence-BERT model
        batch_size: Number of texts to encode at once (larger = faster but more memory)
        
    Returns:
        numpy array of shape (n_evidence, embedding_dim)
        
    Notes:
        - Processes in batches for efficiency
        - Shows progress bar
        - Uses CPU by default (can be moved to GPU if available)
    """
    print("\n" + "="*60)
    print("COMPUTING EVIDENCE EMBEDDINGS")
    print("="*60)
    
    print(f"\nEncoding {len(evidence_texts)} evidence texts...")
    print(f"  Batch size: {batch_size}")
    print(f"  Embedding dimension: {model.get_sentence_embedding_dimension()}")
    
    # Encode all texts with progress bar
    # show_progress_bar=True displays tqdm progress
    embeddings = model.encode(
        evidence_texts,
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_numpy=True
    )
    
    print(f"\n✓ Embeddings computed")
    print(f"  Shape: {embeddings.shape}")
    print(f"  Memory: {embeddings.nbytes / 1024 / 1024:.2f} MB")
    
    return embeddings

# %%
# Compute embeddings
evidence_embeddings = compute_evidence_embeddings(
    evidence_texts=evidence_texts,
    model=sbert_model,
    batch_size=32
)

# %% [markdown]
# ## 3.3 Create Evidence Lookup Structure
# 
# Create a convenient structure to look up evidence by query_id

# %%
def create_evidence_lookup(all_evidence: List[Dict], evidence_embeddings: np.ndarray) -> Dict:
    """
    Create a lookup dictionary mapping query_id to evidence items with embeddings.
    
    Structure:
    {
        'query_id_1': [
            {
                'evidence_index': 0,
                'doc_name': 'DOC_NAME',
                'page_number': 60,
                'evidence_text': 'text...',
                'embedding': numpy array
            },
            ...
        ],
        ...
    }
    
    This allows fast lookup: evidence_lookup[query_id] returns all evidence for that query
    """
    print("\n" + "="*60)
    print("CREATING EVIDENCE LOOKUP STRUCTURE")
    print("="*60)
    
    lookup = defaultdict(list)
    
    print(f"\nBuilding lookup for {len(all_evidence)} evidence items...")
    
    for i, evidence_item in enumerate(all_evidence):
        query_id = evidence_item['query_id']
        
        # Add embedding to evidence item
        evidence_with_embedding = evidence_item.copy()
        evidence_with_embedding['embedding'] = evidence_embeddings[i]
        
        lookup[query_id].append(evidence_with_embedding)
    
    print(f"✓ Lookup created for {len(lookup)} queries")
    
    # Verify
    sample_query_id = list(lookup.keys())[0]
    print(f"\nVerification - Sample query: {sample_query_id}")
    print(f"  Evidence items: {len(lookup[sample_query_id])}")
    print(f"  First evidence embedding shape: {lookup[sample_query_id][0]['embedding'].shape}")
    
    return dict(lookup)

# %%
# Create lookup
evidence_lookup = create_evidence_lookup(all_evidence, evidence_embeddings)

# %% [markdown]
# ## 3.4 Test Evidence Lookup
# 
# Verify we can retrieve evidence for any query

# %%
def test_evidence_lookup(dataset, evidence_lookup):
    """
    Test that evidence lookup works correctly.
    """
    print("\n" + "="*60)
    print("TESTING EVIDENCE LOOKUP")
    print("="*60)
    
    # Test with first query
    sample_record = dataset[0]
    query_id = sample_record['financebench_id']
    
    print(f"\nTest query: {query_id}")
    print(f"  Question: {sample_record['question'][:100]}...")
    
    # Retrieve from lookup
    evidence_items = evidence_lookup.get(query_id, [])
    
    print(f"\n✓ Retrieved {len(evidence_items)} evidence items")
    
    for i, ev in enumerate(evidence_items):
        print(f"\n  Evidence {i+1}:")
        print(f"    Doc: {ev['doc_name']}, Page: {ev['page_number']}")
        print(f"    Text (first 80 chars): {ev['evidence_text'][:80]}...")
        print(f"    Embedding shape: {ev['embedding'].shape}")
        print(f"    Embedding sample (first 5 dims): {ev['embedding'][:5]}")
    
    # Verify count matches original
    original_evidence_count = len(sample_record['evidence'])
    retrieved_evidence_count = len(evidence_items)
    
    if original_evidence_count == retrieved_evidence_count:
        print(f"\n✓ Count matches: {original_evidence_count} evidence items")
    else:
        print(f"\n✗ Count mismatch: {original_evidence_count} vs {retrieved_evidence_count}")
    
    print("\n" + "="*60)
    print("✓ LOOKUP TEST COMPLETE")
    print("="*60)
    
    return True

# %%
# Test lookup
test_evidence_lookup(dataset, evidence_lookup)

# %%
print("\n" + "="*60)
print("✓ STEP 3 COMPLETE!")
print("="*60)
print(f"  ✓ Extracted {len(all_evidence)} evidence items from {len(dataset)} queries")
print(f"  ✓ Computed {evidence_embeddings.shape[0]} embeddings")
print(f"  ✓ Embedding dimension: {evidence_embeddings.shape[1]}")
print(f"  ✓ Memory used: {evidence_embeddings.nbytes / 1024 / 1024:.2f} MB")
print(f"  ✓ Evidence lookup created for {len(evidence_lookup)} queries")
print("  ✓ Ready for evaluation with pre-computed embeddings")
print("="*60)


EXTRACTING EVIDENCE FROM DATASET

Processing 150 queries...


Extracting evidence:   0%|          | 0/150 [00:00<?, ?it/s]


✓ Extracted 189 evidence items
  From 150 queries
  Average evidence per query: 1.26

Evidence distribution:
  Min evidence per query: 1
  Max evidence per query: 3
  Median evidence per query: 1

Sample evidence items:

  Evidence 1:
    Query ID: financebench_id_03029
    Doc: 3M_2018_10K, Page: 60
    Text (first 100 chars): Table of Contents 
3M Company and Subsidiaries
Consolidated Statement of Cash Flow s
Years ended Dec...

  Evidence 2:
    Query ID: financebench_id_04672
    Doc: 3M_2018_10K, Page: 58
    Text (first 100 chars): Table of Contents 
3M Company and Subsidiaries
Consolidated Balance Shee t
At December 31
 
 
 
Dece...

  Evidence 3:
    Query ID: financebench_id_00499
    Doc: 3M_2022_10K, Page: 48
    Text (first 100 chars): 3M Company and Subsidiaries
Consolidated Statement of Income
Years ended December 31
(Millions, exce...

COMPUTING EVIDENCE EMBEDDINGS

Encoding 189 evidence texts...
  Batch size: 32
  Embedding dimension: 384


Batches:   0%|          | 0/6 [00:00<?, ?it/s]


✓ Embeddings computed
  Shape: (189, 384)
  Memory: 0.28 MB

CREATING EVIDENCE LOOKUP STRUCTURE

Building lookup for 189 evidence items...
✓ Lookup created for 150 queries

Verification - Sample query: financebench_id_03029
  Evidence items: 1
  First evidence embedding shape: (384,)

TESTING EVIDENCE LOOKUP

Test query: financebench_id_03029
  Question: What is the FY2018 capital expenditure amount (in USD millions) for 3M? Give a response to the quest...

✓ Retrieved 1 evidence items

  Evidence 1:
    Doc: 3M_2018_10K, Page: 60
    Text (first 80 chars): Table of Contents 
3M Company and Subsidiaries
Consolidated Statement of Cash Fl...
    Embedding shape: (384,)
    Embedding sample (first 5 dims): [ 0.02514487 -0.04951032  0.00813957 -0.02846965 -0.01726394]

✓ Count matches: 1 evidence items

✓ LOOKUP TEST COMPLETE

✓ STEP 3 COMPLETE!
  ✓ Extracted 189 evidence items from 150 queries
  ✓ Computed 189 embeddings
  ✓ Embedding dimension: 384
  ✓ Memory used: 0.28 MB
  ✓ Evidence 

In [6]:
# ============================================================================
# Step 4: Helper Functions - Metadata Extraction and Vector Store Loading
# ============================================================================

# %% [markdown]
# ## 4.1 Metadata Extraction Functions
# 
# These functions extract document name and page number from retrieved chunks

# %%
def extract_doc_name_from_path(file_path: str) -> str:
    """
    Extract document name from file path.
    
    Example:
        "../../documents/3M_2018_10K.pdf" → "3M_2018_10K"
    
    Args:
        file_path: Full path to document
        
    Returns:
        Document name without extension
    """
    return Path(file_path).stem


def extract_metadata_from_retrieved_doc(doc) -> Dict:
    """
    Extract metadata from a retrieved LangChain document.
    
    FIXED: Correctly extracts from ChromaDB metadata structure:
        - file_path: Full path to PDF
        - source: Page number (as integer or string)
    
    Args:
        doc: LangChain Document object from vectorstore.similarity_search()
        
    Returns:
        Dict with:
            - doc_name: Document name (e.g., "3M_2018_10K")
            - page_number: Page number (integer, 0-indexed from ChromaDB)
            - chunk_text: The chunk content
    """
    metadata = doc.metadata
    
    # Extract file path and convert to doc name
    file_path = metadata.get('file_path', '')
    doc_name = extract_doc_name_from_path(file_path)
    
    # Extract page number from 'source' field
    page_num = metadata.get('source', 0)
    
    # Ensure page_num is an integer
    if isinstance(page_num, str):
        try:
            page_num = int(page_num)
        except ValueError:
            page_num = 0
    
    return {
        'doc_name': doc_name,
        'page_number': page_num,  # Keep 0-indexed as stored in ChromaDB
        'chunk_text': doc.page_content
    }

print("✓ Metadata extraction functions defined")

# %% [markdown]
# ## 4.2 Embedding Function Factory
# 
# Creates the appropriate embedding function based on provider

# %%
def get_embedding_function(provider: str, model: str):
    """
    Get embedding function for vector store loading.
    
    Args:
        provider: "ollama", "openai", or "voyage"
        model: Model name (e.g., "nomic-embed-text", "text-embedding-3-small")
        
    Returns:
        Embedding function compatible with LangChain/ChromaDB
        
    Raises:
        ValueError: If provider is unknown
    """
    if provider == "ollama":
        return OllamaEmbeddings(
            model=model,
            base_url=OLLAMA_BASE_URL
        )
    elif provider == "openai":
        return OpenAIEmbeddings(
            model=model,
            openai_api_key=OPENAI_API_KEY
        )
    elif provider == "voyage":
        return VoyageAIEmbeddings(
            model=model,
            voyage_api_key=VOYAGE_API_KEY
        )
    else:
        raise ValueError(f"Unknown provider: {provider}")

print("✓ Embedding function factory defined")

# %% [markdown]
# ## 4.3 Vector Store Loading
# 
# Load pre-built vector databases from disk

# %%
def load_vectorstore(
    provider: str,
    model: str,
    chunk_size: int,
    base_dir: str = VECTOR_DB_BASE_DIR,
    collection_prefix: str = COLLECTION_PREFIX
) -> Chroma:
    """
    Load a pre-built vector store from disk.
    
    The vector databases were created by build_vectore_database.ipynb
    and stored in a specific directory structure:
    
    {base_dir}/{provider}_{model}/financebench_docs_chunk_{chunk_size}/
    
    Args:
        provider: "ollama", "openai", or "voyage"
        model: Model name
        chunk_size: Chunk size (256, 512, 1024, 2048, 4096)
        base_dir: Base directory for vector databases
        collection_prefix: Prefix for collection names
        
    Returns:
        Loaded ChromaDB vectorstore
        
    Example:
        vectorstore = load_vectorstore("voyage", "voyage-finance-2", 1024)
    """
    # Construct paths
    model_id = f"{provider}_{model.replace('/', '_')}"
    db_path = os.path.join(base_dir, model_id)
    collection_name = f"{collection_prefix}{chunk_size}"
    
    # Get embedding function
    emb_fn = get_embedding_function(provider, model)
    
    # Load vectorstore
    vectorstore = Chroma(
        collection_name=collection_name,
        embedding_function=emb_fn,
        persist_directory=db_path
    )
    
    return vectorstore

print("✓ Vector store loading function defined")

# %% [markdown]
# ## 4.4 Test Vector Store Loading
# 
# Verify we can load a vector store and retrieve documents

# %%
def test_vectorstore_loading():
    """
    Test loading a vector store and performing a sample retrieval.
    """
    print("\n" + "="*60)
    print("TESTING VECTOR STORE LOADING")
    print("="*60)
    
    # Test with a common configuration
    test_provider = "voyage"
    test_model = "voyage-finance-2"
    test_chunk_size = 1024
    
    print(f"\nTest configuration:")
    print(f"  Provider: {test_provider}")
    print(f"  Model: {test_model}")
    print(f"  Chunk size: {test_chunk_size}")
    
    try:
        # Load vectorstore
        print("\nLoading vectorstore...")
        vectorstore = load_vectorstore(test_provider, test_model, test_chunk_size)
        
        # Check collection
        doc_count = vectorstore._collection.count()
        print(f"✓ Vectorstore loaded")
        print(f"  Documents in collection: {doc_count:,}")
        
        # Test retrieval
        print("\nTesting retrieval...")
        test_query = "What was the revenue in 2018?"
        results = vectorstore.similarity_search(test_query, k=3)
        
        print(f"✓ Retrieved {len(results)} documents")
        
        # Show sample result
        print("\nSample retrieved document:")
        sample_doc = results[0]
        metadata = extract_metadata_from_retrieved_doc(sample_doc)
        
        print(f"  Doc name: {metadata['doc_name']}")
        print(f"  Page number: {metadata['page_number']}")
        print(f"  Chunk text (first 150 chars): {metadata['chunk_text'][:150]}...")
        print(f"  Chunk text length: {len(metadata['chunk_text'])} characters")
        
        print("\n" + "="*60)
        print("✓ VECTOR STORE TEST COMPLETE")
        print("="*60)
        
        return True
        
    except Exception as e:
        print(f"\n✗ Error during test: {e}")
        print("\nPossible issues:")
        print("  1. Vector database doesn't exist for this configuration")
        print("  2. Path is incorrect")
        print("  3. ChromaDB version mismatch")
        print(f"\nExpected path: {VECTOR_DB_BASE_DIR}/{test_provider}_{test_model}/")
        return False

# %%
# Run test
test_result = test_vectorstore_loading()

# %%
print("\n" + "="*60)
print("✓ STEP 4 COMPLETE!")
print("="*60)
print("  ✓ Metadata extraction functions defined")
print("  ✓ Embedding function factory defined")
print("  ✓ Vector store loading function defined")
print("  ✓ Vector store loading tested successfully")
print("  ✓ Ready to perform retrievals with chunk text extraction")
print("="*60)

✓ Metadata extraction functions defined
✓ Embedding function factory defined
✓ Vector store loading function defined

TESTING VECTOR STORE LOADING

Test configuration:
  Provider: voyage
  Model: voyage-finance-2
  Chunk size: 1024

Loading vectorstore...


  vectorstore = Chroma(


✓ Vectorstore loaded
  Documents in collection: 15,765

Testing retrieval...
✓ Retrieved 3 documents

Sample retrieved document:
  Doc name: 3M_2018_10K
  Page number: 59
  Chunk text (first 150 chars): 44 per share, Note 8)
 
 
(3,193) 
 
  
 
(3,193) 
 
  
 
  
 
  
Transfer of ownership involving non-wholly owned subsidiaries
 
 
 —  
 
  
 
14  
 ...
  Chunk text length: 951 characters

✓ VECTOR STORE TEST COMPLETE

✓ STEP 4 COMPLETE!
  ✓ Metadata extraction functions defined
  ✓ Embedding function factory defined
  ✓ Vector store loading function defined
  ✓ Vector store loading tested successfully
  ✓ Ready to perform retrievals with chunk text extraction


In [7]:
# ============================================================================
# Step 5: Helper Functions - Page-Based Evaluation
# ============================================================================

# %% [markdown]
# ## 5.1 Page-Based Matching Function
# 
# This function checks if a retrieved chunk matches evidence based on page numbers

# %%
def check_page_match(
    retrieved_doc: Dict, 
    evidence_list: List[Dict],
    chunk_size: int = 512,
    use_page_tolerance: bool = True
) -> bool:
    """
    Check if retrieved document matches any evidence based on PAGE NUMBERS.
    
    Uses chunk-size-aware page tolerance:
    - Larger chunks can span multiple pages
    - Retrieved page must be BEFORE or AT evidence page (within tolerance)
    - Retrieved page AFTER evidence page = no match
    
    Page tolerance (when use_page_tolerance=True):
    - chunk_size <= 512: tolerance = 0 (exact match)
    - chunk_size 513-1024: tolerance = 1
    - chunk_size 1025-2048: tolerance = 2
    - chunk_size > 2048: tolerance = 2
    
    Args:
        retrieved_doc: Dict with 'doc_name' and 'page_number' (1-indexed)
        evidence_list: List of evidence dicts (page_number is 1-indexed)
        chunk_size: Chunk size for tolerance calculation
        use_page_tolerance: If True, use tolerance; if False, exact match only
        
    Returns:
        True if match found, False otherwise
        
    Example:
        Evidence on page 50, chunk_size=1024, tolerance=1:
        - Page 49: MATCH (within tolerance, before evidence)
        - Page 50: MATCH (exact match)
        - Page 51: NO MATCH (after evidence page)
    """
    retrieved_doc_name = retrieved_doc['doc_name']
    retrieved_page = retrieved_doc['page_number']
    
    # Calculate page tolerance based on chunk size
    if use_page_tolerance:
        if chunk_size <= 512:
            page_tolerance = 0
        elif chunk_size <= 1024:
            page_tolerance = 1
        elif chunk_size <= 2048:
            page_tolerance = 2
        else:
            page_tolerance = 2
    else:
        page_tolerance = 0  # Exact match only
    
    # Check against all evidence items
    for evidence in evidence_list:
        evidence_doc_name = evidence['doc_name']
        evidence_page = evidence['page_number']  # Already 1-indexed from evidence_lookup
        
        # Check document name match
        if retrieved_doc_name != evidence_doc_name:
            continue
        
        # Check page match with tolerance
        # Only match if retrieved page is BEFORE or AT evidence page
        if retrieved_page <= evidence_page <= retrieved_page + page_tolerance:
            return True
    
    return False

print("✓ Page-based matching function defined")

# %% [markdown]
# ## 5.2 Page-Based MRR Calculation
# 
# Calculate Mean Reciprocal Rank based on page matching

# %%
def calculate_page_mrr_for_query(
    retrieved_docs: List[Dict], 
    evidence_list: List[Dict],
    chunk_size: int = 512,
    use_page_tolerance: bool = True
) -> Tuple[float, int]:
    """
    Calculate PAGE-BASED MRR for a single query.
    
    Finds the rank of the first retrieved document that matches
    any evidence based on page numbers.
    
    Args:
        retrieved_docs: List of retrieved docs with 'doc_name', 'page_number'
        evidence_list: List of evidence items from evidence_lookup
        chunk_size: Chunk size for tolerance calculation
        use_page_tolerance: If True, use chunk-size-aware tolerance
        
    Returns:
        Tuple of (mrr_score, rank):
        - mrr_score: 1/rank if found, 0 if not found
        - rank: Position of first match (1-indexed), -1 if not found
        
    Example:
        First match at position 3: mrr_score = 1/3 = 0.333, rank = 3
        No match found: mrr_score = 0.0, rank = -1
    """
    for rank, retrieved_doc in enumerate(retrieved_docs, start=1):
        if check_page_match(retrieved_doc, evidence_list, chunk_size, use_page_tolerance):
            mrr_score = 1.0 / rank
            return mrr_score, rank
    
    # No match found
    return 0.0, -1

print("✓ Page-based MRR calculation defined")

# %% [markdown]
# ## 5.3 Page-Based Recall, Precision, and F1
# 
# NEW: Calculate precision, recall, and F1 based on page matching

# %%
def calculate_page_metrics_for_query(
    retrieved_docs: List[Dict],
    evidence_list: List[Dict],
    chunk_size: int = 512,
    use_page_tolerance: bool = True
) -> Tuple[float, float, float]:
    """
    Calculate PAGE-BASED Recall, Precision, and F1 for a single query.
    
    Recall: What proportion of evidence pages were found in retrieved chunks?
        recall = (# evidence items matched) / (# total evidence items)
    
    Precision: What proportion of retrieved chunks matched evidence?
        precision = (# retrieved chunks matching evidence) / (# total retrieved chunks)
    
    F1: Harmonic mean of precision and recall
        f1 = 2 × (precision × recall) / (precision + recall)
    
    Args:
        retrieved_docs: List of retrieved docs with 'doc_name', 'page_number'
        evidence_list: List of evidence items from evidence_lookup
        chunk_size: Chunk size for tolerance calculation
        use_page_tolerance: If True, use chunk-size-aware tolerance
        
    Returns:
        Tuple of (recall, precision, f1)
        
    Example:
        Evidence items: 2 (pages 50, 75)
        Retrieved: 20 chunks
        Matches: Found page 50 in 2 chunks, page 75 in 1 chunk
        
        Evidence matched: {page 50, page 75} = 2 unique evidence
        Chunks matching: 3 chunks matched at least one evidence
        
        Recall = 2/2 = 1.0 (found all evidence)
        Precision = 3/20 = 0.15 (3 out of 20 chunks matched)
        F1 = 2 × (1.0 × 0.15) / (1.0 + 0.15) = 0.26
    """
    if len(evidence_list) == 0:
        return 0.0, 0.0, 0.0
    
    if len(retrieved_docs) == 0:
        return 0.0, 0.0, 0.0
    
    # Track which evidence items were found
    evidence_found = set()  # Set of evidence indices that were matched
    
    # Track which retrieved chunks matched at least one evidence
    chunks_matching = 0
    
    # Check each retrieved chunk
    for retrieved_doc in retrieved_docs:
        chunk_matched_any_evidence = False
        
        # Check against each evidence item
        for evidence_idx, evidence in enumerate(evidence_list):
            # Create single-item list for check_page_match
            if check_page_match(retrieved_doc, [evidence], chunk_size, use_page_tolerance):
                evidence_found.add(evidence_idx)
                chunk_matched_any_evidence = True
        
        if chunk_matched_any_evidence:
            chunks_matching += 1
    
    # Calculate metrics
    recall = len(evidence_found) / len(evidence_list)
    precision = chunks_matching / len(retrieved_docs)
    
    # Calculate F1
    if precision + recall > 0:
        f1 = 2 * (precision * recall) / (precision + recall)
    else:
        f1 = 0.0
    
    return recall, precision, f1

print("✓ Page-based metrics (recall, precision, F1) defined")

# %% [markdown]
# ## 5.4 Test Page-Based Evaluation Functions
# 
# Verify all page-based metrics work correctly

# %%
def test_page_based_evaluation():
    """
    Test page-based evaluation functions with sample data.
    """
    print("\n" + "="*60)
    print("TESTING PAGE-BASED EVALUATION")
    print("="*60)
    
    # Create sample evidence (using 1-indexed pages)
    evidence_list = [
        {'doc_name': 'TEST_DOC', 'page_number': 50},
        {'doc_name': 'TEST_DOC', 'page_number': 75}
    ]
    
    # Create sample retrieved documents
    retrieved_docs = [
        {'doc_name': 'OTHER_DOC', 'page_number': 10},  # No match - wrong doc
        {'doc_name': 'TEST_DOC', 'page_number': 50},   # MATCH - exact evidence page 50
        {'doc_name': 'TEST_DOC', 'page_number': 49},   # MATCH - within tolerance of page 50
        {'doc_name': 'TEST_DOC', 'page_number': 30},   # No match - not near evidence
        {'doc_name': 'TEST_DOC', 'page_number': 75},   # MATCH - exact evidence page 75
    ]
    
    chunk_size = 1024  # tolerance = 1
    
    print("\nTest setup:")
    print(f"  Evidence pages: [50, 75]")
    print(f"  Retrieved pages: [10 (OTHER_DOC), 50, 49, 30, 75]")
    print(f"  Chunk size: {chunk_size} (tolerance = 1)")
    
    # Test MRR
    print("\n--- Page-Based MRR ---")
    mrr_score, rank = calculate_page_mrr_for_query(
        retrieved_docs, evidence_list, chunk_size, use_page_tolerance=True
    )
    print(f"  First match at rank: {rank}")
    print(f"  MRR score: {mrr_score:.4f}")
    print(f"  Expected: rank=2 (second doc matches page 50), MRR=0.5000")
    
    # Test Recall, Precision, F1
    print("\n--- Page-Based Recall, Precision, F1 ---")
    recall, precision, f1 = calculate_page_metrics_for_query(
        retrieved_docs, evidence_list, chunk_size, use_page_tolerance=True
    )
    print(f"  Recall: {recall:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  F1: {f1:.4f}")
    
    print("\n  Expected calculations:")
    print("    Evidence found: {page 50, page 75} = 2/2 evidence items")
    print("    Chunks matching: 3 chunks (pages 50, 49, 75) matched evidence")
    print("    Recall = 2/2 = 1.0000")
    print("    Precision = 3/5 = 0.6000")
    print("    F1 = 2 × (1.0 × 0.6) / (1.0 + 0.6) = 0.7500")
    
    # Verify results
    print("\n--- Verification ---")
    if rank == 2 and abs(mrr_score - 0.5) < 0.001:
        print("  ✓ MRR calculation correct")
    else:
        print("  ✗ MRR calculation incorrect")
    
    if abs(recall - 1.0) < 0.001 and abs(precision - 0.6) < 0.001 and abs(f1 - 0.75) < 0.001:
        print("  ✓ Recall, Precision, F1 calculations correct")
    else:
        print("  ✗ Metrics calculation incorrect")
    
    print("\n" + "="*60)
    print("✓ PAGE-BASED EVALUATION TEST COMPLETE")
    print("="*60)
    
    return True

# %%
# Run test
test_page_based = test_page_based_evaluation()

# %%
print("\n" + "="*60)
print("✓ STEP 5 COMPLETE!")
print("="*60)
print("  ✓ Page-based matching function defined")
print("  ✓ Page-based MRR calculation defined")
print("  ✓ Page-based Recall, Precision, F1 calculation defined")
print("  ✓ All page-based functions tested successfully")
print("  ✓ Ready to implement text-based evaluation")
print("="*60)

✓ Page-based matching function defined
✓ Page-based MRR calculation defined
✓ Page-based metrics (recall, precision, F1) defined

TESTING PAGE-BASED EVALUATION

Test setup:
  Evidence pages: [50, 75]
  Retrieved pages: [10 (OTHER_DOC), 50, 49, 30, 75]
  Chunk size: 1024 (tolerance = 1)

--- Page-Based MRR ---
  First match at rank: 2
  MRR score: 0.5000
  Expected: rank=2 (second doc matches page 50), MRR=0.5000

--- Page-Based Recall, Precision, F1 ---
  Recall: 1.0000
  Precision: 0.6000
  F1: 0.7500

  Expected calculations:
    Evidence found: {page 50, page 75} = 2/2 evidence items
    Chunks matching: 3 chunks (pages 50, 49, 75) matched evidence
    Recall = 2/2 = 1.0000
    Precision = 3/5 = 0.6000
    F1 = 2 × (1.0 × 0.6) / (1.0 + 0.6) = 0.7500

--- Verification ---
  ✓ MRR calculation correct
  ✓ Recall, Precision, F1 calculations correct

✓ PAGE-BASED EVALUATION TEST COMPLETE

✓ STEP 5 COMPLETE!
  ✓ Page-based matching function defined
  ✓ Page-based MRR calculation defined
 

In [8]:
# ============================================================================
# Step 6: Helper Functions - Text-Based Evaluation
# ============================================================================

# %% [markdown]
# ## 6.1 Chunk Text Preview Formatting
# 
# Format chunk text as "first N chars...last N chars" for JSON storage

# %%
def format_chunk_text_preview(
    text: str,
    prefix_chars: int = CHUNK_TEXT_PREFIX_CHARS,
    suffix_chars: int = CHUNK_TEXT_SUFFIX_CHARS
) -> str:
    """
    Format chunk text as abbreviated preview for JSON storage.
    
    Format: "first N characters...last N characters"
    
    This keeps JSON files manageable while providing enough context
    to manually verify matches.
    
    Args:
        text: Full chunk text
        prefix_chars: Number of characters from start
        suffix_chars: Number of characters from end
        
    Returns:
        Formatted preview string
        
    Examples:
        Short text (< prefix + suffix): Returns full text
        Long text: "Capital expenditures were $1,577...in fiscal year 2018."
    """
    if len(text) <= prefix_chars + suffix_chars:
        # Text is short enough, return as-is
        return text
    
    # Extract prefix and suffix
    prefix = text[:prefix_chars]
    suffix = text[-suffix_chars:]
    
    # Format with ellipsis
    return f"{prefix}...{suffix}"

print("✓ Chunk text preview formatting defined")

# %% [markdown]
# ## 6.2 Cosine Similarity Calculation
# 
# Calculate cosine similarity between chunk and evidence embeddings

# %%
def compute_cosine_similarity(
    chunk_embedding: np.ndarray,
    evidence_embedding: np.ndarray
) -> float:
    """
    Calculate cosine similarity between two embeddings.
    
    Cosine similarity ranges from -1 to 1:
    - 1.0: Identical/very similar
    - 0.7-0.9: Strong similarity
    - 0.5-0.7: Moderate similarity
    - 0.0-0.5: Weak/no similarity
    - Negative: Opposite meaning (rare in practice)
    
    Args:
        chunk_embedding: Embedding vector for retrieved chunk (384-dim)
        evidence_embedding: Embedding vector for evidence (384-dim)
        
    Returns:
        Cosine similarity score (float)
        
    Note:
        sklearn's cosine_similarity expects 2D arrays, so we reshape
    """
    # Reshape to 2D arrays: (1, 384)
    chunk_emb_2d = chunk_embedding.reshape(1, -1)
    evidence_emb_2d = evidence_embedding.reshape(1, -1)
    
    # Calculate similarity
    similarity = cosine_similarity(chunk_emb_2d, evidence_emb_2d)[0][0]
    
    return float(similarity)

print("✓ Cosine similarity calculation defined")

# %% [markdown]
# ## 6.3 Calculate Text Similarities for Retrieved Chunk
# 
# For each retrieved chunk, calculate similarity with ALL evidence items

# %%
def calculate_text_similarities_for_chunk(
    chunk_text: str,
    chunk_doc_name: str,
    evidence_items: List[Dict],
    sbert_model: SentenceTransformer
) -> List[Dict]:
    """
    Calculate cosine similarity between a chunk and all evidence items.
    
    Args:
        chunk_text: Text content of retrieved chunk
        evidence_items: List of evidence items (each has 'embedding', 'doc_name', 'page_number')
        sbert_model: Sentence-BERT model for encoding chunk
        
    Returns:
        List of similarity results:
        [
            {
                'evidence_index': 0,
                'evidence_doc': 'DOC_NAME',
                'evidence_page': 60,
                'cosine_similarity': 0.7823
            },
            ...
        ]
        
    Note:
        Evidence embeddings are pre-computed, so we only encode the chunk once
    """
    # Encode chunk text
    chunk_embedding = sbert_model.encode(chunk_text, convert_to_numpy=True)
    
    # Calculate similarity with each evidence
    similarities = []
    
    for evidence_idx, evidence in enumerate(evidence_items):
        # if doc names match, compute similarity; else 0.0
        if chunk_doc_name == evidence['doc_name']:
            similarity_score = compute_cosine_similarity(
                chunk_embedding,
                evidence['embedding']
            )
        else:
            similarity_score = 0.0  # No similarity if different documents
        
        similarities.append({
            'evidence_index': evidence_idx,
            'evidence_doc': evidence['doc_name'],
            'evidence_page': evidence['page_number'],
            'cosine_similarity': similarity_score
        })
    
    return similarities

print("✓ Text similarities calculation for chunk defined")

# %% [markdown]
# ## 6.4 Text-Based Metrics Calculation
# 
# Calculate text-based MRR, Recall, Precision, and F1

# %%
def calculate_text_metrics_for_query(
    retrieved_docs: List[Dict],
    evidence_items: List[Dict],
    sbert_model: SentenceTransformer,
    threshold: float = TEXT_SIMILARITY_THRESHOLD
) -> Tuple[float, int, float, float, float, List[List[Dict]]]:
    """
    Calculate TEXT-BASED metrics for a single query.
    
    For each retrieved chunk:
    1. Encode chunk text
    2. Calculate similarity with all evidence
    3. Determine if chunk matches (max_similarity >= threshold)
    
    Metrics:
    - Text MRR: Rank of first chunk where max(similarities) >= threshold
    - Text Recall: # evidence matched / # total evidence
    - Text Precision: # chunks matching / # total chunks
    - Text F1: Harmonic mean of precision and recall
    
    Args:
        retrieved_docs: List of retrieved docs with 'chunk_text'
        evidence_items: List of evidence items with 'embedding'
        sbert_model: Sentence-BERT model for encoding chunks
        threshold: Similarity threshold for matching (default: 0.7)
        
    Returns:
        Tuple of (text_mrr, text_rank, text_recall, text_precision, text_f1, all_similarities)
        - text_mrr: MRR score (0.0 if no match)
        - text_rank: Rank of first match (-1 if no match)
        - text_recall: Proportion of evidence found
        - text_precision: Proportion of chunks matching
        - text_f1: F1 score
        - all_similarities: List of similarity lists for each chunk (for JSON storage)
        
    Example:
        Evidence: 2 items
        Retrieved: 20 chunks
        Chunk 5 has max_similarity=0.82 with evidence[0] (FIRST MATCH)
        Chunk 12 has max_similarity=0.75 with evidence[1]
        
        text_mrr = 1/5 = 0.2
        text_rank = 5
        evidence_matched = {evidence[0], evidence[1]} = 2
        chunks_matching = 2
        text_recall = 2/2 = 1.0
        text_precision = 2/20 = 0.1
        text_f1 = 2 × (1.0 × 0.1) / (1.0 + 0.1) = 0.18
    """
    if len(evidence_items) == 0 or len(retrieved_docs) == 0:
        return 0.0, -1, 0.0, 0.0, 0.0, []
    
    # Track results
    all_similarities = []  # Store all similarities for JSON
    text_mrr = 0.0
    text_rank = -1
    evidence_found = set()  # Set of evidence indices matched
    chunks_matching = 0
    
    # Process each retrieved chunk
    for rank, retrieved_doc in enumerate(retrieved_docs, start=1):
        chunk_text = retrieved_doc.get('chunk_text', '')
        chunk_doc_name = retrieved_doc.get('doc_name', '')
        
        if not chunk_text:
            # No text available
            all_similarities.append([])
            continue
        
        # Calculate similarities with all evidence
        similarities = calculate_text_similarities_for_chunk(
            chunk_text,
            chunk_doc_name,
            evidence_items,
            sbert_model
        )
        
        all_similarities.append(similarities)
        
        # Find maximum similarity
        max_similarity = max([s['cosine_similarity'] for s in similarities])
        
        # Check if this chunk matches (above threshold)
        chunk_matches_any_evidence = (max_similarity >= threshold)
        
        if chunk_matches_any_evidence:
            chunks_matching += 1
            
            # Record which evidence items this chunk matched
            for i, sim in enumerate(similarities):
                if sim['cosine_similarity'] >= threshold:
                    evidence_found.add(i)
            
            # Check for MRR (first match)
            if text_mrr == 0.0:  # First match found
                text_mrr = 1.0 / rank
                text_rank = rank
    
    # Calculate recall and precision
    text_recall = len(evidence_found) / len(evidence_items)
    text_precision = chunks_matching / len(retrieved_docs)
    
    # Calculate F1
    if text_precision + text_recall > 0:
        text_f1 = 2 * (text_precision * text_recall) / (text_precision + text_recall)
    else:
        text_f1 = 0.0
    
    return text_mrr, text_rank, text_recall, text_precision, text_f1, all_similarities

print("✓ Text-based metrics calculation defined")

# %% [markdown]
# ## 6.5 Test Text-Based Evaluation Functions
# 
# Verify text-based metrics work correctly with sample data

# %%
def test_text_based_evaluation():
    """
    Test text-based evaluation functions with real FinanceBench data.
    """
    print("\n" + "="*60)
    print("TESTING TEXT-BASED EVALUATION")
    print("="*60)
    
    # Get sample query
    sample_record = dataset[0]
    query_id = sample_record['financebench_id']
    
    print(f"\nTest query: {query_id}")
    print(f"  Question: {sample_record['question'][:100]}...")
    
    # Get evidence for this query
    evidence_items = evidence_lookup[query_id]
    print(f"\n  Evidence items: {len(evidence_items)}")
    for i, ev in enumerate(evidence_items):
        print(f"    {i+1}. {ev['doc_name']}, page {ev['page_number']}")
        print(f"       Text (first 80 chars): {ev['evidence_text'][:80]}...")
    
    # Create sample retrieved chunks
    # Chunk 1: Contains exact evidence text (should have very high similarity)
    chunk1_text = evidence_items[0]['evidence_text']
    
    # Chunk 2: Paraphrased financial content (moderate similarity)
    chunk2_text = "The company's capital spending was approximately $1.6 billion for the fiscal year."
    
    # Chunk 3: Different financial topic (low similarity)
    chunk3_text = "Revenue increased by 8% year-over-year driven by strong product sales."
    
    # Chunk 4: Unrelated content (very low similarity)
    chunk4_text = "The weather forecast predicts sunny skies for the weekend."
    
    retrieved_docs = [
        {'chunk_text': chunk1_text},
        {'chunk_text': chunk2_text},
        {'chunk_text': chunk3_text},
        {'chunk_text': chunk4_text}
    ]
    
    print("\n  Retrieved chunks: 4")
    print("    1. Exact evidence text")
    print("    2. Paraphrased financial content")
    print("    3. Different financial topic")
    print("    4. Unrelated content")
    
    # Calculate text-based metrics
    print(f"\n  Calculating similarities with threshold={TEXT_SIMILARITY_THRESHOLD}...")
    
    text_mrr, text_rank, text_recall, text_precision, text_f1, all_similarities = \
        calculate_text_metrics_for_query(
            retrieved_docs,
            evidence_items,
            sbert_model,
            threshold=TEXT_SIMILARITY_THRESHOLD
        )
    
    # Display results
    print("\n--- Text-Based Metrics ---")
    print(f"  Text MRR: {text_mrr:.4f}")
    print(f"  Text Rank: {text_rank}")
    print(f"  Text Recall: {text_recall:.4f}")
    print(f"  Text Precision: {text_precision:.4f}")
    print(f"  Text F1: {text_f1:.4f}")
    
    # Show similarities for each chunk
    print("\n--- Chunk Similarities ---")
    for i, (chunk, similarities) in enumerate(zip(retrieved_docs, all_similarities), start=1):
        print(f"\n  Chunk {i}:")
        print(f"    Text (first 60 chars): {chunk['chunk_text'][:60]}...")
        for sim in similarities:
            match_status = "✓ MATCH" if sim['cosine_similarity'] >= TEXT_SIMILARITY_THRESHOLD else "✗ NO MATCH"
            print(f"    Evidence {sim['evidence_index']}: {sim['cosine_similarity']:.4f} {match_status}")
    
    # Expected behavior
    print("\n--- Expected Behavior ---")
    print("  Chunk 1 (exact evidence): Should have similarity ~0.99, MATCH")
    print("  Chunk 2 (paraphrased): Should have similarity ~0.7-0.8, likely MATCH")
    print("  Chunk 3 (different topic): Should have similarity ~0.3-0.5, NO MATCH")
    print("  Chunk 4 (unrelated): Should have similarity ~0.1-0.2, NO MATCH")
    
    print("\n" + "="*60)
    print("✓ TEXT-BASED EVALUATION TEST COMPLETE")
    print("="*60)
    
    return True

# %%
# Run test
test_text_based = test_text_based_evaluation()

# %%
print("\n" + "="*60)
print("✓ STEP 6 COMPLETE!")
print("="*60)
print("  ✓ Chunk text preview formatting defined")
print("  ✓ Cosine similarity calculation defined")
print("  ✓ Text similarities for chunks defined")
print("  ✓ Text-based MRR, Recall, Precision, F1 calculation defined")
print("  ✓ All text-based functions tested with real data")
print(f"  ✓ Similarity threshold: {TEXT_SIMILARITY_THRESHOLD}")
print("  ✓ Ready for retrieval functions")
print("="*60)

✓ Chunk text preview formatting defined
✓ Cosine similarity calculation defined
✓ Text similarities calculation for chunk defined
✓ Text-based metrics calculation defined

TESTING TEXT-BASED EVALUATION

Test query: financebench_id_03029
  Question: What is the FY2018 capital expenditure amount (in USD millions) for 3M? Give a response to the quest...

  Evidence items: 1
    1. 3M_2018_10K, page 60
       Text (first 80 chars): Table of Contents 
3M Company and Subsidiaries
Consolidated Statement of Cash Fl...

  Retrieved chunks: 4
    1. Exact evidence text
    2. Paraphrased financial content
    3. Different financial topic
    4. Unrelated content

  Calculating similarities with threshold=0.8...

--- Text-Based Metrics ---
  Text MRR: 0.0000
  Text Rank: -1
  Text Recall: 0.0000
  Text Precision: 0.0000
  Text F1: 0.0000

--- Chunk Similarities ---

  Chunk 1:
    Text (first 60 chars): Table of Contents 
3M Company and Subsidiaries
Consolidated ...
    Evidence 0: 0.0000 ✗ NO MA

In [9]:
# ============================================================================
# Step 7: Retrieval Functions
# ============================================================================

# %% [markdown]
# ## 7.1 Global Retrieval
# 
# Retrieve documents from the entire corpus (all documents)

# %%
def retrieve_global(
    vectorstore: Chroma,
    query: str,
    k: int
) -> List[Dict]:
    """
    Retrieve documents globally (search all documents in the corpus).
    
    This mode searches across all 84 documents in FinanceBench.
    Use case: Testing if the system can identify the correct document
    among many documents.
    
    Args:
        vectorstore: Loaded ChromaDB vectorstore
        query: Query text
        k: Number of documents to retrieve
        
    Returns:
        List of retrieved documents with metadata:
        [
            {
                'doc_name': 'DOC_NAME',
                'page_number': 60,
                'rank': 1,
                'chunk_text': 'Full chunk text...'
            },
            ...
        ]
        
    Note:
        Results are ordered by similarity score (most similar first)
    """
    # Perform similarity search
    results = vectorstore.similarity_search(query, k=k)
    
    # Extract metadata and add rank
    retrieved = []
    for rank, doc in enumerate(results, start=1):
        metadata = extract_metadata_from_retrieved_doc(doc)
        metadata['rank'] = rank
        retrieved.append(metadata)
    
    return retrieved

print("✓ Global retrieval function defined")

# %% [markdown]
# ## 7.2 Single-Document Retrieval
# 
# Retrieve documents filtered to a specific target document

# %%
def retrieve_single_doc(
    vectorstore: Chroma,
    query: str,
    target_doc_name: str,
    k: int
) -> List[Dict]:
    """
    Retrieve documents filtered to a single target document.
    
    This mode assumes we already know which document contains the answer
    and only searches within that document.
    Use case: Testing passage retrieval accuracy when document is known.
    
    Implementation:
        ChromaDB doesn't support substring matching in filters, so we:
        1. Retrieve more documents (k × 10)
        2. Filter to target document
        3. Return top k from filtered results
    
    Args:
        vectorstore: Loaded ChromaDB vectorstore
        query: Query text
        target_doc_name: Target document name (e.g., "3M_2018_10K")
        k: Number of documents to retrieve
        
    Returns:
        List of retrieved documents from target document:
        [
            {
                'doc_name': '3M_2018_10K',
                'page_number': 47,
                'rank': 1,
                'chunk_text': 'Full chunk text...'
            },
            ...
        ]
        
    Note:
        If target document has fewer than k chunks, returns all available chunks
    """
    # Retrieve more documents to ensure we get enough from target doc
    # Factor of 10 is usually sufficient
    retrieve_count = k * 10
    results = vectorstore.similarity_search(query, k=retrieve_count)
    
    # Filter to target document and extract metadata
    filtered = []
    for doc in results:
        metadata = extract_metadata_from_retrieved_doc(doc)
        if metadata['doc_name'] == target_doc_name:
            filtered.append(metadata)
            # Stop once we have enough
            if len(filtered) >= k:
                break
    
    # Take top k from filtered results
    top_k_filtered = filtered[:k]
    
    # Add rank
    for rank, doc_metadata in enumerate(top_k_filtered, start=1):
        doc_metadata['rank'] = rank
    
    return top_k_filtered

print("✓ Single-document retrieval function defined")

# %% [markdown]
# ## 7.3 Test Retrieval Functions
# 
# Verify both retrieval modes work correctly

# %%
def test_retrieval_functions():
    """
    Test both global and single-document retrieval.
    """
    print("\n" + "="*60)
    print("TESTING RETRIEVAL FUNCTIONS")
    print("="*60)
    
    # Test configuration
    test_provider = "voyage"
    test_model = "voyage-finance-2"
    test_chunk_size = 1024
    test_k = 5
    
    print(f"\nTest configuration:")
    print(f"  Provider: {test_provider}")
    print(f"  Model: {test_model}")
    print(f"  Chunk size: {test_chunk_size}")
    print(f"  K: {test_k}")
    
    # Load vectorstore
    print("\nLoading vectorstore...")
    vectorstore = load_vectorstore(test_provider, test_model, test_chunk_size)
    doc_count = vectorstore._collection.count()
    print(f"✓ Loaded ({doc_count:,} documents)")
    
    # Test query
    test_query = "What was the capital expenditure in 2018?"
    print(f"\nTest query: {test_query}")
    
    # Test 1: Global retrieval
    print("\n" + "-"*60)
    print("TEST 1: Global Retrieval")
    print("-"*60)
    
    global_results = retrieve_global(vectorstore, test_query, test_k)
    
    print(f"✓ Retrieved {len(global_results)} documents")
    print("\nTop 3 results:")
    for i, result in enumerate(global_results[:3], start=1):
        print(f"\n  {i}. Rank {result['rank']}")
        print(f"     Doc: {result['doc_name']}")
        print(f"     Page: {result['page_number']}")
        print(f"     Text (first 100 chars): {result['chunk_text'][:100]}...")
        print(f"     Text length: {len(result['chunk_text'])} chars")
    
    # Test 2: Single-document retrieval
    print("\n" + "-"*60)
    print("TEST 2: Single-Document Retrieval")
    print("-"*60)
    
    target_doc = "3M_2018_10K"
    print(f"Target document: {target_doc}")
    
    singledoc_results = retrieve_single_doc(vectorstore, test_query, target_doc, test_k)
    
    print(f"✓ Retrieved {len(singledoc_results)} documents from target")
    print("\nTop 3 results:")
    for i, result in enumerate(singledoc_results[:3], start=1):
        print(f"\n  {i}. Rank {result['rank']}")
        print(f"     Doc: {result['doc_name']}")
        print(f"     Page: {result['page_number']}")
        print(f"     Text (first 100 chars): {result['chunk_text'][:100]}...")
        print(f"     Text length: {len(result['chunk_text'])} chars")
    
    # Verify all results are from target document
    all_from_target = all(r['doc_name'] == target_doc for r in singledoc_results)
    if all_from_target:
        print(f"\n✓ All results correctly filtered to {target_doc}")
    else:
        print(f"\n✗ Some results not from target document!")
    
    print("\n" + "="*60)
    print("✓ RETRIEVAL FUNCTIONS TEST COMPLETE")
    print("="*60)
    
    return True

# %%
# Run test
test_retrieval = test_retrieval_functions()

# %%
print("\n" + "="*60)
print("✓ STEP 7 COMPLETE!")
print("="*60)
print("  ✓ Global retrieval function defined")
print("  ✓ Single-document retrieval function defined")
print("  ✓ Both retrieval modes tested successfully")
print("  ✓ Chunk text extraction verified")
print("  ✓ Ready for main evaluation function")
print("="*60)

✓ Global retrieval function defined
✓ Single-document retrieval function defined

TESTING RETRIEVAL FUNCTIONS

Test configuration:
  Provider: voyage
  Model: voyage-finance-2
  Chunk size: 1024
  K: 5

Loading vectorstore...
✓ Loaded (15,765 documents)

Test query: What was the capital expenditure in 2018?

------------------------------------------------------------
TEST 1: Global Retrieval
------------------------------------------------------------
✓ Retrieved 5 documents

Top 3 results:

  1. Rank 1
     Doc: 3M_2018_10K
     Page: 39
     Text (first 100 chars): Table of Contents 
Geographic Area Supplemental Information
 
 
 
 
 
 
 
 
  
 
  
 
  
 
 Property...
     Text length: 4356 chars

  2. Rank 2
     Doc: CVSHEALTH_2018_10K
     Page: 280
     Text (first 100 chars): Commentary - 2018 compared to 2017
•
Net cash provided by operating activities increased by $858 mil...
     Text length: 2595 chars

  3. Rank 3
     Doc: 3M_2018_10K
     Page: 47
     Text (first 100 cha

In [10]:
# ============================================================================
# Step 8: Main Evaluation Function
# ============================================================================

# %% [markdown]
# ## 8.1 File Management Functions
# 
# Helper functions for saving and checking results

# %%
# def get_output_filename(
#     provider: str,
#     model: str,
#     chunk_size: int,
#     k: int,
#     mode: str
# ) -> str:
#     """
#     Generate standardized output filename.
    
#     Format: {provider}_{model}_chunk{size}_k{k}_{mode}.json
    
#     Example:
#         voyage_voyage-finance-2_chunk1024_k20_global.json
#     """
#     # Replace slashes in model name
#     model_safe = model.replace('/', '_')
#     filename = f"{provider}_{model_safe}_chunk{chunk_size}_k{k}_{mode}.json"
#     return filename

def get_output_filename(
    provider: str,
    model: str,
    chunk_size: int,
    k: int,
    mode: str,
    expansion_type: str = None,
    expansion_subtype: str = None
) -> str:
    """
    Generate standardized output filename.
    
    Format (without expansion): {provider}_{model}_chunk{size}_k{k}_{mode}.json
    Format (with expansion): {expansion_type}_{expansion_subtype}_{provider}_{model}_chunk{size}_k{k}_{mode}.json
    
    Args:
        provider: Embedding provider (e.g., "voyage", "openai", "ollama")
        model: Model name (e.g., "voyage-finance-2")
        chunk_size: Chunk size (e.g., 512, 1024)
        k: Number of retrieved documents
        mode: Retrieval mode ("global" or "singledoc")
        expansion_type: Optional expansion type (e.g., "hyde", "query2doc")
        expansion_subtype: Optional expansion subtype (e.g., "basic", "cot")
        
    Returns:
        Formatted filename string
        
    Examples:
        Without expansion:
            voyage_voyage-finance-2_chunk1024_k20_global.json
        
        With expansion:
            hyde_basic_voyage_voyage-finance-2_chunk1024_k20_global.json
    """
    # Replace slashes in model name
    model_safe = model.replace('/', '_')
    
    # Build filename
    if expansion_type and expansion_subtype:
        # With query expansion
        filename = f"{expansion_type}_{expansion_subtype}_{provider}_{model_safe}_chunk{chunk_size}_k{k}_{mode}.json"
    else:
        # Without query expansion (baseline)
        filename = f"{provider}_{model_safe}_chunk{chunk_size}_k{k}_{mode}.json"
    
    return filename


def check_if_results_exist(
    provider: str,
    model: str,
    chunk_size: int,
    k: int,
    mode: str,
    output_dir: str,
    expansion_type: str = None,
    expansion_subtype: str = None
) -> bool:
    """
    Check if results file already exists.
    
    Used to skip configurations that have already been evaluated.
    """
    filename = get_output_filename(provider, model, chunk_size, k, mode, expansion_type, expansion_subtype)
    filepath = os.path.join(output_dir, filename)
    return os.path.exists(filepath)


def save_results(
    results: List[Dict],
    provider: str,
    model: str,
    chunk_size: int,
    k: int,
    mode: str,
    output_dir: str,
    expansion_type: str = None,
    expansion_subtype: str = None
):
    """
    Save evaluation results to JSON file.
    
    Args:
        results: List of result dictionaries (queries + summary)
        provider: Embedding provider
        model: Model name
        chunk_size: Chunk size
        k: Number of retrieved documents
        mode: "global" or "singledoc"
        output_dir: Output directory
    """
    filename = get_output_filename(provider, model, chunk_size, k, mode, expansion_type, expansion_subtype)
    filepath = os.path.join(output_dir, filename)
    
    with open(filepath, 'w') as f:
        json.dump(results, f, indent=2)
    
    print(f"✓ Saved: {filename}")

print("✓ File management functions defined")

# %% [markdown]
# ## 8.2 Single Configuration Evaluation
# 
# Evaluate a single configuration: (provider, model, chunk_size, k, mode)

def evaluate_single_configuration(
    dataset,
    evidence_lookup: Dict,
    sbert_model: SentenceTransformer,
    provider: str,
    model: str,
    chunk_size: int,
    k: int,
    mode: str,
    use_page_tolerance: bool = True,
    text_similarity_threshold: float = TEXT_SIMILARITY_THRESHOLD,
    output_dir: str = OUTPUT_DIR,
    expansion_type: str = None,
    expansion_subtype: str = None,
    expanded_queries_dir: str = EXPANDED_QUERIES_DIR
) -> Dict:
    """
    Evaluate a single configuration with BOTH page-based AND text-based metrics.
    Now supports query expansion evaluation alongside baseline.
    
    This is the main evaluation function that:
    1. Loads the vector store
    2. Optionally loads expanded queries
    3. For each query:
       - Retrieves documents using BASELINE query
       - Calculates BASELINE metrics (page + text)
       - If expansion enabled: Retrieves documents using EXPANDED query
       - If expansion enabled: Calculates EXPANDED metrics (page + text)
       - Stores all results
    4. Calculates average metrics for both baseline and expanded
    5. Saves results to JSON
    
    Args:
        dataset: FinanceBench dataset
        evidence_lookup: Pre-computed evidence embeddings
        sbert_model: Sentence-BERT model for text similarity
        provider: "ollama", "openai", or "voyage"
        model: Model name
        chunk_size: Chunk size
        k: Number of documents to retrieve
        mode: "global" or "singledoc"
        use_page_tolerance: If True, use chunk-size-aware page tolerance
        text_similarity_threshold: Threshold for text-based matching
        output_dir: Output directory for results
        expansion_type: Optional expansion type (e.g., "hyde", "query2doc")
        expansion_subtype: Optional expansion subtype (e.g., "basic", "cot")
        expanded_queries_dir: Directory containing expanded queries JSON files
        
    Returns:
        Dictionary with status and metrics
    """
    print(f"\n{'='*60}")
    print(f"EVALUATING: {provider}/{model}")
    print(f"  Chunk size: {chunk_size}")
    print(f"  K: {k}")
    print(f"  Mode: {mode}")
    if expansion_type and expansion_subtype:
        print(f"  Query Expansion: {expansion_type}/{expansion_subtype}")
    else:
        print(f"  Query Expansion: DISABLED (baseline only)")
    print(f"  Page tolerance: {'ENABLED' if use_page_tolerance else 'DISABLED'}")
    print(f"  Text similarity threshold: {text_similarity_threshold}")
    print(f"{'='*60}")
    
    # Check if already exists
    if check_if_results_exist(provider, model, chunk_size, k, mode, output_dir, 
                              expansion_type, expansion_subtype):
        print("✓ Results already exist - SKIPPING")
        return {'status': 'skipped'}
    
    # Load vectorstore
    print("\nLoading vectorstore...")
    try:
        vectorstore = load_vectorstore(provider, model, chunk_size)
        doc_count = vectorstore._collection.count()
        print(f"✓ Loaded ({doc_count:,} documents)")
    except Exception as e:
        print(f"✗ Failed to load vectorstore: {e}")
        return {'status': 'failed', 'error': str(e)}
    
    # Load expanded queries if expansion is enabled
    expanded_lookup = None
    if expansion_type and expansion_subtype:
        print("\nLoading expanded queries...")
        try:
            expanded_lookup = load_expanded_queries(
                expanded_queries_dir,
                expansion_type,
                expansion_subtype
            )
            print(f"✓ Loaded {len(expanded_lookup)} expanded queries")
        except Exception as e:
            print(f"✗ Failed to load expanded queries: {e}")
            return {'status': 'failed', 'error': str(e)}
    
    # Initialize tracking lists for BASELINE
    results = []
    
    # Baseline page-based metrics
    page_mrr_scores = []
    page_recall_scores = []
    page_precision_scores = []
    page_f1_scores = []
    
    # Baseline text-based metrics
    text_mrr_scores = []
    text_recall_scores = []
    text_precision_scores = []
    text_f1_scores = []
    
    # Initialize tracking lists for EXPANDED queries (if enabled)
    if expanded_lookup:
        # Expanded page-based metrics
        expanded_page_mrr_scores = []
        expanded_page_recall_scores = []
        expanded_page_precision_scores = []
        expanded_page_f1_scores = []
        
        # Expanded text-based metrics
        expanded_text_mrr_scores = []
        expanded_text_recall_scores = []
        expanded_text_precision_scores = []
        expanded_text_f1_scores = []
    
    # Process all queries
    print(f"\nProcessing {len(dataset)} queries...")
    if expanded_lookup:
        print("(Processing both baseline and expanded queries...)")
    print("(This may take a while due to text similarity calculations...)")
    
    for record in tqdm(dataset, desc="Queries"):
        query_id = record['financebench_id']
        query = record['question']  # Baseline query
        doc_name = record['doc_name']
        expanded_query_temp = "None"  # Placeholder for expanded query text
        
        # Get evidence for this query
        evidence_items = evidence_lookup.get(query_id, [])
        
        if len(evidence_items) == 0:
            # No evidence for this query, skip
            continue
        
        try:
            # ========================================
            # BASELINE RETRIEVAL AND METRICS
            # ========================================
            
            # STEP 1: RETRIEVE DOCUMENTS (BASELINE)
            if mode == "global":
                retrieved_docs_baseline = retrieve_global(vectorstore, query, k)
            elif mode == "singledoc":
                retrieved_docs_baseline = retrieve_single_doc(vectorstore, query, doc_name, k)
            else:
                raise ValueError(f"Unknown mode: {mode}")
            
            # STEP 2: CALCULATE PAGE-BASED METRICS (BASELINE)
            page_mrr, page_rank = calculate_page_mrr_for_query(
                retrieved_docs_baseline, evidence_items, chunk_size, use_page_tolerance
            )
            page_mrr_scores.append(page_mrr)
            
            page_recall, page_precision, page_f1 = calculate_page_metrics_for_query(
                retrieved_docs_baseline, evidence_items, chunk_size, use_page_tolerance
            )
            page_recall_scores.append(page_recall)
            page_precision_scores.append(page_precision)
            page_f1_scores.append(page_f1)
            
            # STEP 3: CALCULATE TEXT-BASED METRICS (BASELINE)
            text_mrr, text_rank, text_recall, text_precision, text_f1, all_similarities = \
                calculate_text_metrics_for_query(
                    retrieved_docs_baseline,
                    evidence_items,
                    sbert_model,
                    threshold=text_similarity_threshold
                )
            
            text_mrr_scores.append(text_mrr)
            text_recall_scores.append(text_recall)
            text_precision_scores.append(text_precision)
            text_f1_scores.append(text_f1)
            
            # ========================================
            # EXPANDED QUERY RETRIEVAL AND METRICS (if enabled)
            # ========================================
            
            if expanded_lookup and query_id in expanded_lookup:
                # Get expanded query
                expanded_query = expanded_lookup[query_id]['expanded_query']
                expanded_query_temp = expanded_query  # For logging
                
                # STEP 1: RETRIEVE DOCUMENTS (EXPANDED)
                if mode == "global":
                    retrieved_docs_expanded = retrieve_global(vectorstore, expanded_query, k)
                elif mode == "singledoc":
                    retrieved_docs_expanded = retrieve_single_doc(vectorstore, expanded_query, doc_name, k)

                # STEP 2: CALCULATE PAGE-BASED METRICS (EXPANDED)
                expanded_page_mrr, expanded_page_rank = calculate_page_mrr_for_query(
                    retrieved_docs_expanded, evidence_items, chunk_size, use_page_tolerance
                )
                expanded_page_mrr_scores.append(expanded_page_mrr)
                
                expanded_page_recall, expanded_page_precision, expanded_page_f1 = calculate_page_metrics_for_query(
                    retrieved_docs_expanded, evidence_items, chunk_size, use_page_tolerance
                )
                expanded_page_recall_scores.append(expanded_page_recall)
                expanded_page_precision_scores.append(expanded_page_precision)
                expanded_page_f1_scores.append(expanded_page_f1)
                
                # STEP 3: CALCULATE TEXT-BASED METRICS (EXPANDED)
                expanded_text_mrr, expanded_text_rank, expanded_text_recall, expanded_text_precision, expanded_text_f1, expanded_all_similarities = \
                    calculate_text_metrics_for_query(
                        retrieved_docs_expanded,
                        evidence_items,
                        sbert_model,
                        threshold=text_similarity_threshold
                    )
                
                expanded_text_mrr_scores.append(expanded_text_mrr)
                expanded_text_recall_scores.append(expanded_text_recall)
                expanded_text_precision_scores.append(expanded_text_precision)
                expanded_text_f1_scores.append(expanded_text_f1)
            else:
                # No expanded query available, use zeros
                if expanded_lookup:
                    expanded_page_mrr_scores.append(0.0)
                    expanded_page_recall_scores.append(0.0)
                    expanded_page_precision_scores.append(0.0)
                    expanded_page_f1_scores.append(0.0)
                    expanded_text_mrr_scores.append(0.0)
                    expanded_text_recall_scores.append(0.0)
                    expanded_text_precision_scores.append(0.0)
                    expanded_text_f1_scores.append(0.0)
                    retrieved_docs_expanded = []
                    expanded_query = None
                    expanded_page_mrr, expanded_page_rank = 0.0, -1
                    expanded_text_mrr, expanded_text_rank = 0.0, -1
                    expanded_all_similarities = []
            
            # ========================================
            # FORMAT RESULTS FOR JSON
            # ========================================
            
            # Format expected evidence
            expected_evidence = [
                {
                    'doc_name': ev['doc_name'],
                    'page_number': ev['page_number'],
                    'evidence_text': ev['evidence_text'][:200] + '...' if len(ev['evidence_text']) > 200 else ev['evidence_text']
                }
                for ev in evidence_items
            ]
            
            # Format retrieved docs (BASELINE) with text similarities
            retrieved_docs_baseline_formatted = []
            for i, doc in enumerate(retrieved_docs_baseline):
                doc_formatted = {
                    'doc_name': doc['doc_name'],
                    'page_number': doc['page_number'],
                    'rank': doc['rank'],
                    'chunk_text': format_chunk_text_preview(doc['chunk_text']),
                    'text_similarities': all_similarities[i] if i < len(all_similarities) else []
                }
                retrieved_docs_baseline_formatted.append(doc_formatted)
            
            # Store complete result
            result = {
                'query_id': query_id,
                'query': query,  # Baseline query
                'expanded_query': expanded_query_temp,
                'expected_doc': doc_name,
                'expected_evidence': expected_evidence,
                'retrieved_docs': retrieved_docs_baseline_formatted,
                
                # Baseline page-based metrics
                'page_mrr_score': page_mrr,
                'page_rank': page_rank,
                'page_recall': page_recall,
                'page_precision': page_precision,
                'page_f1': page_f1,
                
                # Baseline text-based metrics
                'text_mrr_score': text_mrr,
                'text_rank': text_rank,
                'text_recall': text_recall,
                'text_precision': text_precision,
                'text_f1': text_f1
            }
            
            # Add expanded query results if available
            if expanded_lookup and query_id in expanded_lookup:
                # Format retrieved docs (EXPANDED) with text similarities
                retrieved_docs_expanded_formatted = []
                for i, doc in enumerate(retrieved_docs_expanded):
                    doc_formatted = {
                        'doc_name': doc['doc_name'],
                        'page_number': doc['page_number'],
                        'rank': doc['rank'],
                        'chunk_text': format_chunk_text_preview(doc['chunk_text']),
                        'text_similarities': expanded_all_similarities[i] if i < len(expanded_all_similarities) else []
                    }
                    retrieved_docs_expanded_formatted.append(doc_formatted)
                
                result['expanded_query'] = expanded_query
                result['expanded_retrieved_docs'] = retrieved_docs_expanded_formatted
                
                # Expanded page-based metrics
                result['expanded_page_mrr_score'] = expanded_page_mrr
                result['expanded_page_rank'] = expanded_page_rank
                result['expanded_page_recall'] = expanded_page_recall
                result['expanded_page_precision'] = expanded_page_precision
                result['expanded_page_f1'] = expanded_page_f1
                
                # Expanded text-based metrics
                result['expanded_text_mrr_score'] = expanded_text_mrr
                result['expanded_text_rank'] = expanded_text_rank
                result['expanded_text_recall'] = expanded_text_recall
                result['expanded_text_precision'] = expanded_text_precision
                result['expanded_text_f1'] = expanded_text_f1
            
            results.append(result)
            
        except Exception as e:
            print(f"\n✗ Error processing query {query_id}: {e}")
            # Store error result
            error_result = {
                'query_id': query_id,
                'query': query,
                'error': str(e),
                'page_mrr_score': 0.0,
                'page_rank': -1,
                'page_recall': 0.0,
                'page_precision': 0.0,
                'page_f1': 0.0,
                'text_mrr_score': 0.0,
                'text_rank': -1,
                'text_recall': 0.0,
                'text_precision': 0.0,
                'text_f1': 0.0
            }
            
            # Append zeros to baseline tracking lists
            page_mrr_scores.append(0.0)
            page_recall_scores.append(0.0)
            page_precision_scores.append(0.0)
            page_f1_scores.append(0.0)
            text_mrr_scores.append(0.0)
            text_recall_scores.append(0.0)
            text_precision_scores.append(0.0)
            text_f1_scores.append(0.0)
            
            # Append zeros to expanded tracking lists if enabled
            if expanded_lookup:
                error_result['expanded_query'] = expanded_lookup.get(query_id, {}).get('expanded_query', None)
                error_result['expanded_page_mrr_score'] = 0.0
                error_result['expanded_page_rank'] = -1
                error_result['expanded_page_recall'] = 0.0
                error_result['expanded_page_precision'] = 0.0
                error_result['expanded_page_f1'] = 0.0
                error_result['expanded_text_mrr_score'] = 0.0
                error_result['expanded_text_rank'] = -1
                error_result['expanded_text_recall'] = 0.0
                error_result['expanded_text_precision'] = 0.0
                error_result['expanded_text_f1'] = 0.0
                
                expanded_page_mrr_scores.append(0.0)
                expanded_page_recall_scores.append(0.0)
                expanded_page_precision_scores.append(0.0)
                expanded_page_f1_scores.append(0.0)
                expanded_text_mrr_scores.append(0.0)
                expanded_text_recall_scores.append(0.0)
                expanded_text_precision_scores.append(0.0)
                expanded_text_f1_scores.append(0.0)
            
            results.append(error_result)
    
    # ========================================
    # CALCULATE AVERAGE METRICS
    # ========================================
    
    # Baseline averages
    avg_page_mrr = sum(page_mrr_scores) / len(page_mrr_scores) if page_mrr_scores else 0.0
    avg_page_recall = sum(page_recall_scores) / len(page_recall_scores) if page_recall_scores else 0.0
    avg_page_precision = sum(page_precision_scores) / len(page_precision_scores) if page_precision_scores else 0.0
    avg_page_f1 = sum(page_f1_scores) / len(page_f1_scores) if page_f1_scores else 0.0
    
    avg_text_mrr = sum(text_mrr_scores) / len(text_mrr_scores) if text_mrr_scores else 0.0
    avg_text_recall = sum(text_recall_scores) / len(text_recall_scores) if text_recall_scores else 0.0
    avg_text_precision = sum(text_precision_scores) / len(text_precision_scores) if text_precision_scores else 0.0
    avg_text_f1 = sum(text_f1_scores) / len(text_f1_scores) if text_f1_scores else 0.0
    
    # Add summary to results
    summary = {
        'provider': provider,
        'model': model,
        'chunk_size': chunk_size,
        'k': k,
        'mode': mode,
        'use_page_tolerance': use_page_tolerance,
        'text_similarity_threshold': text_similarity_threshold,
        'total_queries': len(dataset),
        
        # Baseline page-based averages
        'average_page_mrr': avg_page_mrr,
        'average_page_recall': avg_page_recall,
        'average_page_precision': avg_page_precision,
        'average_page_f1': avg_page_f1,
        
        # Baseline text-based averages
        'average_text_mrr': avg_text_mrr,
        'average_text_recall': avg_text_recall,
        'average_text_precision': avg_text_precision,
        'average_text_f1': avg_text_f1
    }
    
    # Add expanded query averages if enabled
    if expanded_lookup:
        avg_expanded_page_mrr = sum(expanded_page_mrr_scores) / len(expanded_page_mrr_scores) if expanded_page_mrr_scores else 0.0
        avg_expanded_page_recall = sum(expanded_page_recall_scores) / len(expanded_page_recall_scores) if expanded_page_recall_scores else 0.0
        avg_expanded_page_precision = sum(expanded_page_precision_scores) / len(expanded_page_precision_scores) if expanded_page_precision_scores else 0.0
        avg_expanded_page_f1 = sum(expanded_page_f1_scores) / len(expanded_page_f1_scores) if expanded_page_f1_scores else 0.0
        
        avg_expanded_text_mrr = sum(expanded_text_mrr_scores) / len(expanded_text_mrr_scores) if expanded_text_mrr_scores else 0.0
        avg_expanded_text_recall = sum(expanded_text_recall_scores) / len(expanded_text_recall_scores) if expanded_text_recall_scores else 0.0
        avg_expanded_text_precision = sum(expanded_text_precision_scores) / len(expanded_text_precision_scores) if expanded_text_precision_scores else 0.0
        avg_expanded_text_f1 = sum(expanded_text_f1_scores) / len(expanded_text_f1_scores) if expanded_text_f1_scores else 0.0
        
        summary['expansion_type'] = expansion_type
        summary['expansion_subtype'] = expansion_subtype
        
        # Expanded page-based averages
        summary['average_expanded_page_mrr'] = avg_expanded_page_mrr
        summary['average_expanded_page_recall'] = avg_expanded_page_recall
        summary['average_expanded_page_precision'] = avg_expanded_page_precision
        summary['average_expanded_page_f1'] = avg_expanded_page_f1
        
        # Expanded text-based averages
        summary['average_expanded_text_mrr'] = avg_expanded_text_mrr
        summary['average_expanded_text_recall'] = avg_expanded_text_recall
        summary['average_expanded_text_precision'] = avg_expanded_text_precision
        summary['average_expanded_text_f1'] = avg_expanded_text_f1
    
    # Insert summary at the start of results
    results.insert(0, {'summary': summary})
    
    # Save results
    save_results(results, provider, model, chunk_size, k, mode, output_dir,
                expansion_type, expansion_subtype)
    
    # Print summary
    print(f"\n{'='*60}")
    print("RESULTS SUMMARY")
    print(f"{'='*60}")
    print("\nBASELINE - Page-Based Metrics:")
    print(f"  Average MRR:       {avg_page_mrr:.4f}")
    print(f"  Average Recall:    {avg_page_recall:.4f}")
    print(f"  Average Precision: {avg_page_precision:.4f}")
    print(f"  Average F1:        {avg_page_f1:.4f}")
    print("\nBASELINE - Text-Based Metrics:")
    print(f"  Average MRR:       {avg_text_mrr:.4f}")
    print(f"  Average Recall:    {avg_text_recall:.4f}")
    print(f"  Average Precision: {avg_text_precision:.4f}")
    print(f"  Average F1:        {avg_text_f1:.4f}")
    
    if expanded_lookup:
        print(f"\nEXPANDED ({expansion_type}/{expansion_subtype}) - Page-Based Metrics:")
        print(f"  Average MRR:       {avg_expanded_page_mrr:.4f} (Δ {avg_expanded_page_mrr - avg_page_mrr:+.4f})")
        print(f"  Average Recall:    {avg_expanded_page_recall:.4f} (Δ {avg_expanded_page_recall - avg_page_recall:+.4f})")
        print(f"  Average Precision: {avg_expanded_page_precision:.4f} (Δ {avg_expanded_page_precision - avg_page_precision:+.4f})")
        print(f"  Average F1:        {avg_expanded_page_f1:.4f} (Δ {avg_expanded_page_f1 - avg_page_f1:+.4f})")
        print(f"\nEXPANDED ({expansion_type}/{expansion_subtype}) - Text-Based Metrics:")
        print(f"  Average MRR:       {avg_expanded_text_mrr:.4f} (Δ {avg_expanded_text_mrr - avg_text_mrr:+.4f})")
        print(f"  Average Recall:    {avg_expanded_text_recall:.4f} (Δ {avg_expanded_text_recall - avg_text_recall:+.4f})")
        print(f"  Average Precision: {avg_expanded_text_precision:.4f} (Δ {avg_expanded_text_precision - avg_text_precision:+.4f})")
        print(f"  Average F1:        {avg_expanded_text_f1:.4f} (Δ {avg_expanded_text_f1 - avg_text_f1:+.4f})")
    
    print(f"{'='*60}")
    
    return_dict = {
        'status': 'completed',
        'average_page_mrr': avg_page_mrr,
        'average_page_recall': avg_page_recall,
        'average_page_precision': avg_page_precision,
        'average_page_f1': avg_page_f1,
        'average_text_mrr': avg_text_mrr,
        'average_text_recall': avg_text_recall,
        'average_text_precision': avg_text_precision,
        'average_text_f1': avg_text_f1,
        'total_queries': len(dataset)
    }
    
    if expanded_lookup:
        return_dict['average_expanded_page_mrr'] = avg_expanded_page_mrr
        return_dict['average_expanded_page_recall'] = avg_expanded_page_recall
        return_dict['average_expanded_page_precision'] = avg_expanded_page_precision
        return_dict['average_expanded_page_f1'] = avg_expanded_page_f1
        return_dict['average_expanded_text_mrr'] = avg_expanded_text_mrr
        return_dict['average_expanded_text_recall'] = avg_expanded_text_recall
        return_dict['average_expanded_text_precision'] = avg_expanded_text_precision
        return_dict['average_expanded_text_f1'] = avg_expanded_text_f1
    
    return return_dict


print("✓ Single configuration evaluation function defined")

# %%
print("\n" + "="*60)
print("✓ STEP 8 COMPLETE!")
print("="*60)
print("  ✓ File management functions defined")
print("  ✓ Main evaluation function defined")
print("  ✓ Processes both page-based AND text-based metrics")
print("  ✓ Saves comprehensive results to JSON")
print("  ✓ Ready for batch evaluation")
print("="*60)

✓ File management functions defined
✓ Single configuration evaluation function defined

✓ STEP 8 COMPLETE!
  ✓ File management functions defined
  ✓ Main evaluation function defined
  ✓ Processes both page-based AND text-based metrics
  ✓ Saves comprehensive results to JSON
  ✓ Ready for batch evaluation


In [12]:
# ============================================================================
# Step 9: Batch Evaluation Function
# ============================================================================


def evaluate_multiple_configurations(
    dataset,
    evidence_lookup: Dict,
    sbert_model: SentenceTransformer,
    configurations: List[Dict],
    k_values: List[int],
    modes: List[str],
    use_page_tolerance: bool = True,
    text_similarity_threshold: float = TEXT_SIMILARITY_THRESHOLD,
    output_dir: str = OUTPUT_DIR,
    expanded_queries_dir: str = "../../query_enhancement_set"
) -> Dict:
    """
    Evaluate multiple configurations in batch.
    
    This function iterates through all combinations of:
    - Configurations (provider, model, chunk_sizes, expansion_type, expansion_subtype)
    - K values (number of documents to retrieve)
    - Modes (global, singledoc)
    
    And evaluates each combination using evaluate_single_configuration().
    
    Args:
        dataset: FinanceBench dataset
        evidence_lookup: Pre-computed evidence embeddings
        sbert_model: Sentence-BERT model
        configurations: List of configuration dicts
        k_values: List of k values to test
        modes: List of modes ["global", "singledoc"]
        use_page_tolerance: If True, use chunk-size-aware tolerance
        text_similarity_threshold: Threshold for text-based matching
        output_dir: Output directory
        expanded_queries_dir: Directory containing expanded queries JSON files
        
    Returns:
        Summary dictionary with all results
        
    Example configurations:
        [
            {
                'provider': 'voyage',
                'model': 'voyage-finance-2',
                'chunk_sizes': [512, 1024, 2048]
            },
            {
                'expansion_type': 'hyde',
                'expansion_subtype': 'basic',
                'provider': 'voyage',
                'model': 'voyage-finance-2',
                'chunk_sizes': [512, 1024]
            },
            ...
        ]
    """
    print(f"\n{'='*60}")
    print("BATCH EVALUATION")
    print(f"{'='*60}")
    print(f"Configurations: {len(configurations)}")
    print(f"K values: {k_values}")
    print(f"Modes: {modes}")
    print(f"Page tolerance: {'ENABLED' if use_page_tolerance else 'DISABLED'}")
    print(f"Text similarity threshold: {text_similarity_threshold}")
    
    # Calculate total runs
    total_runs = 0
    for config in configurations:
        total_runs += len(config['chunk_sizes']) * len(k_values) * len(modes)
    
    print(f"Total evaluation runs: {total_runs}")
    print(f"{'='*60}")
    
    # Track results
    all_results = []
    completed = 0
    skipped = 0
    failed = 0
    
    # Start time
    import time
    start_time = time.time()
    
    # Iterate through all combinations
    for config in configurations:
        provider = config['provider']
        model = config['model']
        chunk_sizes = config['chunk_sizes']
        
        # Extract expansion parameters (optional)
        expansion_type = config.get('expansion_type', None)
        expansion_subtype = config.get('expansion_subtype', None)
        
        for chunk_size in chunk_sizes:
            for k in k_values:
                for mode in modes:
                    print(f"\n{'#'*60}")
                    print(f"CONFIGURATION {completed + skipped + failed + 1}/{total_runs}")
                    print(f"{'#'*60}")
                    
                    result = evaluate_single_configuration(
                        dataset=dataset,
                        evidence_lookup=evidence_lookup,
                        sbert_model=sbert_model,
                        provider=provider,
                        model=model,
                        chunk_size=chunk_size,
                        k=k,
                        mode=mode,
                        use_page_tolerance=use_page_tolerance,
                        text_similarity_threshold=text_similarity_threshold,
                        output_dir=output_dir,
                        expansion_type=expansion_type,
                        expansion_subtype=expansion_subtype,
                        expanded_queries_dir=expanded_queries_dir
                    )
                    
                    all_results.append({
                        'provider': provider,
                        'model': model,
                        'chunk_size': chunk_size,
                        'k': k,
                        'mode': mode,
                        'expansion_type': expansion_type,
                        'expansion_subtype': expansion_subtype,
                        'result': result
                    })
                    
                    if result['status'] == 'completed':
                        completed += 1
                    elif result['status'] == 'skipped':
                        skipped += 1
                    else:
                        failed += 1
    
    # End time
    end_time = time.time()
    elapsed_time = end_time - start_time
    
    # Summary
    print(f"\n{'='*60}")
    print("BATCH EVALUATION SUMMARY")
    print(f"{'='*60}")
    print(f"Total runs: {total_runs}")
    print(f"Completed: {completed}")
    print(f"Skipped: {skipped}")
    print(f"Failed: {failed}")
    print(f"Total time: {elapsed_time/60:.2f} minutes")
    print(f"Average time per run: {elapsed_time/total_runs:.2f} seconds")
    print(f"{'='*60}")
    
    return {
        'total_runs': total_runs,
        'completed': completed,
        'skipped': skipped,
        'failed': failed,
        'elapsed_time': elapsed_time,
        'results': all_results
    }

print("✓ Batch evaluation function defined")

# %% [markdown]
# ## 9.2 Results Analysis Helper
# 
# Helper function to display results in a readable format

# %%
def display_batch_results(summary: Dict):
    """
    Display batch evaluation results in a readable table format.
    
    Shows both page-based and text-based metrics for baseline and expanded queries.
    """
    print("\n" + "="*100)
    print("DETAILED RESULTS - ALL CONFIGURATIONS")
    print("="*100)
    
    # Group results by status
    completed_results = [r for r in summary['results'] if r['result']['status'] == 'completed']
    skipped_results = [r for r in summary['results'] if r['result']['status'] == 'skipped']
    failed_results = [r for r in summary['results'] if r['result']['status'] == 'failed']
    
    if completed_results:
        print("\n" + "-"*100)
        print(f"COMPLETED EVALUATIONS ({len(completed_results)})")
        print("-"*100)
        
        # Separate baseline and expanded results
        baseline_results = [r for r in completed_results if r.get('expansion_type') is None]
        expanded_results = [r for r in completed_results if r.get('expansion_type') is not None]
        
        # ========================================
        # BASELINE RESULTS
        # ========================================
        if baseline_results:
            print("\n" + "="*100)
            print("BASELINE RESULTS (No Query Expansion)")
            print("="*100)
            
            # Table header
            print(f"\n{'Config':<45} {'Page Metrics':<25} {'Text Metrics':<25}")
            print(f"{'-'*45} {'-'*25} {'-'*25}")
            print(f"{'Provider/Model/Chunk/K/Mode':<45} {'MRR':>6} {'Rec':>6} {'Prec':>6} {'F1':>6} {'MRR':>6} {'Rec':>6} {'Prec':>6} {'F1':>6}")
            print("-"*95)
            
            # Sort by provider, model, chunk_size, k, mode
            sorted_baseline = sorted(
                baseline_results,
                key=lambda x: (x['provider'], x['model'], x['chunk_size'], x['k'], x['mode'])
            )
            
            for r in sorted_baseline:
                config_str = f"{r['provider']}/{r['model']}/ch{r['chunk_size']}/k{r['k']}/{r['mode']}"
                result = r['result']
                
                # Page-based metrics
                page_mrr = result['average_page_mrr']
                page_rec = result['average_page_recall']
                page_prec = result['average_page_precision']
                page_f1 = result['average_page_f1']
                
                # Text-based metrics
                text_mrr = result['average_text_mrr']
                text_rec = result['average_text_recall']
                text_prec = result['average_text_precision']
                text_f1 = result['average_text_f1']
                
                print(f"{config_str:<45} {page_mrr:>6.3f} {page_rec:>6.3f} {page_prec:>6.3f} {page_f1:>6.3f} {text_mrr:>6.3f} {text_rec:>6.3f} {text_prec:>6.3f} {text_f1:>6.3f}")
        
        # ========================================
        # EXPANDED QUERY RESULTS
        # ========================================
        if expanded_results:
            print("\n" + "="*100)
            print("EXPANDED QUERY RESULTS")
            print("="*100)
            
            # Group by expansion type
            expansion_groups = {}
            for r in expanded_results:
                exp_key = f"{r['expansion_type']}/{r['expansion_subtype']}"
                if exp_key not in expansion_groups:
                    expansion_groups[exp_key] = []
                expansion_groups[exp_key].append(r)
            
            # Display each expansion type group
            for exp_key, exp_results in expansion_groups.items():
                print(f"\n{'='*100}")
                print(f"EXPANSION: {exp_key}")
                print(f"{'='*100}")
                
                # Table header with baseline and expanded columns
                print(f"\n{'Config':<40} {'BASELINE Page':<20} {'EXPANDED Page':<20} {'BASELINE Text':<20} {'EXPANDED Text':<20}")
                print(f"{'-'*40} {'-'*20} {'-'*20} {'-'*20} {'-'*20}")
                print(f"{'Provider/Model/Ch/K/Mode':<40} {'MRR':>6} {'F1':>6} {'Δ':>6} {'MRR':>6} {'F1':>6} {'Δ':>6} {'MRR':>6} {'F1':>6} {'Δ':>6} {'MRR':>6} {'F1':>6} {'Δ':>6}")
                print("-"*120)
                
                # Sort by provider, model, chunk_size, k, mode
                sorted_expanded = sorted(
                    exp_results,
                    key=lambda x: (x['provider'], x['model'], x['chunk_size'], x['k'], x['mode'])
                )
                
                for r in sorted_expanded:
                    config_str = f"{r['provider']}/{r['model']}/ch{r['chunk_size']}/k{r['k']}/{r['mode']}"
                    result = r['result']
                    
                    # Baseline metrics
                    base_page_mrr = result['average_page_mrr']
                    base_page_f1 = result['average_page_f1']
                    base_text_mrr = result['average_text_mrr']
                    base_text_f1 = result['average_text_f1']
                    
                    # Expanded metrics
                    exp_page_mrr = result.get('average_expanded_page_mrr', 0.0)
                    exp_page_f1 = result.get('average_expanded_page_f1', 0.0)
                    exp_text_mrr = result.get('average_expanded_text_mrr', 0.0)
                    exp_text_f1 = result.get('average_expanded_text_f1', 0.0)
                    
                    # Calculate deltas
                    delta_page_mrr = exp_page_mrr - base_page_mrr
                    delta_page_f1 = exp_page_f1 - base_page_f1
                    delta_text_mrr = exp_text_mrr - base_text_mrr
                    delta_text_f1 = exp_text_f1 - base_text_f1
                    
                    print(f"{config_str:<40} "
                          f"{base_page_mrr:>6.3f} {base_page_f1:>6.3f} {delta_page_mrr:>+6.3f} "
                          f"{exp_page_mrr:>6.3f} {exp_page_f1:>6.3f} {delta_page_f1:>+6.3f} "
                          f"{base_text_mrr:>6.3f} {base_text_f1:>6.3f} {delta_text_mrr:>+6.3f} "
                          f"{exp_text_mrr:>6.3f} {exp_text_f1:>6.3f} {delta_text_f1:>+6.3f}")
                
                # Summary statistics for this expansion type
                print(f"\n{'-'*100}")
                print(f"SUMMARY for {exp_key}:")
                avg_delta_page_mrr = sum(r['result'].get('average_expanded_page_mrr', 0.0) - r['result']['average_page_mrr'] for r in sorted_expanded) / len(sorted_expanded)
                avg_delta_text_mrr = sum(r['result'].get('average_expanded_text_mrr', 0.0) - r['result']['average_text_mrr'] for r in sorted_expanded) / len(sorted_expanded)
                print(f"  Average Page MRR improvement: {avg_delta_page_mrr:+.4f}")
                print(f"  Average Text MRR improvement: {avg_delta_text_mrr:+.4f}")
                
                # Count improvements/degradations
                improved_page = sum(1 for r in sorted_expanded if r['result'].get('average_expanded_page_mrr', 0.0) > r['result']['average_page_mrr'])
                improved_text = sum(1 for r in sorted_expanded if r['result'].get('average_expanded_text_mrr', 0.0) > r['result']['average_text_mrr'])
                print(f"  Configurations improved (Page MRR): {improved_page}/{len(sorted_expanded)}")
                print(f"  Configurations improved (Text MRR): {improved_text}/{len(sorted_expanded)}")
    
    if skipped_results:
        print("\n" + "-"*100)
        print(f"SKIPPED EVALUATIONS ({len(skipped_results)})")
        print("-"*100)
        for r in skipped_results:
            exp_str = f"{r['expansion_type']}/{r['expansion_subtype']}" if r.get('expansion_type') else "baseline"
            config_str = f"{exp_str} - {r['provider']}/{r['model']}/chunk{r['chunk_size']}/k{r['k']}/{r['mode']}"
            print(f"  - {config_str}")
    
    if failed_results:
        print("\n" + "-"*100)
        print(f"FAILED EVALUATIONS ({len(failed_results)})")
        print("-"*100)
        for r in failed_results:
            exp_str = f"{r['expansion_type']}/{r['expansion_subtype']}" if r.get('expansion_type') else "baseline"
            config_str = f"{exp_str} - {r['provider']}/{r['model']}/chunk{r['chunk_size']}/k{r['k']}/{r['mode']}"
            error = r['result'].get('error', 'Unknown error')
            print(f"  - {config_str}: {error}")
    
    print("\n" + "="*100)

print("✓ Results analysis helper defined")

# %% [markdown]
# ## 9.3 List Generated Files
# 
# Helper to show all generated JSON files

# %%
def list_generated_files(output_dir: str = OUTPUT_DIR):
    """
    List all generated JSON files with their sizes.
    """
    print("\n" + "="*60)
    print("GENERATED FILES")
    print("="*60)
    
    output_path = Path(output_dir)
    json_files = sorted(output_path.glob("*.json"))
    
    print(f"\nTotal JSON files: {len(json_files)}")
    print(f"Location: {output_dir}\n")
    
    if json_files:
        # Calculate total size
        total_size = sum(f.stat().st_size for f in json_files)
        
        print(f"{'Filename':<60} {'Size':>10}")
        print("-"*72)
        
        for filepath in json_files:
            file_size = filepath.stat().st_size / 1024  # KB
            print(f"{filepath.name:<60} {file_size:>8.1f} KB")
        
        print("-"*72)
        print(f"{'TOTAL':<60} {total_size/1024:>8.1f} KB")
    else:
        print("No JSON files found.")
    
    print("\n" + "="*60)

print("✓ File listing helper defined")

# %%
print("\n" + "="*60)
print("✓ STEP 9 COMPLETE!")
print("="*60)
print("  ✓ Batch evaluation function defined")
print("  ✓ Results display helper defined")
print("  ✓ File listing helper defined")
print("  ✓ Ready for configuration and execution")
print("="*60)

✓ Batch evaluation function defined
✓ Results analysis helper defined
✓ File listing helper defined

✓ STEP 9 COMPLETE!
  ✓ Batch evaluation function defined
  ✓ Results display helper defined
  ✓ File listing helper defined
  ✓ Ready for configuration and execution


In [21]:
# ============================================================================
# Step 10: Configuration and Execution
# ============================================================================

# %% [markdown]
# ## 10.1 Define Configurations to Test
# 
# Specify which embedding models and chunk sizes to evaluate

# %%
# Define configurations to evaluate
# Each configuration specifies: provider, model, and chunk sizes to test

provider = 'voyage' # ollama, voyage
model = 'voyage-3-large' # nomic-embed-text, voyage-finance-2, voyage-3-large

configurations = [
    {
        'expansion_type': 'hyde',
        'expansion_subtype': 'basic',
        'provider': provider,
        'model': model,
        'chunk_sizes': [512]
    },
    {
        'expansion_type': 'hyde',
        'expansion_subtype': 'detailed',
        'provider': provider,
        'model': model,
        'chunk_sizes': [512]
    },
    {
        'expansion_type': 'hyde',
        'expansion_subtype': 'financial_terminology',
        'provider': provider,
        'model': model,
        'chunk_sizes': [512]
    },
    {
        'expansion_type': 'query_refinement',
        'expansion_subtype': 'clarification',
        'provider': provider,
        'model': model,
        'chunk_sizes': [512]
    },
    {
        'expansion_type': 'query_refinement',
        'expansion_subtype': 'formal',
        'provider': provider,
        'model': model,
        'chunk_sizes': [512]
    },
    {
        'expansion_type': 'query_refinement',
        'expansion_subtype': 'keyword_focused',
        'provider': provider,
        'model': model,
        'chunk_sizes': [512]
    },
    {
        'expansion_type': 'term_expansion',
        'expansion_subtype': 'abbreviation_synonym',
        'provider': provider,
        'model': model,
        'chunk_sizes': [512]
    },
    {
        'expansion_type': 'term_expansion',
        'expansion_subtype': 'context_addition',
        'provider': provider,
        'model': model,
        'chunk_sizes': [512]
    },
    {
        'expansion_type': 'chain_of_thought',
        'expansion_subtype': 'step_by_step',
        'provider': provider,
        'model': model,
        'chunk_sizes': [512]
    },
    {
        'expansion_type': 'chain_of_thought',
        'expansion_subtype': 'explicit_context',
        'provider': provider,
        'model': model,
        'chunk_sizes': [512]
    },
    {
        'expansion_type': 'domain_adaptation',
        'expansion_subtype': 'accounting_perspective',
        'provider': provider,
        'model': model,
        'chunk_sizes': [512]
    },
    {
        'expansion_type': 'domain_adaptation',
        'expansion_subtype': '10k_language',
        'provider': provider,
        'model': model,
        'chunk_sizes': [512]
    },
]

print("✓ Configurations defined")

# %% [markdown]
# ## 10.2 Define Evaluation Parameters

# %%
# K values to test (number of documents to retrieve)
# k_values = [20, 40, 60, 80]
k_values = [20]

# Modes to test
modes = ['global', 'singledoc']

# Page tolerance setting
# - True: Use chunk-size-aware page tolerance (lenient matching for large chunks)
# - False: Exact page match only (strict evaluation)
USE_PAGE_TOLERANCE = True

# Text similarity threshold
# - Chunks with cosine similarity >= this value are considered matches
# - Higher = stricter matching, Lower = more lenient matching
TEXT_SIMILARITY_THRESHOLD = 0.8

print("✓ Evaluation parameters defined")

# %% [markdown]
# ## 10.3 Display Evaluation Plan

# %%
print("\n" + "="*60)
print("EVALUATION PLAN")
print("="*60)

print(f"\nDataset: FinanceBench ({len(dataset)} queries)")
print(f"Evidence items: {len(all_evidence)}")
print(f"Pre-computed embeddings: {evidence_embeddings.shape[0]}")

print(f"\nEvaluation Settings:")
print(f"  K values: {k_values}")
print(f"  Modes: {modes}")
print(f"  Page tolerance: {'ENABLED' if USE_PAGE_TOLERANCE else 'DISABLED'}")
print(f"  Text similarity threshold: {TEXT_SIMILARITY_THRESHOLD}")

print(f"\nConfigurations to evaluate:")
total_runs = 0
for i, config in enumerate(configurations, start=1):
    expansion_type = config.get('expansion_type')
    expansion_subtype = config.get('expansion_subtype')
    provider = config['provider']
    model = config['model']
    chunk_sizes = config['chunk_sizes']
    
    runs_for_config = len(chunk_sizes) * len(k_values) * len(modes)
    total_runs += runs_for_config

    print(f"\n  {i}. {expansion_type} {expansion_subtype} {provider}/{model}")
    print(f"     Chunk sizes: {chunk_sizes}")
    print(f"     Evaluation runs: {runs_for_config}")
    
    # Show output filenames that will be generated
    print(f"     Output files:")
    for chunk_size in chunk_sizes:
        for k in k_values:
            for mode in modes:
                filename = get_output_filename(provider, model, chunk_size, k, mode, expansion_type, expansion_subtype)
                exists = check_if_results_exist(provider, model, chunk_size, k, mode, OUTPUT_DIR, expansion_type, expansion_subtype)
                status = "EXISTS" if exists else "TO CREATE"
                print(f"       - {filename} [{status}]")

print(f"\n{'='*60}")
print(f"Total evaluation runs: {total_runs}")
print(f"Output directory: {OUTPUT_DIR}")
print(f"{'='*60}")

# %% [markdown]
# ## 10.4 Execute Batch Evaluation
# 
# **IMPORTANT**: This cell will run the full evaluation.
# - Depending on configurations, this may take 30 minutes to several hours
# - Progress will be shown for each configuration
# - Results are saved incrementally (existing results are skipped)

# %%
# Run batch evaluation
print("\n" + "#"*60)
print("STARTING BATCH EVALUATION")
print("#"*60)
print("\nNOTE: This may take a while. Progress will be shown for each configuration.")
print("You can interrupt and resume later - completed evaluations will be skipped.\n")

# Uncomment the line below to run the evaluation
summary = evaluate_multiple_configurations(
    dataset=dataset,
    evidence_lookup=evidence_lookup,
    sbert_model=sbert_model,
    configurations=configurations,
    k_values=k_values,
    modes=modes,
    use_page_tolerance=USE_PAGE_TOLERANCE,
    text_similarity_threshold=TEXT_SIMILARITY_THRESHOLD,
    output_dir=OUTPUT_DIR
)

print("\n⚠️  EVALUATION NOT RUN - Uncomment the code above to execute")
print("This is intentional to prevent accidental execution during testing.")

# %% [markdown]
# ## 10.5 Display Results (Run after evaluation completes)
# 
# Uncomment and run this cell after the evaluation completes

# %%
# Display detailed results in table format
# Uncomment after evaluation completes:
display_batch_results(summary)

print("\n⚠️  Results display not run - uncomment after evaluation completes")

# %% [markdown]
# ## 10.6 List Generated Files
# 
# View all generated JSON files

# %%
# List all generated files
list_generated_files(OUTPUT_DIR)

# %% [markdown]
# ## 10.7 Load and Analyze a Single Result
# 
# Example: How to load and inspect a single result file

# %%
def load_and_inspect_result(filename: str, output_dir: str = OUTPUT_DIR):
    """
    Load and display a single result file.
    Supports both baseline and query expansion results.
    
    Args:
        filename: Name of the JSON file 
                  Baseline: "voyage_voyage-finance-2_chunk1024_k20_global.json"
                  Expanded: "hyde_basic_voyage_voyage-finance-2_chunk1024_k20_global.json"
        output_dir: Output directory
    """
    filepath = os.path.join(output_dir, filename)
    
    if not os.path.exists(filepath):
        print(f"❌ File not found: {filename}")
        return None
    
    # Load JSON
    with open(filepath, 'r') as f:
        results = json.load(f)
    
    # Extract summary (last item)
    summary = results[0]['summary']
    
    print("\n" + "="*60)
    print(f"RESULTS: {filename}")
    print("="*60)
    
    print("\nConfiguration:")
    print(f"  Provider: {summary['provider']}")
    print(f"  Model: {summary['model']}")
    print(f"  Chunk size: {summary['chunk_size']}")
    print(f"  K: {summary['k']}")
    print(f"  Mode: {summary['mode']}")
    
    # Check if this is an expanded query result
    has_expansion = 'expansion_type' in summary
    if has_expansion:
        print(f"  Query Expansion: {summary['expansion_type']}/{summary['expansion_subtype']}")
    else:
        print(f"  Query Expansion: None (baseline)")
    
    print(f"  Page tolerance: {summary['use_page_tolerance']}")
    print(f"  Text threshold: {summary['text_similarity_threshold']}")
    
    # ========================================
    # BASELINE METRICS
    # ========================================
    print("\n" + "="*60)
    print("BASELINE METRICS")
    print("="*60)
    
    print("\nPage-Based Metrics:")
    print(f"  Average MRR:       {summary['average_page_mrr']:.4f}")
    print(f"  Average Recall:    {summary['average_page_recall']:.4f}")
    print(f"  Average Precision: {summary['average_page_precision']:.4f}")
    print(f"  Average F1:        {summary['average_page_f1']:.4f}")
    
    print("\nText-Based Metrics:")
    print(f"  Average MRR:       {summary['average_text_mrr']:.4f}")
    print(f"  Average Recall:    {summary['average_text_recall']:.4f}")
    print(f"  Average Precision: {summary['average_text_precision']:.4f}")
    print(f"  Average F1:        {summary['average_text_f1']:.4f}")
    
    # ========================================
    # EXPANDED QUERY METRICS (if available)
    # ========================================
    if has_expansion:
        print("\n" + "="*60)
        print(f"EXPANDED QUERY METRICS ({summary['expansion_type']}/{summary['expansion_subtype']})")
        print("="*60)
        
        print("\nPage-Based Metrics:")
        exp_page_mrr = summary['average_expanded_page_mrr']
        exp_page_recall = summary['average_expanded_page_recall']
        exp_page_precision = summary['average_expanded_page_precision']
        exp_page_f1 = summary['average_expanded_page_f1']
        
        print(f"  Average MRR:       {exp_page_mrr:.4f} (Δ {exp_page_mrr - summary['average_page_mrr']:+.4f})")
        print(f"  Average Recall:    {exp_page_recall:.4f} (Δ {exp_page_recall - summary['average_page_recall']:+.4f})")
        print(f"  Average Precision: {exp_page_precision:.4f} (Δ {exp_page_precision - summary['average_page_precision']:+.4f})")
        print(f"  Average F1:        {exp_page_f1:.4f} (Δ {exp_page_f1 - summary['average_page_f1']:+.4f})")
        
        print("\nText-Based Metrics:")
        exp_text_mrr = summary['average_expanded_text_mrr']
        exp_text_recall = summary['average_expanded_text_recall']
        exp_text_precision = summary['average_expanded_text_precision']
        exp_text_f1 = summary['average_expanded_text_f1']
        
        print(f"  Average MRR:       {exp_text_mrr:.4f} (Δ {exp_text_mrr - summary['average_text_mrr']:+.4f})")
        print(f"  Average Recall:    {exp_text_recall:.4f} (Δ {exp_text_recall - summary['average_text_recall']:+.4f})")
        print(f"  Average Precision: {exp_text_precision:.4f} (Δ {exp_text_precision - summary['average_text_precision']:+.4f})")
        print(f"  Average F1:        {exp_text_f1:.4f} (Δ {exp_text_f1 - summary['average_text_f1']:+.4f})")
        
        # Overall improvement summary
        print("\n" + "-"*60)
        print("IMPROVEMENT SUMMARY:")
        print(f"  Page MRR improvement: {exp_page_mrr - summary['average_page_mrr']:+.4f} ({((exp_page_mrr - summary['average_page_mrr']) / summary['average_page_mrr'] * 100):+.2f}%)")
        print(f"  Text MRR improvement: {exp_text_mrr - summary['average_text_mrr']:+.4f} ({((exp_text_mrr - summary['average_text_mrr']) / summary['average_text_mrr'] * 100):+.2f}%)")
    
    print("\n" + "="*60)
    print(f"Total queries: {summary['total_queries']}")
    print(f"Total results (queries + summary): {len(results)}")
    print("="*60)
    
    # ========================================
    # SAMPLE QUERY RESULT
    # ========================================
    if len(results) > 1:
        sample_query = results[1]
        print("\n" + "="*60)
        print("SAMPLE QUERY RESULT")
        print("="*60)
        
        print(f"\nQuery ID: {sample_query['query_id']}")
        print(f"Question: {sample_query['query'][:100]}...")
        
        # Baseline results
        print("\nBASELINE Retrieval:")
        print(f"  Page MRR: {sample_query['page_mrr_score']:.4f}, Rank: {sample_query['page_rank']}")
        print(f"  Text MRR: {sample_query['text_mrr_score']:.4f}, Rank: {sample_query['text_rank']}")
        print(f"  Retrieved docs: {len(sample_query['retrieved_docs'])}")
        
        if len(sample_query['retrieved_docs']) > 0:
            first_doc = sample_query['retrieved_docs'][0]
            print(f"\n  First retrieved doc:")
            print(f"    Doc: {first_doc['doc_name']}, Page: {first_doc['page_number']}")
            print(f"    Chunk text: {first_doc['chunk_text'][:100]}...")
            if len(first_doc['text_similarities']) > 0:
                print(f"    Text similarity with evidence 0: {first_doc['text_similarities'][0]['cosine_similarity']:.4f}")
        
        # Expanded query results (if available)
        if has_expansion and 'expanded_query' in sample_query:
            print("\nEXPANDED Query:")
            print(f"  Expanded question: {sample_query['expanded_query'][:100]}...")
            print(f"\nEXPANDED Retrieval:")
            print(f"  Page MRR: {sample_query['expanded_page_mrr_score']:.4f}, Rank: {sample_query['expanded_page_rank']}")
            print(f"  Text MRR: {sample_query['expanded_text_mrr_score']:.4f}, Rank: {sample_query['expanded_text_rank']}")
            print(f"  Retrieved docs: {len(sample_query['expanded_retrieved_docs'])}")
            
            if len(sample_query['expanded_retrieved_docs']) > 0:
                first_doc_exp = sample_query['expanded_retrieved_docs'][0]
                print(f"\n  First retrieved doc (expanded):")
                print(f"    Doc: {first_doc_exp['doc_name']}, Page: {first_doc_exp['page_number']}")
                print(f"    Chunk text: {first_doc_exp['chunk_text'][:100]}...")
                if len(first_doc_exp['text_similarities']) > 0:
                    print(f"    Text similarity with evidence 0: {first_doc_exp['text_similarities'][0]['cosine_similarity']:.4f}")
            
            # Show improvement for this query
            page_delta = sample_query['expanded_page_mrr_score'] - sample_query['page_mrr_score']
            text_delta = sample_query['expanded_text_mrr_score'] - sample_query['text_mrr_score']
            print(f"\n  Improvement:")
            print(f"    Page MRR: {page_delta:+.4f}")
            print(f"    Text MRR: {text_delta:+.4f}")
    
    print("\n" + "="*60)
    
    return results

print("✓ Result inspection function defined")

# Example usage (uncomment to use):
# results = load_and_inspect_result("voyage_voyage-finance-2_chunk1024_k20_global.json")

# %%
print("\n" + "="*60)
print("✓ STEP 10 COMPLETE!")
print("="*60)
print("  ✓ Configurations defined")
print("  ✓ Evaluation parameters set")
print("  ✓ Evaluation plan displayed")
print("  ✓ Batch evaluation ready (uncomment to run)")
print("  ✓ Result analysis tools ready")
print("="*60)

# %%
print("\n" + "="*80)
print("🎉 ALL STEPS COMPLETE! 🎉")
print("="*80)
print("\n✅ SETUP COMPLETE:")
print("  ✓ Step 1: Imports and configuration")
print("  ✓ Step 2: Sentence-BERT model loaded")
print("  ✓ Step 3: Evidence embeddings pre-computed")
print("  ✓ Step 4: Vector store loading functions")
print("  ✓ Step 5: Page-based evaluation functions")
print("  ✓ Step 6: Text-based evaluation functions")
print("  ✓ Step 7: Retrieval functions")
print("  ✓ Step 8: Main evaluation function")
print("  ✓ Step 9: Batch evaluation function")
print("  ✓ Step 10: Configuration and execution ready")

print("\n📊 EVALUATION CAPABILITIES:")
print("  ✓ Page-based metrics: MRR, Recall, Precision, F1")
print("  ✓ Text-based metrics: MRR, Recall, Precision, F1")
print("  ✓ Both global and single-document modes")
print("  ✓ Comprehensive JSON output with all similarities")
print(f"  ✓ Text similarity threshold: {TEXT_SIMILARITY_THRESHOLD}")
print(f"  ✓ Pre-computed embeddings: {evidence_embeddings.shape[0]} evidence items")

print("\n🚀 NEXT STEPS:")
print("  1. Review the evaluation plan above")
print("  2. Uncomment the evaluation code in section 10.4")
print("  3. Run the batch evaluation (may take 30+ minutes)")
print("  4. After completion, uncomment section 10.5 to view results")
print("  5. Use section 10.7 to inspect individual result files")

print("\n💾 OUTPUT:")
print(f"  Location: {OUTPUT_DIR}")
print(f"  Format: JSON files with complete metrics and similarities")
print(f"  Naming: {{provider}}_{{model}}_chunk{{size}}_k{{k}}_{{mode}}.json")

print("\n⚠️  IMPORTANT NOTES:")
print("  - Evaluation runs incrementally (existing results are skipped)")
print("  - You can interrupt and resume anytime")
print("  - Progress is shown for each configuration")
print("  - Each query processes text similarities (slowest part)")
print("  - Results are saved immediately after each configuration")

print("\n" + "="*80)
print("Ready to evaluate! Uncomment section 10.4 when ready to start.")
print("="*80)

✓ Configurations defined
✓ Evaluation parameters defined

EVALUATION PLAN

Dataset: FinanceBench (150 queries)
Evidence items: 189
Pre-computed embeddings: 189

Evaluation Settings:
  K values: [20]
  Modes: ['global', 'singledoc']
  Page tolerance: ENABLED
  Text similarity threshold: 0.8

Configurations to evaluate:

  1. hyde basic voyage/voyage-3-large
     Chunk sizes: [512]
     Evaluation runs: 2
     Output files:
       - hyde_basic_voyage_voyage-3-large_chunk512_k20_global.json [TO CREATE]
       - hyde_basic_voyage_voyage-3-large_chunk512_k20_singledoc.json [TO CREATE]

  2. hyde detailed voyage/voyage-3-large
     Chunk sizes: [512]
     Evaluation runs: 2
     Output files:
       - hyde_detailed_voyage_voyage-3-large_chunk512_k20_global.json [TO CREATE]
       - hyde_detailed_voyage_voyage-3-large_chunk512_k20_singledoc.json [TO CREATE]

  3. hyde financial_terminology voyage/voyage-3-large
     Chunk sizes: [512]
     Evaluation runs: 2
     Output files:
       - hyde_f

Queries:   0%|          | 0/150 [00:00<?, ?it/s]

✓ Saved: hyde_basic_voyage_voyage-3-large_chunk512_k20_global.json

RESULTS SUMMARY

BASELINE - Page-Based Metrics:
  Average MRR:       0.5177
  Average Recall:    0.8267
  Average Precision: 0.0640
  Average F1:        0.1173

BASELINE - Text-Based Metrics:
  Average MRR:       0.3774
  Average Recall:    0.6322
  Average Precision: 0.0533
  Average F1:        0.0958

EXPANDED (hyde/basic) - Page-Based Metrics:
  Average MRR:       0.4130 (Δ -0.1048)
  Average Recall:    0.7322 (Δ -0.0944)
  Average Precision: 0.0573 (Δ -0.0067)
  Average F1:        0.1049 (Δ -0.0125)

EXPANDED (hyde/basic) - Text-Based Metrics:
  Average MRR:       0.2736 (Δ -0.1038)
  Average Recall:    0.5500 (Δ -0.0822)
  Average Precision: 0.0397 (Δ -0.0137)
  Average F1:        0.0730 (Δ -0.0228)

############################################################
CONFIGURATION 2/24
############################################################

EVALUATING: voyage/voyage-3-large
  Chunk size: 512
  K: 20
  Mode: singled

Queries:   0%|          | 0/150 [00:00<?, ?it/s]

✓ Saved: hyde_basic_voyage_voyage-3-large_chunk512_k20_singledoc.json

RESULTS SUMMARY

BASELINE - Page-Based Metrics:
  Average MRR:       0.5935
  Average Recall:    0.9267
  Average Precision: 0.1011
  Average F1:        0.1729

BASELINE - Text-Based Metrics:
  Average MRR:       0.4349
  Average Recall:    0.7044
  Average Precision: 0.0737
  Average F1:        0.1286

EXPANDED (hyde/basic) - Page-Based Metrics:
  Average MRR:       0.5171 (Δ -0.0764)
  Average Recall:    0.8756 (Δ -0.0511)
  Average Precision: 0.0964 (Δ -0.0048)
  Average F1:        0.1650 (Δ -0.0079)

EXPANDED (hyde/basic) - Text-Based Metrics:
  Average MRR:       0.3568 (Δ -0.0782)
  Average Recall:    0.6578 (Δ -0.0467)
  Average Precision: 0.0624 (Δ -0.0112)
  Average F1:        0.1106 (Δ -0.0179)

############################################################
CONFIGURATION 3/24
############################################################

EVALUATING: voyage/voyage-3-large
  Chunk size: 512
  K: 20
  Mode: glob

Queries:   0%|          | 0/150 [00:00<?, ?it/s]

✓ Saved: hyde_detailed_voyage_voyage-3-large_chunk512_k20_global.json

RESULTS SUMMARY

BASELINE - Page-Based Metrics:
  Average MRR:       0.5179
  Average Recall:    0.8267
  Average Precision: 0.0640
  Average F1:        0.1173

BASELINE - Text-Based Metrics:
  Average MRR:       0.3776
  Average Recall:    0.6322
  Average Precision: 0.0533
  Average F1:        0.0958

EXPANDED (hyde/detailed) - Page-Based Metrics:
  Average MRR:       0.3837 (Δ -0.1342)
  Average Recall:    0.7311 (Δ -0.0956)
  Average Precision: 0.0560 (Δ -0.0080)
  Average F1:        0.1028 (Δ -0.0145)

EXPANDED (hyde/detailed) - Text-Based Metrics:
  Average MRR:       0.2413 (Δ -0.1364)
  Average Recall:    0.5333 (Δ -0.0989)
  Average Precision: 0.0377 (Δ -0.0157)
  Average F1:        0.0694 (Δ -0.0264)

############################################################
CONFIGURATION 4/24
############################################################

EVALUATING: voyage/voyage-3-large
  Chunk size: 512
  K: 20
  Mode

Queries:   0%|          | 0/150 [00:00<?, ?it/s]

✓ Saved: hyde_detailed_voyage_voyage-3-large_chunk512_k20_singledoc.json

RESULTS SUMMARY

BASELINE - Page-Based Metrics:
  Average MRR:       0.5933
  Average Recall:    0.9267
  Average Precision: 0.1011
  Average F1:        0.1729

BASELINE - Text-Based Metrics:
  Average MRR:       0.4347
  Average Recall:    0.7044
  Average Precision: 0.0737
  Average F1:        0.1286

EXPANDED (hyde/detailed) - Page-Based Metrics:
  Average MRR:       0.4947 (Δ -0.0986)
  Average Recall:    0.8433 (Δ -0.0833)
  Average Precision: 0.0890 (Δ -0.0122)
  Average F1:        0.1551 (Δ -0.0178)

EXPANDED (hyde/detailed) - Text-Based Metrics:
  Average MRR:       0.3147 (Δ -0.1200)
  Average Recall:    0.6144 (Δ -0.0900)
  Average Precision: 0.0584 (Δ -0.0153)
  Average F1:        0.1034 (Δ -0.0252)

############################################################
CONFIGURATION 5/24
############################################################

EVALUATING: voyage/voyage-3-large
  Chunk size: 512
  K: 20
  M

Queries:   0%|          | 0/150 [00:00<?, ?it/s]

✓ Saved: hyde_financial_terminology_voyage_voyage-3-large_chunk512_k20_global.json

RESULTS SUMMARY

BASELINE - Page-Based Metrics:
  Average MRR:       0.5177
  Average Recall:    0.8267
  Average Precision: 0.0640
  Average F1:        0.1173

BASELINE - Text-Based Metrics:
  Average MRR:       0.3774
  Average Recall:    0.6322
  Average Precision: 0.0533
  Average F1:        0.0958

EXPANDED (hyde/financial_terminology) - Page-Based Metrics:
  Average MRR:       0.3939 (Δ -0.1239)
  Average Recall:    0.7144 (Δ -0.1122)
  Average Precision: 0.0553 (Δ -0.0087)
  Average F1:        0.1016 (Δ -0.0158)

EXPANDED (hyde/financial_terminology) - Text-Based Metrics:
  Average MRR:       0.2469 (Δ -0.1305)
  Average Recall:    0.5233 (Δ -0.1089)
  Average Precision: 0.0350 (Δ -0.0183)
  Average F1:        0.0648 (Δ -0.0310)

############################################################
CONFIGURATION 6/24
############################################################

EVALUATING: voyage/voyage-3

Queries:   0%|          | 0/150 [00:00<?, ?it/s]

✓ Saved: hyde_financial_terminology_voyage_voyage-3-large_chunk512_k20_singledoc.json

RESULTS SUMMARY

BASELINE - Page-Based Metrics:
  Average MRR:       0.5933
  Average Recall:    0.9267
  Average Precision: 0.1011
  Average F1:        0.1729

BASELINE - Text-Based Metrics:
  Average MRR:       0.4347
  Average Recall:    0.7044
  Average Precision: 0.0737
  Average F1:        0.1286

EXPANDED (hyde/financial_terminology) - Page-Based Metrics:
  Average MRR:       0.4964 (Δ -0.0969)
  Average Recall:    0.8567 (Δ -0.0700)
  Average Precision: 0.0964 (Δ -0.0048)
  Average F1:        0.1643 (Δ -0.0086)

EXPANDED (hyde/financial_terminology) - Text-Based Metrics:
  Average MRR:       0.3191 (Δ -0.1157)
  Average Recall:    0.6489 (Δ -0.0556)
  Average Precision: 0.0591 (Δ -0.0146)
  Average F1:        0.1053 (Δ -0.0233)

############################################################
CONFIGURATION 7/24
############################################################

EVALUATING: voyage/voyag

Queries:   0%|          | 0/150 [00:00<?, ?it/s]

✓ Saved: query_refinement_clarification_voyage_voyage-3-large_chunk512_k20_global.json

RESULTS SUMMARY

BASELINE - Page-Based Metrics:
  Average MRR:       0.5177
  Average Recall:    0.8267
  Average Precision: 0.0640
  Average F1:        0.1173

BASELINE - Text-Based Metrics:
  Average MRR:       0.3774
  Average Recall:    0.6322
  Average Precision: 0.0533
  Average F1:        0.0958

EXPANDED (query_refinement/clarification) - Page-Based Metrics:
  Average MRR:       0.5220 (Δ +0.0043)
  Average Recall:    0.8189 (Δ -0.0078)
  Average Precision: 0.0627 (Δ -0.0013)
  Average F1:        0.1150 (Δ -0.0023)

EXPANDED (query_refinement/clarification) - Text-Based Metrics:
  Average MRR:       0.3820 (Δ +0.0045)
  Average Recall:    0.6156 (Δ -0.0167)
  Average Precision: 0.0530 (Δ -0.0003)
  Average F1:        0.0948 (Δ -0.0010)

############################################################
CONFIGURATION 8/24
############################################################

EVALUATING: voy

Queries:   0%|          | 0/150 [00:00<?, ?it/s]

✓ Saved: query_refinement_clarification_voyage_voyage-3-large_chunk512_k20_singledoc.json

RESULTS SUMMARY

BASELINE - Page-Based Metrics:
  Average MRR:       0.5933
  Average Recall:    0.9267
  Average Precision: 0.1011
  Average F1:        0.1729

BASELINE - Text-Based Metrics:
  Average MRR:       0.4347
  Average Recall:    0.7044
  Average Precision: 0.0737
  Average F1:        0.1286

EXPANDED (query_refinement/clarification) - Page-Based Metrics:
  Average MRR:       0.6007 (Δ +0.0074)
  Average Recall:    0.9144 (Δ -0.0122)
  Average Precision: 0.0969 (Δ -0.0042)
  Average F1:        0.1656 (Δ -0.0073)

EXPANDED (query_refinement/clarification) - Text-Based Metrics:
  Average MRR:       0.4415 (Δ +0.0067)
  Average Recall:    0.6844 (Δ -0.0200)
  Average Precision: 0.0732 (Δ -0.0005)
  Average F1:        0.1273 (Δ -0.0012)

############################################################
CONFIGURATION 9/24
############################################################

EVALUATING: 

Queries:   0%|          | 0/150 [00:00<?, ?it/s]

✓ Saved: query_refinement_formal_voyage_voyage-3-large_chunk512_k20_global.json

RESULTS SUMMARY

BASELINE - Page-Based Metrics:
  Average MRR:       0.5177
  Average Recall:    0.8267
  Average Precision: 0.0640
  Average F1:        0.1173

BASELINE - Text-Based Metrics:
  Average MRR:       0.3774
  Average Recall:    0.6322
  Average Precision: 0.0533
  Average F1:        0.0958

EXPANDED (query_refinement/formal) - Page-Based Metrics:
  Average MRR:       0.5594 (Δ +0.0417)
  Average Recall:    0.8356 (Δ +0.0089)
  Average Precision: 0.0637 (Δ -0.0003)
  Average F1:        0.1168 (Δ -0.0005)

EXPANDED (query_refinement/formal) - Text-Based Metrics:
  Average MRR:       0.4090 (Δ +0.0315)
  Average Recall:    0.6256 (Δ -0.0067)
  Average Precision: 0.0553 (Δ +0.0020)
  Average F1:        0.0989 (Δ +0.0031)

############################################################
CONFIGURATION 10/24
############################################################

EVALUATING: voyage/voyage-3-large
 

Queries:   0%|          | 0/150 [00:00<?, ?it/s]

✓ Saved: query_refinement_formal_voyage_voyage-3-large_chunk512_k20_singledoc.json

RESULTS SUMMARY

BASELINE - Page-Based Metrics:
  Average MRR:       0.5944
  Average Recall:    0.9267
  Average Precision: 0.1011
  Average F1:        0.1729

BASELINE - Text-Based Metrics:
  Average MRR:       0.4347
  Average Recall:    0.7044
  Average Precision: 0.0737
  Average F1:        0.1286

EXPANDED (query_refinement/formal) - Page-Based Metrics:
  Average MRR:       0.6282 (Δ +0.0338)
  Average Recall:    0.9300 (Δ +0.0033)
  Average Precision: 0.0978 (Δ -0.0034)
  Average F1:        0.1683 (Δ -0.0046)

EXPANDED (query_refinement/formal) - Text-Based Metrics:
  Average MRR:       0.4567 (Δ +0.0220)
  Average Recall:    0.6978 (Δ -0.0067)
  Average Precision: 0.0751 (Δ +0.0014)
  Average F1:        0.1306 (Δ +0.0020)

############################################################
CONFIGURATION 11/24
############################################################

EVALUATING: voyage/voyage-3-larg

Queries:   0%|          | 0/150 [00:00<?, ?it/s]

✓ Saved: query_refinement_keyword_focused_voyage_voyage-3-large_chunk512_k20_global.json

RESULTS SUMMARY

BASELINE - Page-Based Metrics:
  Average MRR:       0.5177
  Average Recall:    0.8267
  Average Precision: 0.0640
  Average F1:        0.1173

BASELINE - Text-Based Metrics:
  Average MRR:       0.3774
  Average Recall:    0.6322
  Average Precision: 0.0533
  Average F1:        0.0958

EXPANDED (query_refinement/keyword_focused) - Page-Based Metrics:
  Average MRR:       0.4727 (Δ -0.0451)
  Average Recall:    0.8222 (Δ -0.0044)
  Average Precision: 0.0627 (Δ -0.0013)
  Average F1:        0.1150 (Δ -0.0023)

EXPANDED (query_refinement/keyword_focused) - Text-Based Metrics:
  Average MRR:       0.3543 (Δ -0.0231)
  Average Recall:    0.6389 (Δ +0.0067)
  Average Precision: 0.0540 (Δ +0.0007)
  Average F1:        0.0969 (Δ +0.0011)

############################################################
CONFIGURATION 12/24
############################################################

EVALUATI

Queries:   0%|          | 0/150 [00:00<?, ?it/s]

✓ Saved: query_refinement_keyword_focused_voyage_voyage-3-large_chunk512_k20_singledoc.json

RESULTS SUMMARY

BASELINE - Page-Based Metrics:
  Average MRR:       0.5933
  Average Recall:    0.9267
  Average Precision: 0.1011
  Average F1:        0.1729

BASELINE - Text-Based Metrics:
  Average MRR:       0.4347
  Average Recall:    0.7044
  Average Precision: 0.0737
  Average F1:        0.1286

EXPANDED (query_refinement/keyword_focused) - Page-Based Metrics:
  Average MRR:       0.5650 (Δ -0.0283)
  Average Recall:    0.9211 (Δ -0.0056)
  Average Precision: 0.0974 (Δ -0.0037)
  Average F1:        0.1678 (Δ -0.0051)

EXPANDED (query_refinement/keyword_focused) - Text-Based Metrics:
  Average MRR:       0.4172 (Δ -0.0176)
  Average Recall:    0.7000 (Δ -0.0044)
  Average Precision: 0.0748 (Δ +0.0011)
  Average F1:        0.1301 (Δ +0.0015)

############################################################
CONFIGURATION 13/24
############################################################

EVALU

Queries:   0%|          | 0/150 [00:00<?, ?it/s]

✓ Saved: term_expansion_abbreviation_synonym_voyage_voyage-3-large_chunk512_k20_global.json

RESULTS SUMMARY

BASELINE - Page-Based Metrics:
  Average MRR:       0.5177
  Average Recall:    0.8267
  Average Precision: 0.0640
  Average F1:        0.1173

BASELINE - Text-Based Metrics:
  Average MRR:       0.3774
  Average Recall:    0.6322
  Average Precision: 0.0533
  Average F1:        0.0958

EXPANDED (term_expansion/abbreviation_synonym) - Page-Based Metrics:
  Average MRR:       0.5633 (Δ +0.0455)
  Average Recall:    0.8478 (Δ +0.0211)
  Average Precision: 0.0643 (Δ +0.0003)
  Average F1:        0.1180 (Δ +0.0007)

EXPANDED (term_expansion/abbreviation_synonym) - Text-Based Metrics:
  Average MRR:       0.4143 (Δ +0.0369)
  Average Recall:    0.6256 (Δ -0.0067)
  Average Precision: 0.0533 (Δ +0.0000)
  Average F1:        0.0956 (Δ -0.0002)

############################################################
CONFIGURATION 14/24
############################################################


Queries:   0%|          | 0/150 [00:00<?, ?it/s]

✓ Saved: term_expansion_abbreviation_synonym_voyage_voyage-3-large_chunk512_k20_singledoc.json

RESULTS SUMMARY

BASELINE - Page-Based Metrics:
  Average MRR:       0.5933
  Average Recall:    0.9267
  Average Precision: 0.1011
  Average F1:        0.1729

BASELINE - Text-Based Metrics:
  Average MRR:       0.4347
  Average Recall:    0.7044
  Average Precision: 0.0737
  Average F1:        0.1286

EXPANDED (term_expansion/abbreviation_synonym) - Page-Based Metrics:
  Average MRR:       0.6355 (Δ +0.0422)
  Average Recall:    0.9111 (Δ -0.0156)
  Average Precision: 0.0955 (Δ -0.0056)
  Average F1:        0.1655 (Δ -0.0074)

EXPANDED (term_expansion/abbreviation_synonym) - Text-Based Metrics:
  Average MRR:       0.4678 (Δ +0.0331)
  Average Recall:    0.6922 (Δ -0.0122)
  Average Precision: 0.0726 (Δ -0.0010)
  Average F1:        0.1270 (Δ -0.0015)

############################################################
CONFIGURATION 15/24
##########################################################

Queries:   0%|          | 0/150 [00:00<?, ?it/s]

✓ Saved: term_expansion_context_addition_voyage_voyage-3-large_chunk512_k20_global.json

RESULTS SUMMARY

BASELINE - Page-Based Metrics:
  Average MRR:       0.5177
  Average Recall:    0.8267
  Average Precision: 0.0640
  Average F1:        0.1173

BASELINE - Text-Based Metrics:
  Average MRR:       0.3774
  Average Recall:    0.6322
  Average Precision: 0.0533
  Average F1:        0.0958

EXPANDED (term_expansion/context_addition) - Page-Based Metrics:
  Average MRR:       0.5283 (Δ +0.0106)
  Average Recall:    0.8278 (Δ +0.0011)
  Average Precision: 0.0637 (Δ -0.0003)
  Average F1:        0.1165 (Δ -0.0008)

EXPANDED (term_expansion/context_addition) - Text-Based Metrics:
  Average MRR:       0.3878 (Δ +0.0104)
  Average Recall:    0.6256 (Δ -0.0067)
  Average Precision: 0.0517 (Δ -0.0017)
  Average F1:        0.0931 (Δ -0.0027)

############################################################
CONFIGURATION 16/24
############################################################

EVALUATING:

Queries:   0%|          | 0/150 [00:00<?, ?it/s]

✓ Saved: term_expansion_context_addition_voyage_voyage-3-large_chunk512_k20_singledoc.json

RESULTS SUMMARY

BASELINE - Page-Based Metrics:
  Average MRR:       0.5933
  Average Recall:    0.9267
  Average Precision: 0.1011
  Average F1:        0.1729

BASELINE - Text-Based Metrics:
  Average MRR:       0.4347
  Average Recall:    0.7044
  Average Precision: 0.0737
  Average F1:        0.1286

EXPANDED (term_expansion/context_addition) - Page-Based Metrics:
  Average MRR:       0.5880 (Δ -0.0052)
  Average Recall:    0.9300 (Δ +0.0033)
  Average Precision: 0.1028 (Δ +0.0016)
  Average F1:        0.1753 (Δ +0.0024)

EXPANDED (term_expansion/context_addition) - Text-Based Metrics:
  Average MRR:       0.4361 (Δ +0.0015)
  Average Recall:    0.7111 (Δ +0.0067)
  Average Precision: 0.0736 (Δ -0.0000)
  Average F1:        0.1288 (Δ +0.0002)

############################################################
CONFIGURATION 17/24
############################################################

EVALUATI

Queries:   0%|          | 0/150 [00:00<?, ?it/s]

✓ Saved: chain_of_thought_step_by_step_voyage_voyage-3-large_chunk512_k20_global.json

RESULTS SUMMARY

BASELINE - Page-Based Metrics:
  Average MRR:       0.5177
  Average Recall:    0.8267
  Average Precision: 0.0640
  Average F1:        0.1173

BASELINE - Text-Based Metrics:
  Average MRR:       0.3774
  Average Recall:    0.6322
  Average Precision: 0.0533
  Average F1:        0.0958

EXPANDED (chain_of_thought/step_by_step) - Page-Based Metrics:
  Average MRR:       0.3768 (Δ -0.1410)
  Average Recall:    0.7322 (Δ -0.0944)
  Average Precision: 0.0553 (Δ -0.0087)
  Average F1:        0.1017 (Δ -0.0157)

EXPANDED (chain_of_thought/step_by_step) - Text-Based Metrics:
  Average MRR:       0.2456 (Δ -0.1318)
  Average Recall:    0.5500 (Δ -0.0822)
  Average Precision: 0.0393 (Δ -0.0140)
  Average F1:        0.0723 (Δ -0.0235)

############################################################
CONFIGURATION 18/24
############################################################

EVALUATING: voyag

Queries:   0%|          | 0/150 [00:00<?, ?it/s]

✓ Saved: chain_of_thought_step_by_step_voyage_voyage-3-large_chunk512_k20_singledoc.json

RESULTS SUMMARY

BASELINE - Page-Based Metrics:
  Average MRR:       0.5931
  Average Recall:    0.9267
  Average Precision: 0.1011
  Average F1:        0.1729

BASELINE - Text-Based Metrics:
  Average MRR:       0.4347
  Average Recall:    0.7044
  Average Precision: 0.0737
  Average F1:        0.1286

EXPANDED (chain_of_thought/step_by_step) - Page-Based Metrics:
  Average MRR:       0.4717 (Δ -0.1215)
  Average Recall:    0.8544 (Δ -0.0722)
  Average Precision: 0.0909 (Δ -0.0103)
  Average F1:        0.1572 (Δ -0.0157)

EXPANDED (chain_of_thought/step_by_step) - Text-Based Metrics:
  Average MRR:       0.3182 (Δ -0.1165)
  Average Recall:    0.6478 (Δ -0.0567)
  Average Precision: 0.0624 (Δ -0.0113)
  Average F1:        0.1105 (Δ -0.0181)

############################################################
CONFIGURATION 19/24
############################################################

EVALUATING: vo

Queries:   0%|          | 0/150 [00:00<?, ?it/s]

✓ Saved: chain_of_thought_explicit_context_voyage_voyage-3-large_chunk512_k20_global.json

RESULTS SUMMARY

BASELINE - Page-Based Metrics:
  Average MRR:       0.5175
  Average Recall:    0.8267
  Average Precision: 0.0640
  Average F1:        0.1173

BASELINE - Text-Based Metrics:
  Average MRR:       0.3774
  Average Recall:    0.6322
  Average Precision: 0.0533
  Average F1:        0.0958

EXPANDED (chain_of_thought/explicit_context) - Page-Based Metrics:
  Average MRR:       0.5190 (Δ +0.0015)
  Average Recall:    0.8289 (Δ +0.0022)
  Average Precision: 0.0640 (Δ +0.0000)
  Average F1:        0.1173 (Δ -0.0000)

EXPANDED (chain_of_thought/explicit_context) - Text-Based Metrics:
  Average MRR:       0.3931 (Δ +0.0157)
  Average Recall:    0.6222 (Δ -0.0100)
  Average Precision: 0.0523 (Δ -0.0010)
  Average F1:        0.0941 (Δ -0.0017)

############################################################
CONFIGURATION 20/24
############################################################

EVALU

Queries:   0%|          | 0/150 [00:00<?, ?it/s]

✓ Saved: chain_of_thought_explicit_context_voyage_voyage-3-large_chunk512_k20_singledoc.json

RESULTS SUMMARY

BASELINE - Page-Based Metrics:
  Average MRR:       0.5933
  Average Recall:    0.9267
  Average Precision: 0.1011
  Average F1:        0.1729

BASELINE - Text-Based Metrics:
  Average MRR:       0.4347
  Average Recall:    0.7044
  Average Precision: 0.0737
  Average F1:        0.1286

EXPANDED (chain_of_thought/explicit_context) - Page-Based Metrics:
  Average MRR:       0.5879 (Δ -0.0054)
  Average Recall:    0.8967 (Δ -0.0300)
  Average Precision: 0.0960 (Δ -0.0052)
  Average F1:        0.1636 (Δ -0.0093)

EXPANDED (chain_of_thought/explicit_context) - Text-Based Metrics:
  Average MRR:       0.4427 (Δ +0.0079)
  Average Recall:    0.6844 (Δ -0.0200)
  Average Precision: 0.0728 (Δ -0.0008)
  Average F1:        0.1267 (Δ -0.0019)

############################################################
CONFIGURATION 21/24
############################################################

EV

Queries:   0%|          | 0/150 [00:00<?, ?it/s]

✓ Saved: domain_adaptation_accounting_perspective_voyage_voyage-3-large_chunk512_k20_global.json

RESULTS SUMMARY

BASELINE - Page-Based Metrics:
  Average MRR:       0.5179
  Average Recall:    0.8267
  Average Precision: 0.0640
  Average F1:        0.1173

BASELINE - Text-Based Metrics:
  Average MRR:       0.3774
  Average Recall:    0.6322
  Average Precision: 0.0533
  Average F1:        0.0958

EXPANDED (domain_adaptation/accounting_perspective) - Page-Based Metrics:
  Average MRR:       0.5088 (Δ -0.0091)
  Average Recall:    0.8500 (Δ +0.0233)
  Average Precision: 0.0637 (Δ -0.0003)
  Average F1:        0.1171 (Δ -0.0002)

EXPANDED (domain_adaptation/accounting_perspective) - Text-Based Metrics:
  Average MRR:       0.3686 (Δ -0.0088)
  Average Recall:    0.6344 (Δ +0.0022)
  Average Precision: 0.0540 (Δ +0.0007)
  Average F1:        0.0968 (Δ +0.0010)

############################################################
CONFIGURATION 22/24
##############################################

Queries:   0%|          | 0/150 [00:00<?, ?it/s]

✓ Saved: domain_adaptation_accounting_perspective_voyage_voyage-3-large_chunk512_k20_singledoc.json

RESULTS SUMMARY

BASELINE - Page-Based Metrics:
  Average MRR:       0.5933
  Average Recall:    0.9267
  Average Precision: 0.1011
  Average F1:        0.1728

BASELINE - Text-Based Metrics:
  Average MRR:       0.4347
  Average Recall:    0.7044
  Average Precision: 0.0736
  Average F1:        0.1285

EXPANDED (domain_adaptation/accounting_perspective) - Page-Based Metrics:
  Average MRR:       0.5959 (Δ +0.0027)
  Average Recall:    0.9200 (Δ -0.0067)
  Average Precision: 0.0983 (Δ -0.0027)
  Average F1:        0.1698 (Δ -0.0030)

EXPANDED (domain_adaptation/accounting_perspective) - Text-Based Metrics:
  Average MRR:       0.4315 (Δ -0.0032)
  Average Recall:    0.6911 (Δ -0.0133)
  Average Precision: 0.0759 (Δ +0.0023)
  Average F1:        0.1321 (Δ +0.0036)

############################################################
CONFIGURATION 23/24
###########################################

Queries:   0%|          | 0/150 [00:00<?, ?it/s]

✓ Saved: domain_adaptation_10k_language_voyage_voyage-3-large_chunk512_k20_global.json

RESULTS SUMMARY

BASELINE - Page-Based Metrics:
  Average MRR:       0.5211
  Average Recall:    0.8267
  Average Precision: 0.0640
  Average F1:        0.1173

BASELINE - Text-Based Metrics:
  Average MRR:       0.3774
  Average Recall:    0.6322
  Average Precision: 0.0533
  Average F1:        0.0958

EXPANDED (domain_adaptation/10k_language) - Page-Based Metrics:
  Average MRR:       0.4928 (Δ -0.0283)
  Average Recall:    0.7956 (Δ -0.0311)
  Average Precision: 0.0587 (Δ -0.0053)
  Average F1:        0.1080 (Δ -0.0094)

EXPANDED (domain_adaptation/10k_language) - Text-Based Metrics:
  Average MRR:       0.3593 (Δ -0.0181)
  Average Recall:    0.5956 (Δ -0.0367)
  Average Precision: 0.0507 (Δ -0.0027)
  Average F1:        0.0911 (Δ -0.0047)

############################################################
CONFIGURATION 24/24
############################################################

EVALUATING: vo

Queries:   0%|          | 0/150 [00:00<?, ?it/s]

✓ Saved: domain_adaptation_10k_language_voyage_voyage-3-large_chunk512_k20_singledoc.json

RESULTS SUMMARY

BASELINE - Page-Based Metrics:
  Average MRR:       0.5933
  Average Recall:    0.9267
  Average Precision: 0.1009
  Average F1:        0.1726

BASELINE - Text-Based Metrics:
  Average MRR:       0.4347
  Average Recall:    0.7044
  Average Precision: 0.0735
  Average F1:        0.1284

EXPANDED (domain_adaptation/10k_language) - Page-Based Metrics:
  Average MRR:       0.5708 (Δ -0.0225)
  Average Recall:    0.9078 (Δ -0.0189)
  Average Precision: 0.0948 (Δ -0.0062)
  Average F1:        0.1636 (Δ -0.0091)

EXPANDED (domain_adaptation/10k_language) - Text-Based Metrics:
  Average MRR:       0.4164 (Δ -0.0183)
  Average Recall:    0.6778 (Δ -0.0267)
  Average Precision: 0.0728 (Δ -0.0007)
  Average F1:        0.1269 (Δ -0.0014)

BATCH EVALUATION SUMMARY
Total runs: 24
Completed: 24
Skipped: 0
Failed: 0
Total time: 70.44 minutes
Average time per run: 176.10 seconds

⚠️  EVALUATION 

In [None]:
# Example usage (uncomment to use):
#results = load_and_inspect_result("hyde_detailed_ollama_nomic-embed-text_chunk512_k20_global.json")


RESULTS: hyde_detailed_ollama_nomic-embed-text_chunk512_k20_global.json

Configuration:
  Provider: ollama
  Model: nomic-embed-text
  Chunk size: 512
  K: 20
  Mode: global
  Query Expansion: hyde/detailed
  Page tolerance: True
  Text threshold: 0.8

BASELINE METRICS

Page-Based Metrics:
  Average MRR:       0.2456
  Average Recall:    0.5156
  Average Precision: 0.0373
  Average F1:        0.0687

Text-Based Metrics:
  Average MRR:       0.1921
  Average Recall:    0.4533
  Average Precision: 0.0427
  Average F1:        0.0748

EXPANDED QUERY METRICS (hyde/detailed)

Page-Based Metrics:
  Average MRR:       0.2663 (Δ +0.0206)
  Average Recall:    0.5500 (Δ +0.0344)
  Average Precision: 0.0403 (Δ +0.0030)
  Average F1:        0.0744 (Δ +0.0057)

Text-Based Metrics:
  Average MRR:       0.1896 (Δ -0.0024)
  Average Recall:    0.4467 (Δ -0.0067)
  Average Precision: 0.0477 (Δ +0.0050)
  Average F1:        0.0820 (Δ +0.0072)

------------------------------------------------------------

In [19]:
# ============================================================================
# Query Expansion Comparison Report (With Baseline from Same Files)
# ============================================================================

def generate_expansion_comparison_report(
    provider: str,
    model: str,
    chunk_size: int,
    k: int,
    mode: str = None,  # None = both modes, or specify "global"/"singledoc"
    output_dir: str = OUTPUT_DIR,
    save_to_file: bool = True
):
    """
    Generate a comprehensive comparison report for all query expansion methods.
    
    Each expansion result file contains BOTH baseline (original query) and 
    expanded query metrics. This function compares them to show improvements.
    
    Args:
        provider: Embedding provider (e.g., "ollama", "voyage")
        model: Embedding model (e.g., "nomic-embed-text", "voyage-finance-2")
        chunk_size: Chunk size (e.g., 512, 1024)
        k: Number of documents retrieved (e.g., 20)
        mode: Retrieval mode - None (both), "global", or "singledoc"
        output_dir: Directory containing result files
        save_to_file: Whether to save report to text file
    
    Returns:
        Dictionary with all comparison data
    """
    
    print(f"\n{'='*80}")
    print(f"SCANNING FOR QUERY EXPANSION RESULTS")
    print(f"{'='*80}")
    print(f"  Provider: {provider}")
    print(f"  Model: {model}")
    print(f"  Chunk Size: {chunk_size}")
    print(f"  K: {k}")
    print(f"  Mode: {mode if mode else 'both (global + singledoc)'}")
    print(f"  Directory: {output_dir}\n")
    
    # ========================================
    # STEP 1: Define expected expansion configs
    # ========================================
    expansion_configs = [
        ("hyde", "basic"),
        ("hyde", "detailed"),
        ("hyde", "financial_terminology"),
        ("query_refinement", "clarification"),
        ("query_refinement", "formal"),
        ("query_refinement", "keyword_focused"),
        ("term_expansion", "abbreviation_synonym"),
        ("term_expansion", "context_addition"),
        ("chain_of_thought", "step_by_step"),
        ("chain_of_thought", "explicit_context"),
        ("domain_adaptation", "accounting_perspective"),
        ("domain_adaptation", "10k_language"),
    ]
    
    # ========================================
    # STEP 2: Determine which modes to check
    # ========================================
    if mode is None:
        modes_to_check = ["global", "singledoc"]
    else:
        modes_to_check = [mode]
    
    # ========================================
    # STEP 3: Scan for files and load data
    # ========================================
    all_expansion_data = {}
    
    for check_mode in modes_to_check:
        mode_data = {}
        
        for exp_type, exp_subtype in expansion_configs:
            # Construct expected filename
            filename = f"{exp_type}_{exp_subtype}_{provider}_{model}_chunk{chunk_size}_k{k}_{check_mode}.json"
            filepath = os.path.join(output_dir, filename)
            
            if os.path.exists(filepath):
                try:
                    with open(filepath, 'r') as f:
                        results = json.load(f)
                    
                    summary = results[0]['summary']
                    
                    mode_data[f"{exp_type}_{exp_subtype}"] = {
                        'filename': filename,
                        'summary': summary,
                        'type': exp_type,
                        'subtype': exp_subtype,
                        'mode': check_mode
                    }
                    
                    print(f"  ✓ Found: {filename}")
                    
                except Exception as e:
                    print(f"  ⚠️  Error loading {filename}: {e}")
            else:
                print(f"  ⏭️  Missing: {filename}")
        
        all_expansion_data[check_mode] = mode_data
    
    # Check if we found any data
    total_found = sum(len(mode_data) for mode_data in all_expansion_data.values())
    
    if total_found == 0:
        print(f"\n❌ No expansion result files found!")
        print(f"\nExpected filename format:")
        print(f"  {{expansion_type}}_{{expansion_subtype}}_{provider}_{model}_chunk{chunk_size}_k{k}_{{mode}}.json")
        print(f"\nExample:")
        print(f"  hyde_basic_{provider}_{model}_chunk{chunk_size}_k{k}_global.json")
        return None
    
    print(f"\n✓ Found {total_found} expansion result file(s)")
    
    # ========================================
    # STEP 4: Generate report for each mode
    # ========================================
    all_reports = []
    
    for check_mode in modes_to_check:
        mode_data = all_expansion_data[check_mode]
        
        if not mode_data:
            print(f"\n⏭️  No data for mode: {check_mode}")
            continue
        
        # Build report for this mode
        report_lines = []
        
        report_lines.append("\n" + "="*80)
        report_lines.append(f"QUERY EXPANSION COMPARISON REPORT - MODE: {check_mode.upper()}")
        report_lines.append("="*80)
        
        report_lines.append(f"\nConfiguration:")
        report_lines.append(f"  Provider: {provider}")
        report_lines.append(f"  Model: {model}")
        report_lines.append(f"  Chunk Size: {chunk_size}")
        report_lines.append(f"  K (retrieved): {k}")
        report_lines.append(f"  Mode: {check_mode}")
        report_lines.append(f"  Expansion methods found: {len(mode_data)}")
        
        # ========================================
        # Extract baseline and expansion metrics
        # ========================================
        # Get baseline from first method (they all have same baseline)
        first_summary = list(mode_data.values())[0]['summary']
        
        baseline_metrics = {
            'page_mrr': first_summary['average_page_mrr'],
            'page_recall': first_summary['average_page_recall'],
            'page_precision': first_summary['average_page_precision'],
            'page_f1': first_summary['average_page_f1'],
            'text_mrr': first_summary['average_text_mrr'],
            'text_recall': first_summary['average_text_recall'],
            'text_precision': first_summary['average_text_precision'],
            'text_f1': first_summary['average_text_f1'],
        }
        
        # ========================================
        # Show baseline performance
        # ========================================
        report_lines.append("\n" + "="*80)
        report_lines.append("BASELINE PERFORMANCE (Original Queries - No Expansion)")
        report_lines.append("="*80)
        
        report_lines.append("\nPage-Based Metrics:")
        report_lines.append(f"  MRR:       {baseline_metrics['page_mrr']:.4f}")
        report_lines.append(f"  Recall:    {baseline_metrics['page_recall']:.4f}")
        report_lines.append(f"  Precision: {baseline_metrics['page_precision']:.4f}")
        report_lines.append(f"  F1:        {baseline_metrics['page_f1']:.4f}")
        
        report_lines.append("\nText-Based Metrics:")
        report_lines.append(f"  MRR:       {baseline_metrics['text_mrr']:.4f}")
        report_lines.append(f"  Recall:    {baseline_metrics['text_recall']:.4f}")
        report_lines.append(f"  Precision: {baseline_metrics['text_precision']:.4f}")
        report_lines.append(f"  F1:        {baseline_metrics['text_f1']:.4f}")
        
        # ========================================
        # Extract expansion metrics with improvements
        # ========================================
        metrics_data = {}
        
        for exp_key, exp_info in mode_data.items():
            summary = exp_info['summary']
            
            # Extract expanded query metrics
            expanded_metrics = {
                'page_mrr': summary['average_expanded_page_mrr'],
                'page_recall': summary['average_expanded_page_recall'],
                'page_precision': summary['average_expanded_page_precision'],
                'page_f1': summary['average_expanded_page_f1'],
                'text_mrr': summary['average_expanded_text_mrr'],
                'text_recall': summary['average_expanded_text_recall'],
                'text_precision': summary['average_expanded_text_precision'],
                'text_f1': summary['average_expanded_text_f1'],
            }
            
            # Calculate improvements (delta and percentage)
            improvements = {
                'page_mrr': expanded_metrics['page_mrr'] - baseline_metrics['page_mrr'],
                'page_recall': expanded_metrics['page_recall'] - baseline_metrics['page_recall'],
                'page_precision': expanded_metrics['page_precision'] - baseline_metrics['page_precision'],
                'page_f1': expanded_metrics['page_f1'] - baseline_metrics['page_f1'],
                'text_mrr': expanded_metrics['text_mrr'] - baseline_metrics['text_mrr'],
                'text_recall': expanded_metrics['text_recall'] - baseline_metrics['text_recall'],
                'text_precision': expanded_metrics['text_precision'] - baseline_metrics['text_precision'],
                'text_f1': expanded_metrics['text_f1'] - baseline_metrics['text_f1'],
            }
            
            improvement_percentages = {
                'page_mrr': (improvements['page_mrr'] / baseline_metrics['page_mrr'] * 100) if baseline_metrics['page_mrr'] > 0 else 0,
                'page_recall': (improvements['page_recall'] / baseline_metrics['page_recall'] * 100) if baseline_metrics['page_recall'] > 0 else 0,
                'text_mrr': (improvements['text_mrr'] / baseline_metrics['text_mrr'] * 100) if baseline_metrics['text_mrr'] > 0 else 0,
                'text_recall': (improvements['text_recall'] / baseline_metrics['text_recall'] * 100) if baseline_metrics['text_recall'] > 0 else 0,
            }
            
            metrics_data[exp_key] = {
                'type': exp_info['type'],
                'subtype': exp_info['subtype'],
                'filename': exp_info['filename'],
                'metrics': expanded_metrics,
                'improvements': improvements,
                'improvement_percentages': improvement_percentages
            }
        
        # ========================================
        # Detailed Results
        # ========================================
        report_lines.append("\n" + "="*80)
        report_lines.append("EXPANSION METHODS - DETAILED RESULTS")
        report_lines.append("="*80)
        
        # Sort by page MRR improvement (descending)
        sorted_by_page_mrr_imp = sorted(
            metrics_data.items(),
            key=lambda x: x[1]['improvements']['page_mrr'],
            reverse=True
        )
        
        for exp_key, data in sorted_by_page_mrr_imp:
            report_lines.append("\n" + "-"*80)
            report_lines.append(f"Method: {data['type']} / {data['subtype']}")
            report_lines.append("-"*80)
            
            report_lines.append("\nPage-Based Metrics:")
            report_lines.append(f"  MRR:       {data['metrics']['page_mrr']:.4f} (Δ {data['improvements']['page_mrr']:+.4f}, {data['improvement_percentages']['page_mrr']:+.2f}%)")
            report_lines.append(f"  Recall:    {data['metrics']['page_recall']:.4f} (Δ {data['improvements']['page_recall']:+.4f}, {data['improvement_percentages']['page_recall']:+.2f}%)")
            report_lines.append(f"  Precision: {data['metrics']['page_precision']:.4f} (Δ {data['improvements']['page_precision']:+.4f})")
            report_lines.append(f"  F1:        {data['metrics']['page_f1']:.4f} (Δ {data['improvements']['page_f1']:+.4f})")
            
            report_lines.append("\nText-Based Metrics:")
            report_lines.append(f"  MRR:       {data['metrics']['text_mrr']:.4f} (Δ {data['improvements']['text_mrr']:+.4f}, {data['improvement_percentages']['text_mrr']:+.2f}%)")
            report_lines.append(f"  Recall:    {data['metrics']['text_recall']:.4f} (Δ {data['improvements']['text_recall']:+.4f}, {data['improvement_percentages']['text_recall']:+.2f}%)")
            report_lines.append(f"  Precision: {data['metrics']['text_precision']:.4f} (Δ {data['improvements']['text_precision']:+.4f})")
            report_lines.append(f"  F1:        {data['metrics']['text_f1']:.4f} (Δ {data['improvements']['text_f1']:+.4f})")
        
        # ========================================
        # Rankings by Improvement
        # ========================================
        report_lines.append("\n" + "="*80)
        report_lines.append("RANKING BY PAGE MRR IMPROVEMENT")
        report_lines.append("="*80)
        report_lines.append("\nRank | Method                                    | Page MRR | Δ MRR    | %")
        report_lines.append("-"*80)
        
        for rank, (exp_key, data) in enumerate(sorted_by_page_mrr_imp, 1):
            method_name = f"{data['type']}/{data['subtype']}"
            report_lines.append(
                f"{rank:4d} | {method_name:42s} | {data['metrics']['page_mrr']:.4f}   | "
                f"{data['improvements']['page_mrr']:+.4f}   | {data['improvement_percentages']['page_mrr']:+.2f}%"
            )
        
        # Sort by text MRR improvement
        sorted_by_text_mrr_imp = sorted(
            metrics_data.items(),
            key=lambda x: x[1]['improvements']['text_mrr'],
            reverse=True
        )
        
        report_lines.append("\n" + "="*80)
        report_lines.append("RANKING BY TEXT MRR IMPROVEMENT")
        report_lines.append("="*80)
        report_lines.append("\nRank | Method                                    | Text MRR | Δ MRR    | %")
        report_lines.append("-"*80)
        
        for rank, (exp_key, data) in enumerate(sorted_by_text_mrr_imp, 1):
            method_name = f"{data['type']}/{data['subtype']}"
            report_lines.append(
                f"{rank:4d} | {method_name:42s} | {data['metrics']['text_mrr']:.4f}   | "
                f"{data['improvements']['text_mrr']:+.4f}   | {data['improvement_percentages']['text_mrr']:+.2f}%"
            )
        
        # ========================================
        # Category Analysis
        # ========================================
        report_lines.append("\n" + "="*80)
        report_lines.append("ANALYSIS BY EXPANSION CATEGORY")
        report_lines.append("="*80)
        
        # Group by category
        categories = {}
        for exp_key, data in metrics_data.items():
            cat = data['type']
            if cat not in categories:
                categories[cat] = []
            categories[cat].append(data)
        
        for category in sorted(categories.keys()):
            methods = categories[category]
            avg_page_mrr_imp = sum(m['improvements']['page_mrr'] for m in methods) / len(methods)
            avg_text_mrr_imp = sum(m['improvements']['text_mrr'] for m in methods) / len(methods)
            
            report_lines.append(f"\nCategory: {category.upper().replace('_', ' ')}")
            report_lines.append(f"  Number of variants: {len(methods)}")
            report_lines.append(f"  Avg Page MRR improvement: {avg_page_mrr_imp:+.4f}")
            report_lines.append(f"  Avg Text MRR improvement: {avg_text_mrr_imp:+.4f}")
            report_lines.append(f"  Variants:")
            for method in methods:
                report_lines.append(
                    f"    - {method['subtype']:30s} "
                    f"Page: {method['improvements']['page_mrr']:+.4f}, Text: {method['improvements']['text_mrr']:+.4f}"
                )
        
        # ========================================
        # Best and Worst Performers
        # ========================================
        report_lines.append("\n" + "="*80)
        report_lines.append("BEST AND WORST PERFORMERS")
        report_lines.append("="*80)
        
        best_page = sorted_by_page_mrr_imp[0]
        worst_page = sorted_by_page_mrr_imp[-1]
        best_text = sorted_by_text_mrr_imp[0]
        worst_text = sorted_by_text_mrr_imp[-1]
        
        report_lines.append(f"\nBest Page MRR Improvement:")
        report_lines.append(f"  Method: {best_page[1]['type']}/{best_page[1]['subtype']}")
        report_lines.append(f"  MRR: {best_page[1]['metrics']['page_mrr']:.4f} (Δ {best_page[1]['improvements']['page_mrr']:+.4f}, {best_page[1]['improvement_percentages']['page_mrr']:+.2f}%)")
        
        report_lines.append(f"\nWorst Page MRR Improvement:")
        report_lines.append(f"  Method: {worst_page[1]['type']}/{worst_page[1]['subtype']}")
        report_lines.append(f"  MRR: {worst_page[1]['metrics']['page_mrr']:.4f} (Δ {worst_page[1]['improvements']['page_mrr']:+.4f}, {worst_page[1]['improvement_percentages']['page_mrr']:+.2f}%)")
        
        report_lines.append(f"\nBest Text MRR Improvement:")
        report_lines.append(f"  Method: {best_text[1]['type']}/{best_text[1]['subtype']}")
        report_lines.append(f"  MRR: {best_text[1]['metrics']['text_mrr']:.4f} (Δ {best_text[1]['improvements']['text_mrr']:+.4f}, {best_text[1]['improvement_percentages']['text_mrr']:+.2f}%)")
        
        report_lines.append(f"\nWorst Text MRR Improvement:")
        report_lines.append(f"  Method: {worst_text[1]['type']}/{worst_text[1]['subtype']}")
        report_lines.append(f"  MRR: {worst_text[1]['metrics']['text_mrr']:.4f} (Δ {worst_text[1]['improvements']['text_mrr']:+.4f}, {worst_text[1]['improvement_percentages']['text_mrr']:+.2f}%)")
        
        # ========================================
        # Key Insights
        # ========================================
        report_lines.append("\n" + "="*80)
        report_lines.append("KEY INSIGHTS")
        report_lines.append("="*80)
        
        positive_page = sum(1 for _, data in metrics_data.items() if data['improvements']['page_mrr'] > 0)
        positive_text = sum(1 for _, data in metrics_data.items() if data['improvements']['text_mrr'] > 0)
        total = len(metrics_data)
        
        report_lines.append(f"\n• {positive_page}/{total} ({positive_page/total*100:.1f}%) methods improved Page MRR over baseline")
        report_lines.append(f"• {positive_text}/{total} ({positive_text/total*100:.1f}%) methods improved Text MRR over baseline")
        
        avg_page_improvement = sum(data['improvements']['page_mrr'] for data in metrics_data.values()) / total
        avg_text_improvement = sum(data['improvements']['text_mrr'] for data in metrics_data.values()) / total
        
        report_lines.append(f"• Average Page MRR change: {avg_page_improvement:+.4f} ({avg_page_improvement/baseline_metrics['page_mrr']*100:+.2f}%)")
        report_lines.append(f"• Average Text MRR change: {avg_text_improvement:+.4f} ({avg_text_improvement/baseline_metrics['text_mrr']*100:+.2f}%)")
        
        report_lines.append("\n" + "="*80)
        report_lines.append(f"END OF REPORT - MODE: {check_mode.upper()}")
        report_lines.append("="*80)
        
        # Store this mode's report
        all_reports.append({
            'mode': check_mode,
            'report_text': "\n".join(report_lines),
            'baseline_metrics': baseline_metrics,
            'metrics_data': metrics_data
        })
    
    # ========================================
    # Print and save all reports
    # ========================================
    full_report = "\n\n".join([r['report_text'] for r in all_reports])
    print(full_report)
    
    if save_to_file:
        mode_str = mode if mode else "both"
        report_filename = f"expansion_report_{provider}_{model}_chunk{chunk_size}_k{k}_{mode_str}.txt"
        report_path = os.path.join(output_dir, report_filename)
        
        with open(report_path, 'w') as f:
            f.write(full_report)
        
        print(f"\n✓ Report saved to: {report_filename}")
    
    return {
        'configuration': {
            'provider': provider,
            'model': model,
            'chunk_size': chunk_size,
            'k': k,
            'mode': mode
        },
        'reports': all_reports
    }


# ============================================================================
# Shortcut function
# ============================================================================

def quick_expansion_report(
    provider: str = "ollama",
    model: str = "nomic-embed-text",
    chunk_size: int = 512,
    k: int = 20,
    mode: str = None,  # None = both modes
    output_dir: str = OUTPUT_DIR
):
    """
    Quick shortcut to generate expansion comparison report.
    
    Example usage:
        quick_expansion_report()  # Both global and singledoc
        quick_expansion_report(mode="global")  # Only global
        quick_expansion_report(chunk_size=1024)  # Override chunk size
    """
    return generate_expansion_comparison_report(
        provider=provider,
        model=model,
        chunk_size=chunk_size,
        k=k,
        mode=mode,
        output_dir=output_dir,
        save_to_file=True
    )


print("✓ Query expansion comparison report functions defined (v3 - with baseline deltas)")
print("\nUsage:")
print("  quick_expansion_report(provider='ollama', model='nomic-embed-text', chunk_size=512)")

✓ Query expansion comparison report functions defined (v3 - with baseline deltas)

Usage:
  quick_expansion_report(provider='ollama', model='nomic-embed-text', chunk_size=512)


In [20]:
# Generate report for ollama, embed-text, chunk 512, global mode
# quick_expansion_report(provider="ollama", model="nomic-embed-text", chunk_size=512, k=20, mode="global")
quick_expansion_report(provider="voyage", model="voyage-finance-2", chunk_size=512, k=20)


SCANNING FOR QUERY EXPANSION RESULTS
  Provider: voyage
  Model: voyage-finance-2
  Chunk Size: 512
  K: 20
  Mode: both (global + singledoc)
  Directory: ../../evaluation_results/query_enhancement

  ✓ Found: hyde_basic_voyage_voyage-finance-2_chunk512_k20_global.json
  ✓ Found: hyde_detailed_voyage_voyage-finance-2_chunk512_k20_global.json
  ✓ Found: hyde_financial_terminology_voyage_voyage-finance-2_chunk512_k20_global.json
  ✓ Found: query_refinement_clarification_voyage_voyage-finance-2_chunk512_k20_global.json
  ✓ Found: query_refinement_formal_voyage_voyage-finance-2_chunk512_k20_global.json
  ✓ Found: query_refinement_keyword_focused_voyage_voyage-finance-2_chunk512_k20_global.json
  ✓ Found: term_expansion_abbreviation_synonym_voyage_voyage-finance-2_chunk512_k20_global.json
  ✓ Found: term_expansion_context_addition_voyage_voyage-finance-2_chunk512_k20_global.json
  ✓ Found: chain_of_thought_step_by_step_voyage_voyage-finance-2_chunk512_k20_global.json
  ✓ Found: chain_of_th

{'configuration': {'provider': 'voyage',
  'model': 'voyage-finance-2',
  'chunk_size': 512,
  'k': 20,
  'mode': None},
 'reports': [{'mode': 'global',
   'baseline_metrics': {'page_mrr': 0.40164551392879566,
    'page_recall': 0.7388888888888888,
    'page_precision': 0.05766666666666667,
    'page_f1': 0.10576886881234708,
    'text_mrr': 0.2857671580798516,
    'text_recall': 0.5533333333333333,
    'text_precision': 0.053,
    'text_f1': 0.09356616330529373},
   'metrics_data': {'hyde_basic': {'type': 'hyde',
     'subtype': 'basic',
     'filename': 'hyde_basic_voyage_voyage-finance-2_chunk512_k20_global.json',
     'metrics': {'page_mrr': 0.3714884923011858,
      'page_recall': 0.74,
      'page_precision': 0.052000000000000005,
      'page_f1': 0.09598280946107032,
      'text_mrr': 0.2565452568873621,
      'text_recall': 0.5366666666666666,
      'text_precision': 0.046,
      'text_f1': 0.08267380499264557},
     'improvements': {'page_mrr': -0.03015702162760986,
      'pag