In [None]:
# ============================================================================
# Text-Based Evaluation Notebook - FinanceBench RAG
# Evaluating Retrieval with Page-Based AND Text-Based Metrics
# ============================================================================

# %% [markdown]
# # FinanceBench Text-Based Evaluation
# 
# This notebook evaluates RAG retrieval performance using BOTH:
# 1. **Page-based metrics**: MRR, Recall, Precision, F1 (based on page number matching)
# 2. **Text-based metrics**: MRR, Recall, Precision, F1 (based on semantic similarity)
# 
# We use Sentence-BERT (all-MiniLM-L6-v2) to compute cosine similarity between
# retrieved chunks and ground truth evidence text.

# %% [markdown]
# ## 1.1 Standard Imports

# %%
import os
import json
from pathlib import Path
from typing import List, Dict, Tuple, Optional
from collections import defaultdict

# Environment
from dotenv import load_dotenv

# Progress tracking
from tqdm.auto import tqdm

# Data handling
from datasets import load_dataset

# Numerical operations
import numpy as np

# Vector stores and embeddings
from langchain.vectorstores import Chroma
from langchain_ollama import OllamaEmbeddings
from langchain_openai import OpenAIEmbeddings
from langchain_voyageai import VoyageAIEmbeddings

print("✓ Standard imports successful")

# %% [markdown]
# ## 1.2 Text Similarity Imports (NEW)

# %%
# Sentence-BERT for semantic similarity
from sentence_transformers import SentenceTransformer

# Cosine similarity calculation
from sklearn.metrics.pairwise import cosine_similarity

print("✓ Text similarity imports successful")

# %% [markdown]
# ## 1.3 Load Environment Variables

# %%
# Merge variables from a local .env file (if present) into the process environment
load_dotenv()

# Credentials / endpoints: a missing key stays None, Ollama falls back to localhost
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
OLLAMA_BASE_URL = os.environ.get("OLLAMA_BASE_URL", "http://localhost:11434")
VOYAGE_API_KEY = os.environ.get("VOYAGE_API_KEY")

# Report which hosted-provider keys are available; absence is only a warning
print(
    "✓ OpenAI API key loaded"
    if OPENAI_API_KEY
    else "⚠ OpenAI API key not found (only needed if using OpenAI embeddings)"
)

print(
    "✓ VoyageAI API key loaded"
    if VOYAGE_API_KEY
    else "⚠ VoyageAI API key not found (only needed if using VoyageAI embeddings)"
)

print(f"✓ Ollama URL: {OLLAMA_BASE_URL}")

# %% [markdown]
# ## 1.4 Configuration Variables

# %%
# Input (pre-built vector stores) and output (evaluation results) locations,
# relative to the notebook's working directory
VECTOR_DB_BASE_DIR = "../../vector_databases"
OUTPUT_DIR = "../../evaluation_results/text_based_evaluation"
os.makedirs(OUTPUT_DIR, exist_ok=True)  # no error if the folder already exists

# HuggingFace dataset id and split
DATASET_NAME = "PatronusAI/financebench"
DATASET_SPLIT = "train"

# Collection names are this prefix followed by the chunk size
COLLECTION_PREFIX = "financebench_docs_chunk_"

# ----------------------------------------------------------------------------
# Text-based evaluation parameters
# ----------------------------------------------------------------------------

# sentence-transformers model used for semantic similarity
SBERT_MODEL_NAME = "all-MiniLM-L6-v2"

# A chunk counts as a text match when its cosine similarity is >= this value
TEXT_SIMILARITY_THRESHOLD = 0.7

# Stored chunk previews look like "first N chars...last N chars"
CHUNK_TEXT_PREFIX_CHARS = 100  # N for the leading part
CHUNK_TEXT_SUFFIX_CHARS = 100  # N for the trailing part

# ----------------------------------------------------------------------------

print(
    "✓ Configuration set\n"
    f"  Vector DB Directory: {VECTOR_DB_BASE_DIR}\n"
    f"  Output Directory: {OUTPUT_DIR}\n"
    f"\n  Sentence-BERT Model: {SBERT_MODEL_NAME}\n"
    f"  Text Similarity Threshold: {TEXT_SIMILARITY_THRESHOLD}\n"
    f"  Chunk Text Preview: First {CHUNK_TEXT_PREFIX_CHARS} + Last {CHUNK_TEXT_SUFFIX_CHARS} chars"
)

# %% [markdown]
# ## 1.5 Load FinanceBench Dataset

# %%
print("\nLoading FinanceBench dataset...")
dataset = load_dataset(DATASET_NAME, split=DATASET_SPLIT)
print(f"✓ Loaded {len(dataset)} queries")

# Peek at the first record to confirm the fields used later in the notebook
print("\nSample query:")
sample = dataset[0]
print(f"  ID: {sample['financebench_id']}")
print(f"  Company: {sample['company']}")
print(f"  Question: {sample['question'][:100]}...")
print(f"  Doc: {sample['doc_name']}")
print(f"  Evidence items: {len(sample['evidence'])}")

# Only describe the evidence schema when the sample actually has evidence
if sample['evidence']:
    evidence_item = sample['evidence'][0]
    print("\n  First evidence item structure:")
    print(f"    - doc_name: {evidence_item['doc_name']}")
    print(f"    - evidence_page_num: {evidence_item['evidence_page_num']}")
    print(f"    - evidence_text (first 100 chars): {evidence_item['evidence_text'][:100]}...")
    print(f"    - Has 'evidence_text_full_page': {'evidence_text_full_page' in evidence_item}")

# %%
# Step 1 recap, emitted as a single write
print("\n".join([
    "\n" + "="*60,
    "✓ STEP 1 COMPLETE!",
    "="*60,
    "  ✓ All imports loaded",
    "  ✓ Environment variables configured",
    "  ✓ Paths set up",
    f"  ✓ Dataset loaded: {len(dataset)} queries",
    f"  ✓ Text similarity threshold: {TEXT_SIMILARITY_THRESHOLD}",
    f"  ✓ Chunk preview length: {CHUNK_TEXT_PREFIX_CHARS} + {CHUNK_TEXT_SUFFIX_CHARS} chars",
    "="*60,
]))

In [None]:
# ============================================================================
# Step 2: Load Sentence-BERT Model
# ============================================================================

# %% [markdown]
# ## 2.1 Load Sentence-BERT Model
# 
# We load the `all-MiniLM-L6-v2` model once at the start.
# This model will be used to:
# 1. Encode evidence texts (done once and cached)
# 2. Encode retrieved chunk texts (done for each retrieval)
# 3. Calculate cosine similarity between them

# %%
def load_sentence_bert_model(model_name: str = SBERT_MODEL_NAME):
    """
    Instantiate a sentence-transformers model and print its basic properties.

    Args:
        model_name: Identifier handed straight to ``SentenceTransformer``
            (defaults to the notebook-level SBERT_MODEL_NAME).

    Returns:
        The constructed SentenceTransformer instance.

    Raises:
        Exception: Anything raised while constructing the model or reading its
            properties. The error is printed first, then re-raised unchanged.

    Notes:
        - Device placement is left to the library; nothing here pins CPU or GPU.
        - The progress line warns that the first run downloads the weights;
          presumably later runs reuse a local cache (not verified here).
    """
    print(f"\nLoading Sentence-BERT model: {model_name}")
    print("  (First run will download model from HuggingFace...)")

    try:
        sbert = SentenceTransformer(model_name)
        print("✓ Model loaded successfully")
        print(f"  Embedding dimension: {sbert.get_sentence_embedding_dimension()}")
        print(f"  Max sequence length: {sbert.max_seq_length}")
    except Exception as load_error:
        # Make the failure visible in the cell output, then let it propagate
        print(f"✗ Failed to load model: {load_error}")
        raise

    return sbert

# %%
# Single shared model instance, reused by every later step
sbert_model = load_sentence_bert_model(model_name=SBERT_MODEL_NAME)

# %% [markdown]
# ## 2.2 Test the Model
# 
# Let's verify the model works correctly by encoding sample texts

# %%
def test_sentence_bert_model(model):
    """
    Test Sentence-BERT model with sample texts.
    Verifies encoding and similarity calculation work correctly.
    """
    print("\n" + "="*60)
    print("TESTING SENTENCE-BERT MODEL")
    print("="*60)
    
    # Sample texts
    text1 = "The company's revenue increased by 15% in Q4 2023."
    text2 = "Revenue grew 15 percent in the fourth quarter of 2023."
    text3 = "The weather was sunny and pleasant today."
    
    print("\nTest texts:")
    print(f"  Text 1: {text1}")
    print(f"  Text 2: {text2}")
    print(f"  Text 3: {text3}")
    
    # Encode texts
    print("\nEncoding texts...")
    embeddings = model.encode([text1, text2, text3])
    
    print(f"✓ Generated embeddings shape: {embeddings.shape}")
    print(f"  (3 texts × {embeddings.shape[1]} dimensions)")
    
    # Calculate similarities
    print("\nCalculating cosine similarities:")
    
    # Similarity between text1 and text2 (semantically similar)
    sim_1_2 = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
    print(f"  Text 1 ↔ Text 2: {sim_1_2:.4f} (should be HIGH - same meaning)")
    
    # Similarity between text1 and text3 (semantically different)
    sim_1_3 = cosine_similarity([embeddings[0]], [embeddings[2]])[0][0]
    print(f"  Text 1 ↔ Text 3: {sim_1_3:.4f} (should be LOW - different topics)")
    
    # Similarity between text2 and text3 (semantically different)
    sim_2_3 = cosine_similarity([embeddings[1]], [embeddings[2]])[0][0]
    print(f"  Text 2 ↔ Text 3: {sim_2_3:.4f} (should be LOW - different topics)")
    
    # Verify results make sense
    print("\nValidation:")
    if sim_1_2 > 0.7:
        print(f"  ✓ Similar texts have high similarity ({sim_1_2:.4f} > 0.7)")
    else:
        print(f"  ⚠ Similar texts have lower similarity than expected ({sim_1_2:.4f})")
    
    if sim_1_3 < 0.5:
        print(f"  ✓ Different texts have low similarity ({sim_1_3:.4f} < 0.5)")
    else:
        print(f"  ⚠ Different texts have higher similarity than expected ({sim_1_3:.4f})")
    
    print("\n" + "="*60)
    print("✓ MODEL TEST COMPLETE")
    print("="*60)
    
    return True

# %%
# Smoke-test the freshly loaded model
test_result = test_sentence_bert_model(model=sbert_model)

# %% [markdown]
# ## 2.3 Test with Actual FinanceBench Evidence
# 
# Let's test with real evidence text from the dataset

# %%
def test_with_real_evidence(model, dataset):
    """
    Test model with actual FinanceBench evidence text.
    This helps verify the model works well with financial domain text.
    """
    print("\n" + "="*60)
    print("TESTING WITH REAL FINANCEBENCH EVIDENCE")
    print("="*60)
    
    # Get first query with evidence
    sample = dataset[0]
    evidence_text = sample['evidence'][0]['evidence_text']
    
    print(f"\nQuery: {sample['question'][:100]}...")
    print(f"\nEvidence text (first 200 chars):")
    print(f"  {evidence_text[:200]}...")
    
    # Create some test chunks
    # Chunk 1: Exact match (should have very high similarity)
    chunk1 = evidence_text
    
    # Chunk 2: Paraphrased version (should have high similarity)
    chunk2 = "Capital expenditures totaled $1,577 million in fiscal year 2018."
    
    # Chunk 3: Different financial topic (should have lower similarity)
    chunk3 = "The company reported strong earnings growth driven by increased sales."
    
    print("\nTest chunks:")
    print(f"  Chunk 1: Exact match - {chunk1[:80]}...")
    print(f"  Chunk 2: Paraphrased - {chunk2}")
    print(f"  Chunk 3: Different topic - {chunk3}")
    
    # Encode
    evidence_embedding = model.encode([evidence_text])
    chunk_embeddings = model.encode([chunk1, chunk2, chunk3])
    
    # Calculate similarities
    print("\nSimilarities with evidence:")
    for i, chunk_emb in enumerate(chunk_embeddings):
        sim = cosine_similarity(evidence_embedding, [chunk_emb])[0][0]
        match_status = "✓ MATCH" if sim >= TEXT_SIMILARITY_THRESHOLD else "✗ NO MATCH"
        print(f"  Chunk {i+1}: {sim:.4f} {match_status}")
    
    print("\n" + "="*60)
    print("✓ REAL EVIDENCE TEST COMPLETE")
    print("="*60)
    
    return True

# %%
# Check the model against an actual evidence passage from the dataset
real_evidence_test = test_with_real_evidence(model=sbert_model, dataset=dataset)

# %%
# Step 2 recap, emitted as a single write
print("\n".join([
    "\n" + "="*60,
    "✓ STEP 2 COMPLETE!",
    "="*60,
    "  ✓ Sentence-BERT model loaded",
    f"  ✓ Model: {SBERT_MODEL_NAME}",
    f"  ✓ Embedding dimension: {sbert_model.get_sentence_embedding_dimension()}",
    "  ✓ Model tested with sample texts",
    "  ✓ Model tested with real FinanceBench evidence",
    "  ✓ Ready for evidence embedding pre-computation",
    "="*60,
]))

In [None]:
# ============================================================================
# Step 3: Pre-compute Evidence Embeddings
# ============================================================================

# %% [markdown]
# ## 3.1 Extract All Evidence Texts
# 
# We need to:
# 1. Extract every evidence text from the dataset (one entry per evidence item; duplicates are kept)
# 2. Create a mapping structure for quick lookup by query ID
# 3. Pre-compute the embeddings once, instead of re-encoding the evidence each time a query is evaluated

# %%
def extract_all_evidence_from_dataset(dataset):
    """
    Flatten every query's evidence list into one list of evidence records.

    Each returned record holds:
    - query_id: 'financebench_id' of the owning query
    - evidence_index: position inside that query's evidence list
    - doc_name: source document
    - page_number: 'evidence_page_num' + 1 (the dataset value is treated as
      0-indexed and shifted to 1-indexed here)
    - evidence_text: the evidence passage

    Args:
        dataset: Sized iterable of records exposing 'financebench_id' and
            'evidence' (a list of dicts with 'doc_name', 'evidence_page_num'
            and 'evidence_text').

    Returns:
        Tuple (all_evidence, evidence_texts). The lists are parallel:
        evidence_texts[i] is all_evidence[i]['evidence_text'], so embeddings
        computed from evidence_texts can be mapped back by position.
        Both lists are empty for an empty dataset.
    """
    print("\n" + "="*60)
    print("EXTRACTING EVIDENCE FROM DATASET")
    print("="*60)

    all_evidence = []
    evidence_texts = []
    # Evidence count per record, gathered in the same pass so the dataset is
    # only iterated once.
    evidence_counts = []

    print(f"\nProcessing {len(dataset)} queries...")

    for record in tqdm(dataset, desc="Extracting evidence"):
        query_id = record['financebench_id']
        evidence_list = record['evidence']
        evidence_counts.append(len(evidence_list))

        for evidence_idx, evidence_item in enumerate(evidence_list):
            all_evidence.append({
                'query_id': query_id,
                'evidence_index': evidence_idx,
                'doc_name': evidence_item['doc_name'],
                'page_number': evidence_item['evidence_page_num'] + 1,  # Convert to 1-indexed
                'evidence_text': evidence_item['evidence_text']
            })
            evidence_texts.append(evidence_item['evidence_text'])

    print(f"\n✓ Extracted {len(all_evidence)} evidence items")
    print(f"  From {len(dataset)} queries")

    # Statistics are undefined for an empty dataset (division by zero, min()
    # of nothing), so report that and return the empty lists.
    if not evidence_counts:
        print("  ⚠ Dataset is empty - no evidence statistics to report")
        return all_evidence, evidence_texts

    print(f"  Average evidence per query: {len(all_evidence)/len(evidence_counts):.2f}")

    print(f"\nEvidence distribution:")
    print(f"  Min evidence per query: {min(evidence_counts)}")
    print(f"  Max evidence per query: {max(evidence_counts)}")
    # True median (mean of the two middle values for an even number of queries)
    print(f"  Median evidence per query: {float(np.median(evidence_counts)):g}")

    return all_evidence, evidence_texts

# %%
# Flatten the dataset's evidence into parallel lists (records, raw texts)
all_evidence, evidence_texts = extract_all_evidence_from_dataset(dataset)

# Preview up to three of the extracted records
print("\nSample evidence items:")
for i, ev in enumerate(all_evidence[:3]):
    print(f"\n  Evidence {i+1}:")
    print(f"    Query ID: {ev['query_id']}")
    print(f"    Doc: {ev['doc_name']}, Page: {ev['page_number']}")
    print(f"    Text (first 100 chars): {ev['evidence_text'][:100]}...")

# %% [markdown]
# ## 3.2 Pre-compute Evidence Embeddings
# 
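# This is a critical optimization when the evaluation is repeated (e.g. for several chunk sizes or embedding providers):
# - Without pre-computation: every evaluation run re-encodes all evidence (150 queries × avg ~1.5 evidence items each)
# - With pre-computation: each evidence text is encoded once and its embedding is reused in every run
# - The saving grows with the number of runs (the ~98% estimate is a rough figure, not measured in this notebook)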

# %%
def compute_evidence_embeddings(
    evidence_texts: List[str],
    model: SentenceTransformer,
    batch_size: int = 32
) -> np.ndarray:
    """
    Pre-compute embeddings for all evidence texts.

    Args:
        evidence_texts: List of evidence text strings
        model: Sentence-BERT model (must expose ``encode`` and
            ``get_sentence_embedding_dimension``)
        batch_size: Number of texts passed to the encoder at once

    Returns:
        numpy array of shape (n_evidence, embedding_dim). For an empty
        ``evidence_texts`` this is an empty (0, embedding_dim) array and the
        encoder is not called, so callers can always index ``.shape[1]``.

    Notes:
        - Encoding is delegated to ``model.encode`` with its own progress bar
        - Device placement is left to the model; nothing here selects CPU or GPU
    """
    print("\n" + "="*60)
    print("COMPUTING EVIDENCE EMBEDDINGS")
    print("="*60)

    embedding_dim = model.get_sentence_embedding_dimension()

    print(f"\nEncoding {len(evidence_texts)} evidence texts...")
    print(f"  Batch size: {batch_size}")
    print(f"  Embedding dimension: {embedding_dim}")

    # Nothing to encode: return a correctly shaped empty matrix instead of
    # relying on how the encoder represents "no rows".
    if len(evidence_texts) == 0:
        print("\n⚠ No evidence texts supplied - returning an empty embedding matrix")
        return np.empty((0, embedding_dim or 0), dtype=np.float32)

    # show_progress_bar=True makes the encoder display its own progress bar
    embeddings = model.encode(
        evidence_texts,
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_numpy=True
    )

    print(f"\n✓ Embeddings computed")
    print(f"  Shape: {embeddings.shape}")
    print(f"  Memory: {embeddings.nbytes / 1024 / 1024:.2f} MB")

    return embeddings

# %%
# Encode every evidence text once; rows line up with `all_evidence`
evidence_embeddings = compute_evidence_embeddings(evidence_texts, sbert_model, batch_size=32)

# %% [markdown]
# ## 3.3 Create Evidence Lookup Structure
# 
# Create a convenient structure to look up evidence by query_id

# %%
def create_evidence_lookup(all_evidence: List[Dict], evidence_embeddings: np.ndarray) -> Dict:
    """
    Create a lookup dictionary mapping query_id to evidence items with embeddings.

    Structure:
    {
        'query_id_1': [
            {
                'query_id': 'query_id_1',
                'evidence_index': 0,
                'doc_name': 'DOC_NAME',
                'page_number': 60,
                'evidence_text': 'text...',
                'embedding': numpy array
            },
            ...
        ],
        ...
    }

    Args:
        all_evidence: Evidence records, each with at least a 'query_id' key.
        evidence_embeddings: One embedding row per record, in the same order
            as ``all_evidence``.

    Returns:
        Plain dict; evidence_lookup[query_id] is the list of that query's
        evidence records, each a shallow copy of the input record with an
        added 'embedding' entry. Empty dict for empty input.

    Raises:
        ValueError: If the number of embedding rows differs from the number
            of evidence records (the two would be misaligned).
    """
    print("\n" + "="*60)
    print("CREATING EVIDENCE LOOKUP STRUCTURE")
    print("="*60)

    # Rows are paired with records purely by position, so a length mismatch
    # would attach the wrong embedding to an evidence item.
    if len(all_evidence) != len(evidence_embeddings):
        raise ValueError(
            f"Evidence/embedding count mismatch: {len(all_evidence)} evidence items "
            f"vs {len(evidence_embeddings)} embeddings"
        )

    lookup = defaultdict(list)

    print(f"\nBuilding lookup for {len(all_evidence)} evidence items...")

    for evidence_item, embedding in zip(all_evidence, evidence_embeddings):
        # Copy so the caller's records are not mutated
        evidence_with_embedding = evidence_item.copy()
        evidence_with_embedding['embedding'] = embedding

        lookup[evidence_item['query_id']].append(evidence_with_embedding)

    print(f"✓ Lookup created for {len(lookup)} queries")

    # Spot-check the first entry (skipped when there is nothing to inspect)
    if lookup:
        sample_query_id = next(iter(lookup))
        print(f"\nVerification - Sample query: {sample_query_id}")
        print(f"  Evidence items: {len(lookup[sample_query_id])}")
        print(f"  First evidence embedding shape: {lookup[sample_query_id][0]['embedding'].shape}")

    # Return a plain dict so a missing query_id raises KeyError instead of
    # silently creating an empty list.
    return dict(lookup)

# %%
# query_id -> list of that query's evidence records (with embeddings attached)
evidence_lookup = create_evidence_lookup(
    all_evidence=all_evidence,
    evidence_embeddings=evidence_embeddings,
)

# %% [markdown]
# ## 3.4 Test Evidence Lookup
# 
# Verify we can retrieve evidence for any query

# %%
def test_evidence_lookup(dataset, evidence_lookup):
    """
    Test that evidence lookup works correctly.
    """
    print("\n" + "="*60)
    print("TESTING EVIDENCE LOOKUP")
    print("="*60)
    
    # Test with first query
    sample_record = dataset[0]
    query_id = sample_record['financebench_id']
    
    print(f"\nTest query: {query_id}")
    print(f"  Question: {sample_record['question'][:100]}...")
    
    # Retrieve from lookup
    evidence_items = evidence_lookup.get(query_id, [])
    
    print(f"\n✓ Retrieved {len(evidence_items)} evidence items")
    
    for i, ev in enumerate(evidence_items):
        print(f"\n  Evidence {i+1}:")
        print(f"    Doc: {ev['doc_name']}, Page: {ev['page_number']}")
        print(f"    Text (first 80 chars): {ev['evidence_text'][:80]}...")
        print(f"    Embedding shape: {ev['embedding'].shape}")
        print(f"    Embedding sample (first 5 dims): {ev['embedding'][:5]}")
    
    # Verify count matches original
    original_evidence_count = len(sample_record['evidence'])
    retrieved_evidence_count = len(evidence_items)
    
    if original_evidence_count == retrieved_evidence_count:
        print(f"\n✓ Count matches: {original_evidence_count} evidence items")
    else:
        print(f"\n✗ Count mismatch: {original_evidence_count} vs {retrieved_evidence_count}")
    
    print("\n" + "="*60)
    print("✓ LOOKUP TEST COMPLETE")
    print("="*60)
    
    return True

# %%
# Spot-check the lookup with the first query
test_evidence_lookup(dataset=dataset, evidence_lookup=evidence_lookup)

# %%
# Step 3 recap, emitted as a single write
print("\n".join([
    "\n" + "="*60,
    "✓ STEP 3 COMPLETE!",
    "="*60,
    f"  ✓ Extracted {len(all_evidence)} evidence items from {len(dataset)} queries",
    f"  ✓ Computed {evidence_embeddings.shape[0]} embeddings",
    f"  ✓ Embedding dimension: {evidence_embeddings.shape[1]}",
    f"  ✓ Memory used: {evidence_embeddings.nbytes / 1024 / 1024:.2f} MB",
    f"  ✓ Evidence lookup created for {len(evidence_lookup)} queries",
    "  ✓ Ready for evaluation with pre-computed embeddings",
    "="*60,
]))

In [None]:
# ============================================================================
# Step 4: Helper Functions - Metadata Extraction and Vector Store Loading
# ============================================================================

# %% [markdown]
# ## 4.1 Metadata Extraction Functions
# 
# These functions extract document name and page number from retrieved chunks

# %%
def extract_doc_name_from_path(file_path: str) -> str:
    """
    Reduce a document path to its bare file name without the final extension.

    Example:
        "../../documents/3M_2018_10K.pdf" → "3M_2018_10K"

    Args:
        file_path: Path to the document (relative or absolute)

    Returns:
        The final path component with its last suffix removed;
        an empty string for an empty path.
    """
    document_path = Path(file_path)
    return document_path.stem


def extract_metadata_from_retrieved_doc(doc) -> Dict:
    """
    Flatten a retrieved LangChain document into the fields the evaluation uses.

    Reads two keys from ``doc.metadata``:
        - 'file_path': path to the source PDF, reduced to the file stem
        - 'source': page number (an int, or a string/float holding one)

    Args:
        doc: LangChain Document object from vectorstore.similarity_search()

    Returns:
        Dict with:
            - doc_name: Document name (e.g., "3M_2018_10K"); "" when the
              path is missing
            - page_number: Page number, always an int; 0 when the value is
              missing or cannot be parsed
            - chunk_text: The chunk content (``doc.page_content``)

    Notes:
        The page number is passed through exactly as stored; no index-base
        conversion happens here.
        NOTE(review): evidence pages are shifted to 1-indexed in
        extract_all_evidence_from_dataset and check_page_match documents
        1-indexed input. If the stored 'source' value is 0-indexed, the caller
        must add 1 before page matching - confirm against the retrieval code.
    """
    # Tolerate documents stored without any metadata
    metadata = doc.metadata or {}

    # Extract file path and convert to doc name ("" when absent or None)
    file_path = metadata.get('file_path') or ''
    doc_name = extract_doc_name_from_path(file_path)

    # Coerce the page to int whatever type it was stored as (int, numeric
    # string, float). None, non-numeric strings, NaN and infinity fall back
    # to 0, matching the original fallback for unparseable strings.
    raw_page = metadata.get('source', 0)
    try:
        page_num = int(raw_page)
    except (TypeError, ValueError, OverflowError):
        page_num = 0

    return {
        'doc_name': doc_name,
        'page_number': page_num,  # Same index base as stored in the vector DB
        'chunk_text': doc.page_content
    }

print("✓ Metadata extraction functions defined")

# %% [markdown]
# ## 4.2 Embedding Function Factory
# 
# Creates the appropriate embedding function based on provider

# %%
def get_embedding_function(provider: str, model: str):
    """
    Build the LangChain embeddings object for a provider/model pair.

    The connection settings come from the notebook-level constants
    OLLAMA_BASE_URL, OPENAI_API_KEY and VOYAGE_API_KEY.

    Args:
        provider: "ollama", "openai", or "voyage" (exact, case-sensitive)
        model: Model name (e.g., "nomic-embed-text", "text-embedding-3-small")

    Returns:
        OllamaEmbeddings, OpenAIEmbeddings or VoyageAIEmbeddings instance,
        depending on ``provider``

    Raises:
        ValueError: If provider is not one of the three supported names
    """
    if provider == "ollama":
        return OllamaEmbeddings(model=model, base_url=OLLAMA_BASE_URL)

    if provider == "openai":
        return OpenAIEmbeddings(model=model, openai_api_key=OPENAI_API_KEY)

    if provider == "voyage":
        return VoyageAIEmbeddings(model=model, voyage_api_key=VOYAGE_API_KEY)

    # Anything else is a configuration error
    raise ValueError(f"Unknown provider: {provider}")

print("✓ Embedding function factory defined")

# %% [markdown]
# ## 4.3 Vector Store Loading
# 
# Load pre-built vector databases from disk

# %%
def load_vectorstore(
    provider: str,
    model: str,
    chunk_size: int,
    base_dir: str = VECTOR_DB_BASE_DIR,
    collection_prefix: str = COLLECTION_PREFIX
) -> Chroma:
    """
    Open a pre-built Chroma vector store from disk.

    Layout this function expects (built by a separate notebook):

        persist directory : {base_dir}/{provider}_{model}
                            (any '/' in the model name is replaced by '_')
        collection name   : {collection_prefix}{chunk_size}

    Args:
        provider: "ollama", "openai", or "voyage"
        model: Model name
        chunk_size: Chunk size (256, 512, 1024, 2048, 4096)
        base_dir: Base directory for vector databases
        collection_prefix: Prefix for collection names

    Returns:
        Chroma vectorstore bound to the collection for this chunk size

    Raises:
        ValueError: If provider is unknown (from get_embedding_function)
        FileNotFoundError: If the persist directory does not exist

    Example:
        vectorstore = load_vectorstore("voyage", "voyage-finance-2", 1024)
    """
    # Construct paths
    model_id = f"{provider}_{model.replace('/', '_')}"
    db_path = os.path.join(base_dir, model_id)
    collection_name = f"{collection_prefix}{chunk_size}"

    # Get embedding function (raises ValueError for an unknown provider)
    emb_fn = get_embedding_function(provider, model)

    # Fail fast on a wrong path. Opening a persistent Chroma store on a
    # missing directory creates a new, empty store, which would make every
    # retrieval return nothing and silently score zero.
    if not os.path.isdir(db_path):
        raise FileNotFoundError(
            f"Vector database directory not found: {db_path} "
            f"(expected collection '{collection_name}' inside it)"
        )

    # Load vectorstore
    vectorstore = Chroma(
        collection_name=collection_name,
        embedding_function=emb_fn,
        persist_directory=db_path
    )

    return vectorstore

print("✓ Vector store loading function defined")

# %% [markdown]
# ## 4.4 Test Vector Store Loading
# 
# Verify we can load a vector store and retrieve documents

# %%
def test_vectorstore_loading():
    """
    Test loading a vector store and performing a sample retrieval.
    """
    print("\n" + "="*60)
    print("TESTING VECTOR STORE LOADING")
    print("="*60)
    
    # Test with a common configuration
    test_provider = "voyage"
    test_model = "voyage-finance-2"
    test_chunk_size = 1024
    
    print(f"\nTest configuration:")
    print(f"  Provider: {test_provider}")
    print(f"  Model: {test_model}")
    print(f"  Chunk size: {test_chunk_size}")
    
    try:
        # Load vectorstore
        print("\nLoading vectorstore...")
        vectorstore = load_vectorstore(test_provider, test_model, test_chunk_size)
        
        # Check collection
        doc_count = vectorstore._collection.count()
        print(f"✓ Vectorstore loaded")
        print(f"  Documents in collection: {doc_count:,}")
        
        # Test retrieval
        print("\nTesting retrieval...")
        test_query = "What was the revenue in 2018?"
        results = vectorstore.similarity_search(test_query, k=3)
        
        print(f"✓ Retrieved {len(results)} documents")
        
        # Show sample result
        print("\nSample retrieved document:")
        sample_doc = results[0]
        metadata = extract_metadata_from_retrieved_doc(sample_doc)
        
        print(f"  Doc name: {metadata['doc_name']}")
        print(f"  Page number: {metadata['page_number']}")
        print(f"  Chunk text (first 150 chars): {metadata['chunk_text'][:150]}...")
        print(f"  Chunk text length: {len(metadata['chunk_text'])} characters")
        
        print("\n" + "="*60)
        print("✓ VECTOR STORE TEST COMPLETE")
        print("="*60)
        
        return True
        
    except Exception as e:
        print(f"\n✗ Error during test: {e}")
        print("\nPossible issues:")
        print("  1. Vector database doesn't exist for this configuration")
        print("  2. Path is incorrect")
        print("  3. ChromaDB version mismatch")
        print(f"\nExpected path: {VECTOR_DB_BASE_DIR}/{test_provider}_{test_model}/")
        return False

# %%
# Run test (returns False instead of raising when the store cannot be used)
test_result = test_vectorstore_loading()

# %%
print("\n" + "="*60)
print("✓ STEP 4 COMPLETE!")
print("="*60)
print("  ✓ Metadata extraction functions defined")
print("  ✓ Embedding function factory defined")
print("  ✓ Vector store loading function defined")
# Report the real outcome of the test instead of always claiming success
if test_result:
    print("  ✓ Vector store loading tested successfully")
    print("  ✓ Ready to perform retrievals with chunk text extraction")
else:
    print("  ⚠ Vector store loading test FAILED - see the error output above")
    print("  ⚠ Fix the vector store configuration before running retrievals")
print("="*60)

In [None]:
# ============================================================================
# Step 5: Helper Functions - Page-Based Evaluation
# ============================================================================

# %% [markdown]
# ## 5.1 Page-Based Matching Function
# 
# This function checks if a retrieved chunk matches evidence based on page numbers

# %%
def check_page_match(
    retrieved_doc: Dict, 
    evidence_list: List[Dict],
    chunk_size: int = 512,
    use_page_tolerance: bool = True
) -> bool:
    """
    Decide whether a retrieved chunk hits any evidence item by PAGE NUMBER.

    A hit requires the same document name and an evidence page inside the
    window [retrieved_page, retrieved_page + tolerance]. The window only
    extends forward: a chunk that starts on an earlier page may run into the
    evidence page, but a chunk that starts after the evidence page never
    matches.

    Tolerance (pages) when use_page_tolerance=True:
    - chunk_size <= 512:  0 (exact page only)
    - chunk_size <= 1024: 1
    - chunk_size > 1024:  2

    With use_page_tolerance=False the tolerance is always 0.

    Args:
        retrieved_doc: Dict with 'doc_name' and 'page_number'
        evidence_list: List of evidence dicts with 'doc_name' and 'page_number'
        chunk_size: Chunk size used to pick the tolerance
        use_page_tolerance: If False, only an exact page match counts

    Returns:
        True if at least one evidence item matches, False otherwise
        (including for an empty evidence list)

    Notes:
        Both page numbers must use the same index base. Evidence from
        evidence_lookup is 1-indexed.
        NOTE(review): extract_metadata_from_retrieved_doc returns the page as
        stored in the vector DB - confirm the caller aligns it to 1-indexed.

    Example:
        Evidence on page 50, chunk_size=1024 (tolerance 1):
        - Retrieved page 49: MATCH (evidence within the forward window)
        - Retrieved page 50: MATCH (exact)
        - Retrieved page 51: NO MATCH (chunk starts after the evidence)
    """
    # Pick the forward page window for this chunk size
    if not use_page_tolerance or chunk_size <= 512:
        page_tolerance = 0
    elif chunk_size <= 1024:
        page_tolerance = 1
    else:
        page_tolerance = 2

    retrieved_doc_name = retrieved_doc['doc_name']
    retrieved_page = retrieved_doc['page_number']

    # (document, page) for each evidence item, read lazily in list order
    evidence_locations = (
        (evidence['doc_name'], evidence['page_number'])
        for evidence in evidence_list
    )

    return any(
        evidence_doc_name == retrieved_doc_name
        and retrieved_page <= evidence_page <= retrieved_page + page_tolerance
        for evidence_doc_name, evidence_page in evidence_locations
    )

print("✓ Page-based matching function defined")

# %% [markdown]
# ## 5.2 Page-Based MRR Calculation
# 
# Calculate Mean Reciprocal Rank based on page matching

# %%
def calculate_page_mrr_for_query(
    retrieved_docs: List[Dict], 
    evidence_list: List[Dict],
    chunk_size: int = 512,
    use_page_tolerance: bool = True
) -> Tuple[float, int]:
    """
    Reciprocal rank of the first page-level hit for one query.

    Walks the retrieved documents in ranked order and stops at the first one
    that check_page_match accepts against the evidence.

    Args:
        retrieved_docs: Ranked list of retrieved docs with 'doc_name', 'page_number'
        evidence_list: Evidence items for the query (from evidence_lookup)
        chunk_size: Chunk size, forwarded to check_page_match
        use_page_tolerance: Tolerance switch, forwarded to check_page_match

    Returns:
        Tuple (mrr_score, rank):
        - mrr_score: 1/rank of the first hit, 0.0 when nothing matches
        - rank: 1-indexed position of the first hit, -1 when nothing matches

    Example:
        First hit at position 3 → (0.333..., 3)
        No hit                  → (0.0, -1)
    """
    first_hit_rank = next(
        (
            position
            for position, candidate in enumerate(retrieved_docs, start=1)
            if check_page_match(candidate, evidence_list, chunk_size, use_page_tolerance)
        ),
        None,
    )

    if first_hit_rank is None:
        # Nothing in the ranking matched any evidence
        return 0.0, -1

    return 1.0 / first_hit_rank, first_hit_rank

print("✓ Page-based MRR calculation defined")

# %% [markdown]
# ## 5.3 Page-Based Recall, Precision, and F1
# 
# NEW: Calculate precision, recall, and F1 based on page matching

# %%
def calculate_page_metrics_for_query(
    retrieved_docs: List[Dict],
    evidence_list: List[Dict],
    chunk_size: int = 512,
    use_page_tolerance: bool = True
) -> Tuple[float, float, float]:
    """
    Compute PAGE-BASED recall, precision and F1 for one query.

    Definitions:
        recall    = (# evidence items matched by at least one chunk) / (# evidence items)
        precision = (# chunks matching at least one evidence item) / (# retrieved chunks)
        f1        = 2 × (precision × recall) / (precision + recall), or 0.0 when both are 0

    Args:
        retrieved_docs: Retrieved docs (each with 'doc_name', 'page_number')
        evidence_list: Evidence items from evidence_lookup
        chunk_size: Forwarded to check_page_match for the tolerance calculation
        use_page_tolerance: Forwarded to check_page_match

    Returns:
        Tuple of (recall, precision, f1). All three are 0.0 when either input
        list is empty.

    Example:
        2 evidence items (pages 50, 75), 20 retrieved chunks, of which
        2 match page 50 and 1 matches page 75:
            recall    = 2/2  = 1.0
            precision = 3/20 = 0.15
            f1        = 2 × (1.0 × 0.15) / (1.0 + 0.15) = 0.26
    """
    # Nothing to score against / nothing retrieved
    if not evidence_list or not retrieved_docs:
        return 0.0, 0.0, 0.0

    matched_evidence = set()   # indices into evidence_list hit by any chunk
    hit_chunks = 0             # chunks that hit at least one evidence item

    for candidate in retrieved_docs:
        # Each evidence item is checked on its own so we know exactly which ones matched.
        hits = [
            idx
            for idx, item in enumerate(evidence_list)
            if check_page_match(candidate, [item], chunk_size, use_page_tolerance)
        ]
        if hits:
            matched_evidence.update(hits)
            hit_chunks += 1

    recall = len(matched_evidence) / len(evidence_list)
    precision = hit_chunks / len(retrieved_docs)

    total = precision + recall
    f1 = 2 * (precision * recall) / total if total > 0 else 0.0

    return recall, precision, f1

print("✓ Page-based metrics (recall, precision, F1) defined")

# %% [markdown]
# ## 5.4 Test Page-Based Evaluation Functions
# 
# Verify all page-based metrics work correctly

# %%
def test_page_based_evaluation():
    """
    Test page-based evaluation functions with sample data.

    Builds two evidence items (pages 50 and 75 of 'TEST_DOC') and five
    retrieved documents, runs calculate_page_mrr_for_query and
    calculate_page_metrics_for_query on them with chunk_size=1024, prints the
    results next to the expected values, and verifies them.

    Returns:
        True only if both the MRR check and the recall/precision/F1 check
        pass; False otherwise. (Previously this always returned True, so a
        failed verification could not be detected by the caller.)
    """
    print("\n" + "="*60)
    print("TESTING PAGE-BASED EVALUATION")
    print("="*60)
    
    # Create sample evidence (using 1-indexed pages)
    evidence_list = [
        {'doc_name': 'TEST_DOC', 'page_number': 50},
        {'doc_name': 'TEST_DOC', 'page_number': 75}
    ]
    
    # Create sample retrieved documents
    retrieved_docs = [
        {'doc_name': 'OTHER_DOC', 'page_number': 10},  # No match - wrong doc
        {'doc_name': 'TEST_DOC', 'page_number': 50},   # MATCH - exact evidence page 50
        {'doc_name': 'TEST_DOC', 'page_number': 49},   # MATCH - within tolerance of page 50
        {'doc_name': 'TEST_DOC', 'page_number': 30},   # No match - not near evidence
        {'doc_name': 'TEST_DOC', 'page_number': 75},   # MATCH - exact evidence page 75
    ]
    
    chunk_size = 1024  # tolerance = 1
    
    print("\nTest setup:")
    print("  Evidence pages: [50, 75]")
    print("  Retrieved pages: [10 (OTHER_DOC), 50, 49, 30, 75]")
    print(f"  Chunk size: {chunk_size} (tolerance = 1)")
    
    # Test MRR
    print("\n--- Page-Based MRR ---")
    mrr_score, rank = calculate_page_mrr_for_query(
        retrieved_docs, evidence_list, chunk_size, use_page_tolerance=True
    )
    print(f"  First match at rank: {rank}")
    print(f"  MRR score: {mrr_score:.4f}")
    print("  Expected: rank=2 (second doc matches page 50), MRR=0.5000")
    
    # Test Recall, Precision, F1
    print("\n--- Page-Based Recall, Precision, F1 ---")
    recall, precision, f1 = calculate_page_metrics_for_query(
        retrieved_docs, evidence_list, chunk_size, use_page_tolerance=True
    )
    print(f"  Recall: {recall:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  F1: {f1:.4f}")
    
    print("\n  Expected calculations:")
    print("    Evidence found: {page 50, page 75} = 2/2 evidence items")
    print("    Chunks matching: 3 chunks (pages 50, 49, 75) matched evidence")
    print("    Recall = 2/2 = 1.0000")
    print("    Precision = 3/5 = 0.6000")
    print("    F1 = 2 × (1.0 × 0.6) / (1.0 + 0.6) = 0.7500")
    
    # Verify results and remember the outcome so it can be returned
    print("\n--- Verification ---")
    mrr_ok = rank == 2 and abs(mrr_score - 0.5) < 0.001
    if mrr_ok:
        print("  ✓ MRR calculation correct")
    else:
        print("  ✗ MRR calculation incorrect")
    
    metrics_ok = (
        abs(recall - 1.0) < 0.001
        and abs(precision - 0.6) < 0.001
        and abs(f1 - 0.75) < 0.001
    )
    if metrics_ok:
        print("  ✓ Recall, Precision, F1 calculations correct")
    else:
        print("  ✗ Metrics calculation incorrect")
    
    print("\n" + "="*60)
    print("✓ PAGE-BASED EVALUATION TEST COMPLETE")
    print("="*60)
    
    # Report the real outcome instead of an unconditional True
    return mrr_ok and metrics_ok

# %%
# Run test
test_page_based = test_page_based_evaluation()

# %%
print("\n" + "="*60)
print("✓ STEP 5 COMPLETE!")
print("="*60)
print("  ✓ Page-based matching function defined")
print("  ✓ Page-based MRR calculation defined")
print("  ✓ Page-based Recall, Precision, F1 calculation defined")
print("  ✓ All page-based functions tested successfully")
print("  ✓ Ready to implement text-based evaluation")
print("="*60)

In [None]:
# ============================================================================
# Step 6: Helper Functions - Text-Based Evaluation
# ============================================================================

# %% [markdown]
# ## 6.1 Chunk Text Preview Formatting
# 
# Format chunk text as "first N chars...last N chars" for JSON storage

# %%
def format_chunk_text_preview(
    text: str,
    prefix_chars: int = CHUNK_TEXT_PREFIX_CHARS,
    suffix_chars: int = CHUNK_TEXT_SUFFIX_CHARS
) -> str:
    """
    Format chunk text as abbreviated preview for JSON storage.
    
    Format: "first N characters...last N characters"
    
    This keeps JSON files manageable while providing enough context
    to manually verify matches.
    
    Args:
        text: Full chunk text
        prefix_chars: Number of characters from start (negative values are treated as 0)
        suffix_chars: Number of characters from end (negative values are treated as 0)
        
    Returns:
        The full text when len(text) <= prefix_chars + suffix_chars,
        otherwise "<prefix>...<suffix>".
        
    Examples:
        Short text (<= prefix + suffix): Returns full text
        Long text: "Capital expenditures were $1,577...in fiscal year 2018."
        suffix_chars=0: "<prefix>..." (no trailing text)
    """
    # Negative counts have no sensible meaning for a preview; treat them as 0.
    prefix_chars = max(prefix_chars, 0)
    suffix_chars = max(suffix_chars, 0)
    
    if len(text) <= prefix_chars + suffix_chars:
        # Text is short enough, return as-is
        return text
    
    prefix = text[:prefix_chars]
    # Use an explicit start index: text[-0:] would return the WHOLE text
    # when suffix_chars == 0, duplicating the content instead of trimming it.
    suffix = text[len(text) - suffix_chars:]
    
    # Format with ellipsis
    return f"{prefix}...{suffix}"

print("✓ Chunk text preview formatting defined")

# %% [markdown]
# ## 6.2 Cosine Similarity Calculation
# 
# Calculate cosine similarity between chunk and evidence embeddings

# %%
def compute_cosine_similarity(
    chunk_embedding: np.ndarray,
    evidence_embedding: np.ndarray
) -> float:
    """
    Cosine similarity between one chunk embedding and one evidence embedding.

    Rough interpretation of the score (range -1 to 1):
    - 1.0: Identical/very similar
    - 0.7-0.9: Strong similarity
    - 0.5-0.7: Moderate similarity
    - 0.0-0.5: Weak/no similarity
    - Negative: Opposite meaning (rare in practice)

    Args:
        chunk_embedding: Embedding vector of the retrieved chunk
        evidence_embedding: Embedding vector of the evidence text

    Returns:
        The similarity as a plain Python float

    Note:
        sklearn's cosine_similarity works on 2D inputs, so each vector is
        reshaped into a single-row matrix before the call.
    """
    pairwise = cosine_similarity(
        chunk_embedding.reshape(1, -1),
        evidence_embedding.reshape(1, -1),
    )

    # One row vs. one row -> a 1x1 matrix; unwrap its only entry.
    return float(pairwise[0][0])

print("✓ Cosine similarity calculation defined")

# %% [markdown]
# ## 6.3 Calculate Text Similarities for Retrieved Chunk
# 
# For each retrieved chunk, calculate similarity with ALL evidence items

# %%
def calculate_text_similarities_for_chunk(
    chunk_text: str,
    evidence_items: List[Dict],
    sbert_model: SentenceTransformer
) -> List[Dict]:
    """
    Score one retrieved chunk against every evidence item.

    Args:
        chunk_text: Text content of the retrieved chunk
        evidence_items: Evidence items; each must provide 'embedding',
            'doc_name' and 'page_number'
        sbert_model: Sentence-BERT model used to encode the chunk

    Returns:
        One dict per evidence item, in input order:
        [
            {
                'evidence_index': 0,
                'evidence_doc': 'DOC_NAME',
                'evidence_page': 60,
                'cosine_similarity': 0.7823
            },
            ...
        ]

    Note:
        The chunk is encoded once; the evidence embeddings are read from
        evidence_items rather than recomputed.
    """
    chunk_vector = sbert_model.encode(chunk_text, convert_to_numpy=True)

    def _score_against(position: int, item: Dict) -> Dict:
        # Build the result record for a single evidence item.
        score = compute_cosine_similarity(chunk_vector, item['embedding'])
        return {
            'evidence_index': position,
            'evidence_doc': item['doc_name'],
            'evidence_page': item['page_number'],
            'cosine_similarity': score
        }

    return [
        _score_against(position, item)
        for position, item in enumerate(evidence_items)
    ]

print("✓ Text similarities calculation for chunk defined")

# %% [markdown]
# ## 6.4 Text-Based Metrics Calculation
# 
# Calculate text-based MRR, Recall, Precision, and F1

# %%
def calculate_text_metrics_for_query(
    retrieved_docs: List[Dict],
    evidence_items: List[Dict],
    sbert_model: SentenceTransformer,
    threshold: float = TEXT_SIMILARITY_THRESHOLD
) -> Tuple[float, int, float, float, float, List[List[Dict]]]:
    """
    Compute the TEXT-BASED metrics for one query.

    Every retrieved chunk with non-empty 'chunk_text' is scored against all
    evidence items. A chunk counts as a match when its best similarity is
    >= threshold; an evidence item counts as found when any chunk that matched
    scores >= threshold against it.

    Metrics:
    - Text MRR: 1 / rank of the first matching chunk
    - Text Recall: # evidence found / # total evidence
    - Text Precision: # matching chunks / # total retrieved chunks
    - Text F1: Harmonic mean of precision and recall

    Args:
        retrieved_docs: Retrieved docs in rank order, each with 'chunk_text'
        evidence_items: Evidence items with 'embedding'
        sbert_model: Sentence-BERT model for encoding chunks
        threshold: Similarity threshold for a match
            (defaults to TEXT_SIMILARITY_THRESHOLD)

    Returns:
        Tuple of (text_mrr, text_rank, text_recall, text_precision, text_f1, all_similarities)
        - text_mrr: MRR score (0.0 if no match)
        - text_rank: Rank of first match (-1 if no match)
        - text_recall: Proportion of evidence found
        - text_precision: Proportion of chunks matching
        - text_f1: F1 score
        - all_similarities: One list of similarity dicts per retrieved chunk
          (an empty list for chunks without text). When either input list is
          empty the whole value is [].

    Example:
        2 evidence items, 20 retrieved chunks; chunk 5 is the first match
        (0.82 vs evidence[0]) and chunk 12 matches evidence[1] (0.75):
            text_mrr = 1/5 = 0.2, text_rank = 5
            text_recall = 2/2 = 1.0
            text_precision = 2/20 = 0.1
            text_f1 = 2 × (1.0 × 0.1) / (1.0 + 0.1) = 0.18
    """
    if not evidence_items or not retrieved_docs:
        return 0.0, -1, 0.0, 0.0, 0.0, []

    per_chunk_scores = []      # kept for JSON storage, one entry per chunk
    first_hit_rank = -1        # -1 until the first matching chunk is seen
    matched_evidence = set()   # evidence indices that were matched
    hit_count = 0              # chunks that matched at least one evidence item

    for position, doc in enumerate(retrieved_docs, start=1):
        body = doc.get('chunk_text', '')

        if not body:
            # Keep positional alignment with retrieved_docs even without text
            per_chunk_scores.append([])
            continue

        scores = calculate_text_similarities_for_chunk(body, evidence_items, sbert_model)
        per_chunk_scores.append(scores)

        best = max(entry['cosine_similarity'] for entry in scores)
        if not (best >= threshold):
            continue

        hit_count += 1
        matched_evidence.update(
            idx
            for idx, entry in enumerate(scores)
            if entry['cosine_similarity'] >= threshold
        )

        # Only the first matching chunk determines the reciprocal rank
        if first_hit_rank == -1:
            first_hit_rank = position

    text_mrr = 1.0 / first_hit_rank if first_hit_rank != -1 else 0.0
    text_recall = len(matched_evidence) / len(evidence_items)
    text_precision = hit_count / len(retrieved_docs)

    combined = text_precision + text_recall
    text_f1 = 2 * (text_precision * text_recall) / combined if combined > 0 else 0.0

    return text_mrr, first_hit_rank, text_recall, text_precision, text_f1, per_chunk_scores

print("✓ Text-based metrics calculation defined")

# %% [markdown]
# ## 6.5 Test Text-Based Evaluation Functions
# 
# Verify text-based metrics work correctly with sample data

# %%
def test_text_based_evaluation():
    """
    Exercise the text-based metrics on the first FinanceBench record.

    Four synthetic "retrieved" chunks are scored against the record's evidence:
    the exact evidence text, a paraphrase, a different financial topic and an
    unrelated sentence. Metrics and per-chunk similarities are printed for
    manual inspection; nothing is asserted.

    Relies on the notebook-level names dataset, evidence_lookup, sbert_model
    and TEXT_SIMILARITY_THRESHOLD.

    Returns:
        True (always).
    """
    banner = "=" * 60

    print("\n" + banner)
    print("TESTING TEXT-BASED EVALUATION")
    print(banner)

    # Sample query
    sample_record = dataset[0]
    query_id = sample_record['financebench_id']

    print(f"\nTest query: {query_id}")
    print(f"  Question: {sample_record['question'][:100]}...")

    # Evidence belonging to that query
    evidence_items = evidence_lookup[query_id]
    print(f"\n  Evidence items: {len(evidence_items)}")
    for number, item in enumerate(evidence_items, start=1):
        print(f"    {number}. {item['doc_name']}, page {item['page_number']}")
        print(f"       Text (first 80 chars): {item['evidence_text'][:80]}...")

    # (description, text) pairs, ordered from most to least similar to the evidence
    labelled_chunks = [
        ("Exact evidence text", evidence_items[0]['evidence_text']),
        (
            "Paraphrased financial content",
            "The company's capital spending was approximately $1.6 billion for the fiscal year.",
        ),
        (
            "Different financial topic",
            "Revenue increased by 8% year-over-year driven by strong product sales.",
        ),
        (
            "Unrelated content",
            "The weather forecast predicts sunny skies for the weekend.",
        ),
    ]
    retrieved_docs = [{'chunk_text': body} for _, body in labelled_chunks]

    print("\n  Retrieved chunks: 4")
    for number, (label, _) in enumerate(labelled_chunks, start=1):
        print(f"    {number}. {label}")

    # Run the metrics
    print(f"\n  Calculating similarities with threshold={TEXT_SIMILARITY_THRESHOLD}...")

    metrics = calculate_text_metrics_for_query(
        retrieved_docs,
        evidence_items,
        sbert_model,
        threshold=TEXT_SIMILARITY_THRESHOLD
    )
    text_mrr, text_rank, text_recall, text_precision, text_f1, all_similarities = metrics

    # Aggregate results
    print("\n--- Text-Based Metrics ---")
    print(f"  Text MRR: {text_mrr:.4f}")
    print(f"  Text Rank: {text_rank}")
    print(f"  Text Recall: {text_recall:.4f}")
    print(f"  Text Precision: {text_precision:.4f}")
    print(f"  Text F1: {text_f1:.4f}")

    # Per-chunk breakdown
    print("\n--- Chunk Similarities ---")
    for number, (chunk, scores) in enumerate(zip(retrieved_docs, all_similarities), start=1):
        print(f"\n  Chunk {number}:")
        print(f"    Text (first 60 chars): {chunk['chunk_text'][:60]}...")
        for entry in scores:
            if entry['cosine_similarity'] >= TEXT_SIMILARITY_THRESHOLD:
                match_status = "✓ MATCH"
            else:
                match_status = "✗ NO MATCH"
            print(f"    Evidence {entry['evidence_index']}: {entry['cosine_similarity']:.4f} {match_status}")

    # What a reader should expect to see above
    print("\n--- Expected Behavior ---")
    for expectation in (
        "  Chunk 1 (exact evidence): Should have similarity ~0.99, MATCH",
        "  Chunk 2 (paraphrased): Should have similarity ~0.7-0.8, likely MATCH",
        "  Chunk 3 (different topic): Should have similarity ~0.3-0.5, NO MATCH",
        "  Chunk 4 (unrelated): Should have similarity ~0.1-0.2, NO MATCH",
    ):
        print(expectation)

    print("\n" + banner)
    print("✓ TEXT-BASED EVALUATION TEST COMPLETE")
    print(banner)

    return True

# %%
# Run test
test_text_based = test_text_based_evaluation()

# %%
print("\n" + "="*60)
print("✓ STEP 6 COMPLETE!")
print("="*60)
print("  ✓ Chunk text preview formatting defined")
print("  ✓ Cosine similarity calculation defined")
print("  ✓ Text similarities for chunks defined")
print("  ✓ Text-based MRR, Recall, Precision, F1 calculation defined")
print("  ✓ All text-based functions tested with real data")
print(f"  ✓ Similarity threshold: {TEXT_SIMILARITY_THRESHOLD}")
print("  ✓ Ready for retrieval functions")
print("="*60)

In [None]:
# ============================================================================
# Step 7: Retrieval Functions
# ============================================================================

# %% [markdown]
# ## 7.1 Global Retrieval
# 
# Retrieve documents from the entire corpus (all documents)

# %%
def retrieve_global(
    vectorstore: Chroma,
    query: str,
    k: int
) -> List[Dict]:
    """
    Retrieve the top-k chunks for a query from the whole corpus.

    No document filter is applied, so this mode also tests whether the
    retriever can pick the right document among all indexed documents.

    Args:
        vectorstore: Loaded ChromaDB vectorstore
        query: Query text
        k: Number of documents to retrieve

    Returns:
        One metadata dict per hit, as produced by
        extract_metadata_from_retrieved_doc, with a 1-indexed 'rank' added:
        [
            {
                'doc_name': 'DOC_NAME',
                'page_number': 60,
                'rank': 1,
                'chunk_text': 'Full chunk text...'
            },
            ...
        ]

    Note:
        Ranks follow the order returned by vectorstore.similarity_search
        (most similar first).
    """
    def _ranked_record(position: int, hit) -> Dict:
        # Attach the rank to the extracted metadata of a single hit.
        record = extract_metadata_from_retrieved_doc(hit)
        record['rank'] = position
        return record

    hits = vectorstore.similarity_search(query, k=k)

    return [
        _ranked_record(position, hit)
        for position, hit in enumerate(hits, start=1)
    ]

print("✓ Global retrieval function defined")

# %% [markdown]
# ## 7.2 Single-Document Retrieval
# 
# Retrieve documents filtered to a specific target document

# %%
def retrieve_single_doc(
    vectorstore: Chroma,
    query: str,
    target_doc_name: str,
    k: int
) -> List[Dict]:
    """
    Retrieve documents filtered to a single target document.
    
    This mode assumes we already know which document contains the answer
    and only searches within that document.
    Use case: Testing passage retrieval accuracy when document is known.
    
    Implementation:
        The document name is derived by extract_metadata_from_retrieved_doc,
        so filtering happens in Python after a global search:
        1. Retrieve k × 10 candidates globally
        2. Keep those whose 'doc_name' equals target_doc_name
        3. If fewer than k were found and the collection has more chunks,
           widen the search (× 4 per round) and repeat
        4. Return the top k of the filtered results
    
        Step 3 fixes a silent under-retrieval: previously, when the target
        document's chunks did not appear within the global top k × 10, fewer
        than k (possibly zero) chunks were returned even though the document
        contained more.
    
    Args:
        vectorstore: Loaded ChromaDB vectorstore
        query: Query text
        target_doc_name: Target document name (e.g., "3M_2018_10K")
        k: Number of documents to retrieve
        
    Returns:
        List of retrieved documents from target document:
        [
            {
                'doc_name': '3M_2018_10K',
                'page_number': 47,
                'rank': 1,
                'chunk_text': 'Full chunk text...'
            },
            ...
        ]
        
    Note:
        Fewer than k results are returned only when the whole collection has
        been searched and the target document has fewer than k chunks.
        Each widening round issues another similarity_search call.
    """
    fetch_count = k * 10
    
    # Collection size (if exposed) lets us cap the widening and detect exhaustion.
    # NOTE(review): _collection is a private attribute of the LangChain Chroma
    # wrapper; it is already used elsewhere in this notebook.
    collection = getattr(vectorstore, "_collection", None)
    total_chunks = collection.count() if collection is not None else None
    
    while True:
        results = vectorstore.similarity_search(query, k=fetch_count)
        
        # Filter to target document and extract metadata (results are in rank order)
        filtered = []
        for doc in results:
            metadata = extract_metadata_from_retrieved_doc(doc)
            if metadata['doc_name'] == target_doc_name:
                filtered.append(metadata)
                # Stop once we have enough
                if len(filtered) >= k:
                    break
        
        # The store returned fewer hits than requested, or we already asked
        # for every chunk: there is nothing more to find.
        corpus_exhausted = len(results) < fetch_count or (
            total_chunks is not None and fetch_count >= total_chunks
        )
        if len(filtered) >= k or corpus_exhausted:
            break
        
        # Widen the search geometrically so the number of rounds stays small
        fetch_count *= 4
        if total_chunks is not None:
            fetch_count = min(fetch_count, total_chunks)
    
    # Take top k from filtered results
    top_k_filtered = filtered[:k]
    
    # Add rank (1-indexed, relative to the filtered list)
    for rank, doc_metadata in enumerate(top_k_filtered, start=1):
        doc_metadata['rank'] = rank
    
    return top_k_filtered

print("✓ Single-document retrieval function defined")

# %% [markdown]
# ## 7.3 Test Retrieval Functions
# 
# Verify both retrieval modes work correctly

# %%
def test_retrieval_functions():
    """
    Smoke-test global and single-document retrieval against a real vectorstore.

    Loads the voyage / voyage-finance-2 / chunk-size-1024 store via
    load_vectorstore, runs one fixed query through retrieve_global and
    retrieve_single_doc (target "3M_2018_10K", k=5), prints the top three hits
    of each, and reports whether every single-document hit comes from the
    target document.

    Returns:
        True (always).
    """
    heavy_rule = "=" * 60
    light_rule = "-" * 60

    def _print_top_results(results):
        # Shared pretty-printer for the first three hits of a result list.
        print("\nTop 3 results:")
        for number, hit in enumerate(results[:3], start=1):
            print(f"\n  {number}. Rank {hit['rank']}")
            print(f"     Doc: {hit['doc_name']}")
            print(f"     Page: {hit['page_number']}")
            print(f"     Text (first 100 chars): {hit['chunk_text'][:100]}...")
            print(f"     Text length: {len(hit['chunk_text'])} chars")

    print("\n" + heavy_rule)
    print("TESTING RETRIEVAL FUNCTIONS")
    print(heavy_rule)

    # Test configuration
    test_provider, test_model = "voyage", "voyage-finance-2"
    test_chunk_size, test_k = 1024, 5

    print("\nTest configuration:")
    print(f"  Provider: {test_provider}")
    print(f"  Model: {test_model}")
    print(f"  Chunk size: {test_chunk_size}")
    print(f"  K: {test_k}")

    # Load vectorstore
    print("\nLoading vectorstore...")
    vectorstore = load_vectorstore(test_provider, test_model, test_chunk_size)
    doc_count = vectorstore._collection.count()
    print(f"✓ Loaded ({doc_count:,} documents)")

    # Test query
    test_query = "What was the capital expenditure in 2018?"
    print(f"\nTest query: {test_query}")

    # Test 1: Global retrieval
    print("\n" + light_rule)
    print("TEST 1: Global Retrieval")
    print(light_rule)

    global_results = retrieve_global(vectorstore, test_query, test_k)

    print(f"✓ Retrieved {len(global_results)} documents")
    _print_top_results(global_results)

    # Test 2: Single-document retrieval
    print("\n" + light_rule)
    print("TEST 2: Single-Document Retrieval")
    print(light_rule)

    target_doc = "3M_2018_10K"
    print(f"Target document: {target_doc}")

    singledoc_results = retrieve_single_doc(vectorstore, test_query, target_doc, test_k)

    print(f"✓ Retrieved {len(singledoc_results)} documents from target")
    _print_top_results(singledoc_results)

    # Every hit must belong to the requested document
    stray_hits = [hit for hit in singledoc_results if hit['doc_name'] != target_doc]
    if not stray_hits:
        print(f"\n✓ All results correctly filtered to {target_doc}")
    else:
        print("\n✗ Some results not from target document!")

    print("\n" + heavy_rule)
    print("✓ RETRIEVAL FUNCTIONS TEST COMPLETE")
    print(heavy_rule)

    return True

# %%
# Run test
test_retrieval = test_retrieval_functions()

# %%
print("\n" + "="*60)
print("✓ STEP 7 COMPLETE!")
print("="*60)
print("  ✓ Global retrieval function defined")
print("  ✓ Single-document retrieval function defined")
print("  ✓ Both retrieval modes tested successfully")
print("  ✓ Chunk text extraction verified")
print("  ✓ Ready for main evaluation function")
print("="*60)

In [None]:
# ============================================================================
# Step 8: Main Evaluation Function
# ============================================================================

# %% [markdown]
# ## 8.1 File Management Functions
# 
# Helper functions for saving and checking results

# %%
def get_output_filename(
    provider: str,
    model: str,
    chunk_size: int,
    k: int,
    mode: str
) -> str:
    """
    Build the standardized results filename for one configuration.

    Format: {provider}_{model}_chunk{size}_k{k}_{mode}.json
    Any '/' in the model name is replaced by '_' so the name stays a single
    path component.

    Example:
        voyage_voyage-finance-2_chunk1024_k20_global.json
    """
    # Splitting on '/' and re-joining with '_' swaps every slash for an underscore
    model_safe = "_".join(model.split('/'))
    return "{}_{}_chunk{}_k{}_{}.json".format(provider, model_safe, chunk_size, k, mode)


def check_if_results_exist(
    provider: str,
    model: str,
    chunk_size: int,
    k: int,
    mode: str,
    output_dir: str
) -> bool:
    """
    Report whether the results file for this configuration is already on disk.

    The path is output_dir joined with the name from get_output_filename.
    Callers use this to skip configurations that were evaluated before.
    """
    return os.path.exists(
        os.path.join(
            output_dir,
            get_output_filename(provider, model, chunk_size, k, mode),
        )
    )


def save_results(
    results: List[Dict],
    provider: str,
    model: str,
    chunk_size: int,
    k: int,
    mode: str,
    output_dir: str
):
    """
    Save evaluation results to JSON file.
    
    The file is written to a temporary sibling path first and then moved into
    place with os.replace. Because check_if_results_exist treats any existing
    file as a finished configuration, a crash or interrupt during writing must
    not leave a truncated JSON file under the final name.
    
    Args:
        results: List of result dictionaries (queries + summary)
        provider: Embedding provider
        model: Model name
        chunk_size: Chunk size
        k: Number of retrieved documents
        mode: "global" or "singledoc"
        output_dir: Output directory (created if it does not exist)
        
    Raises:
        TypeError / ValueError: If results cannot be serialized by json.dump
        OSError: If the directory or file cannot be written
    """
    filename = get_output_filename(provider, model, chunk_size, k, mode)
    filepath = os.path.join(output_dir, filename)
    
    # os.makedirs('') raises, so only create the directory when one is given
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    
    temp_path = filepath + ".tmp"
    try:
        with open(temp_path, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2)
        # Atomic on the same filesystem: the final name only ever holds a complete file
        os.replace(temp_path, filepath)
    finally:
        # Still present only if writing or the replace failed; never leave debris behind
        if os.path.exists(temp_path):
            os.remove(temp_path)
    
    print(f"✓ Saved: {filename}")

print("✓ File management functions defined")

# %% [markdown]
# ## 8.2 Single Configuration Evaluation
# 
# Evaluate a single configuration: (provider, model, chunk_size, k, mode)

# %%
def _mean(values: List[float]) -> float:
    """Return the arithmetic mean of ``values``, or 0.0 when the list is empty."""
    return sum(values) / len(values) if values else 0.0


def evaluate_single_configuration(
    dataset,
    evidence_lookup: Dict,
    sbert_model: SentenceTransformer,
    provider: str,
    model: str,
    chunk_size: int,
    k: int,
    mode: str,
    use_page_tolerance: bool = True,
    text_similarity_threshold: float = TEXT_SIMILARITY_THRESHOLD,
    output_dir: str = OUTPUT_DIR
) -> Dict:
    """
    Evaluate a single configuration with BOTH page-based AND text-based metrics.

    Steps:
    1. Reject an unsupported ``mode`` and skip configurations whose results
       file already exists.
    2. Load the vector store.
    3. For each dataset record that has evidence in ``evidence_lookup``:
       - retrieve documents (global or single-document),
       - calculate PAGE-based metrics (MRR, Recall, Precision, F1),
       - calculate TEXT-based metrics (MRR, Recall, Precision, F1),
       - store the per-query record.
       A query that raises is recorded with an ``error`` field and scored 0.0
       on every metric.
    4. Average each metric over the evaluated queries.
    5. Save the per-query records plus a trailing summary record to JSON.

    Each evaluated query contributes exactly one value per metric: scores are
    appended only after the whole query has either succeeded or failed, so a
    failure part-way through cannot be counted twice.

    Args:
        dataset: FinanceBench dataset
        evidence_lookup: Mapping of query id -> list of evidence items
        sbert_model: Sentence-BERT model for text similarity
        provider: "ollama", "openai", or "voyage"
        model: Model name
        chunk_size: Chunk size
        k: Number of documents to retrieve
        mode: "global" or "singledoc"
        use_page_tolerance: If True, use chunk-size-aware page tolerance
        text_similarity_threshold: Threshold for text-based matching
        output_dir: Output directory for results

    Returns:
        Dictionary with a ``status`` key:
        - ``{'status': 'skipped'}`` if the results file already exists;
        - ``{'status': 'failed', 'error': ...}`` if ``mode`` is unsupported or
          the vector store cannot be loaded;
        - ``{'status': 'completed', ...}`` with the eight ``average_*``
          metrics, ``total_queries`` (dataset size) and ``evaluated_queries``
          (number of queries the averages are taken over).
    """
    print(f"\n{'='*60}")
    print(f"EVALUATING: {provider}/{model}")
    print(f"  Chunk size: {chunk_size}")
    print(f"  K: {k}")
    print(f"  Mode: {mode}")
    print(f"  Page tolerance: {'ENABLED' if use_page_tolerance else 'DISABLED'}")
    print(f"  Text similarity threshold: {text_similarity_threshold}")
    print(f"{'='*60}")

    # Validate the mode once, before any work. Raising inside the per-query
    # try block would be swallowed for every query and would save an all-zero
    # results file that later runs treat as "already evaluated".
    if mode not in ("global", "singledoc"):
        error = f"Unknown mode: {mode}"
        print(f"✗ {error}")
        return {'status': 'failed', 'error': error}

    # Check if already exists
    if check_if_results_exist(provider, model, chunk_size, k, mode, output_dir):
        print("✓ Results already exist - SKIPPING")
        return {'status': 'skipped'}

    # Load vectorstore
    print("\nLoading vectorstore...")
    try:
        vectorstore = load_vectorstore(provider, model, chunk_size)
        doc_count = vectorstore._collection.count()
        print(f"✓ Loaded ({doc_count:,} documents)")
    except Exception as e:
        print(f"✗ Failed to load vectorstore: {e}")
        return {'status': 'failed', 'error': str(e)}

    # Per-query records (the summary record is appended at the end)
    results = []

    # One score list per metric; order fixes the key order of the summary.
    metric_names = (
        'page_mrr', 'page_recall', 'page_precision', 'page_f1',
        'text_mrr', 'text_recall', 'text_precision', 'text_f1',
    )
    scores = {name: [] for name in metric_names}
    skipped_no_evidence = 0
    error_count = 0

    # Process all queries
    print(f"\nProcessing {len(dataset)} queries...")
    print("(This may take a while due to text similarity calculations...)")

    for record in tqdm(dataset, desc="Queries"):
        query_id = record['financebench_id']
        query = record['question']
        doc_name = record['doc_name']

        # Get evidence for this query
        evidence_items = evidence_lookup.get(query_id, [])

        if len(evidence_items) == 0:
            # No evidence for this query: it is excluded from the averages.
            skipped_no_evidence += 1
            continue

        try:
            # ========================================
            # STEP 1: RETRIEVE DOCUMENTS
            # ========================================
            if mode == "global":
                retrieved_docs = retrieve_global(vectorstore, query, k)
            else:  # "singledoc" (mode was validated above)
                retrieved_docs = retrieve_single_doc(vectorstore, query, doc_name, k)

            # ========================================
            # STEP 2: CALCULATE PAGE-BASED METRICS
            # ========================================
            page_mrr, page_rank = calculate_page_mrr_for_query(
                retrieved_docs, evidence_items, chunk_size, use_page_tolerance
            )
            page_recall, page_precision, page_f1 = calculate_page_metrics_for_query(
                retrieved_docs, evidence_items, chunk_size, use_page_tolerance
            )

            # ========================================
            # STEP 3: CALCULATE TEXT-BASED METRICS
            # ========================================
            # (This is the slowest part)
            text_mrr, text_rank, text_recall, text_precision, text_f1, all_similarities = \
                calculate_text_metrics_for_query(
                    retrieved_docs,
                    evidence_items,
                    sbert_model,
                    threshold=text_similarity_threshold
                )

            # ========================================
            # STEP 4: FORMAT RESULTS FOR JSON
            # ========================================

            # Expected evidence, with the text truncated to a 200-char preview
            expected_evidence = [
                {
                    'doc_name': ev['doc_name'],
                    'page_number': ev['page_number'],
                    'evidence_text': (
                        ev['evidence_text'][:200] + '...'
                        if len(ev['evidence_text']) > 200
                        else ev['evidence_text']
                    )
                }
                for ev in evidence_items
            ]

            # Retrieved docs, each paired with its similarity entry (if any)
            retrieved_docs_formatted = []
            for i, doc in enumerate(retrieved_docs):
                retrieved_docs_formatted.append({
                    'doc_name': doc['doc_name'],
                    'page_number': doc['page_number'],
                    'rank': doc['rank'],
                    'chunk_text': format_chunk_text_preview(doc['chunk_text']),
                    'text_similarities': all_similarities[i] if i < len(all_similarities) else []
                })

            result = {
                'query_id': query_id,
                'query': query,
                'expected_doc': doc_name,
                'expected_evidence': expected_evidence,
                'retrieved_docs': retrieved_docs_formatted,

                # Page-based metrics
                'page_mrr_score': page_mrr,
                'page_rank': page_rank,
                'page_recall': page_recall,
                'page_precision': page_precision,
                'page_f1': page_f1,

                # Text-based metrics
                'text_mrr_score': text_mrr,
                'text_rank': text_rank,
                'text_recall': text_recall,
                'text_precision': text_precision,
                'text_f1': text_f1
            }
            query_scores = {
                'page_mrr': page_mrr,
                'page_recall': page_recall,
                'page_precision': page_precision,
                'page_f1': page_f1,
                'text_mrr': text_mrr,
                'text_recall': text_recall,
                'text_precision': text_precision,
                'text_f1': text_f1,
            }

        except Exception as e:
            print(f"\n✗ Error processing query {query_id}: {e}")
            error_count += 1
            # Error record: the query still counts, with every metric at zero.
            result = {
                'query_id': query_id,
                'query': query,
                'error': str(e),
                'page_mrr_score': 0.0,
                'page_rank': -1,
                'page_recall': 0.0,
                'page_precision': 0.0,
                'page_f1': 0.0,
                'text_mrr_score': 0.0,
                'text_rank': -1,
                'text_recall': 0.0,
                'text_precision': 0.0,
                'text_f1': 0.0
            }
            query_scores = dict.fromkeys(metric_names, 0.0)

        # Single commit point: exactly one record and one score per metric
        # for every evaluated query, whether it succeeded or failed.
        results.append(result)
        for name in metric_names:
            scores[name].append(query_scores[name])

    # ========================================
    # CALCULATE AVERAGE METRICS
    # ========================================

    evaluated_queries = len(results)
    averages = {f'average_{name}': _mean(scores[name]) for name in metric_names}

    if error_count:
        print(f"\n⚠ {error_count} of {evaluated_queries} evaluated queries failed and were scored as 0.0")

    # Add summary to results (always the last element of the saved list)
    results.append({
        'summary': {
            'provider': provider,
            'model': model,
            'chunk_size': chunk_size,
            'k': k,
            'mode': mode,
            'use_page_tolerance': use_page_tolerance,
            'text_similarity_threshold': text_similarity_threshold,
            'total_queries': len(dataset),

            # Page-based and text-based averages
            **averages,

            # Denominator bookkeeping for the averages above
            'evaluated_queries': evaluated_queries,
            'skipped_no_evidence': skipped_no_evidence,
            'error_count': error_count
        }
    })

    # Save results
    save_results(results, provider, model, chunk_size, k, mode, output_dir)

    # Print summary
    print(f"\n{'='*60}")
    print("RESULTS SUMMARY")
    print(f"{'='*60}")
    print("\nPage-Based Metrics:")
    print(f"  Average MRR:       {averages['average_page_mrr']:.4f}")
    print(f"  Average Recall:    {averages['average_page_recall']:.4f}")
    print(f"  Average Precision: {averages['average_page_precision']:.4f}")
    print(f"  Average F1:        {averages['average_page_f1']:.4f}")
    print("\nText-Based Metrics:")
    print(f"  Average MRR:       {averages['average_text_mrr']:.4f}")
    print(f"  Average Recall:    {averages['average_text_recall']:.4f}")
    print(f"  Average Precision: {averages['average_text_precision']:.4f}")
    print(f"  Average F1:        {averages['average_text_f1']:.4f}")
    print(f"{'='*60}")

    return {
        'status': 'completed',
        **averages,
        'total_queries': len(dataset),
        'evaluated_queries': evaluated_queries
    }

# Marker for the cell that defines evaluate_single_configuration.
print("✓ Single configuration evaluation function defined")

# %%
# Step 8 recap, emitted as one newline-separated print call.
print(
    "\n" + "=" * 60,
    "✓ STEP 8 COMPLETE!",
    "=" * 60,
    "  ✓ File management functions defined",
    "  ✓ Main evaluation function defined",
    "  ✓ Processes both page-based AND text-based metrics",
    "  ✓ Saves comprehensive results to JSON",
    "  ✓ Ready for batch evaluation",
    "=" * 60,
    sep="\n",
)

In [None]:
# ============================================================================
# Step 9: Batch Evaluation Function
# ============================================================================

# %% [markdown]
# ## 9.1 Batch Evaluation
# 
# Evaluate multiple configurations automatically

# %%
def evaluate_multiple_configurations(
    dataset,
    evidence_lookup: Dict,
    sbert_model: SentenceTransformer,
    configurations: List[Dict],
    k_values: List[int],
    modes: List[str],
    use_page_tolerance: bool = True,
    text_similarity_threshold: float = TEXT_SIMILARITY_THRESHOLD,
    output_dir: str = OUTPUT_DIR
) -> Dict:
    """
    Evaluate multiple configurations in batch.

    Iterates over every combination of:
    - Configurations (provider, model, chunk_sizes)
    - K values (number of documents to retrieve)
    - Modes (global, singledoc)

    and evaluates each one with evaluate_single_configuration(), tallying the
    returned ``status`` as completed, skipped, or failed (any other value).

    An empty ``configurations``, ``k_values`` or ``modes`` yields zero runs;
    the summary is still printed and returned.

    Args:
        dataset: FinanceBench dataset
        evidence_lookup: Mapping of query id -> list of evidence items
        sbert_model: Sentence-BERT model
        configurations: List of {provider, model, chunk_sizes}
        k_values: List of k values to test
        modes: List of modes ["global", "singledoc"]
        use_page_tolerance: If True, use chunk-size-aware tolerance
        text_similarity_threshold: Threshold for text-based matching
        output_dir: Output directory

    Returns:
        Dictionary with ``total_runs``, ``completed``, ``skipped``, ``failed``,
        ``elapsed_time`` (seconds) and ``results`` (one entry per run holding
        the configuration fields and the single-run result dictionary).

    Example configurations:
        [
            {
                'provider': 'voyage',
                'model': 'voyage-finance-2',
                'chunk_sizes': [512, 1024, 2048]
            },
            ...
        ]
    """
    # Kept as a function-local import, as in the original cell.
    import time

    print(f"\n{'='*60}")
    print("BATCH EVALUATION")
    print(f"{'='*60}")
    print(f"Configurations: {len(configurations)}")
    print(f"K values: {k_values}")
    print(f"Modes: {modes}")
    print(f"Page tolerance: {'ENABLED' if use_page_tolerance else 'DISABLED'}")
    print(f"Text similarity threshold: {text_similarity_threshold}")

    # Every chunk size of every configuration is run for each (k, mode) pair.
    total_runs = sum(
        len(config['chunk_sizes']) * len(k_values) * len(modes)
        for config in configurations
    )

    print(f"Total evaluation runs: {total_runs}")
    print(f"{'='*60}")

    # Track results
    all_results = []
    completed = 0
    skipped = 0
    failed = 0

    start_time = time.time()

    # Iterate through all combinations
    for config in configurations:
        provider = config['provider']
        model = config['model']
        chunk_sizes = config['chunk_sizes']

        for chunk_size in chunk_sizes:
            for k in k_values:
                for mode in modes:
                    print(f"\n{'#'*60}")
                    print(f"CONFIGURATION {completed + skipped + failed + 1}/{total_runs}")
                    print(f"{'#'*60}")

                    result = evaluate_single_configuration(
                        dataset=dataset,
                        evidence_lookup=evidence_lookup,
                        sbert_model=sbert_model,
                        provider=provider,
                        model=model,
                        chunk_size=chunk_size,
                        k=k,
                        mode=mode,
                        use_page_tolerance=use_page_tolerance,
                        text_similarity_threshold=text_similarity_threshold,
                        output_dir=output_dir
                    )

                    all_results.append({
                        'provider': provider,
                        'model': model,
                        'chunk_size': chunk_size,
                        'k': k,
                        'mode': mode,
                        'result': result
                    })

                    if result['status'] == 'completed':
                        completed += 1
                    elif result['status'] == 'skipped':
                        skipped += 1
                    else:
                        failed += 1

    elapsed_time = time.time() - start_time

    # Summary
    print(f"\n{'='*60}")
    print("BATCH EVALUATION SUMMARY")
    print(f"{'='*60}")
    print(f"Total runs: {total_runs}")
    print(f"Completed: {completed}")
    print(f"Skipped: {skipped}")
    print(f"Failed: {failed}")
    print(f"Total time: {elapsed_time/60:.2f} minutes")
    if total_runs:
        print(f"Average time per run: {elapsed_time/total_runs:.2f} seconds")
    else:
        # Nothing was scheduled; dividing by total_runs would raise ZeroDivisionError.
        print("Average time per run: n/a (no runs)")
    print(f"{'='*60}")

    return {
        'total_runs': total_runs,
        'completed': completed,
        'skipped': skipped,
        'failed': failed,
        'elapsed_time': elapsed_time,
        'results': all_results
    }

# Cell-completion marker for the evaluate_multiple_configurations definition above.
print("✓ Batch evaluation function defined")

# %% [markdown]
# ## 9.2 Results Analysis Helper
# 
# Helper function to display results in a readable format

# %%
def display_batch_results(summary: Dict):
    """
    Print a per-status report of a batch evaluation summary.

    Entries of ``summary['results']`` are grouped by
    ``entry['result']['status']``:
    - completed runs are printed as a table, sorted by
      (provider, model, chunk_size, k, mode), showing the four page-based
      and four text-based averages side by side;
    - skipped runs are listed by configuration;
    - failed runs are listed with their ``error`` message
      ('Unknown error' when the result carries none).
    """
    double_rule = "=" * 80
    single_rule = "-" * 80

    def entries_with(status_label):
        # Entries whose single-run result carries the given status string.
        return [e for e in summary['results'] if e['result']['status'] == status_label]

    def long_label(entry):
        # "provider/model/chunkN/kN/mode" form used in the skipped/failed lists.
        return "{}/{}/chunk{}/k{}/{}".format(
            entry['provider'], entry['model'], entry['chunk_size'], entry['k'], entry['mode']
        )

    def config_order(entry):
        # Sort key for the completed table.
        return (entry['provider'], entry['model'], entry['chunk_size'], entry['k'], entry['mode'])

    print("\n" + double_rule)
    print("DETAILED RESULTS - ALL CONFIGURATIONS")
    print(double_rule)

    done = entries_with('completed')
    passed_over = entries_with('skipped')
    broken = entries_with('failed')

    if done:
        print("\n" + single_rule)
        print(f"COMPLETED EVALUATIONS ({len(done)})")
        print(single_rule)

        # Two header rows: metric family, then the per-metric column titles.
        column_titles = " ".join(f"{title:>6}" for title in ('MRR', 'Rec', 'Prec', 'F1') * 2)
        print("\n{:<40} {:<25} {:<25}".format('Config', 'Page Metrics', 'Text Metrics'))
        print(" ".join(("-" * 40, "-" * 25, "-" * 25)))
        print(f"{'Provider/Model/Chunk/K/Mode':<40} {column_titles}")
        print("-" * 90)

        # Page-based metrics first, then text-based, matching the header.
        metric_keys = (
            'average_page_mrr', 'average_page_recall',
            'average_page_precision', 'average_page_f1',
            'average_text_mrr', 'average_text_recall',
            'average_text_precision', 'average_text_f1',
        )

        for entry in sorted(done, key=config_order):
            short_label = "{}/{}/ch{}/k{}/{}".format(
                entry['provider'], entry['model'], entry['chunk_size'], entry['k'], entry['mode']
            )
            outcome = entry['result']
            values = [outcome[key] for key in metric_keys]
            cells = " ".join(f"{value:>6.3f}" for value in values)
            print(f"{short_label:<40} {cells}")

    if passed_over:
        print("\n" + single_rule)
        print(f"SKIPPED EVALUATIONS ({len(passed_over)})")
        print(single_rule)
        for entry in passed_over:
            print(f"  - {long_label(entry)}")

    if broken:
        print("\n" + single_rule)
        print(f"FAILED EVALUATIONS ({len(broken)})")
        print(single_rule)
        for entry in broken:
            label = long_label(entry)
            reason = entry['result'].get('error', 'Unknown error')
            print(f"  - {label}: {reason}")

    print("\n" + double_rule)

# Cell-completion marker for the display_batch_results definition above.
print("✓ Results analysis helper defined")

# %% [markdown]
# ## 9.3 List Generated Files
# 
# Helper to show all generated JSON files

# %%
def list_generated_files(output_dir: str = OUTPUT_DIR):
    """
    Print every ``*.json`` file found directly in ``output_dir`` with its
    size in KB, followed by the combined size. Prints a notice instead of
    the table when no JSON file is present.
    """
    banner = "=" * 60
    table_rule = "-" * 72

    print("\n" + banner)
    print("GENERATED FILES")
    print(banner)

    found = sorted(Path(output_dir).glob("*.json"))

    print(f"\nTotal JSON files: {len(found)}")
    print(f"Location: {output_dir}\n")

    if not found:
        print("No JSON files found.")
    else:
        # Sizes in bytes, in the same order as `found`.
        byte_sizes = [entry.stat().st_size for entry in found]
        combined_bytes = sum(byte_sizes)

        print(f"{'Filename':<60} {'Size':>10}")
        print(table_rule)

        for entry, n_bytes in zip(found, byte_sizes):
            kilobytes = n_bytes / 1024
            print(f"{entry.name:<60} {kilobytes:>8.1f} KB")

        print(table_rule)
        print(f"{'TOTAL':<60} {combined_bytes/1024:>8.1f} KB")

    print("\n" + banner)

# Marker for the cell that defines list_generated_files.
print("✓ File listing helper defined")

# %%
# Step 9 recap, emitted as one newline-separated print call.
print(
    "\n" + "=" * 60,
    "✓ STEP 9 COMPLETE!",
    "=" * 60,
    "  ✓ Batch evaluation function defined",
    "  ✓ Results display helper defined",
    "  ✓ File listing helper defined",
    "  ✓ Ready for configuration and execution",
    "=" * 60,
    sep="\n",
)

In [None]:
# ============================================================================
# Step 10: Configuration and Execution
# ============================================================================

# %% [markdown]
# ## 10.1 Define Configurations to Test
# 
# Specify which embedding models and chunk sizes to evaluate

# %%
# Configurations to evaluate.
# Each entry names an embedding provider, a model, and the chunk sizes to test;
# every chunk size is later combined with every k value and every mode.
# Entries that are commented out are disabled: they are neither planned nor run.

configurations = [
    {
        'provider': 'ollama',
        'model': 'nomic-embed-text',
        'chunk_sizes': [512, 1024]
    },
    {
        'provider': 'ollama',
        'model': 'bge-m3',
        'chunk_sizes': [512, 1024]
    },
    # {
    #     'provider': 'openai',
    #     'model': 'text-embedding-3-small',
    #     'chunk_sizes': [256, 512, 1024, 2048]
    # },
    # {
    #     'provider': 'openai',
    #     'model': 'text-embedding-3-large',
    #     'chunk_sizes': [512, 1024]
    # },
    {
        'provider': 'voyage',
        'model': 'voyage-3-large',
        'chunk_sizes': [512, 1024, 2048, 4096]
    },
    # {
    #     'provider': 'voyage',
    #     'model': 'voyage-finance-2',
    #     'chunk_sizes': [512, 1024]
    # },
]

print("✓ Configurations defined")

# %% [markdown]
# ## 10.2 Define Evaluation Parameters

# %%
# K values to test (number of documents to retrieve per query)
k_values = [20, 40, 60, 80]

# Retrieval modes to test
modes = ['global', 'singledoc']

# Page tolerance setting
# - True: Use chunk-size-aware page tolerance (lenient matching for large chunks)
# - False: Exact page match only (strict evaluation)
USE_PAGE_TOLERANCE = True

# Text similarity threshold
# - Chunks with cosine similarity >= this value are considered matches
# - Higher = stricter matching, Lower = more lenient matching
# NOTE(review): evaluate_single_configuration and evaluate_multiple_configurations
# use TEXT_SIMILARITY_THRESHOLD as a parameter default, which Python binds when
# those `def` cells run. The name must therefore already exist before Steps 8-9
# (presumably set in the Step 1 configuration - TODO confirm), and changing the
# value here affects the batch run only because section 10.4 passes it explicitly.
TEXT_SIMILARITY_THRESHOLD = 0.7

print("✓ Evaluation parameters defined")

# %% [markdown]
# ## 10.3 Display Evaluation Plan

# %%
# Dry-run overview: prints what the batch evaluation would do, without running it.
# Reads `dataset`, `all_evidence` and `evidence_embeddings` from earlier cells.
# Note: the loop variables below (total_runs, provider, model, chunk_size, k,
# mode, filename, ...) are plain notebook globals and stay defined afterwards.
print("\n" + "="*60)
print("EVALUATION PLAN")
print("="*60)

print(f"\nDataset: FinanceBench ({len(dataset)} queries)")
print(f"Evidence items: {len(all_evidence)}")
print(f"Pre-computed embeddings: {evidence_embeddings.shape[0]}")

print(f"\nEvaluation Settings:")
print(f"  K values: {k_values}")
print(f"  Modes: {modes}")
print(f"  Page tolerance: {'ENABLED' if USE_PAGE_TOLERANCE else 'DISABLED'}")
print(f"  Text similarity threshold: {TEXT_SIMILARITY_THRESHOLD}")

print(f"\nConfigurations to evaluate:")
total_runs = 0
for i, config in enumerate(configurations, start=1):
    provider = config['provider']
    model = config['model']
    chunk_sizes = config['chunk_sizes']
    
    # One run per (chunk size, k, mode) combination for this provider/model.
    runs_for_config = len(chunk_sizes) * len(k_values) * len(modes)
    total_runs += runs_for_config
    
    print(f"\n  {i}. {provider}/{model}")
    print(f"     Chunk sizes: {chunk_sizes}")
    print(f"     Evaluation runs: {runs_for_config}")
    
    # Show output filenames that will be generated; [EXISTS] marks files already
    # present in OUTPUT_DIR, which the batch run will skip.
    print(f"     Output files:")
    for chunk_size in chunk_sizes:
        for k in k_values:
            for mode in modes:
                filename = get_output_filename(provider, model, chunk_size, k, mode)
                exists = check_if_results_exist(provider, model, chunk_size, k, mode, OUTPUT_DIR)
                status = "EXISTS" if exists else "TO CREATE"
                print(f"       - {filename} [{status}]")

print(f"\n{'='*60}")
print(f"Total evaluation runs: {total_runs}")
print(f"Output directory: {OUTPUT_DIR}")
print(f"{'='*60}")

# %% [markdown]
# ## 10.4 Execute Batch Evaluation
# 
# **IMPORTANT**: This cell will run the full evaluation.
# - Depending on configurations, this may take 30 minutes to several hours
# - Progress will be shown for each configuration
# - Results are saved incrementally (existing results are skipped)

# %%
# Run batch evaluation
print("\n" + "#"*60)
print("STARTING BATCH EVALUATION")
print("#"*60)
print("\nNOTE: This may take a while. Progress will be shown for each configuration.")
print("You can interrupt and resume later - completed evaluations will be skipped.\n")

# This call is live: running the cell evaluates every configuration whose
# results file does not exist yet. `summary` is consumed by section 10.5.
summary = evaluate_multiple_configurations(
    dataset=dataset,
    evidence_lookup=evidence_lookup,
    sbert_model=sbert_model,
    configurations=configurations,
    k_values=k_values,
    modes=modes,
    use_page_tolerance=USE_PAGE_TOLERANCE,
    text_similarity_threshold=TEXT_SIMILARITY_THRESHOLD,
    output_dir=OUTPUT_DIR
)

# Report what actually happened (the previous message claimed the evaluation
# had not been run even though the call above had just executed).
print(
    f"\n✓ Batch evaluation finished: {summary['completed']} completed, "
    f"{summary['skipped']} skipped, {summary['failed']} failed"
)

# %% [markdown]
# ## 10.5 Display Results (Run after evaluation completes)
# 
# Run this cell after the evaluation in section 10.4 completes; it needs the `summary` returned there.

# %%
# Display detailed results in table format.
# Requires `summary`, the value returned by the batch evaluation in section 10.4.
display_batch_results(summary)

# %% [markdown]
# ## 10.6 List Generated Files
# 
# View all generated JSON files

# %%
# List every JSON file currently in OUTPUT_DIR with its size (read-only; runs no evaluation)
list_generated_files(OUTPUT_DIR)

# %% [markdown]
# ## 10.7 Load and Analyze a Single Result
# 
# Example: How to load and inspect a single result file

# %%
def load_and_inspect_result(filename: str, output_dir: str = OUTPUT_DIR):
    """
    Load a single result file and print its summary plus one sample query.

    The file is expected to hold a JSON list whose last element is
    ``{'summary': {...}}`` and whose other elements are per-query records.
    Per-query records written for failed queries carry an ``error`` field and
    no ``retrieved_docs``; the sample shown is therefore the first record
    without an ``error`` field.

    Args:
        filename: Name of the JSON file (e.g., "voyage_voyage-finance-2_chunk1024_k20_global.json")
        output_dir: Output directory

    Returns:
        The parsed list of records, or None when the file is missing, is not
        valid JSON, or does not end with a summary record.
    """
    filepath = os.path.join(output_dir, filename)

    if not os.path.exists(filepath):
        print(f"❌ File not found: {filename}")
        return None

    # Load JSON; a truncated or corrupt file is reported instead of raising.
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            results = json.load(f)
    except json.JSONDecodeError as e:
        print(f"❌ Could not parse {filename}: {e}")
        return None

    # Extract summary (last item), guarding against empty or foreign files.
    last_item = results[-1] if isinstance(results, list) and results else None
    if not isinstance(last_item, dict) or 'summary' not in last_item:
        print(f"❌ No summary record found in: {filename}")
        return None
    summary = last_item['summary']

    print("\n" + "="*60)
    print(f"RESULTS: {filename}")
    print("="*60)

    print("\nConfiguration:")
    print(f"  Provider: {summary['provider']}")
    print(f"  Model: {summary['model']}")
    print(f"  Chunk size: {summary['chunk_size']}")
    print(f"  K: {summary['k']}")
    print(f"  Mode: {summary['mode']}")
    print(f"  Page tolerance: {summary['use_page_tolerance']}")
    print(f"  Text threshold: {summary['text_similarity_threshold']}")

    print("\nPage-Based Metrics:")
    print(f"  Average MRR:       {summary['average_page_mrr']:.4f}")
    print(f"  Average Recall:    {summary['average_page_recall']:.4f}")
    print(f"  Average Precision: {summary['average_page_precision']:.4f}")
    print(f"  Average F1:        {summary['average_page_f1']:.4f}")

    print("\nText-Based Metrics:")
    print(f"  Average MRR:       {summary['average_text_mrr']:.4f}")
    print(f"  Average Recall:    {summary['average_text_recall']:.4f}")
    print(f"  Average Precision: {summary['average_text_precision']:.4f}")
    print(f"  Average F1:        {summary['average_text_f1']:.4f}")

    print(f"\nTotal queries: {summary['total_queries']}")
    print(f"Total results (queries + summary): {len(results)}")

    # Everything before the summary is a per-query record.
    query_records = results[:-1]
    failed_records = [r for r in query_records if 'error' in r]
    if failed_records:
        print(f"Query records with errors: {len(failed_records)}")

    # Show the first successful query; error records lack 'retrieved_docs'.
    sample_query = next((r for r in query_records if 'error' not in r), None)
    if sample_query is not None:
        print("\nSample query result:")
        print(f"  Query ID: {sample_query['query_id']}")
        print(f"  Question: {sample_query['query'][:80]}...")
        print(f"  Page MRR: {sample_query['page_mrr_score']:.4f}, Rank: {sample_query['page_rank']}")
        print(f"  Text MRR: {sample_query['text_mrr_score']:.4f}, Rank: {sample_query['text_rank']}")
        print(f"  Retrieved docs: {len(sample_query['retrieved_docs'])}")

        if len(sample_query['retrieved_docs']) > 0:
            first_doc = sample_query['retrieved_docs'][0]
            print(f"\n  First retrieved doc:")
            print(f"    Doc: {first_doc['doc_name']}, Page: {first_doc['page_number']}")
            print(f"    Chunk text: {first_doc['chunk_text'][:100]}...")
            if len(first_doc['text_similarities']) > 0:
                print(f"    Text similarity with evidence 0: {first_doc['text_similarities'][0]['cosine_similarity']:.4f}")
    elif query_records:
        print("\nSample query result: none available (every query record is an error)")

    print("\n" + "="*60)

    return results

# Cell-completion marker for the load_and_inspect_result definition above.
print("✓ Result inspection function defined")

# Example usage (uncomment to use; the named file must exist in OUTPUT_DIR):
# results = load_and_inspect_result("voyage_voyage-finance-2_chunk1024_k20_global.json")

# %%
# Step 10 recap. The batch evaluation call in section 10.4 is live, so by the
# time this cell runs it has already executed; the wording reflects that.
print("\n" + "="*60)
print("✓ STEP 10 COMPLETE!")
print("="*60)
print("  ✓ Configurations defined")
print("  ✓ Evaluation parameters set")
print("  ✓ Evaluation plan displayed")
print("  ✓ Batch evaluation executed (existing results are skipped)")
print("  ✓ Result analysis tools ready")
print("="*60)

# %%
# Closing overview of the whole notebook.
print("\n" + "="*80)
print("🎉 ALL STEPS COMPLETE! 🎉")
print("="*80)
print("\n✅ SETUP COMPLETE:")
print("  ✓ Step 1: Imports and configuration")
print("  ✓ Step 2: Sentence-BERT model loaded")
print("  ✓ Step 3: Evidence embeddings pre-computed")
print("  ✓ Step 4: Vector store loading functions")
print("  ✓ Step 5: Page-based evaluation functions")
print("  ✓ Step 6: Text-based evaluation functions")
print("  ✓ Step 7: Retrieval functions")
print("  ✓ Step 8: Main evaluation function")
print("  ✓ Step 9: Batch evaluation function")
print("  ✓ Step 10: Configuration and execution")

print("\n📊 EVALUATION CAPABILITIES:")
print("  ✓ Page-based metrics: MRR, Recall, Precision, F1")
print("  ✓ Text-based metrics: MRR, Recall, Precision, F1")
print("  ✓ Both global and single-document modes")
print("  ✓ Comprehensive JSON output with all similarities")
print(f"  ✓ Text similarity threshold: {TEXT_SIMILARITY_THRESHOLD}")
print(f"  ✓ Pre-computed embeddings: {evidence_embeddings.shape[0]} evidence items")

# The previous text told the reader to uncomment sections 10.4 and 10.5,
# which are already uncommented and run as part of the notebook.
print("\n🚀 NEXT STEPS:")
print("  1. Review the evaluation plan from section 10.3")
print("  2. Re-run section 10.4 to evaluate any configurations that are still missing")
print("  3. Review the results table from section 10.5")
print("  4. Use section 10.6 to list the generated files")
print("  5. Use section 10.7 to inspect individual result files")

print("\n💾 OUTPUT:")
print(f"  Location: {OUTPUT_DIR}")
print(f"  Format: JSON files with complete metrics and similarities")
print(f"  Naming: {{provider}}_{{model}}_chunk{{size}}_k{{k}}_{{mode}}.json")

print("\n⚠️  IMPORTANT NOTES:")
print("  - Evaluation runs incrementally (existing results are skipped)")
print("  - You can interrupt and resume anytime")
print("  - Progress is shown for each configuration")
print("  - Each query processes text similarities (slowest part)")
print("  - Results are saved immediately after each configuration")

print("\n" + "="*80)
print("Evaluation pipeline finished. Re-run section 10.4 to resume or extend the evaluation.")
print("="*80)