In [96]:
# %% [markdown]
# # Re-Ranking Evaluation Pipeline
# This notebook implements re-ranking evaluation for RAG systems using multiple reranker models.
# 
# **Workflow:**
# 1. Retrieve top k_retrieve chunks from existing vector stores
# 2. Apply re-ranking to get top k_rerank chunks
# 3. Evaluate and compare: k_retrieve → k_rerank (initial) → k_rerank (reranked)

# %% [markdown]
# ## 1. Imports and Configuration

# %%
# Standard library imports
import os
import json
from typing import List, Dict, Tuple, Any
from pathlib import Path
from datetime import datetime

# Third-party imports
import numpy as np
from tqdm.auto import tqdm
from dotenv import load_dotenv

# LangChain imports
from langchain.schema import Document
from langchain_voyageai import VoyageAIRerank
from langchain_community.vectorstores import Chroma
from langchain_voyageai import VoyageAIEmbeddings
from langchain_openai import OpenAIEmbeddings
from langchain_community.embeddings import CohereEmbeddings

# Sentence transformers for cross-encoders
from sentence_transformers import CrossEncoder

# Hugging Face datasets
from datasets import load_dataset

print("✓ All imports successful")

# %%
# Load environment variables
load_dotenv()

# API Keys
VOYAGE_API_KEY = os.getenv("VOYAGE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
COHERE_API_KEY = os.getenv("COHERE_API_KEY")

# Verify API keys
assert VOYAGE_API_KEY is not None, "VOYAGE_API_KEY not found in environment"
print("✓ API keys loaded successfully")

# %%
# Configuration Parameters
CHUNK_TEXT_PREFIX_CHARS = 100  # Characters to keep from start
CHUNK_TEXT_SUFFIX_CHARS = 100  # Characters to keep from end
TEXT_SIMILARITY_THRESHOLD = 0.7  # From baseline
USE_PAGE_TOLERANCE = True  # From baseline

# Paths
VECTOR_DB_BASE_DIR = "../../vector_databases"  # Base directory for vector databases
COLLECTION_PREFIX = "financebench_docs_chunk_"  # Collection name prefix
OUTPUT_DIR = "../../evaluation_results/reranking_results"

# Create output directory if it doesn't exist
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

print("✓ Configuration parameters set")
print(f"  - Chunk text preview: {CHUNK_TEXT_PREFIX_CHARS} + {CHUNK_TEXT_SUFFIX_CHARS} chars")
print(f"  - Text similarity threshold: {TEXT_SIMILARITY_THRESHOLD}")
print(f"  - Vector DB base directory: {VECTOR_DB_BASE_DIR}")
print(f"  - Collection prefix: {COLLECTION_PREFIX}")
print(f"  - Output directory: {OUTPUT_DIR}")

# Evaluation modes
modes = ['global', 'single']

# Re-ranking configurations.
# Each entry pairs one embedding provider/model with the chunk sizes to load,
# the retrieval depth (k_retrieve), the post-rerank depth (k_rerank), and the
# reranker models to compare.
# NOTE(review): this definition was missing from the notebook (the name was
# only available through leftover kernel state); the values below are
# reconstructed from the recorded output of this cell - confirm before a run.
configurations = [
    {
        'provider': 'voyage',
        'model': 'voyage-3-large',
        'chunk_sizes': [1024],
        'k_retrieve': 80,
        'k_rerank': 20,
        'reranker_models': ['voyage-rerank-2.5'],
    },
]

print("✓ Re-ranking configurations defined")
print(f"\nTotal configurations: {len(configurations)}")
for i, config in enumerate(configurations, 1):
    print(f"\n  Configuration {i}:")
    print(f"    Provider/Model: {config['provider']}/{config['model']}")
    print(f"    Chunk sizes: {config['chunk_sizes']}")
    print(f"    k_retrieve: {config['k_retrieve']}, k_rerank: {config['k_rerank']}")
    print(f"    Reranker models: {len(config['reranker_models'])}")
    for reranker in config['reranker_models']:
        print(f"      - {reranker}")

print(f"\n  Evaluation modes: {modes}")

# %%
print("\n" + "="*80)
print("✓ STEP 1 COMPLETE: Imports and Configuration")
print("="*80)
print("\nNext steps:")
print("  2. Load dataset and evidence")
print("  3. Initialize reranker models")
print("  4. Implement retrieval and re-ranking functions")
print("  5. Implement evaluation metrics")
print("  6. Run batch evaluation")

✓ All imports successful
✓ API keys loaded successfully
✓ Configuration parameters set
  - Chunk text preview: 100 + 100 chars
  - Text similarity threshold: 0.7
  - Vector DB base directory: ../../vector_databases
  - Collection prefix: financebench_docs_chunk_
  - Output directory: ../../evaluation_results/reranking_results
✓ Re-ranking configurations defined

Total configurations: 1

  Configuration 1:
    Provider/Model: voyage/voyage-3-large
    Chunk sizes: [1024]
    k_retrieve: 80, k_rerank: 20
    Reranker models: 1
      - voyage-rerank-2.5

  Evaluation modes: ['global', 'single']

✓ STEP 1 COMPLETE: Imports and Configuration

Next steps:
  2. Load dataset and evidence
  3. Initialize reranker models
  4. Implement retrieval and re-ranking functions
  5. Implement evaluation metrics
  6. Run batch evaluation


In [97]:
# %% [markdown]
# ## 2. Load Dataset and Evidence

# %%
# Load FinanceBench dataset
print("Loading FinanceBench dataset...")
dataset = load_dataset("PatronusAI/financebench", split="train")
print(f"✓ Loaded {len(dataset)} questions from FinanceBench")

# %%
# Extract all evidence items for evaluation
print("\nExtracting evidence items...")
all_evidence = []
for item in dataset:
    doc_name = item['doc_name']
    evidence = item['evidence']
    
    for ev in evidence:
        all_evidence.append({
            'doc_name': doc_name,
            'page_number': ev['evidence_page_num'],
            'text': ev['evidence_text']
        })

print(f"✓ Extracted {len(all_evidence)} evidence items")
print(f"  Unique documents: {len(set(ev['doc_name'] for ev in all_evidence))}")

# %%
# Load Sentence-BERT model for text similarity
print("\nLoading Sentence-BERT model for text similarity...")
from sentence_transformers import SentenceTransformer
import numpy as np

SBERT_MODEL_NAME = "all-MiniLM-L6-v2"
sbert_model = SentenceTransformer(SBERT_MODEL_NAME)
print(f"✓ Loaded {SBERT_MODEL_NAME}")

# %%
# Compute embeddings for all evidence texts
print("\nComputing embeddings for evidence texts...")
evidence_texts = [ev['text'] for ev in all_evidence]
evidence_embeddings = sbert_model.encode(
    evidence_texts,
    convert_to_numpy=True,
    show_progress_bar=True,
    batch_size=32
)
print(f"✓ Computed {len(evidence_embeddings)} evidence embeddings")
print(f"  Embedding shape: {evidence_embeddings.shape}")

# Add embeddings to evidence items
for i, ev in enumerate(all_evidence):
    ev['embedding'] = evidence_embeddings[i]

print("✓ Evidence embeddings added to evidence items")

# %%
# Display sample evidence
print("\nSample evidence item:")
sample_ev = all_evidence[0]
print(f"  Doc: {sample_ev['doc_name']}")
print(f"  Page: {sample_ev['page_number']}")
print(f"  Text preview: {sample_ev['text'][:150]}...")
print(f"  Embedding shape: {sample_ev['embedding'].shape}")

# %%
# Display sample query
print("\nSample query from dataset:")
sample_query = dataset[0]
print(f"  Question: {sample_query['question']}")
print(f"  Doc: {sample_query['doc_name']}")
print(f"  Evidence pages: {[ev['evidence_page_num'] for ev in sample_query['evidence']]}")
print(f"  Answer: {sample_query['answer']}")

# %%
print("\n" + "="*80)
print("✓ STEP 2 COMPLETE: Dataset, Evidence, and Embeddings Loaded")
print("="*80)
print(f"\nDataset statistics:")
print(f"  Total queries: {len(dataset)}")
print(f"  Total evidence items: {len(all_evidence)}")
print(f"  Evidence embeddings computed: {len(evidence_embeddings)}")
print(f"  SBERT model: {SBERT_MODEL_NAME}")
print(f"\nNext step: Initialize reranker models")

Loading FinanceBench dataset...
✓ Loaded 150 questions from FinanceBench

Extracting evidence items...
✓ Extracted 189 evidence items
  Unique documents: 84

Loading Sentence-BERT model for text similarity...
✓ Loaded all-MiniLM-L6-v2

Computing embeddings for evidence texts...


Batches:   0%|          | 0/6 [00:00<?, ?it/s]

✓ Computed 189 evidence embeddings
  Embedding shape: (189, 384)
✓ Evidence embeddings added to evidence items

Sample evidence item:
  Doc: 3M_2018_10K
  Page: 59
  Text preview: Table of Contents 
3M Company and Subsidiaries
Consolidated Statement of Cash Flow s
Years ended December 31
 
(Millions)
 
2018
 
2017
 
2016
 
Cash ...
  Embedding shape: (384,)

Sample query from dataset:
  Question: What is the FY2018 capital expenditure amount (in USD millions) for 3M? Give a response to the question by relying on the details shown in the cash flow statement.
  Doc: 3M_2018_10K
  Evidence pages: [59]
  Answer: $1577.00

✓ STEP 2 COMPLETE: Dataset, Evidence, and Embeddings Loaded

Dataset statistics:
  Total queries: 150
  Total evidence items: 189
  Evidence embeddings computed: 189
  SBERT model: all-MiniLM-L6-v2

Next step: Initialize reranker models


In [86]:
# %% [markdown]
# ## 3. Initialize Reranker Models

# %%
def get_all_unique_rerankers(configurations: List[Dict]) -> List[str]:
    """
    Collect the distinct reranker model names used across all configurations.

    Args:
        configurations: Configuration dicts, each holding a 'reranker_models'
            iterable of model-name strings

    Returns:
        Sorted list of model names with duplicates removed
    """
    distinct_names = {
        model_name
        for cfg in configurations
        for model_name in cfg['reranker_models']
    }
    return sorted(distinct_names)

# Get all unique rerankers
unique_rerankers = get_all_unique_rerankers(configurations)
print(f"Unique reranker models to initialize: {len(unique_rerankers)}")
for reranker in unique_rerankers:
    print(f"  - {reranker}")

# %%
# Initialize reranker models
print("\n" + "="*80)
print("Initializing reranker models...")
print("="*80)

reranker_instances = {}

for reranker_name in unique_rerankers:
    print(f"\nLoading: {reranker_name}")
    
    if reranker_name == 'voyage-rerank-2.5':
        # Voyage reranker will be initialized per-query (API-based)
        reranker_instances[reranker_name] = 'api'
        print(f"  ✓ Voyage reranker marked as API-based (will initialize per-query)")
        
    elif reranker_name.startswith('cross-encoder/') or reranker_name.startswith('BAAI/'):
        # Load Hugging Face cross-encoder models
        try:
            model = CrossEncoder(reranker_name)
            reranker_instances[reranker_name] = model
            print(f"  ✓ Successfully loaded Hugging Face model")
            print(f"    Max sequence length: {model.max_length}")
        except Exception as e:
            print(f"  ✗ Failed to load {reranker_name}: {e}")
            raise
    else:
        raise ValueError(f"Unknown reranker type: {reranker_name}")

print("\n" + "="*80)
print(f"✓ Initialized {len(reranker_instances)} reranker models")
print("="*80)

# %%
# Display loaded models
print("\nLoaded reranker models:")
for name, instance in reranker_instances.items():
    if instance == 'api':
        print(f"  ✓ {name} (API-based)")
    else:
        print(f"  ✓ {name} (Local model)")

# %%
print("\n" + "="*80)
print("✓ STEP 3 COMPLETE: Reranker Models Initialized")
print("="*80)
print("\nReady rerankers:")
for reranker_name in reranker_instances.keys():
    print(f"  - {reranker_name}")
print(f"\nNext step: Implement helper functions")

Unique reranker models to initialize: 1
  - voyage-rerank-2.5

Initializing reranker models...

Loading: voyage-rerank-2.5
  ✓ Voyage reranker marked as API-based (will initialize per-query)

✓ Initialized 1 reranker models

Loaded reranker models:
  ✓ voyage-rerank-2.5 (API-based)

✓ STEP 3 COMPLETE: Reranker Models Initialized

Ready rerankers:
  - voyage-rerank-2.5

Next step: Implement helper functions


In [87]:
# %% [markdown]
# ## 4. Helper Functions

# %%
def truncate_chunk_text(text: str, prefix_chars: int, suffix_chars: int) -> str:
    """
    Truncate chunk text to keep only prefix and suffix characters.

    Text that already fits within prefix_chars + suffix_chars is returned
    unchanged; otherwise the middle is replaced by "...".

    Args:
        text: Full chunk text
        prefix_chars: Number of characters to keep from start
        suffix_chars: Number of characters to keep from end (0 keeps none)

    Returns:
        Truncated text in format: "prefix...suffix"
    """
    if len(text) <= (prefix_chars + suffix_chars):
        return text

    prefix = text[:prefix_chars]
    # Slice from an explicit start index: text[-0:] would be the WHOLE string,
    # so a zero-length suffix must not be expressed as a negative slice.
    suffix = text[len(text) - suffix_chars:]
    return f"{prefix}...{suffix}"

# Test the function
test_text = "This is a very long text that needs to be truncated for storage efficiency."
truncated = truncate_chunk_text(test_text, 20, 20)
print("Truncation test:")
print(f"  Original length: {len(test_text)}")
print(f"  Truncated: {truncated}")
print(f"  Truncated length: {len(truncated)}")

# %%
def simplify_reranker_name(reranker_model: str) -> str:
    """
    Produce a filename-friendly version of a reranker model identifier.

    Slashes become hyphens. Any name containing 'ms-marco-MiniLM' collapses to
    the fixed label 'cross-encoder-ms-marco-miniLM' (so all MiniLM variants
    share one label), and 'BAAI-' is stripped from BAAI bge-reranker names.

    Examples:
        'cross-encoder/ms-marco-MiniLM-L-12-v2' -> 'cross-encoder-ms-marco-miniLM'
        'BAAI/bge-reranker-large' -> 'bge-reranker-large'
        'voyage-rerank-2.5' -> 'voyage-rerank-2.5'
    """
    flattened = reranker_model.replace('/', '-')

    # Long cross-encoder names are shortened to a single fixed label
    if 'ms-marco-MiniLM' in flattened:
        return 'cross-encoder-ms-marco-miniLM'

    # BAAI rerankers keep their model name but lose the organisation prefix
    if 'BAAI-bge-reranker' in flattened:
        return flattened.replace('BAAI-', '')

    return flattened

# Test the function
test_names = [
    'cross-encoder/ms-marco-MiniLM-L-12-v2',
    'BAAI/bge-reranker-large',
    'voyage-rerank-2.5'
]
print("\nReranker name simplification:")
for name in test_names:
    print(f"  {name}")
    print(f"    -> {simplify_reranker_name(name)}")

# %%
def get_output_filename(
    provider: str,
    model: str,
    chunk_size: int,
    k_retrieve: int,
    k_rerank: int,
    mode: str,
    reranker_model: str
) -> str:
    """
    Build the JSON filename under which one reranking run is stored.

    Format: {provider}_{model}_chunk{size}_k{k_retrieve}_{mode}_rerank_k{k_rerank}-{reranker}.json
    where {reranker} is the output of simplify_reranker_name().

    Example: voyage_voyage-3-large_chunk512_k100_global_rerank_k20-voyage-rerank-2.5.json
    """
    return "{}_{}_chunk{}_k{}_{}_rerank_k{}-{}.json".format(
        provider,
        model,
        chunk_size,
        k_retrieve,
        mode,
        k_rerank,
        simplify_reranker_name(reranker_model),
    )

# Test the function
test_filename = get_output_filename(
    provider='voyage',
    model='voyage-3-large',
    chunk_size=512,
    k_retrieve=100,
    k_rerank=20,
    mode='global',
    reranker_model='voyage-rerank-2.5'
)
print(f"\nFilename generation test:")
print(f"  {test_filename}")

# %%
def check_if_results_exist(
    provider: str,
    model: str,
    chunk_size: int,
    k_retrieve: int,
    k_rerank: int,
    mode: str,
    reranker_model: str,
    output_dir: str
) -> bool:
    """
    Report whether the results file for this run is already on disk.

    The filename is derived with get_output_filename() and looked up
    inside output_dir.
    """
    expected_name = get_output_filename(
        provider=provider,
        model=model,
        chunk_size=chunk_size,
        k_retrieve=k_retrieve,
        k_rerank=k_rerank,
        mode=mode,
        reranker_model=reranker_model,
    )
    return Path(output_dir).joinpath(expected_name).exists()

# %%
def save_results(
    results: Dict,
    provider: str,
    model: str,
    chunk_size: int,
    k_retrieve: int,
    k_rerank: int,
    mode: str,
    reranker_model: str,
    output_dir: str
) -> str:
    """
    Write evaluation results as indented JSON into output_dir.

    The filename comes from get_output_filename(); an existing file with the
    same name is overwritten.

    Returns:
        Path to saved file
    """
    target = Path(output_dir) / get_output_filename(
        provider, model, chunk_size, k_retrieve, k_rerank, mode, reranker_model
    )

    with target.open('w') as handle:
        json.dump(results, handle, indent=2)

    return str(target)

print("\n✓ File management functions defined")

# %%
def get_collection_name(provider: str, model: str, chunk_size: int) -> str:
    """
    Build a collection name of the form {provider}_{model}_chunk{size}.

    NOTE(review): load_vector_store() in this notebook names collections as
    {collection_prefix}{chunk_size} rather than calling this helper - confirm
    which scheme the stored databases actually use.
    """
    return "{}_{}_chunk{}".format(provider, model, chunk_size)

# Test the function
test_collection = get_collection_name('voyage', 'voyage-3-large', 512)
print(f"\nCollection name test:")
print(f"  {test_collection}")

# %%
print("\n" + "="*80)
print("✓ STEP 4 COMPLETE: Helper Functions Defined")
print("="*80)
print("\nImplemented functions:")
print("  ✓ truncate_chunk_text() - Truncate text for storage")
print("  ✓ simplify_reranker_name() - Simplify model names for filenames")
print("  ✓ get_output_filename() - Generate result filenames")
print("  ✓ check_if_results_exist() - Check for existing results")
print("  ✓ save_results() - Save evaluation results")
print("  ✓ get_collection_name() - Generate vector store collection names")
print("\nNext step: Implement retrieval functions")

Truncation test:
  Original length: 75
  Truncated: This is a very long ... storage efficiency.
  Truncated length: 43

Reranker name simplification:
  cross-encoder/ms-marco-MiniLM-L-12-v2
    -> cross-encoder-ms-marco-miniLM
  BAAI/bge-reranker-large
    -> bge-reranker-large
  voyage-rerank-2.5
    -> voyage-rerank-2.5

Filename generation test:
  voyage_voyage-3-large_chunk512_k100_global_rerank_k20-voyage-rerank-2.5.json

✓ File management functions defined

Collection name test:
  voyage_voyage-3-large_chunk512

✓ STEP 4 COMPLETE: Helper Functions Defined

Implemented functions:
  ✓ truncate_chunk_text() - Truncate text for storage
  ✓ simplify_reranker_name() - Simplify model names for filenames
  ✓ get_output_filename() - Generate result filenames
  ✓ check_if_results_exist() - Check for existing results
  ✓ save_results() - Save evaluation results
  ✓ get_collection_name() - Generate vector store collection names

Next step: Implement retrieval functions


In [88]:
# %% [markdown]
# ## 5. Retrieval Functions

# %%
def get_embedding_model(provider: str, model: str):
    """
    Build the embedding client that corresponds to a provider name.

    Args:
        provider: 'voyage', 'openai', or 'cohere'
        model: Model name passed through to the client

    Returns:
        Embedding model instance for the requested provider

    Raises:
        ValueError: If provider is not one of the supported names
    """
    # Each branch returns immediately; API keys come from the module-level
    # constants loaded from the environment.
    if provider == 'voyage':
        return VoyageAIEmbeddings(model=model, voyage_api_key=VOYAGE_API_KEY)

    if provider == 'openai':
        return OpenAIEmbeddings(model=model, openai_api_key=OPENAI_API_KEY)

    if provider == 'cohere':
        return CohereEmbeddings(model=model, cohere_api_key=COHERE_API_KEY)

    raise ValueError(f"Unknown provider: {provider}")

# %%
def extract_doc_name_from_path(file_path: str) -> str:
    """
    Return the final path component of file_path with its last suffix removed.

    Example:
        "../../documents/3M_2018_10K.pdf" → "3M_2018_10K"

    Args:
        file_path: Path to a document file

    Returns:
        File name without directory and without the trailing extension
    """
    document_path = Path(file_path)
    return document_path.stem


def extract_metadata_from_retrieved_doc(doc) -> Dict:
    """
    Extract document name, page number, and text from a retrieved document.

    Metadata keys read:
        - file_path: Path to the source file; its stem becomes doc_name
        - source: Page value, stored as an integer or a numeric string

    The stored page value is incremented by one before it is returned (the
    notebook treats stored pages as 0-indexed and reports 1-indexed pages).
    A missing, non-numeric, or otherwise unusable 'source' yields -1.

    Args:
        doc: Object exposing `.metadata` (dict) and `.page_content`, e.g. a
            LangChain Document returned by the vector store

    Returns:
        Dict with:
            - doc_name: Document name (e.g., "3M_2018_10K"), '' if no file_path
            - page_number: Stored page + 1, or -1 when unavailable
            - chunk_text: Full chunk text
    """
    metadata = doc.metadata

    # Extract document name from file_path
    file_path = metadata.get('file_path', '')
    doc_name = extract_doc_name_from_path(file_path) if file_path else ''

    # Default to None (not -1): an integer default would fall into the int
    # branch below and be shifted to page 0 instead of the -1 sentinel.
    page_source = metadata.get('source')

    page_number = -1
    if isinstance(page_source, str):
        try:
            page_number = int(page_source) + 1  # Convert to 1-indexed
        except ValueError:
            page_number = -1
    elif isinstance(page_source, int):
        page_number = page_source + 1  # Convert to 1-indexed

    return {
        'doc_name': doc_name,
        'page_number': page_number,
        'chunk_text': doc.page_content
    }

print("✓ extract_doc_name_from_path() and extract_metadata_from_retrieved_doc() defined")

# %%
def load_vector_store(provider: str, model: str, chunk_size: int, base_dir: str, collection_prefix: str = "financebench_docs_chunk_"):
    """
    Load existing ChromaDB vector store.

    Directory and collection layout used here:
    {base_dir}/{provider}_{model}/  with collection  {collection_prefix}{chunk_size}
    (any '/' in the model name is replaced by '_' for the directory name).

    Args:
        provider: Embedding provider
        model: Embedding model name
        chunk_size: Chunk size used
        base_dir: Base directory for vector databases
        collection_prefix: Prefix for collection names

    Returns:
        Chroma vector store instance

    Raises:
        ValueError: If the provider is unknown, the store cannot be opened or
            counted, or the collection contains no documents
    """
    # Construct paths matching the on-disk structure
    model_id = f"{provider}_{model.replace('/', '_')}"
    db_path = os.path.join(base_dir, model_id)
    collection_name = f"{collection_prefix}{chunk_size}"

    embedding_model = get_embedding_model(provider, model)

    try:
        vectorstore = Chroma(
            collection_name=collection_name,
            embedding_function=embedding_model,
            persist_directory=db_path
        )

        # `_collection` is a private attribute of the wrapper; it is used only
        # to read the number of stored documents.
        count = vectorstore._collection.count()
    except Exception as e:
        # Keep the original exception as the explicit cause for debugging
        raise ValueError(f"Failed to load collection '{collection_name}' from {db_path}: {e}") from e

    # Checked outside the try so this error is not re-wrapped by the handler
    # above into a doubled "Failed to load ..." message.
    # NOTE(review): presumably a missing collection is created on open and
    # therefore shows up here as count == 0 - confirm against Chroma behavior.
    if count == 0:
        raise ValueError(f"Collection '{collection_name}' in {db_path} is empty")

    print(f"  ✓ Loaded collection '{collection_name}' from {db_path}")
    print(f"    Documents: {count}")
    return vectorstore

print("✓ load_vector_store() defined")

# %%
def retrieve_documents(
    vectorstore,
    query: str,
    k: int,
    mode: str,
    doc_name: str = None
) -> List[Dict]:
    """
    Fetch up to k ranked chunks for a query from the vector store.

    In 'single' mode the store is queried for 3 * k candidates, which are
    then filtered in Python to those whose doc_name matches; fewer than k
    results are returned when not enough candidates match. Any other mode
    value is treated as global retrieval over all documents.

    Args:
        vectorstore: Store exposing similarity_search_with_score(query, k=...)
        query: Search query
        k: Number of documents to retrieve
        mode: 'global' or 'single'
        doc_name: Document name (required for 'single' mode)

    Returns:
        List of retrieved documents with metadata
        Format: [{'doc_name': str, 'page_number': int, 'content': str, 'rank': int, 'score': float}, ...]

    Raises:
        ValueError: If mode is 'single' and doc_name is None
    """
    if mode != 'single':
        # Global mode: take the store's top-k as-is
        hits = vectorstore.similarity_search_with_score(query, k=k)
    else:
        if doc_name is None:
            raise ValueError("doc_name required for single-document mode")

        # Over-fetch, then keep only chunks from the requested document
        candidates = vectorstore.similarity_search_with_score(query, k=k * 3)

        hits = []
        for candidate, raw_score in candidates:
            if extract_metadata_from_retrieved_doc(candidate)['doc_name'] != doc_name:
                continue
            hits.append((candidate, raw_score))
            if len(hits) >= k:
                break

        hits = hits[:k]

    # Convert (document, score) pairs into plain dicts with 1-based ranks
    formatted = []
    for position, (hit, raw_score) in enumerate(hits, start=1):
        info = extract_metadata_from_retrieved_doc(hit)
        formatted.append({
            'doc_name': info['doc_name'],
            'page_number': info['page_number'],
            'content': info['chunk_text'],
            'rank': position,
            'score': float(raw_score)
        })

    return formatted

print("✓ retrieve_documents() defined")

# %%
# Test retrieval with a sample configuration
print("\n" + "="*80)
print("Testing retrieval functions...")
print("="*80)

# Use first configuration for testing
test_config = configurations[0]
test_provider = test_config['provider']
test_model = test_config['model']
test_chunk_size = test_config['chunk_sizes'][0]
test_k_retrieve = test_config['k_retrieve']

print(f"\nTest parameters:")
print(f"  Provider: {test_provider}")
print(f"  Model: {test_model}")
print(f"  Chunk size: {test_chunk_size}")
print(f"  k_retrieve: {test_k_retrieve}")

try:
    # Load vector store
    print(f"\nLoading vector store...")
    test_vectorstore = load_vector_store(
        test_provider,
        test_model,
        test_chunk_size,
        VECTOR_DB_BASE_DIR,
        COLLECTION_PREFIX
    )
    
    # Test retrieval with first query
    test_query = dataset[0]
    print(f"\nTest query: {test_query['question'][:100]}...")
    print(f"Target doc: {test_query['doc_name']}")
    
    # Test global mode
    print(f"\nRetrieving top {test_k_retrieve} documents (global mode)...")
    retrieved_global = retrieve_documents(
        test_vectorstore,
        test_query['question'],
        k=test_k_retrieve,
        mode='global'
    )
    print(f"  ✓ Retrieved {len(retrieved_global)} documents")
    print(f"  Top result: {retrieved_global[0]['doc_name']} (page {retrieved_global[0]['page_number']}, score: {retrieved_global[0]['score']:.4f})")
    
    # Test single mode
    print(f"\nRetrieving top {test_k_retrieve} documents (single mode)...")
    retrieved_single = retrieve_documents(
        test_vectorstore,
        test_query['question'],
        k=test_k_retrieve,
        mode='single',
        doc_name=test_query['doc_name']
    )
    print(f"  ✓ Retrieved {len(retrieved_single)} documents")
    print(f"  Top result: {retrieved_single[0]['doc_name']} (page {retrieved_single[0]['page_number']}, score: {retrieved_single[0]['score']:.4f})")
    
    print("\n✓ Retrieval test successful!")
    
except Exception as e:
    print(f"\n✗ Retrieval test failed: {e}")
    print("\nThis is expected if vector stores haven't been created yet.")
    print("Make sure you have run the baseline evaluation first to create vector stores.")

# %%
print("\n" + "="*80)
print("✓ STEP 5 COMPLETE: Retrieval Functions Implemented")
print("="*80)
print("\nImplemented functions:")
print("  ✓ get_embedding_model() - Initialize embedding models")
print("  ✓ load_vector_store() - Load ChromaDB collections")
print("  ✓ retrieve_documents() - Retrieve top-k documents with filtering")
print("\nNext step: Implement re-ranking functions")

✓ extract_doc_name_from_path() and extract_metadata_from_retrieved_doc() defined
✓ load_vector_store() defined
✓ retrieve_documents() defined

Testing retrieval functions...

Test parameters:
  Provider: voyage
  Model: voyage-3-large
  Chunk size: 1024
  k_retrieve: 80

Loading vector store...
  ✓ Loaded collection 'financebench_docs_chunk_1024' from ../../vector_databases/voyage_voyage-3-large
    Documents: 15765

Test query: What is the FY2018 capital expenditure amount (in USD millions) for 3M? Give a response to the quest...
Target doc: 3M_2018_10K

Retrieving top 80 documents (global mode)...
  ✓ Retrieved 80 documents
  Top result: 3M_2018_10K (page 48, score: 0.5075)

Retrieving top 80 documents (single mode)...
  ✓ Retrieved 80 documents
  Top result: 3M_2018_10K (page 48, score: 0.5075)

✓ Retrieval test successful!

✓ STEP 5 COMPLETE: Retrieval Functions Implemented

Implemented functions:
  ✓ get_embedding_model() - Initialize embedding models
  ✓ load_vector_store() - Loa

In [89]:
# %% [markdown]
# ## 6. Re-ranking Functions

# %%
import time

def rerank_with_voyage(
    query: str,
    retrieved_docs: List[Dict],
    reranker_model: str,
    top_k: int,
    max_retries: int = 3,
    retry_delay: int = 60
) -> List[Dict]:
    """
    Re-rank documents using Voyage AI reranker API with rate limit handling.

    Errors whose message contains "rate limit" or "tpm" (case-insensitive)
    are retried after sleeping retry_delay seconds; any other error is raised
    immediately. An empty retrieved_docs list returns [] without calling the
    API.

    Args:
        query: Search query
        retrieved_docs: List from retrieval (each with 'doc_name',
            'page_number', 'content', 'rank', 'score')
        reranker_model: Voyage model name (e.g., 'voyage-rerank-2.5')
        top_k: Number of documents to return after re-ranking
        max_retries: Maximum number of attempts on rate limit (values below 1
            are treated as a single attempt)
        retry_delay: Seconds to wait between retries

    Returns:
        Re-ranked list with {'doc_name', 'page_number', 'rank', 'initial_rank', 'initial_score', 'rerank_score', 'content'}
    """
    # Nothing to rank: skip client construction and the API round-trip
    if not retrieved_docs:
        return []

    # Convert to LangChain documents, carrying the retrieval-stage rank and
    # score in metadata so they can be restored after re-ranking
    lc_docs = [
        Document(
            page_content=doc['content'],
            metadata={
                'doc_name': doc['doc_name'],
                'page_number': doc['page_number'],
                'initial_rank': doc['rank'],
                'initial_score': doc['score']
            }
        )
        for doc in retrieved_docs
    ]

    # Extract model name (e.g., "rerank-2.5" from "voyage-rerank-2.5")
    model_name = reranker_model.replace('voyage-', '')

    reranker = VoyageAIRerank(
        model=model_name,
        voyage_api_key=VOYAGE_API_KEY,
        top_k=top_k
    )

    # Always make at least one attempt so the result is never left unbound
    total_attempts = max(1, max_retries)
    reranked_docs = []

    for attempt in range(total_attempts):
        try:
            reranked_docs = reranker.compress_documents(lc_docs, query)
            break  # Success, exit retry loop

        except Exception as e:
            error_msg = str(e).lower()

            # Only rate-limit style failures are worth waiting for
            if "rate limit" not in error_msg and "tpm" not in error_msg:
                raise

            if attempt >= total_attempts - 1:
                print(f"\n❌ Rate limit exceeded after {total_attempts} attempts")
                raise

            print(f"\n⚠️  Rate limit hit. Waiting {retry_delay} seconds before retry {attempt + 1}/{max_retries}...")
            time.sleep(retry_delay)

    # Convert back to our format; rank is the 1-based position in the
    # reranker's output order
    results = []
    for rank, doc in enumerate(reranked_docs, start=1):
        results.append({
            'doc_name': doc.metadata['doc_name'],
            'page_number': doc.metadata['page_number'],
            'content': doc.page_content,
            'rank': rank,
            'initial_rank': doc.metadata['initial_rank'],
            'initial_score': doc.metadata['initial_score'],
            # Falls back to 0.0 when the reranker attached no score
            'rerank_score': doc.metadata.get('relevance_score', 0.0)
        })

    return results

print("✓ rerank_with_voyage() defined with rate limit handling")

# %%
def rerank_with_cross_encoder(
    query: str,
    retrieved_docs: List[Dict],
    cross_encoder_model: CrossEncoder,
    top_k: int
) -> List[Dict]:
    """
    Score every retrieved chunk against the query with a cross-encoder and
    return the top_k highest-scoring chunks.

    Args:
        query: Search query
        retrieved_docs: List from retrieval (each with 'doc_name',
            'page_number', 'content', 'rank', 'score')
        cross_encoder_model: Loaded CrossEncoder model instance
        top_k: Number of documents to return after re-ranking

    Returns:
        Re-ranked list with {'doc_name', 'page_number', 'rank', 'initial_rank', 'initial_score', 'rerank_score', 'content'}
    """
    # One [query, chunk] pair per candidate, scored in a single predict() call
    relevance = cross_encoder_model.predict(
        [[query, candidate['content']] for candidate in retrieved_docs]
    )

    scored = [
        {
            'doc_name': candidate['doc_name'],
            'page_number': candidate['page_number'],
            'content': candidate['content'],
            'initial_rank': candidate['rank'],
            'initial_score': candidate['score'],
            'rerank_score': float(raw_score)
        }
        for candidate, raw_score in zip(retrieved_docs, relevance)
    ]

    # Highest rerank score first; ties keep their retrieval order
    best_first = sorted(scored, key=lambda entry: entry['rerank_score'], reverse=True)
    reranked = best_first[:top_k]

    # Ranks are 1-based positions in the re-ranked order
    for position, entry in enumerate(reranked, start=1):
        entry['rank'] = position

    return reranked

print("✓ rerank_with_cross_encoder() defined")

# %%
def rerank_documents(
    query: str,
    retrieved_docs: List[Dict],
    reranker_model: str,
    reranker_instance: Any,
    top_k: int
) -> List[Dict]:
    """
    Dispatch a re-ranking request to the backend that matches the reranker.

    Args:
        query: Search query
        retrieved_docs: List from retrieval
        reranker_model: Model name/identifier; 'voyage-rerank-2.5' selects the Voyage API path
        reranker_instance: Loaded CrossEncoder instance for local models
            (not inspected on the Voyage path)
        top_k: Number of documents to return after re-ranking

    Returns:
        Re-ranked document list

    Raises:
        ValueError: If the model is not the Voyage reranker and the instance
            is not a CrossEncoder
    """
    # Voyage is selected purely by model name and goes through the API helper
    if reranker_model == 'voyage-rerank-2.5':
        return rerank_with_voyage(query, retrieved_docs, reranker_model, top_k)

    # Everything else has to be a locally loaded Hugging Face cross-encoder
    if not isinstance(reranker_instance, CrossEncoder):
        raise ValueError(f"Unknown reranker type for model: {reranker_model}")

    return rerank_with_cross_encoder(query, retrieved_docs, reranker_instance, top_k)

# Cell-output confirmation that the definition above has been executed
print("✓ rerank_documents() defined")

# %%
# Smoke-test the re-ranking functions on a tiny hand-made example
print("\n" + "=" * 80)
print("Testing re-ranking functions...")
print("=" * 80)

# Three fake retrieval hits from the same document, already in retrieval order
sample_retrieved_docs = [
    {
        'doc_name': 'test_doc.pdf',
        'page_number': page,
        'content': text,
        'rank': rank,
        'score': score
    }
    for rank, (page, text, score) in enumerate(
        [
            (1, 'This is a highly relevant document about financial reporting.', 0.95),
            (2, 'This document discusses unrelated topics.', 0.85),
            (3, 'Annual financial statements and revenue details.', 0.80),
        ],
        start=1
    )
]

sample_query = "What was the company's revenue in 2023?"

print(f"\nSample query: {sample_query}")
print(f"Sample documents: {len(sample_retrieved_docs)}")

# The cross-encoder check only runs when that model was loaded into
# reranker_instances; any failure is reported instead of stopping the notebook.
try:
    test_reranker_name = 'cross-encoder/ms-marco-MiniLM-L-12-v2'
    if test_reranker_name not in reranker_instances:
        print(f"\n  Skipping test - {test_reranker_name} not loaded")
    else:
        print(f"\nTesting with {test_reranker_name}...")
        test_reranker = reranker_instances[test_reranker_name]

        reranked = rerank_with_cross_encoder(
            sample_query,
            sample_retrieved_docs,
            test_reranker,
            top_k=3
        )

        print("  ✓ Re-ranking successful!")

        print("\n  Initial order:")
        for doc in sample_retrieved_docs:
            print(f"    Rank {doc['rank']}: Page {doc['page_number']} (score: {doc['score']:.3f})")

        print("\n  Re-ranked order:")
        for doc in reranked:
            print(f"    Rank {doc['rank']}: Page {doc['page_number']} (initial rank: {doc['initial_rank']}, rerank score: {doc['rerank_score']:.3f})")

except Exception as e:
    print(f"\n  ✗ Re-ranking test failed: {e}")

# %%
# Step 6 wrap-up banner, written as a single multi-line message
print("\n".join([
    "\n" + "=" * 80,
    "✓ STEP 6 COMPLETE: Re-ranking Functions Implemented",
    "=" * 80,
    "\nImplemented functions:",
    "  ✓ rerank_with_voyage() - Voyage AI API re-ranking",
    "  ✓ rerank_with_cross_encoder() - Hugging Face cross-encoder re-ranking",
    "  ✓ rerank_documents() - Universal re-ranking router",
    "\nNext step: Implement evaluation metrics",
]))

✓ rerank_with_voyage() defined with rate limit handling
✓ rerank_with_cross_encoder() defined
✓ rerank_documents() defined

Testing re-ranking functions...

Sample query: What was the company's revenue in 2023?
Sample documents: 3

  Skipping test - cross-encoder/ms-marco-MiniLM-L-12-v2 not loaded

✓ STEP 6 COMPLETE: Re-ranking Functions Implemented

Implemented functions:
  ✓ rerank_with_voyage() - Voyage AI API re-ranking
  ✓ rerank_with_cross_encoder() - Hugging Face cross-encoder re-ranking
  ✓ rerank_documents() - Universal re-ranking router

Next step: Implement evaluation metrics


In [90]:
# %% [markdown]
# ## 7. Evaluation Metrics (Matching Baseline)

# %%
def compute_cosine_similarity(embedding1: np.ndarray, embedding2: np.ndarray) -> float:
    """
    Cosine of the angle between two embedding vectors.

    Args:
        embedding1: First embedding vector
        embedding2: Second embedding vector

    Returns:
        Cosine similarity as a Python float (between -1 and 1);
        0.0 when either vector has zero length
    """
    first_norm, second_norm = (
        np.linalg.norm(vector) for vector in (embedding1, embedding2)
    )

    # A zero-length vector has no direction, so report "no similarity"
    # instead of dividing by zero.
    if 0 in (first_norm, second_norm):
        return 0.0

    return float(np.dot(embedding1, embedding2) / (first_norm * second_norm))

# Cell-output confirmation that the definition above has been executed
print("✓ compute_cosine_similarity() defined")

# %%
def calculate_text_similarities_for_chunk(
    chunk_text: str,
    evidence_items: List[Dict],
    sbert_model: SentenceTransformer
) -> List[Dict]:
    """
    Score one retrieved chunk against every evidence item.

    The chunk is embedded once with `sbert_model`; each evidence item
    contributes its pre-computed 'embedding'.

    Args:
        chunk_text: Text content of retrieved chunk
        evidence_items: List of evidence items (each has 'embedding', 'doc_name', 'page_number')
        sbert_model: Sentence-BERT model for encoding chunk

    Returns:
        One dict per evidence item, in input order, with 'evidence_index',
        'evidence_doc', 'evidence_page' and 'cosine_similarity'
    """
    chunk_vector = sbert_model.encode(chunk_text, convert_to_numpy=True)

    return [
        {
            'evidence_index': position,
            'evidence_doc': item['doc_name'],
            'evidence_page': item['page_number'],
            'cosine_similarity': compute_cosine_similarity(chunk_vector, item['embedding'])
        }
        for position, item in enumerate(evidence_items)
    ]

# Cell-output confirmation that the definition above has been executed
print("✓ calculate_text_similarities_for_chunk() defined")

# %%
def calculate_text_metrics_for_query(
    retrieved_docs: List[Dict],
    evidence_items: List[Dict],
    sbert_model: SentenceTransformer,
    threshold: float = 0.7
) -> Dict[str, float]:
    """
    Text-similarity retrieval metrics for one query (matching baseline approach).

    A retrieved chunk "matches" when its highest cosine similarity against the
    evidence items reaches `threshold`; an evidence item is "found" when at
    least one matching chunk reaches `threshold` against it. Chunks with
    missing or empty 'content' never match but still count in the precision
    denominator.

    Metrics:
    - Text MRR: 1 / rank of the first matching chunk (0.0 if none)
    - Text Recall: # evidence found / # total evidence
    - Text Precision: # matching chunks / # total chunks
    - Text F1: Harmonic mean of precision and recall

    Args:
        retrieved_docs: List of retrieved docs with 'content'
        evidence_items: List of evidence items with 'embedding'
        sbert_model: Sentence-BERT model for encoding chunks
        threshold: Similarity threshold for matching (default: 0.7)

    Returns:
        Dict with 'text_mrr', 'text_recall', 'text_precision', 'text_f1'
    """
    # Without evidence or without retrieved chunks every metric is zero
    if not (len(evidence_items) and len(retrieved_docs)):
        return {
            'text_mrr': 0.0,
            'text_recall': 0.0,
            'text_precision': 0.0,
            'text_f1': 0.0
        }

    first_match_rank = None      # 1-based rank of the first matching chunk
    matched_evidence = set()     # indices of evidence items found so far
    matching_chunk_count = 0

    for position, doc in enumerate(retrieved_docs, start=1):
        text = doc.get('content', '')
        if not text:
            continue

        scores = [
            entry['cosine_similarity']
            for entry in calculate_text_similarities_for_chunk(text, evidence_items, sbert_model)
        ]

        # The chunk counts only if its best score reaches the threshold
        if not (max(scores) >= threshold):
            continue

        matching_chunk_count += 1
        matched_evidence.update(
            index for index, score in enumerate(scores) if score >= threshold
        )
        if first_match_rank is None:
            first_match_rank = position

    text_mrr = 0.0 if first_match_rank is None else 1.0 / first_match_rank
    text_recall = len(matched_evidence) / len(evidence_items)
    text_precision = matching_chunk_count / len(retrieved_docs)

    precision_plus_recall = text_precision + text_recall
    text_f1 = (
        2 * (text_precision * text_recall) / precision_plus_recall
        if precision_plus_recall > 0
        else 0.0
    )

    return {
        'text_mrr': text_mrr,
        'text_recall': text_recall,
        'text_precision': text_precision,
        'text_f1': text_f1
    }

# Cell-output confirmation that the definition above has been executed
print("✓ calculate_text_metrics_for_query() defined")

# %%
def evaluate_single_query(
    query_item: Dict,
    retrieved_docs_k_retrieve: List[Dict],
    retrieved_docs_k_rerank: List[Dict],
    reranked_docs: List[Dict],
    all_evidence: List[Dict],
    sbert_model: SentenceTransformer,
    threshold: float
) -> Dict:
    """
    Evaluate a single query with three sets of metrics.

    The evidence used for scoring is every item of `all_evidence` that belongs
    to the query's document and sits on one of the query's evidence pages.
    The same evidence is then used to score all three document lists.

    Args:
        query_item: Query from dataset; must provide 'doc_name' and 'evidence'
            (a list of dicts with 'evidence_page_num')
        retrieved_docs_k_retrieve: All k_retrieve documents
        retrieved_docs_k_rerank: Top k_rerank documents (before reranking)
        reranked_docs: Top k_rerank documents (after reranking)
        all_evidence: All evidence items with embeddings (each has
            'doc_name', 'page_number', 'embedding')
        sbert_model: Sentence-BERT model
        threshold: Text similarity threshold

    Returns:
        Dict with 'initial_metrics_k_retrieve', 'initial_metrics_k_rerank' and
        'reranked_metrics', each a metrics dict from
        calculate_text_metrics_for_query()
    """
    doc_name = query_item['doc_name']

    # Page numbers are collected once up front so the filter below is a cheap
    # membership test per evidence item.
    evidence_pages = {ev['evidence_page_num'] for ev in query_item['evidence']}

    query_evidence = [
        ev for ev in all_evidence
        if ev['doc_name'] == doc_name and ev['page_number'] in evidence_pages
    ]

    def score(docs: List[Dict]) -> Dict[str, float]:
        # All three document lists are judged against the same evidence
        return calculate_text_metrics_for_query(
            docs,
            query_evidence,
            sbert_model,
            threshold
        )

    return {
        'initial_metrics_k_retrieve': score(retrieved_docs_k_retrieve),
        'initial_metrics_k_rerank': score(retrieved_docs_k_rerank),
        'reranked_metrics': score(reranked_docs)
    }

# Cell-output confirmation that the definition above has been executed
print("✓ evaluate_single_query() defined")

# %%
# Smoke-test the metric computation on a two-chunk, one-evidence example
print("\n" + "=" * 80)
print("Testing metrics calculation...")
print("=" * 80)

# Two retrieved chunks: the first repeats the evidence text verbatim,
# the second is unrelated.
test_retrieved = [
    dict(
        doc_name='test_doc.pdf',
        page_number=1,
        content='The company reported revenue of $50 million in 2023',
        rank=1,
        score=0.95
    ),
    dict(
        doc_name='test_doc.pdf',
        page_number=2,
        content='Unrelated information about products',
        rank=2,
        score=0.85
    )
]

# Single evidence item, embedded with the same Sentence-BERT model
test_evidence = [
    dict(
        doc_name='test_doc.pdf',
        page_number=1,
        text='The company reported revenue of $50 million in 2023',
        embedding=sbert_model.encode(
            'The company reported revenue of $50 million in 2023',
            convert_to_numpy=True
        )
    )
]

metrics = calculate_text_metrics_for_query(
    test_retrieved, test_evidence, sbert_model, threshold=0.7
)

print("\n".join([
    "\nTest metrics:",
    f"  MRR: {metrics['text_mrr']:.4f}",
    f"  Recall: {metrics['text_recall']:.4f}",
    f"  Precision: {metrics['text_precision']:.4f}",
    f"  F1: {metrics['text_f1']:.4f}",
]))

# %%
# Step 7 wrap-up banner, written as a single multi-line message
print("\n".join([
    "\n" + "=" * 80,
    "✓ STEP 7 COMPLETE: Evaluation Metrics Implemented (Matching Baseline)",
    "=" * 80,
    "\nImplemented functions:",
    "  ✓ compute_cosine_similarity() - Cosine similarity computation",
    "  ✓ calculate_text_similarities_for_chunk() - Calculate similarities with all evidence",
    "  ✓ calculate_text_metrics_for_query() - Calculate MRR, Recall, Precision, F1",
    "  ✓ evaluate_single_query() - Evaluate with three metric sets",
    "\nUsing Sentence-BERT model: all-MiniLM-L6-v2",
    "Similarity method: Cosine similarity (same as baseline)",
    "\nNext step: Implement main evaluation pipeline",
]))

✓ compute_cosine_similarity() defined
✓ calculate_text_similarities_for_chunk() defined
✓ calculate_text_metrics_for_query() defined
✓ evaluate_single_query() defined

Testing metrics calculation...

Test metrics:
  MRR: 1.0000
  Recall: 1.0000
  Precision: 0.5000
  F1: 0.6667

✓ STEP 7 COMPLETE: Evaluation Metrics Implemented (Matching Baseline)

Implemented functions:
  ✓ compute_cosine_similarity() - Cosine similarity computation
  ✓ calculate_text_similarities_for_chunk() - Calculate similarities with all evidence
  ✓ calculate_text_metrics_for_query() - Calculate MRR, Recall, Precision, F1
  ✓ evaluate_single_query() - Evaluate with three metric sets

Using Sentence-BERT model: all-MiniLM-L6-v2
Similarity method: Cosine similarity (same as baseline)

Next step: Implement main evaluation pipeline


In [91]:
# %% [markdown]
# ## 8. Main Evaluation Pipeline

# %%
def evaluate_configuration(
    provider: str,
    model: str,
    chunk_size: int,
    k_retrieve: int,
    k_rerank: int,
    reranker_model: str,
    mode: str,
    dataset,
    all_evidence: List[Dict],
    sbert_model: SentenceTransformer,
    reranker_instance,
    vector_db_base_dir: str,
    collection_prefix: str,
    output_dir: str,
    threshold: float,
    use_page_tolerance: bool
) -> Dict:
    """
    Evaluate a single configuration (provider, model, chunk_size, k_retrieve, k_rerank, reranker, mode).

    For every query the function retrieves `k_retrieve` chunks, re-ranks all of
    them and scores three document lists with the text-based metrics:
    all `k_retrieve` chunks, the first `k_rerank` chunks in retrieval order,
    and the first `k_rerank` chunks after re-ranking. A query whose retrieval
    or re-ranking raises is reported and left out of the results and averages.
    Results are written through save_results() and a summary is printed.

    Args:
        provider: Embedding provider
        model: Embedding model
        chunk_size: Chunk size
        k_retrieve: Number of documents to retrieve
        k_rerank: Number of documents to keep after reranking
        reranker_model: Reranker model name
        mode: 'global' or 'single' ('single' restricts retrieval to the query's document)
        dataset: FinanceBench dataset
        all_evidence: All evidence items with embeddings
        sbert_model: Sentence-BERT model for text similarity
        reranker_instance: Loaded reranker model instance
        vector_db_base_dir: Base directory for vector databases
        collection_prefix: Collection name prefix
        output_dir: Output directory for results
        threshold: Text similarity threshold
        use_page_tolerance: Whether to use page tolerance (not used in text-based
            metrics; only recorded in the saved configuration)

    Returns:
        {'status': 'skipped'} when results for this configuration already exist,
        {'status': 'failed', 'error': ...} when the vector store cannot be loaded
        or no query could be processed, otherwise
        {'status': 'completed', 'summary': ...} with the summary statistics dict
    """
    print(f"\n{'='*80}")
    print(f"Evaluating: {provider}/{model} | chunk={chunk_size} | k_retrieve={k_retrieve} | k_rerank={k_rerank} | {mode} | reranker={simplify_reranker_name(reranker_model)}")
    print(f"{'='*80}")

    # Check if results already exist
    if check_if_results_exist(provider, model, chunk_size, k_retrieve, k_rerank, mode, reranker_model, output_dir):
        print("⚠️  Results already exist. Skipping...")
        return {'status': 'skipped'}

    # Load vector store
    try:
        print("\n1. Loading vector store...")
        vectorstore = load_vector_store(provider, model, chunk_size, vector_db_base_dir, collection_prefix)
    except Exception as e:
        print(f"❌ Failed to load vector store: {e}")
        return {'status': 'failed', 'error': str(e)}

    def prepare_doc_for_storage(doc: Dict) -> Dict:
        """Compact record of one document for the results file (text truncated, scores rounded)."""
        rerank_score = doc.get('rerank_score')
        return {
            'doc_name': doc['doc_name'],
            'page_number': doc['page_number'],
            'chunk_text': truncate_chunk_text(doc['content'], CHUNK_TEXT_PREFIX_CHARS, CHUNK_TEXT_SUFFIX_CHARS),
            'rank': doc['rank'],
            # Documents that were never re-ranked fall back to their retrieval rank/score
            'initial_rank': doc.get('initial_rank', doc['rank']),
            'initial_score': round(doc.get('initial_score', doc.get('score', 0.0)), 4),
            'rerank_score': round(rerank_score, 4) if rerank_score is not None else None
        }

    # Initialize results storage
    query_results = []

    # Accumulators for averaging
    metric_keys = ('text_mrr', 'text_recall', 'text_precision', 'text_f1')
    sum_metrics_k_retrieve = dict.fromkeys(metric_keys, 0)
    sum_metrics_k_rerank = dict.fromkeys(metric_keys, 0)
    sum_metrics_reranked = dict.fromkeys(metric_keys, 0)

    # Process each query
    print(f"\n2. Processing {len(dataset)} queries...")
    for idx, query_item in enumerate(tqdm(dataset, desc="Queries")):
        question = query_item['question']
        doc_name = query_item['doc_name']

        # Add small delay to avoid rate limits (only for API-based rerankers)
        if reranker_model == 'voyage-rerank-2.5':
            time.sleep(0.5)  # 500ms delay between queries

        # Step 1: Retrieve top k_retrieve documents
        try:
            retrieved_k_retrieve = retrieve_documents(
                vectorstore,
                question,
                k=k_retrieve,
                mode=mode,
                doc_name=doc_name if mode == 'single' else None
            )
        except Exception as e:
            print(f"\n⚠️  Query {idx} retrieval failed: {e}")
            continue

        # Step 2: Get initial top k_rerank (before reranking)
        retrieved_k_rerank = retrieved_k_retrieve[:k_rerank]

        # Step 3: Rerank ALL k_retrieve documents (not just top k_rerank)
        try:
            reranked_all = rerank_documents(
                question,
                retrieved_k_retrieve,  # Pass every retrieved document
                reranker_model,
                reranker_instance,
                k_retrieve  # Keep the full re-ranked list; it is cut to k_rerank below
            )
        except Exception as e:
            print(f"\n⚠️  Query {idx} reranking failed: {e}")
            continue

        # Step 4: Get top k_rerank from reranked results
        reranked_top_k = reranked_all[:k_rerank]

        # Step 5: Evaluate with three metric sets
        metrics = evaluate_single_query(
            query_item,
            retrieved_k_retrieve,
            retrieved_k_rerank,
            reranked_top_k,  # Top k_rerank from reranked ALL documents
            all_evidence,
            sbert_model,
            threshold
        )

        # Accumulate metrics
        for key in metric_keys:
            sum_metrics_k_retrieve[key] += metrics['initial_metrics_k_retrieve'][key]
            sum_metrics_k_rerank[key] += metrics['initial_metrics_k_rerank'][key]
            sum_metrics_reranked[key] += metrics['reranked_metrics'][key]

        # Store query result (only initial top k_rerank and reranked top k_rerank)
        query_results.append({
            'question_id': idx,
            'question': question,
            'doc_name': doc_name,
            'evidence_pages': [ev['evidence_page_num'] for ev in query_item['evidence']],
            'retrieved_docs_k_rerank': [prepare_doc_for_storage(doc) for doc in retrieved_k_rerank],
            'reranked_docs': [prepare_doc_for_storage(doc) for doc in reranked_top_k],
            'initial_metrics_k_retrieve': {k: round(v, 4) for k, v in metrics['initial_metrics_k_retrieve'].items()},
            'initial_metrics_k_rerank': {k: round(v, 4) for k, v in metrics['initial_metrics_k_rerank'].items()},
            'reranked_metrics': {k: round(v, 4) for k, v in metrics['reranked_metrics'].items()}
        })

    # Calculate averages over the queries that were processed successfully
    num_queries = len(query_results)
    if num_queries == 0:
        print("\n❌ No queries were successfully processed")
        return {'status': 'failed', 'error': 'No queries processed'}

    avg_metrics_k_retrieve = {k: round(v / num_queries, 4) for k, v in sum_metrics_k_retrieve.items()}
    avg_metrics_k_rerank = {k: round(v / num_queries, 4) for k, v in sum_metrics_k_rerank.items()}
    avg_metrics_reranked = {k: round(v / num_queries, 4) for k, v in sum_metrics_reranked.items()}

    # Calculate improvements (re-ranked top k_rerank vs. initial top k_rerank),
    # taken from the already-rounded averages
    improvement = {
        'mrr_delta': round(avg_metrics_reranked['text_mrr'] - avg_metrics_k_rerank['text_mrr'], 4),
        'recall_delta': round(avg_metrics_reranked['text_recall'] - avg_metrics_k_rerank['text_recall'], 4),
        'precision_delta': round(avg_metrics_reranked['text_precision'] - avg_metrics_k_rerank['text_precision'], 4),
        'f1_delta': round(avg_metrics_reranked['text_f1'] - avg_metrics_k_rerank['text_f1'], 4)
    }

    # Prepare final results
    results = {
        'query_results': query_results,
        'summary': {
            'configuration': {
                'provider': provider,
                'model': model,
                'chunk_size': chunk_size,
                'k_retrieve': k_retrieve,
                'k_rerank': k_rerank,
                'reranker_model': reranker_model,
                'mode': mode,
                'text_similarity_threshold': threshold,
                'use_page_tolerance': use_page_tolerance
            },
            'total_queries': num_queries,
            'average_initial_metrics_k_retrieve': avg_metrics_k_retrieve,
            'average_initial_metrics_k_rerank': avg_metrics_k_rerank,
            'average_reranked_metrics': avg_metrics_reranked,
            'improvement_from_k_rerank_to_reranked': improvement
        }
    }

    # Save results
    print(f"\n3. Saving results...")
    save_path = save_results(
        results,
        provider,
        model,
        chunk_size,
        k_retrieve,
        k_rerank,
        mode,
        reranker_model,
        output_dir
    )
    print(f"✓ Results saved to: {save_path}")

    # Print summary
    print(f"\n{'='*80}")
    print("RESULTS SUMMARY")
    print(f"{'='*80}")
    print(f"\nMetrics at k_retrieve={k_retrieve}:")
    print(f"  MRR:       {avg_metrics_k_retrieve['text_mrr']:.4f}")
    print(f"  Recall:    {avg_metrics_k_retrieve['text_recall']:.4f}")
    print(f"  Precision: {avg_metrics_k_retrieve['text_precision']:.4f}")
    print(f"  F1:        {avg_metrics_k_retrieve['text_f1']:.4f}")

    print(f"\nMetrics at k_rerank={k_rerank} (before reranking):")
    print(f"  MRR:       {avg_metrics_k_rerank['text_mrr']:.4f}")
    print(f"  Recall:    {avg_metrics_k_rerank['text_recall']:.4f}")
    print(f"  Precision: {avg_metrics_k_rerank['text_precision']:.4f}")
    print(f"  F1:        {avg_metrics_k_rerank['text_f1']:.4f}")

    print(f"\nMetrics at k_rerank={k_rerank} (after reranking):")
    print(f"  MRR:       {avg_metrics_reranked['text_mrr']:.4f}")
    print(f"  Recall:    {avg_metrics_reranked['text_recall']:.4f}")
    print(f"  Precision: {avg_metrics_reranked['text_precision']:.4f}")
    print(f"  F1:        {avg_metrics_reranked['text_f1']:.4f}")

    print(f"\nImprovement (reranked vs initial k_rerank):")
    print(f"  MRR:       {improvement['mrr_delta']:+.4f}")
    print(f"  Recall:    {improvement['recall_delta']:+.4f}")
    print(f"  Precision: {improvement['precision_delta']:+.4f}")
    print(f"  F1:        {improvement['f1_delta']:+.4f}")
    print(f"{'='*80}")

    return {
        'status': 'completed',
        'summary': results['summary']
    }

# Cell-output confirmation that the definition above has been executed
print("✓ evaluate_configuration() defined")

# %%
# Step 8 wrap-up banner, written as a single multi-line message
print("\n".join([
    "\n" + "=" * 80,
    "✓ STEP 8 COMPLETE: Main Evaluation Pipeline Implemented",
    "=" * 80,
    "\nImplemented functions:",
    "  ✓ evaluate_configuration() - Complete evaluation pipeline",
    "\nNext step: Batch evaluation execution",
]))

✓ evaluate_configuration() defined

✓ STEP 8 COMPLETE: Main Evaluation Pipeline Implemented

Implemented functions:
  ✓ evaluate_configuration() - Complete evaluation pipeline

Next step: Batch evaluation execution


In [92]:
# %%
# Re-ranking Configurations
# Each entry describes one embedding setup to evaluate. The batch runner reads
# the keys 'provider', 'model', 'chunk_sizes', 'k_retrieve', 'k_rerank' and
# 'reranker_models' from every entry and runs one evaluation per
# chunk size x reranker model x mode.
configurations = [
    {
        'provider': 'voyage',
        'model': 'voyage-3-large',
        'chunk_sizes': [1024],
        'k_retrieve': 80,       # Number of documents to retrieve from the vector store
        'k_rerank': 20,          # Keep top 20 after re-ranking
        'reranker_models': [
            # 'cross-encoder/ms-marco-MiniLM-L-12-v2',
            #'BAAI/bge-reranker-large',
            'voyage-rerank-2.5'
        ]
    }
    # {
    #     'provider': 'ollama',
    #     'model': 'ollama-bge-m3',
    #     'chunk_sizes': [1024],
    #     'k_retrieve': 100,       # Retrieve 100 documents from vector store
    #     'k_rerank': 20,          # Keep top 20 after re-ranking
    #     'reranker_models': [
    #         # 'cross-encoder/ms-marco-MiniLM-L-12-v2',
    #         # 'BAAI/bge-reranker-large',
    #         'voyage-rerank-2.5'
    #     ]
    # }
]

In [100]:
# %% [markdown]
# ## 9. Batch Evaluation Execution

# %%
def run_batch_evaluation(
    configurations: List[Dict],
    modes: List[str],
    dataset,
    all_evidence: List[Dict],
    # Quoted so that defining this function does not require
    # SentenceTransformer to be imported beforehand.
    sbert_model: "SentenceTransformer",
    reranker_instances: Dict,
    vector_db_base_dir: str,
    collection_prefix: str,
    output_dir: str,
    threshold: float,
    use_page_tolerance: bool
) -> Dict[str, Any]:
    """
    Run evaluate_configuration() for every combination in all configurations.

    One run is executed per (chunk size, reranker model, mode) combination of
    each configuration. A failing run never aborts the batch: it is counted,
    recorded in the returned ``failures`` list, and the loop moves on.

    Args:
        configurations: List of configuration dicts. Each must provide the keys
            'provider', 'model', 'chunk_sizes', 'k_retrieve', 'k_rerank' and
            'reranker_models'.
        modes: List of modes ('global', 'single')
        dataset: FinanceBench dataset
        all_evidence: All evidence items with embeddings
        sbert_model: Sentence-BERT model for text similarity
        reranker_instances: Dict of loaded reranker models, keyed by the names
            used in each configuration's 'reranker_models'
        vector_db_base_dir: Base directory for vector databases
        collection_prefix: Collection name prefix
        output_dir: Output directory
        threshold: Text similarity threshold
        use_page_tolerance: Whether to use page tolerance

    Returns:
        Dict with the keys:
            'total_runs': number of planned runs
            'completed':  number of runs that reported status 'completed'
            'skipped':    number of runs that reported status 'skipped'
            'failed':     number of runs that raised, reported any other
                          status, or whose reranker was not in
                          ``reranker_instances``
            'results':    results of the completed runs, in execution order
            'failures':   one dict per failed run with 'provider', 'model',
                          'chunk_size', 'reranker_model', 'mode' and 'reason'

    Raises:
        KeyError: If a configuration dict lacks one of the required keys.
    """
    # Imported once per call instead of inside the innermost except handler.
    import traceback

    print("\n" + "#"*80)
    print("STARTING BATCH EVALUATION")
    print("#"*80)

    # One run per (chunk size, reranker, mode) combination of every configuration.
    total_runs = sum(
        len(config['chunk_sizes']) * len(config['reranker_models']) * len(modes)
        for config in configurations
    )

    print(f"\nTotal evaluation runs: {total_runs}")
    print(f"Output directory: {output_dir}")

    # Track results
    all_results = []
    failures = []
    completed = 0
    skipped = 0
    failed = 0

    # Iterate through all configurations
    for config_idx, config in enumerate(configurations, 1):
        provider = config['provider']
        model = config['model']
        chunk_sizes = config['chunk_sizes']
        k_retrieve = config['k_retrieve']
        k_rerank = config['k_rerank']
        reranker_models = config['reranker_models']

        print(f"\n{'#'*80}")
        print(f"Configuration {config_idx}/{len(configurations)}: {provider}/{model}")
        print(f"{'#'*80}")

        for chunk_size in chunk_sizes:
            for reranker_model in reranker_models:
                # The reranker does not depend on the mode: look it up once per model.
                reranker_instance = reranker_instances.get(reranker_model)

                for mode in modes:
                    # Identifies this run in the failure report.
                    run_id = {
                        'provider': provider,
                        'model': model,
                        'chunk_size': chunk_size,
                        'reranker_model': reranker_model,
                        'mode': mode,
                    }

                    if reranker_instance is None:
                        # A missing reranker is counted as a failure for each mode.
                        print(f"\n⚠️  Reranker {reranker_model} not found. Skipping...")
                        failed += 1
                        failures.append({**run_id, 'reason': 'reranker not loaded'})
                        continue

                    # Run evaluation
                    try:
                        result = evaluate_configuration(
                            provider=provider,
                            model=model,
                            chunk_size=chunk_size,
                            k_retrieve=k_retrieve,
                            k_rerank=k_rerank,
                            reranker_model=reranker_model,
                            mode=mode,
                            dataset=dataset,
                            all_evidence=all_evidence,
                            sbert_model=sbert_model,
                            reranker_instance=reranker_instance,
                            vector_db_base_dir=vector_db_base_dir,
                            collection_prefix=collection_prefix,
                            output_dir=output_dir,
                            threshold=threshold,
                            use_page_tolerance=use_page_tolerance
                        )
                        # Read inside the try so a malformed result is reported as a failure.
                        status = result['status']
                    except Exception as e:
                        # Exception (not BaseException): KeyboardInterrupt still stops the batch.
                        print(f"\n❌ Evaluation failed with exception: {e}")
                        traceback.print_exc()
                        failed += 1
                        failures.append({**run_id, 'reason': f"{type(e).__name__}: {e}"})
                        continue

                    if status == 'completed':
                        completed += 1
                        all_results.append(result)
                    elif status == 'skipped':
                        skipped += 1
                    else:
                        failed += 1
                        failures.append({**run_id, 'reason': f"unexpected status: {status!r}"})

    # Print final summary
    print("\n" + "#"*80)
    print("BATCH EVALUATION COMPLETE")
    print("#"*80)
    print(f"\nTotal runs: {total_runs}")
    print(f"  ✓ Completed: {completed}")
    print(f"  ⊘ Skipped:   {skipped}")
    print(f"  ✗ Failed:    {failed}")
    print(f"\nResults saved to: {output_dir}")

    return {
        'total_runs': total_runs,
        'completed': completed,
        'skipped': skipped,
        'failed': failed,
        'results': all_results,
        'failures': failures
    }

print("✓ run_batch_evaluation() defined")

# %% [markdown]
# ## 9.1 Display Evaluation Plan

# %%
# Dry-run overview: lists every configuration, the number of runs it expands
# to, and whether a sample of its result files already exists on disk.
# Nothing is evaluated here.
print("\n" + "=" * 80, "EVALUATION PLAN", "=" * 80, sep="\n")

print(f"\nDataset: FinanceBench ({len(dataset)} queries)")

# Settings shared by every run.
print("\nEvaluation Settings:")
print(f"  Modes: {modes}")
print(f"  Text similarity threshold: {TEXT_SIMILARITY_THRESHOLD}")
print("  Page tolerance: " + ('ENABLED' if USE_PAGE_TOLERANCE else 'DISABLED'))

print("\nConfigurations to evaluate:")
total_runs = 0
for i, config in enumerate(configurations, 1):
    provider = config['provider']
    model = config['model']
    chunk_sizes = config['chunk_sizes']
    k_retrieve = config['k_retrieve']
    k_rerank = config['k_rerank']
    reranker_models = config['reranker_models']

    # One run per (chunk size, reranker, mode) combination.
    runs_for_config = len(chunk_sizes) * len(reranker_models) * len(modes)
    total_runs = total_runs + runs_for_config

    print(
        f"\n  {i}. {provider}/{model}",
        f"     Chunk sizes: {chunk_sizes}",
        f"     k_retrieve: {k_retrieve}, k_rerank: {k_rerank}",
        f"     Reranker models: {len(reranker_models)}",
        sep="\n",
    )
    for reranker in reranker_models:
        print(f"       - {reranker}")
    print(f"     Evaluation runs: {runs_for_config}")

    # Preview of the result files: first chunk size and at most two rerankers,
    # but every mode.
    print("     Output files (sample):")
    for chunk_size in chunk_sizes[:1]:
        for reranker in reranker_models[:2]:
            for mode in modes:
                filename = get_output_filename(provider, model, chunk_size, k_retrieve, k_rerank, mode, reranker)
                exists = check_if_results_exist(provider, model, chunk_size, k_retrieve, k_rerank, mode, reranker, OUTPUT_DIR)
                if exists:
                    status = "EXISTS"
                else:
                    status = "TO CREATE"
                print(f"       - {filename} [{status}]")
        # Tell the reader how many rerankers were left out of the preview.
        if len(reranker_models) > 2:
            print(f"       ... ({len(reranker_models) - 2} more rerankers)")

# Totals across all configurations.
print(
    f"\n{'=' * 80}",
    f"Total evaluation runs: {total_runs}",
    f"Output directory: {OUTPUT_DIR}",
    f"{'=' * 80}",
    sep="\n",
)

# %% [markdown]
# ## 9.2 Execute Batch Evaluation
# 
# **IMPORTANT**: This cell will run the full evaluation.
# - Depending on configurations, this may take significant time (hours)
# - Progress will be shown for each configuration
# - Results are saved incrementally (existing results are skipped)
# - You can interrupt and resume anytime

# %%
# Runs the full batch evaluation. The call below is active: executing this
# cell starts every run described in the evaluation plan. Comment the call
# out (and enable the notice at the bottom of the cell) to skip the run.
# `batch_results` receives the summary dict returned by run_batch_evaluation()
# (run counters plus the results of the completed runs).

batch_results = run_batch_evaluation(
    configurations=configurations,
    modes=modes,
    dataset=dataset,
    all_evidence=all_evidence,
    sbert_model=sbert_model,
    reranker_instances=reranker_instances,
    vector_db_base_dir=VECTOR_DB_BASE_DIR,
    collection_prefix=COLLECTION_PREFIX,
    output_dir=OUTPUT_DIR,
    threshold=TEXT_SIMILARITY_THRESHOLD,
    use_page_tolerance=USE_PAGE_TOLERANCE
)

# Notice to show instead when the call above is commented out:
# print("\n⚠️  Batch evaluation is commented out.")
# print("Uncomment the code above to start the evaluation.")

# %%
# Step 9 completion banner with the instructions for launching the batch run.
# The lines are joined and printed once; the emitted text is the same as
# printing each line separately.
print("\n".join([
    "\n" + "=" * 80,
    "✓ STEP 9 COMPLETE: Batch Evaluation Ready",
    "=" * 80,
    "\nTo run the evaluation:",
    "  1. Review the evaluation plan above",
    "  2. Uncomment the batch evaluation code in section 9.2",
    "  3. Run the cell to start evaluation",
    "  4. Monitor progress (may take hours depending on configurations)",
    "\nNext step: Results analysis and visualization",
]))

✓ run_batch_evaluation() defined

EVALUATION PLAN

Dataset: FinanceBench (150 queries)

Evaluation Settings:
  Modes: ['global', 'single']
  Text similarity threshold: 0.7
  Page tolerance: ENABLED

Configurations to evaluate:

  1. voyage/voyage-3-large
     Chunk sizes: [1024]
     k_retrieve: 80, k_rerank: 20
     Reranker models: 1
       - voyage-rerank-2.5
     Evaluation runs: 2
     Output files (sample):
       - voyage_voyage-3-large_chunk1024_k80_global_rerank_k20-voyage-rerank-2.5.json [TO CREATE]
       - voyage_voyage-3-large_chunk1024_k80_single_rerank_k20-voyage-rerank-2.5.json [TO CREATE]

Total evaluation runs: 2
Output directory: ../../evaluation_results/reranking_results

################################################################################
STARTING BATCH EVALUATION
################################################################################

Total evaluation runs: 2
Output directory: ../../evaluation_results/reranking_results

#######################

Queries:   0%|          | 0/150 [00:00<?, ?it/s]


3. Saving results...
✓ Results saved to: ../../evaluation_results/reranking_results/voyage_voyage-3-large_chunk1024_k80_global_rerank_k20-voyage-rerank-2.5.json

RESULTS SUMMARY

Metrics at k_retrieve=80:
  MRR:       0.5422
  Recall:    0.8731
  Precision: 0.0946
  F1:        0.1633

Metrics at k_rerank=20 (before reranking):
  MRR:       0.5409
  Recall:    0.8120
  Precision: 0.1763
  F1:        0.2723

Metrics at k_rerank=20 (after reranking):
  MRR:       0.6440
  Recall:    0.8309
  Precision: 0.2117
  F1:        0.3188

Improvement (reranked vs initial k_rerank):
  MRR:       +0.1031
  Recall:    +0.0189
  Precision: +0.0354
  F1:        +0.0465

Evaluating: voyage/voyage-3-large | chunk=1024 | k_retrieve=80 | k_rerank=20 | single | reranker=voyage-rerank-2.5

1. Loading vector store...
  ✓ Loaded collection 'financebench_docs_chunk_1024' from ../../vector_databases/voyage_voyage-3-large
    Documents: 15765

2. Processing 150 queries...


Queries:   0%|          | 0/150 [00:00<?, ?it/s]


3. Saving results...
✓ Results saved to: ../../evaluation_results/reranking_results/voyage_voyage-3-large_chunk1024_k80_single_rerank_k20-voyage-rerank-2.5.json

RESULTS SUMMARY

Metrics at k_retrieve=80:
  MRR:       0.5689
  Recall:    0.8376
  Precision: 0.0906
  F1:        0.1547

Metrics at k_rerank=20 (before reranking):
  MRR:       0.5682
  Recall:    0.8176
  Precision: 0.1495
  F1:        0.2383

Metrics at k_rerank=20 (after reranking):
  MRR:       0.6914
  Recall:    0.8309
  Precision: 0.1802
  F1:        0.2788

Improvement (reranked vs initial k_rerank):
  MRR:       +0.1232
  Recall:    +0.0133
  Precision: +0.0307
  F1:        +0.0405

################################################################################
BATCH EVALUATION COMPLETE
################################################################################

Total runs: 2
  ✓ Completed: 2
  ⊘ Skipped:   0
  ✗ Failed:    0

Results saved to: ../../evaluation_results/reranking_results

✓ STEP 9 COMPLETE: B