In [1]:
# %% [markdown]
# # Simplified Re-Ranking Pipeline for Answer Generation
# This notebook implements re-ranking for RAG systems and outputs simplified JSON files
# with FULL chunk text for use in answer generation pipelines.
# 
# **Workflow:**
# 1. Retrieve top k_retrieve chunks from existing vector stores
# 2. Apply re-ranking to get top k_rerank chunks
# 3. Save results with full chunk text (no truncation, no evaluation metrics)

# %% [markdown]
# ## 1. Imports and Configuration

# %%
# Standard library imports
import os
import json
from typing import List, Dict, Tuple, Any
from pathlib import Path
from datetime import datetime

# Third-party imports
import numpy as np
from tqdm.auto import tqdm
from dotenv import load_dotenv

# LangChain imports
from langchain.schema import Document
from langchain_voyageai import VoyageAIRerank
from langchain_ollama import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_voyageai import VoyageAIEmbeddings
from langchain_openai import OpenAIEmbeddings

# Sentence transformers for cross-encoders
from sentence_transformers import CrossEncoder

# Hugging Face datasets
from datasets import load_dataset

print("✓ All imports successful")

# %%
# Load environment variables (python-dotenv reads a local .env file if one exists)
load_dotenv()

# Service endpoint and API keys
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")
VOYAGE_API_KEY = os.getenv("VOYAGE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")  # only read by the 'openai' embedding provider

# Verify the Voyage key, which both the Voyage embedding model and the Voyage
# reranker in this notebook are constructed with. An explicit raise is used
# instead of `assert` so the check still runs under `python -O`, and `not`
# also rejects a key that is present but empty.
if not VOYAGE_API_KEY:
    raise RuntimeError("VOYAGE_API_KEY not found in environment")
print("✓ API keys loaded successfully")

# %%
# Configuration Parameters
# Where the persisted vector stores live and where result files are written
VECTOR_DB_BASE_DIR = "../../vector_databases"   # load_vector_store() appends "<provider>_<model>"
COLLECTION_PREFIX = "financebench_docs_chunk_"  # the chunk size is appended to form the collection name
OUTPUT_DIR = "../../retrieval_set"

# Retrieval modes handled by retrieve_documents()
modes = ['global', 'single']

# Make sure the output directory is present before anything writes into it
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("✓ Configuration parameters set")
print(f"  - Vector DB base directory: {VECTOR_DB_BASE_DIR}")
print(f"  - Collection prefix: {COLLECTION_PREFIX}")
print(f"  - Output directory: {OUTPUT_DIR}")
print(f"\n  Evaluation modes: {modes}")

# %%
print("\n" + "="*80)
print("✓ STEP 1 COMPLETE: Imports and Configuration")
print("="*80)
print("\nNext steps:")
print("  2. Load dataset")
print("  3. Initialize reranker models")
print("  4. Implement retrieval and re-ranking functions")
print("  5. Implement main pipeline")
print("  6. Run batch evaluation")

✓ All imports successful
✓ API keys loaded successfully
✓ Configuration parameters set
  - Vector DB base directory: ../../vector_databases
  - Collection prefix: financebench_docs_chunk_
  - Output directory: ../../retrieval_set

  Evaluation modes: ['global', 'single']

✓ STEP 1 COMPLETE: Imports and Configuration

Next steps:
  2. Load dataset
  3. Initialize reranker models
  4. Implement retrieval and re-ranking functions
  5. Implement main pipeline
  6. Run batch evaluation


In [2]:
# %% [markdown]
# ## 2. Load Dataset

# %%
# Load FinanceBench dataset
print("Loading FinanceBench dataset...")
dataset = load_dataset("PatronusAI/financebench", split="train")
print(f"✓ Loaded {len(dataset)} questions from FinanceBench")

# %%
# Display dataset structure
print("\nDataset fields:")
print(f"  Available columns: {dataset.column_names}")

# %%
# Display sample query to verify structure
print("\nSample query from dataset:")
sample_query = dataset[0]
print(f"  financebench_id: {sample_query['financebench_id']}")
print(f"  question_type: {sample_query['question_type']}")
print(f"  question_reasoning: {sample_query['question_reasoning']}")
print(f"  question: {sample_query['question'][:100]}...")
print(f"  doc_name: {sample_query['doc_name']}")
print(f"  answer: {sample_query['answer']}")
print(f"  evidence items: {len(sample_query['evidence'])}")

# %%
# Check the sample record for every column the rest of the notebook reads
required_fields = [
    'financebench_id',
    'question_type',
    'question_reasoning',
    'question',
    'doc_name',
    'answer',
    'evidence',
]

print("\nVerifying required fields...")
for field in required_fields:
    # Stop at the first absent column: report it, then abort the cell
    if field not in sample_query:
        print(f"  ✗ {field} - MISSING!")
        raise ValueError(f"Required field '{field}' not found in dataset")
    print(f"  ✓ {field}")

print("\n✓ All required fields present")

# %%
print("\n" + "="*80)
print("✓ STEP 2 COMPLETE: Dataset Loaded")
print("="*80)
print(f"\nDataset statistics:")
print(f"  Total queries: {len(dataset)}")
print(f"  Fields available: {len(dataset.column_names)}")
print(f"\nNext step: Initialize reranker models")

Loading FinanceBench dataset...
✓ Loaded 150 questions from FinanceBench

Dataset fields:
  Available columns: ['financebench_id', 'company', 'doc_name', 'question_type', 'question_reasoning', 'domain_question_num', 'question', 'answer', 'justification', 'dataset_subset_label', 'evidence', 'gics_sector', 'doc_type', 'doc_period', 'doc_link']

Sample query from dataset:
  financebench_id: financebench_id_03029
  question_type: metrics-generated
  question_reasoning: Information extraction
  question: What is the FY2018 capital expenditure amount (in USD millions) for 3M? Give a response to the quest...
  doc_name: 3M_2018_10K
  answer: $1577.00
  evidence items: 1

Verifying required fields...
  ✓ financebench_id
  ✓ question_type
  ✓ question_reasoning
  ✓ question
  ✓ doc_name
  ✓ answer
  ✓ evidence

✓ All required fields present

✓ STEP 2 COMPLETE: Dataset Loaded

Dataset statistics:
  Total queries: 150
  Fields available: 15

Next step: Initialize reranker models


In [3]:
# %% [markdown]
# ## 3. Initialize Reranker Models

# %%
# Define configurations: one entry per (chunk size, embedding backend) pair.
# Every entry retrieves 80 candidates and is scored by the same three
# rerankers; only the number of chunks kept after re-ranking depends on the
# chunk size.
_reranker_names = (
    'cross-encoder/ms-marco-MiniLM-L-12-v2',
    'BAAI/bge-reranker-large',
    'voyage-rerank-2.5',
)
_embedding_backends = (
    ('voyage', 'voyage-3-large'),
    ('ollama', 'nomic-embed-text'),
)
_chunk_plan = (
    (512, 20),   # (chunk size, k_rerank)
    (1024, 20),
    (2048, 10),
)

configurations = [
    {
        'provider': backend,
        'model': embedding_name,
        'chunk_sizes': [size],
        'k_retrieve': 80,
        'k_rerank': keep,
        'reranker_models': list(_reranker_names),  # fresh list per entry
    }
    for size, keep in _chunk_plan
    for backend, embedding_name in _embedding_backends
]

print(f"✓ Defined {len(configurations)} configurations")

# %%
def get_all_unique_rerankers(configurations: List[Dict]) -> List[str]:
    """Return every distinct name found under 'reranker_models', in sorted order."""
    distinct_names = {
        name
        for entry in configurations
        for name in entry['reranker_models']
    }
    return sorted(distinct_names)

# Get all unique rerankers
unique_rerankers = get_all_unique_rerankers(configurations)
print(f"\nUnique reranker models to initialize: {len(unique_rerankers)}")
for reranker in unique_rerankers:
    print(f"  - {reranker}")

# %%
# Initialize reranker models
print("\n" + "="*80)
print("Initializing reranker models...")
print("="*80)

# Maps reranker name -> loaded CrossEncoder, or the sentinel string 'api'
# for the reranker that is called remotely instead of being loaded here.
reranker_instances = {}

for reranker_name in unique_rerankers:
    print(f"\nLoading: {reranker_name}")

    if reranker_name == 'voyage-rerank-2.5':
        # Nothing to load locally; rerank_with_voyage() builds its own client
        reranker_instances[reranker_name] = 'api'
        print(f"  ✓ Voyage reranker marked as API-based (will initialize per-query)")
        continue

    if not reranker_name.startswith(('cross-encoder/', 'BAAI/')):
        raise ValueError(f"Unknown reranker type: {reranker_name}")

    # Remaining names are Hugging Face cross-encoder checkpoints
    try:
        model = CrossEncoder(reranker_name)
        reranker_instances[reranker_name] = model
        print(f"  ✓ Successfully loaded Hugging Face model")
        print(f"    Max sequence length: {model.max_length}")
    except Exception as e:
        # Report which model failed, then let the original error propagate
        print(f"  ✗ Failed to load {reranker_name}: {e}")
        raise

print("\n" + "="*80)
print(f"✓ Initialized {len(reranker_instances)} reranker models")
print("="*80)

# %%
# Display loaded models
print("\nLoaded reranker models:")
for name, instance in reranker_instances.items():
    if instance == 'api':
        print(f"  ✓ {name} (API-based)")
    else:
        print(f"  ✓ {name} (Local model)")

# %%
print("\n" + "="*80)
print("✓ STEP 3 COMPLETE: Reranker Models Initialized")
print("="*80)
print("\nReady rerankers:")
for reranker_name in reranker_instances.keys():
    print(f"  - {reranker_name}")
print(f"\nNext step: Implement helper functions")

✓ Defined 6 configurations

Unique reranker models to initialize: 3
  - BAAI/bge-reranker-large
  - cross-encoder/ms-marco-MiniLM-L-12-v2
  - voyage-rerank-2.5

Initializing reranker models...

Loading: BAAI/bge-reranker-large
  ✓ Successfully loaded Hugging Face model
    Max sequence length: 512

Loading: cross-encoder/ms-marco-MiniLM-L-12-v2
  ✓ Successfully loaded Hugging Face model
    Max sequence length: 512

Loading: voyage-rerank-2.5
  ✓ Voyage reranker marked as API-based (will initialize per-query)

✓ Initialized 3 reranker models

Loaded reranker models:
  ✓ BAAI/bge-reranker-large (Local model)
  ✓ cross-encoder/ms-marco-MiniLM-L-12-v2 (Local model)
  ✓ voyage-rerank-2.5 (API-based)

✓ STEP 3 COMPLETE: Reranker Models Initialized

Ready rerankers:
  - BAAI/bge-reranker-large
  - cross-encoder/ms-marco-MiniLM-L-12-v2
  - voyage-rerank-2.5

Next step: Implement helper functions


In [10]:
# %% [markdown]
# ## 4. Helper Functions

# %%
def extract_doc_name_from_path(file_path: str) -> str:
    """
    Return the last component of a path with its final suffix removed.

    Example:
        "../../documents/3M_2018_10K.pdf" → "3M_2018_10K"

    Args:
        file_path: Path to a document file

    Returns:
        The file's stem (only the last extension is stripped)
    """
    document_path = Path(file_path)
    return document_path.stem

print("✓ extract_doc_name_from_path() defined")

# %%
def extract_metadata_from_retrieved_doc(doc) -> Dict:
    """
    Extract metadata from a retrieved LangChain document.

    Metadata keys read:
        - file_path: Path to the source PDF; its stem becomes the doc name
        - source: 0-indexed page number (int, integral float, or numeric string)

    Args:
        doc: Object exposing ``.metadata`` (a dict) and ``.page_content``,
            e.g. a LangChain Document returned by the vector store

    Returns:
        Dict with:
            - doc_name: Document name (e.g., "3M_2018_10K"), '' if no file_path
            - page_number: 1-indexed page number, or -1 when the page is
              missing, negative, or not parseable as an integer
            - chunk_text: Full chunk text
    """
    metadata = doc.metadata

    # Extract document name from file_path
    file_path = metadata.get('file_path', '')
    doc_name = extract_doc_name_from_path(file_path) if file_path else ''

    # Read the 0-indexed page. The default is None rather than -1: an integer
    # default would pass through the int branch and come out as page 0
    # instead of the -1 "unknown" sentinel.
    page_source = metadata.get('source')

    page_index = None
    if isinstance(page_source, bool):
        # bool is a subclass of int; a True/False page is not meaningful
        page_index = None
    elif isinstance(page_source, int):
        page_index = page_source
    elif isinstance(page_source, float):
        # Accept values such as 47.0; reject NaN, inf and fractional pages
        if page_source.is_integer():
            page_index = int(page_source)
    elif isinstance(page_source, str):
        try:
            page_index = int(page_source)
        except ValueError:
            page_index = None

    # Convert to 1-indexed; anything unknown or negative maps to -1
    if page_index is None or page_index < 0:
        page_number = -1
    else:
        page_number = page_index + 1

    return {
        'doc_name': doc_name,
        'page_number': page_number,
        'chunk_text': doc.page_content
    }

print("✓ extract_metadata_from_retrieved_doc() defined")

# %%
def simplify_reranker_name(reranker_model: str) -> str:
    """
    Drop any organisation/namespace prefix from a reranker identifier.

    Everything up to and including the last '/' is removed; names without a
    '/' are returned unchanged.

    Examples:
        'cross-encoder/ms-marco-MiniLM-L-12-v2' -> 'ms-marco-MiniLM-L-12-v2'
        'BAAI/bge-reranker-large' -> 'bge-reranker-large'
        'voyage-rerank-2.5' -> 'voyage-rerank-2.5'
    """
    _, _, short_name = reranker_model.rpartition('/')
    return short_name

print("✓ simplify_reranker_name() defined")

# %%
def get_output_filename(
    provider: str,
    model: str,
    chunk_size: int,
    k_retrieve: int,
    k_rerank: int,
    mode: str,
    reranker_model: str
) -> str:
    """
    Build the JSON file name that identifies one run.

    Format: {provider}_{model}_chunk{chunk_size}_k{k_retrieve}_{mode}_rerank_k{k_rerank}-{reranker}.json

    The reranker part is shortened with simplify_reranker_name(); the other
    values are inserted as given.

    Example: voyage_voyage-3-large_chunk512_k80_single_rerank_k20-voyage-rerank-2.5.json
    """
    short_reranker = simplify_reranker_name(reranker_model)
    return f"{provider}_{model}_chunk{chunk_size}_k{k_retrieve}_{mode}_rerank_k{k_rerank}-{short_reranker}.json"

print("✓ get_output_filename() defined")

# %%
def check_if_results_exist(
    provider: str,
    model: str,
    chunk_size: int,
    k_retrieve: int,
    k_rerank: int,
    mode: str,
    reranker_model: str,
    output_dir: str
) -> bool:
    """
    Tell whether the results file for this run is already on disk.

    The file name comes from get_output_filename(); only the presence of a
    path with that name under ``output_dir`` is checked, not its contents.

    Returns:
        True if the path exists, False otherwise
    """
    expected_name = get_output_filename(
        provider, model, chunk_size, k_retrieve, k_rerank, mode, reranker_model
    )
    return Path(output_dir, expected_name).exists()

print("✓ check_if_results_exist() defined")

# %%
def save_results(
    results: Dict,
    provider: str,
    model: str,
    chunk_size: int,
    k_retrieve: int,
    k_rerank: int,
    mode: str,
    reranker_model: str,
    output_dir: str
) -> str:
    """
    Save results to a JSON file, replacing any previous file of the same name.

    The data is first written to a sibling "<name>.tmp" file and then moved
    into place. check_if_results_exist() only tests whether the final path
    exists, so a write that fails or is interrupted part-way must not leave
    a truncated file under that name.

    Args:
        results: Results dictionary to save (must be JSON-serialisable)
        provider: Embedding provider
        model: Embedding model
        chunk_size: Chunk size
        k_retrieve: Number of documents retrieved
        k_rerank: Number of documents kept after reranking
        mode: 'global' or 'single'
        reranker_model: Reranker model name
        output_dir: Output directory (created if it does not exist)

    Returns:
        Path to saved file
    """
    filename = get_output_filename(provider, model, chunk_size, k_retrieve, k_rerank, mode, reranker_model)

    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    filepath = target_dir / filename
    tmp_path = filepath.with_name(filepath.name + '.tmp')

    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2, ensure_ascii=False)
        # Same-directory rename: the final name only ever refers to a complete file
        os.replace(tmp_path, filepath)
    finally:
        # After a successful replace the temp file is gone and this is a no-op;
        # after a failure it removes the partial output.
        if tmp_path.exists():
            tmp_path.unlink()

    return str(filepath)

print("✓ save_results() defined")

# %%
print("\n" + "="*80)
print("✓ STEP 4 COMPLETE: Helper Functions Implemented")
print("="*80)
print("\nImplemented functions:")
print("  ✓ extract_doc_name_from_path() - Extract doc name from file path")
print("  ✓ extract_metadata_from_retrieved_doc() - Extract metadata from retrieved docs")
print("  ✓ simplify_reranker_name() - Simplify reranker names")
print("  ✓ get_output_filename() - Generate output filenames")
print("  ✓ check_if_results_exist() - Check if results already exist")
print("  ✓ save_results() - Save results to JSON")
print("\nNext step: Implement retrieval functions")

✓ extract_doc_name_from_path() defined
✓ extract_metadata_from_retrieved_doc() defined
✓ simplify_reranker_name() defined
✓ get_output_filename() defined
✓ check_if_results_exist() defined
✓ save_results() defined

✓ STEP 4 COMPLETE: Helper Functions Implemented

Implemented functions:
  ✓ extract_doc_name_from_path() - Extract doc name from file path
  ✓ extract_metadata_from_retrieved_doc() - Extract metadata from retrieved docs
  ✓ simplify_reranker_name() - Simplify reranker names
  ✓ get_output_filename() - Generate output filenames
  ✓ check_if_results_exist() - Check if results already exist
  ✓ save_results() - Save results to JSON

Next step: Implement retrieval functions


In [11]:
# %% [markdown]
# ## 5. Retrieval Functions

# %%
def get_embedding_model(provider: str, model: str):
    """
    Build the embedding client for the given provider.

    Args:
        provider: 'voyage', 'openai', or 'ollama'
        model: Model name passed through to the client

    Returns:
        Embedding model instance

    Raises:
        ValueError: If ``provider`` is not one of the three supported values
    """
    if provider == 'voyage':
        return VoyageAIEmbeddings(model=model, voyage_api_key=VOYAGE_API_KEY)

    if provider == 'openai':
        return OpenAIEmbeddings(model=model, openai_api_key=OPENAI_API_KEY)

    if provider == 'ollama':
        return OllamaEmbeddings(model=model, base_url=OLLAMA_BASE_URL)

    raise ValueError(f"Unknown provider: {provider}")

print("✓ get_embedding_model() defined")

# %%
def load_vector_store(provider: str, model: str, chunk_size: int, base_dir: str, collection_prefix: str = "financebench_docs_chunk_"):
    """
    Load existing ChromaDB vector store.

    Directory layout expected:
    {base_dir}/{provider}_{model}/ holding collection {collection_prefix}{chunk_size}
    (any '/' in the model name is replaced by '_').

    Args:
        provider: Embedding provider
        model: Embedding model name
        chunk_size: Chunk size used
        base_dir: Base directory for vector databases
        collection_prefix: Prefix for collection names

    Returns:
        Chroma vector store instance

    Raises:
        ValueError: If the database directory is missing, the store cannot be
            opened, or the collection contains no documents
    """
    # Construct paths matching baseline structure
    model_id = f"{provider}_{model.replace('/', '_')}"
    db_path = os.path.join(base_dir, model_id)
    collection_name = f"{collection_prefix}{chunk_size}"

    # Fail fast, before any embedding client is built.
    # NOTE(review): opening Chroma on a missing directory appears to initialise
    # a new, empty store there rather than fail - confirm against the Chroma docs.
    if not os.path.isdir(db_path):
        raise ValueError(
            f"Failed to load collection '{collection_name}' from {db_path}: directory does not exist"
        )

    embedding_model = get_embedding_model(provider, model)

    try:
        vectorstore = Chroma(
            collection_name=collection_name,
            embedding_function=embedding_model,
            persist_directory=db_path
        )
        # Document count doubles as the existence check for the collection
        count = vectorstore._collection.count()
    except Exception as e:
        # Keep the original error attached as the explicit cause
        raise ValueError(f"Failed to load collection '{collection_name}' from {db_path}: {e}") from e

    if count == 0:
        raise ValueError(
            f"Failed to load collection '{collection_name}' from {db_path}: "
            f"Collection '{collection_name}' is empty"
        )

    print(f"  ✓ Loaded collection '{collection_name}' from {db_path}")
    print(f"    Documents: {count}")
    return vectorstore

print("✓ load_vector_store() defined")

# %%
def retrieve_documents(
    vectorstore,
    query: str,
    k: int,
    mode: str,
    doc_name: str = None
) -> List[Dict]:
    """
    Fetch up to k chunks for a query, optionally restricted to one document.

    In 'single' mode the store is asked for 3*k candidates, which are then
    filtered in Python down to the first k whose extracted doc name equals
    ``doc_name``; fewer than k may be returned if not enough candidates match.
    Any other mode value is treated as global and returns the store's top k.

    Args:
        vectorstore: Store exposing similarity_search_with_score(query, k=...)
        query: Search query
        k: Number of documents to retrieve
        mode: 'global' or 'single'
        doc_name: Document name (required for 'single' mode)

    Returns:
        List of retrieved documents with metadata
        Format: [{'doc_name': str, 'page_number': int, 'content': str, 'rank': int, 'score': float}, ...]

    Raises:
        ValueError: If mode is 'single' and doc_name is None
    """
    if mode != 'single':
        # Global mode: take the store's ranking as-is
        hits = vectorstore.similarity_search_with_score(query, k=k)
    else:
        if doc_name is None:
            raise ValueError("doc_name required for single-document mode")

        # Over-fetch, then keep only chunks from the requested document
        candidates = vectorstore.similarity_search_with_score(query, k=k * 3)

        hits = []
        for candidate, candidate_score in candidates:
            if extract_metadata_from_retrieved_doc(candidate)['doc_name'] != doc_name:
                continue
            hits.append((candidate, candidate_score))
            if len(hits) >= k:
                break

        hits = hits[:k]

    # Ranks are 1-based positions within the (possibly filtered) hit list
    formatted = []
    for position, (hit, hit_score) in enumerate(hits, start=1):
        info = extract_metadata_from_retrieved_doc(hit)
        formatted.append({
            'doc_name': info['doc_name'],
            'page_number': info['page_number'],
            'content': info['chunk_text'],  # full, untruncated text
            'rank': position,
            'score': float(hit_score)
        })

    return formatted

print("✓ retrieve_documents() defined")

# %%
# Test retrieval with a sample configuration
# Smoke test: loads one vector store and runs the first dataset question through
# retrieve_documents() in both modes, printing the top hit of each.
print("\n" + "="*80)
print("Testing retrieval functions...")
print("="*80)

# Use first configuration for testing
test_config = configurations[0]
test_provider = test_config['provider']
test_model = test_config['model']
test_chunk_size = test_config['chunk_sizes'][0]  # each configuration lists its chunk sizes; take the first
test_k_retrieve = test_config['k_retrieve']

print(f"\nTest parameters:")
print(f"  Provider: {test_provider}")
print(f"  Model: {test_model}")
print(f"  Chunk size: {test_chunk_size}")
print(f"  k_retrieve: {test_k_retrieve}")

# NOTE(review): everything below runs inside one broad try/except, so any
# failure (missing store, API error, or an IndexError from indexing [0] on an
# empty result list) is printed with the "expected" hint instead of raising.
try:
    # Load vector store
    print(f"\nLoading vector store...")
    test_vectorstore = load_vector_store(
        test_provider,
        test_model,
        test_chunk_size,
        VECTOR_DB_BASE_DIR,
        COLLECTION_PREFIX
    )
    
    # Test retrieval with first query
    test_query = dataset[0]
    print(f"\nTest query: {test_query['question'][:100]}...")
    print(f"Target doc: {test_query['doc_name']}")
    
    # Test global mode
    print(f"\nRetrieving top {test_k_retrieve} documents (global mode)...")
    retrieved_global = retrieve_documents(
        test_vectorstore,
        test_query['question'],
        k=test_k_retrieve,
        mode='global'
    )
    print(f"  ✓ Retrieved {len(retrieved_global)} documents")
    print(f"  Top result: {retrieved_global[0]['doc_name']} (page {retrieved_global[0]['page_number']}, score: {retrieved_global[0]['score']:.4f})")
    print(f"  Content length: {len(retrieved_global[0]['content'])} characters")
    
    # Test single mode (results restricted to the question's own document)
    print(f"\nRetrieving top {test_k_retrieve} documents (single mode)...")
    retrieved_single = retrieve_documents(
        test_vectorstore,
        test_query['question'],
        k=test_k_retrieve,
        mode='single',
        doc_name=test_query['doc_name']
    )
    print(f"  ✓ Retrieved {len(retrieved_single)} documents")
    print(f"  Top result: {retrieved_single[0]['doc_name']} (page {retrieved_single[0]['page_number']}, score: {retrieved_single[0]['score']:.4f})")
    print(f"  Content length: {len(retrieved_single[0]['content'])} characters")
    
    print("\n✓ Retrieval test successful!")
    
except Exception as e:
    # Best-effort: report and continue so the remaining cells can still be defined
    print(f"\n✗ Retrieval test failed: {e}")
    print("\nThis is expected if vector stores haven't been created yet.")
    print("Make sure you have run the baseline evaluation first to create vector stores.")

# %%
print("\n" + "="*80)
print("✓ STEP 5 COMPLETE: Retrieval Functions Implemented")
print("="*80)
print("\nImplemented functions:")
print("  ✓ get_embedding_model() - Initialize embedding models")
print("  ✓ load_vector_store() - Load ChromaDB collections")
print("  ✓ retrieve_documents() - Retrieve top-k documents with filtering")
print("\nNext step: Implement re-ranking functions")

✓ get_embedding_model() defined
✓ load_vector_store() defined
✓ retrieve_documents() defined

Testing retrieval functions...

Test parameters:
  Provider: voyage
  Model: voyage-3-large
  Chunk size: 512
  k_retrieve: 80

Loading vector store...
  ✓ Loaded collection 'financebench_docs_chunk_512' from ../../vector_databases/voyage_voyage-3-large
    Documents: 28635

Test query: What is the FY2018 capital expenditure amount (in USD millions) for 3M? Give a response to the quest...
Target doc: 3M_2018_10K

Retrieving top 80 documents (global mode)...
  ✓ Retrieved 80 documents
  Top result: 3M_2018_10K (page 48, score: 0.5490)
  Content length: 1848 characters

Retrieving top 80 documents (single mode)...
  ✓ Retrieved 80 documents
  Top result: 3M_2018_10K (page 48, score: 0.5490)
  Content length: 1848 characters

✓ Retrieval test successful!

✓ STEP 5 COMPLETE: Retrieval Functions Implemented

Implemented functions:
  ✓ get_embedding_model() - Initialize embedding models
  ✓ load_vec

In [12]:
# %% [markdown]
# ## 6. Re-ranking Functions

# %%
import time

def rerank_with_voyage(
    query: str,
    retrieved_docs: List[Dict],
    reranker_model: str,
    top_k: int,
    max_retries: int = 3,
    retry_delay: int = 60
) -> List[Dict]:
    """
    Re-rank documents using Voyage AI reranker API with rate limit handling.

    Args:
        query: Search query
        retrieved_docs: List from retrieval; each item needs 'doc_name',
            'page_number', 'content', 'rank' and 'score'
        reranker_model: Voyage model name (e.g., 'voyage-rerank-2.5'); the
            'voyage-' part is removed before it is passed to the client
        top_k: Number of documents to return after re-ranking
        max_retries: Total number of attempts when a rate limit is hit
            (values below 1 are treated as 1)
        retry_delay: Seconds to wait between attempts

    Returns:
        Re-ranked list with {'doc_name', 'page_number', 'rank', 'initial_rank', 'initial_score', 'rerank_score', 'content'};
        an empty list when ``retrieved_docs`` is empty

    Raises:
        Exception: Whatever the reranker raised - immediately for errors that
            do not look like rate limiting, or after the last attempt otherwise
    """
    # Nothing to rank: skip building a client and the remote call altogether
    if not retrieved_docs:
        return []

    # Convert to LangChain documents, carrying the retrieval-stage fields along
    # in metadata so they can be read back after re-ranking
    lc_docs = [
        Document(
            page_content=doc['content'],
            metadata={
                'doc_name': doc['doc_name'],
                'page_number': doc['page_number'],
                'initial_rank': doc['rank'],
                'initial_score': doc['score']
            }
        )
        for doc in retrieved_docs
    ]

    # Initialize Voyage reranker
    # Extract model name (e.g., "rerank-2.5" from "voyage-rerank-2.5")
    model_name = reranker_model.replace('voyage-', '')

    reranker = VoyageAIRerank(
        model=model_name,
        voyage_api_key=VOYAGE_API_KEY,
        top_k=top_k
    )

    # Always make at least one attempt; with zero iterations the result
    # variable would never be assigned.
    attempts = max(1, max_retries)

    for attempt in range(attempts):
        try:
            reranked_docs = reranker.compress_documents(lc_docs, query)
            break  # Success, exit retry loop

        except Exception as e:
            error_msg = str(e).lower()

            # Rate limiting is recognised from the error text only
            if "rate limit" not in error_msg and "tpm" not in error_msg:
                raise

            if attempt >= attempts - 1:
                print(f"\n❌ Rate limit exceeded after {attempts} attempts")
                raise

            print(f"\n⚠️  Rate limit hit. Waiting {retry_delay} seconds before retry {attempt + 1}/{attempts}...")
            time.sleep(retry_delay)

    # Convert back to our format; rank is the position in the reranker's output
    results = []
    for rank, doc in enumerate(reranked_docs, start=1):
        results.append({
            'doc_name': doc.metadata['doc_name'],
            'page_number': doc.metadata['page_number'],
            'content': doc.page_content,  # FULL text
            'rank': rank,
            'initial_rank': doc.metadata['initial_rank'],
            'initial_score': doc.metadata['initial_score'],
            # 0.0 when the reranker attached no relevance_score to the document
            'rerank_score': doc.metadata.get('relevance_score', 0.0)
        })

    return results

print("✓ rerank_with_voyage() defined with rate limit handling")

# %%
def rerank_with_cross_encoder(
    query: str,
    retrieved_docs: List[Dict],
    cross_encoder_model: "CrossEncoder",
    top_k: int
) -> List[Dict]:
    """
    Re-rank documents using Hugging Face cross-encoder model.

    The input dictionaries are not modified; new dictionaries are returned.

    Args:
        query: Search query
        retrieved_docs: List from retrieval; each item needs 'doc_name',
            'page_number', 'content', 'rank' and 'score'
        cross_encoder_model: Loaded CrossEncoder model instance (any object
            with a compatible ``predict(pairs)`` method works)
        top_k: Number of documents to return after re-ranking

    Returns:
        Re-ranked list with {'doc_name', 'page_number', 'rank', 'initial_rank', 'initial_score', 'rerank_score', 'content'},
        highest rerank_score first; an empty list when ``retrieved_docs`` is empty
    """
    # Single-document retrieval can legitimately yield no candidates. Return
    # early instead of relying on predict() to accept an empty batch.
    if not retrieved_docs:
        return []

    # Prepare query-document pairs
    pairs = [[query, doc['content']] for doc in retrieved_docs]

    # Get relevance scores from cross-encoder (one score per pair, same order)
    scores = cross_encoder_model.predict(pairs)

    # Combine scores with documents
    docs_with_scores = [
        {
            'doc_name': doc['doc_name'],
            'page_number': doc['page_number'],
            'content': doc['content'],  # FULL text
            'initial_rank': doc['rank'],
            'initial_score': doc['score'],
            'rerank_score': float(score)
        }
        for doc, score in zip(retrieved_docs, scores)
    ]

    # Sort by rerank score (descending) and take top_k; the sort is stable, so
    # equal scores keep their retrieval order
    docs_with_scores.sort(key=lambda x: x['rerank_score'], reverse=True)
    results = docs_with_scores[:top_k]

    # Assign new 1-based ranks
    for rank, doc in enumerate(results, start=1):
        doc['rank'] = rank

    return results

print("✓ rerank_with_cross_encoder() defined")

# %%
def rerank_documents(
    query: str,
    retrieved_docs: List[Dict],
    reranker_model: str,
    reranker_instance: Any,
    top_k: int
) -> List[Dict]:
    """
    Dispatch a re-ranking request to the backend that matches the reranker.

    The Voyage API path is chosen purely by model name; every other model
    must come with a loaded CrossEncoder instance.

    Args:
        query: Search query
        retrieved_docs: List from retrieval
        reranker_model: Model name/identifier
        reranker_instance: Loaded model instance or 'api' for Voyage
        top_k: Number of documents to return after re-ranking

    Returns:
        Re-ranked document list

    Raises:
        ValueError: If the model is not the Voyage reranker and the instance
            is not a CrossEncoder
    """
    # Voyage is recognised by name only; reranker_instance is not consulted on this path
    use_voyage_api = reranker_model == 'voyage-rerank-2.5'
    if use_voyage_api:
        return rerank_with_voyage(query, retrieved_docs, reranker_model, top_k)

    # Anything else has to be a locally loaded Hugging Face cross-encoder
    if not isinstance(reranker_instance, CrossEncoder):
        raise ValueError(f"Unknown reranker type for model: {reranker_model}")

    return rerank_with_cross_encoder(query, retrieved_docs, reranker_instance, top_k)

print("✓ rerank_documents() defined")

# %%
# Smoke-test the cross-encoder re-ranker on three hand-written chunks
print("\n" + "=" * 80)
print("Testing re-ranking functions...")
print("=" * 80)

# Hand-written retrieval results, given as (page_number, content, rank, score)
sample_retrieved_docs = [
    {
        'doc_name': 'test_doc.pdf',
        'page_number': page_number,
        'content': content,
        'rank': rank,
        'score': score
    }
    for page_number, content, rank, score in [
        (1, 'This is a highly relevant document about financial reporting and annual statements.', 1, 0.95),
        (2, 'This document discusses unrelated topics like weather and sports.', 2, 0.85),
        (3, 'Annual financial statements and revenue details for the fiscal year.', 3, 0.80),
    ]
]

sample_query = "What was the company's revenue in 2023?"

print(f"\nSample query: {sample_query}")
print(f"Sample documents: {len(sample_retrieved_docs)}")

# The check only runs when the MiniLM cross-encoder is present in reranker_instances;
# any error is reported instead of stopping the notebook.
try:
    test_reranker_name = 'cross-encoder/ms-marco-MiniLM-L-12-v2'
    if test_reranker_name not in reranker_instances:
        print(f"\n⚠️  Reranker {test_reranker_name} not loaded. Skipping test.")
    else:
        print(f"\nTesting with {test_reranker_name}...")
        test_reranker = reranker_instances[test_reranker_name]

        # Keep all three chunks so the complete new ordering is visible
        reranked = rerank_with_cross_encoder(
            sample_query, sample_retrieved_docs, test_reranker, top_k=3
        )

        print("  ✓ Re-ranking successful!")

        print("\n  Initial order:")
        for doc in sample_retrieved_docs:
            print(f"    Rank {doc['rank']}: Page {doc['page_number']} (score: {doc['score']:.3f})")

        print("\n  Re-ranked order:")
        for doc in reranked:
            print(f"    Rank {doc['rank']}: Page {doc['page_number']} (initial rank: {doc['initial_rank']}, rerank score: {doc['rerank_score']:.3f})")

except Exception as e:
    print(f"\n✗ Re-ranking test failed: {e}")

# %%
# Recap of the re-ranking helpers defined in this section (one line per argument)
print(
    "\n" + "=" * 80,
    "✓ STEP 6 COMPLETE: Re-ranking Functions Implemented",
    "=" * 80,
    "\nImplemented functions:",
    "  ✓ rerank_with_voyage() - Voyage API re-ranking with rate limiting",
    "  ✓ rerank_with_cross_encoder() - HuggingFace cross-encoder re-ranking",
    "  ✓ rerank_documents() - Universal re-ranking router",
    "\nNext step: Implement main pipeline (simplified, no metrics)",
    sep="\n",
)

✓ rerank_with_voyage() defined with rate limit handling
✓ rerank_with_cross_encoder() defined
✓ rerank_documents() defined

Testing re-ranking functions...

Sample query: What was the company's revenue in 2023?
Sample documents: 3

Testing with cross-encoder/ms-marco-MiniLM-L-12-v2...
  ✓ Re-ranking successful!

  Initial order:
    Rank 1: Page 1 (score: 0.950)
    Rank 2: Page 2 (score: 0.850)
    Rank 3: Page 3 (score: 0.800)

  Re-ranked order:
    Rank 1: Page 3 (initial rank: 3, rerank score: -7.339)
    Rank 2: Page 1 (initial rank: 1, rerank score: -10.861)
    Rank 3: Page 2 (initial rank: 2, rerank score: -11.263)

✓ STEP 6 COMPLETE: Re-ranking Functions Implemented

Implemented functions:
  ✓ rerank_with_voyage() - Voyage API re-ranking with rate limiting
  ✓ rerank_with_cross_encoder() - HuggingFace cross-encoder re-ranking
  ✓ rerank_documents() - Universal re-ranking router

Next step: Implement main pipeline (simplified, no metrics)


In [13]:
# %% [markdown]
# ## 7. Main Pipeline (Simplified - No Metrics)

# %%
def evaluate_configuration(
    provider: str,
    model: str,
    chunk_size: int,
    k_retrieve: int,
    k_rerank: int,
    reranker_model: str,
    mode: str,
    dataset,
    reranker_instance,
    vector_db_base_dir: str,
    collection_prefix: str,
    output_dir: str
) -> Dict:
    """
    Run re-ranking pipeline for a single configuration.
    Simplified version - no evaluation metrics, just retrieve → rerank → save.

    Queries whose retrieval or re-ranking raises are logged, left out of the
    saved file, and reported in the returned summary so that a partially
    failed run is not mistaken for a complete one.

    Args:
        provider: Embedding provider
        model: Embedding model
        chunk_size: Chunk size
        k_retrieve: Number of documents to retrieve
        k_rerank: Number of documents to keep after reranking
        reranker_model: Reranker model name
        mode: 'global' or 'single' (the query's doc_name is passed to retrieval only in 'single' mode)
        dataset: FinanceBench dataset; each item must provide 'question', 'doc_name',
            'financebench_id', 'question_type' and 'question_reasoning'
        reranker_instance: Loaded reranker model instance
        vector_db_base_dir: Base directory for vector databases
        collection_prefix: Collection name prefix
        output_dir: Output directory for results

    Returns:
        Summary statistics dict. 'status' is always present:
          - 'skipped': results for this configuration already exist
          - 'failed': also has 'error'; when no query succeeded it additionally
            has 'num_failed' and 'failed_queries'
          - 'completed': also has 'num_queries', 'num_failed', 'failed_queries'
            and 'output_path'
        Each 'failed_queries' entry is {'index', 'stage', 'error'}.
    """
    # Imported locally: the notebook's main import cell does not import `time`,
    # so this keeps the function usable regardless of what other cells imported.
    import time

    print(f"\n{'='*80}")
    print(f"Processing: {provider}/{model} | chunk={chunk_size} | k_retrieve={k_retrieve} | k_rerank={k_rerank} | {mode} | reranker={simplify_reranker_name(reranker_model)}")
    print(f"{'='*80}")

    # Check if results already exist
    if check_if_results_exist(provider, model, chunk_size, k_retrieve, k_rerank, mode, reranker_model, output_dir):
        print("⚠️  Results already exist. Skipping...")
        return {'status': 'skipped'}

    # Load vector store
    try:
        print("\n1. Loading vector store...")
        vectorstore = load_vector_store(provider, model, chunk_size, vector_db_base_dir, collection_prefix)
    except Exception as e:
        print(f"❌ Failed to load vector store: {e}")
        return {'status': 'failed', 'error': str(e)}

    # Initialize results storage
    query_results = []
    failed_queries = []  # one entry per query dropped because a stage raised

    # Loop-invariant: only the API-based Voyage reranker is throttled
    throttle_requests = reranker_model == 'voyage-rerank-2.5'

    # Process each query
    print(f"\n2. Processing {len(dataset)} queries...")
    for idx, query_item in enumerate(tqdm(dataset, desc="Queries")):
        question = query_item['question']
        doc_name = query_item['doc_name']

        # Add small delay to avoid rate limits (only for API-based rerankers)
        if throttle_requests:
            time.sleep(0.5)  # 500ms delay between queries

        # Step 1: Retrieve top k_retrieve documents
        try:
            retrieved_k_retrieve = retrieve_documents(
                vectorstore,
                question,
                k=k_retrieve,
                mode=mode,
                doc_name=doc_name if mode == 'single' else None
            )
        except Exception as e:
            print(f"\n⚠️  Query {idx} retrieval failed: {e}")
            failed_queries.append({'index': idx, 'stage': 'retrieval', 'error': str(e)})
            continue

        # Step 2: Rerank ALL k_retrieve documents
        try:
            reranked_all = rerank_documents(
                question,
                retrieved_k_retrieve,
                reranker_model,
                reranker_instance,
                k_retrieve  # Rerank all documents
            )
        except Exception as e:
            print(f"\n⚠️  Query {idx} reranking failed: {e}")
            failed_queries.append({'index': idx, 'stage': 'reranking', 'error': str(e)})
            continue

        # Step 3: Get top k_rerank from reranked results
        reranked_top_k = reranked_all[:k_rerank]

        # Step 4: Format evidence array matching FinanceBench structure
        evidence = [
            {
                'evidence_text': doc['content'],  # FULL text, no truncation
                'doc_name': doc['doc_name'],
                'evidence_page_num': doc['page_number'],
                'rank': doc['rank'],
                'rerank_score': round(doc['rerank_score'], 4)
            }
            for doc in reranked_top_k
        ]

        # Step 5: Store query result matching FinanceBench structure
        query_results.append({
            'financebench_id': query_item['financebench_id'],
            'question_type': query_item['question_type'],
            'question_reasoning': query_item['question_reasoning'],
            'question': question,
            'doc_name': doc_name,
            'evidence': evidence
        })

    # Check if any queries were processed
    num_queries = len(query_results)
    num_failed = len(failed_queries)
    if num_queries == 0:
        print("\n❌ No queries were successfully processed")
        return {
            'status': 'failed',
            'error': 'No queries processed',
            'num_failed': num_failed,
            'failed_queries': failed_queries
        }

    # Prepare final results (file layout unchanged: configuration + queries)
    results = {
        'configuration': {
            'provider': provider,
            'model': model,
            'chunk_size': chunk_size,
            'k_retrieve': k_retrieve,
            'k_rerank': k_rerank,
            'reranker_model': reranker_model,
            'mode': mode
        },
        'queries': query_results
    }

    # Save results
    print(f"\n3. Saving results...")
    save_path = save_results(
        results,
        provider,
        model,
        chunk_size,
        k_retrieve,
        k_rerank,
        mode,
        reranker_model,
        output_dir
    )
    print(f"✓ Results saved to: {save_path}")

    # Print summary
    print(f"\n{'='*80}")
    print("PROCESSING COMPLETE")
    print(f"{'='*80}")
    print(f"\nTotal queries processed: {num_queries}")
    if num_failed:
        # Only shown for partial runs; the saved file lacks these queries, and the
        # existence check at the top of this function will presumably skip this
        # configuration on a later run, so the file must be removed to retry them.
        print(f"⚠️  Queries failed and omitted from output: {num_failed}")
    print(f"Evidence chunks per query: {k_rerank}")
    print(f"Output file: {os.path.basename(save_path)}")
    print(f"{'='*80}")

    return {
        'status': 'completed',
        'num_queries': num_queries,
        'num_failed': num_failed,
        'failed_queries': failed_queries,
        'output_path': save_path
    }

print("✓ evaluate_configuration() defined")

# %%
# Recap of what the main pipeline section provides (one line per argument)
print(
    "\n" + "=" * 80,
    "✓ STEP 7 COMPLETE: Main Pipeline Implemented",
    "=" * 80,
    "\nImplemented functions:",
    "  ✓ evaluate_configuration() - Complete re-ranking pipeline (simplified)",
    "\nKey features:",
    "  - No evaluation metrics (MRR, recall, etc.)",
    "  - Full chunk text (no truncation)",
    "  - FinanceBench-compatible output format",
    "  - Evidence array with rank and rerank_score",
    "\nNext step: Batch execution",
    sep="\n",
)

✓ evaluate_configuration() defined

✓ STEP 7 COMPLETE: Main Pipeline Implemented

Implemented functions:
  ✓ evaluate_configuration() - Complete re-ranking pipeline (simplified)

Key features:
  - No evaluation metrics (MRR, recall, etc.)
  - Full chunk text (no truncation)
  - FinanceBench-compatible output format
  - Evidence array with rank and rerank_score

Next step: Batch execution


In [14]:
# %% [markdown]
# ## 8. Batch Evaluation Execution

# %%
def run_batch_evaluation(
    configurations: List[Dict],
    modes: List[str],
    dataset,
    reranker_instances: Dict,
    vector_db_base_dir: str,
    collection_prefix: str,
    output_dir: str
):
    """
    Run the re-ranking pipeline over every combination described by the configurations.

    One run is executed per (chunk size, reranker model, mode) triple of each
    configuration. A reranker name with no entry in ``reranker_instances`` and
    any run that raises are both counted as failed; the batch keeps going.

    Args:
        configurations: List of configuration dicts with keys 'provider', 'model',
            'chunk_sizes', 'k_retrieve', 'k_rerank' and 'reranker_models'
        modes: List of modes ('global', 'single')
        dataset: FinanceBench dataset
        reranker_instances: Dict of loaded reranker models, keyed by reranker name
        vector_db_base_dir: Base directory for vector databases
        collection_prefix: Collection name prefix
        output_dir: Output directory

    Returns:
        Dict with 'total_runs', 'completed', 'skipped', 'failed' and 'results'
        (the summaries of the completed runs only)
    """
    import traceback

    print("\n" + "#" * 80)
    print("STARTING BATCH RE-RANKING")
    print("#" * 80)

    # Planned number of runs across all configurations
    total_runs = sum(
        len(cfg['chunk_sizes']) * len(cfg['reranker_models']) * len(modes)
        for cfg in configurations
    )

    print(f"\nTotal runs: {total_runs}")
    print(f"Output directory: {output_dir}")

    # Outcome counters plus the summaries of the runs that completed
    tally = {'completed': 0, 'skipped': 0, 'failed': 0}
    completed_summaries = []

    for position, cfg in enumerate(configurations, 1):
        provider, model = cfg['provider'], cfg['model']
        chunk_sizes = cfg['chunk_sizes']
        k_retrieve, k_rerank = cfg['k_retrieve'], cfg['k_rerank']
        reranker_models = cfg['reranker_models']

        print("\n" + "#" * 80)
        print(f"Configuration {position}/{len(configurations)}: {provider}/{model}")
        print("#" * 80)

        # Same iteration order as three nested loops: chunk size → reranker → mode
        run_grid = (
            (chunk_size, reranker_model, mode)
            for chunk_size in chunk_sizes
            for reranker_model in reranker_models
            for mode in modes
        )

        for chunk_size, reranker_model, mode in run_grid:
            # A reranker that was never loaded cannot be run; count it as failed
            reranker_instance = reranker_instances.get(reranker_model)
            if reranker_instance is None:
                print(f"\n⚠️  Reranker {reranker_model} not found. Skipping...")
                tally['failed'] += 1
                continue

            try:
                outcome = evaluate_configuration(
                    provider=provider,
                    model=model,
                    chunk_size=chunk_size,
                    k_retrieve=k_retrieve,
                    k_rerank=k_rerank,
                    reranker_model=reranker_model,
                    mode=mode,
                    dataset=dataset,
                    reranker_instance=reranker_instance,
                    vector_db_base_dir=vector_db_base_dir,
                    collection_prefix=collection_prefix,
                    output_dir=output_dir
                )

                status = outcome['status']
                if status == 'completed':
                    tally['completed'] += 1
                    completed_summaries.append(outcome)
                elif status == 'skipped':
                    tally['skipped'] += 1
                else:
                    # Any other status is treated as a failure
                    tally['failed'] += 1

            except Exception as e:
                # One broken run must not abort the batch; log it and move on
                print(f"\n❌ Evaluation failed with exception: {e}")
                traceback.print_exc()
                tally['failed'] += 1

    # Print final summary
    print("\n" + "#" * 80)
    print("BATCH RE-RANKING COMPLETE")
    print("#" * 80)
    print(f"\nTotal runs: {total_runs}")
    print(f"  ✓ Completed: {tally['completed']}")
    print(f"  ⊘ Skipped:   {tally['skipped']}")
    print(f"  ✗ Failed:    {tally['failed']}")
    print(f"\nResults saved to: {output_dir}")

    return {
        'total_runs': total_runs,
        'completed': tally['completed'],
        'skipped': tally['skipped'],
        'failed': tally['failed'],
        'results': completed_summaries
    }

print("✓ run_batch_evaluation() defined")

# %%
# Show what the batch run will do before it is started
print("\n" + "=" * 80)
print("EVALUATION PLAN")
print("=" * 80)

print(f"\nDataset: FinanceBench ({len(dataset)} queries)")

print("\nEvaluation Settings:")
print(f"  Modes: {modes}")

print("\nConfigurations to evaluate:")
total_runs = 0
for i, config in enumerate(configurations, 1):
    provider, model = config['provider'], config['model']
    chunk_sizes = config['chunk_sizes']
    k_retrieve, k_rerank = config['k_retrieve'], config['k_rerank']
    reranker_models = config['reranker_models']

    # One run per (chunk size, reranker, mode) combination
    runs_for_config = len(chunk_sizes) * len(reranker_models) * len(modes)
    total_runs += runs_for_config

    print(f"\n  {i}. {provider}/{model}")
    print(f"     Chunk sizes: {chunk_sizes}")
    print(f"     k_retrieve: {k_retrieve}, k_rerank: {k_rerank}")
    print(f"     Reranker models: {len(reranker_models)}")
    for reranker in reranker_models:
        print(f"       - {reranker}")
    print(f"     Total runs: {runs_for_config}")

    # Preview a few output filenames: first chunk size and first two rerankers only
    print("     Sample output files:")
    for chunk_size in chunk_sizes[:1]:
        for reranker in reranker_models[:2]:
            for mode in modes:
                filename = get_output_filename(provider, model, chunk_size, k_retrieve, k_rerank, mode, reranker)
                exists = check_if_results_exist(provider, model, chunk_size, k_retrieve, k_rerank, mode, reranker, OUTPUT_DIR)
                if exists:
                    status = "EXISTS"
                else:
                    status = "TO CREATE"
                print(f"       - {filename} [{status}]")
        # Mention how many rerankers were left out of the preview
        if len(reranker_models) > 2:
            print(f"       ... ({len(reranker_models) - 2} more rerankers)")

print("\n" + "=" * 80)
print(f"Total evaluation runs: {total_runs}")
print(f"Output directory: {OUTPUT_DIR}")
print("=" * 80)

# %%
# Instructions for starting the batch run (one line per argument)
print(
    "\n" + "=" * 80,
    "✓ STEP 8 COMPLETE: Batch Evaluation Ready",
    "=" * 80,
    "\nTo run the evaluation:",
    "  1. Review the evaluation plan above",
    "  2. Run the next cell to start batch evaluation",
    "  3. Monitor progress (this may take time depending on configurations)",
    "\nNote: You can interrupt and resume anytime - existing results are skipped",
    sep="\n",
)

✓ run_batch_evaluation() defined

EVALUATION PLAN

Dataset: FinanceBench (150 queries)

Evaluation Settings:
  Modes: ['global', 'single']

Configurations to evaluate:

  1. voyage/voyage-3-large
     Chunk sizes: [512]
     k_retrieve: 80, k_rerank: 20
     Reranker models: 3
       - cross-encoder/ms-marco-MiniLM-L-12-v2
       - BAAI/bge-reranker-large
       - voyage-rerank-2.5
     Total runs: 6
     Sample output files:
       - voyage_voyage-3-large_chunk512_k80_global_rerank_k20-ms-marco-MiniLM-L-12-v2.json [TO CREATE]
       - voyage_voyage-3-large_chunk512_k80_single_rerank_k20-ms-marco-MiniLM-L-12-v2.json [TO CREATE]
       - voyage_voyage-3-large_chunk512_k80_global_rerank_k20-bge-reranker-large.json [TO CREATE]
       - voyage_voyage-3-large_chunk512_k80_single_rerank_k20-bge-reranker-large.json [TO CREATE]
       ... (1 more rerankers)

  2. ollama/nomic-embed-text
     Chunk sizes: [512]
     k_retrieve: 80, k_rerank: 20
     Reranker models: 3
       - cross-encoder/ms-

In [None]:
# Test configuration (temporary - just for verification): one embedding model,
# one chunk size and a single reranker.
# NOTE: the batch cell below passes `configurations`, so this list only takes
# effect if it is handed to run_batch_evaluation() instead.
test_configurations = [
    dict(
        provider='voyage',
        model='voyage-3-large',
        chunk_sizes=[512],
        k_retrieve=80,
        k_rerank=20,
        reranker_models=['voyage-rerank-2.5'],  # Just one reranker for testing
    )
]

In [None]:
# %% [markdown]
# ## 9. Execute Batch Evaluation
# 
# **Run this cell to start the batch re-ranking process**

# %%
# Run batch evaluation over every configuration, reranker and mode.
# Configurations whose output file already exists are reported as skipped.
batch_results = run_batch_evaluation(
    configurations=configurations,
    modes=modes,
    dataset=dataset,
    reranker_instances=reranker_instances,
    vector_db_base_dir=VECTOR_DB_BASE_DIR,
    collection_prefix=COLLECTION_PREFIX,
    output_dir=OUTPUT_DIR
)

# %%
# Display final summary
print("\n" + "="*80)
print("FINAL SUMMARY")
print("="*80)
print(f"\nTotal runs attempted: {batch_results['total_runs']}")
print(f"  ✓ Successfully completed: {batch_results['completed']}")
print(f"  ⊘ Skipped (already exists): {batch_results['skipped']}")
print(f"  ✗ Failed: {batch_results['failed']}")

if batch_results['completed'] > 0:
    print(f"\n✅ Successfully created {batch_results['completed']} JSON files")
    print(f"\nOutput directory: {OUTPUT_DIR}")
    print("\nYou can now use these JSON files in your answer generation pipeline!")
elif batch_results['failed'] == 0:
    # Nothing was created and nothing failed: every run was skipped (or none were planned)
    print("\n⚠️  No new files were created. Check if results already exist.")

# Report failures separately so a failed batch is not mistaken for "results already exist"
if batch_results['failed'] > 0:
    print(f"\n⚠️  {batch_results['failed']} run(s) failed. Check the error messages above.")