In [5]:
# ============================================================================
# STEP 1: Setup and Imports
# ============================================================================

# %% [markdown]
# # Hybrid Retrieval Implementation
# 
# This notebook implements hybrid retrieval (Dense + BM25) for financial QA.
# We'll build this step-by-step, testing each section before moving forward.

# %% [markdown]
# ## Step 1: Setup and Imports
# 
# **Goal:** Import all required libraries and load configuration

# %% [markdown]
# ### 1.1 Import Libraries

# %%
import os
import pickle
from typing import List, Dict, Tuple

# Environment
from dotenv import load_dotenv

# Progress bars
from tqdm.auto import tqdm

# LangChain for vector stores
from langchain.docstore.document import Document as LCDocument
from langchain.vectorstores import Chroma

# VoyageAI embeddings
from langchain_voyageai import VoyageAIEmbeddings

# BM25 for keyword-based retrieval
from rank_bm25 import BM25Okapi

# Dataset
from datasets import load_dataset

print("✓ All imports successful")

# %% [markdown]
# ### 1.2 Load Environment Variables

# %%
load_dotenv()

VOYAGE_API_KEY = os.getenv("VOYAGE_API_KEY")

if VOYAGE_API_KEY:
    print("✓ VoyageAI API key loaded")
else:
    raise ValueError("❌ VoyageAI API key not found in .env file")

# %% [markdown]
# ### 1.3 Configuration

# %%
# Paths
VECTOR_DB_DIR = "../../vector_databases"
BM25_INDEX_DIR = "../../bm25_indices"

# Dataset
DATASET_NAME = "PatronusAI/financebench"
DATASET_SPLIT = "train"

# Settings
COLLECTION_PREFIX = "financebench_docs_chunk_"
EMBEDDING_PROVIDER = "voyage"
EMBEDDING_MODEL = "voyage-finance-2"
CHUNK_SIZES = [512, 1024]

print("✓ Configuration set")
print(f"  Vector DB: {VECTOR_DB_DIR}")
print(f"  BM25 Index: {BM25_INDEX_DIR}")
print(f"  Embedding: {EMBEDDING_PROVIDER}/{EMBEDDING_MODEL}")
print(f"  Chunk Sizes: {CHUNK_SIZES}")

# %% [markdown]
# ### 1.4 Create Directories

# %%
os.makedirs(BM25_INDEX_DIR, exist_ok=True)
print(f"✓ BM25 directory ready: {BM25_INDEX_DIR}")

# %% [markdown]
# ### 1.5 Helper Functions

# %%
def get_embedding_function(provider: str, model: str):
    """Get embedding function."""
    if provider == "voyage":
        return VoyageAIEmbeddings(model=model, voyage_api_key=VOYAGE_API_KEY)
    else:
        raise ValueError(f"Unknown provider: {provider}")


def get_db_path(base_dir: str, provider: str, model: str) -> str:
    """Get database path for embedding."""
    model_id = f"{provider}_{model.replace('/', '_')}"
    return os.path.join(base_dir, model_id)


def get_bm25_path(base_dir: str, chunk_size: int) -> str:
    """Get path for BM25 index file."""
    return os.path.join(base_dir, f"bm25_chunk_{chunk_size}.pkl")


print("✓ Helper functions defined")

# %% [markdown]
# ### 1.6 Load Dataset

# %%
print("Loading FinanceBench dataset...")
dataset = load_dataset(DATASET_NAME, split=DATASET_SPLIT)
print(f"✓ Loaded {len(dataset)} questions")

# Show sample
sample = dataset[0]
print(f"\nSample question:")
print(f"  Question: {sample['question']}")
print(f"  Document: {sample['doc_name']}")

# %%
print("\n" + "="*60)
print("✅ STEP 1 COMPLETE")
print("="*60)
print("All imports successful, configuration loaded, dataset ready.")
print("\nTest this step, then I'll provide Step 2!")

✓ All imports successful
✓ VoyageAI API key loaded
✓ Configuration set
  Vector DB: ../../vector_databases
  BM25 Index: ../../bm25_indices
  Embedding: voyage/voyage-finance-2
  Chunk Sizes: [512, 1024]
✓ BM25 directory ready: ../../bm25_indices
✓ Helper functions defined
Loading FinanceBench dataset...
✓ Loaded 150 questions

Sample question:
  Question: What is the FY2018 capital expenditure amount (in USD millions) for 3M? Give a response to the question by relying on the details shown in the cash flow statement.
  Document: 3M_2018_10K

✅ STEP 1 COMPLETE
All imports successful, configuration loaded, dataset ready.

Test this step, then I'll provide Step 2!


In [6]:
# ============================================================================
# STEP 2: Load Vectorstores and Extract Chunks
# ============================================================================

# %% [markdown]
# ## Step 2: Load Vectorstores and Extract Chunks
# 
# **Goal:** Load existing ChromaDB vectorstores and extract all chunks for BM25 indexing

# %% [markdown]
# ### 2.1 Load Vectorstore Function

# %%
def load_vectorstore(
    embedding_provider: str,
    embedding_model: str,
    chunk_size: int,
    base_db_dir: str = "../../vector_databases",
    collection_prefix: str = "financebench_docs_chunk_"
) -> Chroma:
    """Load a vectorstore for a specific chunk size."""
    db_path = get_db_path(base_db_dir, embedding_provider, embedding_model)
    collection_name = f"{collection_prefix}{chunk_size}"
    
    print(f"\nLoading vectorstore:")
    print(f"  Path: {db_path}")
    print(f"  Collection: {collection_name}")
    
    if not os.path.exists(db_path):
        raise ValueError(f"Database not found at: {db_path}")
    
    emb_fn = get_embedding_function(embedding_provider, embedding_model)
    
    vectorstore = Chroma(
        collection_name=collection_name,
        embedding_function=emb_fn,
        persist_directory=db_path
    )
    
    count = vectorstore._collection.count()
    print(f"  Documents: {count:,}")
    
    return vectorstore


print("✓ Load vectorstore function defined")

# %% [markdown]
# ### 2.2 Extract Chunks Function

# %%
def extract_chunks_from_vectorstore(vectorstore: Chroma) -> Dict:
    """Extract all chunks with their texts and metadata."""
    print("\nExtracting chunks...")
    
    collection = vectorstore._collection
    results = collection.get(include=["documents", "metadatas"])
    
    chunk_data = {
        'ids': results['ids'],
        'texts': results['documents'],
        'metadatas': results['metadatas']
    }
    
    print(f"✓ Extracted {len(chunk_data['texts']):,} chunks")
    
    # Show sample
    if chunk_data['texts']:
        print(f"\nSample chunk:")
        print(f"  Length: {len(chunk_data['texts'][0])} chars")
        print(f"  Preview: {chunk_data['texts'][0][:150]}...")
    
    return chunk_data


print("✓ Extract chunks function defined")

# %% [markdown]
# ### 2.3 Load and Extract Chunk Size 512

# %%
print("\n" + "="*60)
print("Processing Chunk Size 512")
print("="*60)

vectorstore_512 = load_vectorstore(
    embedding_provider=EMBEDDING_PROVIDER,
    embedding_model=EMBEDDING_MODEL,
    chunk_size=512,
    base_db_dir=VECTOR_DB_DIR
)

chunks_512 = extract_chunks_from_vectorstore(vectorstore_512)

# %% [markdown]
# ### 2.4 Load and Extract Chunk Size 1024

# %%
print("\n" + "="*60)
print("Processing Chunk Size 1024")
print("="*60)

vectorstore_1024 = load_vectorstore(
    embedding_provider=EMBEDDING_PROVIDER,
    embedding_model=EMBEDDING_MODEL,
    chunk_size=1024,
    base_db_dir=VECTOR_DB_DIR
)

chunks_1024 = extract_chunks_from_vectorstore(vectorstore_1024)

# %% [markdown]
# ### 2.5 Summary Statistics

# %%
print("\n" + "="*60)
print("EXTRACTION SUMMARY")
print("="*60)

for chunk_size, chunks in [(512, chunks_512), (1024, chunks_1024)]:
    texts = chunks['texts']
    print(f"\nChunk Size {chunk_size}:")
    print(f"  Total chunks: {len(texts):,}")
    print(f"  Total chars: {sum(len(t) for t in texts):,}")
    print(f"  Avg length: {sum(len(t) for t in texts) / len(texts):.0f} chars")
    print(f"  Min length: {min(len(t) for t in texts):,} chars")
    print(f"  Max length: {max(len(t) for t in texts):,} chars")

# %%
print("\n" + "="*60)
print("✅ STEP 2 COMPLETE")
print("="*60)
print("Chunks extracted and ready for BM25 indexing.")
print("\nTest this step, then I'll provide Step 3!")

✓ Load vectorstore function defined
✓ Extract chunks function defined

Processing Chunk Size 512

Loading vectorstore:
  Path: ../../vector_databases/voyage_voyage-finance-2
  Collection: financebench_docs_chunk_512


  vectorstore = Chroma(


  Documents: 28,634

Extracting chunks...
✓ Extracted 28,634 chunks

Sample chunk:
  Length: 1606 chars
  Preview: Table of Contents
UNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549 
FORM 10-K
☒
ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) O...

Processing Chunk Size 1024

Loading vectorstore:
  Path: ../../vector_databases/voyage_voyage-finance-2
  Collection: financebench_docs_chunk_1024
  Documents: 15,765

Extracting chunks...
✓ Extracted 15,765 chunks

Sample chunk:
  Length: 3576 chars
  Preview: Table of Contents
UNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549 
FORM 10-K
☒
ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) O...

EXTRACTION SUMMARY

Chunk Size 512:
  Total chunks: 28,634
  Total chars: 43,780,927
  Avg length: 1529 chars
  Min length: 1 chars
  Max length: 4,103 chars

Chunk Size 1024:
  Total chunks: 15,765
  Total chars: 42,395,002
  Avg length: 2689 chars
  Min length: 1 chars
  Max length: 7,205 chars

✅ STEP 2 COMPL

In [7]:
# ============================================================================
# STEP 3: Build BM25 Indices
# ============================================================================

# %% [markdown]
# ## Step 3: Build BM25 Indices
# 
# **Goal:** Create BM25 indices from extracted chunks for keyword-based retrieval

# %% [markdown]
# ### 3.1 Tokenization Function
# 
# BM25 needs to tokenize text. We'll use simple whitespace tokenization.

# %%
def simple_tokenize(text: str) -> List[str]:
    """Simple tokenization: lowercase and split by whitespace."""
    return text.lower().split()


print("✓ Tokenization function defined")

# %% [markdown]
# ### 3.2 Build BM25 Index Function

# %%
def build_bm25_index(chunks: Dict, chunk_size: int) -> Dict:
    """
    Build BM25 index from chunks.
    
    Returns a dictionary with:
    - bm25: BM25Okapi object
    - ids: List of chunk IDs (same order as corpus)
    - metadatas: List of metadata dicts
    """
    print(f"\nBuilding BM25 index for chunk size {chunk_size}...")
    
    texts = chunks['texts']
    ids = chunks['ids']
    metadatas = chunks['metadatas']
    
    # Tokenize all texts
    print(f"  Tokenizing {len(texts):,} chunks...")
    tokenized_corpus = [simple_tokenize(text) for text in tqdm(texts, desc="Tokenizing")]
    
    # Build BM25 index
    print(f"  Building BM25 index...")
    bm25 = BM25Okapi(tokenized_corpus)
    
    bm25_data = {
        'bm25': bm25,
        'ids': ids,
        'metadatas': metadatas,
        'texts': texts,  # Keep original texts for retrieval
        'chunk_size': chunk_size
    }
    
    print(f"✓ BM25 index built with {len(texts):,} documents")
    
    return bm25_data


print("✓ Build BM25 index function defined")

# %% [markdown]
# ### 3.3 Save BM25 Index Function

# %%
def save_bm25_index(bm25_data: Dict, chunk_size: int, output_dir: str):
    """Save BM25 index to pickle file."""
    os.makedirs(output_dir, exist_ok=True)
    output_path = get_bm25_path(output_dir, chunk_size)
    
    with open(output_path, 'wb') as f:
        pickle.dump(bm25_data, f)
    
    print(f"✓ Saved BM25 index to: {output_path}")
    
    # Print file size
    file_size_mb = os.path.getsize(output_path) / (1024 * 1024)
    print(f"  File size: {file_size_mb:.2f} MB")


print("✓ Save BM25 index function defined")

# %% [markdown]
# ### 3.4 Load BM25 Index Function (for later use)

# %%
def load_bm25_index(chunk_size: int, index_dir: str) -> Dict:
    """Load BM25 index from pickle file."""
    index_path = get_bm25_path(index_dir, chunk_size)
    
    if not os.path.exists(index_path):
        raise ValueError(f"BM25 index not found at: {index_path}")
    
    with open(index_path, 'rb') as f:
        bm25_data = pickle.load(f)
    
    print(f"✓ Loaded BM25 index from: {index_path}")
    print(f"  Documents: {len(bm25_data['ids']):,}")
    
    return bm25_data


print("✓ Load BM25 index function defined")

# %% [markdown]
# ### 3.5 Build and Save BM25 Index for Chunk Size 512

# %%
print("\n" + "="*60)
print("Building BM25 Index for Chunk Size 512")
print("="*60)

bm25_index_512 = build_bm25_index(chunks_512, 512)
save_bm25_index(bm25_index_512, 512, BM25_INDEX_DIR)

# %% [markdown]
# ### 3.6 Build and Save BM25 Index for Chunk Size 1024

# %%
print("\n" + "="*60)
print("Building BM25 Index for Chunk Size 1024")
print("="*60)

bm25_index_1024 = build_bm25_index(chunks_1024, 1024)
save_bm25_index(bm25_index_1024, 1024, BM25_INDEX_DIR)

# %% [markdown]
# ### 3.7 Test BM25 Search
# 
# Let's test the BM25 index with a sample query

# %%
print("\n" + "="*60)
print("Testing BM25 Search")
print("="*60)

# Test query
test_query = "What was Apple's revenue in 2022?"
print(f"\nTest Query: {test_query}")

# Tokenize query
tokenized_query = simple_tokenize(test_query)
print(f"Tokenized: {tokenized_query}")

# Search with BM25 (chunk 512)
print(f"\nTop 3 results from BM25 (chunk 512):")
scores = bm25_index_512['bm25'].get_scores(tokenized_query)

# Get top 3 indices
import numpy as np
top_indices = np.argsort(scores)[::-1][:3]

for rank, idx in enumerate(top_indices, 1):
    score = scores[idx]
    text = bm25_index_512['texts'][idx]
    print(f"\n  Rank {rank} (Score: {score:.4f}):")
    print(f"    {text[:200]}...")

# %%
print("\n" + "="*60)
print("✅ STEP 3 COMPLETE")
print("="*60)
print("BM25 indices built and saved successfully.")
print(f"Indices saved to: {BM25_INDEX_DIR}")
print("\nTest this step, then I'll provide Step 4!")

✓ Tokenization function defined
✓ Build BM25 index function defined
✓ Save BM25 index function defined
✓ Load BM25 index function defined

Building BM25 Index for Chunk Size 512

Building BM25 index for chunk size 512...
  Tokenizing 28,634 chunks...


Tokenizing:   0%|          | 0/28634 [00:00<?, ?it/s]

  Building BM25 index...
✓ BM25 index built with 28,634 documents
✓ Saved BM25 index to: ../../bm25_indices/bm25_chunk_512.pkl
  File size: 87.73 MB

Building BM25 Index for Chunk Size 1024

Building BM25 index for chunk size 1024...
  Tokenizing 15,765 chunks...


Tokenizing:   0%|          | 0/15765 [00:00<?, ?it/s]

  Building BM25 index...
✓ BM25 index built with 15,765 documents
✓ Saved BM25 index to: ../../bm25_indices/bm25_chunk_1024.pkl
  File size: 78.69 MB

Testing BM25 Search

Test Query: What was Apple's revenue in 2022?
Tokenized: ['what', 'was', "apple's", 'revenue', 'in', '2022?']

Top 3 results from BM25 (chunk 512):

  Rank 1 (Score: 17.2535):
    Once revenue is allocated to software or software-related elements as a group, 
we recognize revenue in conformance with software revenue accounting guidance. Revenue is recognized when revenue recogn...

  Rank 2 (Score: 17.0497):
    Revenue is recognized when revenue recognition 
criteria are met for each element.
We are generally unable to establish VSOE or TPE for non-software elements and as such, we use BESP. BESP is generall...

  Rank 3 (Score: 17.0164):
    Pricing practices taken into 
consideration include historic contractually stated prices, volume discounts where applicable and our price lists. We must estimate 
certain royal

In [9]:
# ============================================================================
# STEP 4: Hybrid Retrieval with RRF Fusion
# ============================================================================

# %% [markdown]
# ## Step 4: Hybrid Retrieval with RRF Fusion
# 
# **Goal:** Implement hybrid retrieval that combines dense (embeddings) and sparse (BM25) retrieval using Reciprocal Rank Fusion

# %% [markdown]
# ### 4.1 Dense Retrieval Function

# %%
def dense_retrieval(
    vectorstore: Chroma,
    query: str,
    k: int = 40
) -> List[Tuple[str, float, Dict]]:
    """
    Perform dense retrieval using embeddings.
    
    Returns list of (chunk_id, score, metadata) tuples.
    """
    results = vectorstore.similarity_search_with_score(query, k=k)
    
    # Format results: (id, score, metadata)
    dense_results = []
    for doc, score in results:
        # Get the document ID from metadata or generate one
        doc_id = doc.metadata.get('id', str(hash(doc.page_content)))
        dense_results.append((doc_id, score, doc.metadata, doc.page_content))
    
    return dense_results


print("✓ Dense retrieval function defined")

# %% [markdown]
# ### 4.2 BM25 Retrieval Function

# %%
def bm25_retrieval(
    bm25_data: Dict,
    query: str,
    k: int = 40
) -> List[Tuple[str, float, Dict]]:
    """
    Perform BM25 retrieval using keyword matching.
    
    Returns list of (chunk_id, score, metadata) tuples.
    """
    import numpy as np
    
    # Tokenize query
    tokenized_query = simple_tokenize(query)
    
    # Get BM25 scores
    scores = bm25_data['bm25'].get_scores(tokenized_query)
    
    # Get top k indices
    top_indices = np.argsort(scores)[::-1][:k]
    
    # Format results: (id, score, metadata, text)
    bm25_results = []
    for idx in top_indices:
        chunk_id = bm25_data['ids'][idx]
        score = scores[idx]
        metadata = bm25_data['metadatas'][idx]
        text = bm25_data['texts'][idx]
        bm25_results.append((chunk_id, score, metadata, text))
    
    return bm25_results


print("✓ BM25 retrieval function defined")

# %% [markdown]
# ### 4.3 Reciprocal Rank Fusion (RRF)

# %%
def reciprocal_rank_fusion(
    dense_results: List[Tuple],
    bm25_results: List[Tuple],
    k: int = 60,
    final_k: int = 20
) -> List[Dict]:
    """
    Combine dense and BM25 results using Reciprocal Rank Fusion.
    
    Args:
        dense_results: List of (id, score, metadata, text) from dense retrieval
        bm25_results: List of (id, score, metadata, text) from BM25 retrieval
        k: RRF constant (typically 60)
        final_k: Number of final results to return
    
    Returns:
        List of dicts with combined results
    """
    from collections import defaultdict
    
    # Store RRF scores and document info
    rrf_scores = defaultdict(float)
    doc_info = {}  # Store metadata and text for each doc
    
    # Add dense retrieval ranks
    for rank, (doc_id, score, metadata, text) in enumerate(dense_results, 1):
        rrf_scores[doc_id] += 1 / (k + rank)
        if doc_id not in doc_info:
            doc_info[doc_id] = {
                'metadata': metadata,
                'text': text,
                'dense_rank': rank,
                'dense_score': score,
                'bm25_rank': None,
                'bm25_score': None
            }
        else:
            doc_info[doc_id]['dense_rank'] = rank
            doc_info[doc_id]['dense_score'] = score
    
    # Add BM25 retrieval ranks
    for rank, (doc_id, score, metadata, text) in enumerate(bm25_results, 1):
        rrf_scores[doc_id] += 1 / (k + rank)
        if doc_id not in doc_info:
            doc_info[doc_id] = {
                'metadata': metadata,
                'text': text,
                'dense_rank': None,
                'dense_score': None,
                'bm25_rank': rank,
                'bm25_score': score
            }
        else:
            doc_info[doc_id]['bm25_rank'] = rank
            doc_info[doc_id]['bm25_score'] = score
    
    # Sort by RRF score and get top k
    sorted_docs = sorted(rrf_scores.items(), key=lambda x: x[1], reverse=True)[:final_k]
    
    # Format final results
    final_results = []
    for doc_id, rrf_score in sorted_docs:
        result = {
            'id': doc_id,
            'rrf_score': rrf_score,
            'text': doc_info[doc_id]['text'],
            'metadata': doc_info[doc_id]['metadata'],
            'dense_rank': doc_info[doc_id]['dense_rank'],
            'dense_score': doc_info[doc_id]['dense_score'],
            'bm25_rank': doc_info[doc_id]['bm25_rank'],
            'bm25_score': doc_info[doc_id]['bm25_score']
        }
        final_results.append(result)
    
    return final_results


print("✓ RRF function defined")

# %% [markdown]
# ### 4.4 Complete Hybrid Retrieval Function

# %%
def hybrid_retrieval(
    vectorstore: Chroma,
    bm25_data: Dict,
    query: str,
    k_retrieve: int = 40,
    k_final: int = 20,
    rrf_k: int = 60
) -> List[Dict]:
    """
    Perform hybrid retrieval: Dense + BM25 with RRF fusion.
    
    Args:
        vectorstore: ChromaDB vectorstore
        bm25_data: BM25 index data
        query: Search query
        k_retrieve: Number of results to retrieve from each method
        k_final: Number of final results after fusion
        rrf_k: RRF constant
    
    Returns:
        List of results sorted by RRF score
    """
    print(f"\nHybrid Retrieval for: '{query}'")
    print(f"  Retrieving top-{k_retrieve} from each method...")
    
    # Dense retrieval
    dense_results = dense_retrieval(vectorstore, query, k=k_retrieve)
    print(f"  ✓ Dense retrieval: {len(dense_results)} results")
    
    # BM25 retrieval
    bm25_results = bm25_retrieval(bm25_data, query, k=k_retrieve)
    print(f"  ✓ BM25 retrieval: {len(bm25_results)} results")
    
    # Fusion
    print(f"  Fusing with RRF (k={rrf_k})...")
    hybrid_results = reciprocal_rank_fusion(
        dense_results, 
        bm25_results, 
        k=rrf_k, 
        final_k=k_final
    )
    print(f"  ✓ Hybrid results: {len(hybrid_results)} results")
    
    return hybrid_results


print("✓ Hybrid retrieval function defined")

# %% [markdown]
# ### 4.5 Test Hybrid Retrieval with Sample Query

# %%
print("\n" + "="*60)
print("Testing Hybrid Retrieval (Chunk 512)")
print("="*60)

test_query = "What was Apple's total revenue in 2022?"
print(f"\nQuery: {test_query}")

hybrid_results = hybrid_retrieval(
    vectorstore=vectorstore_512,
    bm25_data=bm25_index_512,
    query=test_query,
    k_retrieve=40,
    k_final=10
)

# %% [markdown]
# ### 4.6 Display Results with Rankings

# %%
print("\n" + "="*60)
print("HYBRID RETRIEVAL RESULTS")
print("="*60)

for i, result in enumerate(hybrid_results[:5], 1):
    print(f"\nRank {i}:")
    print(f"  RRF Score: {result['rrf_score']:.6f}")
    
    # Format dense score
    dense_score_str = f"{result['dense_score']:.4f}" if result['dense_score'] is not None else "N/A"
    print(f"  Dense Rank: {result['dense_rank']} (score: {dense_score_str})")
    
    # Format BM25 score
    bm25_score_str = f"{result['bm25_score']:.4f}" if result['bm25_score'] is not None else "N/A"
    print(f"  BM25 Rank: {result['bm25_rank']} (score: {bm25_score_str})")
    
    # Show source if available
    if result['metadata'] and 'file_name' in result['metadata']:
        print(f"  Source: {result['metadata']['file_name']}")
    
    # Show text preview
    print(f"  Text: {result['text'][:200]}...")

# %% [markdown]
# ### 4.7 Compare: Dense Only vs BM25 Only vs Hybrid

# %%
print("\n" + "="*60)
print("COMPARISON: Dense vs BM25 vs Hybrid")
print("="*60)

# Dense only
print("\nDense Only (Top 3):")
dense_only = dense_retrieval(vectorstore_512, test_query, k=3)
for i, (doc_id, score, metadata, text) in enumerate(dense_only, 1):
    print(f"  {i}. Score: {score:.4f} | {text[:100]}...")

# BM25 only
print("\nBM25 Only (Top 3):")
bm25_only = bm25_retrieval(bm25_index_512, test_query, k=3)
for i, (doc_id, score, metadata, text) in enumerate(bm25_only, 1):
    print(f"  {i}. Score: {score:.4f} | {text[:100]}...")

# Hybrid
print("\nHybrid (Top 3):")
for i, result in enumerate(hybrid_results[:3], 1):
    print(f"  {i}. RRF: {result['rrf_score']:.6f} | {result['text'][:100]}...")

# %%
print("\n" + "="*60)
print("✅ STEP 4 COMPLETE")
print("="*60)
print("Hybrid retrieval with RRF fusion implemented successfully!")
print("\nTest this step, then I'll provide Step 5!")

✓ Dense retrieval function defined
✓ BM25 retrieval function defined
✓ RRF function defined
✓ Hybrid retrieval function defined

Testing Hybrid Retrieval (Chunk 512)

Query: What was Apple's total revenue in 2022?

Hybrid Retrieval for: 'What was Apple's total revenue in 2022?'
  Retrieving top-40 from each method...
  ✓ Dense retrieval: 40 results
  ✓ BM25 retrieval: 40 results
  Fusing with RRF (k=60)...
  ✓ Hybrid results: 10 results

HYBRID RETRIEVAL RESULTS

Rank 1:
  RRF Score: 0.016393
  Dense Rank: 1 (score: 1.2486)
  BM25 Rank: None (score: N/A)
  Text: Table of Contents
Net cash provided by operating activities was $3.6 billion in 2022, primarily due to our net income of $1.3 billion in 2022, adjusted for non-cash adjustments of
$4.1 billion and net...

Rank 2:
  RRF Score: 0.016393
  Dense Rank: None (score: N/A)
  BM25 Rank: 1 (score: 17.9997)
  Text: Pricing practices taken into 
consideration include historic contractually stated prices, volume discounts where applicable 

In [None]:
# ============================================================================
# STEP 5: Evaluation Framework
# ============================================================================

# %% [markdown]
# ## Step 5: Evaluation Framework
# 
# **Goal:** Evaluate and compare Dense, BM25, and Hybrid retrieval using MRR metric

# %% [markdown]
# ### 5.1 Page Matching Function
# 
# This matches your existing evaluation logic from the thesis

# %%
def check_page_match(retrieved_page: int, evidence_page: int, chunk_size: int) -> bool:
    """
    Check if retrieved page matches evidence page with tolerance based on chunk size.
    
    Tolerance is directional: retrieved page must be before or at evidence page.
    """
    # Determine tolerance based on chunk size
    if chunk_size <= 512:
        tolerance = 0
    elif chunk_size <= 1024:
        tolerance = 1
    elif chunk_size <= 2048:
        tolerance = 2
    else:
        tolerance = 3
    
    # Check if retrieved page is within tolerance and before/at evidence page
    if retrieved_page <= evidence_page and retrieved_page >= evidence_page - tolerance:
        return True
    return False


print("✓ Page matching function defined")

# %% [markdown]
# ### 5.2 MRR Calculation Function

# %%
def calculate_mrr(results: List[Dict], evidence_pages: List[int], doc_name: str, chunk_size: int) -> Tuple[float, int]:
    """
    Calculate Mean Reciprocal Rank for a single query.
    
    Args:
        results: List of retrieved results with metadata
        evidence_pages: List of evidence page numbers
        doc_name: Expected document name
        chunk_size: Chunk size for tolerance calculation
    
    Returns:
        (reciprocal_rank, rank_of_first_match)
    """
    for rank, result in enumerate(results, 1):
        metadata = result['metadata']
        
        # Extract document name from file_path
        if 'file_path' in metadata:
            file_path = metadata['file_path']
            # Extract filename from path: '../../financebench/documents/3M_2018_10K.pdf' -> '3M_2018_10K'
            import os
            filename = os.path.basename(file_path)
            retrieved_doc = filename.replace('.pdf', '')
        elif 'file_name' in metadata:
            retrieved_doc = metadata['file_name']
        else:
            continue
        
        # Check document name match
        if retrieved_doc != doc_name:
            continue
        
        # Get page number from 'source' or 'page_label'
        if 'source' in metadata:
            try:
                retrieved_page = int(metadata['source'])
            except (ValueError, TypeError):
                continue
        elif 'page_label' in metadata:
            try:
                retrieved_page = int(metadata['page_label'])
            except (ValueError, TypeError):
                continue
        else:
            continue
        
        # Check if any evidence page matches
        for evidence_page in evidence_pages:
            if check_page_match(retrieved_page, evidence_page, chunk_size):
                return 1.0 / rank, rank
    
    # No match found
    return 0.0, -1


print("✓ MRR calculation function defined")

# %% [markdown]
# ### 5.3 Evaluate Single Query

# %%
def evaluate_single_query(
    query: str,
    doc_name: str,
    evidence_pages: List[int],
    vectorstore: Chroma,
    bm25_data: Dict,
    chunk_size: int,
    k: int = 20,
    method: str = "hybrid"
) -> Dict:
    """
    Evaluate a single query with specified retrieval method.
    
    Args:
        method: "dense", "bm25", or "hybrid"
    """
    # Perform retrieval based on method
    if method == "dense":
        dense_results = dense_retrieval(vectorstore, query, k=k)
        results = [
            {
                'id': doc_id,
                'text': text,
                'metadata': metadata,
                'dense_score': score
            }
            for doc_id, score, metadata, text in dense_results
        ]
    
    elif method == "bm25":
        bm25_results = bm25_retrieval(bm25_data, query, k=k)
        results = [
            {
                'id': doc_id,
                'text': text,
                'metadata': metadata,
                'bm25_score': score
            }
            for doc_id, score, metadata, text in bm25_results
        ]
    
    elif method == "hybrid":
        results = hybrid_retrieval(
            vectorstore=vectorstore,
            bm25_data=bm25_data,
            query=query,
            k_retrieve=40,
            k_final=k,
            rrf_k=60
        )
    
    else:
        raise ValueError(f"Unknown method: {method}")
    
    # Calculate MRR
    reciprocal_rank, rank = calculate_mrr(results, evidence_pages, doc_name, chunk_size)
    
    return {
        'query': query,
        'doc_name': doc_name,
        'evidence_pages': evidence_pages,
        'reciprocal_rank': reciprocal_rank,
        'rank': rank,
        'found': rank != -1
    }


print("✓ Evaluate single query function defined")

# %% [markdown]
# ### 5.4 Evaluate All Queries

# %%
def evaluate_all_queries(
    dataset,
    vectorstore: Chroma,
    bm25_data: Dict,
    chunk_size: int,
    method: str = "hybrid",
    k: int = 20
) -> Dict:
    """
    Evaluate all queries in dataset.
    
    Returns dictionary with results and summary statistics.
    """
    print(f"\n{'='*60}")
    print(f"EVALUATING: {method.upper()} (Chunk Size: {chunk_size}, k={k})")
    print(f"{'='*60}")
    
    results = []
    reciprocal_ranks = []
    
    for record in tqdm(dataset, desc=f"Evaluating {method}"):
        query = record['question']
        doc_name = record['doc_name']
        
        # Parse evidence pages - try both 'page_number' and 'evidence_page_num'
        evidence = record['evidence']
        evidence_pages = []
        for item in evidence:
            if 'page_number' in item:
                evidence_pages.append(item['page_number'])
            elif 'evidence_page_num' in item:
                evidence_pages.append(item['evidence_page_num'])
        
        if not evidence_pages:
            continue
        
        # Evaluate query
        result = evaluate_single_query(
            query=query,
            doc_name=doc_name,
            evidence_pages=evidence_pages,
            vectorstore=vectorstore,
            bm25_data=bm25_data,
            chunk_size=chunk_size,
            k=k,
            method=method
        )
        
        results.append(result)
        reciprocal_ranks.append(result['reciprocal_rank'])
    
    # Calculate summary statistics
    if len(results) == 0:
        print("\n⚠️ WARNING: No results were evaluated!")
        print("This might be due to metadata mismatch.")
        return {
            'method': method,
            'chunk_size': chunk_size,
            'k': k,
            'total_queries': 0,
            'found_count': 0,
            'mrr': 0.0,
            'results': []
        }
    
    mrr = sum(reciprocal_ranks) / len(reciprocal_ranks)
    found_count = sum(1 for r in results if r['found'])
    
    summary = {
        'method': method,
        'chunk_size': chunk_size,
        'k': k,
        'total_queries': len(results),
        'found_count': found_count,
        'mrr': mrr,
        'results': results
    }
    
    print(f"\n{'='*60}")
    print(f"RESULTS SUMMARY")
    print(f"{'='*60}")
    print(f"Method: {method.upper()}")
    print(f"Chunk Size: {chunk_size}")
    print(f"Total Queries: {len(results)}")
    print(f"Found: {found_count} ({found_count/len(results)*100:.1f}%)")
    print(f"MRR: {mrr:.4f}")
    print(f"{'='*60}\n")
    
    return summary


print("✓ Evaluate all queries function defined")

# %% [markdown]
# ### 5.5 Debug: Check Metadata Structure

# %%
print("\n" + "="*60)
print("Debugging Metadata Structure")
print("="*60)

# Check what metadata looks like in your chunks
print("\nSample metadata from chunks_512:")
if chunks_512['metadatas']:
    print(f"First metadata: {chunks_512['metadatas'][0]}")
    print(f"\nMetadata keys: {list(chunks_512['metadatas'][0].keys())}")

# Check what FinanceBench expects
print("\nSample from FinanceBench dataset:")
sample = dataset[0]
print(f"Expected doc_name: {sample['doc_name']}")
print(f"Evidence: {sample['evidence']}")

# Test a query to see what we retrieve
print("\n" + "="*60)
print("Test Retrieval to Check Metadata")
print("="*60)

test_query = sample['question']
print(f"\nQuery: {test_query}")

# Get one result and check its metadata
test_results = hybrid_retrieval(
    vectorstore=vectorstore_512,
    bm25_data=bm25_index_512,
    query=test_query,
    k_retrieve=5,
    k_final=3
)

if test_results:
    print(f"\nFirst result metadata: {test_results[0]['metadata']}")
    print(f"Expected doc_name: {sample['doc_name']}")
    
    # Extract doc name from file_path
    import os
    if 'file_path' in test_results[0]['metadata']:
        file_path = test_results[0]['metadata']['file_path']
        filename = os.path.basename(file_path)
        retrieved_doc = filename.replace('.pdf', '')
        print(f"Extracted doc_name: {retrieved_doc}")
        print(f"Does it match? {retrieved_doc == sample['doc_name']}")
    
    # Check page number
    if 'source' in test_results[0]['metadata']:
        retrieved_source = test_results[0]['metadata']['source']
        print(f"Retrieved page (source): {retrieved_source}")
        
        # Get evidence pages - try both keys
        evidence_pages = []
        for item in sample['evidence']:
            if 'page_number' in item:
                evidence_pages.append(item['page_number'])
            elif 'evidence_page_num' in item:
                evidence_pages.append(item['evidence_page_num'])
        print(f"Expected evidence pages: {evidence_pages}")
        
        # Check all retrieved results to see if any match
        print(f"\nChecking all {len(test_results)} retrieved results:")
        for i, res in enumerate(test_results):
            if 'source' in res['metadata']:
                page = res['metadata']['source']
                file_path = res['metadata'].get('file_path', '')
                doc = os.path.basename(file_path).replace('.pdf', '') if file_path else 'unknown'
                print(f"  Result {i+1}: doc={doc}, page={page}")

# %% [markdown]
# ### 5.5b Test MRR Calculation Directly

# %%
print("\n" + "="*60)
print("Testing MRR Calculation")
print("="*60)

# Test with the first sample
sample = dataset[0]
print(f"\nQuery: {sample['question']}")
print(f"Expected doc: {sample['doc_name']}")

# Get evidence pages - try both keys
evidence_pages = []
for item in sample['evidence']:
    if 'page_number' in item:
        evidence_pages.append(item['page_number'])
    elif 'evidence_page_num' in item:
        evidence_pages.append(item['evidence_page_num'])
print(f"Evidence pages: {evidence_pages}")

# Perform retrieval
test_eval = evaluate_single_query(
    query=sample['question'],
    doc_name=sample['doc_name'],
    evidence_pages=evidence_pages,
    vectorstore=vectorstore_512,
    bm25_data=bm25_index_512,
    chunk_size=512,
    k=20,
    method="hybrid"
)

print(f"\nResult:")
print(f"  Found: {test_eval['found']}")
print(f"  Rank: {test_eval['rank']}")
print(f"  Reciprocal Rank: {test_eval['reciprocal_rank']:.4f}")

# Let's also check what pages were actually retrieved
print(f"\nAll 20 retrieved pages from {sample['doc_name']}:")
test_hybrid_results = hybrid_retrieval(
    vectorstore=vectorstore_512,
    bm25_data=bm25_index_512,
    query=sample['question'],
    k_retrieve=40,
    k_final=20
)

pages_from_correct_doc = []
for i, res in enumerate(test_hybrid_results, 1):
    if 'file_path' in res['metadata']:
        file_path = res['metadata']['file_path']
        doc = os.path.basename(file_path).replace('.pdf', '')
        page = res['metadata'].get('source', 'N/A')
        if doc == sample['doc_name']:
            pages_from_correct_doc.append((i, page))
            if int(page) == evidence_pages[0]:
                print(f"  ✓ Rank {i}: page {page} ← MATCH!")
            else:
                print(f"    Rank {i}: page {page}")

if not pages_from_correct_doc:
    print(f"  ⚠️ No pages from {sample['doc_name']} in top 20!")
elif evidence_pages[0] not in [int(p) for _, p in pages_from_correct_doc]:
    print(f"\n  ⚠️ Evidence page {evidence_pages[0]} NOT in top 20 results")

# %% [markdown]
# ### 5.6 Test Evaluation on Small Sample

# %%
print("\n" + "="*60)
print("Testing Evaluation (First 10 Queries)")
print("="*60)

# Test with first 10 queries
small_dataset = dataset.select(range(10))

# Test hybrid retrieval
test_results_hybrid = evaluate_all_queries(
    dataset=small_dataset,
    vectorstore=vectorstore_512,
    bm25_data=bm25_index_512,
    chunk_size=512,
    method="hybrid",
    k=20
)

# %% [markdown]
# ### 5.6 Full Evaluation - Compare All Methods (Chunk 512)

# %%
print("\n" + "="*60)
print("FULL EVALUATION - CHUNK SIZE 512")
print("="*60)

# Dense only
print("\n1. Dense Retrieval...")
results_dense_512 = evaluate_all_queries(
    dataset=dataset,
    vectorstore=vectorstore_512,
    bm25_data=bm25_index_512,
    chunk_size=512,
    method="dense",
    k=20
)

# BM25 only
print("\n2. BM25 Retrieval...")
results_bm25_512 = evaluate_all_queries(
    dataset=dataset,
    vectorstore=vectorstore_512,
    bm25_data=bm25_index_512,
    chunk_size=512,
    method="bm25",
    k=20
)

# Hybrid
print("\n3. Hybrid Retrieval...")
results_hybrid_512 = evaluate_all_queries(
    dataset=dataset,
    vectorstore=vectorstore_512,
    bm25_data=bm25_index_512,
    chunk_size=512,
    method="hybrid",
    k=20
)

# %% [markdown]
# ### 5.7 Compare Results (Chunk 512)

# %%
print("\n" + "="*60)
print("COMPARISON - CHUNK SIZE 512")
print("="*60)

comparison_512 = {
    'Dense': results_dense_512['mrr'],
    'BM25': results_bm25_512['mrr'],
    'Hybrid': results_hybrid_512['mrr']
}

for method, mrr in comparison_512.items():
    print(f"{method:15s} MRR: {mrr:.4f}")

# Calculate improvement (with safety check)
dense_mrr = comparison_512['Dense']
hybrid_mrr = comparison_512['Hybrid']

if dense_mrr > 0:
    improvement = ((hybrid_mrr - dense_mrr) / dense_mrr) * 100
    print(f"\nImprovement (Hybrid vs Dense): {improvement:+.2f}%")
else:
    print(f"\n⚠️ Warning: Dense MRR is 0, cannot calculate improvement percentage")
    print(f"Absolute improvement: {hybrid_mrr - dense_mrr:.4f}")

# %% [markdown]
# ### 5.8 Full Evaluation - Chunk Size 1024

# %%
print("\n" + "="*60)
print("FULL EVALUATION - CHUNK SIZE 1024")
print("="*60)

# Dense only
print("\n1. Dense Retrieval...")
results_dense_1024 = evaluate_all_queries(
    dataset=dataset,
    vectorstore=vectorstore_1024,
    bm25_data=bm25_index_1024,
    chunk_size=1024,
    method="dense",
    k=20
)

# BM25 only
print("\n2. BM25 Retrieval...")
results_bm25_1024 = evaluate_all_queries(
    dataset=dataset,
    vectorstore=vectorstore_1024,
    bm25_data=bm25_index_1024,
    chunk_size=1024,
    method="bm25",
    k=20
)

# Hybrid
print("\n3. Hybrid Retrieval...")
results_hybrid_1024 = evaluate_all_queries(
    dataset=dataset,
    vectorstore=vectorstore_1024,
    bm25_data=bm25_index_1024,
    chunk_size=1024,
    method="hybrid",
    k=20
)

# %% [markdown]
# ### 5.9 Compare Results (Chunk 1024)

# %%
print("\n" + "="*60)
print("COMPARISON - CHUNK SIZE 1024")
print("="*60)

comparison_1024 = {
    'Dense': results_dense_1024['mrr'],
    'BM25': results_bm25_1024['mrr'],
    'Hybrid': results_hybrid_1024['mrr']
}

for method, mrr in comparison_1024.items():
    print(f"{method:15s} MRR: {mrr:.4f}")

# Calculate improvement (with safety check)
dense_mrr = comparison_1024['Dense']
hybrid_mrr = comparison_1024['Hybrid']

if dense_mrr > 0:
    improvement = ((hybrid_mrr - dense_mrr) / dense_mrr) * 100
    print(f"\nImprovement (Hybrid vs Dense): {improvement:+.2f}%")
else:
    print(f"\n⚠️ Warning: Dense MRR is 0, cannot calculate improvement percentage")
    print(f"Absolute improvement: {hybrid_mrr - dense_mrr:.4f}")

# %% [markdown]
# ### 5.10 Final Summary

# %%
print("\n" + "="*60)
print("FINAL SUMMARY - ALL RESULTS")
print("="*60)

print("\nChunk Size 512:")
for method, mrr in comparison_512.items():
    print(f"  {method:10s}: {mrr:.4f}")

print("\nChunk Size 1024:")
for method, mrr in comparison_1024.items():
    print(f"  {method:10s}: {mrr:.4f}")

print("\n" + "="*60)

# %%
print("\n" + "="*60)
print("✅ STEP 5 COMPLETE")
print("="*60)
print("Evaluation complete! You now have MRR scores for all methods.")
print("\nYou can use these results for your thesis comparison!")

✓ Page matching function defined
✓ MRR calculation function defined
✓ Evaluate single query function defined
✓ Evaluate all queries function defined

Debugging Metadata Structure

Sample metadata from chunks_512:
First metadata: {'total_pages': 76, 'source': '1', 'chunk_size': 512, 'file_path': '../../financebench/documents/COSTCO_2021_10K.pdf'}

Metadata keys: ['total_pages', 'source', 'chunk_size', 'file_path']

Sample from FinanceBench dataset:
Expected doc_name: 3M_2018_10K
Evidence: [{'evidence_text': 'Table of Contents \n3M Company and Subsidiaries\nConsolidated Statement of Cash Flow s\nYears ended December 31\n \n(Millions)\n \n2018\n \n2017\n \n2016\n \nCash Flows from Operating Activities\n \n \n \n \n \n \n \nNet income including noncontrolling interest\n \n$\n5,363 \n$\n4,869 \n$\n5,058 \nAdjustments to reconcile net income including noncontrolling interest to net cash\nprovided by operating activities\n \n \n \n \n \n \n \nDepreciation and amortization\n \n \n1,488 \n \n1,

Evaluating hybrid:   0%|          | 0/10 [00:00<?, ?it/s]


This might be due to metadata mismatch.

FULL EVALUATION - CHUNK SIZE 512

1. Dense Retrieval...

EVALUATING: DENSE (Chunk Size: 512, k=20)


Evaluating dense:   0%|          | 0/150 [00:00<?, ?it/s]


This might be due to metadata mismatch.

2. BM25 Retrieval...

EVALUATING: BM25 (Chunk Size: 512, k=20)


Evaluating bm25:   0%|          | 0/150 [00:00<?, ?it/s]


This might be due to metadata mismatch.

3. Hybrid Retrieval...

EVALUATING: HYBRID (Chunk Size: 512, k=20)


Evaluating hybrid:   0%|          | 0/150 [00:00<?, ?it/s]


This might be due to metadata mismatch.

COMPARISON - CHUNK SIZE 512
Dense           MRR: 0.0000
BM25            MRR: 0.0000
Hybrid          MRR: 0.0000


ZeroDivisionError: float division by zero

In [None]:
# %% [markdown]
# ### 5.5 Debug: Check Metadata Structure

# %%
print("\n" + "="*60)
print("Debugging Metadata Structure")
print("="*60)

# Check what metadata looks like in your chunks
print("\nSample metadata from chunks_512:")
if chunks_512['metadatas']:
    print(f"First metadata: {chunks_512['metadatas'][0]}")
    print(f"\nMetadata keys: {list(chunks_512['metadatas'][0].keys())}")

# Check what FinanceBench expects
print("\nSample from FinanceBench dataset:")
sample = dataset[0]
print(f"Expected doc_name: {sample['doc_name']}")
print(f"Evidence: {sample['evidence']}")

# Test a query to see what we retrieve
print("\n" + "="*60)
print("Test Retrieval to Check Metadata")
print("="*60)

test_query = sample['question']
print(f"\nQuery: {test_query}")

# Get one result and check its metadata
test_results = hybrid_retrieval(
    vectorstore=vectorstore_512,
    bm25_data=bm25_index_512,
    query=test_query,
    k_retrieve=5,
    k_final=3
)

if test_results:
    print(f"\nFirst result metadata: {test_results[0]['metadata']}")
    print(f"Expected doc_name: {sample['doc_name']}")
    
    # Extract doc name from file_path
    import os
    if 'file_path' in test_results[0]['metadata']:
        file_path = test_results[0]['metadata']['file_path']
        filename = os.path.basename(file_path)
        retrieved_doc = filename.replace('.pdf', '')
        print(f"Extracted doc_name: {retrieved_doc}")
        print(f"Does it match? {retrieved_doc == sample['doc_name']}")
    
    # Check page number
    if 'source' in test_results[0]['metadata']:
        retrieved_source = test_results[0]['metadata']['source']
        print(f"Retrieved page (source): {retrieved_source}")
        
        # Get evidence pages - try both keys
        evidence_pages = []
        for item in sample['evidence']:
            if 'page_number' in item:
                evidence_pages.append(item['page_number'])
            elif 'evidence_page_num' in item:
                evidence_pages.append(item['evidence_page_num'])
        print(f"Expected evidence pages: {evidence_pages}")
        
        # Check all retrieved results to see if any match
        print(f"\nChecking all {len(test_results)} retrieved results:")
        for i, res in enumerate(test_results):
            if 'source' in res['metadata']:
                page = res['metadata']['source']
                file_path = res['metadata'].get('file_path', '')
                doc = os.path.basename(file_path).replace('.pdf', '') if file_path else 'unknown'
                print(f"  Result {i+1}: doc={doc}, page={page}")

# %% [markdown]
# ### 5.5b Test MRR Calculation Directly

# %%
print("\n" + "="*60)
print("Testing MRR Calculation")
print("="*60)

# Test with the first sample
sample = dataset[0]
print(f"\nQuery: {sample['question']}")
print(f"Expected doc: {sample['doc_name']}")

# Get evidence pages - try both keys
evidence_pages = []
for item in sample['evidence']:
    if 'page_number' in item:
        evidence_pages.append(item['page_number'])
    elif 'evidence_page_num' in item:
        evidence_pages.append(item['evidence_page_num'])
print(f"Evidence pages: {evidence_pages}")

# Perform retrieval
test_eval = evaluate_single_query(
    query=sample['question'],
    doc_name=sample['doc_name'],
    evidence_pages=evidence_pages,
    vectorstore=vectorstore_512,
    bm25_data=bm25_index_512,
    chunk_size=512,
    k=20,
    method="hybrid"
)

print(f"\nResult:")
print(f"  Found: {test_eval['found']}")
print(f"  Rank: {test_eval['rank']}")
print(f"  Reciprocal Rank: {test_eval['reciprocal_rank']:.4f}")

# Let's also check what pages were actually retrieved
print(f"\nAll 20 retrieved pages from {sample['doc_name']}:")
test_hybrid_results = hybrid_retrieval(
    vectorstore=vectorstore_512,
    bm25_data=bm25_index_512,
    query=sample['question'],
    k_retrieve=40,
    k_final=20
)

pages_from_correct_doc = []
for i, res in enumerate(test_hybrid_results, 1):
    if 'file_path' in res['metadata']:
        file_path = res['metadata']['file_path']
        doc = os.path.basename(file_path).replace('.pdf', '')
        page = res['metadata'].get('source', 'N/A')
        if doc == sample['doc_name']:
            pages_from_correct_doc.append((i, page))
            if int(page) == evidence_pages[0]:
                print(f"  ✓ Rank {i}: page {page} ← MATCH!")
            else:
                print(f"    Rank {i}: page {page}")

if not pages_from_correct_doc:
    print(f"  ⚠️ No pages from {sample['doc_name']} in top 20!")
elif evidence_pages[0] not in [int(p) for _, p in pages_from_correct_doc]:
    print(f"\n  ⚠️ Evidence page {evidence_pages[0]} NOT in top 20 results")



Debugging Metadata Structure

Sample metadata from chunks_512:
First metadata: {'total_pages': 76, 'source': '1', 'chunk_size': 512, 'file_path': '../../financebench/documents/COSTCO_2021_10K.pdf'}

Metadata keys: ['total_pages', 'source', 'chunk_size', 'file_path']

Sample from FinanceBench dataset:
Expected doc_name: 3M_2018_10K
Evidence: [{'evidence_text': 'Table of Contents \n3M Company and Subsidiaries\nConsolidated Statement of Cash Flow s\nYears ended December 31\n \n(Millions)\n \n2018\n \n2017\n \n2016\n \nCash Flows from Operating Activities\n \n \n \n \n \n \n \nNet income including noncontrolling interest\n \n$\n5,363 \n$\n4,869 \n$\n5,058 \nAdjustments to reconcile net income including noncontrolling interest to net cash\nprovided by operating activities\n \n \n \n \n \n \n \nDepreciation and amortization\n \n \n1,488 \n \n1,544 \n \n1,474 \nCompany pension and postretirement contributions\n \n \n(370) \n \n(967) \n \n(383) \nCompany pension and postretirement expense\n \