# Top-K Retrieval Comparison

This notebook compares retrieval performance for top 5, top 10, and top 50 results using semantic search embeddings.

In [1]:
import setup_notebook  # This fixes the path for imports

Added c:\Users\piete\Repos\pkuppens\on_prem_rag\src\backend to Python path.


In [2]:
import time
import json

from rag_pipeline.config.parameter_sets import get_param_set
from rag_pipeline.core.embeddings import process_pdf, query_embeddings
from rag_pipeline.utils.directory_utils import get_project_root, get_test_data_dir


In [3]:
# Set up paths and parameters

root = get_project_root()
test_data = get_test_data_dir()
param_name = 'fast'
params = get_param_set(param_name)
pdf = test_data / '2303.18223v16.pdf'
persist = root / 'data' / f'{param_name}_chroma'

# Ensure persist directory exists

persist.mkdir(parents=True, exist_ok=True)


In [4]:
# Create embeddings if not already present
if not (persist / 'chroma.sqlite3').exists():
    chunks, records = process_pdf(
        pdf,
        params.embedding.model_name,
        persist_dir=str(persist),
        chunk_size=params.chunking.chunk_size,
        chunk_overlap=params.chunking.chunk_overlap,
        max_pages=None,
        deduplicate=True,
    )
    print(f'Processing completed: {chunks} chunks, {records} records stored.')
else:
    print(f'Embeddings already exist in {persist}, skipping processing.')


Embeddings already exist in C:\Users\piete\Repos\pkuppens\on_prem_rag\data\fast_chroma, skipping processing.


In [5]:
# Query the embeddings

query_text = 'Healthcare'
results = query_embeddings(
    query_text,
    params.embedding.model_name,
    persist_dir=str(persist),
    top_k=5
)
print(json.dumps(results, indent=2))

{
  "primary_result": "Domain Specialization. Existing LLMs have showcased su-\nperior capabilities in traditional NLP tasks ( e.g., generation\nand reasoning) and daily questions. However, they may\nstill lack domain knowledge to accomplish specific tasks,\nsuch as medicine, law, and finance (See Section 8 for a\ndetailed discussion of LLMs in different applications). In-\nstruction tuning is an effective approach to adapting existing\ngeneral LLMs to be domain-specific experts. For instance,\nresearchers propose to fine-tune Flan-PaLM [69] using medi-\ncal datasets to create Med-PaLM [354], a medical knowledge\nassistant that achieves performance levels comparable to\nthose of expert clinicians. Furthermore, a recent study [355]\nfine-tunes FLAN-T5 to support e-commerce recommender\nsystems with natural language instructions, showing strong\nperformance in a variety of recommendation tasks. There\nare also several open-sourced medical models instruction-\ntuned based on LLaMA [57], s

In [6]:

results = {}

for top_k in [5, 10, 50]:
    start = time.perf_counter()
    results = query_embeddings(
        query_text,
        params.embedding.model_name,
        persist_dir=str(persist),
        top_k=top_k
    )
    elapsed = time.perf_counter() - start
    print(f"Top {top_k} retrieval took {elapsed:.6f} seconds")
    print(f'Found {len(results["all_results"])} results')
    if results['all_results']:
        for i, result in enumerate(results['all_results']):
            print(f'\nResult {i+1}:')
            print(f' Document: {result["document_name"]}')
            print(f' Chunk: {result["chunk_index"]} (ID: {result["document_id"]})')
            print(f' Page: {result.get("page_number", "unknown")}')
            print(f' Similarity score: {result["similarity_score"]:.4f}')
            print(f' Text preview: {result["text"][:200]}...')
    else:
        print('No results found for the query.')

AttributeError: 'builtin_function_or_method' object has no attribute 'perf_counter'

Check if the top 5 results are identical in both the top 5 and top 50 queries:

In [15]:
top_5 = query_embeddings(
        query_text,
        params.embedding.model_name,
        persist_dir=str(persist),
        top_k=5
    )
top_50 = query_embeddings(
        query_text,
        params.embedding.model_name,
        persist_dir=str(persist),
        top_k=50
    )
all_results_5 = top_5['all_results']
top_5_all_results_50 = top_5['all_results'][:5]

In [17]:
assert len(all_results_5) == len(top_5_all_results_50), "The length of results should be the same"

In [None]:
# Compare each result between top 5 and top 50 queries
for i, (result_5, result_50) in enumerate(zip(all_results_5, top_5_all_results_50)):
    print(f"\nComparing result {i+1}:")
    print(f"Top 5 query - Document: {result_5['document_name']}, Score: {result_5['similarity_score']:.4f}")
    print(f"Top 50 query - Document: {result_50['document_name']}, Score: {result_50['similarity_score']:.4f}")
    
    # Verify all fields match
    assert result_5['document_name'] == result_50['document_name'], f"Document names don't match for result {i+1}"
    assert result_5['chunk_index'] == result_50['chunk_index'], f"Chunk indices don't match for result {i+1}"
    assert result_5['document_id'] == result_50['document_id'], f"Document IDs don't match for result {i+1}"
    assert abs(result_5['similarity_score'] - result_50['similarity_score']) < 1e-6, f"Similarity scores don't match for result {i+1}"
    assert result_5['text'] == result_50['text'], f"Text content doesn't match for result {i+1}"
    
print("\nAll top 5 results match between queries!")


This confirms that retrieving more results only adds minor overhead and the ranking of the first five remains the same.