# PDF Embedding Exploration

Use different parameter sets to create and query embeddings from a single PDF file, then batch-process every PDF in the test data directory.

In [None]:
import glob
import textwrap
from pathlib import Path

import setup_notebook  # This fixes the path for imports
from rag_pipeline.config.parameter_sets import get_param_set
from rag_pipeline.core.embeddings import chunk_pdf, embed_chunks, query_embeddings
from rag_pipeline.utils.directory_utils import get_project_root, get_test_data_dir

In [None]:
def line_wrap_chunk(chunk: str, width: int = 100) -> str:
    """Wrap chunk text so it is easier to read in notebook output.

    Args:
        chunk: The chunk text to wrap.
        width: Maximum line length in characters. Defaults to 100.

    Returns:
        The text re-flowed by ``textwrap.fill`` into lines of at most
        ``width`` characters, joined with newlines. With ``textwrap``'s
        default settings, whitespace in the input (including newlines) is
        replaced by spaces before wrapping.
    """
    return textwrap.fill(chunk, width=width)

In [None]:
# Build absolute paths from the project root and the test data directory
root = get_project_root()
test_data = get_test_data_dir()

param_name = "fast"  # options: fast, context_rich, precise, etc.
params = get_param_set(param_name)
chunk_cfg = params.chunking

pdf = test_data / "2303.18223v16.pdf"

# The persist directory is named after the parameter set; create it up front
persist = root / "data" / f"{param_name}_chroma"
persist.mkdir(parents=True, exist_ok=True)

print(f"Processing PDF: {pdf.name}")
print(f"Using parameters: {param_name}")
print(f"Chunk size: {chunk_cfg.chunk_size}, Overlap: {chunk_cfg.chunk_overlap}")

# The presence of chroma.sqlite3 is used as the "already embedded" marker
if not (persist / "chroma.sqlite3").exists():
    # Step 1: Chunk the PDF
    print("Step 1: Chunking PDF...")
    chunking_result = chunk_pdf(
        pdf,
        chunk_size=chunk_cfg.chunk_size,
        chunk_overlap=chunk_cfg.chunk_overlap,
        max_pages=None,  # Process all pages
    )

    print("Chunking completed:")
    print(f"  - File: {chunking_result.file_name}")
    print(f"  - Pages: {chunking_result.num_pages}")
    print(f"  - File size: {chunking_result.file_size:,} bytes")
    print(f"  - Chunks created: {chunking_result.chunk_count}")

    # Step 2: Generate the embeddings and store them in the persist directory
    print("\nStep 2: Generating embeddings...")
    chunks, records = embed_chunks(
        chunking_result,
        params.embedding.model_name,
        persist_dir=str(persist),
        deduplicate=True,
    )

    print("\nProcessing completed:")
    print(f"  - Total chunks processed: {chunks}")
    print(f"  - Records stored in database: {records}")
    print(f"  - Deduplication removed: {chunks - records} duplicates")
else:
    print(f"Embeddings already exist in {persist}, skipping processing.")

In [None]:
# Chunk and embed every PDF found in the test_data directory
pdf_files = glob.glob(str(test_data / "*.pdf"))
print(f"Found {len(pdf_files)} PDF files to process")

for pdf_path in pdf_files:
    pdf_file = Path(pdf_path)
    print(f"Processing: {pdf_file.name}")

    # Each PDF gets its own persist sub-directory, named after the file stem
    persist_subdir = persist / pdf_file.stem
    persist_subdir.mkdir(parents=True, exist_ok=True)

    # Only do the work when no chroma.sqlite3 marker exists for this PDF yet
    if not (persist_subdir / "chroma.sqlite3").exists():
        chunking_result = chunk_pdf(
            pdf_file,
            chunk_size=params.chunking.chunk_size,
            chunk_overlap=params.chunking.chunk_overlap,
            max_pages=None,  # Process all pages
        )

        embed_chunks(
            chunking_result,
            params.embedding.model_name,
            persist_dir=str(persist_subdir),
            deduplicate=True,
        )
    else:
        print(f"  Skipping {pdf_file.name} - embeddings already exist")

print("All PDF files processed")

In [None]:
# Query the stored embeddings and report every hit with its similarity score
query_text = "LLM for Healthcare"
print(f"Querying embeddings for: '{query_text}'")

results = query_embeddings(query_text, params.embedding.model_name, persist_dir=persist)
hits = results["all_results"]

print("\nQuery Results:")
print(f"Found {len(hits)} results")

if not hits:
    print("No results found for the query.")
else:
    # The first entry is shown as the top result, with its text wrapped
    top_hit = hits[0]
    print(f"\nTop result (similarity: {top_hit['similarity_score']:.4f}):")
    chunk = top_hit["text"]

    print(f"Text preview:\n{line_wrap_chunk(chunk)}")

    print("\nDetailed results:")
    for rank, hit in enumerate(hits, start=1):
        print(f"\nResult {rank}:")
        print(f"  Document: {hit['document_name']}")
        print(f"  Chunk: {hit['chunk_index']} (ID: {hit['document_id']})")
        print(f"  Page: {hit.get('page_number', 'unknown')}")
        print(f"  Record ID: {hit['record_id']}")
        print(f"  Similarity score: {hit['similarity_score']:.4f}")
        # Only the first 200 characters of each chunk are shown here
        print(f"  Text chunk: {hit['text'][:200]}...")

In [None]:
# Compare a closely related query against an unrelated one to see how the
# similarity scores differ between them.
test_query_high = "Multimodal Large Language Model"
test_query_low = "Oceanography"

print(f"\nTesting similarity score range with both high and low related query: '{test_query_high}' and '{test_query_low}'")

test_results_high = query_embeddings(test_query_high, params.embedding.model_name, persist_dir=persist)
test_results_low = query_embeddings(test_query_low, params.embedding.model_name, persist_dir=persist)

# Bind both scores up front: they are only computed when a query returns
# results, and the comparison below must not hit an unbound name otherwise.
highest_score_high = None
highest_score_low = None

if test_results_high["all_results"]:
    highest_score_high = max(result["similarity_score"] for result in test_results_high["all_results"])
    print(f"\nHighest similarity score for high related query: {highest_score_high:.4f}")
    print("\nThis helps determine if similarity scores are relative or absolute:")
    print("- If scores are relative, they will be high for any query")
    print("- If scores are absolute, unrelated queries should have lower scores")
else:
    print("No results found for the test query.")

if test_results_low["all_results"]:
    highest_score_low = max(result["similarity_score"] for result in test_results_low["all_results"])
    print(f"\nHighest similarity score for low related query: {highest_score_low:.4f}")
else:
    print("No results found for the test query.")

# The sanity check only makes sense when both queries produced a score
if highest_score_high is None or highest_score_low is None:
    print("\nSkipping similarity score comparison: at least one query returned no results.")
else:
    assert highest_score_high > highest_score_low, (
        "The highest similarity score for the high related query should be higher than the highest similarity score for the low related query"
    )

In [None]:
# Show the first (best) match for each of the two test queries
def _print_best_match(match):
    """Print the document name, page, similarity score and text of one result."""
    print(f"Document: {match['document_name']}")
    print(f"Page: {match.get('page_number', 'unknown')}")
    print(f"Similarity score: {match['similarity_score']:.4f}")
    print(f"Content:\n{match['text']}\n")


print("\nBest match for high related query:")
if test_results_high["all_results"]:
    best_match_high = test_results_high["all_results"][0]
    _print_best_match(best_match_high)

print("\nBest match for low related query:")
if test_results_low["all_results"]:
    best_match_low = test_results_low["all_results"][0]
    _print_best_match(best_match_low)