# PDF Embedding Exploration

Use different parameter sets to create and query embeddings from a single PDF file.

In [None]:
from rag_pipeline.config.parameter_sets import get_param_set
from rag_pipeline.core.embeddings import chunk_pdf, embed_chunks, query_embeddings
from rag_pipeline.utils.directory_utils import get_project_root, get_test_data_dir

# Build all paths from the project-root and test-data helpers rather than
# from the current working directory.
root = get_project_root()
test_data = get_test_data_dir()

param_name = "fast"  # options: fast, context_rich, precise, etc.
params = get_param_set(param_name)
pdf = test_data / "2303.18223v16.pdf"
# The store directory is named after the parameter set, so each parameter
# set gets its own directory under <root>/data.
persist = root / "data" / f"{param_name}_chroma"

# Ensure persist directory exists
persist.mkdir(parents=True, exist_ok=True)

print(f"Processing PDF: {pdf.name}")
print(f"Using parameters: {param_name}")
print(f"Chunk size: {params.chunking.chunk_size}, Overlap: {params.chunking.chunk_overlap}")

# Skip the expensive work when a store file is already present.
# NOTE(review): the presence of chroma.sqlite3 only shows that *a* store was
# written here; it does not prove it was built from this PDF or that a
# previous run finished. Delete the directory to force a rebuild.
if (persist / "chroma.sqlite3").exists():
    print(f"Embeddings already exist in {persist}, skipping processing.")
else:
    # Step 1: split the PDF into chunks using the selected parameter set.
    print("Step 1: Chunking PDF...")
    chunking_result = chunk_pdf(
        pdf,
        chunk_size=params.chunking.chunk_size,
        chunk_overlap=params.chunking.chunk_overlap,
        max_pages=None,  # no page cap; this PDF reportedly has 144 pages
    )

    print("Chunking completed:")
    print(f"  - File: {chunking_result.file_name}")
    print(f"  - Pages: {chunking_result.num_pages}")
    print(f"  - File size: {chunking_result.file_size:,} bytes")
    print(f"  - Chunks created: {chunking_result.chunk_count}")

    # Step 2: embed the chunks and store them in the persist directory.
    print("\nStep 2: Generating embeddings...")
    chunks, records = embed_chunks(
        chunking_result,
        params.embedding.model_name,
        persist_dir=str(persist),
        deduplicate=True,
    )

    # The difference between the two counts is reported as the number of
    # chunks dropped by deduplication.
    print("\nProcessing completed:")
    print(f"  - Total chunks processed: {chunks}")
    print(f"  - Records stored in database: {records}")
    print(f"  - Deduplication removed: {chunks - records} duplicates")

In [None]:
# Query the stored embeddings and print each hit with its similarity score.
# Relies on `params` and `persist` defined in the previous cell.
query_text = "LLM for Healthcare"
print(f"Querying embeddings for: '{query_text}'")

# Pass the directory as a string, matching how it is handed to
# embed_chunks() when the store is created.
results = query_embeddings(
    query_text,
    params.embedding.model_name,
    persist_dir=str(persist),
)

# Bind the hit list once instead of re-indexing the result dict each time.
matches = results["all_results"]

print("\nQuery Results:")
print(f"Found {len(matches)} results")

if matches:
    # Summary line for the first hit; the preview text comes from the
    # separate 'primary_result' entry of the result dict.
    print(f"\nTop result (similarity: {matches[0]['similarity_score']:.4f}):")
    print(f"Text preview: {results['primary_result'][:200]}...")

    print("\nDetailed results:")
    for rank, result in enumerate(matches, start=1):
        print(f"\nResult {rank}:")
        print(f"  Document: {result['document_name']}")
        print(f"  Chunk: {result['chunk_index']} (ID: {result['document_id']})")
        # page_number is treated as optional; fall back to 'unknown'.
        print(f"  Page: {result.get('page_number', 'unknown')}")
        print(f"  Record ID: {result['record_id']}")
        print(f"  Similarity score: {result['similarity_score']:.4f}")
        print(f"  Text preview: {result['text'][:200]}...")
else:
    print("No results found for the query.")