# PDF Embedding Exploration

Use different parameter sets to create and query embeddings from a single PDF file.

In [None]:
from rag_pipeline.config.parameter_sets import get_param_set
from rag_pipeline.core.embeddings import process_pdf, query_embeddings
from rag_pipeline.utils.directory_utils import get_project_root, get_test_data_dir

# Resolve locations via the project helpers instead of relying on the current working directory
root = get_project_root()
test_data = get_test_data_dir()

# Parameter set to explore (options: fast, context_rich, precise, etc.)
param_name = "fast"
params = get_param_set(param_name)

# Input PDF and the per-parameter-set directory where the embedding store is persisted
pdf = test_data / "2303.18223v16.pdf"
persist = root / "data" / f"{param_name}_chroma"

# Create the persist directory (and any missing parents) before touching it
persist.mkdir(parents=True, exist_ok=True)

# The presence of chroma.sqlite3 inside the persist directory is treated as
# the marker that embeddings were already built for this parameter set.
store_file = persist / "chroma.sqlite3"
if store_file.exists():
    print(f"Embeddings already exist in {persist}, skipping processing.")
else:
    model_name = params.embedding.model_name
    chunking = params.chunking
    processing_options = {
        "persist_dir": str(persist),  # Path converted to string for compatibility
        "chunk_size": chunking.chunk_size,
        "chunk_overlap": chunking.chunk_overlap,
        "max_pages": 1,  # presumably limits how many pages are processed -- confirm in process_pdf
        "deduplicate": True,  # process the PDF with deduplication enabled
    }
    chunks, records = process_pdf(pdf, model_name, **processing_options)
    print(f"Processed {chunks} chunks, total records in database: {records}")


In [None]:
# Query the persisted embeddings and get results with similarity scores.
# persist_dir is passed as a string, consistent with the process_pdf call
# that converts the Path "for compatibility".
results = query_embeddings(
    "LLM for Healthcare",
    params.embedding.model_name,
    persist_dir=str(persist),
)

# `results` is indexed by "primary_result" (sliceable text) and
# "all_results" (an iterable of per-hit mappings).
print("Primary result:")
print(results["primary_result"][:200])  # only the first 200 characters

# The leading "\n" escapes insert a blank line before each section; they must
# stay as escapes because a raw line break inside a single-quoted string
# literal is a syntax error.
print("\nAll results with detailed information:")
for result in results["all_results"]:
    print(f"\nDocument: {result['document_name']}")
    print(f"Chunk: {result['chunk_index']} (ID: {result['document_id']})")
    print(f"Record ID: {result['record_id']}")
    print(f"Similarity score: {result['similarity_score']:.4f}")
    print("Text preview:", result["text"][:200])