# Document Embedding Basics

When documents contain charts, tabular data, or images, you can create embeddings for those documents using Nova Embeddings image embeddings with the `detailLevel` parameter set to "DOCUMENT_IMAGE". This notebook demonstrates creating embeddings for the pages of a PDF document and then running a query against those embeddings.

<div style="background-color: #d9f2f7ff; border:1px solid #a0a0a060; padding: 1rem; color: #151515">
✏️ <strong>Note:</strong> We've also provided a standalone script, "pdf_to_png.py", that you can use if you'd like to convert PDFs outside of this notebook.
</div>

In [None]:
# Restore variables from setup notebook
# `%store -r <name>` reloads a value that was previously persisted with
# `%store <name>`, so the setup notebook must have been run first.
# Each value is printed right after it is restored so a missing or stale
# value is noticed before any embedding work starts.
%store -r s3_bucket
print(f"Using S3 bucket: {s3_bucket}")
%store -r region_name
print(f"Using region: {region_name}")

In [None]:
from os import path, listdir
from pdf_to_png import pdf_to_png
from utils.utils import load_file_as_base64
import nova_embeddings
import chromadb

# Update the constants below if desired.
PDF_PATH = "sample_assets/documents/InTheHotSeat-WA.pdf"
# Size of the embedding vectors stored in the index. The query cell defines
# its own EMBEDDING_DIMENSION; keep the two values identical, otherwise the
# query vector will not match the indexed vectors.
EMBEDDING_DIMENSION = 3072
# Resolution passed to pdf_to_png when rendering the PDF.
PDF_RENDER_DPI = 200

# The rendered images go into a directory named after the PDF
# (the PDF path with its extension removed).
output_dir = path.splitext(PDF_PATH)[0]

# Convert the PDF to PNG files (presumably one per page) inside output_dir.
pdf_to_png(pdf_path=PDF_PATH, output_dir=output_dir, dpi=PDF_RENDER_DPI)

# Initialize the vector data store.
chroma_client = chromadb.PersistentClient()
collection_name = "document_pages"

# Start from an empty collection so re-running this cell never leaves stale
# or duplicate page entries behind.
try:
    chroma_client.delete_collection(collection_name)
except Exception:
    # Best effort: the collection does not exist on the first run. The
    # concrete exception type raised for a missing collection differs between
    # chromadb releases, so catch Exception rather than a specific class, but
    # never a bare `except:`, which would also swallow KeyboardInterrupt and
    # SystemExit.
    pass
collection = chroma_client.create_collection(collection_name)

# Generate embeddings for each page image and add them to the vector store.
# Only ".png" files are picked up: every file is sent to the model declared
# as "png", so stray files in the output directory (e.g. ".DS_Store") must
# not be included.
page_files = sorted(
    filename for filename in listdir(output_dir) if filename.lower().endswith(".png")
)
if not page_files:
    # Fail here with a clear message instead of silently building an empty
    # index that only breaks later, when it is queried.
    raise FileNotFoundError(f"No PNG page images found in: {output_dir}")

for filename in page_files:
    file_path = path.join(output_dir, filename)

    request_body = {
        "taskType": "SINGLE_EMBEDDING",
        "singleEmbeddingParams": {
            "embeddingDimension": EMBEDDING_DIMENSION,
            "embeddingPurpose": "GENERIC_INDEX",
            "image": {
                "format": "png",
                "detailLevel": "DOCUMENT_IMAGE",
                "source": {"bytes": load_file_as_base64(file_path)},
            },
        },
    }

    # Generate the embedding. The second return value is not needed here.
    print(f"Generating embedding for: {file_path}")
    response_body, _ = nova_embeddings.generate_embedding_sync(request_body)
    # Use the same helper as the query cell so both sides read the response
    # in exactly the same way.
    embedding = nova_embeddings.extract_embedding(response_body)

    # Add the image to the vector store. The file path doubles as the record
    # id, which lets the query cell load the page image straight from the id.
    collection.add(
        ids=[file_path], embeddings=[embedding], metadatas=[{"file_path": file_path}]
    )

    print("Done")

Now, we will query the vector store to find the top 3 pages related to a query.

In [None]:
from os import path, listdir
from pdf_to_png import pdf_to_png
from utils.utils import load_file_as_base64
import nova_embeddings
import chromadb
from IPython.display import Image, display

# Update the constants below if desired.

QUERY_TEXT = "What are the signs of heat stroke?"
# QUERY_TEXT = "What is a heat dome?"
# QUERY_TEXT = "image of an ambulance"

# Must be the same dimension that was used when the page embeddings were
# generated; otherwise the query vector cannot be compared with the index.
EMBEDDING_DIMENSION = 3072

# Announce the request before it is sent so the message reflects what the
# cell is currently doing.
print("Generating query embedding...")
result_body, _ = nova_embeddings.generate_embedding_sync(
    {
        "taskType": "SINGLE_EMBEDDING",
        "singleEmbeddingParams": {
            # The pages were embedded with "GENERIC_INDEX"; the query side
            # uses the document-retrieval purpose.
            "embeddingPurpose": "DOCUMENT_RETRIEVAL",
            "embeddingDimension": EMBEDDING_DIMENSION,
            "text": {"truncationMode": "END", "value": QUERY_TEXT},
        },
    }
)

query_embedding = nova_embeddings.extract_embedding(result_body)

# Open the persisted collection that holds the page embeddings. A fresh
# client is created so this cell does not depend on variables from the
# indexing cell.
collection = chromadb.PersistentClient().get_collection("document_pages")

# Query the collection for the top N most similar pages.
retrieval_count = 3
results = collection.query(
    query_embeddings=[query_embedding], n_results=retrieval_count
)

# Results are grouped per query embedding; exactly one query was sent, so the
# matches live at index 0.
matched_ids = results["ids"][0]
matched_distances = results["distances"][0]

# Display the results.
print(f"Most relevant {retrieval_count} pages:")
for rank, (doc_id, distance) in enumerate(zip(matched_ids, matched_distances), start=1):
    print(f"Result {rank}: {doc_id} (distance: {distance:.4f})")

if matched_ids:
    # Record ids are the page image file paths, so the best match can be
    # loaded directly. `filename=` makes it explicit that the string is a
    # path and not raw image data.
    print("\nMost relevant page image:")
    display(Image(filename=matched_ids[0]))
else:
    # An empty collection yields no matches; avoid an IndexError on [0].
    print("\nNo matching pages found. Run the indexing cell first.")