# RAG: Load Vector Database

This notebook loads book chapters into a Chroma vector database using pydantic-ai embeddings.
Run this notebook once to populate the database before using the query notebook. Re-running it is safe: the load step is skipped when the collection already contains documents.

## Initialize

In [None]:
from pathlib import Path

from agentic_patterns.core.config.config import MAIN_PROJECT_DIR
from agentic_patterns.core.vectordb import (
    get_vector_db,
    vdb_add,
    load_vectordb_settings,
)

In [None]:
# Directory holding the plain-text books to be indexed.
DOCS_DIR = MAIN_PROJECT_DIR.joinpath("tests", "data", "books")
print(f"Books directory: {DOCS_DIR}")

## Vector-db: ETL (Extract, Transform, Load)

Creates or loads the Chroma vector database collection named 'books'. The database is persisted to the directory configured in config.yaml.

In [None]:
# Handle to the "books" collection used by the rest of the notebook.
vdb = get_vector_db("books")

# Report where the database is persisted, as configured in config.yaml.
settings = load_vectordb_settings(MAIN_PROJECT_DIR / "config.yaml")
vectordb_config = settings.get_vectordb()
db_path = Path(vectordb_config.persist_directory)
print(f"Database directory: {db_path}")

In [None]:
# Check if the database needs to be populated
count = vdb.count()
create_vdb = count == 0
print(f"Collection has {count} documents. Need to populate: {create_vdb}")

## Chunking

In [None]:
def chunks(file: Path, min_lines: int = 3):
    """Chunk a book into paragraphs for indexing.

    The text is split on blank lines (two consecutive newlines). Paragraphs
    that are empty, or that have fewer than ``min_lines`` lines once
    surrounding whitespace is stripped, are skipped. Skipped paragraphs still
    consume a paragraph number, so IDs reflect a paragraph's position in the
    file rather than its position among the yielded chunks.

    Args:
        file: Path to the text file to chunk; it is decoded as UTF-8.
        min_lines: Minimum number of lines a paragraph needs to be kept.

    Yields:
        ``(document, doc_id, metadata)`` tuples, where ``doc_id`` is
        ``"<file stem>-<paragraph number>"`` and ``metadata`` holds the
        ``source`` (file stem) and the ``paragraph`` number.
    """
    # Decode explicitly: the default encoding is locale-dependent and can
    # garble or reject non-ASCII text on some platforms.
    text = file.read_text(encoding="utf-8")
    for paragraph_num, paragraph in enumerate(text.split("\n\n")):
        doc = paragraph.strip()
        # A stripped paragraph has (number of newlines + 1) lines.
        if not doc or doc.count("\n") + 1 < min_lines:
            continue
        doc_id = f"{file.stem}-{paragraph_num}"
        metadata = {"source": file.stem, "paragraph": paragraph_num}
        yield doc, doc_id, metadata

## Load documents

In [None]:
# Populate the collection from every .txt book, but only when it is empty.
if create_vdb:
    count_added = 0
    # Sort the files: glob() yields entries in arbitrary, filesystem-dependent
    # order, and a stable order keeps the load log reproducible across runs.
    for txt_file in sorted(DOCS_DIR.glob("*.txt")):
        print(f"Reading file '{txt_file}'")
        for doc, doc_id, meta in chunks(txt_file):
            vdb_add(vdb, text=doc, doc_id=doc_id, meta=meta)
            print(f"  Added doc_id: {doc_id}")
            count_added += 1
    print(f"\nTotal documents added: {count_added}")
    # Raise explicitly instead of using assert, so the check still runs when
    # Python is started with -O (which strips assert statements).
    if count_added == 0:
        raise RuntimeError(f"No documents added. Check books directory: {DOCS_DIR}")
else:
    print("Database already populated, skipping load.")

In [None]:
# Re-query the collection so the reported size reflects any documents just added.
final_count = vdb.count()
print(f"Final document count: {final_count}")