# RAG: Load Vector Database (LLM-based Chunking)

This notebook loads book chapters into a Chroma vector database using LLM-based semantic chunking.
Instead of naive paragraph splitting, an LLM analyzes each chapter and splits it into chunks by topic.

## Initialize

In [None]:
from pathlib import Path

from agentic_patterns.core.config.config import MAIN_PROJECT_DIR
from agentic_patterns.core.agents import get_agent, run_agent
from agentic_patterns.core.vectordb import get_vector_db, vdb_add, load_vectordb_settings

In [None]:
# Name of the collection that receives the LLM-chunked documents, and the
# folder holding the book text files to load.
COLLECTION_NAME = 'books_llm_chunked'
DOCS_DIR = MAIN_PROJECT_DIR.joinpath('tests', 'data', 'books')
print(f"Books directory: {DOCS_DIR}")

## Vector-db: Setup

Creates the Chroma vector database collection, or loads it if it already exists. The collection uses its own name so that LLM-chunked documents stay separate from those produced by naive chunking.

In [None]:
# Get the collection used for the LLM-chunked documents.
vdb = get_vector_db(COLLECTION_NAME)

# Read the vector-db settings only to report where the database is persisted.
config_file = MAIN_PROJECT_DIR / "config.yaml"
settings = load_vectordb_settings(config_file)
vectordb_config = settings.get_vectordb()
db_path = Path(vectordb_config.persist_directory)
print(f"Database directory: {db_path}")

In [None]:
# The collection is populated only when it is currently empty, so re-running
# the notebook against a filled collection skips the load step.
count = vdb.count()
create_vdb = not count
print(f"Collection has {count} documents. Need to populate: {create_vdb}")

## LLM-based Chunking

Uses an LLM to analyze the text and split it into semantically coherent chunks based on topics.

In [None]:
# Prompt template sent to the chunking agent. The {text} placeholder is filled
# with the full contents of a file via str.format() in chunk_with_llm, so any
# literal braces added to this template later would have to be doubled.
CHUNKING_PROMPT = """
You are a text chunking assistant. Your task is to divide the following text into coherent chunks based on topics or themes.

Guidelines:
- Each chunk should be self-contained and focus on a single topic, scene, or theme
- Chunks should be substantial (at least a few sentences) but not too long
- Preserve the original text exactly - do not summarize or modify the content
- Return the chunks as a list of strings

TEXT TO CHUNK:
{text}
"""

# Agent that runs the chunking prompt; chunk_with_llm reads its output as a
# list of strings.
# NOTE(review): output_type=list[str] presumably makes the agent return
# structured output of that type — confirm against get_agent.
chunking_agent = get_agent(output_type=list[str])

In [None]:
async def chunk_with_llm(file: Path) -> list[tuple[str, str, dict]]:
    """Chunk a file using LLM-based semantic chunking.

    The file's text is inserted into ``CHUNKING_PROMPT`` and sent to
    ``chunking_agent``; every non-blank chunk in the agent's output becomes
    one record.

    Args:
        file: Path of the text file to chunk. It is decoded as UTF-8.

    Returns:
        A list of ``(text, doc_id, metadata)`` tuples, one per non-blank
        chunk. ``doc_id`` has the form ``"<file stem>-llm-<chunk index>"`` and
        ``metadata`` holds the keys ``source``, ``chunk`` and ``method``.
        The list is empty when the file contains only whitespace.

    Raises:
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode explicitly: the platform default encoding is not UTF-8 everywhere,
    # and a wrong guess would silently corrupt the text that gets embedded.
    text = file.read_text(encoding="utf-8")
    if not text.strip():
        # Nothing to chunk, so skip the LLM call entirely.
        return []

    prompt = CHUNKING_PROMPT.format(text=text)
    agent_run, _ = await run_agent(chunking_agent, prompt, verbose=True)
    chunks: list[str] = agent_run.result.output

    results = []
    # chunk_num is the position in the LLM output, so ids keep their original
    # index even when blank chunks are skipped.
    for chunk_num, chunk in enumerate(chunks):
        doc = chunk.strip()
        if not doc:
            continue
        doc_id = f"{file.stem}-llm-{chunk_num}"
        metadata = {'source': file.stem, 'chunk': chunk_num, 'method': 'llm'}
        results.append((doc, doc_id, metadata))

    return results

## Load documents

In [None]:
if create_vdb:
    count_added = 0
    for txt_file in DOCS_DIR.glob('*.txt'):
        print(f"Processing file '{txt_file.name}' with LLM chunking...")
        chunks = await chunk_with_llm(txt_file)
        print(f"  LLM produced {len(chunks)} chunks")
        
        for doc, doc_id, meta in chunks:
            vdb_add(vdb, text=doc, doc_id=doc_id, meta=meta)
            print(f"  Added doc_id: {doc_id}")
            count_added += 1
    
    print(f"\nTotal documents added: {count_added}")
    assert count_added > 0, f"No documents added. Check books directory: {DOCS_DIR}"
else:
    print("Database already populated, skipping load.")

In [None]:
# Report the collection size after the load step (or after skipping it).
final_count = vdb.count()
print(f"Final document count: {final_count}")