# In [None]:
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.utils import embedding_functions
import uuid

# Sentence-transformers model name. It is used twice below: once to build
# `embedder`, and once to build the collection's embedding function.
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
# Standalone model instance.
# NOTE(review): nothing in the visible code uses `embedder` -- the collection
# is given its own embedding function below -- so presumably it is consumed
# elsewhere (e.g. for query-side embedding); confirm before removing.
embedder = SentenceTransformer(EMBED_MODEL)

# Persistent local vector DB folder
chroma = chromadb.PersistentClient(path="./vecdb")

# Create or get a collection (think “index”).
# The embedding function is built from the same EMBED_MODEL name, so documents
# added without explicit embeddings are embedded with that model.
collection = chroma.get_or_create_collection(
    name="docs",
    metadata={"hnsw:space": "cosine"},  # cosine similarity (works well for MiniLM)
    embedding_function=embedding_functions.SentenceTransformerEmbeddingFunction(EMBED_MODEL)
)

def chunk(text: str, title: str, doc_id: str, source: str,
          chunk_tokens: int = 450, overlap: int = 80) -> list:
    """
    Lightweight splitter: slices text into overlapping windows of words.

    "Tokens" here are whitespace-separated words (``text.split()``), not
    model tokens.
    Tip: swap for tiktoken or a sentence-aware splitter for cleaner boundaries.

    Args:
        text: Raw document text to split.
        title: Document title, copied into every chunk's metadata.
        doc_id: Document identifier; used as the prefix of every chunk id.
        source: Document origin, copied into every chunk's metadata.
        chunk_tokens: Maximum number of words per window (must be > 0).
        overlap: Number of words shared by consecutive windows
            (must satisfy ``0 <= overlap < chunk_tokens``).

    Returns:
        A list of dicts with keys ``"id"``, ``"text"`` and ``"metadata"``.
        ``"id"`` is ``f"{doc_id}:{start}"`` and ``metadata["chunk_index"]`` is
        ``start``, where ``start`` is the window's starting *word offset*
        (not a 0, 1, 2, ... ordinal). Empty or whitespace-only text yields
        an empty list.

    Raises:
        ValueError: If ``chunk_tokens`` or ``overlap`` is out of range.
    """
    if chunk_tokens <= 0:
        raise ValueError(f"chunk_tokens must be positive, got {chunk_tokens}")
    # overlap >= chunk_tokens would make the stride zero or negative, so the
    # window would never advance; a negative overlap would silently skip words.
    if not 0 <= overlap < chunk_tokens:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_tokens, "
            f"got overlap={overlap}, chunk_tokens={chunk_tokens}"
        )

    words = text.split()
    step = chunk_tokens - overlap
    out = []
    for start in range(0, len(words), step):
        out.append({
            "id": f"{doc_id}:{start}",     # unique per (doc_id, word offset)
            "text": " ".join(words[start:start + chunk_tokens]),
            "metadata": {
                "doc_id": doc_id,          # tie chunks back to the doc
                "title": title,
                "source": source,
                "chunk_index": start,      # starting word offset of the window
            }
        })
        # This window already reaches the last word; any further window would
        # be a pure suffix of it (duplicate content), so stop here.
        if start + chunk_tokens >= len(words):
            break
    return out

def index_document(raw_text: str, title: str, source: str) -> str:
    """
    Splits -> embeds -> stores.
    Returns a doc_id so you can reference or re-index later.

    The text is split with ``chunk`` (default window settings) and the chunks
    are added to the module-level ``collection``. Only documents, ids and
    metadata are passed to ``collection.add``; no precomputed embeddings are
    supplied, so embedding is left to the collection.

    Args:
        raw_text: Full text of the document.
        title: Document title, stored in each chunk's metadata.
        source: Document origin, stored in each chunk's metadata.

    Returns:
        A freshly generated UUID4 string. Every call creates a new id, so
        indexing the same text twice stores it twice under different ids.

    Raises:
        ValueError: If ``raw_text`` is empty or whitespace-only, i.e. there
            is nothing to index.
    """
    doc_id = str(uuid.uuid4())
    chunks = chunk(raw_text, title, doc_id, source)
    # Fail early with a clear message instead of handing empty lists to the
    # vector store and returning an id for a document that was never stored.
    if not chunks:
        raise ValueError("index_document: raw_text contains no indexable text")
    collection.add(
        ids=[c["id"] for c in chunks],
        documents=[c["text"] for c in chunks],
        metadatas=[c["metadata"] for c in chunks],
    )
    return doc_id
