# Finance RAG – Embedding DB Builder (Per-Embedding Chroma DB, Per-Chunk Collections)

This notebook creates **one Chroma database per embedding model** and stores **one collection per chunk size**. It also lists existing databases/collections and shows basic metadata.

**Recommended Python:** 3.11 or 3.12

**What you get**
- Unified `EmbeddingBundle` loader with families: `openai`, `hf` (HuggingFace), `ollama`
- Chunking via LlamaIndex `SentenceSplitter`
- Per-embedding Chroma **persist directory** (configurable root & prefix)
- Per-chunk-size **collection** inside that DB
- Progress bars (`tqdm`) for embedding and upserting
- Utilities to **list databases** and **inspect collections**


In [9]:

# If you are running locally, uncomment and run this cell once to install dependencies.
# It's safer to do it in a clean virtualenv (Python 3.11/3.12).
# Note: We intentionally omit tiktoken to avoid Rust build requirements on some Python versions.
# %pip install -q "langchain>=0.3.7,<0.4.0" "langchain-community>=0.3.7,<0.4.0" "langchain-openai>=0.2.0,<0.3.0" \
#                "chromadb>=0.5.16,<0.6.0" "llama-index==0.11.18" \
#                "llama-index-embeddings-huggingface>=0.3.0,<0.4.0" "llama-index-embeddings-openai>=0.2.0,<0.4.0" \
#                "sentence-transformers>=3.1.1,<3.2.0" "tqdm>=4.66" "python-dotenv>=1.0,<2.0" \
#                "pydantic>=2.7,<3.0" "openai>=1.51.0,<2.0.0" "ollama>=0.3.0,<0.4.0"


In [10]:

import os
import uuid
import datetime as dt
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Sequence, Literal

from tqdm.auto import tqdm

# Chroma
import chromadb
from chromadb.config import Settings as ChromaSettings

# Embeddings (LangChain wrappers)
from langchain_openai import OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings, OllamaEmbeddings

# Chunking
from llama_index.core.node_parser import SentenceSplitter

# Optionally load environment variables (e.g. OPENAI_API_KEY) from a local .env file.
try:
    from dotenv import load_dotenv
except ImportError:
    # python-dotenv is an optional dependency; without it, any required variables
    # must already be present in the process environment.
    pass
else:
    try:
        load_dotenv()
    except Exception as exc:
        # Stay best-effort (a broken .env must not block the notebook), but say why.
        print(f"Warning: could not load .env file: {exc}")

# ---- Configuration defaults ----
# NOTE: these values are bound as default arguments when the functions below are
# defined. To use different values later, pass db_root / db_prefix / dataset_tag
# explicitly instead of reassigning these names after the function cells have run.
BASE_DB_ROOT = "../../vector_databases"   # where all per-embedding DBs will live
DB_PREFIX    = ""          # prefix on each embedding-specific persist dir
DATASET_TAG  = "financebench_v1"  # used in collection names, e.g., financebench_v1__c256


In [11]:

@dataclass
class EmbeddingBundle:
    """An embeddings object together with the identifying fields recorded for it.

    Instances are produced by ``load_embeddings``.
    """
    # Embedding backend family; load_embeddings sets "openai", "hf" or "ollama".
    family: str
    # Model identifier handed to the backend (e.g. "BAAI/bge-m3").
    model: str
    # Label of the form "<family>__<model>" (model part may be sanitized);
    # the per-embedding persist directory is derived from it.
    name: str
    lc: Any          # LangChain embeddings object
    dim: Optional[int]  # embedding dimension (best effort); may be None if probing fails

def _safe_name(s: str) -> str:
    return s.replace("/", "_").replace(":", "_")

def _probe_dim(emb) -> Optional[int]:
    try:
        v = emb.embed_query("dimension probe")
        return len(v) if isinstance(v, list) else (len(v[0]) if v is not None else None)
    except Exception:
        return None

def load_embeddings(family: Literal["openai", "hf", "ollama"], model: Optional[str] = None, **kwargs) -> EmbeddingBundle:
    """Return a unified EmbeddingBundle for the requested family/model.

    Supported:
      - family="openai", model like "text-embedding-3-small" (used when model is omitted)
      - family="hf", model like "BAAI/bge-m3" or "nomic-ai/nomic-embed-text-v1" (required)
      - family="ollama", model like "nomic-embed-text" (used when model is omitted;
        Ollama must be running)

    Args:
        family: Embedding backend family; matched case-insensitively.
        model: Model identifier for the backend. Optional for "openai" and
            "ollama", mandatory for "hf".
        **kwargs: Forwarded to the backend's LangChain embeddings constructor.
            For "hf", ``encode_kwargs={"normalize_embeddings": True}`` is used
            unless the caller supplies ``encode_kwargs``.

    Returns:
        An EmbeddingBundle whose ``name`` is "<family>__<sanitized model>" and
        whose ``dim`` comes from a best-effort probe (may be None).

    Raises:
        ValueError: If ``family`` is unknown, or family is "hf" and no model is given.
    """
    family = family.lower()
    if family == "openai":
        model = model or "text-embedding-3-small"
        emb = OpenAIEmbeddings(model=model, **kwargs)
    elif family == "hf":
        if not model:
            raise ValueError("For family='hf', please provide a HuggingFace model name.")
        # normalize embeddings tends to help cosine distances
        kwargs.setdefault("encode_kwargs", {"normalize_embeddings": True})
        emb = HuggingFaceEmbeddings(model_name=model, **kwargs)
    elif family == "ollama":
        model = model or "nomic-embed-text"
        emb = OllamaEmbeddings(model=model, **kwargs)
    else:
        raise ValueError(f"Unknown embedding family: {family}")

    # Every family gets the same naming scheme; the model part is sanitized so
    # the name never contains "/" or ":".
    return EmbeddingBundle(
        family=family,
        model=model,
        name=f"{family}__{_safe_name(model)}",
        lc=emb,
        dim=_probe_dim(emb),
    )


In [12]:

def persist_dir_for(db_root: str, db_prefix: str, emb_name: str) -> str:
    """Return the persist directory path for one embedding under ``db_root``.

    Ensures ``db_root`` exists (the returned directory itself is not created
    here). The directory name is ``db_prefix`` followed by the sanitized
    embedding name.
    """
    os.makedirs(db_root, exist_ok=True)
    dir_name = "{}{}".format(db_prefix, _safe_name(emb_name))
    return os.path.join(db_root, dir_name)

def chroma_client(persist_dir: str):
    """Create ``persist_dir`` if needed and return a Chroma client bound to it.

    The client is configured as persistent with resets allowed.
    """
    os.makedirs(persist_dir, exist_ok=True)
    settings = ChromaSettings(
        is_persistent=True,
        allow_reset=True,
        persist_directory=persist_dir,
    )
    return chromadb.Client(settings)

def collection_name(dataset_tag: str, chunk_size: int) -> str:
    """Build the collection name "<dataset_tag>__c<chunk_size>"."""
    return "{}__c{}".format(dataset_tag, chunk_size)

def chunk_texts_llamaindex(texts: Sequence[Dict[str, Any]], chunk_size: int = 512, chunk_overlap: int = 32) -> List[Dict[str, Any]]:
    """Split each input text into chunks with LlamaIndex's ``SentenceSplitter``.

    Args:
        texts: Items shaped like ``{"text": str, "metadata": dict}``. The
            "metadata" key may be missing or None; both are treated as empty.
        chunk_size: Passed to ``SentenceSplitter(chunk_size=...)``.
        chunk_overlap: Passed to ``SentenceSplitter(chunk_overlap=...)``.

    Returns:
        A flat list of ``{"text": chunk, "metadata": {...}}`` dicts in input
        order. Each chunk's metadata is a copy of its item's metadata plus
        "chunk_index" (position within that item), "chunk_size" and
        "chunk_overlap"; these three keys override same-named input keys.
    """
    splitter = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunk_params = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    out: List[Dict[str, Any]] = []
    for item in texts:
        # `or {}` also covers an explicit "metadata": None, which dict() would reject.
        base_meta = item.get("metadata") or {}
        for i, ch in enumerate(splitter.split_text(item["text"])):
            # Build a fresh dict per chunk so the caller's metadata is never mutated.
            out.append({"text": ch, "metadata": {**base_meta, "chunk_index": i, **chunk_params}})
    return out

def embed_texts_in_batches(emb, texts: List[str], batch_size: int = 64) -> List[List[float]]:
    """Embed ``texts`` in consecutive batches, showing a progress bar.

    Args:
        emb: Embeddings object exposing ``embed_documents(list_of_str)``
            (LangChain embeddings implement this for list inputs).
        texts: Strings to embed.
        batch_size: Number of texts sent per ``embed_documents`` call; must be >= 1.

    Returns:
        The concatenated results of all ``embed_documents`` calls, in input order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. Without this check a
            negative value would silently produce no vectors at all.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    vectors: List[List[float]] = []
    for start in tqdm(range(0, len(texts), batch_size), desc="Embedding", leave=False):
        vectors.extend(emb.embed_documents(texts[start:start + batch_size]))
    return vectors


In [13]:

def build_collections_for_embedding(
    raw_texts: Sequence[Dict[str, Any]],
    family: Literal["openai", "hf", "ollama"],
    model: Optional[str] = None,
    chunk_sizes: Sequence[int] = (256, 512),
    chunk_overlap: int = 32,
    db_root: str = BASE_DB_ROOT,
    db_prefix: str = DB_PREFIX,
    dataset_tag: str = DATASET_TAG,
    batch_size: int = 64,
    upsert_batch: int = 1000,
    skip_existing: bool = True,
    embedding_kwargs: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Create (or update) one Chroma DB for the embedding, and one collection per chunk size.

    Args:
        raw_texts: Items shaped like ``{"text": str, "metadata": dict}``.
        family: Embedding family passed to ``load_embeddings``.
        model: Model name passed to ``load_embeddings``.
        chunk_sizes: One collection is built per value.
        chunk_overlap: Overlap passed to the chunker for every chunk size.
        db_root: Directory under which the per-embedding persist dir lives.
        db_prefix: Prefix of the per-embedding persist dir name.
        dataset_tag: Prefix of each collection name (``<tag>__c<chunk_size>``).
        batch_size: Number of texts per embedding call; must be >= 1.
        upsert_batch: Number of records per ``upsert`` call; must be >= 1.
        skip_existing: When True, collections that already exist are left untouched.
        embedding_kwargs: Extra keyword arguments for ``load_embeddings``.

    Returns:
        ``{"persist_dir": ..., "embedding": ..., "collections": [...]}`` where
        "collections" lists ``{"collection": name, "count": n_chunks}`` for each
        collection written by this call (skipped ones are not listed).

    Raises:
        ValueError: If ``batch_size`` or ``upsert_batch`` is smaller than 1.
    """
    # A negative step would make the batching loops run zero times and report
    # a count for data that was never written, so fail loudly instead.
    if batch_size < 1 or upsert_batch < 1:
        raise ValueError(
            f"batch_size and upsert_batch must be positive integers, "
            f"got batch_size={batch_size}, upsert_batch={upsert_batch}"
        )

    embedding_kwargs = embedding_kwargs or {}
    bundle = load_embeddings(family, model, **embedding_kwargs)
    pdir = persist_dir_for(db_root, db_prefix, bundle.name)
    client = chroma_client(pdir)

    existing = {c.name for c in client.list_collections()}
    created = []

    for csz in tqdm(chunk_sizes, desc=f"Chunk sizes for {bundle.name}"):
        cname = collection_name(dataset_tag, csz)
        if skip_existing and cname in existing:
            print(f"✓ Skipping existing collection: {cname} @ {pdir}")
            continue

        print(f"→ Building collection: {cname} @ {pdir}")
        chunks = chunk_texts_llamaindex(raw_texts, chunk_size=csz, chunk_overlap=chunk_overlap)
        texts = [c["text"] for c in chunks]
        metadatas = [c["metadata"] for c in chunks]

        vectors = embed_texts_in_batches(bundle.lc, texts, batch_size=batch_size)

        # datetime.utcnow() is deprecated (Python 3.12+); take an aware UTC time
        # and drop tzinfo so the stored string keeps its "...Z" shape.
        created_at = dt.datetime.now(dt.timezone.utc).replace(tzinfo=None).isoformat() + "Z"
        collection_meta = {
            "embedding_name": bundle.name,
            "embedding_family": bundle.family,
            "embedding_model": bundle.model,
            "embedding_dim": bundle.dim,
            "dataset_tag": dataset_tag,
            "chunk_size": csz,
            "chunk_overlap": chunk_overlap,
            "created_at": created_at,
        }
        # bundle.dim is None when probing failed; drop None entries because
        # Chroma metadata values are expected to be str/int/float/bool
        # (NOTE(review): some Chroma versions reject None outright -- confirm).
        collection_meta = {k: v for k, v in collection_meta.items() if v is not None}

        col = client.get_or_create_collection(name=cname, metadata=collection_meta)

        # Deterministic ids (collection + position + text) make a rebuild with
        # skip_existing=False overwrite the same records instead of appending a
        # duplicate of every chunk, which random uuid4 ids would do.
        ids = [
            str(uuid.uuid5(uuid.NAMESPACE_URL, f"{cname}/{pos}/{text}"))
            for pos, text in enumerate(texts)
        ]
        for i in tqdm(range(0, len(texts), upsert_batch), desc="Upserting", leave=False):
            col.upsert(
                ids=ids[i:i+upsert_batch],
                documents=texts[i:i+upsert_batch],
                metadatas=metadatas[i:i+upsert_batch],
                embeddings=vectors[i:i+upsert_batch],
            )

        created.append({"collection": cname, "count": len(texts)})

    return {"persist_dir": pdir, "embedding": bundle.name, "collections": created}


In [14]:

def list_embedding_databases(db_root: str = BASE_DB_ROOT, db_prefix: str = DB_PREFIX) -> List[Dict[str, Any]]:
    """List per-embedding Chroma DBs (persist dirs) and their collections/metadata.

    Args:
        db_root: Directory whose sub-directories are treated as persist dirs.
        db_prefix: Only entries whose name starts with this prefix are considered.

    Returns:
        One dict per matching sub-directory, sorted by name:
        ``{"persist_dir": path, "collections": [{"name": ..., "metadata": {...}}]}``.
        If opening or listing a database fails, the dict additionally carries
        an "error" string. An empty list is returned when ``db_root`` is not a
        directory.
    """
    dbs: List[Dict[str, Any]] = []
    if not os.path.isdir(db_root):
        return dbs
    for entry in sorted(os.listdir(db_root)):
        pdir = os.path.join(db_root, entry)
        # Plain files (e.g. stray OS metadata files) are not databases; passing
        # them to chroma_client would fail in its makedirs call.
        if not entry.startswith(db_prefix) or not os.path.isdir(pdir):
            continue
        info: Dict[str, Any] = {"persist_dir": pdir, "collections": []}
        try:
            client = chroma_client(pdir)
            info["collections"] = [
                {"name": c.name, "metadata": c.metadata or {}}
                for c in client.list_collections()
            ]
        except Exception as e:
            # Report the failure for this directory and keep listing the others.
            info["error"] = str(e)
        dbs.append(info)
    return dbs

def print_db_summary(dbs: List[Dict[str, Any]]):
    """Print a human-readable summary of the structure returned by ``list_embedding_databases``.

    For each database the persist dir is printed, followed by either its error
    message, a "(no collections)" note, or one line per collection showing the
    family, model, dimension and chunk size found in the collection metadata
    ("?" where a key is absent).

    Args:
        dbs: List of ``{"persist_dir": ..., "collections": [...]}`` dicts, each
            optionally carrying an "error" string.
    """
    if not dbs:
        print("No databases found.")
        return
    for db in dbs:
        print(f"\n📁 {db['persist_dir']}")
        if "error" in db:
            print("  ⚠️ ", db["error"])
            continue
        # Tolerate a missing or None "collections" entry.
        collections = db.get("collections") or []
        if not collections:
            print("  (no collections)")
        for c in collections:
            md = c.get("metadata") or {}
            cs = md.get("chunk_size", "?")
            fam = md.get("embedding_family", "?")
            model = md.get("embedding_model", "?")
            dim = md.get("embedding_dim", "?")
            print(f"  • {c['name']} | family={fam} model={model} dim={dim} chunk_size={cs}")


In [15]:

# --- Example raw texts (replace with your dataset loader) ---
raw_texts = [
    {
        "text": "Acme Corp reported revenue of $12.3B in FY2023.",
        "metadata": {"source": "10-K 2023", "ticker": "ACME"},
    },
    {
        "text": "Operating margin improved to 18% due to cost controls.",
        "metadata": {"source": "10-K 2023", "ticker": "ACME"},
    },
]

# --- Configure which embeddings and chunk sizes to build ---
families_and_models = [
    {"family": "hf", "model": "BAAI/bge-m3"},
    # Uncomment if you have OPENAI_API_KEY set in your environment:
    # {"family": "openai", "model": "text-embedding-3-small"},
    # If you run Ollama locally with an embedding model:
    # {"family": "ollama", "model": "nomic-embed-text"},
]

chunk_sizes = (256, 512)
chunk_overlap = 32

# Options shared by every embedding build in this run.
build_options = dict(
    chunk_sizes=chunk_sizes,
    chunk_overlap=chunk_overlap,
    db_root=BASE_DB_ROOT,
    db_prefix=DB_PREFIX,
    dataset_tag=DATASET_TAG,
    batch_size=64,
    upsert_batch=1000,
    skip_existing=True,
)

# --- Build phase (progress bars will show) ---
results = []
for fm in families_and_models:
    res = build_collections_for_embedding(
        raw_texts=raw_texts,
        family=fm["family"],
        model=fm.get("model"),
        **build_options,
    )
    results.append(res)

# Display the per-embedding build summaries as the cell output.
results


Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Chunk sizes for hf__BAAI_bge-m3:   0%|          | 0/2 [00:00<?, ?it/s]

→ Building collection: financebench_v1__c256 @ ../../vector_databases/hf__BAAI_bge-m3


  "created_at": dt.datetime.utcnow().isoformat() + "Z",
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given
Chunk sizes for hf__BAAI_bge-m3:  50%|█████     | 1/2 [00:00<00:00,  5.29it/s]

→ Building collection: financebench_v1__c512 @ ../../vector_databases/hf__BAAI_bge-m3


Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given
Chunk sizes for hf__BAAI_bge-m3: 100%|██████████| 2/2 [00:00<00:00,  8.54it/s]


[{'persist_dir': '../../vector_databases/hf__BAAI_bge-m3',
  'embedding': 'hf__BAAI_bge-m3',
  'collections': [{'collection': 'financebench_v1__c256', 'count': 2},
   {'collection': 'financebench_v1__c512', 'count': 2}]}]

In [16]:

# Scan the DB root for per-embedding databases, then print each one's
# collections along with the metadata stored on them.
dbs = list_embedding_databases(BASE_DB_ROOT, DB_PREFIX)
print_db_summary(dbs)


Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given



📁 ../../vector_databases/hf__BAAI_bge-m3
  • financebench_v1__c256 | family=hf model=BAAI/bge-m3 dim=1024 chunk_size=256
  • financebench_v1__c512 | family=hf model=BAAI/bge-m3 dim=1024 chunk_size=512


In [17]:

# OPTIONAL: quick retrieval sanity check
# Pick the first DB and its first collection, and print a few stored documents.
try:
    if dbs and dbs[0]['collections']:
        any_db = dbs[0]['persist_dir']
        cname = dbs[0]['collections'][0]['name']
        client = chroma_client(any_db)
        # Read-only inspection: get_collection fails if the collection is
        # missing, whereas get_or_create_collection would create an empty one.
        col = client.get_collection(cname)
        # naive query by re-embedding using the same family/model is out of scope here,
        # but Chroma can query with raw embeddings if provided.
        # For simplicity, we'll just print a few docs:
        qs = col.get(include=["documents", "metadatas"], limit=3)
        print(f"Sample from {any_db} :: {cname}")
        # `or []` guards against a field that is present but None.
        docs = qs.get('documents') or []
        metas = qs.get('metadatas') or []
        for i, (doc, meta) in enumerate(zip(docs, metas)):
            print(f"\n#{i+1}\n", doc, "\n", meta)
    else:
        print("Retrieval sanity check skipped: no database with collections found.")
except Exception as e:
    print("Retrieval sanity check skipped:", e)


Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event CollectionGetEvent: capture() takes 1 positional argument but 3 were given


Sample from ../../vector_databases/hf__BAAI_bge-m3 :: financebench_v1__c256

#1
 Acme Corp reported revenue of $12.3B in FY2023. 
 {'chunk_index': 0, 'chunk_overlap': 32, 'chunk_size': 256, 'source': '10-K 2023', 'ticker': 'ACME'}

#2
 Operating margin improved to 18% due to cost controls. 
 {'chunk_index': 0, 'chunk_overlap': 32, 'chunk_size': 256, 'source': '10-K 2023', 'ticker': 'ACME'}
