In [2]:
# Install into the running kernel's environment; pinned to the version this notebook was run with.
%pip install chromadb==1.1.0

Collecting chromadb
  Downloading chromadb-1.1.0-cp39-abi3-win_amd64.whl.metadata (7.4 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.3.0-py3-none-any.whl.metadata (5.6 kB)
Collecting pydantic>=1.9 (from chromadb)
  Downloading pydantic-2.11.9-py3-none-any.whl.metadata (68 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-cp313-cp313-win_amd64.whl.metadata (9.0 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.37.0-py3-none-any.whl.metadata (6.6 kB)
Collecting numpy>=1.22.5 (from chromadb)
  Downloading numpy-2.3.3-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting typing-extensions>=4.5.0 (from chromadb)
  Downloading typing_extensions-4.15.0-py3-none-any.whl.metadata (3.3 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.23.0-cp313-cp313-win_amd64.whl



In [8]:
# Install into the running kernel's environment; pinned to the version this notebook was run with.
%pip install sentence_transformers==5.1.1

Collecting sentence_transformers
  Downloading sentence_transformers-5.1.1-py3-none-any.whl.metadata (16 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence_transformers)
  Downloading transformers-4.56.2-py3-none-any.whl.metadata (40 kB)
Collecting torch>=1.11.0 (from sentence_transformers)
  Downloading torch-2.8.0-cp313-cp313-win_amd64.whl.metadata (30 kB)
Collecting scikit-learn (from sentence_transformers)
  Downloading scikit_learn-1.7.2-cp313-cp313-win_amd64.whl.metadata (11 kB)
Collecting scipy (from sentence_transformers)
  Downloading scipy-1.16.2-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting regex!=2019.12.17 (from transformers<5.0.0,>=4.41.0->sentence_transformers)
  Downloading regex-2025.9.18-cp313-cp313-win_amd64.whl.metadata (41 kB)
Collecting safetensors>=0.4.3 (from transformers<5.0.0,>=4.41.0->sentence_transformers)
  Downloading safetensors-0.6.2-cp38-abi3-win_amd64.whl.metadata (4.1 kB)
Collecting networkx (from torch>=1.11.0->sentence_transformers)
 



In [1]:
import json
from pathlib import Path
import chromadb
from chromadb.utils import embedding_functions

In [2]:
# ---------------- SETTINGS ----------------
# Both locations are relative paths that go one directory up and into output/;
# adjust them if the notebook is not run from Notebook/.
JSONL_FILE = Path("..", "output", "documents.jsonl")    # JSONL file read by the loading cell
CHROMA_DB_PATH = Path("..", "output", "chroma_store")   # directory given to the persistent Chroma client
COLLECTION_NAME = "card_docs"                           # name of the collection that is fetched or created

In [3]:
# Embedding model
# This name is passed as `model_name` to SentenceTransformerEmbeddingFunction
# in the cell that creates the collection.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"


In [4]:
# Initialize Chroma with persistent storage
# The client is pointed at CHROMA_DB_PATH; the Path object is converted to a
# plain string before being passed as `path`.
client = chromadb.PersistentClient(path=str(CHROMA_DB_PATH))

In [5]:
# Build the embedding function from the configured model name, then fetch the
# collection by name (creating it when it does not exist yet) with that
# embedding function attached.
embedding_fn = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=EMBEDDING_MODEL_NAME)
collection = client.get_or_create_collection(name=COLLECTION_NAME, embedding_function=embedding_fn)

In [6]:
# Load JSONL and insert into Chroma
#
# Records are buffered and written BATCH_SIZE at a time, so each
# collection.add() call receives many documents instead of a single one.
BATCH_SIZE = 256
METADATA_FIELDS = ("card", "date", "filename", "path", "page", "chunk_index")


def _add_batch(target, pending):
    """Insert buffered ``(chunk_id, record)`` pairs with a single ``add()`` call.

    Args:
        target: Object exposing ``add(ids=..., documents=..., metadatas=...)``;
            in this notebook, the Chroma collection.
        pending: List of ``(chunk_id, record)`` tuples. Every record must
            contain ``"text"`` and each key listed in ``METADATA_FIELDS``.

    Raises:
        KeyError: If a record is missing ``"text"`` or a metadata field.
    """
    target.add(
        ids=[chunk_id for chunk_id, _ in pending],
        documents=[record["text"] for _, record in pending],
        metadatas=[
            {field: record[field] for field in METADATA_FIELDS}
            for _, record in pending
        ],
    )


seen_ids = set()  # chunk IDs already queued from this file
pending = []      # (chunk_id, record) pairs waiting for the next add()

with open(JSONL_FILE, "r", encoding="utf-8") as f:
    for line_no, line in enumerate(f, start=1):
        # Blank lines (e.g. a trailing newline at end of file) are not records;
        # json.loads would raise on them.
        if not line.strip():
            continue

        try:
            record = json.loads(line)
        except json.JSONDecodeError as exc:
            # Re-raise with the file and line number so the bad record can be found.
            raise ValueError(
                f"{JSONL_FILE}: invalid JSON on line {line_no}: {exc}"
            ) from exc

        # Unique ID for each chunk
        chunk_id = f"{record['card']}_{record['date']}_{record['filename']}_page{record['page']}_chunk{record['chunk_index']}"

        # A batched add() must not contain the same ID twice, so only the first
        # occurrence of an ID in the file is kept.
        # NOTE(review): per the Chroma docs, add() ignores IDs that already exist
        # in the collection, so this should match the one-add-per-record
        # behaviour -- confirm for the installed version.
        if chunk_id in seen_ids:
            continue
        seen_ids.add(chunk_id)

        pending.append((chunk_id, record))
        if len(pending) >= BATCH_SIZE:
            _add_batch(collection, pending)
            pending = []

# Write whatever is left over after the last full batch.
if pending:
    _add_batch(collection, pending)
    pending = []

print(f"✅ All chunks loaded into Chroma at {CHROMA_DB_PATH.resolve()}")

✅ All chunks loaded into Chroma at C:\Users\soumy\OneDrive\Documents\IntelligentCardSelectorEngine\output\chroma_store
