# ChromaDB Basic

- https://www.perplexity.ai/search/in-vector-database-chroma-is-e-fT1gti.vRrSmTl1BWAZtyQ#0
- https://cookbook.chromadb.dev/core/api/

In [1]:
import chromadb

# Create a client (this is your database connection)
client = chromadb.Client()

# Create a collection (think of it like a table)
collection = client.create_collection(name="my_first_collection")


# Add some documents
collection.add(
    documents=[
        "The cat sat on the mat",
        "The dog played in the park",
        "Python is a programming language"
    ],
    ids=["doc1", "doc2", "doc3"]
)

# Search for similar documents
results = collection.query(
    query_texts=["Tell me about animals"],
    n_results=2
)

print('Showing results')
print(results)

Showing results
{'ids': [['doc2', 'doc3']], 'embeddings': None, 'documents': [['The dog played in the park', 'Python is a programming language']], 'uris': None, 'included': ['metadatas', 'documents', 'distances'], 'data': None, 'metadatas': [[None, None]], 'distances': [[1.378471851348877, 1.6124422550201416]]}


In [2]:
client = chromadb.Client()
collection = client.create_collection(name="test_embeddings")

# Add documents
collection.add(
    documents=["The cat sat on the mat"],
    ids=["doc1"]
)

# Let's peek at what ChromaDB actually stored
results = collection.get(
    ids=["doc1"],
    include=["embeddings", "documents"]
)

print("Document:", results['documents'])
print("\nEmbedding (first 10 numbers):", results['embeddings'][0][:10])
print("Embedding length:", len(results['embeddings'][0]))

Document: ['The cat sat on the mat']

Embedding (first 10 numbers): [ 0.13040181 -0.01187013 -0.02811698  0.05123861 -0.05597446  0.03019161
  0.03016139  0.02469836 -0.01837054  0.05876685]
Embedding length: 384


In [3]:
import chromadb.api

chromadb.api.client.SharedSystemClient.clear_system_cache()

import chromadb

# This creates a local database folder
client = chromadb.PersistentClient(path="./data/my_chroma_db")

collection = client.get_or_create_collection(name="persistent_collection")

collection.add(
    documents=["This will be saved to disk"],
    ids=["doc1"]
)

print("Data saved to /data/my_chroma_db folder")


Data saved to /data/my_chroma_db folder


In [4]:
import chromadb

# This creates a local database folder
client = chromadb.PersistentClient(path="./data/my_chroma_db")

collection = client.get_or_create_collection(name="persistent_collection")

# Search for similar documents
results = collection.query(
    query_texts=["Tell me about animals"],
    n_results=2
)

print('Showing results')
print(results)

Showing results
{'ids': [['doc1']], 'embeddings': None, 'documents': [['This will be saved to disk']], 'uris': None, 'included': ['metadatas', 'documents', 'distances'], 'data': None, 'metadatas': [[None]], 'distances': [[1.9052947759628296]]}


In [5]:
import chromadb

client = chromadb.PersistentClient(path="./data/my_chroma_db")
collection = client.get_or_create_collection(name="persistent_collection")

# Let's see what's actually IN the collection
all_docs = collection.get()
print("Documents in collection:", all_docs['documents'])
print("Number of documents:", len(all_docs['documents']))

Documents in collection: ['This will be saved to disk']
Number of documents: 1


In [6]:
import chromadb

client = chromadb.Client()
collection = client.create_collection(name="animal_test")

collection.add(
    documents=[
        "The cat sat on the mat",
        "The dog played in the park",
        "Python is a programming language"
    ],
    ids=["doc1", "doc2", "doc3"]
)

# Let's try different queries
queries = [
    "Tell me about animals",
    "cat",
    "pets and animals"
]

for query in queries:
    results = collection.query(query_texts=[query], n_results=3)
    print(f"\n--- Query: '{query}' ---")
    for i, (doc, distance) in enumerate(zip(results['documents'][0], results['distances'][0])):
        print(f"{i+1}. Distance: {distance:.4f} - '{doc}'")


--- Query: 'Tell me about animals' ---
1. Distance: 1.3785 - 'The dog played in the park'
2. Distance: 1.6124 - 'Python is a programming language'
3. Distance: 1.8603 - 'The cat sat on the mat'

--- Query: 'cat' ---
1. Distance: 0.9752 - 'The cat sat on the mat'
2. Distance: 1.5271 - 'Python is a programming language'
3. Distance: 1.6214 - 'The dog played in the park'

--- Query: 'pets and animals' ---
1. Distance: 1.1869 - 'The dog played in the park'
2. Distance: 1.5547 - 'The cat sat on the mat'
3. Distance: 1.6321 - 'Python is a programming language'


In [7]:
import chromadb

client = chromadb.Client()
collection = client.create_collection(name="word_test")

# Let's test single words vs. sentences
collection.add(
    documents=[
        "cat",
        "dog", 
        "animal",
        "The cat sat on the mat",
        "Tell me about animals"
    ],
    ids=["word_cat", "word_dog", "word_animal", "sentence_cat", "question_animals"]
)

# Compare these queries
test_queries = ["animal", "animals", "cat"]

for query in test_queries:
    results = collection.query(query_texts=[query], n_results=5)
    print(f"\n--- Query: '{query}' ---")
    for doc, dist in zip(results['documents'][0], results['distances'][0]):
        print(f"  {dist:.4f} - '{doc}'")


--- Query: 'animal' ---
  0.0000 - 'animal'
  0.3688 - 'dog'
  0.6511 - 'cat'
  0.7057 - 'Tell me about animals'
  1.5166 - 'The cat sat on the mat'

--- Query: 'animals' ---
  0.3075 - 'animal'
  0.4693 - 'Tell me about animals'
  0.7699 - 'dog'
  0.9449 - 'cat'
  1.6997 - 'The cat sat on the mat'

--- Query: 'cat' ---
  0.0000 - 'cat'
  0.6511 - 'animal'
  0.6787 - 'dog'
  0.9752 - 'The cat sat on the mat'
  1.3497 - 'Tell me about animals'


In [8]:
from dotenv import load_dotenv
import os
from huggingface_hub import login # Or other HF libraries

load_dotenv() # Loads variables from .env
hf_token = os.getenv("HF_TOKEN")


In [9]:

# Now use the token with HF libraries, e.g.:
login(token=hf_token)
# Or directly in from_pretrained:
# model = AutoModel.from_pretrained("meta-llama/Llama-2-7b-chat-hf", token=hf_token)


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [10]:
import sentence_transformers
print(sentence_transformers.__version__)

5.2.2


In [11]:
import chromadb
from chromadb.utils import embedding_functions

# Example: Using OpenAI embeddings (you'll replace this with your own)
# openai_ef = embedding_functions.OpenAIEmbeddingFunction(
#     api_key="your-api-key",
#     model_name="text-embedding-ada-002"
# )

# Or using SentenceTransformers with a different model
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="paraphrase-MiniLM-L6-v2"
)

client = chromadb.Client()
collection = client.create_collection(
    name="custom_embeddings",
    embedding_function=sentence_transformer_ef
)

collection.add(
    documents=["The cat sat on the mat"],
    ids=["doc1"]
)

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/paraphrase-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [12]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

text = "The cat sat on the mat"

# Step 1: Tokenization
tokens = tokenizer.encode(text)
print("Tokens (IDs):", tokens)
print("Tokens (text):", [tokenizer.decode([t]) for t in tokens])

# Step 2: What happens inside the model?
input_ids = torch.tensor([tokens])
with torch.no_grad():
    outputs = model(input_ids, output_hidden_states=True)

print("\nWhat's inside the model:")
print("- Number of hidden layers:", len(outputs.hidden_states))
print("- Shape of last hidden state:", outputs.hidden_states[-1].shape)
print("- That means: [batch_size, sequence_length, embedding_dimension]")

Loading weights:   0%|          | 0/148 [00:00<?, ?it/s]

GPT2LMHeadModel LOAD REPORT from: gpt2
Key                  | Status     |  | 
---------------------+------------+--+-
h.{0...11}.attn.bias | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Tokens (IDs): [464, 3797, 3332, 319, 262, 2603]
Tokens (text): ['The', ' cat', ' sat', ' on', ' the', ' mat']

What's inside the model:
- Number of hidden layers: 13
- Shape of last hidden state: torch.Size([1, 6, 768])
- That means: [batch_size, sequence_length, embedding_dimension]


In [13]:
import chromadb
from chromadb.utils import embedding_functions
import os

load_dotenv() # Loads variables from .env
openai_token = os.getenv("OPENAI_TOKEN")

# Set your API key
os.environ["OPENAI_API_KEY"] = openai_token  # Replace with your actual key

# Create OpenAI embedding function
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    api_key=os.environ["OPENAI_API_KEY"],
    model_name="text-embedding-3-small"  # or "text-embedding-3-large"
)

# Create persistent client with OpenAI embeddings
client = chromadb.PersistentClient(path="./data/openai_chroma_db")

collection = client.get_or_create_collection(
    name="openai_collection",
    embedding_function=openai_ef
)

# Add documents
collection.add(
    documents=[
        "The cat sat on the mat",
        "The dog played in the park",
        "Python is a programming language"
    ],
    ids=["doc1", "doc2", "doc3"]
)

# Query
results = collection.query(
    query_texts=["Tell me about animals"],
    #n_results=2
)

print("Results:", results['documents'])
print("Distances:", results['distances'])

Results: [['The dog played in the park', 'The cat sat on the mat', 'Python is a programming language']]
Distances: [[0.7549846768379211, 0.7731590270996094, 0.8642709255218506]]


In [14]:
import chromadb

client = chromadb.PersistentClient(path="./data/test_db")
collection = client.get_or_create_collection(name="test")

collection.add(
    documents=["The cat sat on the mat", "The dog played"],
    ids=["doc1", "doc2"]
)

# Get the raw data
result = collection.get(ids=["doc1"], include=["embeddings"])
print("Vector for doc1 (first 5 numbers):", result['embeddings'][0][:5])
print("Vector length:", len(result['embeddings'][0]))

# Now query
query_result = collection.query(
    query_texts=["cat"],
    n_results=2,
    include=["embeddings", "distances"]
)
print("\nDistance to doc1:", query_result['distances'][0][0])
print("Distance to doc2:", query_result['distances'][0][1])

Vector for doc1 (first 5 numbers): [ 0.13040181 -0.01187013 -0.02811698  0.05123861 -0.05597446]
Vector length: 384

Distance to doc1: 0.9752495288848877
Distance to doc2: 1.356661319732666


- https://claude.ai/share/c86070ea-4fa0-4c73-bcf2-f3f635f042f0

## End