# ChromaDB

Introduction to ChromaDB

| Date | User | Change Type | Remarks |  
| ---- | ---- | ----------- | ------- |
| 16/02/2026   | Martin | Created   | Introduction to ChromaDB. Started on News Article query | 
| 22/02/2026   | Martin | Update   | Sample movie recommender system | 

# Content

* [Introduction](#introduction)
* [News Article Query](#news-article-query)
* [Persistent DB / Server](#persistent-db--server)
* [Sample Movie Recommender](#sample-movie-recommender)

# Introduction

Creating a simple collection and querying from it

In [1]:
import chromadb
from pprint import pprint
# Default client used by the introductory examples.
# NOTE(review): persistence is covered separately in "Persistent DB / Server",
# so this client presumably keeps its data in memory only — confirm.
chroma_client = chromadb.Client()

In [None]:
# Create the collection, or reuse it when this cell is re-run in the same
# session (a plain create_collection call fails if the name already exists).
collection = chroma_client.get_or_create_collection(name="my_collection")

# Only ids and raw text are supplied; no embeddings or metadata are passed in.
collection.add(
  ids=['id1', 'id2', 'id3', 'id4'],
  documents=[
    "This is a document about pineapples",
    "This is a document about oranges",
    "This is a document about strawberries",
    "This is a document about durians"
  ]
)

In [None]:
# Search the collection with a free-text query and keep the 2 closest documents.
results = collection.query(
  query_texts=["This is a query document about Singapore"],
  n_results=2,
  # Optional filters (uncomment to apply):
  # where={"metadata_field": "is_equal_to_this"},   # filter on a metadata field
  # where_document={"$contains": "pineapples"}      # filter on the document text
)
# Pretty-print the full result dictionary.
pprint(results)

{'data': None,
 'distances': [[1.1837674379348755, 1.2627590894699097]],
 'documents': [['This is a document about oranges',
                'This is a document about durians']],
 'embeddings': None,
 'ids': [['id2', 'id4']],
 'included': ['metadatas', 'documents', 'distances'],
 'metadatas': [[None, None]],
 'uris': None}


---

# News Article Query

Simple use case to query from a collection of news articles

In [2]:
import polars as pl
import chromadb.utils.embedding_functions as embedding_functions

In [4]:
# Load the news articles, then prepend a 1-based "index" column
articles = pl.read_csv("Articles.csv", encoding="ISO-8859-1")
articles = articles.with_row_index(offset=1)
articles.head()

index,Article,Date,Heading,NewsType
u32,str,str,str,str
1,"""KARACHI: The Sindh government …","""1/1/2015""","""sindh govt decides to cut publ…","""business"""
2,"""HONG KONG: Asian markets start…","""1/2/2015""","""asia stocks up in new year tra…","""business"""
3,"""HONG KONG: Hong Kong shares o…","""1/5/2015""","""hong kong stocks open 0.66 per…","""business"""
4,"""HONG KONG: Asian markets tumbl…","""1/6/2015""","""asian stocks sink euro near ni…","""business"""
5,"""NEW YORK: US oil prices Monday…","""1/6/2015""","""us oil prices slip below 50 a …","""business"""


In [None]:
# Embedding function backed by OpenAI's "text-embedding-3-small" model.
# The API key is looked up from the environment variable named API_KEY.
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
  api_key_env_var="API_KEY",
  model_name="text-embedding-3-small"
)

In [None]:
# Embed the first N articles up front, then store text, ids and vectors
# together in the "articles" collection
N = 50
articles_sub = articles["Article"].head(N).to_list()
ids = ["id" + str(row_no) for row_no in articles["index"].head(N).to_list()]
vectors = openai_ef(articles_sub)

collection = chroma_client.get_or_create_collection(name="articles")
collection.add(ids=ids, embeddings=vectors, documents=articles_sub)
collection.count()

50

In [None]:
# Query the collection
# The query is embedded with the same OpenAI function that produced the stored
# article vectors, and the search is done by vector rather than by text.
query = "Public transport fares by 7 per cent"
query_emb = openai_ef([query])

# Return the 3 nearest articles.
collection.query(
  query_embeddings=query_emb,
  # NOTE(review): text queries are left disabled. The "articles" collection was
  # created without an embedding function, so query_texts would presumably be
  # embedded by a different model than the stored vectors — confirm.
  # query_texts=[query],
  n_results=3
)

{'ids': [['id1', 'id24', 'id29']],
 'embeddings': None,
 'documents': [['KARACHI: The Sindh government has decided to bring down public transport fares by 7 per cent due to massive reduction in petroleum product prices by the federal government, Geo News reported.Sources said reduction in fares will be applicable on public transport, rickshaw, taxi and other means of traveling.Meanwhile, Karachi Transport Ittehad (KTI) has refused to abide by the government decision.KTI President Irshad Bukhari said the commuters are charged the lowest fares in Karachi as compare to other parts of the country, adding that 80pc vehicles run on Compressed Natural Gas (CNG). Bukhari said Karachi transporters will cut fares when decrease in CNG prices will be made.                        \n\n\n\n\n\n\n\n\n\n\n',
   'ISLAMABAD: In a move to give relief to consumers, sources in the Finance Ministry said on Tuesday that the price of petrol and petroleum products are expected to decrease further from February 

---

# Persistent DB / Server

How to persist the database on disk so that the data survives between sessions.

In [None]:
# Persistent DB: client whose storage location is the local ./vectordb directory
client = chromadb.PersistentClient(path="./vectordb")

In [None]:
# Create the collection on the persistent `client` (not on `chroma_client`)
# so that the documents are written to the on-disk database.
collection = client.get_or_create_collection(name="documents")
collection.add(
  documents=[
    "This is a document about apples",
    "This is a document about oranges"
  ],
  ids=['id1', 'id2']
)

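To run Chroma as a standalone server instead, start it from a terminal with `chroma run --path /db_path`, then connect to it with an HTTP client: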

In [None]:
# Connect to a Chroma server listening on localhost:8000.
# NOTE(review): assumes the server was started with the `chroma run` command
# shown above and listens on port 8000 — confirm.
chroma_client = chromadb.HttpClient(host='localhost', port=8000)

---

# Sample Movie Recommender

In [None]:
import chromadb
from chromadb.utils import embedding_functions

In [None]:
# Sample data
# Each movie is a dict with four string fields:
#   id          - unique identifier ("m1" ... "m12")
#   title       - display title
#   genre       - one of "sci-fi", "action", "romance", "thriller"
#   description - one-sentence plot summary
movies = [
  {"id": "m1",  "title": "Inception",            "genre": "sci-fi",   "description": "A thief enters dreams to plant ideas in a corporate target's mind."},
  {"id": "m2",  "title": "Interstellar",          "genre": "sci-fi",   "description": "Astronauts travel through a wormhole near Saturn to find a new home for humanity."},
  {"id": "m3",  "title": "The Matrix",            "genre": "sci-fi",   "description": "A hacker discovers reality is a simulation and joins a rebellion against machines."},
  {"id": "m4",  "title": "The Dark Knight",       "genre": "action",   "description": "Batman faces the Joker, a criminal mastermind who wants to plunge Gotham into anarchy."},
  {"id": "m5",  "title": "John Wick",             "genre": "action",   "description": "A retired hitman seeks vengeance after criminals kill his dog and steal his car."},
  {"id": "m6",  "title": "Mad Max: Fury Road",    "genre": "action",   "description": "In a post-apocalyptic wasteland, a woman rebels against a tyrannical ruler."},
  {"id": "m7",  "title": "The Notebook",          "genre": "romance",  "description": "A poor young man and a rich young woman fall in love during the 1940s."},
  {"id": "m8",  "title": "Pride and Prejudice",   "genre": "romance",  "description": "Elizabeth Bennet navigates issues of manners, marriage, and love in Georgian England."},
  {"id": "m9",  "title": "Eternal Sunshine",      "genre": "romance",  "description": "A couple undergoes a procedure to erase memories of each other after a painful breakup."},
  {"id": "m10", "title": "Parasite",              "genre": "thriller", "description": "A poor family schemes their way into working for a wealthy household with dark consequences."},
  {"id": "m11", "title": "Gone Girl",             "genre": "thriller", "description": "A man becomes the prime suspect when his wife mysteriously disappears on their anniversary."},
  {"id": "m12", "title": "Get Out",               "genre": "thriller", "description": "A Black man uncovers disturbing secrets when visiting his white girlfriend's family estate."},
]


In [None]:
# Setup client: the movie database is stored on disk under ./movies_db
client = chromadb.PersistentClient(path="./movies_db")

# Local sentence-transformers model used to embed the movie descriptions
ef = embedding_functions.SentenceTransformerEmbeddingFunction(
  model_name="all-MiniLM-L6-v2"
)

# The keyword is `embedding_function` (singular).
# Cosine space is requested so that `1 - distance` can be read as a similarity.
collection = client.get_or_create_collection(
  name="movies",
  embedding_function=ef,
  metadata={"hnsw:space": "cosine"}
)

# Ingesting data (only when the collection is still empty, so re-running the
# cell does not add the movies a second time)
if collection.count() == 0:
  collection.add(
    ids=[m['id'] for m in movies],
    documents=[m['description'] for m in movies],
    # The keyword is `metadatas` (plural): one metadata dict per document.
    metadatas=[{"title": m["title"], "genre": m["genre"]} for m in movies]
  )
  print(f"Integrated {len(movies)} movies.")


In [None]:
# Recommendation Function
def recommend_by_title(liked_title: str, n: int=3, genre_filter: str=None) -> list[dict]:
  """Recommend movies whose descriptions are closest to a movie the user liked.

  Args:
    liked_title: Exact title of a movie in ``movies``.
    n: Maximum number of recommendations to return.
    genre_filter: If given, only movies with this genre are considered.

  Returns:
    Up to ``n`` dicts with the keys "title", "genre", "similarity" and
    "description", in the order returned by the collection query. The liked
    movie itself is never included.

  Raises:
    ValueError: If no movie in ``movies`` has the title ``liked_title``.
  """
  # Find the corresponding movie
  source = next((m for m in movies if m['title'] == liked_title), None)
  if source is None:
    raise ValueError(f"Movie '{liked_title}' not found.")

  # Additional filter on genre
  where = {"genre": genre_filter} if genre_filter else None

  # Query the database. One extra hit is requested because the liked movie is
  # expected to match its own description and is dropped below.
  results = collection.query(
    query_texts=[source['description']],
    n_results=n + 1,
    where=where
  )

  # Clean results: index 0 selects the hits of the single query text
  recommendations = []
  for meta, doc, distance in zip(
    results['metadatas'][0],
    results['documents'][0],
    results['distances'][0]
  ):
    if meta['title'] == liked_title:
      continue

    recommendations.append({
      "title": meta['title'],
      "genre": meta['genre'],
      # Assumes the collection reports cosine distance, so 1 - distance is a similarity
      "similarity": round(1 - distance, 3),
      "description": doc
    })

  # The liked movie may have been filtered out by genre, leaving n + 1 hits
  return recommendations[:n]


In [None]:
def recommend_by_query(query: str, n: int=3) -> list[dict]:
  """Recommend movies from a free-text description of what the user wants.

  Args:
    query: Free-text description of the user's preference.
    n: Number of movies to request from the collection.

  Returns:
    A list of dicts with the keys "title", "genre", "similarity" and
    "description", in the order returned by the collection query.
  """
  results = collection.query(
    query_texts=[query],
    n_results=n
  )

  # Index 0 selects the hits of the single query text
  return [
    {
      "title": meta['title'],
      "genre": meta['genre'],
      # Assumes the collection reports cosine distance, so 1 - dist is a similarity
      "similarity": round(1 - dist, 3),
      "description": doc
    } for meta, doc, dist in zip(
      results['metadatas'][0],
      results['documents'][0],
      results['distances'][0]
    )
  ]


In [None]:
%load_ext watermark
%watermark