# ChromaDB

Introduction to ChromaDB

| Date | User | Change Type | Remarks |  
| ---- | ---- | ----------- | ------- |
| 16/02/2026   | Martin | Created   |  | 

# Content

* [Introduction](#introduction)
* [News Article Query](#news-article-query)

# Introduction

Create a simple in-notebook collection, add a few documents to it, and run a text query against it.

In [4]:
import chromadb
from pprint import pprint
# Chroma client shared by the collection cells below.
# NOTE(review): presumably Client() with no arguments keeps data in memory only,
# so collections disappear when the kernel restarts — confirm against the Chroma docs.
chroma_client = chromadb.Client()

In [None]:
# Create (or reuse) the demo collection and load four sample documents.
# get_or_create_collection keeps this cell re-runnable: create_collection
# fails if "my_collection" already exists on this client, which happens as
# soon as the cell is executed a second time in the same kernel.
collection = chroma_client.get_or_create_collection(name="my_collection")

# No embeddings are passed, so Chroma is left to embed the documents itself.
# NOTE(review): re-adding ids that already exist is expected to be a no-op —
# confirm for the installed chromadb version.
collection.add(
  ids=['id1', 'id2', 'id3', 'id4'],
  documents=[
    "This is a document about pineapples",
    "This is a document about oranges",
    "This is a document about strawberries",
    "This is a document about durians"
  ]
)

In [None]:
# Query the collection with free text and request two results.
# Optional filters accepted by `query` (left disabled here):
#   where={"metadata_field": "is_equal_to_this"}
#   where_document={"$contains": "pineapples"}
results = collection.query(
    n_results=2,
    query_texts=["This is a query document about Singapore"],
)
pprint(results)

{'data': None,
 'distances': [[1.1837674379348755, 1.2627590894699097]],
 'documents': [['This is a document about oranges',
                'This is a document about durians']],
 'embeddings': None,
 'ids': [['id2', 'id4']],
 'included': ['metadatas', 'documents', 'distances'],
 'metadatas': [[None, None]],
 'uris': None}


---

# News Article Query

In [None]:
import polars as pl
import chromadb.utils.embedding_functions as embedding_functions

In [9]:
# Read the ISO-8859-1 encoded CSV and prepend a row-index column starting at 1.
articles = (
    pl.read_csv("Articles.csv", encoding="ISO-8859-1")
    .with_row_index(offset=1)
)
articles.head()

index,Article,Date,Heading,NewsType
u32,str,str,str,str
1,"""KARACHI: The Sindh government …","""1/1/2015""","""sindh govt decides to cut publ…","""business"""
2,"""HONG KONG: Asian markets start…","""1/2/2015""","""asia stocks up in new year tra…","""business"""
3,"""HONG KONG: Hong Kong shares o…","""1/5/2015""","""hong kong stocks open 0.66 per…","""business"""
4,"""HONG KONG: Asian markets tumbl…","""1/6/2015""","""asian stocks sink euro near ni…","""business"""
5,"""NEW YORK: US oil prices Monday…","""1/6/2015""","""us oil prices slip below 50 a …","""business"""


In [None]:
# Embedding function backed by OpenAI's "text-embedding-3-small" model.
# The API key is not hardcoded; it is taken from the environment variable
# whose name is given by `api_key_env_var`.
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    model_name="text-embedding-3-small",
    api_key_env_var="API_KEY",
)

In [None]:
# Embed the first N articles with OpenAI and store them in the "articles" collection.
# N is the single knob for the subset size (it was previously defined but unused,
# and only the very first article was embedded and added).
N = 50
articles_sub = articles[:N]

# Plain Python lists are what both the embedding function and Chroma expect.
article_texts = articles_sub['Article'].to_list()
# Ids follow the 1-based `index` column, so the first article keeps the id 'id1'.
article_ids = [f"id{row_index}" for row_index in articles_sub['index'].to_list()]

# One batched embedding call for the whole subset.
# NOTE(review): assumes no article text is null/empty and each fits the model's
# input limit — confirm for the full dataset.
vectors = openai_ef(article_texts)

# get_or_create_collection keeps the cell re-runnable on the same client.
collection = chroma_client.get_or_create_collection(name="articles")
collection.add(
  documents=article_texts,
  ids=article_ids,
  embeddings=vectors
)

index,Article,Date,Heading,NewsType
u32,str,str,str,str
1,"""KARACHI: The Sindh government …","""1/1/2015""","""sindh govt decides to cut publ…","""business"""
2,"""HONG KONG: Asian markets start…","""1/2/2015""","""asia stocks up in new year tra…","""business"""
3,"""HONG KONG: Hong Kong shares o…","""1/5/2015""","""hong kong stocks open 0.66 per…","""business"""
4,"""HONG KONG: Asian markets tumbl…","""1/6/2015""","""asian stocks sink euro near ni…","""business"""
5,"""NEW YORK: US oil prices Monday…","""1/6/2015""","""us oil prices slip below 50 a …","""business"""


In [None]:
# Query the articles collection by embedding a free-text question with the
# same OpenAI embedding function that was used for the stored documents.
# An empty string cannot be embedded, so the query must be real text; edit the
# example below to search for something else.
query = "Asian stock markets at the start of the new year"
if not query.strip():
    raise ValueError("query must be a non-empty string")

query_emb = openai_ef([query])
# Last expression of the cell, so the query result is displayed as its output.
collection.query(
  query_embeddings=query_emb,
  n_results=2
)

In [None]:
# Load the `watermark` IPython extension and run it with default options.
# NOTE(review): presumably this records the run timestamp and Python/system
# details for reproducibility — confirm against the watermark docs.
%load_ext watermark
%watermark