## Exercise 3 Solution: Chroma with Advanced Filtering


In [None]:
# !pip install -q chromadb sentence-transformers

In [1]:
import chromadb
from chromadb.utils import embedding_functions

In [3]:
# -----------------------------
# Setup Chroma Client
# -----------------------------
client = chromadb.PersistentClient(path="./chroma_db")  # Persistent storage

# Use SentenceTransformer embeddings
embedding_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2"
)

In [4]:
# Create collection
collection = client.get_or_create_collection(
    name="advanced_documents",
    embedding_function=embedding_fn,
    metadata={"description": "30-document collection with rich metadata"}
)
print(f"Collection created: {collection.name}")

Collection created: advanced_documents


In [5]:
# -----------------------------
# Define 30 Documents (Module 4 style)
# -----------------------------
documents = [
    "Python is a versatile programming language used for web development and data science.",
    "Machine learning models require large amounts of training data to perform well.",
    "Neural networks are inspired by the structure of the human brain.",
    "Natural language processing enables computers to understand human language.",
    "Deep learning is a subset of machine learning using multi-layered neural networks.",
    "Data visualization helps communicate insights from complex datasets.",
    "Cloud computing provides on-demand access to computing resources.",
    "Cybersecurity protects systems and networks from digital attacks.",
    "Blockchain technology enables secure, decentralized transactions.",
    "Quantum computing uses quantum mechanics to solve complex problems.",
    "Artificial intelligence powers recommendation systems and chatbots.",
    "Python libraries like pandas and NumPy simplify data analysis.",
    "Computer vision allows machines to interpret visual information.",
    "Reinforcement learning is used for robotics and game AI.",
    "Supervised learning uses labeled datasets to train models.",
    "Unsupervised learning finds hidden patterns in data without labels.",
    "Transfer learning enables pre-trained models to adapt to new tasks.",
    "Generative AI can create images, text, and music from prompts.",
    "Semantic search improves information retrieval using embeddings.",
    "Vector databases store embeddings for fast similarity search.",
    "FAISS is ideal for local vector search and prototyping.",
    "Chroma provides easy-to-use embedded vector storage.",
    "Pinecone is a cloud-based managed vector database.",
    "Weaviate offers flexible self-hosted vector database solutions.",
    "Qdrant enables fast vector search with filtering capabilities.",
    "Milvus is scalable for enterprise-level vector storage needs.",
    "SentenceTransformers generate embeddings from text efficiently.",
    "RAG systems combine retrieval with language models for answers.",
    "Metadata filters help narrow down search results effectively.",
    "Maximal Marginal Relevance ensures diverse and relevant retrieval."
]

In [6]:
# Corresponding metadata for each document
metadatas = [
    {"category": "tech", "date": "2024-01-01", "author": "Alice", "priority": 1},
    {"category": "AI", "date": "2024-01-02", "author": "Bob", "priority": 2},
    {"category": "AI", "date": "2024-01-03", "author": "Charlie", "priority": 2},
    {"category": "AI", "date": "2024-01-04", "author": "Diana", "priority": 1},
    {"category": "AI", "date": "2024-01-05", "author": "Eve", "priority": 1},
    {"category": "data", "date": "2024-01-06", "author": "Alice", "priority": 2},
    {"category": "cloud", "date": "2024-01-07", "author": "Bob", "priority": 2},
    {"category": "security", "date": "2024-01-08", "author": "Charlie", "priority": 1},
    {"category": "blockchain", "date": "2024-01-09", "author": "Diana", "priority": 1},
    {"category": "quantum", "date": "2024-01-10", "author": "Eve", "priority": 3},
    {"category": "AI", "date": "2024-01-11", "author": "Alice", "priority": 1},
    {"category": "data", "date": "2024-01-12", "author": "Bob", "priority": 2},
    {"category": "vision", "date": "2024-01-13", "author": "Charlie", "priority": 2},
    {"category": "AI", "date": "2024-01-14", "author": "Diana", "priority": 1},
    {"category": "AI", "date": "2024-01-15", "author": "Eve", "priority": 1},
    {"category": "AI", "date": "2024-01-16", "author": "Alice", "priority": 2},
    {"category": "AI", "date": "2024-01-17", "author": "Bob", "priority": 2},
    {"category": "AI", "date": "2024-01-18", "author": "Charlie", "priority": 1},
    {"category": "search", "date": "2024-01-19", "author": "Diana", "priority": 1},
    {"category": "database", "date": "2024-01-20", "author": "Eve", "priority": 3},
    {"category": "FAISS", "date": "2024-01-21", "author": "Alice", "priority": 1},
    {"category": "Chroma", "date": "2024-01-22", "author": "Bob", "priority": 2},
    {"category": "Pinecone", "date": "2024-01-23", "author": "Charlie", "priority": 2},
    {"category": "Weaviate", "date": "2024-01-24", "author": "Diana", "priority": 1},
    {"category": "Qdrant", "date": "2024-01-25", "author": "Eve", "priority": 1},
    {"category": "Milvus", "date": "2024-01-26", "author": "Alice", "priority": 3},
    {"category": "embedding", "date": "2024-01-27", "author": "Bob", "priority": 1},
    {"category": "RAG", "date": "2024-01-28", "author": "Charlie", "priority": 2},
    {"category": "metadata", "date": "2024-01-29", "author": "Diana", "priority": 2},
    {"category": "MMR", "date": "2024-01-30", "author": "Eve", "priority": 1},
]


In [7]:
# Document IDs
ids = [f"doc_{i+1}" for i in range(30)]

In [8]:
ids

['doc_1',
 'doc_2',
 'doc_3',
 'doc_4',
 'doc_5',
 'doc_6',
 'doc_7',
 'doc_8',
 'doc_9',
 'doc_10',
 'doc_11',
 'doc_12',
 'doc_13',
 'doc_14',
 'doc_15',
 'doc_16',
 'doc_17',
 'doc_18',
 'doc_19',
 'doc_20',
 'doc_21',
 'doc_22',
 'doc_23',
 'doc_24',
 'doc_25',
 'doc_26',
 'doc_27',
 'doc_28',
 'doc_29',
 'doc_30']

In [10]:
# -----------------------------
# Add Documents to Collection
# -----------------------------
collection.add(
    documents=documents,
    metadatas=metadatas,
    ids=ids
)
print(f"Added {len(documents)} documents to Chroma collection")
print(f"Total documents: {collection.count()}")

Added 30 documents to Chroma collection
Total documents: 30


In [11]:
# -----------------------------
# Example Queries
# -----------------------------
query = "latest technology updates"

# --- No filter ---
results_no_filter = collection.query(query_texts=[query], n_results=3)
print("\n--- Query without filters ---")
for i, (doc, metadata, distance) in enumerate(zip(
    results_no_filter['documents'][0],
    results_no_filter['metadatas'][0],
    results_no_filter['distances'][0]
), 1):
    print(f"{i}. {doc} (Category: {metadata['category']}, Distance: {distance:.4f})")


--- Query without filters ---
1. Milvus is scalable for enterprise-level vector storage needs. (Category: Milvus, Distance: 0.7019)
2. FAISS is ideal for local vector search and prototyping. (Category: FAISS, Distance: 0.7891)
3. Chroma provides easy-to-use embedded vector storage. (Category: Chroma, Distance: 0.7901)


In [12]:
# --- Filter by category ---
results_category = collection.query(query_texts=[query], n_results=3, where={"category": "tech"})
print("\n--- Query with filter: category='tech' ---")
for i, (doc, metadata, distance) in enumerate(zip(
    results_category['documents'][0],
    results_category['metadatas'][0],
    results_category['distances'][0]
), 1):
    print(f"{i}. {doc} (Category: {metadata['category']}, Distance: {distance:.4f})")



--- Query with filter: category='tech' ---
1. Python is a versatile programming language used for web development and data science. (Category: tech, Distance: 0.9424)


In [13]:
# --- Combined filter (category + date) ---
results_combined = collection.query(
    query_texts=[query],
    n_results=3,
    where={"category": "tech", "date": {"$gte": "2024-01-01"}}
)
print("\n--- Query with filter: category='tech' AND date>='2024-01-01' ---")
for i, (doc, metadata, distance) in enumerate(zip(
    results_combined['documents'][0],
    results_combined['metadatas'][0],
    results_combined['distances'][0]
), 1):
    print(f"{i}. {doc} (Category: {metadata['category']}, Date: {metadata['date']}, Distance: {distance:.4f})")

ValueError: Expected where to have exactly one operator, got {'category': 'tech', 'date': {'$gte': '2024-01-01'}} in query.