In [1]:
# 📦 Install dependencies
%pip install chromadb python-dotenv openai


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


# 🔌 OpenAI Embedding Example with ChromaDB (Persistent)
This notebook demonstrates how to embed documents using OpenAI and store them in ChromaDB for semantic search.

## 1️⃣ Load Environment & Initialize Embedding

In [2]:
import os
import chromadb
from dotenv import load_dotenv
from chromadb.utils import embedding_functions

load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")

# Set up OpenAI embedding function
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    api_key=openai_api_key,
    model_name="text-embedding-3-small"
)

## 2️⃣ Create Persistent Client & Collection

In [None]:
croma_client = chromadb.PersistentClient(path="./db/chroma_persist")

collection = croma_client.get_or_create_collection(
    name="my_story_openai",
    embedding_function=openai_ef,
)

ValueError: Embedding function name mismatch: openai != default

## 3️⃣ Add Documents to Collection

In [None]:
documents = [
    {"id": "doc1", "text": "Hello, world!"},
    {"id": "doc2", "text": "How are you today?"},
    {"id": "doc3", "text": "Goodbye, see you later!"},
    {"id": "doc4", "text": "Microsoft is a technology company that develops software. It was founded by Bill Gates and Paul Allen in 1975."},
    {"id": "doc5", "text": "Artificial Intelligence (AI) refers to the simulation of human intelligence in machines."},
    {"id": "doc6", "text": "Machine Learning (ML) allows computers to learn from data and make predictions."},
    {"id": "doc7", "text": "Deep Learning uses neural networks with many layers to analyze data."},
    {"id": "doc8", "text": "Natural Language Processing (NLP) helps computers understand human language."},
    {"id": "doc9", "text": "AI includes Narrow AI and General AI, depending on the task scope."},
    {"id": "doc10", "text": "Computer Vision enables understanding of visual data."},
    {"id": "doc11", "text": "Reinforcement Learning involves agents learning via rewards."},
    {"id": "doc12", "text": "The Turing Test checks if a machine can behave like a human."},
]

for doc in documents:
    collection.upsert(ids=doc["id"], documents=[doc["text"]])

## 4️⃣ Query for Similar Documents

In [None]:
query_text = "find document related to Turing Test"

results = collection.query(
    query_texts=[query_text],
    n_results=3,
)

for idx, document in enumerate(results["documents"][0]):
    doc_id = results["ids"][0][idx]
    distance = results["distances"][0][idx]
    print(f"{idx+1}. {document} (ID: {doc_id}, Distance: {distance})")