In [1]:
# Toy knowledge base: each record carries a string id, the text to embed,
# and the URL the text is attributed to.
documents = [
    dict(id=doc_id, content=content, source=source)
    for doc_id, content, source in (
        (
            "1",
            "Contact the admissions office at admissions@university.edu.",
            "https://university.edu/admissions",
        ),
        (
            "2",
            "Enrollment deadline is June 30.",
            "https://university.edu/enrollment",
        ),
        (
            "3",
            "The university offers Data Science, AI, and Robotics programs.",
            "https://university.edu/programs",
        ),
    )
]

In [2]:
# Quick sanity check: how many records were defined and what one looks like.
print(
    f"Number of documents: {len(documents)}",
    f"First document: {documents[0]}",
    sep="\n",
)

Number of documents: 3
First document: {'id': '1', 'content': 'Contact the admissions office at admissions@university.edu.', 'source': 'https://university.edu/admissions'}


In [3]:
from sentence_transformers import SentenceTransformer

# Identifier of the pretrained sentence-embedding model to load.
embedder_name = "all-MiniLM-L6-v2"

# trust_remote_code is intentionally left at its default (False): enabling it
# lets the loader execute Python shipped in the model repository, and this
# stock checkpoint should not need any custom code to load.
embedder = SentenceTransformer(embedder_name)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import chromadb
from chromadb.config import Settings

# On-disk location of the Chroma store.
db_directory = "data/db"

# PersistentClient actually writes the index under db_directory. Passing only
# Settings(persist_directory=...) to chromadb.Client leaves is_persistent at
# its default of False, so the store would live in memory and be lost when the
# kernel stops.
chroma_client = chromadb.PersistentClient(path=db_directory)
collection = chroma_client.get_or_create_collection("autonomous-agents")

# Embed all documents in one batched call instead of one encode() per record.
doc_texts = [doc["content"] for doc in documents]
doc_embeddings = embedder.encode(doc_texts).tolist()

# upsert (rather than add) keeps this cell idempotent: re-running it, or
# running it against an already-populated store, overwrites the records with
# the same ids instead of colliding with them.
collection.upsert(
    ids=[doc["id"] for doc in documents],
    documents=doc_texts,
    metadatas=[{"source": doc["source"]} for doc in documents],
    embeddings=doc_embeddings,
)

In [5]:
# Embed the question with the same encoder that produced the stored vectors.
query = "When is the enrollment deadline?"
query_embedding = embedder.encode(query).tolist()

# Ask the collection for the single closest match; the bare name on the last
# line makes the notebook display the raw result.
search_results = collection.query(
    n_results=1,
    query_embeddings=[query_embedding],
)
search_results

{'ids': [['2']],
 'embeddings': None,
 'documents': [['Enrollment deadline is June 30.']],
 'uris': None,
 'data': None,
 'metadatas': [[{'source': 'https://university.edu/enrollment'}]],
 'distances': [[0.24081303179264069]],
 'included': [<IncludeEnum.distances: 'distances'>,
  <IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}