Imports and Loading Model

In [1]:
from sentence_transformers import SentenceTransformer, util
import chromadb

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the sentence-embedding model on the CPU; the indexing, query and
# reranking cells below all encode text through this one instance.
model = SentenceTransformer(
    "all-MiniLM-L6-v2",
    device="cpu",
)

Indexing Data

In [4]:
# Sample corpus: one short landmark/location fact per document.
documents = [
    "The Eiffel Tower is in Paris.",
    "The Colosseum is in Rome.",
    "The Statue of Liberty is in New York.",
    "The Burj Khalifa is in Dubai.",
]

# Single source of truth for the collection name used by delete + create below.
COLLECTION_NAME = "demo"

# Setup Chroma client and collection
chroma_client = chromadb.Client()

# Drop any existing collection so re-running this cell starts from a clean slate.
# Deleting a collection that does not exist raises; the exact exception class
# appears to differ between chromadb releases (TODO confirm for the installed
# version), so catch Exception rather than one specific type. A bare `except:`
# is avoided because it would also swallow KeyboardInterrupt / SystemExit.
try:
    chroma_client.delete_collection(COLLECTION_NAME)
except Exception:
    pass

# Now create fresh collection
collection = chroma_client.create_collection(COLLECTION_NAME)

# Add documents with embeddings. The embeddings are computed here with `model`
# and converted to plain Python lists before being handed to Chroma.
embeddings = model.encode(documents).tolist()
doc_ids = [str(i) for i in range(len(documents))]  # ids are the list positions as strings
collection.add(documents=documents, embeddings=embeddings, ids=doc_ids)

print("Indexed documents:", documents)

Indexed documents: ['The Eiffel Tower is in Paris.', 'The Colosseum is in Rome.', 'The Statue of Liberty is in New York.', 'The Burj Khalifa is in Dubai.']


Basic RAG Query

In [5]:
# The question to answer. It is encoded as a one-element batch, so the
# resulting `query_embedding` is a list containing a single vector.
query = "Where is the Eiffel Tower located?"
query_embedding = model.encode([query]).tolist()

# Ask the collection for the two nearest documents to that vector.
results = collection.query(
    n_results=2,
    query_embeddings=query_embedding,
)

# Index 0 selects the hits belonging to our single query.
print("Query Results:", results["documents"][0])

Query Results: ['The Eiffel Tower is in Paris.', 'The Burj Khalifa is in Dubai.']


Hybrid Search + Reranking

In [8]:
# Hybrid = keyword + semantic search (simple demo: keyword filter, then semantic rerank).
# Relies on `query`, `documents` and `model` defined in the earlier cells.

# Keyword stage: keep only documents that contain at least one of these terms
# (case-sensitive substring match).
keywords = ("Eiffel", "Paris")
keyword_matches = [doc for doc in documents if any(kw in doc for kw in keywords)]

# Semantic stage: score each surviving document against the query by cosine
# similarity. The query embedding does not change between documents, so it is
# computed once up front instead of once per candidate.
query_vector = model.encode(query)
scores = [
    (doc, util.cos_sim(query_vector, model.encode(doc)).item())
    for doc in keyword_matches
]

# Highest similarity first.
reranked = sorted(scores, key=lambda x: x[1], reverse=True)

print("\nHybrid Search + Rerank Results:")
for doc, score in reranked:
    print(f"Document: {doc}, Score: {score:.4f}")


Hybrid Seaarch + Rerank Results:
Document: The Eiffel Tower is in Paris., Score: 0.8500
