In [None]:
– Vector space model and cosine similarity
 – SBERT for dense sentence embeddings
 – Fast retrieval using Faiss or Chroma
 – Convert text to sentence embeddings
 – Perform similarity-based retrieval from documents
 – Build a semantic search engine to return top matches

In [2]:
pip install chromadb sentence-transformers

Collecting chromadb
  Using cached chromadb-1.0.12-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Collecting fastapi==0.115.9 (from chromadb)
  Downloading fastapi-0.115.9-py3-none-any.whl.metadata (27 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-4.10.0-py3-none-any.whl.metadata (6.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.22.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting opentelemetry-api>=1.2.0 (from chromadb)
  Downloading opentelemetry_api-1.34.1-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.34.1-py3-none-any.whl.metadata (2.4 kB)
Collecting opentelemetry-instrumentation-fastapi>=0.41b0 (from chromadb)
  Downloading opentelemetry_instrumentation_fastapi-0.55b1-py3-none-any.whl.metadata (2.2 kB)
Collecting opentelemetry-sdk>=1.2.0 (from

In [None]:
# We will write a small set of documents, store their embeddings in ChromaDB, and then retrieve the documents most similar to a query.



In [1]:
# Step 1: Import libraries
import chromadb
from sentence_transformers import SentenceTransformer

# Step 2: Initialize Chroma in-memory DB
# get_or_create_collection (instead of create_collection) keeps this cell re-runnable:
# in a live kernel the collection from a previous execution may still exist, and
# create_collection would then fail because the name is already taken.
client = chromadb.Client()
collection = client.get_or_create_collection(name="my_collection")

# Step 3: Sample documents to index
docs = [
    "I love natural language processing",
    "Natural language processing is fascinating",
    "I enjoy machine learning and AI"
]

# Step 4: Convert docs to embeddings using Sentence-BERT
model = SentenceTransformer('all-MiniLM-L6-v2')  # fast + good
embeddings = model.encode(docs).tolist()  # list of lists

# Step 5: Store docs in Chroma
# upsert inserts new ids and overwrites existing ones, so executing the cell twice
# leaves exactly one entry per document instead of colliding on duplicate ids.
collection.upsert(
    documents=docs,
    embeddings=embeddings,
    ids=[f"doc{i}" for i in range(len(docs))]
)

# Step 6: Query Chroma with a new sentence
query = "I like AI and deep learning"
query_embedding = model.encode(query).tolist()

# Step 7: Search for top-2 similar documents
results = collection.query(
    query_embeddings=[query_embedding],
    n_results=2
)

# Step 8: Display results
# Chroma returns distances, not similarity scores: a LOWER value means a closer match,
# so the value is labelled "Distance" to avoid reading it as "higher is better".
print("Top similar documents:")
for doc, distance in zip(results['documents'][0], results['distances'][0]):
    print(f"Doc: {doc}, Distance: {distance:.4f}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Top similar documents:
Doc: I enjoy machine learning and AI, Score: 0.3536
Doc: I love natural language processing, Score: 1.0207


# The user queries the search engine, for example asking for the top quotes about life and motivation.

In [1]:
import numpy as np
from sentence_transformers import SentenceTransformer # For generating embeddings
import chromadb # Our vector database
import os # For managing directories
import shutil # For cleaning up directories

# --- 0. Setup: Install necessary libraries (if you haven't already) ---
# You would typically run these in your terminal:
# pip install sentence-transformers
# pip install chromadb
# pip install numpy

# --- 1. Define Our Document Corpus ---
# The searchable corpus: every entry is a quote followed by " - <author>".
documents = [
    "The only way to do great work is to love what you do. - Steve Jobs",
    "Innovation distinguishes between a leader and a follower. - Steve Jobs",
    "Stay hungry, stay foolish. - Steve Jobs",
    "Life is what happens when you're busy making other plans. - John Lennon",
    "Imagine all the people living life in peace. - John Lennon",
    "The future belongs to those who believe in the beauty of their dreams. - Eleanor Roosevelt",
    "Do one thing every day that scares you. - Eleanor Roosevelt",
    "The greatest glory in living lies not in never falling, but in rising every time we fall. - Nelson Mandela",
    "Education is the most powerful weapon which you can use to change the world. - Nelson Mandela",
    "That which does not kill us makes us stronger. - Friedrich Nietzsche",
    "He who has a why to live can bear almost any how. - Friedrich Nietzsche"
]

# Give each quote a stable, 1-based identifier ("doc_1", "doc_2", ...).
document_ids = [f"doc_{position}" for position in range(1, len(documents) + 1)]

# Echo the corpus so the reader can see which id maps to which quote.
print("Our Document Corpus:")
for doc_id, quote in zip(document_ids, documents):
    print(f"  {doc_id}: \"{quote}\"")
print("-" * 60)

# --- 2. Load an Embedding Model ---
# A pre-trained Sentence Transformer turns text into fixed-size vectors.
# 'all-MiniLM-L6-v2' trades off size, speed and quality well for general
# sentence embeddings; its vectors have 384 dimensions.
print("Loading Sentence Transformer model 'all-MiniLM-L6-v2'...")
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
print("Model loaded successfully.")
print("-" * 60)

# --- 3. Encode Documents into Embeddings ---
# Each quote becomes one row of the resulting array.
print("Encoding documents into embeddings (numerical vectors)...")
document_embeddings = embedding_model.encode(
    documents,
    show_progress_bar=True,
)
print(f"Embeddings shape: {document_embeddings.shape}")  # (number of documents, embedding dimension)
print("-" * 60)

# --- 4. Set up ChromaDB as our Vector Database ---
# A persistent client writes its data to disk under `chroma_db_path`. Whatever a previous
# run left there is removed first, so every execution starts from an empty store.
# NOTE(review): deleting the directory while an earlier PersistentClient for the same path
# is still alive in this kernel may leave that client pointing at removed files - if Chroma
# reports storage errors on re-run, restart the kernel first (to be confirmed against the
# Chroma documentation).
chroma_db_path = "./semantic_search_db"
if os.path.exists(chroma_db_path):
    print(f"Clearing previous ChromaDB data at {chroma_db_path}...")
    shutil.rmtree(chroma_db_path) # Remove the directory and its contents
    print("Previous data cleared.")

print(f"Initializing ChromaDB persistent client at: {chroma_db_path}")
client = chromadb.PersistentClient(path=chroma_db_path)

# Create a collection (similar to a table in a relational database)
# This is where our embeddings, documents, and metadata will live.
collection_name = "famous_quotes_collection"

# Fetch the collection if it exists, otherwise create it.
# Deliberately NOT wrapped in try/except: if this call fails, execution must stop here.
# Printing the error and carrying on would make the `collection.add` below either raise a
# NameError or, in a live kernel, silently write into a stale `collection` object left
# behind by an earlier cell.
collection = client.get_or_create_collection(name=collection_name)
print(f"Collection '{collection_name}' ready.")

# Build one metadata record per document.
# Metadata can be useful for filtering results later on.
metadatas = []
for doc in documents:
    # The author is whatever follows the last " - " separator; quotes without one are "Unknown".
    author = doc.rsplit(" - ", 1)[-1] if " - " in doc else "Unknown"
    # "length" is the whitespace-separated token count of the full string (attribution included).
    metadatas.append({"author": author, "length": len(doc.split())})

print(f"Adding {len(documents)} documents to ChromaDB collection...")
collection.add(
    embeddings=document_embeddings.tolist(), # Chroma expects list of lists for embeddings
    documents=documents, # Original text documents
    metadatas=metadatas, # Associated metadata
    ids=document_ids # Unique identifiers for each document
)
print(f"Total documents in ChromaDB: {collection.count()}")
print("-" * 60)

# --- 5. Perform Semantic Search ---

def perform_semantic_search(query_text, num_results=3):
    """
    Encode a query and print the most similar documents from the ChromaDB collection.

    Uses the module-level `embedding_model` to embed the query and the module-level
    `collection` to run the nearest-neighbour lookup.

    Args:
        query_text: The text to search for.
        num_results: Maximum number of matches to request from the collection.

    Returns:
        None. The ranked matches are printed.
    """
    print(f"\n--- Searching for: '{query_text}' ---")

    # Encode the query text into an embedding using the SAME model as the documents;
    # wrapping it in a list yields the list-of-lists shape passed to `query_embeddings`.
    query_embedding = embedding_model.encode([query_text]).tolist()

    # Query the ChromaDB collection.
    # Results come back ranked by distance (lower distance = more similar).
    results = collection.query(
        query_embeddings=query_embedding,
        n_results=num_results,
        include=['documents', 'distances', 'metadatas'] # Specify what to return
    )

    # Only one query was sent, so index 0 holds its result lists.
    matched_ids = results['ids'][0]

    # Report how many matches actually came back: the collection may hold fewer
    # documents than `num_results`, in which case "Top {num_results}" would be wrong.
    print(f"Top {len(matched_ids)} Semantic Matches:")
    if not matched_ids:
        print("  No matches found.")
        return

    ranked_matches = zip(
        matched_ids,
        results['distances'][0],
        results['documents'][0],
        results['metadatas'][0],
    )
    for rank, (doc_id, distance, retrieved_document, metadata) in enumerate(ranked_matches, start=1):
        # Entries stored without metadata come back as None; fall back to an empty dict
        # so the author lookup prints 'N/A' instead of raising AttributeError.
        author = (metadata or {}).get('author', 'N/A')

        # The value is a distance, so smaller means closer.
        # NOTE(review): presumably the collection uses Chroma's default L2-based space and the
        # model emits unit-length vectors, which would make a lower distance equivalent to a
        # higher cosine similarity - confirm against the Chroma and model documentation.
        print(f"  Rank {rank} (ID: {doc_id}):")
        print(f"    Distance: {distance:.4f} (lower is more similar)")
        print(f"    Document: \"{retrieved_document}\"")
        print(f"    Author: {author}")
        print("-" * 40)

# Example search: run one query with the default num_results (3) and print the ranked matches.
perform_semantic_search("What makes a person strong?")

Our Document Corpus:
  doc_1: "The only way to do great work is to love what you do. - Steve Jobs"
  doc_2: "Innovation distinguishes between a leader and a follower. - Steve Jobs"
  doc_3: "Stay hungry, stay foolish. - Steve Jobs"
  doc_4: "Life is what happens when you're busy making other plans. - John Lennon"
  doc_5: "Imagine all the people living life in peace. - John Lennon"
  doc_6: "The future belongs to those who believe in the beauty of their dreams. - Eleanor Roosevelt"
  doc_7: "Do one thing every day that scares you. - Eleanor Roosevelt"
  doc_8: "The greatest glory in living lies not in never falling, but in rising every time we fall. - Nelson Mandela"
  doc_9: "Education is the most powerful weapon which you can use to change the world. - Nelson Mandela"
  doc_10: "That which does not kill us makes us stronger. - Friedrich Nietzsche"
  doc_11: "He who has a why to live can bear almost any how. - Friedrich Nietzsche"
------------------------------------------------------

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Model loaded successfully.
------------------------------------------------------------
Encoding documents into embeddings (numerical vectors)...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Embeddings shape: (11, 384)
------------------------------------------------------------
Initializing ChromaDB persistent client at: ./semantic_search_db
Collection 'famous_quotes_collection' ready.
Adding 11 documents to ChromaDB collection...
Total documents in ChromaDB: 11
------------------------------------------------------------

--- Searching for: 'What makes a person strong?' ---
Top 3 Semantic Matches:
  Rank 1 (ID: doc_10):
    Distance: 1.1086 (lower is more similar)
    Document: "That which does not kill us makes us stronger. - Friedrich Nietzsche"
    Author: Friedrich Nietzsche
----------------------------------------
  Rank 2 (ID: doc_11):
    Distance: 1.2058 (lower is more similar)
    Document: "He who has a why to live can bear almost any how. - Friedrich Nietzsche"
    Author: Friedrich Nietzsche
----------------------------------------
  Rank 3 (ID: doc_2):
    Distance: 1.3779 (lower is more similar)
    Document: "Innovation distinguishes between a leader and a

In [2]:
# Second example query; num_results is left at its default of 3.
perform_semantic_search("quotes about innovation and leadership")


--- Searching for: 'quotes about innovation and leadership' ---
Top 3 Semantic Matches:
  Rank 1 (ID: doc_2):
    Distance: 0.6351 (lower is more similar)
    Document: "Innovation distinguishes between a leader and a follower. - Steve Jobs"
    Author: Steve Jobs
----------------------------------------
  Rank 2 (ID: doc_6):
    Distance: 1.1650 (lower is more similar)
    Document: "The future belongs to those who believe in the beauty of their dreams. - Eleanor Roosevelt"
    Author: Eleanor Roosevelt
----------------------------------------
  Rank 3 (ID: doc_9):
    Distance: 1.1877 (lower is more similar)
    Document: "Education is the most powerful weapon which you can use to change the world. - Nelson Mandela"
    Author: Nelson Mandela
----------------------------------------


In [None]:
Chat-based NLP using LangChain