In [1]:
# ===========================================================
# 🔍 Semantic Search using Embeddings
# Author: Manasa Puellela
# Purpose: Demonstrate vector embeddings + semantic search
# ===========================================================

# --- Install dependencies ---
!pip install sentence-transformers --quiet

from sentence_transformers import SentenceTransformer, util
import torch
import pandas as pd

# --- Model setup ---
# Checkpoint name handed to SentenceTransformer; edit it here to try another encoder.
MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(MODEL_NAME)
print("Model loaded!")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Model loaded!


In [2]:
# --- Demo corpus: swap in your own texts later ---
documents = [
    "Credit card fraud detection using machine learning.",
    "How to build an API using Python and Flask.",
    "Deep learning models for image classification.",
    "Understanding embeddings and vector similarity.",
    "Building ETL pipelines for data engineering.",
    "Large language models and text generation.",
    "Deploying machine learning models on AWS Lambda.",
    "Creating dashboards using Power BI and SQL.",
]

# Single-column frame so the notebook renders the corpus as a table.
df = pd.DataFrame(documents, columns=["documents"])
df


Unnamed: 0,documents
0,Credit card fraud detection using machine lear...
1,How to build an API using Python and Flask.
2,Deep learning models for image classification.
3,Understanding embeddings and vector similarity.
4,Building ETL pipelines for data engineering.
5,Large language models and text generation.
6,Deploying machine learning models on AWS Lambda.
7,Creating dashboards using Power BI and SQL.


In [3]:
# --- Embed the corpus ---
# Encode all documents in one call and keep the result as a tensor so it can be
# passed straight to the cosine-similarity helper.
document_embeddings = model.encode(
    documents,
    convert_to_tensor=True,
)
print("Embeddings shape:", document_embeddings.shape)


Embeddings shape: torch.Size([8, 384])


In [5]:
# --- Semantic Search Function ---
def semantic_search(query, top_k=3):
    """Return the corpus entries most similar to ``query``, best match first.

    The query is embedded with the notebook-level ``model`` and compared by
    cosine similarity against the precomputed ``document_embeddings``.

    Args:
        query: Text to search for.
        top_k: Maximum number of results to return. Values larger than the
            number of scored documents are clamped; values <= 0 yield ``[]``.

    Returns:
        A list of ``{"score": float, "document": ...}`` dicts, where
        ``document`` is the matching entry of ``documents``, ordered by
        descending cosine similarity.
    """
    # Nothing requested: skip the (comparatively expensive) encode call.
    if top_k <= 0:
        return []

    query_embedding = model.encode(query, convert_to_tensor=True)
    scores = util.cos_sim(query_embedding, document_embeddings)[0]

    # torch.topk raises when k exceeds the number of elements, so clamp k to
    # the number of scores actually available.
    k = min(top_k, len(scores))
    top_results = torch.topk(scores, k=k)

    return [
        {"score": float(score), "document": documents[int(idx)]}
        for score, idx in zip(top_results.values, top_results.indices)
    ]


In [6]:
# --- First demo: deployment-themed query ---
query = "machine learning deployment"
results = semantic_search(query=query, top_k=3)

# Render the ranked hits (score + matched document) as a table.
pd.DataFrame(results)


Unnamed: 0,score,document
0,0.587531,Deploying machine learning models on AWS Lambda.
1,0.345801,Deep learning models for image classification.
2,0.285776,Credit card fraud detection using machine lear...


In [7]:
# --- Second demo: fraud-themed query ---
query2 = "fraud detection system"
results2 = semantic_search(query=query2, top_k=3)

# Render the ranked hits (score + matched document) as a table.
pd.DataFrame(results2)


Unnamed: 0,score,document
0,0.578565,Credit card fraud detection using machine lear...
1,0.12306,Deep learning models for image classification.
2,0.099342,Understanding embeddings and vector similarity.


In [9]:
# --- Export one example query to CSV ---
# The search runs with the function's default top_k; the row index is left out
# of the file so it contains only the score and document columns.
api_hits = semantic_search("API development")
example_results = pd.DataFrame(api_hits)
example_results.to_csv("semantic_search_example.csv", index=False)

print("Saved semantic_search_example.csv")

Saved semantic_search_example.csv
