# Imports & Setup

In [3]:
# Imports
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Configuration: name of the pre-trained sentence-embedding checkpoint to use.
# This is a lightweight model designed for semantic search; it is kept as a
# single constant so the model can be swapped in one place. The value is the
# identifier handed to SentenceTransformer(...) when the model is loaded.
MODEL_NAME = "all-MiniLM-L6-v2"

# Load Model

In [4]:
# Load the pre-trained model identified by MODEL_NAME from HuggingFace.
# This might take a moment on the first run to download weights (approx. 80MB).
# NOTE(review): later runs are presumably served from a local cache — confirm.
# The two status prints bracket the load so the wait is visible to the reader.
print(f"Loading model: {MODEL_NAME}...")
model = SentenceTransformer(MODEL_NAME)  # `model` is reused by the cells below
print("Model loaded successfully.")

Loading model: all-MiniLM-L6-v2...


Loading weights: 100%|███████████████████████████████████████████████████████████████████████████████████████████| 103/103 [00:00<00:00, 424.06it/s, Materializing param=pooler.dense.weight]
BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Model loaded successfully.


# Vectorization (The Magic)

In [5]:
# Three mock customer-feedback messages. The first two describe the same
# connectivity problem in different words; the third is about an unrelated topic.
sentences = [
    "The internet speed is too slow, I can't work.",  # A: connectivity
    "My connection is lagging and dropping constantly.",  # B: connectivity, paraphrase of A
    "The pizza delivery was late and cold.",  # C: food delivery, unrelated to A and B
]

# Turn each sentence into a numeric vector; the result has one row per sentence.
embeddings = model.encode(sentences)

# Report the dimensions of the embedding matrix (rows = sentences, columns = vector size).
print(f"Embedding Shape: {embeddings.shape}")

# Peek at the leading five components of the first sentence's vector.
first_five = embeddings[0, :5]
print(f"\nFirst 5 dimensions of sentence 1:\n{first_five}")

Embedding Shape: (3, 384)

First 5 dimensions of sentence 1:
[-0.02288154 -0.04712411  0.07011108 -0.03909583  0.00433517]


# Similarity Check (Cosine Similarity)

In [6]:
# Compare Sentence 1 (Internet) against Sentence 2 (Lag) and Sentence 3 (Pizza)
# in one vectorized call instead of repeating a near-identical call per pair.
# cosine_similarity works on 2-D inputs, so slices (rather than integer indexing
# wrapped in a list) are used to keep the row axis: the anchor is a single row,
# the candidates are rows 1 and 2, and the result holds one score per candidate.
anchor_scores = cosine_similarity(embeddings[:1], embeddings[1:3])[0]

# Unpack in candidate order: Sentence 2 first, then Sentence 3.
sim_1_2, sim_1_3 = anchor_scores

# A higher score means the two sentences are closer in meaning.
print(f"Similarity: 'Internet Slow' vs 'Connection Lagging': {sim_1_2:.4f}")
print(f"Similarity: 'Internet Slow' vs 'Pizza Late':        {sim_1_3:.4f}")

Similarity: 'Internet Slow' vs 'Connection Lagging': 0.4843
Similarity: 'Internet Slow' vs 'Pizza Late':        0.1290
