In [None]:
## Exercise 1: Understanding Embedding Similarity

In [10]:
from sentence_transformers import SentenceTransformer, util
import numpy as np
print("Libraries imported successfully!")

Libraries imported successfully!


In [11]:
def cosine_similarity(vec1, vec2):
    """
    Calculate cosine similarity between two vectors.

    Args:
        vec1: First vector (a 1-D array-like accepted by NumPy).
        vec2: Second vector, with the same length as ``vec1``.

    Returns:
        A score between -1 and 1 (higher = more similar). When either
        vector has zero magnitude the angle is undefined, so 0.0 is
        returned instead of NaN.
    """
    dot_product = np.dot(vec1, vec2)
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    denominator = norm1 * norm2
    # Guard against zero-magnitude vectors: dividing by 0 would produce NaN
    # (plus a RuntimeWarning), and NaN scores break any later sorting.
    if denominator == 0:
        return 0.0
    return dot_product / denominator

print("Similarity function ready!")

Similarity function ready!


In [None]:


# Example sentences to compare. analyze_query() refers to them by position
# in this list (index 0 is "Sentence 1", index 3 is "Sentence 4").
sentences = [
    "The dog is playing in the park",
    "A puppy is running outside",
    "The cat is sleeping on the couch",
    "Python is a programming language",
    "Machine learning models need data",
    "I love coding in Python"
]

# Load the pretrained sentence-embedding model by name.
# NOTE(review): presumably downloaded on first use — confirm network access
# is available when running on a fresh machine.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode every sentence in a single call. embeddings[i] is later paired with
# sentences[i], so the two must stay in the same order.
embeddings = model.encode(sentences)

# Print the heading for the per-query reports that follow
print("Similarity Analysis: \n")

# Function to analyze a query
def analyze_query(query_index):
    """
    Print a similarity report for one sentence against every sentence.

    Reads the module-level ``sentences`` list and the matching
    ``embeddings``, and scores each pair with ``cosine_similarity``.

    Args:
        query_index: Position in ``sentences`` of the sentence to use as
            the query. Negative indices count from the end, as with any
            list index.

    Returns:
        None. The report is written to standard output.

    Raises:
        IndexError: If ``query_index`` is out of range, or if there is no
            other sentence to compare the query against.
    """
    query_sentence = sentences[query_index]
    query_embedding = embeddings[query_index]
    # Normalise a (valid) negative index so it can be compared with the
    # non-negative positions produced by enumerate() below.
    self_position = query_index % len(sentences)

    # Score every sentence, including the query itself, against the query
    scored = [
        (position, sentence, cosine_similarity(query_embedding, embeddings[position]))
        for position, sentence in enumerate(sentences)
    ]

    # Sort by similarity descending (stable, so ties keep list order)
    scored.sort(key=lambda item: item[2], reverse=True)

    # Exclude the query by index rather than assuming it sorts first: a
    # duplicate sentence or float rounding can tie with or outrank the
    # self-similarity score, which would make "position 1" the query itself.
    others = [item for item in scored if item[0] != self_position]
    if not others:
        raise IndexError("analyze_query needs at least two sentences to compare")

    _, best_sentence, best_score = others[0]
    _, worst_sentence, worst_score = others[-1]

    # Print results
    print(f'Query: "{query_sentence}"')
    print(f"Most similar: '{best_sentence}' (score: {best_score:.4f})")
    print(f"Least similar: '{worst_sentence}' (score: {worst_score:.4f})")
    print("All similarity scores:")

    # The full listing intentionally keeps the query's own 1.0000 entry
    for _, sentence, score in scored:
        print(f"  {sentence} --> {score:.4f}")
    print("-" * 60 + "\n")

# Analyze Sentence 1 and Sentence 4
analyze_query(0)  # Sentence 1: "The dog is playing in the park"
analyze_query(3)  # Sentence 4: "Python is a programming language"

# Recommended similarity threshold.
# A cutoff of 1.0 is only ever reached by a sentence compared with itself, so
# it would reject every real match. In the two reports printed by the calls
# above, the best matches score 0.3984 and 0.7304 while all remaining pairs
# score 0.1133 or lower, so a cutoff of 0.3 separates the two groups.
RECOMMENDED_THRESHOLD = 0.3
print(f"Recommended similarity threshold: {RECOMMENDED_THRESHOLD:.4f}")

Similarity Analysis: 

Query: "The dog is playing in the park"
Most similar: 'A puppy is running outside' (score: 0.3984)
Least similar: 'Machine learning models need data' (score: -0.0052)
All similarity scores:
  The dog is playing in the park --> 1.0000
  A puppy is running outside --> 0.3984
  Python is a programming language --> 0.0987
  I love coding in Python --> 0.0902
  The cat is sleeping on the couch --> 0.0714
  Machine learning models need data --> -0.0052
------------------------------------------------------------

Query: "Python is a programming language"
Most similar: 'I love coding in Python' (score: 0.7304)
Least similar: 'The cat is sleeping on the couch' (score: 0.0199)
All similarity scores:
  Python is a programming language --> 1.0000
  I love coding in Python --> 0.7304
  Machine learning models need data --> 0.1133
  The dog is playing in the park --> 0.0987
  A puppy is running outside --> 0.0395
  The cat is sleeping on the couch --> 0.0199
-----------------