### Imports

In [29]:
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

### 1. Load the pre-trained Sentence-BERT model

In [30]:
# Checkpoint to load. 'all-MiniLM-L6-v2' is chosen as a good balance of size
# and performance. The name lives in one constant so the progress message and
# the model that is actually instantiated cannot drift apart.
MODEL_NAME = 'all-MiniLM-L6-v2'

print(f"Loading Sentence-BERT model ({MODEL_NAME})...")
model = SentenceTransformer(MODEL_NAME)
print("Model loaded.")

Loading Sentence-BERT model (all-MiniLM-L6-v2)...
Model loaded.


### 2. Define some sample sentences

In [31]:
# Small demo corpus for the embedding/similarity walkthrough.
# Order matters: the similarity cell uses index 0 as the reference sentence
# and compares every entry against it.
sample_sentences = [
    # Indices 0 and 2 both describe variables, with different wording.
    # Indices 1 and 3 both ask how to write a function in Python.
    "Python variables are used to store data.",
    "How to define a function in Python?",
    "A variable is a named storage location.",
    "What is the syntax for creating a function in Python?",
    # A general statement about computers (no mention of variables/functions).
    "Computers are good at math.",
    # A joke with mixed casing, quotes and punctuation.
    "Programmers prefer DARK themes because LIGHT attracts 'bugs'!"
]

### 3. Generate embeddings for the sentences

In [32]:
# Encode the whole corpus in a single call. The result holds one embedding
# row per input sentence, in the same order as `sample_sentences`.
sentence_embeddings = model.encode(sentences=sample_sentences)

### 4. Print the shape of the embeddings and a preview of the first embedding

In [33]:
# Embeddings are laid out as (number_of_sentences, embedding_dimension).
# all-MiniLM-L6-v2 produces 384-dimensional vectors, so six sentences
# should give (6, 384).
embeddings_shape = sentence_embeddings.shape
print(f"Shape of embeddings: {embeddings_shape}")

# Show only the leading 10 components of the first sentence's vector;
# printing all dimensions would just be noise.
first_embedding_preview = sentence_embeddings[0, :10]
print("\nFirst embedding (partial):")
print(first_embedding_preview)

Shape of embeddings: (6, 384)

First embedding (partial):
[-0.01797274  0.03773345 -0.10313956  0.05437659 -0.05526058 -0.0748354
  0.04968969  0.03465347  0.00623344 -0.01089949]


### 5. Calculate cosine similarity between embeddings

In [34]:
# Cosine similarity measures the cosine of the angle between two vectors.
# Values closer to 1 indicate higher similarity.
print("\nCalculated similarities:")

# Compare sentence 0 ("Python variables...") with every sentence, itself
# included (the self-comparison acts as a sanity check and should score ~1).
# cosine_similarity expects 2-D inputs, hence the reshape of the query row.
embedding_0 = sentence_embeddings[0].reshape(1, -1)

# One vectorized call scores the query against all rows at once instead of
# calling cosine_similarity (and reshaping) once per sentence inside the loop.
# The result has one row (the single query) and one column per sentence.
similarities_to_first = cosine_similarity(embedding_0, sentence_embeddings)[0]

for i, similarity in enumerate(similarities_to_first):
    print(f"\n'{sample_sentences[0]}' vs '{sample_sentences[i]}' \n > Similarity = {similarity:.4f}")


Calculated similarities:

'Python variables are used to store data.' vs 'Python variables are used to store data.' 
 > Similarity = 1.0000

'Python variables are used to store data.' vs 'How to define a function in Python?' 
 > Similarity = 0.4416

'Python variables are used to store data.' vs 'A variable is a named storage location.' 
 > Similarity = 0.6705

'Python variables are used to store data.' vs 'What is the syntax for creating a function in Python?' 
 > Similarity = 0.4966

'Python variables are used to store data.' vs 'Computers are good at math.' 
 > Similarity = 0.2396

'Python variables are used to store data.' vs 'Programmers prefer DARK themes because LIGHT attracts 'bugs'!' 
 > Similarity = 0.0861
