In [4]:
from sentence_transformers import SentenceTransformer

# Load the pretrained sentence-embedding model.
# NOTE(review): trust_remote_code=True allows code shipped with the model
# repository to execute locally -- confirm the source is trusted.
model = SentenceTransformer(
    'Alibaba-NLP/gte-large-en-v1.5',
    trust_remote_code=True,
)

# Two nearly identical sentences whose embeddings are compared below.
sentences = [
    'That is a happy person',
    'That is a very happy person',
]

# Encode every sentence; the result holds one embedding per input sentence.
embeddings = model.encode(sentences)
print(embeddings)

[[ 0.23888057 -0.14092064 -0.22524145 ...  0.1500071   0.06842493
  -0.55839527]
 [ 0.05154587  0.04116409  0.14478017 ... -0.11266184  0.23965885
  -0.80651927]]


In [5]:
from sentence_transformers.util import cos_sim

# Reference value from the library helper. The result is indexed as a 2-D
# structure, so take its single [0][0] entry and unwrap it to a plain number.
library_similarity = cos_sim(embeddings[0], embeddings[1])
print(library_similarity[0][0].item())

0.9768850803375244


In [6]:
import math

def our_fancy_cosine_similarity(embedding1, embedding2):
    """
    Calculates the cosine similarity between two embeddings.

    The result is the dot product of the two vectors divided by the product
    of their L2 (Euclidean) norms.

    Args:
        embedding1: The first embedding. Any sized iterable of numbers works
            (e.g. a list of floats or a single row of encoded embeddings).
        embedding2: The second embedding, with the same length as
            ``embedding1``.

    Returns:
        float: The cosine similarity between the two embeddings, always as a
        built-in ``float``. If either embedding has zero norm the similarity
        is mathematically undefined and ``0.0`` is returned by convention.

    Raises:
        ValueError: If the two embeddings do not have the same length.
    """
    # Make sure the embeddings have the same length; zip() below would
    # otherwise silently truncate to the shorter one.
    if len(embedding1) != len(embedding2):
        raise ValueError("Embeddings must have the same length.")

    # Dot product of the two embeddings.
    dot_product = sum(a * b for a, b in zip(embedding1, embedding2))

    # L2 norm (Euclidean length) of each embedding.
    norm1 = math.sqrt(sum(x ** 2 for x in embedding1))
    norm2 = math.sqrt(sum(x ** 2 for x in embedding2))

    # A zero-length vector has no direction, so the similarity is undefined;
    # return a float (not the int 0) so the return type is consistent.
    if norm1 == 0 or norm2 == 0:
        return 0.0

    # float() normalises array-library scalar types to a built-in float.
    return float(dot_product / (norm1 * norm2))

In [7]:
# Same sentence pair as before, scored with the hand-written implementation.
manual_similarity = our_fancy_cosine_similarity(embeddings[0], embeddings[1])
print(manual_similarity)

0.9768851434566402


## Exercises:

- Try using different sentences as input to get a feel for how embeddings compare with one another (for example, try sentences with similar, unrelated, and opposite meanings).

## Discussion Questions:

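- Do you notice any differences between the similarity scores — for example, between the value returned by `cos_sim` and the one computed by `our_fancy_cosine_similarity`, or between different sentence pairs? If so, why do you think that is?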