In [1]:
import os
from openai import AzureOpenAI
import numpy as np
from dotenv import load_dotenv
# Load variables from a .env file (if one is present) into the process environment.
load_dotenv()

# Azure OpenAI client shared by the embedding cells in this notebook.
# The endpoint and key are read from the environment; the API version is pinned
# to a literal here and the OPENAI_API_VERSION variable is not consulted.
client = AzureOpenAI(
    api_version="2024-06-01",
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
)

# Cosine similarity 
Cosine similarity is a metric used to measure how similar two vectors are, regardless of their magnitude. It is commonly used in the context of embeddings, which are representations of data points in a high-dimensional space, such as word embeddings in natural language processing.

### Key Points about Cosine Similarity:

1. **Definition**:
   Cosine similarity calculates the cosine of the angle between two vectors. The formula is:
   $$
   \text{cosine similarity} = \frac{\mathbf{A} \cdot \mathbf{B}}{\|\mathbf{A}\| \|\mathbf{B}\|}
   $$
   where $\mathbf{A} \cdot \mathbf{B}$ is the dot product of vectors $\mathbf{A}$ and $\mathbf{B}$, and $\|\mathbf{A}\|$ and $\|\mathbf{B}\|$ are the magnitudes (or norms) of the vectors.

2. **Range**:
   The cosine similarity value ranges from -1 to 1:
   - **1** indicates that the vectors point in exactly the same direction (they are identical up to a positive scale factor).
   - **0** indicates that the vectors are orthogonal (no similarity).
   - **-1** indicates that the vectors are diametrically opposed.

3. **Applications**:
   - **Text Analysis**: Used to compare the similarity between documents or sentences.
   - **Recommendation Systems**: Helps in finding similar items or users based on their embeddings.
   - **Image Recognition**: Used to compare feature vectors of images.

4. **Advantages**:
   - **Scale-Invariant**: Cosine similarity is unaffected by the magnitude of the vectors, making it useful for comparing vectors whose lengths differ (for example, embeddings of a short and a long document).
   - **Efficient**: Computationally efficient for high-dimensional data.

5. **Example**:
   Suppose we have two word embeddings $\mathbf{A}$ and $\mathbf{B}$:
   $$
   \mathbf{A} = [1, 2, 3], \quad \mathbf{B} = [4, 5, 6]
   $$
   The cosine similarity is calculated as:
   $$
   \text{cosine similarity} = \frac{(1 \cdot 4 + 2 \cdot 5 + 3 \cdot 6)}{\sqrt{1^2 + 2^2 + 3^2} \cdot \sqrt{4^2 + 5^2 + 6^2}} = \frac{32}{\sqrt{14} \cdot \sqrt{77}} \approx 0.975
   $$

Cosine similarity is a powerful tool for measuring similarity in various applications, especially when dealing with high-dimensional data like embeddings.

In [2]:
def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    Args:
        a: Sequence or array of numbers (e.g. an embedding vector).
        b: Sequence or array of numbers with the same length as ``a``.

    Returns:
        The dot product of ``a`` and ``b`` divided by the product of their
        norms. Nominally in [-1, 1], up to floating-point rounding.
        If either vector has zero norm the cosine is undefined; 0.0 is
        returned instead of dividing by zero (which would yield NaN and a
        RuntimeWarning).
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    # A zero-length vector has no direction, so report "no similarity".
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator

# Embeddings calculation for the ada model

In [3]:

# Sentence to embed and the embedding model to request.
text = 'the quick brown fox jumped over the lazy dog'
model = 'text-embedding-ada-002'

# text-embedding-ada-002 does not accept the `dimensions` parameter (it is only
# supported by text-embedding-3 and later models), so the embedding is requested
# without it. Errors are left to propagate instead of being hidden by a blanket
# `except Exception` followed by a second request.
text_embedding_ada = client.embeddings.create(input=[text], model=model).data[0].embedding

In [None]:
# Show the first ten components of the sentence embedding, then its length,
# each on its own line.
print(text_embedding_ada[:10], len(text_embedding_ada), sep="\n")

# Word comparison using embeddings and cosine similarity

In [None]:
# Compare several words using the ada embedding model.
# The model name is pinned locally rather than read from the global `model`,
# which another cell rebinds to 'text-embedding-3-large'; re-running this cell
# out of order would otherwise store large-model vectors under *_ada names.
ada_model = 'text-embedding-ada-002'
ada_words = ['automobile', 'vehicle', 'dinosaur', 'stick', 'huskey', 'brown fox']
# NOTE(review): 'huskey' looks like a misspelling of 'husky' — confirm before
# changing it, since the spelling affects the embedding that is returned.

# One request per word, issued in the order listed above.
ada_embeddings = {
    word: client.embeddings.create(input=word, model=ada_model).data[0].embedding
    for word in ada_words
}

# Keep the individual per-word names available to other cells.
automobile_embedding_ada = ada_embeddings['automobile']
vehicle_embedding_ada = ada_embeddings['vehicle']
dinosaur_embedding_ada = ada_embeddings['dinosaur']
stick_embedding_ada = ada_embeddings['stick']
huskey_embedding_ada = ada_embeddings['huskey']
brown_embedding_ada = ada_embeddings['brown fox']

# Cosine similarity of 'automobile' against each word, one value per line.
# 'automobile' vs itself should be 1.0 (up to floating-point rounding), i.e.
# exactly the same direction; a related word such as 'vehicle' is expected to
# score higher than an unrelated one such as 'dinosaur'.
for word in ['automobile', 'vehicle', 'dinosaur', 'stick', 'huskey']:
    print(cosine_similarity(automobile_embedding_ada, ada_embeddings[word]))

# Same check for large model

In [6]:
# Embed the same sentence again, this time with the large embedding model.
# `model` and `text` are (re)assigned here as notebook-level variables.
model = 'text-embedding-3-large'
text = 'the quick brown fox jumped over the lazy dog'

large_response = client.embeddings.create(model=model, input=[text])
text_embedding = large_response.data[0].embedding

In [None]:
# Compare several words using the large embedding model.
# The model name is pinned locally rather than read from the global `model`,
# which the ada cell rebinds to 'text-embedding-ada-002'; re-running this cell
# out of order would otherwise store ada vectors under the large-model names.
large_model = 'text-embedding-3-large'
large_words = ['automobile', 'vehicle', 'dinosaur', 'stick', 'huskey', 'brown fox']
# NOTE(review): 'huskey' looks like a misspelling of 'husky' — confirm before
# changing it, since the spelling affects the embedding that is returned.

# One request per word, issued in the order listed above.
large_embeddings = {
    word: client.embeddings.create(input=word, model=large_model).data[0].embedding
    for word in large_words
}

# Keep the individual per-word names available to other cells.
automobile_embedding = large_embeddings['automobile']
vehicle_embedding = large_embeddings['vehicle']
dinosaur_embedding = large_embeddings['dinosaur']
stick_embedding = large_embeddings['stick']
huskey_embedding = large_embeddings['huskey']
brown_embedding = large_embeddings['brown fox']

# Cosine similarity of 'automobile' against each word, one value per line.
# 'automobile' vs itself should be 1.0 (up to floating-point rounding), i.e.
# exactly the same direction; a related word such as 'vehicle' is expected to
# score higher than an unrelated one such as 'dinosaur'.
for word in ['automobile', 'vehicle', 'dinosaur', 'stick', 'huskey']:
    print(cosine_similarity(automobile_embedding, large_embeddings[word]))