## Embedding investigation

I'm curious about the embeddings, and how they work for ollama versus openai. The video I watched had interesting example comparing the distance of "apple" from "orange" vs "apple" from "iphone". I'd kind of like to have a go at building a list of 5 or 6 different words and calculating the distance from each of them to the others w ollama and open ai. 



In [1]:
import numpy as np
from langchain.embeddings import OpenAIEmbeddings, OllamaEmbeddings

# Function to calculate Euclidean distance
def euclidean_distance(vector1, vector2):
    return np.linalg.norm(vector1 - vector2)

# Function to compute embeddings and calculate Euclidean distance
def calculate_embedding_distance(embedding_model, word1, word2, model_name):
    # Get embeddings for the words
    embedding1 = embedding_model.embed_query(word1)
    embedding2 = embedding_model.embed_query(word2)

    # Calculate distance between embeddings
    distance = euclidean_distance(np.array(embedding1), np.array(embedding2))

    print(f"Model: {model_name}")
    print(f"Embedding for '{word1}': {embedding1}")
    print(f"Embedding for '{word2}': {embedding2}")
    print(f"Euclidean distance between embeddings: {distance}\n")

# Words to embed
word1 = "orangutan"
word2 = "monkey"





In [2]:
# Initialize and calculate for OpenAIEmbeddings
openai_embedding_model = OpenAIEmbeddings()
calculate_embedding_distance(openai_embedding_model, word1, word2, "OpenAI Embeddings")



  warn_deprecated(


Model: OpenAI Embeddings
Embedding for 'orangutan': [-0.014931474228545814, -0.03383751453783446, -0.020893321487014983, -0.01463606710133063, 0.0026653754879775848, 0.02430392893003907, -0.0301852102696959, 0.014112391888679325, -0.019201445573858753, 0.0021030955591339524, 0.007734288624776466, 0.016851618227138168, -0.005747006050066321, -0.02015480506596083, -0.0059316356209911285, 0.003934282190014123, 0.04576120719212772, 0.0012722636541254914, 0.02740570050616962, -0.0041826924536452325, -0.011749136733602927, 0.021900390349895233, 0.008345244126079284, -0.027660825139639907, 0.011755850637780833, 0.007694006131031567, 0.0017674058217585515, -0.023229721025379756, 0.01328659471331239, -0.014891191734800916, 0.027486265493659418, -0.01296433290070812, 0.0013738097085865646, -0.033246698420759016, -0.03558309982176886, -0.022759755183506623, -0.002351505822765062, -0.0068144991192251895, -0.0005685742722180781, 0.0045217390266942695, 0.012615215471392223, 0.0067205063233795785, -0

In [3]:
# Initialize and calculate for OllamaEmbeddings
ollama_embedding_model = OllamaEmbeddings(model="nomic-embed-text")
calculate_embedding_distance(ollama_embedding_model, word1, word2, "Ollama Embeddings")


Model: Ollama Embeddings
Embedding for 'orangutan': [0.009855739772319794, 0.07009904086589813, -2.369049072265625, -1.8276951313018799, -0.41204705834388733, 0.7729711532592773, 0.20337678492069244, -0.694606602191925, -0.1498275250196457, 0.4523666203022003, 0.7506926655769348, 1.7605617046356201, -0.22726523876190186, 0.17109961807727814, 1.4240846633911133, -1.4677186012268066, -0.01435587927699089, -0.8973191976547241, 0.9495498538017273, -0.6937294006347656, -1.9700418710708618, -0.5630641579627991, 1.6775017976760864, 0.6479761004447937, 0.6981006860733032, 0.4590791165828705, -0.24912123382091522, 0.23188595473766327, -0.6704074144363403, -1.0301326513290405, 1.465040922164917, 0.2109237164258957, -1.2546383142471313, 1.6898715496063232, 0.2526347041130066, 1.066412329673767, 1.8441557884216309, 0.4158649146556854, 1.323754906654358, 0.30556365847587585, -0.05514093488454819, -0.6199914216995239, 0.46268677711486816, 0.709924578666687, 0.09289197623729706, 0.6835659742355347, 0

The overall distances between words seem to be on a different scale for llama3 vs openai, will check this out more robustly in the next section

#### Function to make a matrix of distances between each word embedding in a list to each other and make heatmaps

In [12]:
import numpy as np
import plotly.express as px
from langchain.embeddings import OllamaEmbeddings
from langchain_openai import OpenAIEmbeddings

# Function to calculate Euclidean distance
def euclidean_distance(vector1, vector2):
    return np.linalg.norm(vector1 - vector2)

# Function to calculate pairwise distances and visualize them
def calculate_and_visualize_distances(embedding_model, words, model_name):
    # Calculate embeddings for each word
    embeddings = [embedding_model.embed_query(word) for word in words]

    # Initialize a distance matrix
    distance_matrix = np.zeros((len(words), len(words)))

    # Calculate pairwise distances
    for i in range(len(words)):
        for j in range(len(words)):
            if i != j:
                distance_matrix[i, j] = euclidean_distance(
                    np.array(embeddings[i]), 
                    np.array(embeddings[j])
                )

    # # Display results
    # for i, word1 in enumerate(words):
    #     for j, word2 in enumerate(words):
    #         if i < j:
    #             print(f"Distance between '{word1}' and '{word2}' using {model_name}: {distance_matrix[i, j]}")

    # Create a heatmap using Plotly
    fig = px.imshow(
        distance_matrix,
        labels=dict(x="Words", y="Words", color="Distance"),
        x=words,
        y=words,
        title=f"Pairwise Euclidean Distance Matrix ({model_name})",
        color_continuous_scale="YlGnBu",
        text_auto=True,
    )

    # Update layout for better visualization
    fig.update_layout(
        xaxis_title="Words",
        yaxis_title="Words",
        width=600,
        height=500,
    )

    # Show the plot
    fig.show()

# List of words to compare
word_list = ['apple', 'orange', 'iphone', 'call', 'amsterdam', 'netherlands', 'orangutan', 'perplexed', 'allegory']



In [13]:
# Initialize the OllamaEmbeddings model
ollama_model = OllamaEmbeddings(model="nomic-embed-text")
calculate_and_visualize_distances(ollama_model, word_list, "ollama embeddings")


In [14]:

# Initialize the OpenAIEmbeddings model
openai_model = OpenAIEmbeddings()
calculate_and_visualize_distances(openai_model, word_list, "openai embeddings")



Main things from the above are that the embedding distances have less range than I expected. "Totally unrelated" doesn't necessarily have a way higher distance than "practically synonyms". Some words that I would expect to be really correlated (e.g. "orangutan" and "orange") are barely closer than "orangutan" and "allegory", in spite of orangutans being orange in colour.

Also, the overall distances between words seem to be on a different scale for llama3 vs openai, although they seem to be similar colours on the maps (i.e. similar relative values).

Maybe the cosine similarities would be more comparable?

#### Making heatmaps for cosine similarity

In [15]:
import numpy as np
import plotly.express as px
import plotly.subplots as sp
from langchain.embeddings import OllamaEmbeddings
from langchain_openai import OpenAIEmbeddings

# Function to calculate cosine similarity
def cosine_similarity(vector1, vector2):
    dot_product = np.dot(vector1, vector2)
    norm1 = np.linalg.norm(vector1)
    norm2 = np.linalg.norm(vector2)
    return dot_product / (norm1 * norm2)

# Function to calculate pairwise similarities and return a Plotly heatmap
def calculate_similarity_heatmap(embedding_model, words, model_name, color_scale, zmin=None, zmax=None):
    # Calculate embeddings for each word
    embeddings = [embedding_model.embed_query(word) for word in words]

    # Initialize a similarity matrix
    similarity_matrix = np.zeros((len(words), len(words)))

    # Calculate pairwise similarities
    for i in range(len(words)):
        for j in range(len(words)):
            if i != j:
                similarity_matrix[i, j] = cosine_similarity(
                    np.array(embeddings[i]), 
                    np.array(embeddings[j])
                )

    # Create a heatmap using Plotly
    heatmap = px.imshow(
        similarity_matrix,
        labels=dict(x="Words", y="Words", color="Similarity"),
        x=words,
        y=words,
        title=f"Pairwise Cosine Similarity Matrix ({model_name})",
        color_continuous_scale=color_scale,
        zmin=zmin,
        zmax=zmax,
        text_auto=True,
    )

    # Update layout for better visualization
    heatmap.update_layout(
        xaxis_title="Words",
        yaxis_title="Words",
        width=600,
        height=500,
    )

    return heatmap

# Initialize the OllamaEmbeddings model
ollama_model = OllamaEmbeddings(model="nomic-embed-text")
ollama_heatmap = calculate_similarity_heatmap(
    ollama_model, 
    word_list, 
    "llama3 embeddings", 
    color_scale="YlGnBu",
    zmin=0,   # Cosine similarity ranges from -1 to 1
    zmax=1    # Setting range from 0 to 1 for more intuitive interpretation
)

# Initialize the OpenAIEmbeddings model
openai_model = OpenAIEmbeddings()
openai_heatmap = calculate_similarity_heatmap(
    openai_model, 
    word_list, 
    "openai embeddings", 
    color_scale="Viridis",
    zmin=0,   # Cosine similarity ranges from -1 to 1
    zmax=1    # Setting range from 0 to 1 for more intuitive interpretation
)

# Create a subplot figure to display both heatmaps stacked vertically
fig = sp.make_subplots(rows=2, cols=1, subplot_titles=("Llama3 Embeddings", "OpenAI Embeddings"))

# Add the first heatmap to the first subplot (first row)
for trace in ollama_heatmap.data:
    fig.add_trace(trace, row=1, col=1)

# Add the second heatmap to the second subplot (second row)
for trace in openai_heatmap.data:
    fig.add_trace(trace, row=2, col=1)

# Update the layout to ensure vertical stacking
fig.update_layout(
    title_text="Comparison of Cosine Similarities",
    showlegend=False,
    width=700,  # Adjust width for vertical stack
    height=1000, # Adjust height for vertical stack
)

# Show the combined plot
fig.show()


There goes that hypothesis... Interesting that the values are all lower for ollama than the openai ones