# Comparing vector spaces

Let's compare the vector embeddings from different models. We will use the following models:

- Word2Vec, trained on the Google News dataset
- OpenAI text-embedding-ada-002


In [None]:
# Load the stored word vectors for Word2Vec (Google News) and OpenAI ada-002
import json

# Each file is loaded into a dict that the rest of the notebook indexes by
# word. Read the files as UTF-8 explicitly (the standard JSON encoding) rather
# than relying on the platform's locale default.
with open('vectors_word2vec-google-news.json', encoding='utf-8') as f:
    vectors_word2vec = json.load(f)

with open('vectors_openai_ada.json', encoding='utf-8') as f:
    vectors_ada = json.load(f)


In [None]:
import matplotlib.pyplot as plt


def render_vector(vector, title='Vector'):
    """Visualize the values of a vector as a bar chart.

    Args:
        vector (list): a list of floating point values; one bar is drawn
            per dimension, in index order.
        title (str): title shown above the chart. Defaults to 'Vector',
            so existing calls render exactly as before.
    """
    plt.bar(range(len(vector)), vector)
    plt.xlabel('Dimension')
    plt.ylabel('Value')
    plt.title(title)
    plt.show()

In [None]:
# Bar chart of the Word2Vec vector for 'queen'
render_vector(vectors_word2vec['queen'])

In [None]:
# Bar chart of the OpenAI ada vector for 'queen'
render_vector(vectors_ada['queen'])

In [None]:
# Bar chart of the OpenAI ada vector for a different word, 'dog'
render_vector(vectors_ada['dog'])

## Visualizations with dimensionality reduction

In [None]:
import numpy as np
from sklearn.decomposition import PCA


def perform_pca(vectors: dict, n_components: int = 3) -> dict:
    """Perform PCA on the word vectors and return the PCA-transformed vectors.

    Args:
        vectors: mapping of word -> embedding vector; all vectors must have
            the same length so they stack into a 2-D array.
        n_components: number of principal components to keep. Defaults to 3.

    Returns:
        A dict with the same keys, in the same order, as ``vectors``; each
        value is that word's ``n_components``-dimensional projection.
    """
    X = np.array(list(vectors.values()))
    pca = PCA(n_components=n_components)
    projected = pca.fit_transform(X)
    # Dict iteration order matches the row order of X, so pair keys with rows
    # directly instead of rebuilding the key list for every row.
    return dict(zip(vectors, projected))


# Project every loaded vector of each model down to 3 dimensions.
# PCA is fitted separately per model, so the axes of the two projections
# are not directly comparable with each other.
vectors_google_pca = perform_pca(vectors_word2vec)
vectors_openai_pca = perform_pca(vectors_ada)

In [None]:
def render_vectors_3d(vectors: dict, title: str):
    """Draw labelled 3-D points, one per entry of ``vectors``.

    Args:
        vectors: mapping of label -> coordinate sequence; the first three
            components are used as x, y and z.
        title: title displayed above the plot.
    """
    figure = plt.figure()
    ax = figure.add_subplot(111, projection='3d')

    # One scatter call per point, each annotated with its key
    for label, point in vectors.items():
        x, y, z = point[0], point[1], point[2]
        ax.scatter(x, y, z)
        ax.text(x, y, z, label)

    # Collect [min, max] of the data along x, y and z
    bounds = []
    for axis in range(3):
        coords = [point[axis] for point in vectors.values()]
        bounds.append([min(coords), max(coords)])

    # Clamp every axis to the exact extent of the data
    ax.set_xlim(bounds[0])
    ax.set_ylim(bounds[1])
    ax.set_zlim(bounds[2])
    ax.set_title(title)
    plt.show()


# Hand-picked words to show in the 3D plots
words = [
    'queen', 'king', 'president', 'computer',
    'dog', 'cat', 'car', 'boat',
    'house', 'tree', 'god', 'technology',
]

# Look up each selected word's PCA projection, per model
word_pca_vectors_google = {w: vectors_google_pca[w] for w in words}
word_pca_vectors_openai = {w: vectors_openai_pca[w] for w in words}


# One 3D plot per model
render_vectors_3d(word_pca_vectors_google, 'Word2Vec')
render_vectors_3d(word_pca_vectors_openai, 'OpenAI')

In [None]:
import pandas as pd

def cosine_similarity(v1, v2):
    """Compute the cosine similarity between two vectors.

    Args:
        v1: sequence of numbers.
        v2: sequence of numbers, paired element-wise with ``v1``.

    Returns:
        The dot product of the vectors divided by the product of their
        Euclidean norms. Returns 0.0 when either vector has zero magnitude
        (including empty input), where the cosine is undefined, instead of
        raising ZeroDivisionError.
    """
    dot_product = sum(a * b for a, b in zip(v1, v2))
    magnitude = (sum(a**2 for a in v1) * sum(b**2 for b in v2)) ** 0.5
    if magnitude == 0:
        # At least one vector is all zeros (or empty): report no similarity
        # rather than dividing by zero.
        return 0.0
    return dot_product / magnitude

def most_similar(word: str, vectors: dict, top_n: int = 10) -> pd.DataFrame:
    """Return the words most similar to the given word, with their similarities.

    Args:
        word: the query word; must be a key of ``vectors``.
        vectors: mapping of word -> embedding vector.
        top_n: number of rows to return. Defaults to 10.

    Returns:
        A DataFrame with columns 'word' and 'similarity', sorted by
        descending cosine similarity. The query word is compared against
        every entry of ``vectors``, itself included, so it is a candidate
        for the result (normally the top row).

    Raises:
        KeyError: if ``word`` is not in ``vectors``.
    """
    word_vector = vectors[word]
    similarities = {w: cosine_similarity(word_vector, vector) for w, vector in vectors.items()}
    ranked_words = sorted(similarities, key=similarities.get, reverse=True)
    # Use a distinct loop name so the ``word`` parameter is not shadowed.
    rows = [(w, similarities[w]) for w in ranked_words[:top_n]]
    return pd.DataFrame(rows, columns=['word', 'similarity'])

# Query word; the next cell reuses it for the OpenAI comparison
word = 'dog'
# Nearest neighbours of the query word under Word2Vec
most_similar(word, vectors_word2vec)

In [None]:
# Same query word (set in the previous cell), ranked with the OpenAI ada vectors
most_similar(word, vectors_ada)

In [None]:
def cosine_similarity_histogram(word: str, vectors: dict, model_name: str):
    """Show how similar ``word`` is to the rest of the vocabulary.

    Computes the cosine similarity between ``word`` and every other key of
    ``vectors`` and displays the values as a 20-bin histogram.

    Args:
        word: the reference word; must be a key of ``vectors``.
        vectors: mapping of word -> embedding vector.
        model_name: label used as the prefix of the plot title.
    """
    reference = vectors[word]
    # The reference word itself is left out of the distribution
    similarities = [
        cosine_similarity(reference, candidate)
        for other, candidate in vectors.items()
        if other != word
    ]
    plt.hist(similarities, bins=20)
    plt.title(f'{model_name}: Similarity of {word} to all words')
    plt.xlabel('Cosine similarity')
    plt.ylabel('Frequency')
    plt.show()

# Same word under both models, so the two similarity distributions can be compared
cosine_similarity_histogram('dog', vectors_word2vec, 'Word2Vec Google News')
cosine_similarity_histogram('dog', vectors_ada, 'OpenAI ada-002')

## Resources

* [Why are Cosine Similarities of Text embeddings almost always positive?](https://vaibhavgarg1982.medium.com/why-are-cosine-similarities-of-text-embeddings-almost-always-positive-6bd31eaee4d5)
* [Expected Angular Differences in Embedding Random Text?](https://community.openai.com/t/expected-angular-differences-in-embedding-random-text/28577)
