# Multi-word vector embeddings

Some models can generate embeddings for multi-word phrases:

* Sentence-BERT
* OpenAI: text-embedding-ada-002, text-embedding-3-small, text-embedding-3-large

To compare embedding models, see the MTEB (Massive Text Embedding Benchmark) leaderboard: https://huggingface.co/spaces/mteb/leaderboard

In [None]:
# Load in the vectors from openai_movies.json, {movie: [vector]}
import json
import numpy as np

# JSON text is UTF-8; name the encoding explicitly so movie titles with
# non-ASCII characters decode the same way regardless of the platform default
with open('openai_movies.json', encoding='utf-8') as f:
    movies = json.load(f)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Line plot of one movie's embedding: with a single sequence, matplotlib
# uses the element index for x and the element value for y
title = 'Un indien dans la ville'
movie = movies[title]
plt.plot(movie)
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Render one movie's embedding as a single-row heatmap strip
movie = movies['Frozen']
data = np.array(movie).reshape(1, -1)  # one row, as many columns as needed

# Wide, short canvas for the strip
fig, ax = plt.subplots(figsize=(25, 1))

# Colour scale limited to [-0.1, 0.1], assuming most values fall in that range
heatmap_options = dict(cmap='coolwarm', aspect='auto', vmin=-0.1, vmax=0.1)
cax = ax.imshow(data, **heatmap_options)

# Colour legend next to the strip
cbar = fig.colorbar(cax, orientation='vertical')

# Remove tick marks from both axes
ax.set(xticks=[], yticks=[])

plt.show()

In [None]:
# Compare embeddings as stacked heatmap rows, one row per movie.
# Frozen and Moana have similar plots; Pearl Harbor is plotted alongside them
# (presumably as a contrasting film).

movie_names = ['Frozen', 'Moana', 'Pearl Harbor']

# One row per movie, in the same order as movie_names
data = np.array([movies[movie_name] for movie_name in movie_names])

fig, ax = plt.subplots(figsize=(25, 5))

# Colour scale limited to [-0.05, 0.05]
cax = ax.imshow(data, cmap='coolwarm', aspect='auto', vmin=-0.05, vmax=0.05)

# Put lines between the movies: row i is centred on y = i, so the boundary
# between row i - 1 and row i sits at i - 0.5
for i in range(1, len(movie_names)):
    ax.axhline(i - 0.5, color='white', lw=1)

# Add color bar
cbar = fig.colorbar(cax, orientation='vertical')

# Hide the x ticks and label each row with its movie title
ax.set_xticks([])
ax.set_yticks(range(len(movie_names)))
ax.set_yticklabels(movie_names)

# Show the figure explicitly, so the cell does not end on the list of tick labels
plt.show()

In [None]:
# Are they unit vectors? Print the Euclidean length of each embedding
# (a value of 1.0 means the vector is unit length)
for title in ('Frozen', 'Moana', 'Pearl Harbor'):
    print(np.linalg.norm(movies[title]))

In [None]:
# Find the 10 movies most similar to a query movie
# (change query_title to explore another movie, e.g. 'Frozen')
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

query_title = 'The Little Mermaid'
movie_titles = list(movies.keys())  # built once; column i of similarities is movie_titles[i]
query_vector = np.array(movies[query_title]).reshape(1, -1)

# Cosine similarity between the query and every movie (the query included);
# the result has a single row with one column per movie
similarities = cosine_similarity(query_vector, np.array(list(movies.values())))

# Rank every movie from most to least similar, then drop the query by its
# index rather than assuming it always sorts into the top slot (duplicate
# embeddings or rounding could put another movie there)
query_index = movie_titles.index(query_title)
ranked = np.argsort(similarities[0])[::-1]
most_similar = ranked[ranked != query_index][:10].tolist()
similar_movies = [(movie_titles[i], round(similarities[0, i], 3)) for i in most_similar]

pd.DataFrame(similar_movies, columns=['movie', 'similarity'])


In [None]:
# Display the 10 least similar movies
# (relies on `similarities` still being the single-row query-vs-all matrix)
movie_titles = list(movies.keys())  # built once instead of once per result row

# argsort is ascending, so the first 10 indices are the lowest similarities
least_similar = np.argsort(similarities)[0][:10].tolist()
dissimilar_movies = [(movie_titles[i], round(similarities[0, i], 3)) for i in least_similar]

pd.DataFrame(dissimilar_movies, columns=['movie', 'similarity'])

In [None]:
# Locate the two movies whose embeddings are least alike across the whole dataset

# Pairwise cosine similarity of every movie against every other movie
embedding_matrix = np.array(list(movies.values()))
similarities = cosine_similarity(embedding_matrix)
# Set each movie's similarity with itself to 1
np.fill_diagonal(similarities, 1)

# Convert the flat position of the smallest entry back into (row, column)
least_similar = np.unravel_index(similarities.argmin(), similarities.shape)
movie_keys = list(movies.keys())
row, col = least_similar
movie_keys[row], movie_keys[col], similarities[least_similar]


In [None]:
# Find the pair of movies with the highest similarity BUT DO NOT INCLUDE THE SAME MOVIE
# Mask each movie's similarity with itself using -inf so a self-pair can never
# be the maximum; a sentinel of 0 would win if every real pairwise similarity
# were negative. Note this overwrites the diagonal of `similarities` in place.
np.fill_diagonal(similarities, -np.inf)
most_similar = np.unravel_index(np.argmax(similarities), similarities.shape)
movie_keys[most_similar[0]], movie_keys[most_similar[1]], similarities[most_similar]
