In [1]:
import numpy as np
import datasets
from rich.console import Console
from rich.table import Table
from rich.layout import Layout

In [2]:
# Load the precomputed embeddings table from the local parquet file.
embeddings_path = "embeddings.parquet"
dataset = datasets.Dataset.from_parquet(embeddings_path)

# The embedding column must come back as numpy arrays for the vector math in
# later cells; output_all_columns=True keeps the other columns in each row.
dataset.set_format(
    type="numpy",
    columns=["embedding"],
    output_all_columns=True,
)
dataset

Using custom data configuration default-142e43c20aee2ea2
Found cached dataset parquet (/Users/maxwoolf/.cache/huggingface/datasets/parquet/default-142e43c20aee2ea2/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


Dataset({
    features: ['id', 'name', 'num_tokens', 'embedding'],
    num_rows: 1154
})

Because the embeddings are already unit normalized, the cosine similarity can be calculated just by calculating the dot product, which in Python is easy since you can do `A @ B`!

In [3]:
# Cosine similarity between Pikachu (row 24) and Raichu (row 25):
# for unit-normalized vectors this is just their dot product.
pikachu_embedding = dataset[24]["embedding"]
raichu_embedding = dataset[25]["embedding"]
pikachu_embedding @ raichu_embedding

0.961932

However, because the input data is all semantically similar, the cosine similarity will always be very high (in testing, the lowest was `0.88`), so rescaling the scores is an option.

Let's try inputting your own ID. Feel free to select your own Pokemon as the target ID to find similarity results you want!

In [4]:
target_id = 1

# Look the Pokémon up by an exact match on its `id` column rather than by row
# position, because the row position does not line up with the ID for every entry.
matching_rows = dataset.filter(lambda row: row["id"] == target_id)

# Fail with a clear message instead of an opaque out-of-bounds error from
# indexing an empty result when the chosen ID is not in the dataset.
if len(matching_rows) == 0:
    raise ValueError(f"No Pokémon with id {target_id!r} found in the dataset.")

target_poke = matching_rows[0]
target_poke["name"]

Loading cached processed dataset at /Users/maxwoolf/.cache/huggingface/datasets/parquet/default-142e43c20aee2ea2/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-1db0bacecf6d748b.arrow


'bulbasaur'

However, to compare one Pokemon with all Pokemon, it's much faster to do the dot product of all Pokemon embeddings at the same time, then select the top ones (ignoring the top match, which will always be the original one)

In [5]:
# A single matrix-vector product scores every Pokémon against the target at once.
cos_sims = np.dot(dataset["embedding"], target_poke["embedding"])

# Min-max rescale the scores onto 0-1.
# Comment out the next two lines to keep the raw cosine similarities.
lowest_sim, highest_sim = cos_sims.min(), cos_sims.max()
cos_sims = (cos_sims - lowest_sim) / (highest_sim - lowest_sim)

# np.argsort ranks ascending, so flip the result to put the most similar first.
top_indicies = np.flip(np.argsort(cos_sims))
top_indicies

array([   0,    1,    2, ..., 1130,  692,  894])

Map these indices to the dataset.

For fun, we'll style the similarity results as a table using `rich`.

In [6]:
top_n = 25

# Position 0 of the ranking is expected to be the target Pokémon itself, so skip it.
filtered_indices_pos = top_indicies[1 : (top_n + 1)]
# Reverse first, then slice. This equals `top_indicies[-top_n:][::-1]` for
# top_n > 0, but stays empty for top_n == 0 (where `-0 == 0` would otherwise
# select every row).
filtered_indices_neg = top_indicies[::-1][:top_n]


def build_similarity_table(title, indices, score_style):
    """Build a rich Table listing ID, name and similarity for the given rows.

    Args:
        title: Heading rendered above the table.
        indices: Row positions into `dataset` / `cos_sims`, in display order.
        score_style: rich style applied to the "Similarity" column.

    Returns:
        The populated `Table`.
    """
    table = Table(title=title)
    table.add_column("ID", style="bold")
    table.add_column("Pokémon")
    table.add_column("Similarity", style=score_style)

    for index in indices:
        idx = int(index)  # numpy integer -> plain int before indexing the dataset
        row = dataset[idx]  # look the row up once instead of once per field
        table.add_row(str(row["id"]), row["name"], f"{cos_sims[idx]:.3%}")

    return table


# Two side-by-side panels: most similar on the left, least similar on the right.
layout = Layout()
layout.split_row(
    Layout(name="left"),
    Layout(name="right"),
)

# Top N Pokemon
pos_table = build_similarity_table(
    f"Pokémon Most Similar to {target_poke['name']}",
    filtered_indices_pos,
    "green",
)
layout["left"].update(pos_table)

# Bottom N Pokemon
neg_table = build_similarity_table(
    f"Pokémon Least Similar to {target_poke['name']}",
    filtered_indices_neg,
    "cyan",
)
layout["right"].update(neg_table)

# Console height scales with top_n, plus 10 extra lines beyond the data rows.
console = Console(width=90, height=top_n+10)
console.print(layout)