# Visualize Embeddings

This notebook helps to visualize embeddings using the UMAP algorithm.
Make sure to generate the embeddings first using the `generate_embeddings.py` script.


In [None]:
"""
DISCLAIMER:
This code is provided "as-is" without any warranty of any kind, either expressed or implied,
including but not limited to the implied warranties of merchantability and fitness for a particular purpose.
The author assumes no liability for any damages or consequences resulting from the use of this code.
Use it at your own risk.

Notebook for visualizing the embeddings of the MLAADv5 project.

It loads the `.npy` embeddings of the train, dev and test splits, fits a UMAP
projection on the train and dev embeddings, and plots the projections colored by class.

## Author: Piotr KAWA
## December 2024
"""

In [None]:
import json
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import umap

In [None]:
# Root directory of the embeddings; the cells below read its "train", "dev"
# and "test" subdirectories.
embeddings_root_dir = "../data/embeddings"

In [None]:
def find_samples(embeddings_dir_path: Path) -> list[dict]:
    """Collect every ``.npy`` embedding file found below a directory.

    The directory is searched recursively. The name of the directory that
    directly contains a file is used as that file's class id.

    Args:
        embeddings_dir_path: Root directory to search. A plain string path is
            accepted as well because the value is converted to ``Path``.

    Returns:
        One dict per embedding file with the keys ``"embedding_path"`` (the
        file path as a string) and ``"class_id"`` (the parent directory
        name), ordered by path.
    """
    embeddings_dir_path = Path(embeddings_dir_path)
    # rglob yields entries in filesystem order, which is not stable across
    # machines; sorting keeps the sample order reproducible. Directories whose
    # name ends in ".npy" are skipped because they cannot be loaded as arrays.
    embedding_paths = sorted(
        p for p in embeddings_dir_path.rglob("*.npy") if p.is_file()
    )
    return [
        {"embedding_path": str(p), "class_id": p.parent.name}
        for p in embedding_paths
    ]

In [None]:
# Resolve the directory of each data split below the embeddings root.
embeddings_dir_root = Path(embeddings_root_dir)
train_subdir_root, dev_subdir_root, test_subdir_root = (
    embeddings_dir_root / split_name for split_name in ("train", "dev", "test")
)

# Train and dev samples share one table (train rows first, then dev rows);
# test samples get a table of their own.
train_and_dev_samples = pd.DataFrame(
    [*find_samples(train_subdir_root), *find_samples(dev_subdir_root)]
)
test_samples = pd.DataFrame(find_samples(test_subdir_root))

In [None]:
# JSON files are UTF-8 encoded; state the encoding explicitly so the file is
# not decoded with a platform-dependent default.
with open("../class_mapping.json", encoding="utf-8") as f:
    class_mapping = json.load(f)

# Inverted mapping: look up a key of the JSON file by its value.
# NOTE(review): presumably the JSON maps class names to integer ids - confirm.
inv_class_mapping = {v: k for k, v in class_mapping.items()}


def _class_name_from_id(class_id):
    """Return the class name for a class id given as a directory name."""
    # Class ids come from directory names (strings) and are converted to int
    # before the lookup in the inverted mapping.
    return str(inv_class_mapping[int(class_id)])


# Attach a readable class name to every sample of both tables.
train_and_dev_samples["class_name"] = train_and_dev_samples["class_id"].apply(
    _class_name_from_id
)
test_samples["class_name"] = test_samples["class_id"].apply(_class_name_from_id)

In [None]:
# Load every embedding file into memory, in the row order of its samples
# table, so that list positions line up with table rows.
train_and_dev_embeddings = list(
    map(np.load, train_and_dev_samples["embedding_path"])
)
test_embeddings = list(map(np.load, test_samples["embedding_path"]))

In [None]:
print("Fit + transform train and dev embeddings")
# UMAP with the library's default settings. The fitted reducer is kept in a
# variable because the test embeddings are projected with it further below.
reducer = umap.UMAP()
train_embedding_umap = reducer.fit_transform(train_and_dev_embeddings)

In [None]:
plt.figure(figsize=(15, 15))

# One scatter call per class, so every class gets its own legend entry.
train_class_names = train_and_dev_samples["class_name"]
for class_name in train_class_names.unique():
    # Boolean row mask selecting the projected points of the current class.
    class_mask = (train_class_names == class_name).to_numpy()
    class_points = train_embedding_umap[class_mask]
    plt.scatter(class_points[:, 0], class_points[:, 1], s=3, label=class_name)

plt.title("UMAP projection of the train and dev embeddings")
# The legend is anchored to the right of the axes; markers are enlarged so
# the small (s=3) points stay readable in the legend.
plt.legend(markerscale=5, bbox_to_anchor=(1.05, 1), loc="upper left")
plt.show()

In [None]:
# Project the test embeddings with the reducer fitted on train and dev data;
# the reducer is not refitted here.
print("Transforming test embeddings")
test_embedding_umap = reducer.transform(
    test_embeddings,
)

In [None]:
plt.figure(figsize=(15, 15))

# One scatter call per class, so every class gets its own legend entry.
test_class_names = test_samples["class_name"]
for class_name in test_class_names.unique():
    # Boolean row mask selecting the projected points of the current class.
    class_mask = (test_class_names == class_name).to_numpy()
    class_points = test_embedding_umap[class_mask]
    plt.scatter(class_points[:, 0], class_points[:, 1], s=3, label=class_name)

plt.title("UMAP projection of the test embeddings")
# The legend is anchored to the right of the axes; markers are enlarged so
# the small (s=3) points stay readable in the legend.
plt.legend(markerscale=5, bbox_to_anchor=(1.05, 1), loc="upper left")
plt.show()