# Comparing Embedding Models

## Setup

In [None]:
%matplotlib widget
import os
import mplcursors
import numpy as np
from matplotlib import pyplot as plt
from PIL import Image

In [None]:
from datasets import load_dataset
from langchain.embeddings import HuggingFaceEmbeddings

## Load the "Tiny Shakespeare" dataset

In [None]:
# Load the Tiny Shakespeare dataset by its repository id.
dataset = load_dataset("Trelis/tiny-shakespeare")

# Merge the train and test splits into one list of texts. Wrapping each column
# in list() keeps the concatenation working whether column access yields a
# plain list or a lazy column object.
texts = list(dataset["train"]["Text"]) + list(dataset["test"]["Text"])
print(f'Imported {len(texts)} texts')

print('First 100 characters of text #42:')
# The slice end is exclusive, so [:100] yields the 100 characters promised above
# (the previous [:99] printed only 99).
print(texts[42][:100])

In [None]:
# One color value per text, taken from the text's length in characters.
# Both scatter plots further down reuse this array for their `c=` argument.

colors = np.array(list(map(len, texts)))
print(f"First 4 color values: {colors[:4]}")

## Configure the embedding model

In [None]:
from sentence_transformers import SentenceTransformer

# Identifier of the sentence-embedding checkpoint to load.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
# Build the model object that is used below to encode the texts.
minilm_model = SentenceTransformer(model_name_or_path=model_name)

## Generate the embeddings

In [None]:
# Embed every text with the MiniLM model, showing a progress bar while encoding.
embeddings = minilm_model.encode(
    sentences=texts,
    show_progress_bar=True,
)

In [None]:
# Show the dimensions of the encoded texts — presumably
# (number of texts, embedding size); confirm against the printed output.
print(embeddings.shape)

### Reduce the embeddings using Principal Component Analysis (PCA)

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Fit a two-component PCA on the embeddings. fit() returns the estimator
# itself, so the fitted model can be bound in a single statement.
pca_two_d = PCA(n_components=2).fit(embeddings)
# Project the same embeddings onto the fitted components.
pca_two_d_embeddings = pca_two_d.transform(embeddings)

In [None]:
# Show the shape after PCA; with n_components=2 the second dimension should be 2.
print(pca_two_d_embeddings.shape)

In [None]:
# Scatter plot of the PCA projection, drawn through an explicit Axes object.
# Each point is colored by the length of its source text (`colors`).
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(
    pca_two_d_embeddings[:, 0],
    pca_two_d_embeddings[:, 1],
    c=colors,
    marker='o',
)
ax.set_title('2D PCA of Embeddings')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.grid(True)
plt.show()

### Reduce the embeddings using t-distributed Stochastic Neighbor Embedding (t-SNE)

In [None]:
from sklearn.manifold import TSNE

In [None]:
# t-SNE is stochastic: without a fixed seed the 2D layout changes on every run,
# which makes the plot below impossible to reproduce or compare between runs.
# Pinning random_state keeps the projection stable.
tsne = TSNE(n_components=2, random_state=42)
# t-SNE has no separate transform step; fit_transform both fits the model and
# returns the low-dimensional coordinates of the input embeddings.
tsne_reduced_embeddings = tsne.fit_transform(embeddings)

In [None]:
# Show the shape after t-SNE; with n_components=2 the second dimension should be 2.
print(tsne_reduced_embeddings.shape)

In [None]:
# Scatter plot of the t-SNE projection, colored by text length (`colors`).
# The title and axis labels name t-SNE explicitly: the axes of a t-SNE embedding
# are not principal components, so the PCA wording copied from the earlier plot
# was misleading.
plt.figure(figsize=(10, 6))
plt.scatter(tsne_reduced_embeddings[:, 0], tsne_reduced_embeddings[:, 1], c=colors, marker='o')
plt.title('2D t-SNE of Embeddings')
plt.xlabel('t-SNE Dimension 1')
plt.ylabel('t-SNE Dimension 2')
plt.grid(True)
plt.show()

## Exercises

- Experiment with modifying the plots. A few ideas: change the colors, or plot a different number of points.

## Discussion Questions

- Do you notice any differences between the two plots?
- We chose text length to determine the color value. What else could we use to differentiate the points?