In [None]:
%matplotlib widget
import os
import numpy as np
from matplotlib import pyplot as plt
from PIL import Image

In [None]:
from datasets import load_dataset
from langchain.embeddings import HuggingFaceEmbeddings

In [None]:
# Fetch the Tiny Shakespeare dataset by its Hugging Face repo id.
dataset = load_dataset("Trelis/tiny-shakespeare")

# Merge both splits into one list: the notebook only visualises the passages,
# so the train/test distinction is not used. list(...) keeps the `+`
# concatenation (and later integer indexing) working whether the column is
# returned as a plain list or as a lazy column object
# (NOTE(review): newer `datasets` releases appear to return the latter — confirm).
texts = list(dataset["train"]["Text"]) + list(dataset["test"]["Text"])
print(f'Imported {len(texts)} texts')

# Slice with [:100] so the preview really contains the 100 characters the
# heading announces (the previous [:99] stopped one character short).
print('First 100 characters of text #42:')
print(texts[42][:100])

In [4]:
# One value per passage: its length in characters, used later as the
# colour dimension of the scatter plot.
colors = np.array(list(map(len, texts)))
print(colors.shape)

(521,)


In [5]:
from sentence_transformers import SentenceTransformer

In [6]:
# Identifier of the sentence-embedding checkpoint used throughout the notebook.
model_name = "sentence-transformers/all-MiniLM-L6-v2"

# Instantiate the encoder from that identifier.
minilm_model = SentenceTransformer(model_name_or_path=model_name)



In [7]:
# Encode every passage with the MiniLM model; the progress bar reports
# batch-by-batch progress while this runs.
embeddings = minilm_model.encode(
    texts,
    show_progress_bar=True,
)

Batches:   0%|          | 0/17 [00:00<?, ?it/s]

In [None]:
# Sanity check on the encoder output; expected to be
# (number of texts, embedding width) — TODO confirm the width for this model.
embeddings_shape = embeddings.shape
print(embeddings_shape)

In [None]:
# Reduce the dimensionality of the embeddings with PCA
from sklearn.decomposition import PCA

# Project onto the first three principal components.
# random_state is pinned so that re-running the notebook yields the same
# projection even if scikit-learn selects a randomized SVD solver for this
# input size (it has no effect when a deterministic solver is used).
pca = PCA(n_components=3, random_state=42)

# fit_transform fits the model and projects the same data in one call;
# `pca` remains fitted afterwards, exactly as with separate fit/transform.
pca_embeddings = pca.fit_transform(embeddings)


In [None]:
# Sanity check on the PCA projection: the second dimension should equal the
# number of components requested from PCA.
pca_shape = pca_embeddings.shape
print(pca_shape)

In [None]:
from sklearn.manifold import TSNE

# Embed the sentence vectors into three dimensions for plotting.
# t-SNE is a stochastic optimisation: without a fixed seed the layout changes
# on every run, so the figure below could not be reproduced after a kernel
# restart. Pinning random_state makes the result repeatable.
tsne = TSNE(n_components=3, random_state=42)
tsne_three_d = tsne.fit_transform(embeddings)

In [None]:
# Sanity check on the t-SNE output: the second dimension should equal the
# number of components requested from TSNE.
tsne_shape = tsne_three_d.shape
print(tsne_shape)

In [None]:
import mplcursors

In [None]:
# Interactive 3-D scatter of the t-SNE coordinates, one point per passage,
# coloured by the values in `colors`.
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection='3d')
scatter = ax.scatter(
    tsne_three_d[:, 0],
    tsne_three_d[:, 1],
    tsne_three_d[:, 2],
    c=colors,
    cmap='viridis',
    edgecolor='k',
    s=60,
    alpha=0.6,
)

# Use the explicit figure/axes interface and label everything so the figure
# can be read on its own.
fig.colorbar(scatter, ax=ax, label='Text length (characters)')
ax.set_title('t-SNE visualization of "Tiny Shakespeare" dataset')
ax.set_xlabel('t-SNE 1')
ax.set_ylabel('t-SNE 2')
ax.set_zlabel('t-SNE 3')

# Adding hover functionality
cursor = mplcursors.cursor(scatter, hover=True)


@cursor.connect("add")
def on_add(sel):
    """Show the passage belonging to the hovered point as its annotation.

    Args:
        sel: The mplcursors selection passed to "add" callbacks; its
            ``index`` identifies the picked point within the scatter
            collection and is used to look up the passage in ``texts``.
    """
    # `sel.index` is the supported accessor; `sel.target.index` is the
    # deprecated spelling of the same value.
    sel.annotation.set_text(texts[sel.index])


plt.show()