# In[3]:
import os

import h5py
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.manifold import TSNE

from common.file_paths import BASE_DIR

# Parallel lists: entry i of each one describes the same HDF5 sample.
embeddings = []
descriptions = []
labels = []

with h5py.File(os.path.join(BASE_DIR, 'data.h5'), 'r') as hf:
    # Iterate over the top-level groups, treated as dataset splits
    # (train, val, test).
    for split in hf:
        group = hf[split]

        # Each child of a split is a named dataset; its name is later used
        # as the colour label in the plot.
        for dataset in group:
            samples = group[dataset]
            for sample in samples:
                # Resolve the sample node once instead of re-walking
                # group -> dataset -> sample for every field.
                node = samples[sample]

                # Read the stored feature vector fully into memory.
                embedding = node['features'][:]
                embeddings.append(embedding)

                # Depending on the h5py version and the stored string dtype,
                # a scalar string read may come back as bytes or as str;
                # only non-str values need decoding.
                raw = node['description'][()]
                description = raw if isinstance(raw, str) else raw.decode('utf-8')
                descriptions.append(description)

                # The dataset name is the label for this sample.
                labels.append(dataset)

# scikit-learn requires 0 < perplexity < n_samples, so t-SNE cannot run on
# fewer than two points. Fail early with a clear message instead of handing
# TSNE a zero or negative perplexity.
n_samples = len(embeddings)
if n_samples < 2:
    raise ValueError(
        f"t-SNE needs at least 2 embeddings, but only {n_samples} were loaded"
    )

# Convert the list of embeddings into a single numpy array
# (assumes every embedding has the same shape -- TODO confirm).
embeddings_array = np.array(embeddings)

# Use 30 (scikit-learn's default perplexity) when there are enough samples,
# otherwise the largest value that stays strictly below the sample count.
perplexity_value = min(30, n_samples - 1)

# Use t-SNE to reduce the dimensionality to 2-D; random_state is fixed so
# repeated runs give the same layout.
tsne = TSNE(n_components=2, random_state=0, perplexity=perplexity_value)
reduced_embeddings = tsne.fit_transform(embeddings_array)

# Create a DataFrame for Plotly: the two t-SNE coordinates plus per-sample
# metadata. `labels` and `descriptions` must have one entry per row of
# `reduced_embeddings`, in the same order, so that row i describes sample i.
df = pd.DataFrame(reduced_embeddings, columns=['x', 'y'])
df['label'] = labels
df['description'] = descriptions

# Interactive scatter plot: one marker per row of `df`, coloured by its
# label, with the description included in the hover tooltip.
fig = px.scatter(
    df,
    x='x',
    y='y',
    color='label',
    hover_data=['description'],
)

# Apply the titles and the dark-mode styling in a single layout update.
fig.update_layout(
    title='t-SNE visualization of sentence embeddings',
    xaxis_title='t-SNE dimension 1',
    yaxis_title='t-SNE dimension 2',
    template="plotly_dark",  # Built-in dark mode template
    plot_bgcolor='rgba(0,0,0,0)',  # Alpha 0: plotting area is fully transparent
    paper_bgcolor='rgba(0,0,0,0)',  # Alpha 0: surrounding paper is fully transparent
    font_color="white",  # White text
)

fig.show()