In [None]:
import numpy as np
import matplotlib.pyplot as plt
from datasets.tensor_storage import TensorStorage
from sklearn.decomposition import PCA
from tqdm import tqdm
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
# import pandas as pd
from sklearn.metrics import classification_report
import numpy as np
from sklearn.manifold import TSNE
from umap import UMAP
import ncvis
import time

In [2]:
# Identifiers of the two runs compared in this notebook; each one is used as the
# directory name under storages/ when the matching TensorStorage is opened.
dim_experiment_id = "deepinfomax_20241218_052013"
ae_experiment_id = "autoencoder_20241218_054318"

In [3]:
# Open one TensorStorage per run (DeepInfoMax first, then the autoencoder),
# each rooted at storages/<run id>.
dim_store, ae_store = (
    TensorStorage(f"storages/{run_id}")
    for run_id in (dim_experiment_id, ae_experiment_id)
)

In [9]:
def reduce_and_plot_embeddings(embeddings, metadata_df, split='train', methods=None, n_samples=1000, seed=42):
    """Project a class-balanced subset of embeddings to 2-D and plot every projection.

    Rows whose ``'split'`` column equals ``split`` are selected, at most
    ``n_samples // 10`` rows are drawn for each class label 0-9, the subset is
    standardized with ``StandardScaler`` and each requested reducer is fitted on
    it. One scatter panel per method is drawn, colored by class, with a shared
    legend in an extra narrow column on the right.

    Parameters
    ----------
    embeddings : array-like
        One row per row of ``metadata_df`` (lengths must match). It is indexed
        with a boolean mask and then with an integer index array.
    metadata_df : DataFrame-like
        Must provide ``'split'`` and ``'label'`` columns. Labels are compared
        against the integers 0-9, which index the CIFAR-10 class names.
    split : str
        Value of the ``'split'`` column to keep.
    methods : sequence of str, str or None
        Any of ``'pca'``, ``'tsne'``, ``'umap'``, ``'ncvis'``. ``None`` runs all
        four; a single string is treated as a one-element list.
    n_samples : int
        Target subset size; split evenly across the 10 classes (integer division).
    seed : int or None
        Seeds the stratified sampling and the PCA, t-SNE and UMAP reducers.

    Returns
    -------
    tuple
        ``(fig, reduced_embeddings, times, subset_embeddings, subset_labels)``:
        the figure, a dict of 2-D projections keyed by method, a dict of
        wall-clock seconds keyed by method, the *standardized* subset that was
        fed to the reducers, and the labels of that subset.

    Raises
    ------
    ValueError
        If ``methods`` is empty or contains an unsupported name, if
        ``embeddings`` and ``metadata_df`` differ in length, if no row belongs
        to ``split``, or if the stratified sample ends up empty.
    """
    supported_methods = ('pca', 'tsne', 'umap', 'ncvis')
    if methods is None:
        methods = list(supported_methods)
    elif isinstance(methods, str):
        # A bare string would otherwise be iterated character by character.
        methods = [methods]
    else:
        methods = list(methods)

    # Validate up front so a typo fails before any expensive fitting happens,
    # instead of surfacing as a KeyError while plotting.
    if not methods:
        raise ValueError("methods must name at least one reduction method")
    unknown_methods = [m for m in methods if m not in supported_methods]
    if unknown_methods:
        raise ValueError(
            f"Unknown reduction method(s) {unknown_methods}; "
            f"expected a subset of {list(supported_methods)}"
        )

    # CIFAR-10 class names
    class_names = [
        'airplane', 'automobile', 'bird', 'cat', 'deer',
        'dog', 'frog', 'horse', 'ship', 'truck'
    ]

    # The boolean mask below pairs metadata row i with embedding row i.
    if len(metadata_df) != len(embeddings):
        raise ValueError(
            f"embeddings has {len(embeddings)} rows but metadata_df has "
            f"{len(metadata_df)}; they must be aligned row for row"
        )

    # Filter by split (plain NumPy mask so indexing is purely positional)
    split_mask = np.asarray(metadata_df['split'] == split)
    split_embeddings = embeddings[split_mask]
    split_labels = np.asarray(metadata_df['label'])[split_mask]
    if len(split_labels) == 0:
        raise ValueError(f"No rows found for split {split!r}")

    # Take a stratified random sample. A private RandomState yields the same
    # draws as np.random.seed(seed) followed by np.random.choice, without
    # reseeding the global NumPy generator for the rest of the kernel.
    rng = np.random.RandomState(seed)
    samples_per_class = n_samples // len(class_names)
    per_class_indices = []
    for label in range(len(class_names)):
        label_indices = np.flatnonzero(split_labels == label)
        if len(label_indices) > samples_per_class:
            label_indices = rng.choice(label_indices, samples_per_class, replace=False)
        per_class_indices.append(label_indices)
    sampled_indices = np.concatenate(per_class_indices)
    if sampled_indices.size == 0:
        raise ValueError(
            f"Stratified sample is empty (n_samples={n_samples}); labels must be "
            f"integers in 0-{len(class_names) - 1} and n_samples at least {len(class_names)}"
        )

    # Get subset of data
    subset_embeddings = split_embeddings[sampled_indices]
    subset_labels = split_labels[sampled_indices]

    # Scale the features; the scaled array is what gets reduced and returned.
    subset_embeddings = StandardScaler().fit_transform(subset_embeddings)

    # Reducers are built lazily so construction time is part of each method's timing.
    reducer_factories = {
        'pca': lambda: PCA(n_components=2, random_state=seed),
        'tsne': lambda: TSNE(n_components=2, random_state=seed, perplexity=30),
        'umap': lambda: UMAP(n_components=2, random_state=seed, n_neighbors=30, min_dist=0.3),
        # NCVis is constructed without an explicit seed, so `seed` does not
        # control this projection.
        'ncvis': lambda: ncvis.NCVis(n_neighbors=30),
    }

    reduced_embeddings = {}
    times = {}
    explained_var = {}  # only filled for PCA

    # Apply each method
    for method in methods:
        print(f"\nApplying {method.upper()}...")
        start_time = time.perf_counter()  # monotonic clock for durations

        reducer = reducer_factories[method]()
        reduced_embeddings[method] = reducer.fit_transform(subset_embeddings)
        if method == 'pca':
            explained_var[method] = reducer.explained_variance_ratio_

        times[method] = time.perf_counter() - start_time
        print(f"{method.upper()} took {times[method]:.2f} seconds")

    # Plotting: one panel per method plus a narrow trailing column for the legend
    n_methods = len(methods)
    fig = plt.figure(figsize=(6*n_methods, 5))
    gs = fig.add_gridspec(1, n_methods + 1, width_ratios=[1]*n_methods + [0.2])
    axes = [fig.add_subplot(gs[0, i]) for i in range(n_methods)]

    # Use a colorful and visually appealing palette
    palette = sns.color_palette('husl', n_colors=len(class_names))

    for ax, method in zip(axes, methods):
        embeddings_2d = reduced_embeddings[method]

        # Plot each class
        for label in range(len(class_names)):
            mask = subset_labels == label
            ax.scatter(
                embeddings_2d[mask, 0],
                embeddings_2d[mask, 1],
                color=palette[label],
                label=class_names[label],
                alpha=0.7,
                s=40,
                rasterized=True
            )

        # Customize the plot
        title = method.upper()
        if method == 'pca':
            var_1, var_2 = explained_var[method]
            title += f'\nVar: ({var_1:.1%}, {var_2:.1%})'

        ax.set_title(title, fontsize=14, pad=10)
        ax.grid(False)
        ax.set_xticks([])
        ax.set_yticks([])

    # Add legend in its own (axis-less) column
    legend_ax = fig.add_subplot(gs[0, -1])
    legend_ax.axis('off')

    # Create legend handles
    legend_elements = [plt.Line2D([0], [0], marker='o', color='w',
                                markerfacecolor=palette[i], label=class_names[i],
                                markersize=8)
                      for i in range(len(class_names))]
    legend_ax.legend(handles=legend_elements, loc='center left', fontsize=10)

    fig.tight_layout()

    return fig, reduced_embeddings, times, subset_embeddings, subset_labels

In [10]:
def load_embeddings_and_metadata(store):
    """Read the metadata table and every item held by ``store`` into memory.

    The metadata table is fetched first via ``store.load_metadata_table()``;
    the items are then read one by one as ``store[0] .. store[len(store) - 1]``
    behind a tqdm progress bar.

    Returns
    -------
    tuple
        ``(np.array(items), metadata_table)``.
    """
    table = store.load_metadata_table()

    # Materialise each stored item by integer position.
    n_items = len(store)
    vectors = [store[idx] for idx in tqdm(range(n_items), desc="Loading embeddings")]

    return np.array(vectors), table

In [None]:
# Pull every DeepInfoMax item into memory together with its metadata table.
dim_embeddings, dim_metadata_df = load_embeddings_and_metadata(store=dim_store)

In [None]:
# Pull every autoencoder item into memory together with its metadata table.
ae_embeddings, ae_metadata_df = load_embeddings_and_metadata(store=ae_store)

In [None]:
# 2-D projections of the DeepInfoMax embeddings (train split); the last two
# return values (subset embeddings and labels) are not needed here.
print("Applying dimensionality reduction to DeepInfoMax embeddings...")
(dim_fig, dim_reduced, dim_times, _, _) = reduce_and_plot_embeddings(
    embeddings=dim_embeddings,
    metadata_df=dim_metadata_df,
    split='train',
)

In [None]:
# 2-D projections of the autoencoder embeddings (train split); the last two
# return values (subset embeddings and labels) are not needed here.
print("Applying dimensionality reduction to AE embeddings...")
(ae_fig, ae_reduced, ae_times, _, _) = reduce_and_plot_embeddings(
    embeddings=ae_embeddings,
    metadata_df=ae_metadata_df,
    split='train',
)