In [None]:
import time
import numpy as np

from sklearn.datasets import fetch_openml
from sklearn.manifold import trustworthiness

from umap import UMAP
from umap.parametric_umap import ParametricUMAP
from approx_umap import ApproxUMAP

# Umap vs. Approximate Umap vs. Parametic Umap

### Loading Dataset

In [None]:
# Load MNIST
X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)
X = X.astype(np.float32) / 255.0

# Split (1000 samples for base, 500 samples for projection, 784 features)
X_base = X[:1000]
X_new = X[1000:1500]

In [58]:
# Prepare methods
methods = {
    'UMAP': UMAP(n_neighbors=15, n_components=2, random_state=42),
    'Approximate UMAP': ApproxUMAP(n_neighbors=15, n_components=2, random_state=42),
    'Parametric UMAP': ParametricUMAP(n_neighbors=15, n_components=2, random_state=42)
}

results = {}

### Embedding the base dataset to low dimensional space

In [59]:
print("Running baseline embedding on base MNIST samples")

for name, model in methods.items():
    start = time.time()
    emb_base = model.fit_transform(X_base)
    duration = time.time() - start
    tw = trustworthiness(X_base, emb_base, n_neighbors=5)
    results[name] = {
        'base_time': duration,
        'base_trustworthiness': tw,
        'model': model,
        'emb_base': emb_base
    }
    print(f"{name}: time = {duration:.2f}s, trustworthiness = {tw:.4f}")

Running baseline embedding on base MNIST samples


  warn(


UMAP: time = 4.66s, trustworthiness = 0.9678


  warn(


Approximate UMAP: time = 4.61s, trustworthiness = 0.9678


  warn(


Epoch 1/10




[1m321/321[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 15ms/step - loss: 0.3019
Epoch 2/10
[1m321/321[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 13ms/step - loss: 0.2345
Epoch 3/10
[1m321/321[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 15ms/step - loss: 0.2327
Epoch 4/10
[1m321/321[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 14ms/step - loss: 0.2317
Epoch 5/10
[1m321/321[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 14ms/step - loss: 0.2314
Epoch 6/10
[1m321/321[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 13ms/step - loss: 0.2311
Epoch 7/10
[1m321/321[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 14ms/step - loss: 0.2314
Epoch 8/10
[1m321/321[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 14ms/step - loss: 0.2300
Epoch 9/10
[1m321/321[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 14ms/step - loss: 0.2314
Epoch 10/10
[1m321/321[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 14ms/step - los

### Updating embeddings with added 500 MNIST samples (one-go)

In [None]:
print("Updating embeddings with new 500 MNIST samples (transform only)")
for name, info in results.items():
    model = info['model']
    emb_base = info['emb_base']
    start = time.time()
    emb_new = model.transform(X_new)
    duration = time.time() - start
    
    
    X_combined = np.vstack([X_base, X_new])
    emb_combined = np.vstack([emb_base, emb_new])
    tw_combined = trustworthiness(X_combined, emb_combined, n_neighbors=5)
    results[name].update({
        'update_time': duration,
        'update_trustworthiness': tw_combined
    })
    print(f"{name}: transform time = {duration:.2f}s, combined trustworthiness = {tw_combined:.4f}")


Updating embeddings with new 500 MNIST samples (transform only)




UMAP: transform time = 1.92s, combined trustworthiness = 0.9478
Approximate UMAP: transform time = 0.05s, combined trustworthiness = 0.9128
Parametric UMAP: transform time = 0.11s, combined trustworthiness = 0.8969


In [None]:
# Summary print
print("Summary of results:")
for name, info in results.items():
    print(f"\n{name}:\n  Initial fit (1000 samples):            {info['base_time']:.2f}s, TW = {info['base_trustworthiness']:.4f}\n  Update step (500 new samples added):   {info['update_time']:.2f}s, TW = {info['update_trustworthiness']:.4f}")


Summary of results:

UMAP:
  Initial fit (1000 samples):            4.66s, TW = 0.9678
  Update step (500 new samples added):   1.92s, TW = 0.9478

Approximate UMAP:
  Initial fit (1000 samples):            4.61s, TW = 0.9678
  Update step (500 new samples added):   0.05s, TW = 0.9128

Parametric UMAP:
  Initial fit (1000 samples):            52.36s, TW = 0.9605
  Update step (500 new samples added):   0.11s, TW = 0.8969


### Updating embeddings with added 10000 MNIST samples (batching with 10 samples at each time)

In [62]:
for name, info in results.items():
        results[name]['batching_update_time'] = 0
        results[name]['batching_update_trustworthiness'] = 0

In [63]:
print("Updating embeddings with additions of batched 1k MNIST samples")
for i in range(1, 1000):
    a = 1500 + (i*10)
    x_new = X[a : a+10]
    for name, info in results.items():
        model = info['model']
        emb_base = info['emb_base']
        start = time.time()
        emb_new = model.transform(x_new)
        duration = time.time() - start

        X_combined = np.vstack([X_base, x_new])
        emb_combined = np.vstack([emb_base, emb_new])
        tw_combined = trustworthiness(X_combined, emb_combined, n_neighbors=5)

        results[name]['batching_update_time'] = results[name].get('batching_update_time', 0.0) + duration
        results[name]['batching_update_trustworthiness'] = tw_combined


Updating embeddings with additions of batched 1k MNIST samples




In [64]:
# Summary print
print("Summary of results:")
for name, info in results.items():
    print(f"\n{name}:\n  Initial fit:                                     {info['base_time']:.2f}s, TW = {info['base_trustworthiness']:.4f}\n  Update step (one-go 500 samples):                {info['update_time']:.2f}s, TW = {info['update_trustworthiness']:.4f}\n  Update step (100 batches with 10 samples):       {info['batching_update_time']:.2f}s, TW = {info['batching_update_trustworthiness']:.4f}")

Summary of results:

UMAP:
  Initial fit:                                     4.66s, TW = 0.9678
  Update step (one-go 500 samples):                1.92s, TW = 0.9478
  Update step (100 batches with 10 samples):       67.42s, TW = 0.9678

Approximate UMAP:
  Initial fit:                                     4.61s, TW = 0.9678
  Update step (one-go 500 samples):                0.05s, TW = 0.9128
  Update step (100 batches with 10 samples):       21.62s, TW = 0.9663

Parametric UMAP:
  Initial fit:                                     52.36s, TW = 0.9605
  Update step (one-go 500 samples):                0.11s, TW = 0.8969
  Update step (100 batches with 10 samples):       167.34s, TW = 0.9590


### Updating embeddings with 10k Samples Added

In [65]:
for name, info in results.items():
        results[name]['big_batch_update_time'] = 0
        results[name]['big_batch_update_trustworthiness'] = 0

In [66]:
print("Updating embeddings with additions of new 7k MNIST samples")

x_new = X[11500 : 21500]
for name, info in results.items():
    model = info['model']
    emb_base = info['emb_base']
    start = time.time()
    emb_new = model.transform(x_new)
    duration = time.time() - start
    # combine embeddings and data
    X_combined = np.vstack([X_base, x_new])
    emb_combined = np.vstack([emb_base, emb_new])
    tw_combined = trustworthiness(X_combined, emb_combined, n_neighbors=5)

    results[name]['big_batch_update_time'] = results[name].get('big_batch_update_time', 0.0) + duration
    results[name]['big_batch_update_trustworthiness'] = tw_combined
    
    print(f"{name}: transform time = {duration:.2f}s, combined trustworthiness = {tw_combined:.4f}")

Updating embeddings with additions of new 7k MNIST samples




UMAP: transform time = 41.77s, combined trustworthiness = 0.9222
Approximate UMAP: transform time = 0.47s, combined trustworthiness = 0.8492
Parametric UMAP: transform time = 0.26s, combined trustworthiness = 0.8221


In [70]:
# Summary print
print("Summary of results:")
for name, info in results.items():
    print(f"\n{name}:\n  Initial fit:                                     {info['base_time']:.2f}s, TW = {info['base_trustworthiness']:.4f}\n  Update step (one-go 500 samples):                {info['update_time']:.2f}s, TW = {info['update_trustworthiness']:.4f}\n  Update step (100 batches with 10 samples):       {info['batching_update_time']:.2f}s, TW = {info['batching_update_trustworthiness']:.4f}\n  Update step (1 batch with 10k samples):          {info['big_batch_update_time']:.2f}s, TW = {info['big_batch_update_trustworthiness']:.4f}")

Summary of results:

UMAP:
  Initial fit:                                     4.66s, TW = 0.9678
  Update step (one-go 500 samples):                1.92s, TW = 0.9478
  Update step (100 batches with 10 samples):       67.42s, TW = 0.9678
  Update step (1 batch with 10k samples):          41.77s, TW = 0.9222

Approximate UMAP:
  Initial fit:                                     4.61s, TW = 0.9678
  Update step (one-go 500 samples):                0.05s, TW = 0.9128
  Update step (100 batches with 10 samples):       21.62s, TW = 0.9663
  Update step (1 batch with 10k samples):          0.47s, TW = 0.8492

Parametric UMAP:
  Initial fit:                                     52.36s, TW = 0.9605
  Update step (one-go 500 samples):                0.11s, TW = 0.8969
  Update step (100 batches with 10 samples):       167.34s, TW = 0.9590
  Update step (1 batch with 10k samples):          0.26s, TW = 0.8221
