In [1]:
import pandas as pd
from arrow import now

# Number of rows randomly drawn from the MNIST training CSV for the projections below.
N = 2000
TRAIN = '/kaggle/input/mnist-in-csv/mnist_train.csv'

time_start = now()
# reset_index() is called without drop=True on purpose: the sampled rows'
# original row labels are kept as an 'index' column, which later cells carry
# alongside 'label' into the embedding frames.
df = (
    pd.read_csv(TRAIN)
    .sample(n=N, random_state=2024)
    .reset_index()
)
print('load and sample {} rows in {}.'.format(N, now() - time_start))
df.head()

load and sample 2000 rows in 0:00:06.282382.


Unnamed: 0,index,label,1x1,1x2,1x3,1x4,1x5,1x6,1x7,1x8,...,28x19,28x20,28x21,28x22,28x23,28x24,28x25,28x26,28x27,28x28
0,50820,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,673,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,43267,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,8330,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,59464,9,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [2]:
from plotly.express import scatter
from sklearn.manifold import TSNE
from arrow import now

# scikit-learn renamed TSNE's `n_iter` argument to `max_iter` (deprecated in
# 1.5, removed afterwards). Ask a default estimator which name it exposes so
# the cell runs on both older and newer releases.
tsne_iter_param = 'max_iter' if 'max_iter' in TSNE().get_params() else 'n_iter'

time_start = now()
tsne = TSNE(verbose=1, random_state=2024, n_jobs=1, **{tsne_iter_param: 2000})
# Embed only the pixel columns; 'index' and 'label' are metadata, not features.
tsne_df = pd.DataFrame(data=tsne.fit_transform(X=df.drop(columns=['index', 'label'])), columns=['tx', 'ty'])
# df carries a fresh 0..N-1 index after reset_index(), so this aligns row-for-row.
tsne_df[['index', 'label']] = df[['index', 'label']].copy()
print('done with TSNE in {}'.format(now() - time_start))
scatter(data_frame=tsne_df, x='tx', y='ty', color='label', hover_data='index', height=900)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 2000 samples in 0.004s...
[t-SNE] Computed neighbors for 2000 samples in 0.232s...
[t-SNE] Computed conditional probabilities for sample 1000 / 2000
[t-SNE] Computed conditional probabilities for sample 2000 / 2000
[t-SNE] Mean sigma: 629.255986
[t-SNE] KL divergence after 250 iterations with early exaggeration: 75.954445
[t-SNE] KL divergence after 2000 iterations: 1.244897
done with TSNE in 0:00:20.555339


In [3]:
from umap import UMAP
from arrow import now
from plotly.express import scatter

time_start = now()
umap = UMAP(random_state=2024, verbose=True, n_jobs=1,)
# Embed only the pixel columns; 'index' and 'label' are metadata, not features.
umap_df = pd.DataFrame(data=umap.fit_transform(X=df.drop(columns=['index', 'label'])), columns=['ux', 'uy'])
# df carries a fresh 0..N-1 index after reset_index(), so this aligns row-for-row.
umap_df[['index', 'label']] = df[['index', 'label']].copy()

print('done with UMAP in {}'.format(now() - time_start))
# Colour by umap_df's own 'label' column (rather than a Series borrowed from
# df) and expose the original row index on hover, mirroring the t-SNE figure.
scatter(data_frame=umap_df, x='ux', y='uy', color='label', hover_data='index', height=900)

2024-02-14 18:06:39.509776: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-14 18:06:39.509935: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-14 18:06:39.697182: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


UMAP(n_jobs=1, random_state=2024, verbose=True)
Wed Feb 14 18:06:57 2024 Construct fuzzy simplicial set
Wed Feb 14 18:07:02 2024 Finding Nearest Neighbors
Wed Feb 14 18:07:06 2024 Finished Nearest Neighbor Search
Wed Feb 14 18:07:10 2024 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Wed Feb 14 18:07:15 2024 Finished embedding
done with UMAP in 0:00:18.530949


Adapted from https://www.kaggle.com/code/parulpandey/part3-visualising-kannada-mnist-with-umap#Using-Bokeh-to-visualize-UMAP

In [4]:
import numpy as np
import base64
from io import BytesIO
from PIL import Image

from bokeh.models import CategoricalColorMapper
from bokeh.models import ColumnDataSource
from bokeh.models import HoverTool
from bokeh.palettes import TolRainbow10
from bokeh.plotting import figure
from bokeh.plotting import output_notebook
from bokeh.plotting import show


def embeddable_image(data):
    """Encode one grayscale digit image as an inline PNG data URI.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array of pixel intensities in the 0-255 range (one MNIST digit).

    Returns
    -------
    str
        A ``data:image/png;base64,...`` string usable as an ``<img src>``.
    """
    # Invert so the strokes render dark on a light background. The pixels are
    # already 8-bit (0-255), so no scaling is applied: multiplying them in
    # uint8 arithmetic would wrap around modulo 256 and corrupt the thumbnail.
    inverted = 255 - data.astype(np.uint8)
    image = Image.fromarray(inverted, mode='L').resize((28, 28), Image.BICUBIC)
    buffer = BytesIO()
    image.save(buffer, format='png')
    return 'data:image/png;base64,' + base64.b64encode(buffer.getvalue()).decode()

output_notebook()

# Generating the plot itself with a custom hover tooltip

digits_df = umap_df.copy()
digits_df['digit'] = digits_df['label'].astype(str)
# One 28x28 thumbnail per row; -1 lets the row count follow df instead of
# relying on it matching the global N.
digits_df['image'] = list(map(embeddable_image, df.drop(columns=['index', 'label']).values.reshape(-1, 28, 28)))

datasource = ColumnDataSource(digits_df)
# Factors must be the distinct categories, one per palette colour. Listing
# every row's label (with duplicates) would assign colours by the position of
# each digit's first occurrence, and any digit first seen past the tenth row
# would fall outside the 10-colour palette.
color_mapping = CategoricalColorMapper(factors=[str(digit) for digit in range(10)],
                                       palette=TolRainbow10)

plot_figure = figure(title='UMAP projection:{} samples from MNIST dataset'.format(N), width=800, height=800,
                     tools=('pan, wheel_zoom, reset'))

plot_figure.add_tools(HoverTool(tooltips="""
<div>
    <div>
        <img src='@image' style='float: left; margin: 5px 5px 5px 5px'/>
    </div>
    <div>
        <span style='font-size: 16px; color: #224499'>Digit:</span>
        <span style='font-size: 18px'>@digit</span>
    </div>
</div>
"""))

# scatter() draws circle markers by default; calling circle() with a screen-unit
# `size` is deprecated in recent Bokeh releases.
plot_figure.scatter('ux', 'uy', source=datasource, color=dict(field='digit', transform=color_mapping), line_alpha=0.6,
                    fill_alpha=0.6, size=8,)
show(plot_figure)