Let's load the training data; we can't really plot 60k datapoints so let's take a sample.

In [1]:
import pandas as pd
from arrow import now

# Sampling configuration: how many rows to keep and where the training CSV lives.
N = 5000
TRAIN = '/kaggle/input/mnist-in-csv/mnist_train.csv'

# Time the read and the sampling together as one step.
time_start = now()
# Read the training CSV and keep a reproducible (fixed-seed) random subset of N rows.
df = pd.read_csv(filepath_or_buffer=TRAIN).sample(n=N, random_state=2024)
print('load and sample {} rows in {}.'.format(N, now() - time_start))
# Last expression of the cell: show the first rows of the sample.
df.head()

load and sample 5000 rows in 0:00:04.706642.


Unnamed: 0,label,1x1,1x2,1x3,1x4,1x5,1x6,1x7,1x8,1x9,...,28x19,28x20,28x21,28x22,28x23,28x24,28x25,28x26,28x27,28x28
50820,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
673,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
43267,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8330,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
59464,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Now let's build the data we're actually going to visualize. We're going to use dimension reduction (t-SNE or UMAP) to place each digit's raw pixel data at a point in two-dimensional space, then attach the digit's label to that point, and finally add a PNG rendering of the original digit so it can be shown in the plot's tooltip.

In [2]:
import numpy as np
import base64

from arrow import now
from io import BytesIO
from PIL import Image
from sklearn.manifold import TSNE
from umap import UMAP


def embeddable_image(data):
    """Encode one grayscale digit as a PNG data URI usable in an HTML ``<img src=...>``.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array of pixel intensities, expected on a 0-255 scale
        (the caller passes 28x28 slices of the pixel columns).

    Returns
    -------
    str
        ``'data:image/png;base64,...'`` containing a 28x28 PNG in which the
        intensities are inverted (high input values render dark).
    """
    # Invert directly. The earlier `15 * data` scaling only suits a 0-16 intensity
    # scale; on 0-255 input the uint8 product wraps modulo 256 and scrambles gray levels.
    inverted = 255 - data.astype(np.uint8)
    image = Image.fromarray(inverted, mode='L').resize((28, 28), Image.BICUBIC)
    # Serialize to PNG in memory, then base64-encode for embedding in HTML.
    buffer = BytesIO()
    image.save(buffer, format='png')
    return 'data:image/png;base64,' + base64.b64encode(buffer.getvalue()).decode()

# Which dimension-reduction algorithm to run; change the index to switch.
REDUCER = ['TSNE', 'UMAP'][0]

# Pixel columns only: the label is dropped so it is not fed to the reducer.
pixel_df = df.drop(columns=['label'])

time_start = now()
if REDUCER == 'TSNE':
    # NOTE(review): newer scikit-learn releases appear to rename `n_iter` to
    # `max_iter` - confirm against the installed version before upgrading.
    tsne = TSNE(verbose=1, random_state=2024, n_jobs=1, n_iter=2000)
    plot_df = pd.DataFrame(data=tsne.fit_transform(X=pixel_df), columns=['tx', 'ty'])
    print('done with TSNE in {}'.format(now() - time_start))
elif REDUCER == 'UMAP':
    umap = UMAP(random_state=2024, verbose=True, n_jobs=1,)
    plot_df = pd.DataFrame(data=umap.fit_transform(X=pixel_df), columns=['ux', 'uy'])
    print('done with UMAP in {}'.format(now() - time_start))
else:
    # NotImplemented is a non-callable constant; the exception type is NotImplementedError.
    raise NotImplementedError(REDUCER)

# now let's add on the remaining fields
# plot_df has a fresh 0..n-1 index while df keeps its sampled index, so the
# label is copied positionally (as a plain list) rather than aligned by index.
plot_df['label'] = df['label'].values.tolist()
# String version of the label, used as the categorical colour key and tooltip text.
plot_df['digit'] = plot_df['label'].astype(str)
# One 28x28 image per row; -1 lets the row count follow the frame instead of relying on N.
plot_df['image'] = list(map(embeddable_image, pixel_df.values.reshape(-1, 28, 28)))

2024-02-14 19:10:50.887448: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-14 19:10:50.887546: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-14 19:10:51.026189: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 5000 samples in 0.011s...
[t-SNE] Computed neighbors for 5000 samples in 0.793s...
[t-SNE] Computed conditional probabilities for sample 1000 / 5000
[t-SNE] Computed conditional probabilities for sample 2000 / 5000
[t-SNE] Computed conditional probabilities for sample 3000 / 5000
[t-SNE] Computed conditional probabilities for sample 4000 / 5000
[t-SNE] Computed conditional probabilities for sample 5000 / 5000
[t-SNE] Mean sigma: 565.828156
[t-SNE] KL divergence after 250 iterations with early exaggeration: 83.254356
[t-SNE] KL divergence after 2000 iterations: 1.531598
done with TSNE in 0:00:46.615164


Adapted from https://www.kaggle.com/code/parulpandey/part3-visualising-kannada-mnist-with-umap#Using-Bokeh-to-visualize-UMAP

In [3]:
from bokeh.models import CategoricalColorMapper
from bokeh.models import ColumnDataSource
from bokeh.models import HoverTool
from bokeh.models import LinearColorMapper
from bokeh.palettes import TolRainbow16
from bokeh.plotting import figure
from bokeh.plotting import output_notebook
from bokeh.plotting import show

output_notebook()

# now let's build the plot and tooltip
datasource = ColumnDataSource(plot_df)

# Factors must be the distinct categories, not one entry per row: the mapper
# pairs factors with palette entries by position, so passing every row made the
# colours depend on sample order and sent late-appearing digits to the NaN colour.
digit_factors = sorted(set(plot_df['digit']))
# `start`/`end` on the mapper slice multi-level factors, not the palette, so
# the ten-colour window is taken from the palette itself.
color_mapping = CategoricalColorMapper(factors=digit_factors, palette=TolRainbow16[4:14])

# Title names the reducer that was actually run instead of a hard-coded 'UMAP'.
plot_figure = figure(title='{} projection:{} samples from MNIST dataset'.format(REDUCER, N), width=800, height=800, 
                     tools=('pan, wheel_zoom, reset'))

# Hover tooltip: shows the embedded PNG (@image) next to the digit label (@digit).
plot_figure.add_tools(HoverTool(tooltips="""
<div>
    <div>
        <img src='@image' style='float: left; margin: 5px 5px 5px 5px'/>
    </div>
    <div>
        <span style='font-size: 16px; color: #224499'>Digit:</span>
        <span style='font-size: 18px'>@digit</span>
    </div>
</div>
"""))

# The first two columns of plot_df are the embedding coordinates (tx/ty or ux/uy).
# NOTE(review): recent Bokeh releases appear to prefer `scatter(...)` over
# `circle(..., size=...)` - confirm against the installed version.
plot_figure.circle(plot_df.columns[0], plot_df.columns[1], source=datasource, 
                   color=dict(field='digit', transform=color_mapping), line_alpha=0.6,
                   fill_alpha=0.6, size=8,)
show(plot_figure)