<a href="https://colab.research.google.com/github/niamh-m/Clustering-Analysis/blob/main/fashiondata.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
import numpy as np

In [5]:
def read_datafile(file_path):
    """Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    file_path : str or file-like
        Anything ``pd.read_csv`` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    frame = pd.read_csv(file_path)
    return frame

In [6]:
# Location of the Fashion-MNIST CSV; edit this to point at your own copy.
DATA_PATH = r'fashion_mnist.csv'
data = read_datafile(DATA_PATH)

In [7]:
# Targets come from the 'label' column; everything from the 'pixel1' column
# onward is taken as the feature matrix.
y = data['label'].values
x = data.loc[:, 'pixel1':].values

In [8]:
# Keep only the first 10,000 rows; the embedding and plotting cells below
# operate on these subsets.
first_rows = slice(0, 10000)
x_subset, y_subset = x[first_rows], y[first_rows]

# Show which label codes are present in the subset.
print(np.unique(y_subset))

[0 1 2 3 4 5 6 7 8 9]


In [9]:
!pip install umap-learn[plot]

import umap.umap_ as umap
reducer = umap.UMAP(random_state=42)
embedding = reducer.fit_transform(x_subset)
embedding.shape

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting umap-learn[plot]
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
[K     |████████████████████████████████| 88 kB 6.0 MB/s 
Collecting pynndescent>=0.5
  Downloading pynndescent-0.5.7.tar.gz (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 28.1 MB/s 
Collecting datashader
  Downloading datashader-0.14.2-py2.py3-none-any.whl (18.2 MB)
[K     |████████████████████████████████| 18.2 MB 529 kB/s 
Collecting datashape>=0.5.1
  Downloading datashape-0.5.2.tar.gz (76 kB)
[K     |████████████████████████████████| 76 kB 4.9 MB/s 
Building wheels for collected packages: umap-learn, pynndescent, datashape
  Building wheel for umap-learn (setup.py) ... [?25l[?25hdone
  Created wheel for umap-learn: filename=umap_learn-0.5.3-py3-none-any.whl size=82829 sha256=9ecfebf66248035372dcc412f22bb0e0b6d6a16ed11c72f02eb808c1ea82a4c8
  Stored in directory: /root/.cache/pip/wheels/b3/52/a5/

(10000, 2)

In [10]:
# Encoding all the images for inclusion in a dataframe.
from io import BytesIO
from PIL import Image
import base64


def embeddable_image(data):
    """Encode one grayscale image array as a base64 PNG data URI.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array of pixel intensities. Assumed to be 8-bit values (0-255),
        as in Fashion-MNIST; values outside that range wrap when cast to uint8.

    Returns
    -------
    str
        A ``data:image/png;base64,...`` string that can be used directly as
        the ``src`` of an HTML ``<img>`` tag.
    """
    # Invert the intensities so the item is drawn dark on a light background.
    # The former `255 - 15 * data` scaling was written for 0-16 intensity data;
    # applied to 0-255 pixels the uint8 product wraps modulo 256 and scrambles
    # the picture, so the pixels are inverted without any extra scaling.
    img_data = 255 - data.astype(np.uint8)
    image = Image.fromarray(img_data, mode='L').resize((28,28), Image.BICUBIC)
    # Serialise to PNG in memory, then base64-encode for embedding in HTML.
    buffer = BytesIO()
    image.save(buffer, format='png')
    for_encoding = buffer.getvalue()
    return 'data:image/png;base64,' + base64.b64encode(for_encoding).decode()

# loading up bokeh and other tools to generate a suitable interactive plot.

from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool, ColumnDataSource, CategoricalColorMapper
from bokeh.palettes import Spectral10

# Configure Bokeh to render its plots inline in the notebook.
output_notebook()

In [11]:
# Turn each flat pixel row back into a 28x28 image. Using -1 lets NumPy infer
# the number of samples instead of hard-coding the subset size.
x_subset_reshape = x_subset.reshape(-1, 28, 28)

# Human-readable names for the ten label codes (keys are the codes as strings).
label_names = {'0':'T-shirt/top','1':'Trouser','2':'Pullover','3':'Dress','4':'Coat','5':'Sandal','6':'Shirt',
               '7':'Sneaker','8':'Bag','9':'Ankle boot'}

# One row per sample: 2-D embedding coordinates, label code, label name and
# an inline PNG thumbnail for the hover tooltip.
digits_df = pd.DataFrame(embedding, columns=('x', 'y'))
digits_df['digit'] = [str(label) for label in y_subset]
digits_df['digit_text'] = digits_df['digit'].replace(label_names)
digits_df['image'] = list(map(embeddable_image, x_subset_reshape))


datasource = ColumnDataSource(digits_df)
# `factors` must list each category exactly once so that every category lines
# up with one palette entry. Building it from every sample label produced
# thousands of duplicates and left categories without a palette colour.
color_mapping = CategoricalColorMapper(factors=[str(9 - code) for code in range(10)],
                                       palette=Spectral10)

# `width`/`height` replace `plot_width`/`plot_height`, which Bokeh 3 removed.
plot_figure = figure(
    title='UMAP projection of the FASHION MNIST dataset',
    width=600,
    height=600,
    tools=('pan, wheel_zoom, reset')
)

# Hover tooltip: thumbnail of the sample plus its clothing category.
plot_figure.add_tools(HoverTool(tooltips="""
<div>
    <div>
        <img src='@image' style='float: left; margin: 5px 5px 5px 5px'/>
    </div>
    <div>
        <span style='font-size: 16px; color: #224499'>Item:</span>
        <span style='font-size: 18px'>@digit_text</span>
    </div>
</div>
"""))

# One marker per sample, coloured by its label code.
plot_figure.circle(
    'x',
    'y',
    source=datasource,
    color=dict(field='digit', transform=color_mapping),
    line_alpha=0.6,
    fill_alpha=0.6,
    size=4
)
show(plot_figure)

