# Clustering with a saved model

## for Google Colab

In [None]:
# Colab only: mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Extract the dataset archive from Drive into the current working directory (-q: quiet).
!unzip -q /content/drive/MyDrive/data/mn10_64.zip

In [None]:
# Extra packages for this notebook. NOTE(review): tensorflow-determinism is
# presumably for deterministic TF ops and kaleido for plotly static image
# export; neither is imported in the cells visible here — confirm they are needed.
!pip install tensorflow-determinism kaleido

## Import modules and set parameters

In [None]:
import os
import datetime
import random
from glob import glob
import re
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE


In [None]:
def set_seed(seed=200):
    """Seed the random number generators used by this notebook.

    Args:
        seed: Integer seed applied to TensorFlow, NumPy and the built-in
            ``random`` module; its string form is also exported as
            ``PYTHONHASHSEED``.
    """
    # Environment flags. NOTE: PYTHONHASHSEED is read at interpreter start-up,
    # so exporting it here only affects processes launched afterwards.
    os.environ.update({
        "PYTHONHASHSEED": str(seed),
        'TF_DETERMINISTIC_OPS': '1',
    })

    # Built-in, NumPy and TensorFlow generators all share the same seed.
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    
# Seed all generators once, overriding the function's default seed of 200.
set_seed(123)

In [None]:
# Location of the saved encoder and of the directory that receives figures.
MODEL_PATH = 'autoencoder_64/saved_models/encoder'
OUTPUT_DIR = 'autoencoder_64/outputs'
# Number of ground-truth categories (rows) and k-means clusters (columns).
NUM_CLASSES = 10
NUM_CLUSTERS = 10
# Number of principal components kept before standardizing and clustering.
NUM_PCA_COMPONENTS = 10
# NOTE(review): BUFFER_SIZE, EPOCHS, CLUSTERING_INTERVAL, IMAGE_SIZE and
# NUM_CHANNELS are not referenced in the cells visible here; presumably they
# are carried over from the training notebook — confirm before removing.
BUFFER_SIZE = 1000
# Batch size used when feeding the dataset to the encoder.
BATCH_SIZE = 8
EPOCHS = 100
CLUSTERING_INTERVAL = 5
# Root of the extracted dataset; samples are read from DATA_DIR/train/*.npy.
DATA_DIR = '/content/mn10/64'
IMAGE_SIZE = 64
NUM_CHANNELS = 1

In [None]:
# Create the output directory up front; exist_ok=True makes re-runs harmless.
os.makedirs(OUTPUT_DIR, exist_ok=True)

## Prepare Data

In [None]:
# Category names; a name's position in this list is used as its integer class id.
categories = ['bathtub', 'bed', 'chair', 'desk', 'dresser',
              'monitor', 'night_stand', 'sofa', 'table', 'toilet']

In [None]:
# Glob matching every training sample. shuffle=False keeps the listing order
# deterministic, so names, labels and extracted features stay aligned.
data_pattern = f'{DATA_DIR}/train/*.npy'

data_list_ds = tf.data.Dataset.list_files(
    data_pattern,
    shuffle=False,
)

In [None]:
# For each listed file derive: the sample name (file stem such as
# "<category>_<digits>"), its category label, and the label's class id.
name_re = re.compile(r'.+/(.+?_[0-9]+)\.npy')
label_re = re.compile(r'(.+?)_[0-9]+')

data_paths = (item.numpy().decode() for item in data_list_ds)
data_names = [name_re.match(path).group(1) for path in data_paths]
data_labels = [label_re.match(name).group(1) for name in data_names]
# Class id = position of the label in `categories`.
data_ids = list(map(categories.index, data_labels))

In [None]:
def read_npy_file(path):
    """Load one ``.npy`` sample and return it as a float32 tensor.

    Args:
        path: Tensor holding the file path; ``path.numpy()`` is handed to
            ``np.load`` (presumably an eager string tensor supplied by
            ``tf.py_function`` — confirm against the caller).

    Returns:
        A ``tf.float32`` tensor of the loaded array with one extra trailing
        axis of length 1.
    """
    volume = np.load(path.numpy())
    # Append a trailing singleton axis (equivalent to expand_dims(axis=-1)).
    with_channel = volume[..., np.newaxis]
    return tf.convert_to_tensor(with_channel, dtype=tf.float32)

In [None]:
# Load each file through py_function (read_npy_file needs eager access to the
# path), cache the decoded tensors on disk, then batch them.
loaded_ds = data_list_ds.map(
    lambda item: tf.py_function(read_npy_file, [item], tf.float32))
# NOTE: a file-backed cache persists across runs; remove ./cache.tf-data*
# whenever the input files change, otherwise stale samples are served.
cached_ds = loaded_ds.cache(filename='./cache.tf-data')
data_3d_ds = cached_ds.batch(BATCH_SIZE)

## Load a saved model

In [None]:
# Load the saved encoder from MODEL_PATH; its predictions are the feature
# vectors that get clustered below.
feature_extractor = tf.keras.models.load_model(MODEL_PATH)

## Clustering with PCA, Standardizing and k-means++

In [None]:
# Estimators for the clustering pipeline: PCA -> standardization -> k-means++.
pca = PCA(n_components=NUM_PCA_COMPONENTS)
stdsc = StandardScaler()
# random_state is fixed so the cluster assignment is reproducible.
kmc = KMeans(
    n_clusters=NUM_CLUSTERS,
    init='k-means++',
    n_init=10,
    max_iter=300,
    tol=1e-4,
    verbose=0,
    random_state=123,
    copy_x=True,
)

In [None]:
# Encoder outputs for every sample, in the (unshuffled) file-listing order.
features = feature_extractor.predict(data_3d_ds)
# Reduce to NUM_PCA_COMPONENTS principal components ...
features_pca = pca.fit_transform(features)
# ... standardize each component ...
features_std = stdsc.fit_transform(features_pca)
# ... and assign every sample to one of the NUM_CLUSTERS k-means clusters.
km_predictions = kmc.fit_predict(features_std)

## Matrix with data labels and cluster ids

In [None]:
# Contingency table: rows are class ids, columns are k-means cluster ids.
cluster_matrix = np.zeros((NUM_CLASSES, NUM_CLUSTERS), dtype=np.int32)

# Unbuffered in-place add: every (class id, cluster id) pair contributes one
# count, including pairs that occur many times.
np.add.at(cluster_matrix, (data_ids, km_predictions), 1)

In [None]:
def plot_matrix(cm, x_labels, y_labels):
    """Draw a count matrix as an annotated heat map and show it.

    Args:
        cm: 2-D array of integer counts; entry ``[i, j]`` is drawn in row
            ``i``, column ``j``.
        x_labels: Tick labels for the columns.
        y_labels: Tick labels for the rows.

    Returns:
        The matplotlib figure, after it has been shown with ``plt.show()``.
    """
    n_rows, n_cols = cm.shape[0], cm.shape[1]

    fig, ax = plt.subplots(figsize=(7, 7))
    image = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    fig.colorbar(image, ax=ax)

    # Ticks first, then their labels, then titles; view limits go last.
    ax.set_xticks(np.arange(n_cols))
    ax.set_yticks(np.arange(n_rows))
    ax.set_xticklabels(x_labels)
    ax.set_yticklabels(y_labels)
    ax.set_title('Cluster Matrix')
    ax.set_ylabel('Class label')
    ax.set_xlabel('Cluster id')
    ax.set_xlim((-0.5, n_cols - 0.5))
    # Descending y-limits keep row 0 at the top of the plot.
    ax.set_ylim((n_rows - 0.5, -0.5))

    plt.setp(
        ax.get_xticklabels(),
        rotation=45,
        ha="right",
        rotation_mode="anchor"
    )

    # Write each count into its cell; switch to white text on dark cells
    # (counts above half of the maximum).
    half_max = cm.max() / 2.
    for row, col in np.ndindex(n_rows, n_cols):
        count = cm[row, col]
        ax.text(col, row, f"{count:d}",
                ha="center", va="center",
                color="white" if count > half_max else "black")

    fig.tight_layout()
    plt.show()
    return fig

In [None]:
# Columns are labelled with cluster ids 0..NUM_CLUSTERS-1, rows with category names.
fig = plot_matrix(cluster_matrix, range(NUM_CLUSTERS), categories)

In [None]:
# Save the class-vs-cluster matrix figure into OUTPUT_DIR.
fig.savefig(os.path.join(OUTPUT_DIR, 'cluster_class_matrix.png'))

## Comparing SSE against the number of clusters to find an "elbow"

In [None]:
# Elbow sweep: record the k-means inertia (within-cluster SSE) for k = 2..20.
# A dedicated name is used for the sweep model so that the 10-cluster `kmc`
# fitted earlier (which produced `km_predictions`) is not overwritten.
# NOTE: random_state=None draws from NumPy's global RNG, so the curve depends
# on the global seed and on how much randomness was consumed before this cell.
inertias = []
for k in range(2, 21):
    sweep_kmc = KMeans(n_clusters=k, init='k-means++', n_init=10, max_iter=300,
                       tol=0.0001, verbose=0, random_state=None, copy_x=True)

    # Only the fitted inertia is needed, so fit() suffices (labels are unused).
    sweep_kmc.fit(features_std)
    inertias.append(sweep_kmc.inertia_)

In [None]:
# Plot SSE against the number of clusters; the x range matches the sweep (k = 2..20).
cluster_counts = range(2, 21)
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(cluster_counts, inertias, '-')
ax.set_xticks(cluster_counts)
ax.set_xlabel('Num of clusters')
ax.set_ylabel('SSE/Distortion')
# Figure.show() only works with GUI backends and warns under the inline
# backend; plt.show() is backend-independent and matches the other plots here.
plt.show()

In [None]:
# Save the elbow chart into OUTPUT_DIR.
fig.savefig(os.path.join(OUTPUT_DIR, 'sse_elbow_chart.png'))

## Plot standardized principal components

In [None]:
def plot_and_save(data, color, file_path, size=(10,10)):
    """Scatter-plot the first two columns of ``data``, save the figure, show it.

    Args:
        data: 2-D array; column 0 is used as x and column 1 as y.
        color: Per-point values that select both the hue and the marker style.
        file_path: Destination path of the saved image.
        size: Figure size in inches as ``(width, height)``.
    """
    # Apply the seaborn theme before creating the figure: rcParams are read
    # when the figure/axes are built, so setting the theme afterwards left the
    # first plot styled differently from all later ones.
    sns.set_theme()
    fig, ax = plt.subplots(figsize=size)
    # Draw on and save the figure created here explicitly instead of relying
    # on pyplot's implicit "current" axes/figure.
    sns.scatterplot(x=data[:, 0], y=data[:, 1], hue=color, style=color,
                    palette='bright', ax=ax)
    fig.savefig(file_path)
    plt.show()

In [None]:
# First two standardized principal components, colored by true category.
# NOTE(review): this file name starts with 'feature_' while the other outputs
# use 'features_' — confirm whether the name should be aligned.
plot_and_save(features_std, data_labels,
              os.path.join(OUTPUT_DIR, 'feature_std_2d_labels.png'))

In [None]:
# Same projection, colored by k-means cluster id.
plot_and_save(features_std, km_predictions,
              os.path.join(OUTPUT_DIR, 'features_std_2d_clusters.png'))

In [None]:
def px_scatter(data, names, color):
    """Show an interactive plotly scatter of the first two columns of ``data``.

    Args:
        data: 2-D array-like; column 0 is plotted on x and column 1 on y.
        names: Per-point names displayed as the hover title.
        color: Per-point group values; they select both color and marker symbol.
    """
    # Marker symbols assigned to the groups, in this order.
    markers = [
        'circle', 'square', 'diamond', 'cross', 'x',
        'triangle-up', 'triangle-down', 'pentagon', 'star', 'circle-cross',
    ]
    scatter_options = dict(
        x=0,
        y=1,
        color=color,
        symbol=color,
        symbol_sequence=markers,
        hover_name=names,
        width=1000,
        height=1000,
    )
    px.scatter(data, **scatter_options).show()

In [None]:
# Interactive view of the first two standardized components, grouped by category.
px_scatter(features_std, data_names, data_labels)

In [None]:
# Cluster ids are cast to strings so plotly treats them as discrete groups.
# `np.str` was a deprecated alias of the builtin and was removed in NumPy 1.24,
# so the builtin `str` is used instead.
px_scatter(features_std, data_names, km_predictions.astype(str))

### 3D plot with standardized principal components

In [None]:
def px_scatter_3d(data, names, color):
    """Show an interactive plotly 3-D scatter of the first three columns of ``data``.

    Args:
        data: 2-D array-like; columns 0, 1 and 2 are plotted on x, y and z.
        names: Per-point names displayed as the hover title.
        color: Per-point group values used for coloring.
    """
    scatter_options = dict(
        x=0,
        y=1,
        z=2,
        color=color,
        hover_name=names,
        width=1000,
        height=1000,
    )
    px.scatter_3d(data, **scatter_options).show()

In [None]:
# First three standardized components in 3-D, colored by category.
px_scatter_3d(features_std, data_names, data_labels)

In [None]:
# Cluster ids are cast to strings so plotly treats them as discrete groups.
# `np.str` was a deprecated alias of the builtin and was removed in NumPy 1.24,
# so the builtin `str` is used instead.
px_scatter_3d(features_std, data_names, km_predictions.astype(str))

## Visualize clusters with t-SNE

### Raw features with t-SNE

In [None]:
# 2-D t-SNE embedding of the raw encoder features (fixed random_state).
tsne = TSNE(n_components=2, random_state=0)
proj_features = tsne.fit_transform(features)

In [None]:
# t-SNE of the raw features, colored by true category.
plot_and_save(proj_features, data_labels,
              os.path.join(OUTPUT_DIR, 'features_tsne_2d_labels.png'))

In [None]:
# t-SNE of the raw features, colored by k-means cluster id.
plot_and_save(proj_features, km_predictions,
              os.path.join(OUTPUT_DIR, 'features_tsne_2d_cluster.png'))

In [None]:
# Interactive t-SNE view of the raw features, grouped by category.
px_scatter(proj_features, data_names, data_labels)

In [None]:
# Cluster ids are cast to strings so plotly treats them as discrete groups.
# `np.str` was a deprecated alias of the builtin and was removed in NumPy 1.24,
# so the builtin `str` is used instead.
px_scatter(proj_features, data_names, km_predictions.astype(str))

### Principal components with t-SNE

In [None]:
# 2-D t-SNE embedding of the (unstandardized) principal components.
tsne = TSNE(n_components=2, random_state=0)
proj_features_pca = tsne.fit_transform(features_pca)

In [None]:
# t-SNE of the principal components, colored by true category.
plot_and_save(proj_features_pca, data_labels,
              os.path.join(OUTPUT_DIR, 'features_pca_tsne_2d_labels.png'))

In [None]:
# t-SNE of the principal components, colored by k-means cluster id.
plot_and_save(proj_features_pca, km_predictions,
              os.path.join(OUTPUT_DIR, 'features_pca_tsne_2d_cluster.png'))

In [None]:
# Interactive t-SNE view of the principal components, grouped by category.
px_scatter(proj_features_pca, data_names, data_labels)

In [None]:
# Cluster ids are cast to strings so plotly treats them as discrete groups.
# `np.str` was a deprecated alias of the builtin and was removed in NumPy 1.24,
# so the builtin `str` is used instead.
px_scatter(proj_features_pca, data_names, km_predictions.astype(str))

### Standardized principal components with t-SNE

In [None]:
# 2-D t-SNE embedding of the standardized principal components
# (the same input that k-means was fitted on).
tsne = TSNE(n_components=2, random_state=0)
proj_features_std = tsne.fit_transform(features_std)

In [None]:
# t-SNE of the standardized components, colored by true category.
plot_and_save(proj_features_std, data_labels,
              os.path.join(OUTPUT_DIR, 'features_std_tsne_2d_labels.png'))

In [None]:
# t-SNE of the standardized components, colored by k-means cluster id.
plot_and_save(proj_features_std, km_predictions,
              os.path.join(OUTPUT_DIR, 'features_std_tsne_2d_cluster.png'))

In [None]:
# Interactive t-SNE view of the standardized components, grouped by category.
px_scatter(proj_features_std, data_names, data_labels)

In [None]:
# Cluster ids are cast to strings so plotly treats them as discrete groups.
# `np.str` was a deprecated alias of the builtin and was removed in NumPy 1.24,
# so the builtin `str` is used instead.
px_scatter(proj_features_std, data_names, km_predictions.astype(str))

### 3D t-SNE plot with standardized principal components

In [None]:
# 3-D t-SNE embedding of the standardized principal components.
tsne = TSNE(n_components=3, random_state=0)
proj_features_std_3d = tsne.fit_transform(features_std)

In [None]:
# Interactive 3-D t-SNE view, colored by category.
px_scatter_3d(proj_features_std_3d, data_names, data_labels)

In [None]:
# Cluster ids are cast to strings so plotly treats them as discrete groups.
# `np.str` was a deprecated alias of the builtin and was removed in NumPy 1.24,
# so the builtin `str` is used instead.
px_scatter_3d(proj_features_std_3d, data_names, km_predictions.astype(str))