# 94-775/95-865: Dimensionality Reduction with Images

Author: George H. Chen (georgechen [at symbol] cmu.edu)

We start with some usual boilerplate code:

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('seaborn')
import numpy as np

## Loading in a handwritten digit dataset

In [2]:
from keras.datasets import mnist  # requires the `keras` package to be installed

# load_data() returns a pair of tuples; we keep only the first array of the first
# tuple, which holds the images used throughout the rest of this notebook
train_split = mnist.load_data()[0]
images = train_split[0]

ModuleNotFoundError: No module named 'keras'

In [None]:
# dimensions of the raw image array: the first axis indexes the images; the remaining
# axes are presumably the 28x28 pixel grid (see the reshape(28, 28) used for display below)
images.shape

In [None]:
num_images = len(images)  # the first axis of `images` indexes the individual images

In [None]:
# turn each image into a single row of pixel values (one row per image)
flattened_images = np.reshape(images, (num_images, -1))

In [None]:
# should now be 2D: (number of images, number of pixel values per image)
flattened_images.shape

In [None]:
# convert to 32-bit floats, then divide by 255 so that values are rescaled to be between 0 and 1
feature_vectors = np.divide(flattened_images.astype(np.float32), 255)

In [None]:
# to keep things fast, the rest of the notebook uses only a random subsample of the data;
# seeding the global random number generator makes the subsample the same on every run
np.random.seed(0)
shuffled_indices = np.random.permutation(num_images)
random_sample = shuffled_indices[:10000]  # first 10000 indices of the shuffle

In [None]:
# the sampled indices (each one is a row index into `feature_vectors`)
random_sample

In [None]:
# each run of this cell picks a new random image, so re-run it to see different digits
random_index = np.random.randint(num_images)
digit_image = feature_vectors[random_index].reshape(28, 28)  # back to a 28x28 pixel grid
plt.imshow(digit_image, cmap='gray')

## Dimensionality reduction with PCA

In [None]:
from sklearn.decomposition import PCA

# fit a 2-component PCA on the random subsample and keep the resulting 2D coordinates
pca = PCA(n_components=2)
feature_vectors_pca2d = pca.fit_transform(feature_vectors[random_sample, :])

In [None]:
# fraction of the variance explained by each of the 2 fitted principal components
pca.explained_variance_ratio_

In [None]:
# the transposed array unpacks into its two coordinate columns (x values, y values)
plt.scatter(*feature_vectors_pca2d.T)

## Dimensionality reduction with t-SNE

In [None]:
import inspect

from sklearn.manifold import TSNE

# scikit-learn 1.5 renamed TSNE's `n_iter` argument to `max_iter` (and the old name was
# later removed), so pass the iteration count under whichever name this version accepts
tsne_iter_arg = 'max_iter' if 'max_iter' in inspect.signature(TSNE).parameters else 'n_iter'

# project the subsample down to 2 dimensions; random_state is fixed for reproducibility
tsne = TSNE(n_components=2, verbose=1, perplexity=40, random_state=0,
            **{tsne_iter_arg: 5000})
feature_vectors_tsne2d = tsne.fit_transform(feature_vectors[random_sample])

In [None]:
# the transposed array unpacks into its two coordinate columns (x values, y values)
plt.scatter(*feature_vectors_tsne2d.T)

In [None]:
# the labels are the second array of the same tuple that the images were taken from
images_and_labels = mnist.load_data()[0]
labels = images_and_labels[1]

In [None]:
# here we color each low-dimensional t-SNE point differently depending on its true known label for what digit the point corresponds to
# (in many applications we do not have this sort of label information!)
# note: the labels are passed as numbers so that matplotlib maps them through the colormap;
# a list of strings would instead be interpreted as color names (and most digits are not valid colors)
plt.scatter(feature_vectors_tsne2d[:, 0], feature_vectors_tsne2d[:, 1],
            c=labels[random_sample], cmap='Spectral')

In [None]:
# coloring the points by true digit label, now for PCA
# note: the labels are passed as numbers so that matplotlib maps them through the colormap;
# a list of strings would instead be interpreted as color names (and most digits are not valid colors)
plt.scatter(feature_vectors_pca2d[:, 0], feature_vectors_pca2d[:, 1],
            c=labels[random_sample], cmap='Spectral')