In [None]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn import tree, metrics, model_selection, datasets, decomposition
from tqdm import tqdm

In [None]:
# Load the scikit-learn digits dataset and separate features from labels.
mnist = datasets.load_digits()
X_mnist, y_mnist = mnist.data, mnist.target

# Hold out 20% of the samples for testing; the fixed seed keeps the split
# identical across kernel restarts.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X_mnist,
    y_mnist,
    test_size=0.2,
    random_state=42,
)

# PCA - Principal Component Analysis

Find the smallest number of PCA components such that at least 95% of the variance is explained. (Hint: use `numpy.cumsum` and `numpy.where`.)

In [None]:
# Fit PCA once with every component kept, then read the answer off the
# cumulative explained-variance curve. Refitting a new PCA for each candidate
# dimension repeats the same decomposition dozens of times.
VARIANCE_TARGET = 0.95

pca_full = decomposition.PCA().fit(X_train)
cumulative_variance = np.cumsum(pca_full.explained_variance_ratio_)

# np.where returns the 0-based indices that reach the target; the first one,
# plus one, is the smallest number of components needed.
n_components_95 = int(np.where(cumulative_variance >= VARIANCE_TARGET)[0][0]) + 1

print(f"Number of components: {n_components_95}, explained variance: {cumulative_variance[n_components_95 - 1]}")

What proportion of the variance is explained when only the first 2 components are kept?

In [None]:
# Project the training samples onto their first two principal components.
pca = decomposition.PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train)

# Total share of the variance captured by those two components.
explained_2d = sum(pca.explained_variance_ratio_)
print(f"Explained variance: {explained_2d}")

Plot the digits after a PCA in 2D and compare with the previous approach. (Use the digit itself as the marker shape.)

In [None]:
# One colour per digit class (0-9).
colors = sns.color_palette("hls", 10)

# Draw one scatter per digit class instead of one per sample: same picture
# (each point is rendered as its own digit glyph, coloured by class), but ten
# artists instead of one per training sample, which is much faster to draw.
fig_pca, ax_pca = plt.subplots()
for digit, digit_color in enumerate(colors):
    class_points = X_train_pca[y_train == digit]
    ax_pca.scatter(
        class_points[:, 0],
        class_points[:, 1],
        marker=f"${digit}$",
        color=digit_color,
    )

ax_pca.set(
    title="Digits projected on the first two principal components",
    xlabel="Component 1",
    ylabel="Component 2",
)
plt.show()

Compare PCA and t-SNE for visualising the digits dataset in 2D.

In [None]:
from sklearn.manifold import TSNE

# t-SNE is stochastic: pin the seed so the embedding (and therefore the plot)
# is the same after a kernel restart. 42 matches the seed used for the split.
tsne = TSNE(n_components=2, random_state=42)
X_train_tsne = tsne.fit_transform(X_train)

# One scatter per digit class (rather than one per sample): each point is
# still drawn as its digit glyph in the class colour, with far fewer artists.
fig_tsne, ax_tsne = plt.subplots()
for digit, digit_color in enumerate(colors):
    class_points = X_train_tsne[y_train == digit]
    ax_tsne.scatter(
        class_points[:, 0],
        class_points[:, 1],
        marker=f"${digit}$",
        color=digit_color,
    )

ax_tsne.set(
    title="Digits embedded in 2D with t-SNE",
    xlabel="t-SNE dimension 1",
    ylabel="t-SNE dimension 2",
)
plt.show()

# Now we have fun with 3D

In [None]:
# Three-dimensional t-SNE embedding of the training samples for the 3D plot.
# The seed is fixed because t-SNE is stochastic; without it every run of this
# cell would produce a different embedding.
tsne = TSNE(n_components=3, random_state=42)
X_train_tsne = tsne.fit_transform(X_train)

In [None]:
# Plotly is used for the interactive 3D scatter plot; `go` exposes the
# Scatter3d / Layout / Figure classes used in the next cell.
import plotly
import plotly.graph_objs as go

# Configure Plotly to be rendered inline in the notebook.
# Must run before the `plotly.offline.iplot(...)` call below.
plotly.offline.init_notebook_mode()

In [None]:
# Marker appearance: points are coloured by their digit label.
marker_style = dict(
    size=10,
    opacity=0.8,
    color=y_train,
    colorscale='Viridis',
)

# One 3D scatter trace built from the three t-SNE coordinates.
trace = go.Scatter3d(
    x=X_train_tsne[:, 0],
    y=X_train_tsne[:, 1],
    z=X_train_tsne[:, 2],
    mode='markers',
    marker=marker_style,
)

# Drop all margins so the plot fills the output area.
layout = go.Layout(margin=dict(l=0, r=0, b=0, t=0))

# Assemble the figure from its trace list and layout.
data = [trace]
fig = go.Figure(data=data, layout=layout)

# Render inline in the notebook.
plotly.offline.iplot(fig)

# Also write a standalone HTML copy next to the notebook.
plotly.offline.plot(fig, filename='3d-scatter.html')

# Create a video of the t-SNE optimisation

In [None]:
import inspect

import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from matplotlib.animation import FuncAnimation
from sklearn import datasets

# Load the scikit-learn digits dataset (the full set; no split is needed here).
mnist = datasets.load_digits()
X_mnist = mnist.data
y_mnist = mnist.target

# Every frame refits t-SNE from scratch with one more iteration than the
# previous frame. Fixing the seed makes every refit start from the same state,
# so consecutive frames should look like consecutive stages of one optimisation
# rather than unrelated random embeddings.
# NOTE(review): exact frame-to-frame continuity is not guaranteed - confirm visually.
TSNE_SEED = 42

# update() uses `frame + 1` as the iteration budget. Frame 250 therefore gives
# 251 iterations, the same budget as the initial model below.
# NOTE(review): scikit-learn documents a minimum of 250 iterations - confirm
# for the installed version before lowering this.
FIRST_FRAME = 250

# Newer scikit-learn releases renamed TSNE's `n_iter` argument to `max_iter`.
# Use whichever name the installed version accepts so the cell runs on both.
TSNE_ITER_KWARG = "max_iter" if "max_iter" in inspect.signature(TSNE).parameters else "n_iter"


def make_tsne(n_iterations):
    """Return a seeded 2D t-SNE model limited to `n_iterations` optimisation steps."""
    return TSNE(
        n_components=2,
        perplexity=30,
        random_state=TSNE_SEED,
        **{TSNE_ITER_KWARG: n_iterations},
    )


# Initialize t-SNE model (replaced by a fresh model on every animated frame).
tsne = make_tsne(FIRST_FRAME + 1)

# Create a figure and axis for your plot
fig, ax = plt.subplots()

# Placeholder scatter: every sample sits at the origin until the first frame.
origin = np.zeros((X_mnist.shape[0], 2))
sc = ax.scatter(origin[:, 0], origin[:, 1], c=y_mnist, cmap='tab10')

# Add a colorbar to indicate class labels
cbar = plt.colorbar(sc)
cbar.set_ticks(np.arange(10))
cbar.set_label('Class Label')

# Embeddings computed so far, one array per rendered frame (reset on re-run).
X_embedded_history = []


def init():
    """Reset the scatter to its placeholder state before the animation starts."""
    sc.set_offsets(origin)
    return sc,


def update(frame):
    """Refit t-SNE with `frame + 1` iterations and draw the resulting embedding.

    Frames below FIRST_FRAME are left untouched because their iteration budget
    would be below the one used for the initial model.
    Returns the tuple of artists that changed, as FuncAnimation expects.
    """
    global tsne
    if frame < FIRST_FRAME:
        return sc,

    # This is a full refit, not a single optimisation step, so each frame is
    # expensive; expect the whole animation to take a long time to render.
    tsne = make_tsne(frame + 1)
    X_embedded = tsne.fit_transform(X_mnist)
    X_embedded_history.append(X_embedded)

    # Update the scatter plot with the new t-SNE embeddings
    sc.set_offsets(X_embedded)

    # set_offsets does not rescale the axes. Without this the view stays
    # zoomed on the all-zero placeholder and the embedding is off-screen.
    lower = X_embedded.min(axis=0)
    upper = X_embedded.max(axis=0)
    margin = 0.05 * np.maximum(upper - lower, 1.0)
    ax.set_xlim(lower[0] - margin[0], upper[0] + margin[0])
    ax.set_ylim(lower[1] - margin[1], upper[1] + margin[1])
    ax.set_title(f"t-SNE after {frame + 1} iterations")

    return sc,


# Animate only the frames that actually produce an embedding
# (FIRST_FRAME .. num_frames - 1) instead of starting with hundreds of
# identical placeholder frames.
num_frames = 500
# blit=False because the axes limits and title change on every frame, and
# blitting would only redraw the scatter artist.
animation = FuncAnimation(fig, update, frames=range(FIRST_FRAME, num_frames), init_func=init, blit=False)

animation.save('animation.mp4', fps=30, extra_args=['-vcodec', 'libx264'])