In [1]:
# TRIMAP


import umap
import sklearn.datasets
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt

###   t-SNE, UMAP and LargeVis

In this and the next notebook we will use manifold learning for data visualization of large data sets (with high dimensionality). In addition to t-SNE, two relatively new methods will be used that are more efficient on large data sets.


- UMAP (Uniform Manifold Approximation and Projection) - Install this Python package: https://umap-learn.readthedocs.io/en/latest/index.html. The UMAP package is compatible with scikit-learn: it uses the same API and can be added to sklearn pipelines. UMAP can work as a drop-in replacement for t-SNE and other dimension reduction classes from scikit-learn.


- LargeVis (Visualizing Large-scale and High-dimensional Data) - Many techniques (like t-SNE, UMAP and LargeVis) first compute a similarity structure of the data points and then project them into a low-dimensional space with the structure preserved. These two steps suffer from considerable computational costs. Compared to t-SNE, LargeVis significantly reduces the computational cost of the graph construction step and employs a principled probabilistic model for the visualization step. The objective of this model can be effectively optimized through asynchronous stochastic gradient descent with linear time complexity. Download this algorithm repository and follow the installation instructions: https://github.com/lferry007/LargeVis


In [2]:
from sklearn.manifold import TSNE
import umap

To get the data we use the sklearn.datasets.fetch_openml function, which, as the name suggests, fetches a dataset from OpenML by name or dataset id. We will use MNIST and Fashion-MNIST (Zalando's article images). Fashion-MNIST is intended to serve as a direct drop-in replacement for the original MNIST dataset for benchmarking machine learning algorithms. Instead of digits it contains thumbnail images of clothes.

In [None]:
# Fetch both datasets from OpenML. These calls must run: every later cell
# reads `mnist` / `fmnist`, so leaving them commented out breaks a fresh
# "Restart & Run All" with a NameError.
# as_frame=True is requested explicitly because the cells below call
# `.values` on `data`, i.e. they expect a pandas DataFrame.
mnist = sklearn.datasets.fetch_openml('mnist_784', as_frame=True)
fmnist = sklearn.datasets.fetch_openml('Fashion-MNIST', as_frame=True)

Below are drawings of some samples from mnist and fmnist data sets

In [None]:
# For MNIST the class index and the displayed label coincide: 0..9.
mnist_names = list(range(10))

# Preview the first 40 samples on a 5x8 grid; each 784-pixel row is reshaped
# into a 28x28 image and captioned with its class label.
plt.figure(figsize=(14, 10))
for idx in range(40):
    image = mnist['data'].values[idx].reshape((28, 28))
    label = mnist_names[int(mnist.target[idx])]

    plt.subplot(5, 8, idx + 1)
    plt.imshow(image, cmap=plt.cm.binary)
    plt.xlabel(label)
    # Hide ticks and grid so only the image and its caption remain.
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
plt.show()

In [None]:
# Human-readable names for the ten Fashion-MNIST classes, indexed by class id.
fmnist_names = [
    'T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
    'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot',
]

# Preview the first 40 samples on a 5x8 grid; each 784-pixel row is reshaped
# into a 28x28 image and captioned with its class name.
plt.figure(figsize=(14, 10))
for idx in range(40):
    image = fmnist['data'].values[idx].reshape((28, 28))
    label = fmnist_names[int(fmnist.target[idx])]

    plt.subplot(5, 8, idx + 1)
    plt.imshow(image, cmap=plt.cm.binary)
    plt.xlabel(label)
    # Hide ticks and grid so only the image and its caption remain.
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
plt.show()

#### Use t-SNE, UMAP and LargeVis to project the mnist and fmnist data sets into a 2-dimensional space. For LargeVis, you need to create a function that saves the data in the txt file format required by LargeVis, and a function that loads the resulting file. Draw charts for all visualizations.

In [None]:
# Number of leading samples to keep from each dataset. None keeps everything
# (a `[:None]` slice spans the whole range); set it to e.g. 500 for a quick
# trial run, since the embedding cells below are fitted on these arrays.
N_SAMPLES = None

# Pixel data as NumPy arrays; targets cast to int32 so they can be passed as
# numeric colour values to the scatter plots below.
X_mnist = mnist['data'].values[:N_SAMPLES]
y_mnist = mnist.target.astype(np.int32)[:N_SAMPLES]

X_fmnist = fmnist['data'].values[:N_SAMPLES]
y_fmnist = fmnist.target.astype(np.int32)[:N_SAMPLES]



In [None]:

# Project MNIST into 2-D with t-SNE. random_state is pinned so that re-running
# the notebook reproduces the same layout.
tsne = TSNE(n_components=2, random_state=42)
mnist_tsne_points = tsne.fit_transform(X_mnist)

# The colormap is passed by name: plt.cm.get_cmap was deprecated in
# Matplotlib 3.7 and removed in 3.9, while a name string works everywhere.
plt.scatter(mnist_tsne_points[:, 0], mnist_tsne_points[:, 1], c=y_mnist, cmap='Paired')
plt.title('MNIST - t-SNE')
plt.show()

In [None]:
# Project MNIST into 2-D with UMAP (umap is imported at the top of the
# notebook). random_state is pinned so the layout is reproducible across
# runs; note that umap-learn gives up parallelism when a seed is set.
umap_ = umap.UMAP(n_components=2, random_state=42)
mnist_umap_points = umap_.fit_transform(X_mnist)

# The colormap is passed by name: plt.cm.get_cmap was deprecated in
# Matplotlib 3.7 and removed in 3.9, while a name string works everywhere.
plt.scatter(mnist_umap_points[:, 0], mnist_umap_points[:, 1], c=y_mnist, cmap='Paired')
plt.title('MNIST - UMAP')
plt.show()

Fashion-MNIST (fmnist)

In [None]:
# Project Fashion-MNIST into 2-D with t-SNE. random_state is pinned so that
# re-running the notebook reproduces the same layout.
tsne = TSNE(n_components=2, random_state=42)
fmnist_tsne_points = tsne.fit_transform(X_fmnist)

# The colormap is passed by name: plt.cm.get_cmap was deprecated in
# Matplotlib 3.7 and removed in 3.9, while a name string works everywhere.
plt.scatter(fmnist_tsne_points[:, 0], fmnist_tsne_points[:, 1], c=y_fmnist, cmap='Paired')
plt.title('Fashion-MNIST - t-SNE')
plt.show()

#### In order to compare the results of these three methods, calculate for each case the ratio of the average distance between two points belonging to the same class to the average distance between two points belonging to different classes.