## Dimensionality reduction
Historically, the major state-of-the-art (SOTA) developments seem to follow this timeline:
- Linear & Random projections
    - PCA, SVD, etc.
- [Manifold learning](https://scikit-learn.org/stable/modules/manifold.html)
    - autoencoders, e.g. [CVAE](https://github.com/maxfrenzel/CompressionVAE)
    - t-SNE
    - [UMAP](https://umap-learn.readthedocs.io/en/latest/index.html)
    - [trimap](https://github.com/google-research/google-research/tree/master/trimap)

In [None]:
!pip install numpy scikit-learn matplotlib seaborn umap-learn trimap

In [None]:
%matplotlib inline
import numpy as np
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import trimap
import umap

from sklearn.manifold import TSNE

# Notebook-wide plotting defaults: white background and large figures.
sns.set(
    style="white",
    context="notebook",
    rc={"figure.figsize": (14, 10)},
)

# Read the Palmer penguins table from a pinned commit of the upstream
# repository and discard every row that has a missing value.
penguins = pd.read_csv(
    "https://github.com/allisonhorst/palmerpenguins/raw/5b5891f01b52ae26ad8cb9755ec93672f49328a8/data/penguins_size.csv"
).dropna()

# Preview the first rows as the cell output.
penguins.head()

In [None]:
# Pairwise scatter plots of the numeric columns, coloured by species.
sns.pairplot(data=penguins, hue="species_short")

In [None]:
# UMAP estimator with its default settings; it is fitted in a later cell.
reducer = umap.UMAP()

# The four numeric measurements used as input features.
feature_columns = [
    "culmen_length_mm",
    "culmen_depth_mm",
    "flipper_length_mm",
    "body_mass_g",
]
penguin_data = penguins[feature_columns].values

# Standardise each feature so that columns measured on different scales
# (millimetres vs. grams) contribute comparably.
scaler = StandardScaler()
scaled_penguin_data = scaler.fit_transform(penguin_data)

# Display the shape of the scaled matrix as the cell output.
scaled_penguin_data.shape

In [None]:
%%time
# Fit the UMAP reducer on the standardised features and keep the resulting
# embedding; the %%time cell magic above reports how long the cell takes.
embedding = reducer.fit_transform(scaled_penguin_data)
# Display the embedding's shape as the cell output.
embedding.shape

In [None]:
# Translate each penguin's species into an index into the current seaborn
# palette so that every species gets its own colour.
species_to_index = {"Adelie": 0, "Chinstrap": 1, "Gentoo": 2}
palette = sns.color_palette()
point_colors = [palette[idx] for idx in penguins.species_short.map(species_to_index)]

# Scatter the two embedding columns against each other.
plt.scatter(embedding[:, 0], embedding[:, 1], c=point_colors)
# Equal axis scaling so distances look the same in both directions.
plt.gca().set_aspect("equal", "datalim")
plt.title("UMAP projection of the Penguin dataset", fontsize=24)

In [None]:
%%time
embedding = TSNE(n_components=2, learning_rate='auto', init='random').fit_transform(scaled_penguin_data)
embedding.shape

In [None]:
# Translate each penguin's species into an index into the current seaborn
# palette so that every species gets its own colour.
species_to_index = {"Adelie": 0, "Chinstrap": 1, "Gentoo": 2}
palette = sns.color_palette()
point_colors = [palette[idx] for idx in penguins.species_short.map(species_to_index)]

# Scatter the two embedding columns against each other.
plt.scatter(embedding[:, 0], embedding[:, 1], c=point_colors)
# Equal axis scaling so distances look the same in both directions.
plt.gca().set_aspect("equal", "datalim")
plt.title("T-SNE projection of the Penguin dataset", fontsize=24)

In [None]:
%%time
embedding = trimap.TRIMAP().fit_transform(scaled_penguin_data)
embedding.shape

In [None]:
# Translate each penguin's species into an index into the current seaborn
# palette so that every species gets its own colour.
species_to_index = {"Adelie": 0, "Chinstrap": 1, "Gentoo": 2}
palette = sns.color_palette()
point_colors = [palette[idx] for idx in penguins.species_short.map(species_to_index)]

# Scatter the two embedding columns against each other.
plt.scatter(embedding[:, 0], embedding[:, 1], c=point_colors)
# Equal axis scaling so distances look the same in both directions.
plt.gca().set_aspect("equal", "datalim")
plt.title("trimap projection of the Penguin dataset", fontsize=24)