<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><ul class="toc-item"><li><span><a href="#Overview" data-toc-modified-id="Overview-0.1"><span class="toc-item-num">0.1&nbsp;&nbsp;</span>Overview</a></span></li></ul></li><li><span><a href="#Load-Data" data-toc-modified-id="Load-Data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Load Data</a></span></li><li><span><a href="#Kernel-PCA-Example" data-toc-modified-id="Kernel-PCA-Example-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Kernel PCA Example</a></span></li><li><span><a href="#Isomap-Example" data-toc-modified-id="Isomap-Example-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Isomap Example</a></span></li><li><span><a href="#LLE-Example" data-toc-modified-id="LLE-Example-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>LLE Example</a></span></li><li><span><a href="#UMAP-Example" data-toc-modified-id="UMAP-Example-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>UMAP Example</a></span></li></ul></div>

In [None]:
# Detect Google Colab by looking for 'google.colab' in the string form of the
# active IPython shell; only in that case is the package cloned and installed.
using_colab = 'google.colab' in str(get_ipython())
if using_colab:
    # Shallow clone (--depth 1) of the pychemauth repository.
    !git clone https://github.com/mahynski/pychemauth.git --depth 1
    # Install from inside the cloned checkout, then return to the parent directory.
    !cd pychemauth; pip3 install .; cd ..

import pychemauth

import matplotlib.pyplot as plt
%matplotlib notebook

import watermark
%load_ext watermark

%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import sklearn.decomposition
import sklearn.manifold

from pychemauth.manifold.elliptic import EllipticManifold

Overview
--------
This is a set of simple examples that use various manifold methods to perform dimensionality reduction and then fit an elliptical boundary in the reduced space.

In [None]:
# Record the execution environment for reproducibility: timestamp (-t), machine
# info (-m), Python version (-v), and versions of imported packages (--iversions).
%watermark -t -m -v --iversions

# Load Data

In [None]:
import sklearn.datasets
import sklearn.model_selection
from sklearn.preprocessing import StandardScaler

# Load the iris data set and pull out the feature matrix and class labels.
data = sklearn.datasets.load_iris()
X, y = data.data, data.target

# Separate the samples by class label (0, 1, 2).
X_0, y_0 = X[y == 0], y[y == 0]
X_1, y_1 = X[y == 1], y[y == 1]
X_2, y_2 = X[y == 2], y[y == 2]

# Only class 0 is split into train/test; classes 1 and 2 are kept whole and
# used later as examples of "other" classes.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X_0,
    y_0,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

# Centering and scaling are both switched off, so this scaler is effectively a
# pass-through; it is fit on the training split only and applied to every set.
ss = StandardScaler(with_mean=False, with_std=False)
ss.fit(X_train)

X_train_scaled, X_test_scaled, X_1_scaled, X_2_scaled = (
    ss.transform(subset) for subset in (X_train, X_test, X_1, X_2)
)

# Kernel PCA Example

See [scikit-learn's documentation](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.KernelPCA.html?highlight=kernel%20pca#sklearn.decomposition.KernelPCA) for details on hyperparameters.

In [None]:
# One-component Kernel PCA with a linear kernel as the dimensionality reducer.
model = sklearn.decomposition.KernelPCA
kwargs = {"n_components":1, "kernel":"linear"}

# NOTE(review): 0.05 is presumably the significance level (alpha) of the
# elliptical boundary - confirm against EllipticManifold's signature.
m = EllipticManifold(0.05, model, kwargs)

# Fit on the scaled training data so the model is trained on the same
# representation it is visualized and scored with in the rest of the notebook.
_ = m.fit(X_train_scaled)

_ = m.visualize([X_train_scaled, X_test_scaled, X_1_scaled, X_2_scaled], ["Training Set", "Test Set", "Class 1", "Class 2"])

In [None]:
# Score the held-out class-0 test set against an all-ones label vector
# (1 = inlier, 0 = outlier).
m.score(X_test_scaled, [1]*len(X_test_scaled))

In [None]:
# Score class 1 against an all-zeros label vector (0 = outlier).
m.score(X_1_scaled, [0]*len(X_1_scaled)) # A 1D embedding is not enough to distinguish class 0 from class 1

In [None]:
# Same linear Kernel PCA as before, but keeping two components instead of one.
kwargs = dict(n_components=2, kernel="linear")

# Build a fresh (unfitted) manifold model with the 2D settings.
m = EllipticManifold(0.05, model, kwargs)

In [None]:
# Fit the 2D model on the scaled class-0 training data; assigning to `_`
# keeps the returned object from being echoed as cell output.
_ = m.fit(X_train_scaled)

In [None]:
# Visualize the four data sets with their legend labels
# (presumably in the fitted 2D embedding - confirm in EllipticManifold.visualize).
_ = m.visualize([X_train_scaled, X_test_scaled, X_1_scaled, X_2_scaled], ["Training Set", "Test Set", "Class 1", "Class 2"])

In [None]:
# NOTE(review): y_test is passed as a second positional argument here - confirm
# that EllipticManifold.predict accepts it (predict typically takes only X).
m.predict(X_test_scaled, y_test) # 1 = inlier, 0 = outlier

In [None]:
# All-ones labels: every test sample is expected to be an inlier.
m.score(X_test_scaled, [1]*len(X_test_scaled)) # Score the test set (same class as the training data)

In [None]:
# All-zeros labels: every class-1 sample is expected to be an outlier.
m.score(X_1_scaled, [0]*len(X_1_scaled)) # Score a different class (not the one used for training)

In [None]:
# Sweep polynomial Kernel PCA: one row of panels per embedding dimension, one
# panel per kernel degree, and a final panel per row summarizing score vs. degree.
degrees = [1, 3, 5]  # used for both the fits and the summary x-axis

fig, axes = plt.subplots(nrows=2, ncols=len(degrees) + 1, figsize=(16,6))
ax = axes.flatten()

idx = 0  # index of the next free panel in the flattened axes array
for dims in [1, 2]:
    scores = []
    for degree in degrees:
        model = sklearn.decomposition.KernelPCA
        kwargs = {"n_components":dims,
                  "kernel":"poly",
                  "degree":degree
                 }
        m = EllipticManifold(0.05, model, kwargs)
        _ = m.fit(X_train_scaled)

        # Labels: 1 = expected inlier (same class as training), 0 = expected outlier.
        train_score = m.score(X_train_scaled, [1]*len(X_train_scaled))
        test_score = m.score(X_test_scaled, [1]*len(X_test_scaled))
        x1_score = m.score(X_1_scaled, [0]*len(X_1_scaled))
        x2_score = m.score(X_2_scaled, [0]*len(X_2_scaled))
        scores.append([train_score, test_score, x1_score, x2_score])

        ax_ = m.visualize([X_train_scaled, X_test_scaled, X_1_scaled, X_2_scaled], ["Training Set", "Test Set", "Class 1", "Class 2"], ax[idx])
        # The degree is an integer, so show it as-is rather than as "1.000".
        ax_.set_title('Degree = {}, Test Score = {}\nX_1 Score = {}, X_2 Score = {}'.format(
            degree, '%.3f'%test_score, '%.3f'%x1_score, '%.3f'%x2_score))
        idx += 1

    # Summary panel: columns of `scores` are train, test, class 1, class 2.
    scores = np.array(scores)
    ax[idx].plot(degrees, scores[:, 0], '-o', alpha=0.5, label='Train')
    ax[idx].plot(degrees, scores[:, 1], '-s', alpha=0.5, label='Test')
    ax[idx].plot(degrees, scores[:, 2], '-*', alpha=0.5, label='X_1')
    ax[idx].plot(degrees, scores[:, 3], '-^', alpha=0.5, label='X_2')
    ax[idx].set_xlabel('Degree')
    ax[idx].set_ylabel('Score')
    ax[idx].legend(loc='best')
    idx += 1

plt.tight_layout()

# Isomap Example

See [scikit-learn's documentation](https://scikit-learn.org/stable/modules/generated/sklearn.manifold.Isomap.html?highlight=isomap#sklearn.manifold.Isomap) for details on hyperparameters.

In [None]:
# Sweep Isomap: one row of panels per embedding dimension, one panel per
# neighbor count, and a final panel per row summarizing score vs. neighbors.
# The same list drives the fits and the summary x-axis so the plotted x-values
# always match the models that were actually fitted.
neighbor_counts = [5, 10, 20]

fig, axes = plt.subplots(nrows=2, ncols=len(neighbor_counts) + 1, figsize=(16,6))
ax = axes.flatten()

idx = 0  # index of the next free panel in the flattened axes array
for dims in [1, 2]:
    scores = []
    for nebrs in neighbor_counts:
        model = sklearn.manifold.Isomap
        kwargs = {"n_neighbors":nebrs,
                  "n_components":dims,
                  "metric":'minkowski',
                  "p":2,  # Minkowski with p=2 is the Euclidean distance
                 }
        m = EllipticManifold(0.05, model, kwargs)
        _ = m.fit(X_train_scaled)

        # Labels: 1 = expected inlier (same class as training), 0 = expected outlier.
        train_score = m.score(X_train_scaled, [1]*len(X_train_scaled))
        test_score = m.score(X_test_scaled, [1]*len(X_test_scaled))
        x1_score = m.score(X_1_scaled, [0]*len(X_1_scaled))
        x2_score = m.score(X_2_scaled, [0]*len(X_2_scaled))
        scores.append([train_score, test_score, x1_score, x2_score])

        ax_ = m.visualize([X_train_scaled, X_test_scaled, X_1_scaled, X_2_scaled], ["Training Set", "Test Set", "Class 1", "Class 2"], ax[idx])
        ax_.set_title('Nebrs = {}, Test Score = {}\nX_1 Score = {}, X_2 Score = {}'.format(
            nebrs, '%.3f'%test_score, '%.3f'%x1_score, '%.3f'%x2_score))
        idx += 1

    # Summary panel: columns of `scores` are train, test, class 1, class 2.
    scores = np.array(scores)
    ax[idx].plot(neighbor_counts, scores[:, 0], '-o', alpha=0.5, label='Train')
    ax[idx].plot(neighbor_counts, scores[:, 1], '-s', alpha=0.5, label='Test')
    ax[idx].plot(neighbor_counts, scores[:, 2], '-*', alpha=0.5, label='X_1')
    ax[idx].plot(neighbor_counts, scores[:, 3], '-^', alpha=0.5, label='X_2')
    ax[idx].set_xlabel('Nebrs')
    ax[idx].set_ylabel('Score')
    ax[idx].legend(loc='best')
    idx += 1

plt.tight_layout()

# LLE Example

See [scikit-learn's documentation](https://scikit-learn.org/stable/modules/generated/sklearn.manifold.LocallyLinearEmbedding.html?highlight=locally%20linear#sklearn.manifold.LocallyLinearEmbedding) for details on hyperparameters.

In [None]:
# Locally Linear Embedding sweep: one row of panels per embedding dimension,
# one panel per neighbor count, plus a per-row summary of score vs. neighbors.
neighbor_options = [5, 15, 30]
panel_data = [X_train_scaled, X_test_scaled, X_1_scaled, X_2_scaled]
panel_names = ["Training Set", "Test Set", "Class 1", "Class 2"]

fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(16,6))
ax = axes.flatten()

idx = 0  # index of the next free panel in the flattened axes array
for dims in [1, 2]:
    scores = []
    for nebrs in neighbor_options:
        model = sklearn.manifold.LocallyLinearEmbedding
        kwargs = dict(n_neighbors=nebrs, n_components=dims)
        m = EllipticManifold(0.05, model, kwargs)
        _ = m.fit(X_train_scaled)

        # Train/test are scored against label 1 (expected inliers); classes 1
        # and 2 against label 0 (expected outliers).
        train_score, test_score, x1_score, x2_score = [
            m.score(X_, [flag]*len(X_))
            for X_, flag in zip(panel_data, [1, 1, 0, 0])
        ]
        scores.append([train_score, test_score, x1_score, x2_score])

        ax_ = m.visualize(panel_data, panel_names, ax[idx])
        ax_.set_title('Nebrs = {}, Test Score = {}\nX_1 Score = {}, X_2 Score = {}'.format(
            nebrs, *('%.3f'%s for s in (test_score, x1_score, x2_score))))
        idx += 1

    # Summary panel: one curve per column of the score table.
    scores = np.array(scores)
    summary = ax[idx]
    for col, (fmt, name) in enumerate([('-o', 'Train'), ('-s', 'Test'), ('-*', 'X_1'), ('-^', 'X_2')]):
        summary.plot(neighbor_options, scores[:, col], fmt, alpha=0.5, label=name)
    summary.set_xlabel('Nebrs')
    summary.set_ylabel('Score')
    summary.legend(loc='best')
    idx += 1

plt.tight_layout()

# UMAP Example

UMAP has many parameters that should be understood before using it. See the [documentation](https://umap-learn.readthedocs.io/en/latest/parameters.html) for an explanation. Briefly, the four that matter most are:
* n_neighbors
* n_components
* metric
* min_dist

**IMPORTANT:** Always set `random_state` to ensure reproducibility between runs, since UMAP is stochastic.

In [None]:
import umap

In [None]:
# UMAP sweep: one row of panels per embedding dimension, one panel per
# neighbor count, plus a per-row summary of score vs. neighbors.
neighbor_grid = [5, 15, 30]
n_cols = 4  # three embedding panels + one summary panel per row

fig, axes = plt.subplots(nrows=2, ncols=n_cols, figsize=(16,6))
ax = axes.flatten()

for row, dims in enumerate([1, 2]):
    scores = []
    for col, nebrs in enumerate(neighbor_grid):
        idx = row * n_cols + col  # position of this panel in the flattened axes
        model = umap.UMAP
        kwargs = {
            "n_neighbors": nebrs,   # Fewer neighbors focus on finer detail; more capture the "bigger picture"
            "n_components": dims,   # Final dimensionality
            "random_state": 0,      # Always set this for reproducibility
            "metric": "euclidean",  # How to compute distance between points in the ambient input space
            "min_dist": 1.0,        # Minimum distance allowed between points in the embedding (only constrains training data; test points can end up closer)
        }
        m = EllipticManifold(0.05, model, kwargs)
        _ = m.fit(X_train_scaled)

        # Labels: 1 = expected inlier (same class as training), 0 = expected outlier.
        train_score = m.score(X_train_scaled, [1]*len(X_train_scaled))
        test_score = m.score(X_test_scaled, [1]*len(X_test_scaled))
        x1_score = m.score(X_1_scaled, [0]*len(X_1_scaled))
        x2_score = m.score(X_2_scaled, [0]*len(X_2_scaled))
        scores.append([train_score, test_score, x1_score, x2_score])

        ax_ = m.visualize(
            [X_train_scaled, X_test_scaled, X_1_scaled, X_2_scaled],
            ["Training Set", "Test Set", "Class 1", "Class 2"],
            ax[idx],
        )
        formatted = ['%.3f'%s for s in (test_score, x1_score, x2_score)]
        ax_.set_title('Nebrs = {}, Test Score = {}\nX_1 Score = {}, X_2 Score = {}'.format(nebrs, *formatted))

    # Summary panel sits in the last column of the current row.
    scores = np.array(scores)
    idx = row * n_cols + len(neighbor_grid)
    curves = [('-o', 'Train'), ('-s', 'Test'), ('-*', 'X_1'), ('-^', 'X_2')]
    for series, (marker, legend_name) in enumerate(curves):
        ax[idx].plot(neighbor_grid, scores[:, series], marker, alpha=0.5, label=legend_name)
    ax[idx].set_xlabel('Nebrs')
    ax[idx].set_ylabel('Score')
    ax[idx].legend(loc='best')

plt.tight_layout()