In [1]:
import numpy as np
from bokeh.io import save
from bokeh.plotting import figure, output_notebook, show
from bokeh.models import Arrow, OpenHead
from sklearn.decomposition import TruncatedSVD, PCA, KernelPCA
from sklearn.utils.extmath import randomized_svd
from soydata.data.classification import make_spiral
from soydata.visualize import scatterplot

import warnings
# Suppress every warning for the rest of the session (no category/module filter is given).
warnings.filterwarnings('ignore')

# Configure bokeh so that subsequent `show()` calls render inline in the notebook.
output_notebook()

## PCA, kPCA & SVD

In [2]:
# Toy data: 30 points, narrow along the first coordinate (width 0.1) and
# long along the second (width 1), offset by 0.3 from the origin.
np.random.seed(0)
x = np.random.random_sample((30, 2))
x = x * np.array([0.1, 1]) + 0.3

# 2x2 rotation matrix for a 30 degree angle.
theta = np.deg2rad(30)
c = np.cos(theta)
s = np.sin(theta)
R = np.array([[c, -s],
              [s,  c]])
print(R)

# Points are stored as rows, so the matrix is applied from the right.
x = x @ R

[[ 0.8660254 -0.5      ]
 [ 0.5        0.8660254]]


In [3]:
def draw_rotated_space(x, z=None, axis=None, x_range=(-1.5, 1.5),
    y_range=(-1.5, 1.5), height=400, width=400, title=None, size=5):
    """Scatter 2D input data, optionally with transformed data and axis arrows.

    Parameters
    ----------
    x : indexable as ``x[:, 0]`` / ``x[:, 1]``
        Input points, drawn in blue.
    z : same layout as ``x``, optional
        Transformed points, drawn in red when given.
    axis : iterable of 2-element rows, optional
        Each row is drawn as a dashed line plus an arrow from the origin
        to ``(row[0], row[1])``.
    x_range, y_range : tuple
        Plot ranges passed to ``figure``.
    height, width : int
        Figure size passed to ``figure``.
    title : str, optional
        Figure title.
    size : int
        Marker size of the data points; the origin marker uses ``size + 5``.

    Returns
    -------
    The bokeh figure that was created.
    """
    p = figure(x_range=x_range, y_range=y_range,
               height=height, width=width, title=title)
    p.scatter(x[:,0], x[:,1], color='#2b83ba', size=size,
              alpha=0.7, legend_label='input data')

    if z is not None:
        p.scatter(z[:,0], z[:,1], color='#d7191c', size=size,
                  alpha=0.7, legend_label='transformed data')

    if axis is not None:
        colors = ['#31a354', '#addd8e']
        for i, row in enumerate(axis):
            # Cycle through the palette so more than two axes do not raise IndexError.
            color = colors[i % len(colors)]
            p.line([0, row[0]], [0, row[1]], line_color=color,
                   line_width=2, line_dash=(4,4), legend_label=f'axis {i+1}')
            arrow = Arrow(end=OpenHead(size=10, line_color=color),
                          x_start=0, y_start=0, x_end=row[0], y_end=row[1])
            p.add_layout(arrow)

    # Mark the origin so the direction of the axes is easy to read.
    p.scatter(0, 0, marker='+', size=size+5, color='black')

    return p

# Display the input data alone (no transformed points, no axes).
show(draw_rotated_space(x))

In [4]:
from bokeh.layouts import gridplot

figures = []

# --- PCA: the rows of `components_` are drawn as the axes ---
model = PCA(n_components=2)
z = model.fit_transform(x)
axis = model.components_
# Flip axes and scores together when the first axis points to the left, so
# the panels are comparable. Negated copies are used so the fitted model's
# `components_` is not mutated in place.
if axis[0,0] < 0:
    axis = -axis
    z = -z
figures.append(draw_rotated_space(x, z, axis, title='PCA'))

# --- Kernel PCA with default settings: no explicit axes to draw ---
z = KernelPCA(n_components=2).fit_transform(x)
figures.append(draw_rotated_space(x, z, title='Kernel PCA'))

# --- SVD on the raw data and on the mean-centered data ---
x_ = x - x.mean(axis=0)

for name, data in [('origin', x), ('centered', x_)]:
    U, Sigma, VT = randomized_svd(data, n_components=2)

    # Flip U and VT together: U * Sigma @ VT is unchanged by a joint sign
    # flip. The original flipped only VT and z, leaving U with a sign that
    # did not match the axes drawn in the "U & axis S*V" panel.
    if VT[0,0] < 0:
        VT *= -1
        U *= -1

    z = U * Sigma

    title = f'U*S & axis V from SVD with {name}'
    figures.append(draw_rotated_space(x, z, VT, title=title))

    title = f'U & axis S*V from SVD with {name}'
    figures.append(
        draw_rotated_space(x, U, VT * Sigma[:,np.newaxis], title=title))

# Top row: PCA and the two "U*S" panels; bottom row: Kernel PCA and the two
# "U & S*V" panels.
gp = gridplot([
    [figures[2*i] for i in range(3)],
    [figures[2*i+1] for i in range(3)]
])
show(gp)

save(gp, './figures/pca_kpca_svd.html')

'/mnt/lovit/works/python_ml_intro/python_ml_intro/04_preprocessing_and_feature_extraction/figures/pca_kpca_svd.html'

In [5]:
# Two-class spiral dataset.
x, labels = make_spiral(n_samples_per_class=500, n_classes=2,
    n_rotations=2.5, gap_between_spiral=0.1, noise=0.2,
    gap_between_start_point=0.1, equal_interval=True)

# Row indices of each class; reused for every panel.
idx_class0 = np.where(labels==0)[0]
idx_class1 = np.where(labels==1)[0]
point_style = dict(show_inline=False, size=3, alpha=0.5)

# First panel shows the raw data (kernel is None); the remaining panels show
# the Kernel PCA embedding for each kernel.
figures = []
for kernel in [None] + 'linear poly rbf sigmoid cosine'.split():
    if kernel is None:
        z, title = x, 'Dataset'
    else:
        z = KernelPCA(n_components=2, kernel=kernel).fit_transform(x)
        title = f'{kernel} Kernel PCA'

    # Class 0 creates the figure, class 1 is drawn onto the same figure.
    p = scatterplot(z[idx_class0], title=title, color='#2b83ba', **point_style)
    p = scatterplot(z[idx_class1], p=p, color='#d7191c', **point_style)
    p.height = 400
    p.width = 400
    figures.append(p)

# 2 x 3 layout: dataset + two kernels on top, three kernels below.
gp_kpca = gridplot([figures[:3], figures[3:]])
show(gp_kpca)
save(gp_kpca, './figures/kpca_various_kernels_spiral.html')

'/mnt/lovit/works/python_ml_intro/python_ml_intro/04_preprocessing_and_feature_extraction/figures/kpca_various_kernels_spiral.html'

In [6]:
from soydata.data.classification import make_moons
from soydata.visualize import scatterplot

# Two-moons dataset.
x, labels = make_moons(n_samples=500, xy_ratio=2.0, x_gap=-0.2, y_gap=0.2, noise=0.1)

# (row indices, color) for each of the two classes, in drawing order.
class_styles = [
    (np.where(labels==0)[0], '#2b83ba'),
    (np.where(labels==1)[0], '#d7191c'),
]
(first_idx, first_color), (second_idx, second_color) = class_styles

figures = []
kernels = 'linear poly rbf sigmoid cosine'.split()
for panel_no in range(len(kernels) + 1):
    # Panel 0 is the raw data; panels 1.. are the Kernel PCA embeddings.
    if panel_no == 0:
        z = x
        title = 'Dataset'
    else:
        kernel = kernels[panel_no - 1]
        z = KernelPCA(n_components=2, kernel=kernel).fit_transform(x)
        title = f'{kernel} Kernel PCA'

    # The first call creates the figure; the second adds to it via `p=`.
    p = scatterplot(z[first_idx], title=title, color=first_color,
        show_inline=False, size=3, alpha=0.5)
    p = scatterplot(z[second_idx], p=p, color=second_color,
        show_inline=False, size=3, alpha=0.5)
    p.height = 400
    p.width = 400
    figures.append(p)

# 2 x 3 layout: dataset + two kernels on top, three kernels below.
gp_kpca = gridplot([figures[:3], figures[3:]])
show(gp_kpca)
save(gp_kpca, './figures/kpca_various_kernels_twomoon.html')

'/mnt/lovit/works/python_ml_intro/python_ml_intro/04_preprocessing_and_feature_extraction/figures/kpca_various_kernels_twomoon.html'

## Perplexity in t-SNE

In [8]:
import math
from sklearn.manifold import TSNE


def draw_tsne_perplexity(x, y, perplexity, title):
    """Scatter 2D points colored by class label on a 400x400 bokeh figure.

    Parameters
    ----------
    x : array indexable as ``x[mask]`` and ``[:, 0]`` / ``[:, 1]``
        2D coordinates, one row per point.
    y : array-like
        Class label of each row of ``x``.
    perplexity :
        Not used by the drawing code; kept so existing calls keep working.
    title : str
        Figure title.

    Returns
    -------
    The bokeh figure that was created.
    """
    colors = 'firebrick darksalmon lightseagreen'.split()
    p = figure(width=400, height=400, title=title)
    y = np.asarray(y)
    # Take the classes from `y` itself rather than from a module-level
    # `n_classes`, and cycle the palette so more than three classes do not
    # raise IndexError.
    for i, label in enumerate(np.unique(y)):
        points = x[y == label]
        color = colors[i % len(colors)]
        p.scatter(points[:,0], points[:,1], fill_color=color, line_color=color)
    return p

def list_to_grid(plots, n_cols=4):
    """Split a flat list into consecutive rows of at most ``n_cols`` items.

    The last row is shorter when ``len(plots)`` is not a multiple of
    ``n_cols``; an empty input gives an empty grid.
    """
    row_count = math.ceil(len(plots) / n_cols)
    return [plots[row * n_cols:(row + 1) * n_cols] for row in range(row_count)]

np.random.seed(3)
n_data_per_class = 50
n_classes = 3

# Each class is a uniform 0.3 x 0.3 square shifted by its own random offset.
# Per class the square is sampled first, then the offset.
x = np.vstack([
    0.3 * np.random.random_sample((n_data_per_class, 2))
    + np.random.random_sample((1, 2))
    for _ in range(n_classes)
])
# Labels 0..n_classes-1, each repeated n_data_per_class times, in class order.
y = np.repeat(np.arange(n_classes), n_data_per_class)

# First panel: the raw data (-1 stands in for "no perplexity").
plots = [draw_tsne_perplexity(x, y, -1, title='Dataset')]

# NOTE(review): the dataset has 150 points, so perplexity=200 exceeds the
# sample count; recent scikit-learn releases appear to reject
# perplexity >= n_samples -- confirm against the installed version.
for perplexity in [2, 5, 10, 30, 50, 100, 200]:
    embedder = TSNE(n_components=2, perplexity=perplexity)
    z = embedder.fit_transform(x)
    title = f'3 classes t-SNE embedding with perplexity = {perplexity}'
    plots.append(draw_tsne_perplexity(z, y, perplexity, title))

gp = gridplot(list_to_grid(plots))
show(gp)
save(gp, './figures/tsne_perplexity.html')

'/mnt/lovit/works/python_ml_intro/python_ml_intro/04_preprocessing_and_feature_extraction/figures/tsne_perplexity.html'

In [9]:
import umap

np.random.seed(3)
n_data_per_class = 50
n_classes = 3

# Three classes, each a uniform 0.3 x 0.3 square shifted by a random offset.
x = []
y = []
for c in range(n_classes):
    x_ = 0.3 * np.random.random_sample((n_data_per_class, 2))
    x_ += np.random.random_sample((1, 2))
    x.append(x_)
    y.append(np.asarray([c] * n_data_per_class))
x = np.vstack(x)
y = np.concatenate(y)

# First panel: the raw data (-1 stands in for "no hyperparameter").
figures = [draw_tsne_perplexity(x, y, -1, title='Dataset')]
for n_neighbors in [2, 5, 10, 20, 50]:
    z = umap.UMAP(n_neighbors=n_neighbors).fit_transform(x)
    title = f'UMAP with neighbors = {n_neighbors}'
    # Pass this sweep's own hyperparameter instead of the `perplexity`
    # variable left behind by the t-SNE cell.
    figures.append(draw_tsne_perplexity(z, y, n_neighbors, title))

gp = gridplot(list_to_grid(figures, n_cols=3))
show(gp)
save(gp, './figures/umap_neighbors.html')

'/mnt/lovit/works/python_ml_intro/python_ml_intro/04_preprocessing_and_feature_extraction/figures/umap_neighbors.html'

In [11]:
# Sweep `spread` (one grid row each) and `min_dist` as a fraction of the
# spread (one grid column each), with the neighbourhood size fixed.
n_neighbors = 10

figures = []
for spread in [0.1, 1, 10]:
    for factor in [0.5, 0.1, 0.01]:
        min_dist = spread * factor
        title = f'UMAP min_dist={min_dist:.2}, spread={spread}, neighbors={n_neighbors}'
        z = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist, spread=spread).fit_transform(x)
        # Pass this cell's own hyperparameter instead of the `perplexity`
        # variable left behind by the t-SNE cell.
        figures.append(draw_tsne_perplexity(z, y, n_neighbors, title))

gp = gridplot(list_to_grid(figures, n_cols=3))
show(gp)
save(gp, './figures/umap_mindist_spread.html')

'/mnt/lovit/works/python_ml_intro/python_ml_intro/04_preprocessing_and_feature_extraction/figures/umap_mindist_spread.html'