In [1]:
import warnings
warnings.filterwarnings('ignore')

from bokeh.plotting import figure, output_notebook, show
from bokeh.io import export_png

output_notebook()

In [2]:
import numpy as np
n_data = 1000

# uniform
uniform = np.random.random_sample(n_data)

# leftside-skewed
leftskewed = uniform **(1.5)

# rightside-skewed
rightskewed = uniform ** (1/3)

dist_samples = [
    uniform,
    leftskewed,
    rightskewed
]

In [3]:
distance_figures = []

for i, dist in enumerate(dist_samples):

    hist, edges = np.histogram(dist, density=True, bins=20)

    title = 'Data #{}'.format(i+1)
    p = figure(background_fill_color="#E8DDCB", height=600, width=600, title=title)
    p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
       fill_color="#036564", line_color="#033649")

    distance_figures.append(p)
    show(p)

    export_png(p, '../assets/figures/tsne_perplexity_data%d.png'%(i+1))





In [4]:
def get_entropy(dist_, var):
    prob = to_prob(dist_, var)
    entropy = - (prob * np.log(prob)).sum()
    return entropy

def to_prob(dist_, var):
    prob = np.exp(-(dist_.copy() ** 2) / var)
    prob = prob / prob.sum()
    return prob

def binary_search_variance(dist, perplexity=30.0, verbose=False):

    desired_entropy = np.log2(perplexity)

    var = 1
    decay = 0.9
    factor = 2

    previous_diff_sign = True

    for n_try in range(30):

        entropy = get_entropy(dist, var)
        entropy_diff = entropy - desired_entropy
        diff_sign = entropy_diff > 0

        if previous_diff_sign != diff_sign:
            factor = max(1, factor * decay)

        if entropy_diff > 0:
            var /= factor
        else:
            var *= factor

        if verbose:
            print('var = {:f}, perplexity = {:f}'.format(var, 2 ** entropy))

        previous_diff_sign = diff_sign
        if factor == 1:
            break

    return var, 2 ** entropy

binary_search_variance(dist_samples[0], verbose=True)

var = 0.500000, perplexity = 116.908883
var = 0.250000, perplexity = 109.963842
var = 0.125000, perplexity = 95.196686
var = 0.062500, perplexity = 76.614612
var = 0.031250, perplexity = 60.595323
var = 0.015625, perplexity = 48.178333
var = 0.007812, perplexity = 38.530957
var = 0.003906, perplexity = 30.862100
var = 0.007031, perplexity = 24.493020
var = 0.012656, perplexity = 29.825070
var = 0.007812, perplexity = 36.019429
var = 0.004823, perplexity = 30.862100
var = 0.007031, perplexity = 26.318585
var = 0.010252, perplexity = 29.825070
var = 0.007812, perplexity = 33.674926
var = 0.005954, perplexity = 30.862100
var = 0.007031, perplexity = 28.240360
var = 0.008304, perplexity = 29.825070
var = 0.007813, perplexity = 31.475196
var = 0.007350, perplexity = 30.862100
var = 0.006915, perplexity = 30.258543
var = 0.006915, perplexity = 29.664045


(0.0069154417216252895, 29.664044622640052)

In [5]:
from bokeh.layouts import gridplot

for i, dist in enumerate(dist_samples):

    var, perplexity = binary_search_variance(dist, perplexity=40)
    prob = to_prob(dist, var)

    hist, edges = np.histogram(-prob, density=True, bins=20)

    title = 'Data #{}, perplexity={:.3f}, closest points bin : {:.5f} - {:.5f}'.format(
        i+1, perplexity, -edges[1], -edges[0])

    p = figure(background_fill_color="#E8DDCB", height=600, width=600, title=title)
    p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
       fill_color="#036564", line_color="#033649")

    gp = gridplot([distance_figures[i], p], ncols=2, title=title)
    show(gp)

    export_png(gp, '../assets/figures/tsne_perplexity_data%d_p.png'%(i+1))

W-1004 (BOTH_CHILD_AND_ROOT): Models should not be a document root if they are in a layout box: Figure(id='88445a2c-31c2-43a5-9d3b-1decbc257602', ...)


W-1004 (BOTH_CHILD_AND_ROOT): Models should not be a document root if they are in a layout box: Figure(id='88445a2c-31c2-43a5-9d3b-1decbc257602', ...)
W-1004 (BOTH_CHILD_AND_ROOT): Models should not be a document root if they are in a layout box: Figure(id='0ef06594-b86e-4ba8-b2b0-fa39bfb5b0a9', ...)


W-1004 (BOTH_CHILD_AND_ROOT): Models should not be a document root if they are in a layout box: Figure(id='88445a2c-31c2-43a5-9d3b-1decbc257602', ...)
W-1004 (BOTH_CHILD_AND_ROOT): Models should not be a document root if they are in a layout box: Figure(id='0ef06594-b86e-4ba8-b2b0-fa39bfb5b0a9', ...)
W-1004 (BOTH_CHILD_AND_ROOT): Models should not be a document root if they are in a layout box: Figure(id='9369333b-55b6-41b5-8ea4-0cc23e439615', ...)


In [7]:
dist = dist_samples[0]

plots = []
for perplexity in [20, 30, 50, 100]:
    var, perplexity = binary_search_variance(dist, perplexity=perplexity)
    prob = to_prob(dist, var)

    hist, edges = np.histogram(-prob, density=True, bins=20)

    title = 'Data #1, perp={:.3f}, sigma={:.3f} closest points bin : {:.5f} - {:.5f}'.format(
        perplexity, np.sqrt(var), -edges[1], -edges[0])
    p = figure(background_fill_color="#E8DDCB", height=500, width=500, title=title)
    p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
       fill_color="#036564", line_color="#033649")

    plots.append(p)

gp = gridplot([[plots[0], plots[1]], [plots[2], plots[3]]])
show(gp)

export_png(gp, '../assets/figures/tsne_perplexity_data1_various_perplexity.png')

W-1004 (BOTH_CHILD_AND_ROOT): Models should not be a document root if they are in a layout box: Figure(id='88445a2c-31c2-43a5-9d3b-1decbc257602', ...)
W-1004 (BOTH_CHILD_AND_ROOT): Models should not be a document root if they are in a layout box: Figure(id='0ef06594-b86e-4ba8-b2b0-fa39bfb5b0a9', ...)
W-1004 (BOTH_CHILD_AND_ROOT): Models should not be a document root if they are in a layout box: Figure(id='9369333b-55b6-41b5-8ea4-0cc23e439615', ...)


'/mnt/lovit/git/lovit.github.io/assets/figures/tsne_perplexity_data1_various_perplexity.png'

In [8]:
import numpy as np

n_data_per_class = 10
n_classes = 3

x = []
y = []
for c in range(n_classes):
    x_ = 0.1 * np.random.random_sample((n_data_per_class, 2))
    x_ += np.random.random_sample((1, 2))
    x.append(x_)
    y.append(np.asarray([c] * n_data_per_class))
x = np.vstack(x)
y = np.concatenate(y)
print(x)
print(y)

[[1.0266703  0.74847298]
 [0.9841883  0.78404014]
 [1.04037259 0.77570467]
 [1.00865377 0.71931176]
 [0.96601767 0.77035696]
 [0.95819003 0.72671782]
 [0.97872784 0.72303621]
 [0.98689257 0.72647863]
 [0.96019527 0.76703674]
 [1.02704244 0.77947562]
 [0.35606485 0.91383576]
 [0.35208158 0.88696243]
 [0.31514291 0.88577599]
 [0.30506742 0.90147049]
 [0.31223643 0.92715701]
 [0.38135013 0.93488382]
 [0.35368819 0.92456892]
 [0.38421811 0.93076466]
 [0.36852978 0.92387596]
 [0.30379626 0.95688458]
 [0.49375608 0.61255795]
 [0.49204573 0.68044282]
 [0.52327077 0.59745771]
 [0.5009089  0.60386662]
 [0.47689595 0.58367695]
 [0.46728351 0.65663025]
 [0.48822887 0.63944209]
 [0.47853918 0.65223938]
 [0.43722573 0.60441902]
 [0.48437545 0.58826378]]
[0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2]


In [12]:
title = '{} classes {} points'.format(n_classes, n_classes * n_data_per_class)
p = figure(width=600, height=600, title=title)

colors = 'firebrick darksalmon lightseagreen'.split()

for c in range(n_classes):
    idx = np.where(y == c)[0]
    x_ = x[idx]
    p.scatter(x_[:,0], x_[:,1], fill_color=colors[c], line_color=colors[c])
show(p)
export_png(p, '../assets/figures/tsne_perplexity_data4.png')

W-1004 (BOTH_CHILD_AND_ROOT): Models should not be a document root if they are in a layout box: Figure(id='88445a2c-31c2-43a5-9d3b-1decbc257602', ...)
W-1004 (BOTH_CHILD_AND_ROOT): Models should not be a document root if they are in a layout box: Figure(id='0ef06594-b86e-4ba8-b2b0-fa39bfb5b0a9', ...)
W-1004 (BOTH_CHILD_AND_ROOT): Models should not be a document root if they are in a layout box: Figure(id='9369333b-55b6-41b5-8ea4-0cc23e439615', ...)


'/mnt/lovit/git/lovit.github.io/assets/figures/tsne_perplexity_data4.png'

In [17]:
from sklearn.manifold import TSNE

grids = []
for perplexity in [10, 15, 20, 30]:
    p = figure(width=600, height=600, title='perplexity = {:.2f}'.format(perplexity))
    z = TSNE(n_components=2, perplexity=perplexity).fit_transform(x)
    for c in range(n_classes):
        idx = np.where(y == c)[0]
        z_ = z[idx]
        p.scatter(z_[:,0], z_[:,1], fill_color=colors[c], line_color=colors[c])    
    grids.append(p)

gp = gridplot([[grids[0], grids[1]], [grids[2], grids[3]]])
show(gp)
export_png(gp, '../assets/figures/tsne_perplexity_data4_various_perp.png')

W-1004 (BOTH_CHILD_AND_ROOT): Models should not be a document root if they are in a layout box: Figure(id='88445a2c-31c2-43a5-9d3b-1decbc257602', ...)
W-1004 (BOTH_CHILD_AND_ROOT): Models should not be a document root if they are in a layout box: Figure(id='0ef06594-b86e-4ba8-b2b0-fa39bfb5b0a9', ...)
W-1004 (BOTH_CHILD_AND_ROOT): Models should not be a document root if they are in a layout box: Figure(id='9369333b-55b6-41b5-8ea4-0cc23e439615', ...)


'/mnt/lovit/git/lovit.github.io/assets/figures/tsne_perplexity_data4_various_perp.png'