In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
from bokeh.plotting import figure, output_notebook, show

output_notebook()

In [2]:
n_data = 1000

# uniform
uniform = np.random.random_sample(n_data)

# leftside-skewed
leftskewed = uniform ** (1/3)

# rightside-skewed
rightskewed = uniform **(1.5)

dist_samples = [
    uniform,
    leftskewed,
    rightskewed
]

In [3]:
distance_figures = []

for i, dist in enumerate(dist_samples):

    hist, edges = np.histogram(dist, density=True, bins=20)

    title = 'Data #{}'.format(i+1)
    p = figure(background_fill_color="#E8DDCB", height=600, width=600, title=title)
    p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
       fill_color="#036564", line_color="#033649")

    distance_figures.append(p)
    show(p)

In [4]:
def get_entropy(dist_, var):
    prob = to_prob(dist_, var)
    entropy = - (prob * np.log(prob)).sum()
    return entropy

def to_prob(dist_, var):
    prob = np.exp(-(dist_.copy() ** 2) / var)
    prob = prob / prob.sum()
    return prob

def binary_search_variance(dist, perplexity=30.0, verbose=False):

    desired_entropy = np.log2(perplexity)

    var = 1
    decay = 0.9
    factor = 2

    previous_diff_sign = True

    for n_try in range(30):

        entropy = get_entropy(dist, var)
        entropy_diff = entropy - desired_entropy
        diff_sign = entropy_diff > 0

        if previous_diff_sign != diff_sign:
            factor = max(1, factor * decay)

        if entropy_diff > 0:
            var /= factor
        else:
            var *= factor

        if verbose:
            print('var = {:f}, perplexity = {:f}'.format(var, 2 ** entropy))

        previous_diff_sign = diff_sign
        if factor == 1:
            break

    return var, 2 ** entropy

binary_search_variance(dist_samples[0], verbose=True)

var = 0.500000, perplexity = 116.905231
var = 0.250000, perplexity = 109.953345
var = 0.125000, perplexity = 95.167037
var = 0.062500, perplexity = 76.522375
var = 0.031250, perplexity = 60.324775
var = 0.015625, perplexity = 47.682951
var = 0.007812, perplexity = 37.879158
var = 0.014063, perplexity = 29.316405
var = 0.008681, perplexity = 36.525909
var = 0.005358, perplexity = 30.563451
var = 0.007813, perplexity = 25.140675
var = 0.011391, perplexity = 29.316405
var = 0.008681, perplexity = 33.876913
var = 0.006615, perplexity = 30.563451
var = 0.007813, perplexity = 27.409098
var = 0.009226, perplexity = 29.316405
var = 0.008681, perplexity = 31.296131
var = 0.008167, perplexity = 30.563451
var = 0.008167, perplexity = 29.838489


(0.008166998364405037, 29.83848929926961)

In [5]:
from bokeh.layouts import gridplot

for i, dist in enumerate(dist_samples):

    var, perplexity = binary_search_variance(dist, perplexity=40)
    prob = to_prob(dist, var)

    hist, edges = np.histogram(-prob, density=True, bins=20)

    title = 'Data #{}, perplexity={:.3f}, closest points bin : {:.5f} - {:.5f}'.format(
        i+1, perplexity, -edges[1], -edges[0])

    p = figure(background_fill_color="#E8DDCB", height=600, width=600, title=title)
    p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
       fill_color="#036564", line_color="#033649")

    gp = gridplot([distance_figures[i], p], ncols=2, title=title)
    show(gp)
    

In [6]:
dist = dist_samples[0]

plots = []
for perplexity in [20, 30, 50, 100]:
    var, perplexity = binary_search_variance(dist, perplexity=perplexity)
    prob = to_prob(dist, var)

    hist, edges = np.histogram(-prob, density=True, bins=20)

    title = 'Data #1, perp={:.3f}, closest points bin : {:.5f} - {:.5f}'.format(
        perplexity, -edges[1], -edges[0])
    p = figure(background_fill_color="#E8DDCB", height=500, width=500, title=title)
    p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
       fill_color="#036564", line_color="#033649")

    plots.append(p)

gp = gridplot([[plots[0], plots[1]], [plots[2], plots[3]]])
show(gp)