In [1]:
import numpy as np

# Number of maxima simulated for each configuration.
n_samples = 10000

def max_random_sample(max_from=1):
    """Draw ``max_from`` standard-normal values and return the largest.

    Args:
        max_from: passed to ``np.random.normal`` as ``size``, i.e. the
            number of values the maximum is taken over.

    Returns:
        The maximum of the drawn values.
    """
    draws = np.random.normal(loc=0, scale=1, size=max_from)
    return draws.max()

def max_random_samples(max_from, n_samples):
    """Return ``n_samples`` maxima, each taken over ``max_from`` normal draws.

    All values are drawn with a single ``np.random.normal`` call shaped
    ``(n_samples, max_from)`` and reduced row-wise, instead of making one
    Python-level call per sample.

    Args:
        max_from: number of standard-normal draws each maximum is taken over.
        n_samples: number of maxima to produce.

    Returns:
        1-D ``numpy`` array of length ``n_samples``; element ``i`` is the
        maximum of row ``i`` of the drawn matrix.
    """
    # One row per requested sample; note this allocates
    # n_samples * max_from floats at once.
    draws = np.random.normal(0, 1, (n_samples, max_from))
    return draws.max(axis=1)

# Maximum over a single draw: each entry is just one standard-normal value.
samples = max_random_samples(max_from=1, n_samples=n_samples)

In [2]:
from bokeh.plotting import show, output_notebook, output_file, figure
from bokeh.palettes import brewer

# Configure Bokeh to display figures inline in the notebook.
output_notebook()

def draw_histogram(bins, ranges, color="#036564", legend=None):
    """Draw a histogram as semi-transparent quads on the global figure ``p``.

    Args:
        bins: bar heights, one per bin.
        ranges: bin edges; ``ranges[:-1]`` and ``ranges[1:]`` are used as
            the left and right edges of the bars.
        color: used for both the fill and the outline of the bars.
        legend: legend entry for the bars; not passed on when falsy.
    """
    quad_kwargs = dict(
        top=bins,
        bottom=0,
        left=ranges[:-1],
        right=ranges[1:],
        fill_color=color,
        line_color=color,
        alpha=0.5,
    )
    if legend:
        # NOTE(review): newer Bokeh releases replace the `legend` keyword
        # with `legend_label` -- confirm against the installed version.
        quad_kwargs['legend'] = legend
    p.quad(**quad_kwargs)

In [3]:
# Overlay one histogram per sample size on a single figure so the
# distributions of the maximum can be compared directly. The figure is
# titled and labelled so the plot can be read on its own.
p = figure(title='Max value sample distribution',
           x_axis_label='maximum of the drawn values',
           y_axis_label='fraction of samples')

max_froms = [1, 20, 50, 100, 200]
# One palette colour per sample size, taken in reversed palette order.
colors = reversed(brewer['YlGnBu'][len(max_froms)])

for max_from, color in zip(max_froms, colors):
    samples = max_random_samples(max_from, n_samples)
    bins, ranges = np.histogram(samples, bins=30)
    # Scale the raw bin counts by the number of samples drawn.
    bins = bins / n_samples
    legend = 'max from {} samples'.format(max_from)
    draw_histogram(bins, ranges, color, legend)

    mean = samples.mean()
    std = samples.std()
    print('max from {} samples: mean = {}, std = {}'.format(
        max_from, '%.3f' % mean, '%.3f' % std))
show(p)

max from 1 samples: mean = -0.014, std = 0.996
max from 20 samples: mean = 1.869, std = 0.527
max from 50 samples: mean = 2.251, std = 0.470
max from 100 samples: mean = 2.505, std = 0.425
max from 200 samples: mean = 2.746, std = 0.403


In [4]:
# NOTE(review): leftover interactive help lookup (IPython `?` syntax, not
# plain Python); consider removing this cell from the finished notebook.
output_file?

In [5]:
# Register a standalone HTML file as an output target, then show the
# figure again.
output_file(filename='cherry_picking.html',
            title='Max value sample distribution')
show(p)