In [1]:
import random
import numpy as np

In [2]:
from scipy.stats import ttest_ind_from_stats
from scipy.stats import ttest_ind
import statsmodels.stats.api as sms
import scipy.stats as stats

In [3]:
from bokeh.io import show
from bokeh.plotting import figure
from bokeh.io import output_notebook
from bokeh.layouts import gridplot
output_notebook()

In [4]:
"Please skip to below. This just copies code from the previous notebook"
# NOTE: do not fix the seed when producing real results. It is set here only
# for instructional purposes, so the resampled numbers shown in this notebook
# come out the same on every run. This seeds Python's `random` module.
random.seed(13)
def resample(l):
    """Draw one bootstrap resample of ``l``.

    Picks ``len(l)`` items from ``l`` with ``random.choice`` (i.e. with
    replacement) and returns them in a new list. The result has the same
    length as the input; individual values may repeat or be absent.
    An empty input yields an empty list.
    """
    return [random.choice(l) for _ in range(len(l))]

def repeat_resample(sample_a, sample_b, num_iter = 1000):
    """Bootstrap the difference in means between two samples.

    For each of ``num_iter`` iterations, resamples ``sample_a`` and then
    ``sample_b`` (each with replacement, via ``resample``) and records
    ``mean(resample_a) - mean(resample_b)``.

    Returns a list of ``num_iter`` differences, in the order they were drawn.
    """
    def one_difference():
        # Resample A before B so the random draws happen in a fixed order.
        boot_a = resample(sample_a)
        boot_b = resample(sample_b)
        return np.mean(boot_a) - np.mean(boot_b)

    return [one_difference() for _ in range(num_iter)]



def make_grid(sample_a, sample_b):
    """Build a side-by-side grid of bar charts for one resample of each sample.

    Resamples ``sample_a`` and ``sample_b`` once, plots each resample's sorted
    values as a bar chart (one bar per position), prints the two resample
    means, and returns the two-column grid layout.
    """
    # NOTE(review): make_bar is not defined in the visible part of this
    # notebook -- presumably it lives in the previous notebook; confirm it is
    # available before calling this function.
    boot_a = resample(sample_a)
    boot_b = resample(sample_b)
    value_range = (120, 180)  # shared y-axis limits for both charts
    left = make_bar(labels = list(range(len(boot_a))), nums = sorted(boot_a),
                    title = "Resample A", y_range = value_range)
    right = make_bar(labels = list(range(len(boot_b))), nums = sorted(boot_b),
                     title = "Resample B", y_range = value_range)
    grid = gridplot([left, right], ncols = 2)
    message = 'the mean of resample 1 is {r1} and the mean of resample2 is {r2}'
    print(message.format(r1 = np.mean(boot_a), r2 = np.mean(boot_b)))
    return grid

def hist(l):
    """Return a new Bokeh figure showing a density histogram of ``l``.

    Bins come from ``np.histogram`` with ``density=True`` (default bin count),
    and each bin is drawn as a semi-transparent rectangle rising from zero.
    """
    densities, bin_edges = np.histogram(l, density=True)
    lefts, rights = bin_edges[:-1], bin_edges[1:]
    plot = figure()
    plot.quad(top = densities, bottom=0, left=lefts, right=rights, alpha = .4)
    return plot

def get_classroom_willow_a():
    """Return the 25 hard-coded sample values for classroom A of the willow school.

    A fresh list is built on every call, so callers may mutate the result.
    (Presumably student heights; the unit is not stated in this notebook.)
    """
    values = [
        146.1, 152.7, 146.3, 142.0, 151.8,
        151.4, 145.1, 153.5, 151.2, 143.5,
        158.2, 150.6, 143.5, 151.6, 149.9,
        154.2, 142.4, 154.6, 154.1, 152.8,
        152.4, 155.9, 152.9, 149.9, 145.0,
    ]
    return values

def get_classroom_willow_b():
    """Return the 24 hard-coded sample values for classroom B of the willow school.

    A fresh list is built on every call, so callers may mutate the result.
    (Presumably student heights; the unit is not stated in this notebook.)
    """
    values = [
        142.9, 150.9, 154.0, 146.4, 142.4, 148.7,
        151.4, 154.5, 142.9, 142.5, 152.5, 151.4,
        156.7, 153.9, 148.5, 147.6, 161.9, 147.6,
        145.1, 143.3, 149.5, 147.3, 148.7, 150.4,
    ]
    return values

def get_classroom_birch_a():
    """Return the 25 hard-coded sample values for classroom A of the birch school.

    A fresh list is built on every call, so callers may mutate the result.
    (Presumably student heights; the unit is not stated in this notebook.)
    """
    values = [
        141.8, 150.2, 147.6, 146.6, 153.8,
        149.3, 147.6, 158.0, 146.5, 142.7,
        142.1, 146.4, 152.3, 153.3, 154.7,
        158.3, 157.6, 152.3, 155.8, 152.4,
        146.4, 153.3, 149.5, 148.2, 159.3,
    ]
    return values

def get_classroom_birch_b():
    """Return the 24 hard-coded sample values for classroom B of the birch school.

    A fresh list is built on every call, so callers may mutate the result.
    (Presumably student heights; the unit is not stated in this notebook.)
    """
    values = [
        169.6, 163.3, 177.6, 164.5, 169.5, 168.9,
        168.4, 168.7, 163.3, 163.4, 165.0, 164.0,
        169.9, 173.6, 161.3, 168.5, 160.8, 162.3,
        164.6, 166.3, 163.6, 152.6, 172.8, 164.4,
    ]
    return values


In [5]:
"""
New functions
"""
def repeat_resample_max(sample_a, sample_b, num_iter = 1000):
    """Bootstrap the difference in maxima between two samples.

    For each of ``num_iter`` iterations, resamples ``sample_a`` and then
    ``sample_b`` (each with replacement, via ``resample``) and records
    ``max(resample_a) - max(resample_b)``.

    Returns a list of ``num_iter`` differences, in the order they were drawn.
    """
    def one_difference():
        # Resample A before B so the random draws happen in a fixed order.
        boot_a = resample(sample_a)
        boot_b = resample(sample_b)
        return max(boot_a) - max(boot_b)

    return [one_difference() for _ in range(num_iter)]

def make_normal(dist, p = None):
    """Overlay the normal curve fitted to ``dist`` on a Bokeh figure.

    The curve uses the mean and the population standard deviation
    (``np.std`` default, ``ddof=0``) of ``dist`` and is evaluated at 100
    evenly spaced points spanning three standard deviations either side of
    the mean.

    Parameters
    ----------
    dist : sequence of numbers
        The values the normal curve is fitted to.
    p : figure, optional
        Figure to draw on. It is modified in place. When omitted, a new
        figure is created.

    Returns
    -------
    The figure the red 'Theoretical Normal' line was added to.
    """
    # Fit to the argument itself. The previous version read the notebook
    # global `new_dist`, silently ignoring `dist` (and raising NameError on a
    # fresh kernel where that global does not exist yet).
    mu = np.mean(dist)
    sigma = np.std(dist)
    x = np.linspace(mu - 3*sigma, mu + 3*sigma, 100)
    y = stats.norm.pdf(x, mu, sigma)
    # Compare against None explicitly so a supplied figure is never replaced
    # just because it happens to evaluate as falsy.
    if p is None:
        p = figure()
    p.line(x = x, y = y, legend_label = 'Theoretical Normal', color = 'red')
    return p


# Why do we Resample? 

We resample because:

1. We don't have the population, and resampling mimics drawing from the population.
2. We use the results of resampling to get a confidence interval and to estimate a p value.

However, if we want to find the difference in means between two populations, there is a more direct way. Consider the two classrooms in the birch school. Let's resample and get a new distribution:

In [6]:
# Bootstrap the difference in means (classroom A minus classroom B) for the birch school.
new_dist =repeat_resample(get_classroom_birch_a(),get_classroom_birch_b())
# Density histogram of the resampled differences; `p` is reused by the next cell.
p = hist(new_dist)
show(p)

## Normal Distribution and Central Limit Theorem

The shape of this new distribution is approximately what is called a normal distribution.

https://en.wikipedia.org/wiki/Normal_distribution

This distribution is one of the most common distributions. It looks like this:

In [7]:
# Draw the fitted normal curve onto the existing histogram figure `p` (it is modified in place).
make_normal(new_dist, p )
show(p)


The normal distribution has well known properties. We can use math directly to calculate confidence intervals and p values:

In [8]:
# Compare the two birch classroom means analytically with statsmodels instead of resampling.
cm = sms.CompareMeans(sms.DescrStatsW(get_classroom_birch_a()), sms.DescrStatsW(get_classroom_birch_b()))
# The summary table shows the difference in means with its t statistic, p value and 95% interval.
cm.summary()


0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
subset #1,-15.4808,1.451,-10.672,0.000,-18.399,-12.563


The p value is displayed as 0.000. This is a rounded value, so report it as < .01 rather than as exactly 0. The confidence interval is between -18.4 and -12.6. This was calculated directly through math. Now let's do the same with resampling:

In [9]:
# 95% percentile interval: the 2.5th and 97.5th percentiles of the resampled differences.
l, u = np.percentile(new_dist, 2.5), np.percentile(new_dist, 97.5)
print(l,u)

-18.31930416666667 -12.423950000000026


The two confidence intervals are very close. Note that using math directly is much simpler than resampling. Then why use resampling? Two reasons:

1. It is more intuitive. It is easier to explain than complicated math.
2. More importantly, resampling works with distributions that are not normal.

The most common comparison between two populations is the difference in their means. When we compare the difference in means, we get an approximately normal distribution (even if the two populations themselves are not normal), provided the samples are large enough.

https://en.wikipedia.org/wiki/Central_limit_theorem

However, what if we are not comparing the means between two populations? 

## Example with Max

A scientist believes that the tallest student with freckles will always be taller than the tallest student with no freckles.

In [10]:
# Observed difference between the largest value in willow classroom B and in classroom A (B minus A).
max(get_classroom_willow_b()) - max(get_classroom_willow_a())


3.700000000000017

Is this difference due to randomness, or is it a real effect? Let's resample:

In [11]:
# Bootstrap the difference in maxima for the willow classrooms. repeat_resample_max
# computes max(A) - max(B), the opposite sign to the observed B-minus-A difference above.
# `new_dist` is overwritten with this new distribution.
new_dist =repeat_resample_max(get_classroom_willow_a(),get_classroom_willow_b())
show(hist(new_dist))


Note that this distribution is *not* normally distributed. We cannot use the properties of the normal distribution to calculate a confidence interval or a p value. We must use resampling:

In [12]:
# True when 0 lies strictly inside the 95% percentile interval of the resampled differences.
np.percentile(new_dist, 2.5)< 0 <  np.percentile(new_dist, 97.5)
# 0 is inside the interval, so do not reject the hypothesis of no difference


True

## Summary

* The means of repeated samples (and the differences between such means) approximately follow a normal distribution
* If the distribution is normal, we can calculate confidence intervals and p values directly
* If the distribution is not normal, we should use resampling. Hence, resampling can be used more widely.