In [1]:
import random
import numpy as np

In [2]:
from bokeh.io import show
from bokeh.plotting import figure
from bokeh.io import output_notebook
from bokeh.layouts import gridplot
output_notebook()

## Confidence Intervals and Two-Tail Tests
### goals:
* Understand the confidence intervals
* understand a two-tail test


In [3]:
"Please skip to below. This just copies code from the previous notebook"
#NOTE!! Do not use seed for results. Just using this for instructional purposes
random.seed(13)
def resample(l):
    final = []
    for i in range(len(l)):
        final.append(random.choice(l))
    return final

def repeat_resample(sample_a, sample_b, num_iter = 1000):
    difference_in_means = []#keep track of the difference in heights for each experiment
    for i in range(num_iter):
        resample_a = resample(sample_a)
        resample_b = resample(sample_b)
        difference = np.mean(resample_a) - np.mean(resample_b)
        difference_in_means.append(difference)
    return difference_in_means

def make_grid(sample_a, sample_b):
    resample_a = resample(sample_a)
    resample_b = resample(sample_b)
    p1= make_bar(labels = [x for x in range(len(resample_a))], nums = sorted(resample_a), 
               title = "Resample A", y_range = (120, 180))
    p2 = make_bar(labels = [x for x in range(len(resample_b))], nums = sorted(resample_b), 
               title = "Resample B", y_range = (120, 180))
    grid = gridplot([p1, p2], ncols = 2)
    print('the mean of resample 1 is {r1} and the mean of resample2 is {r2}'.format(
        r1 = np.mean(resample_a), r2 = np.mean(resample_b)))
    return grid

def hist(l):
    hist, edges = np.histogram(l, density=True)
    p = figure()
    p.quad(top = hist, bottom=0, left=edges[:-1], right=edges[1:], alpha = .4)
    return p

def get_classroom_willow_a():
    return [146.1,152.7, 146.3, 142.0, 151.8,
     151.4, 145.1, 153.5, 151.2, 143.5, 158.2,
     150.6, 143.5, 151.6, 149.9, 154.2, 142.4,
     154.6, 154.1, 152.8, 152.4, 155.9, 152.9,
     149.9, 145.0]

def get_classroom_willow_b():
    return [142.9, 150.9, 154.0, 146.4, 142.4,
     148.7, 151.4, 154.5, 142.9, 142.5, 152.5,
     151.4, 156.7, 153.9, 148.5, 147.6, 161.9,
     147.6, 145.1, 143.3, 149.5, 147.3,
     148.7, 150.4]

WILLOW_A = get_classroom_willow_a()
WILLOW_B = get_classroom_willow_b()

def get_classroom_birch_a():
    return [141.8, 150.2, 147.6, 146.6, 153.8,
     149.3, 147.6, 158.0, 146.5, 142.7, 142.1,
     146.4, 152.3, 153.3, 154.7, 158.3, 157.6,
     152.3, 155.8, 152.4, 146.4, 153.3, 149.5,
     148.2, 159.3]

def get_classroom_birch_b():
    return [169.6, 163.3,  177.6, 164.5,
     169.5, 168.9, 168.4, 168.7, 163.3, 163.4,
     165.0, 164.0, 169.9, 173.6, 161.3, 168.5,
     160.8, 162.3, 164.6, 166.3, 163.6, 152.6,
     172.8, 164.4]

BIRCH_A = get_classroom_birch_a()
BIRCH_B = get_classroom_birch_b()


## Confidence Interval
Let us revisit the two classrooms from the Willow School. We went to the city and found a sample of kids with freckles and a sample of kids with no freckes. We recoreded the means of each group and found the difference between them. We repeated the experiment 1,000 times and recorded the result each time. This new set of numbers is a new sample. The histogram looks like this:

![title](data/hist.jpeg)

Note that some results occur a lot, those in the middle. For example, we see that there is a difference of 0 about 25% of the time. On the other hand, some results do not occurr that often. -2 occurs less than 2.5% of the time, and 4 occurrs less than 2.5% of the time. That means that 95% of the time, the differences lie between -2 and 4. If we conduct this experiment again and again, 95% of the time we would find that the difference would be no less than -2, and no greater than 4. This is the 95% confidence interval. We think that the population mean lies between -2 and 4. 

It appears the mean is approxtimately 1. We can calculate the mean and confidence interval more precisely and directly. 

In [4]:
RESAMPLES_WILLOW = repeat_resample(WILLOW_A, WILLOW_B)
print("the mean of the resamples is {m}".format(m = np.mean(RESAMPLES_WILLOW)))
print("the lower 2.5% is {l}".format(l = np.percentile(RESAMPLES_WILLOW, 2.5)))
print("the upper 97.5% is {l}".format(l = np.percentile(RESAMPLES_WILLOW, 97.5)))

the mean of the resamples is 0.8203034999999992
the lower 2.5% is -1.7721666666666378
the upper 97.5% is 3.259370833333325


The mean is approxtimately .8, and the 95% confidence interval lies between -1.7 and 3.3. Remember, the reason we need statistics is because we have samples and we are trying to estimate the true population. In this case, we can say that the  difference between the freckles and non freckles is .8. But it could be as little as -1.7, or as great as 3.3. We can't know for sure.

If you tell your friend that students with freckles are taller than .8cm. But your friend answers that the true difference might be as low as -1.7. So we can't really be sure there is a difference. In other words, we do not reject the null hypothesis. 

## Two Tail Test
Previously, we tested only if one sample was greater than the other. Now we can perform a different type of test: is one of the samples different than the other sample, either greater or less? 
### Formal Procedure. 

In order to determine if a result is significant, we do the following steps:
1. Determine your "test statistic," what you are testing. Usually this will be 0. 
2. Resample to get a sample of means
3. Determine the confidence interval.
4. If the test statistic lies outside of the confidence interval, (what is know as the "reject regions") reject the null hypothesis. Otherwise, do not reject the null hypothesis.

In [5]:
print('we want to determine if the two samples are different, so our test statistic is 0')
#get the reasmpled means
# (just repeating the step from above, for clarity)
resamples_of_the_means = repeat_resample(WILLOW_A, WILLOW_B)
lower = np.percentile(RESAMPLES_WILLOW, 2.5)
upper = np.percentile(RESAMPLES_WILLOW, 97.5)
print('The 95% interval lies between {l} and {u}'.format(
    l = lower, u = upper))
lies_inside =   lower < 0 < upper
if lies_inside:
    print('0 lies within the confidence interval. We will not reject the null hypothesis.')
else:
    print('0 does not lie within the confidence interval. We will  reject the null hypothesis.')




we want to determine if the two samples are different, so our test statistic is 0
The 95% interval lies between -1.7721666666666378 and 3.259370833333325
0 lies within the confidence interval. We will not reject the null hypothesis.


Let's perform the same procedure with the Birch School.

In [6]:
RESAMPLES_BIRCH = repeat_resample(BIRCH_B, BIRCH_A)
show(hist(RESAMPLES_BIRCH))


It appears that the average difference is 16cm. The lowest in the 95% interval is 12cm, and the highest is 18cm. You tell your friend that the 17-year olds are taller than the 12-year olds by 16cm. Even with randomness, the lowest is at least 12cm. 0 does not lie within the 95% interval, so it is unlikely the population contains 0. So we can be certain the difference is real and not just due to randomness.

In [7]:
mean_birch = np.mean(RESAMPLES_BIRCH)
lower_birch = np.percentile(RESAMPLES_BIRCH, 2.5)
upper_birch = np.percentile(RESAMPLES_BIRCH, 97.5)
print('The 95% interval lies between {l} and {u}'.format(
    l = lower_birch, u = upper_birch))
lies_inside =   lower_birch < 0 < upper_birch
if lies_inside:
    print('0 lies within the confidence interval. We will not reject the null hypothesis.')
else:
    print('0 does not lie within the confidence interval. We will reject the null hypothesis.')

The 95% interval lies between 12.75349166666668 and 18.192141666666675
0 does not lie within the confidence interval. We will reject the null hypothesis.


In [8]:
# is the class of 17-year olds taller than the class of 12-year olds by 12cm? (Is 12cm significant?)
# test statistic is 12
lies_inside =   lower_birch < 12 < upper_birch
lies_inside
# is significant. 12 is not certain, so not likely to be in the population


False

In [9]:
# is 14 cm significant? 
# test statistic is 14
lies_inside =   lower_birch < 14 < upper_birch
lies_inside
# not significant. 14 is somewhat certain, and could lie within the population

True

## p value for confidence intervals
The last section we calculated a p value. A p value is implied when we use confidence intervals. If we use a 95% confidence interval, our p vaue is <= .05. In other words, if the reject region lies in the 5% region, that is the same as saying "the probability of the null hypothesis being true is .05." 

If we were to use a 99% confidence interval, our p value would be .01.

## Summary

* We resample to estimate the population
* The differences between two samples can be quantified by a confidence interval. A 95% confidence interval means there is a 95% probability that the true (population) mean lies within that interval. 
* If the test statistic lies outside the confidence interval, it is population contains the test statistic. For example, if 0 lies in the reject region, then the true difference between the samples is not 0 (and we reject the null hypothesis). 
* The p value is equal to 1 - confidence interval. For a 95% confidence interval, the p vaue = 1 - .95 = .05


