In [1]:
import random
import numpy as np

In [2]:
import pandas as pd

In [3]:
from bokeh.io import show
from bokeh.plotting import figure
from bokeh.io import output_notebook
from bokeh.layouts import gridplot


In [199]:
"""This compares two samples. If the samples are drawn from the same population, they will have
the same parameters"""
# create the first sample (kept under the name pop1)
mean1 = 25
sd1 = 3
size1 = 40
pop1 = [random.gauss(mean1, sd1) for _ in range(size1)]

# create the second sample (kept under the name pop2)
mean2 = 23
sd2 = 3
size2 = 20
pop2 = [random.gauss(mean2, sd2) for _ in range(size2)]

# observed difference between the two sample means
true_diff = np.mean(pop1) - np.mean(pop2)
print('true diff is {t}'.format(t = true_diff))

# now do the resampling: pool both samples, shuffle the pool and split it
# back into two groups of the original sizes, keeping both group means
mean_sample1 = []
mean_sample2 = []
both = pop1 + pop2
for i in range(1000):
    random.shuffle(both)  # random relabelling of the pooled values
    new_1 = both[:size1]  # first size1 values stand in for sample 1
    new_2 = both[size1:]  # the remaining values stand in for sample 2
    mean_sample1.append(np.mean(new_1))
    mean_sample2.append(np.mean(new_2))

# histogram of the resampled means of group 1; the title reports the mean
# of the original first sample
hist, edges = np.histogram(mean_sample1, bins = 10, density=True)
p1 = figure(title = 'Resample 1 with mean of {m}'.format(
    m = str(round(np.mean(pop1),1))), width = 350, height = 350)
p1.quad(top = hist, bottom=0, left=edges[:-1], right=edges[1:], alpha = .4)

# histogram of the resampled means of group 2; the title reports the mean
# of the original second sample
hist, edges = np.histogram(mean_sample2, bins = 10, density=True)
p2 = figure(title = 'Resample 2 with mean of {m}'.format(
    m = str(round(np.mean(pop2),1))), width = 350, height = 350)
p2.quad(top = hist, bottom=0, left=edges[:-1], right=edges[1:], alpha = .4)

# show both histograms side by side
grid = gridplot([p1, p2], ncols = 2)
show(grid)

true diff is 1.843017996537867


In [200]:
# Written interpretation of the two resampling histograms; it is printed so
# that the text is stored with the cell output.
print("""If the two samples came from the same population, then the samples would have the 
same parameters. The first sample should have a mean of 25. But it does not. The second sample
should have a mean of 23. It does not.

In fact, both means are outside what we would expect (calculated below), so we reject the 
null hypothesis that the samples came from the same population. The differences in means 
are not due to randomness.
""")

If the two samples came from the same population, then the samples would have the 
same parameters. The first sample should have a mean of 25. But it does not. The second sample
should have a mean of 23. It does not.

In fact, both means are outside what we would expect (calculated below), so we reject the 
null hypothesis that the samples came from the same population. The differences in means 
are not due to randomness.



In [201]:
# Express every resampled mean as its distance from the observed mean of the
# corresponding sample (observed mean minus resampled mean).
# NOTE: do_hist is defined in a later cell of this notebook (In [294]); on a
# fresh kernel that cell has to be run before this one.
observed_mean1 = np.mean(pop1)
observed_mean2 = np.mean(pop2)
diff1 = [observed_mean1 - x for x in mean_sample1]
diff2 = [observed_mean2 - x for x in mean_sample2]
p1 = do_hist(diff1, title = 'Resample Pop 1')
p2 = do_hist(diff2, title = 'Resample Pop 2')
grid = gridplot([p1, p2], ncols = 2)
show(grid)
print("""We have refactored the same data to show the differences between the sample
mean, and the resample mean. The shape of the histograms is exactly the same
as above, but with the X axis showing the difference between the mean and the 
means we got from resampling.

Note that in both cases, the differences from 0 is very rare, showing we should reject
the null hypothesis.
""")

We have refactored the same data to show the differences between the sample
mean, and the resample mean. The shape of the histograms is exactly the same
as above, but with the X axis showing the difference between the mean and the 
means we got from resampling.

Note that in both cases, the differences from 0 is very rare, showing we should reject
the null hypothesis.



In [202]:
# p values
# Each value is the fraction of resamples that fall on one side of the
# observed sample statistic.
observed_mean1 = np.mean(pop1)
observed_mean2 = np.mean(pop2)
# fraction of resampled group-1 means that are greater than sample mean 1
p_value1 = len([x for x in mean_sample1 if x > observed_mean1])/len(mean_sample1)
# fraction of resampled group-2 means that are less than sample mean 2
# (divided by the length of the list that is actually being counted)
p_value2 = len([x for x in mean_sample2 if x < observed_mean2])/len(mean_sample2)
# fraction of (sample mean 1 - resampled mean 1) differences that are below 0
p_value3 = len([x for x in diff1 if x < 0])/len(diff1)
# fraction of (sample mean 2 - resampled mean 2) differences that are above 0
p_value4 = len([x for x in diff2 if x > 0])/len(diff2)

print(p_value1, p_value2, p_value3, p_value4)
print("""\nP Value is the probability that the data would occurr, if the null hypothesis
were true. We see low P Values. So the probability that these two samples 
were drawn from the same population is low. """)


0.007 0.007 0.007 0.007

P Value is the probability that the data would occurr, if the null hypothesis
were true. We see low P Values. So the probability that these two samples 
were drawn from the same population is low. 


In [203]:
# Combine the two sets of differences into one array (diff1 minus diff2,
# element by element).
both = np.asarray(diff1) - np.asarray(diff2)
# Share of the combined differences that are strictly positive.
positive_count = sum(1 for value in both if value > 0)
p_both = positive_count/len(both)
show(do_hist(both))
print("The P(A > B) is {p}".format(p = p_both))

The P(A > B) is 0.993


In [240]:
"""Make  functiond"""
def resample_two_samples(sample1, sample2, num_iterations = 100):
    """
    Shuffle-resample two samples and collect the resampled group means.

    The values of both samples are pooled. On every iteration the pool is
    shuffled and split into a first group with as many values as sample1
    and a second group holding the rest; the mean of each group is recorded.

    parameters:
       sample1: iterable of numbers (list, tuple, numpy array, ...)
       sample2: iterable of numbers
       num_iterations: how many times the pool is shuffled
    returns:
        a tuple (mean1, mean2) of two lists, each of length num_iterations,
        holding the first-group and second-group means of every shuffle
    """
    # Copy into plain lists before pooling: `+` concatenates lists but adds
    # numpy arrays element-wise, and random.shuffle cannot reorder a tuple.
    # Working on a copy also leaves the caller's data untouched.
    first = list(sample1)
    pooled = first + list(sample2)
    split_at = len(first)
    mean1 = []
    mean2 = []
    for _ in range(num_iterations):
        random.shuffle(pooled)
        mean1.append(np.mean(pooled[:split_at]))
        mean2.append(np.mean(pooled[split_at:]))
    return mean1, mean2

def combine_resamples(sample1, sample2, resample1, resample2):
    """
    Turn two lists of resampled means into one numpy array of differences.

    Every resampled mean is first subtracted from the mean of its own
    original sample. The two resulting series are then subtracted from each
    other element by element, ordered so that the series with the larger
    average comes first. The average of the returned array is therefore never
    negative, although individual entries can be.

    parameters:
       sample1: list of the original sample 1
       sample2: list of the original sample 2
       resample1: resampled means belonging to sample 1
       resample2: resampled means belonging to sample 2
                  (same length as resample1)
    returns:
        numpy array with one difference per resample

    Example:
       sample1 = [1, 2, 3], sample2 = [4, 5, 6]
       resample1 = [2, 2.5], resample2 = [5, 4.5]

       returns array([0., 1.])
    """
    centre1 = np.mean(sample1)
    centre2 = np.mean(sample2)
    offsets1 = np.array([centre1 - value for value in resample1])
    offsets2 = np.array([centre2 - value for value in resample2])
    if np.mean(offsets1) > np.mean(offsets2):
        return offsets1 - offsets2
    # smaller or equal average: subtract the other way round
    return offsets2 - offsets1

def get_p_value(l, v = 0):
    """
    Return one minus the fraction of values in l that are greater than v.

    parameters:
       l: non-empty sequence of numbers
       v: threshold the values are compared against (default 0)
    returns:
        a number between 0 and 1
    raises:
        ZeroDivisionError when l is empty
    """
    above = sum(1 for item in l if item > v)
    return 1 - above/len(l)
    

# let's test the helpers on the two samples built earlier
# (num_iterations is left at its default of 100)
a, b = resample_two_samples(pop1, pop2)
# want to combine both lists of resampled means into one array of differences
both = combine_resamples(pop1, pop2, a, b)
# one minus the share of differences greater than 5; shown as the cell output
get_p_value(both, 5)

1.0

In [231]:
# 30 values drawn around 60 (sd 2) and only 3 values drawn around 72 (sd 2)
pop3 = [random.gauss(60, 2) for x in range(30)]
pop4 = [random.gauss(72, 2) for x in range(3)]
# a holds the resampled means of the 30-value group, b those of the 3-value group
a,b = resample_two_samples(pop3, pop4, num_iterations = 1000)
p1 = do_hist(a, title = 'average students')
p2 = do_hist(b, title = 'tall students')
grid = gridplot([p1, p2], ncols = 2)
print('mean of pop3 is {m}'.format(m = np.mean(pop3)))
show(grid)


mean of pop3 is 60.07979882302005


In [300]:
# Two samples of 30 values each with means 6 and 7 and the same sd of 3.
pop3 = [random.gauss(6, 3) for x in range(30)]

pop4 = [random.gauss(7, 3) for x in range(30)]
# Resample 1000 times and merge the resampled means into one array of differences.
a,b = resample_two_samples(pop3, pop4, num_iterations = 1000)
both = combine_resamples(pop3, pop4, a, b)
show(do_hist(both, title = 'Difference'))
# One minus the share of differences above 0; shown as the cell output.
get_p_value(both)


0.258

In [294]:
def do_hist(x, bins=10, width = 350, height = 350, title = None):
  """
  Build a bokeh figure holding a density histogram of x.

  parameters:
     x: the values to bin (handed to np.histogram)
     bins: bins argument handed to np.histogram
     width: figure width handed to bokeh
     height: figure height handed to bokeh
     title: optional figure title
  returns:
      the bokeh figure; the caller decides when to show it
  """
  densities, bin_edges = np.histogram(x, bins = bins, density=True)
  plot = figure(width = width, height = height, title = title)
  # one rectangle per bin: from the bin's left edge to its right edge,
  # rising from 0 to the bin's density
  plot.quad(top = densities, bottom=0, left=bin_edges[:-1],
            right=bin_edges[1:], alpha = .4)
  return plot

In [311]:
"""Want to test this using traditonals stats"""
from scipy import stats
def testit1(m1, m2, s1, s2, size1 = 30, size = 30):
    """
    Compare the p value of a two-sample t-test with the p value obtained by
    resampling, and print both rounded to two decimals.

    parameters:
       m1, s1: mean and standard deviation used to draw the first sample
       m2, s2: mean and standard deviation used to draw the second sample
       size1: number of values in the first sample
       size: number of values in the second sample
    returns:
        None; the comparison is printed
    """
    pop1 = [random.gauss(m1, s1) for _ in range(size1)]
    # The second sample is sized by the `size` argument rather than by a
    # notebook-level variable, so the function no longer depends on which
    # cells were run before it.
    pop2 = [random.gauss(m2, s2) for _ in range(size)]
    # index 1 of the t-test result is its p value
    p_theory = stats.ttest_ind(pop1, pop2)[1]
    a, b = resample_two_samples(pop1, pop2, num_iterations=1000)
    both = combine_resamples(pop1, pop2, a, b)
    p_resample = get_p_value(both)
    # the t-test p value is halved before it is printed next to the
    # resampling value
    print('theoretical is {t} and resample is {r}'.format(
        t = round(p_theory/2,2), r = round(p_resample,2)))

# Print the t-test and resampling p values for samples with means 6 and 7 and sd 3.
testit1(m1 = 6, m2 = 7, s1 = 3, s2 = 3)
    

theoretical is 0.23 and resample is 0.24


In [303]:
# Load the states table; later cells read its 'state' and 'deaths' columns.
path = 'data/states.csv'
DF = pd.read_csv(path)

In [314]:
def diff_week(df, last_index = -14):
    """
    For every state, compare the last 7 'deaths' values with the values just
    before them by resampling, and print one summary line per state.

    parameters:
       df: DataFrame with a 'state' column and a 'deaths' column
           (presumably one row per state and day, in date order -- confirm)
       last_index: negative index where the earlier window starts; that
           window runs from last_index up to, but not including, index -7
    returns:
        None; each line shows the state, the rounded mean of the earlier
        window, the rounded mean of the last 7 values and the rounded
        resampling p value
    """
    for state in sorted(set(df['state'])):
        state_deaths = df[df['state'] == state]['deaths'].tolist()
        earlier = state_deaths[last_index:-7]
        latest = state_deaths[-7:]
        means_earlier, means_latest = resample_two_samples(
            earlier, latest, num_iterations = 100)
        differences = combine_resamples(earlier, latest, means_earlier, means_latest)
        p_resample = get_p_value(differences)
        print(state, round(np.mean(earlier)), round(np.mean(latest)), round(p_resample,2 ))
# Compare each state's last 7 'deaths' values with the 14 values before them (index -21 up to -7).
diff_week(DF, last_index = -21)

Alabama 13.0 10.0 0.28
Alaska 0.0 0.0 0.68
Arizona 20.0 22.0 0.35
Arkansas 3.0 1.0 0.1
California 75.0 71.0 0.37
Colorado 21.0 34.0 0.02
Connecticut 68.0 58.0 0.31
Delaware 7.0 9.0 0.24
District of Columbia 10.0 8.0 0.15
Florida 44.0 38.0 0.31
Georgia 29.0 25.0 0.38
Guam 0.0 0.0 1.0
Hawaii 0.0 0.0 0.7
Idaho 1.0 1.0 0.27
Illinois 114.0 105.0 0.32
Indiana 40.0 35.0 0.31
Iowa 11.0 13.0 0.31
Kansas 3.0 3.0 0.29
Kentucky 7.0 7.0 0.43
Louisiana 38.0 32.0 0.21
Maine 1.0 1.0 0.59
Maryland 52.0 45.0 0.14
Massachusetts 136.0 107.0 0.13
Michigan 75.0 49.0 0.12
Minnesota 23.0 21.0 0.31
Mississippi 15.0 15.0 0.49
Missouri 16.0 14.0 0.32
Montana 0.0 0.0 1.0
Nebraska 3.0 5.0 0.04
Nevada 7.0 7.0 0.5
New Hampshire 6.0 6.0 0.5
New Jersey 209.0 149.0 0.14
New Mexico 8.0 7.0 0.27
New York 297.0 156.0 0.0
North Carolina 18.0 15.0 0.23
North Dakota 2.0 1.0 0.51
Northern Mariana Islands 0.0 0.0 1.0
Ohio 39.0 43.0 0.38
Oklahoma 5.0 3.0 0.17
Oregon 2.0 1.0 0.16
Pennsylvania 122.0 99.0 0.29
Puerto Rico 2.0 1.0 

In [316]:
# let's do Alabama
df_ = DF[DF['state'] == 'Alabama']
deaths = df_['deaths'].tolist()
# the 7 values before the last 7, and the last 7 values
deaths_last = deaths[-14:-7]
deaths_cur = deaths[-7:]
print(np.mean(deaths_last), np.mean(deaths_cur))
# resample 100 times and plot the combined differences
a, b = resample_two_samples(deaths_last, deaths_cur, num_iterations = 100)
both = combine_resamples(deaths_last, deaths_cur, a,b)
show(do_hist(both))

15.285714285714286 10.285714285714286
