In [1]:
import random
import numpy as np

In [2]:
import pandas as pd

In [94]:
from bokeh.io import show
from bokeh.plotting import figure
from bokeh.io import output_notebook
from bokeh.layouts import gridplot


In [3]:
"""This compares two samples. If the samples are drawn from the same population, they will have
the same parameters"""
#create first population
mean1 = 25
sd1 = 3
size1 = 40
pop1 = [random.gauss(mean1, sd1) for x in range(size1)]

# create a second population
mean2 = 15
sd2 = 3
size2 = 20
pop2 = [random.gauss(mean2, sd2) for x in range(size2)]
# true difference between the two
true_diff = np.mean(pop1) - np.mean(pop2)
print('true diff is {t}'.format(t = true_diff))
both = pop1 + pop2
# now do sampling

# we need to keep track of both statistics
mean_diff1 = []
mean_diff2 = []

#shuffle things. Just 10 times
for i in range(10):
    random.shuffle(both) # randomly shuffle
    new_1 = both[0:size1] # new_1 will be a random sample
    new_2 = both[size1:] #new_2 will be a random sample
    diff1 = np.mean(pop1) - np.mean(new_1) # the difference between the pop and the random
    diff2 = np.mean(pop2) - np.mean(new_2) # the difference between the pop and the random
    mean_diff1.append(diff1) #add the difference to the list
    mean_diff2.append(diff2) #add the difference to the list

mean_of_means1 = np.mean(mean_diff1)
mean_of_means2 = np.mean(mean_diff2)
together = np.array(mean_diff1) - np.array(mean_diff2)
print('resampled mean is {m}'.format(m = np.mean(together)))
"""Conclusion: with resampling, the random mean will equal the true mean (as expected)"""


true diff is 10.632624521421821
resampled mean is 10.371995509909393


'Conclusion: with resampling, the random mean will equal the true mean (as expected)'

In [4]:
"""We want to see if the difference is real, or just because of randomness"""
#lets sort the differences of the means
together = sorted(together)
print(together)
"""The number of times that the mean was different than 0 is 10, or 10/10. 
That indicates the difference is greater than just by random. In fact, we can 
calculate a P value. What is the probability that pop1 > pop2? The answer is 
10/10, or 1. P = 1 - 10/10. What is the probability that pop1 is greather than pop2
by 10? That occurs only 1, or a probability of 9/10. P values = 1 - 9/10, or .1"""
print()

[8.074512434815723, 8.252295663813396, 8.72660883179557, 9.558317853345088, 9.700372041772592, 9.7434287737542, 11.4372517347253, 12.238921574153746, 12.914587667094494, 13.07365852382381]



In [5]:
"""Make a function"""
def resample_two_samples(sample1, sample2, num_iterations = 100):
    both = sample1 + sample2
    mean_diff1 = []
    mean_diff2 = []
    for i in range(num_iterations):
        random.shuffle(both) 
        new_1 = both[0:len(sample1)] 
        new_2 = both[len(sample1):] 
        diff1 = np.mean(sample1) - np.mean(new_1) 
        diff2 = np.mean(sample2) - np.mean(new_2) 
        mean_diff1.append(diff1) 
        mean_diff2.append(diff2) 
    return mean_diff1, mean_diff2

# let's test
a, b = resample_two_samples(pop1, pop2)

In [190]:
"""
Let's visualialize the data. 
"""
pop1 = [random.gauss(7, 1) for x in range(40)]
pop2 = [random.gauss(6.2, 1) for x in range(40)]
real_diff = np.mean(pop1) - np.mean(pop2)
print('real difference is {r}'.format(r = real_diff))
a,b = resample_two_samples(pop1, pop2, num_iterations=1000)
together = np.array(a) - np.array(b)

hist, edges = np.histogram(together, bins = 10, density=True)
p_hist = figure()
p_hist.quad(top = hist, bottom=0, left=edges[:-1], right=edges[1:], alpha = .4)
show(p_hist)

real difference is 0.6511911008505997


In [188]:
print("""The way to interpret the graph is: this shows the results from random sampling if the 
samples were drawn from the same population. If the two samples were drawn from the same
population, we shoud see the mean of 0, with a the right tail at about 1, and left tail at -1.\n
But we see the mean is at 1.25. Importantly, we see that the difference of 0 occurrs almost 
not at all. In another words, if the difference between the two sample were random, we would 
see 0 less than 1% of the time (calculated below).

We reject the null hypothesis, that the two samples came from the same population.""")


The way to interpret the graph is: this shows the results from random sampling if the 
samples were drawn from the same population. If the two samples were drawn from the same
population, we shoud see the mean of 0, with a the right tail at about 1, and left tail at -1.

But we see the mean is at 1.25. Importantly, we see that the difference of 0 occurrs almost 
not at all. In another words, if the difference between the two sample were random, we would 
see 0 less than 5% of the time (calculated below).

We reject the null hypothesis, that the two samples came from the same population.


In [191]:
#let's get a p value, the probability that if the null hypothesis were true, we would see the data
len_of_data = len(together)
print('length of data is {l}'.format(l = len_of_data))
# list of valus > 0
greater_than_zero = [x for x in together if x > 0]
print(greater_than_zero[0:5]) # just a test
len_greater_than_zero = len(greater_than_zero)
print('number greater than zero is {n}'.format(n = len_greater_than_zero))
probability_gt_0 = len_greater_than_zero/len_of_data
print(probability_gt_0)
# the probability that 0 will occur is 1 minus this number
p_value = 1 - probability_gt_0
print('p values is {p}'.format(p = p_value))


length of data is 1000
[1.042334893723023, 0.4727128644206431, 0.29946293659650713, 0.6843044984896451, 0.055206741971785434]
number greater than zero is 995
0.995
p values is 0.0050000000000000044


In [130]:
def do_stats(res1, res2, diff = 0, two_tail = False):
    """
    diff res1 - res2
    """
    mean1 = np.mean(res1)
    mean2 = np.mean(res2)
    together = np.array(res1) - np.array(res2)
    all_diff = [x for x in together if x > diff]
    num_diff = len(all_diff)
    p = 1 - num_diff/len(res1)
    d = {'p': p,
         'mean': np.mean(together)
        
    }
    
    return d
# let's test
d = resample_two_samples(pop1, pop2)
#do_stats(a,b)

In [98]:
def do_hist(x, bins=10):
  hist, edges = np.histogram(x, bins = bins, density=True)
  p_hist = figure()
  p_hist.quad(top = hist, bottom=0, left=edges[:-1], right=edges[1:], alpha = .4)
  return p_hist

In [157]:
"""Want to test this using traditonals stats"""
from scipy import stats
def testit1(m1, m2, s1, s2):
    the_pop1 = [random.gauss(m1, s1) for x in range(size1)]
    the_pop2 = [random.gauss(m2, s2) for x in range(size2)]
    p_theory = stats.ttest_ind(the_pop1,the_pop2)[1]
    a,b = resample_two_samples(the_pop1, the_pop2, num_iterations=1000)
    together = np.array(a) - np.array(b)
    p_fig = do_hist(together)
    p_resample = do_stats(b,a)['p']
    print('theoretical is {t} and resample is {r}'.format(
        t = round(p_theory/2,2), r = round(p_resample,2)))

testit1(m1 = 6, m2 = 7, s1 = 5, s2 = 3)
    

theoretical is 0.03 and resample is 0.04


In [107]:
path = 'data/states.csv'
DF = pd.read_csv(path)

In [156]:
def diff_week(df, last_index = -14):
    all_states = list(set(df['state']))
    for i in sorted(all_states):
        df_ = df[df['state'] == i]
        deaths = df_['deaths'].tolist()
        deaths_last = deaths[last_index:-7]
        deaths_cur = deaths[-7:]
        if np.mean(deaths_last) > np.mean(deaths_cur):
            a, b = resample_two_samples(deaths_last, deaths_cur, num_iterations = 100)
        else:
            a, b = resample_two_samples(deaths_cur, deaths_last, num_iterations = 100)
        the_dict  = do_stats(a,b)
        print(i,round(np.mean(deaths_last)),round(np.mean(deaths_cur)), round(the_dict['p'],2 ))
diff_week(DF, last_index = -21)

Alabama 13.0 10.0 0.28
Alaska 0.0 0.0 0.67
Arizona 20.0 22.0 0.4
Arkansas 3.0 1.0 0.1
California 75.0 71.0 0.41
Colorado 21.0 34.0 0.03
Connecticut 68.0 58.0 0.22
Delaware 7.0 9.0 0.16
District of Columbia 10.0 8.0 0.13
Florida 44.0 38.0 0.29
Georgia 29.0 25.0 0.29
Guam 0.0 0.0 1.0
Hawaii 0.0 0.0 0.71
Idaho 1.0 1.0 0.22
Illinois 114.0 105.0 0.4
Indiana 40.0 35.0 0.3
Iowa 11.0 13.0 0.27
Kansas 3.0 3.0 0.39
Kentucky 7.0 7.0 0.43
Louisiana 38.0 32.0 0.35
Maine 1.0 1.0 0.5
Maryland 52.0 45.0 0.12
Massachusetts 136.0 107.0 0.06
Michigan 75.0 49.0 0.03
Minnesota 23.0 21.0 0.33
Mississippi 15.0 15.0 0.49
Missouri 16.0 14.0 0.36
Montana 0.0 0.0 1.0
Nebraska 3.0 5.0 0.03
Nevada 7.0 7.0 0.51
New Hampshire 6.0 6.0 0.55
New Jersey 209.0 149.0 0.09
New Mexico 8.0 7.0 0.21
New York 297.0 156.0 0.0
North Carolina 18.0 15.0 0.21
North Dakota 2.0 1.0 0.42
Northern Mariana Islands 0.0 0.0 1.0
Ohio 39.0 43.0 0.32
Oklahoma 5.0 3.0 0.17
Oregon 2.0 1.0 0.22
Pennsylvania 122.0 99.0 0.31
Puerto Rico 2.0 1.0 0

In [207]:
# let's do Idaho
df_ = DF[DF['state'] == 'Alabama']
deaths = df_['deaths'].tolist()
deaths_last = deaths[-14:-7]
deaths_cur = deaths[-7:]
print(np.mean(deaths_last), np.mean(deaths_cur))
a, b = resample_two_samples(deaths_last, deaths_cur, num_iterations = 100)
together = np.array(a) - np.array(b)
show(do_hist(together))

15.285714285714286 10.285714285714286
