In [2]:
import random
import numpy as np

In [17]:
import pandas as pd

In [120]:
"""This compares two samples. If the samples are drawn from the same population, they will have
the same parameters"""
# Simulate the first sample: size1 draws from a normal distribution with
# mean mean1 and standard deviation sd1.
# NOTE(review): no random seed is set in this cell, so the printed numbers
# change on every run.
mean1 = 25
sd1 = 3
size1 = 40
pop1 = [random.gauss(mean1, sd1) for x in range(size1)]

# Simulate the second sample the same way (lower mean, half the size).
mean2 = 15
sd2 = 3
size2 = 20
pop2 = [random.gauss(mean2, sd2) for x in range(size2)]
# Observed difference between the two simulated samples' means. This is the
# difference of the drawn values, not the difference of the configured
# means (mean1 - mean2).
true_diff = np.mean(pop1) - np.mean(pop2)
print('true diff is {t}'.format(t = true_diff))
# Pool the two samples. `+` builds a new list, so the in-place shuffles below
# reorder `both` only and leave pop1 / pop2 untouched.
both = pop1 + pop2
# now do sampling

# One list per group: each collects, per shuffle, (original group mean) minus
# (mean of the reshuffled group of the same size).
mean_diff1 = []
mean_diff2 = []

# Reshuffle the pooled values 10 times.
for i in range(10):
    random.shuffle(both) # in-place random permutation of the pooled values
    new_1 = both[0:size1] # first size1 shuffled values stand in for group 1
    new_2 = both[size1:] # the remaining size2 values stand in for group 2
    diff1 = np.mean(pop1) - np.mean(new_1) # original group-1 mean minus reshuffled group-1 mean
    diff2 = np.mean(pop2) - np.mean(new_2) # original group-2 mean minus reshuffled group-2 mean
    mean_diff1.append(diff1) # record this shuffle's group-1 difference
    mean_diff2.append(diff2) # record this shuffle's group-2 difference

# Averages of the per-shuffle differences; not read again within this cell.
mean_of_means1 = np.mean(mean_diff1)
mean_of_means2 = np.mean(mean_diff2)
# Element-wise diff1 - diff2. Algebraically each entry equals
# true_diff - (mean(new_1) - mean(new_2)) for that shuffle.
together = np.array(mean_diff1) - np.array(mean_diff2)
print('resampled mean is {m}'.format(m = np.mean(together)))
# The bare string below is the cell's last expression, so the notebook echoes
# it as the cell's output value.
"""Conclusion: with resampling, the random mean will equal the true mean (as expected)"""


true diff is 9.139629843029216
resampled mean is 9.108940377189565


'Conclusion: with resampling, the random mean will equal the true mean (as expected)'

In [124]:
"""We want to see if the difference is real, or just because of randomness"""
# Sort the per-shuffle differences in ascending order so they can be read off
# against a threshold. sorted() returns a plain list, so `together` is rebound
# from whatever it was before (a NumPy array if the simulation cell ran last)
# to a list.
together = sorted(together)
print(together)
# NOTE(review): in the output recorded with this cell, two of the ten values
# exceed 10; the figures quoted in the narrative string that follows (one
# occurrence, P = .1) do not match that run or its own 1 - count/n rule.
# TODO reconcile the text with the printed list.
"""The number of times that the mean was different than 0 is 10, or 10/10. 
That indicates the difference is greater than just by random. In fact, we can 
calculate a P value. What is the probability that pop1 > pop2? The answer is 
10/10, or 1. P = 1 - 10/10. What is the probability that pop1 is greather than pop2
by 10? That occurs only 1, or a probability of 9/10. P values = 1 - 9/10, or .1"""
print()

[5.714602836851952, 8.013867009470165, 8.216370910493215, 8.933871612005905, 9.472305860346541, 9.484429781532938, 9.914935867963631, 9.990911240966764, 10.424107720749074, 10.924000931515465]



In [50]:
"""Make a function"""
def resample_two_samples(sample1, sample2, num_iterations = 100):
    """Shuffle the pooled samples repeatedly and record how each group's mean moves.

    On every iteration the pooled values are shuffled in place, the first
    ``len(sample1)`` values are treated as a stand-in for group 1 and the
    rest as a stand-in for group 2.

    Parameters
    ----------
    sample1, sample2 : iterable of numbers
        The two observed samples. Any iterable is accepted (list, tuple,
        NumPy array, pandas Series); the values are copied into a new list,
        so the inputs are never reordered.
    num_iterations : int, default 100
        Number of shuffles to perform.

    Returns
    -------
    (mean_diff1, mean_diff2) : tuple of two lists, each ``num_iterations`` long
        ``mean_diff1[i]`` is ``mean(sample1) - mean(reshuffled group 1)`` and
        ``mean_diff2[i]`` is ``mean(sample2) - mean(reshuffled group 2)`` for
        shuffle ``i``.

    Notes
    -----
    Uses the module-level ``random`` generator; call ``random.seed(...)``
    beforehand for reproducible results.
    """
    # Materialise as lists before concatenating: `+` on NumPy arrays or
    # pandas Series would add element-wise instead of pooling the values.
    first = list(sample1)
    second = list(sample2)
    pooled = first + second
    split = len(first)

    # The observed means do not change between shuffles, so compute them once.
    observed1 = np.mean(first)
    observed2 = np.mean(second)

    mean_diff1 = []
    mean_diff2 = []
    for _ in range(num_iterations):
        random.shuffle(pooled)
        mean_diff1.append(observed1 - np.mean(pooled[:split]))
        mean_diff2.append(observed2 - np.mean(pooled[split:]))
    return mean_diff1, mean_diff2

# Smoke test: run the resampler on the two simulated samples with the default
# number of iterations. `a` receives the first returned list (group-1
# differences) and `b` the second (group-2 differences).
# NOTE(review): relies on pop1 / pop2 already existing in the kernel.
a, b = resample_two_samples(pop1, pop2)

In [87]:
def do_stats(res1, res2, diff = 0, two_tail = False):
    """Summarise paired resampling differences as a one-sided proportion.

    Forms ``together = res1 - res2`` element-wise and counts the entries that
    are strictly greater than ``diff``.

    Parameters
    ----------
    res1, res2 : sequences of numbers with the same length
        For example the two lists returned by ``resample_two_samples``.
    diff : number, default 0
        Threshold each element-wise difference is compared against.
    two_tail : bool, default False
        Accepted for interface compatibility but currently ignored; only the
        one-sided calculation is implemented.

    Returns
    -------
    dict
        ``'p'``: ``1 - (share of differences strictly greater than diff)``.
        ``'mean'``: mean of the element-wise differences.

    Raises
    ------
    ZeroDivisionError
        If the inputs are empty.
    """
    together = np.asarray(res1) - np.asarray(res2)
    # Vectorised count of the differences that clear the threshold.
    num_diff = int(np.count_nonzero(together > diff))
    p = 1 - num_diff / together.size
    return {'p': p, 'mean': np.mean(together)}
# let's test
# NOTE(review): `d` receives the (mean_diff1, mean_diff2) tuple returned by
# resample_two_samples, not a do_stats() result. The do_stats call below is
# commented out, so this cell never actually exercises do_stats.
d = resample_two_samples(pop1, pop2)
#do_stats(a,b)

In [19]:
# Load the table that is later passed to diff_week, which reads its 'state'
# and 'deaths' columns.
# NOTE(review): the path is relative, so it resolves against the kernel's
# working directory. Presumably one row per state per day in date order --
# TODO confirm against the data source.
path = 'data/states.csv'
DF = pd.read_csv(path)

In [113]:
def diff_week(df, last_index = -14, window = 7, num_iterations = 100):
    """Print, per state, a resampling comparison of the latest window of deaths
    against the period just before it.

    For every distinct value of ``df['state']`` (in sorted order) the
    'deaths' column is split into

    * the current window: the last ``window`` values, and
    * the earlier window: ``deaths[last_index:-window]``.

    Whichever window has the larger mean is passed to
    ``resample_two_samples`` as the first sample, and the result is
    summarised with ``do_stats``. One line is printed per state: the state,
    the rounded mean of the earlier window, the rounded mean of the current
    window, and the 'p' value rounded to two decimals.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'state' and 'deaths' columns. Rows of each state are
        presumably in chronological order -- TODO confirm; the function does
        not sort them.
    last_index : int, default -14
        Negative start index of the earlier window.
    window : int, default 7
        Length of the current window; must be positive.
    num_iterations : int, default 100
        Number of shuffles forwarded to ``resample_two_samples``.

    Returns
    -------
    None
        Results are printed only.
    """
    # A single groupby pass replaces re-filtering the whole frame per state;
    # groups are yielded in sorted key order.
    for state, state_rows in df.groupby('state'):
        deaths = state_rows['deaths'].tolist()
        deaths_last = deaths[last_index:-window]
        deaths_cur = deaths[-window:]

        # Too few rows (or overlapping indices) leave a window empty; its
        # mean would be NaN, so report the state and move on.
        if not deaths_last or not deaths_cur:
            print(state, 'not enough data')
            continue

        mean_last = np.mean(deaths_last)
        mean_cur = np.mean(deaths_cur)

        # Put the window with the larger mean first so the statistic measures
        # how far the higher period sits above the lower one.
        if mean_last > mean_cur:
            higher, lower = deaths_last, deaths_cur
        else:
            higher, lower = deaths_cur, deaths_last

        a, b = resample_two_samples(higher, lower, num_iterations = num_iterations)
        the_dict = do_stats(a, b)
        print(state, round(mean_last), round(mean_cur), round(the_dict['p'], 2))
# Compare each state's last 7 'deaths' values with the values before them:
# last_index = -21 makes the earlier window deaths[-21:-7] (up to 14 values).
diff_week(DF, last_index = -21)

Alabama 13.0 10.0 0.31
Alaska 0.0 0.0 0.62
Arizona 20.0 22.0 0.45
Arkansas 3.0 1.0 0.07
California 75.0 71.0 0.39
Colorado 21.0 34.0 0.04
Connecticut 68.0 58.0 0.14
Delaware 7.0 9.0 0.19
District of Columbia 10.0 8.0 0.14
Florida 44.0 38.0 0.29
Georgia 29.0 25.0 0.34
Guam 0.0 0.0 1.0
Hawaii 0.0 0.0 0.66
Idaho 1.0 1.0 0.21
Illinois 114.0 105.0 0.44
Indiana 40.0 35.0 0.33
Iowa 11.0 13.0 0.33
Kansas 3.0 3.0 0.34
Kentucky 7.0 7.0 0.5
Louisiana 38.0 32.0 0.32
Maine 1.0 1.0 0.58
Maryland 52.0 45.0 0.11
Massachusetts 136.0 107.0 0.06
Michigan 75.0 49.0 0.09
Minnesota 23.0 21.0 0.19
Mississippi 15.0 15.0 0.52
Missouri 16.0 14.0 0.25
Montana 0.0 0.0 1.0
Nebraska 3.0 5.0 0.04
Nevada 7.0 7.0 0.6
New Hampshire 6.0 6.0 0.37
New Jersey 209.0 149.0 0.07
New Mexico 8.0 7.0 0.25
New York 297.0 156.0 0.0
North Carolina 18.0 15.0 0.32
North Dakota 2.0 1.0 0.53
Northern Mariana Islands 0.0 0.0 1.0
Ohio 39.0 43.0 0.42
Oklahoma 5.0 3.0 0.13
Oregon 2.0 1.0 0.16
Pennsylvania 122.0 99.0 0.29
Puerto Rico 2.0 1.