In [33]:
import os
import datetime
import random

from bokeh.io import show
from bokeh.plotting import figure
from bokeh.io import output_notebook, reset_output
from bokeh.layouts import gridplot

import numpy as np
import math
import scipy.optimize as optim
from scipy.stats import binom

import pandas as pd

In [65]:
def make_bar(labels, nums, title = None, y_range = None, plot_width = 350, 
             plot_height = 350, rotate = False):
    p = figure(title = title, plot_width = plot_width, plot_height = plot_height,
              y_range = y_range, x_range = labels)
    p.vbar(x=labels, top=nums, width=0.9)
    p.xgrid.grid_line_color = None
    if rotate:
        p.xaxis.major_label_orientation = "vertical"
    return p

In [3]:
def resample(l):
    final = []
    for i in range(len(l)):
        final.append(random.choice(l))
    return final

In [4]:
def exp_func(x, initial, ratio):
    return initial * np.power(ratio, x - 1)

In [5]:
def repeat_resample(sample_a, sample_b, num_iter = 1000):
    difference_in_means = []#keep track of the difference in heights for each experiment
    for i in range(num_iter):
        resample_a = resample(sample_a)
        resample_b = resample(sample_b)
        difference = np.mean(resample_a) - np.mean(resample_b)
        difference_in_means.append(difference)
    return difference_in_means


In [6]:
output_notebook()

In [7]:
"""Read in the data"""
DF = pd.read_csv(os.path.join('data', 'mask.csv'))
DF['Date'] = pd.to_datetime(DF['Date'])

In [8]:
def do_analysis(state):
    """
    state: a string 
    
    returns:
        p_bef: a bokeh image for cases before
        p_aft: a bokeh image for cases after
        diff_in_mean: float of the difference in mean between the cases before and the cases after
        p_value: float of p value of cases before and cases after
        rt_bef: float of rt before
        rt_aft: float of rt after
    """
    pre = DF[
        (DF['period'] == 'pre_mask') #first filter for period (after)
        & (DF['state'] == state) #second filter for state (before)
     ]
    post =DF[
        (DF['period'] == 'post_mask') # first filter for period (after)
        & (DF['state'] == state) # second filter for perid (before)
    ]
    cases_bef = pre['cases']
    cases_aft = post['cases']
    p_bef = make_bar(labels = [x for x in range(len(cases_bef))], 
                     nums = cases_bef, title = '{state} before'.format(state = state))
    p_aft = make_bar(labels = [x for x in range(len(cases_aft))], 
                     nums = cases_aft, title = '{state} after'.format(state = state))
    diff_in_mean = np.mean(cases_aft) - np.mean(cases_bef)
    if np.mean(cases_aft) > np.mean(cases_bef):
        resamp_diff = repeat_resample(cases_aft.tolist(), cases_bef.tolist())
    else:
        resamp_diff = repeat_resample(cases_bef.tolist(), cases_aft.tolist())

    p_value = 1 - len([x for x in resamp_diff if x > 0])/len(resamp_diff)
    popt_pre, pcov_pre = optim.curve_fit(f = exp_func, xdata =list(range(14)), ydata = pre['cases'])
    popt_post, pcov_post = optim.curve_fit(f = exp_func, xdata =list(range(14)), ydata = post['cases'])
    rt_bef, rt_aft = popt_pre[1], popt_post[1]
    return p_bef, p_aft, diff_in_mean, p_value, rt_bef, rt_aft
    
grids = []
outcomes = []
for i in DF['state'].unique():
    p_bef, p_aft, diff_in_mean, p_value, rt_bef, rt_aft = do_analysis(i)
    grids.append(p_bef)
    grids.append(p_aft)
    outcomes.append((i, diff_in_mean, p_value, rt_bef, rt_aft))
for i in outcomes:
    if i[1] > 0:
        direction = 'increased'
    else:
        direction = 'decreased'
    if i[2] < .01:
        sig = 'significant'
    else:
        sig = 'not significant'
    n = abs(round(i[1]))
    if  i[4] - i[3] < 0:
        direction_of_rt ='decreased'
    else:
        direction_of_rt = 'increased'
    diff_in_rt = abs(round(i[4] - i[3], 2))
    p_value = round(i[2],2)
    if p_value < .01:
        p_value = '<.01'
        
    print('For {s} the cases {d} by {n:,} ({sig} with p value = {p})\nrt {d2} by {i2}'.format(
        s = i[0], d = direction, n = n, sig = sig,
        d2 = direction_of_rt,  i2 = diff_in_rt, p = p_value))

show(gridplot(grids, ncols = 2))


For California the cases increased by 3,751 (significant with p value = <.01)
rt increased by 0.02
For Connecticut the cases decreased by 322 (significant with p value = <.01)
rt decreased by 0.02
For Delaware the cases decreased by 13 (not significant with p value = 0.36)
rt decreased by 0.09
For Hawaii the cases decreased by 12 (significant with p value = <.01)
rt increased by 0.03
For Illinois the cases increased by 291 (not significant with p value = 0.1)
rt decreased by 0.07
For Kansas the cases increased by 194 (not significant with p value = 0.05)
rt decreased by 0.09
For Kentucky the cases decreased by 32 (not significant with p value = 0.18)
rt increased by 0.04
For Maine the cases increased by 19 (significant with p value = <.01)
rt decreased by 0.04
For Maryland the cases increased by 316 (significant with p value = <.01)
rt decreased by 0.01
For Massachusetts the cases decreased by 1,019 (significant with p value = <.01)
rt increased by 0.04
For Michigan the cases decreased

In [9]:
#import scipy

#import scipy.stats


0.8949801921844482

In [10]:
def make_bar_both(labels1, labels2, nums1, nums2, title = None,
                  y_range = None, plot_width = 450, plot_height = 400):
    p = figure(title = title, plot_width = plot_width, plot_height = plot_height,
              y_range = y_range)
    p.vbar(x=labels1, top=nums1, width=0.9, color = 'green', legend_label = 'Before')
    p.vbar(x=labels2, top=nums2, width=0.9, color = 'red', legend_label = 'After')

    p.xgrid.grid_line_color = None
    if title in ['California', 'Kansas', 'Nevada', 'North Carolina', 'Oregon']:
        p.legend.location = 'top_left'
    else:
        p.legend.location = 'top_right'
    return p

In [11]:
reset_output()

In [76]:
def do_analysis2(state):
    """
    state: a string 
    
    returns:
        p: a bokeh image for cases 
        diff_in_mean: float of the difference in mean between the cases before and the cases after
        p_value: float of p value of cases before and cases after
        rt_bef: float of rt before
        rt_aft: float of rt after
    """
    pre = DF[
        (DF['period'] == 'pre_mask') #first filter for period (after)
        & (DF['state'] == state) #second filter for state (before)
     ]
    post =DF[
        (DF['period'] == 'post_mask') # first filter for period (after)
        & (DF['state'] == state) # second filter for perid (before)
    ]
    cases_bef = pre['cases']
    cases_aft = post['cases']
    p = make_bar_both(labels1 = [x for x in range(len(cases_bef))], 
                  labels2 = [x + len(cases_bef) for x in range(len(cases_aft))],
                     nums1 = cases_bef, nums2 = cases_aft, 
                  title = '{state}'.format(state = state))
    diff_in_mean = np.mean(cases_aft) - np.mean(cases_bef)
    if np.mean(cases_aft) > np.mean(cases_bef):
        resamp_diff = repeat_resample(cases_aft.tolist(), cases_bef.tolist())
    else:
        resamp_diff = repeat_resample(cases_bef.tolist(), cases_aft.tolist())

    p_value = 1 - len([x for x in resamp_diff if x > 0])/len(resamp_diff)
    popt_pre, pcov_pre = optim.curve_fit(f = exp_func, xdata =list(range(14)), ydata = pre['cases'])
    popt_post, pcov_post = optim.curve_fit(f = exp_func, xdata =list(range(14)), ydata = post['cases'])
    rt_bef, rt_aft = popt_pre[1], popt_post[1]
    return p, diff_in_mean, p_value, rt_bef, rt_aft
    
grids2 = []
outcomes2 = []
for i in DF['state'].unique():
    p, diff_in_mean, p_value, rt_bef, rt_aft = do_analysis2(i)
    grids2.append(p)
    outcomes2.append((i, diff_in_mean, p_value, rt_bef, rt_aft))

show(gridplot(grids2, ncols = 2))
output_notebook()
do_comparison_rates(outcomes2)
analyze_effectiveness(outcomes2)

The num of rates that improved was 14 out of a total of 23
The num of cases that improved was 11 out of a total of 23
The p value for cases (that we saw something unusual) is 0.5. We do not reject the null hypothesis.
The p value for rates (that we saw something unusual) is 0.11. We do not reject the null hypothesis.


In [74]:
def do_comparison_rates(data):
    nums = [x[4] - x[3] for x in data]
    labels = [x[0] for x in data]
    zz = sorted(zip(nums, labels))
    p = make_bar(labels = [x[1] for x in zz ], 
                 nums = [x[0] for x in zz ], 
                 title = 'rt by state', rotate = True)
    show(p)

We will use the binomial theorem to get a p value for *all* of the states. 

If the improvement was really random, then we would expect the probability for improvement to be only .5
For example, if we flipped a coin 100 times, we would expect on average for there to be 50 heads, and 50 tails. How about if we flipped a coint 100 times and got 55 heads? That is expected, right? In order to get a probability, we will use the binomial distribution. 

https://en.wikipedia.org/wiki/Binomial_distribution

We want to answer the question: If the improvements are random (p =.5), what is the probability that we will get k positive outcomes? If we get a really high number of positive outcomes, then we can reject out null hypothesis. Otherwise, we don't reject it.

In [50]:
def analyze_effectiveness(data):
    fit = [x[4] - x[3] for x in data]
    num_rate_improved = len([x for x in fit if x < 0])
    print('The num of rates that improved was {i} out of a total of {t}'.format(
        i = num_rate_improved, t = len(fit)))
    cases = [x[1] for x in data]
    num_cases_improved = len([x for x in cases if x < 0])
    print('The num of cases that improved was {i} out of a total of {t}'.format(
        i = num_cases_improved, t = len(cases)))
    #null hypothesis: that the probability of each outcome is .5
    
    p_value_cases = round(1 - binom.cdf(k=num_cases_improved, n=len(cases), p= .5),2)
    p_value_rates = round(1 - binom.cdf(k=num_rate_improved, n=len(cases), p= .5),2)

    print("The p value for cases (that we saw something unusual) is {p}. We do not reject the null hypothesis.".format(
        p =p_value_cases))
    print("The p value for rates (that we saw something unusual) is {p}. We do not reject the null hypothesis.".format(
        p =p_value_rates))
    