In [1]:
import random
import numpy as np
from scipy.stats import norm
from scipy.stats import ttest_ind_from_stats
from scipy.stats import ttest_ind



In [2]:
#Bokeh
from bokeh.io import show
from bokeh.plotting import figure
from bokeh.layouts import gridplot
from bokeh.models import NumeralTickFormatter
from bokeh.models import DatetimeTickFormatter
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource, Band
from bokeh.transform import dodge

In [3]:
def resample(l):
    final = []
    for i in range(len(l)):
        final.append(random.choice(l))
    return final

def repeat_resample(sample_a, sample_b, num_iter = 1000):
    difference_in_means = []
    for i in range(num_iter):
        resample_a = resample(sample_a)
        resample_b = resample(sample_b)
        difference_in_means.append(np.mean(resample_a) - np.mean(resample_b))
    return difference_in_means


In [4]:
output_notebook()

In [5]:
s_size = 100
s1 =  [random.gauss(0, 1) for x in range(s_size)]
s2 = [random.gauss(.1, 1) for x in range(s_size)]
s3 = [random.gauss(1, 1) for x in range(s_size)]
resamps1 = sorted(repeat_resample(s2, s1, num_iter = 1000))
resamps2 = sorted(repeat_resample(s3, s1, num_iter = 1000))


def get_pvalues(res, s1, s2):
    # for two tail
    p_value_r = round((1 - len([x  for x in res if x > 0])/len(res)) * 2,2)
    p_value_cdf = round((norm.cdf(0, np.mean(res), np.std(res))) * 2,2)
    p_value_ttest = round(ttest_ind(s1, s2).pvalue,2)
    print(p_value_r, p_value_cdf, p_value_ttest)
    
def get_reject(s, v= 0, alpha = .05, two_sided= True,
               verbose = True):
    if two_sided:
        up = (1 - alpha/2) * 100
        low = (alpha/2) * 100
    u = np.percentile(s, up)
    l = np.percentile(s, low)
    if v > u:
        if verbose:
            print('{v} outside of upper bound of {u}'.format(
            u =round(u,2), v = v))
        return True
    if v < l:
        if verbose:
            print('{v} outside of lower bound of {l}'.format(
            l = round(l,2), v =v))
        return True
    if verbose:
        print('not rejecting because {v} lies within {l} and {u}'.format(
            v= v, l=round(l,2), u=round(u,2)))
    return False


    
get_reject(resamps1)
get_pvalues(resamps1, s1, s2)
get_reject(resamps2)
get_pvalues(resamps2, s1, s3)





not rejecting because 0 lies within -0.42 and 0.14
1.69 1.68 0.31
0 outside of lower bound of 0.54
0.0 0.0 0.0


In [6]:
def get_range(r,v= 0, alpha = .05, two_sided= True,
               verbose = True):
    """
    r = resamps
    """
    if two_sided:
        alpha = alpha/2
    return np.percentile(r, alpha),  np.percentile(r, 100 - alpha)
    
def test_get_range():
    s_size = 100
    s1 =  [random.gauss(0, 1) for x in range(s_size)]
    s2 = [random.gauss(.1, 1) for x in range(s_size)]
    s3 = [random.gauss(1, 1) for x in range(s_size)]
    resamps1 = sorted(repeat_resample(s2, s1, num_iter = 1000))
    resamps2 = sorted(repeat_resample(s3, s1, num_iter = 1000))
    l, u = get_range(resamps1, two_sided = True)
    if l <= 0 <= u:
        print('do not reject')

    print(l,u)
    l, u = get_range(resamps2, two_sided = True)
    print(l,u)
    if not  l <= 0 <= u:
        print('reject')


test_get_range()

do not reject
-0.32900198795196983 0.5440283827213275
0.5252711797976428 1.5125345701844006
reject


In [7]:
def make_graph(s):
    p = figure(plot_width = 350, plot_height=350)
    x_range = s[0] -.3, s[-1] + .3
    x = np.arange(x_range[0], x_range[1], 0.001)
    y = norm.pdf(x,np.mean(s), np.std(s))
    p.line(x = x, y = y)
    return p
p1 = make_graph(resamps1)
p2 = make_graph(resamps2)
grid = gridplot([p1, p2],  ncols = 2)
show(grid)

In [8]:
#test two tailed
def explore_2_tail():
    #s1 = [random.gauss(0,1) for x in range(100)]
    pop = [random.gauss(0,1) for x in range(200)]

    gt = []
    lt = []
    lt2 = []
    gt2 = []
    for i in range(100):
        random.shuffle(pop)
        #m = random.choice([1, -1])
        s1 = pop[:50]
        s2 = pop[50:]
        resamps = repeat_resample(s1, s2, num_iter = 100)
        p1 = 1 - len([x for x in resamps if x > 0])/len(resamps)
        p2 = 1 - len([x for x in resamps if x < 0])/len(resamps)
        if p1 <= .05:
            gt.append(p1)
        if p2 <= .05:
            lt.append(p2)
        if p1 <= .05/2:
            gt2.append(p1)
        if p2 <= .05/2:
            lt2.append(p2)
    return lt, gt, lt2, gt2


lt, gt, lt2, gt2 = explore_2_tail()
print(len(lt) + len(gt))

print(len(lt2) + len(gt2))


10
5


In [9]:
#test two tailed
def explore_2_tail2():
    gt = []
    lt = []
    lt2 = []
    gt2 = []
    for i in range(100):
        s1 = [random.gauss(0,1) for x in range(100)]
        s2 = [random.gauss(0,1) for x in range(100)]
        resamps = repeat_resample(s1, s2, num_iter = 100)
        p1 = 1 - len([x for x in resamps if x > 0])/len(resamps)
        p2 = 1 - len([x for x in resamps if x < 0])/len(resamps)
        if p1 < .05:
            gt.append(p1)
        if p2 < .05:
            lt.append(p2)
        if p1 < .05/2:
            gt2.append(p1)
        if p2 < .05/2:
            lt2.append(p2)
    return lt, gt, lt2, gt2
r1 = []
r2 = []
for i in range(100):
    lt, gt, lt2, gt2 = explore_2_tail()
    r1.append(len(lt) + len(gt))
    r2.append(len(lt2) + len(gt2))
print(np.mean(r1))
print(np.mean(r2))


10.78
6.67


In [25]:
# test if t of .2 occurs 20% of the time
def test_freq_random(sig_value, num_iter = 100, s_size = 100):
    num_sig = 0
    for i in range(num_iter):
        s1 =  [random.gauss(0, 1) for x in range(s_size)]
        s2 =  [random.gauss(0, 1) for x in range(s_size)]
        p_value = ttest_ind(s1, s2).pvalue
        if p_value < sig_value:
            num_sig += 1
    return num_sig/num_iter
test_freq_random(.15, num_iter = 1000)




0.156