<a href="https://colab.research.google.com/github/noallynoclan/colab/blob/master/bootstrap.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd
from scipy.stats import norm

def ttest(delta, se, alpha, mc):
    z = delta / se
    pval = norm.sf(np.abs(z)) * 2,
    ci_abs = delta + se * norm.ppf([alpha / 2, 1 - alpha / 2])
    ci_rel = ci_abs / mc
    return z, pval, ci_abs, ci_rel

def mean_test(mt, mc, st, sc, nt, nc, conf):
    se = np.sqrt(st ** 2 / nt + sc ** 2 / nc)
    z, pval, ci_abs, ci_rel = ttest(mt - mc, se, 1 - conf, mc)
    return z, pval, ci_abs, ci_rel

In [230]:
n_samples = 10000000

df = pd.DataFrame({
    'xc': np.random.normal(loc=100, scale=100, size=n_samples),
    'xt': np.random.normal(loc=101, scale=100, size=n_samples)
})

mt, st, nt = df['xt'].agg(['mean', 'std', 'count'])
mc, sc, nc = df['xc'].agg(['mean', 'std', 'count'])
mean_test(mt, mc, st, sc, nt, nc, conf=0.9)

(21.127504964199943,
 (4.444183788031526e-99,),
 array([0.87118291, 1.0182849 ]),
 array([0.00870835, 0.01017879]))

In [236]:
def bootstrap(t, c, df, n_bt = 100, conf=0.9):
    # n_bins = int(len(df) ** 0.7)
    n_bins = 500
    bins = np.arange(len(df)) % n_bins
    np.random.shuffle(bins)
    gdf = df.groupby(bins).sum()
    bt = gdf.sample(n_bins * n_bt, replace=True)
    bt = bt.groupby(np.arange(n_bins * n_bt) % n_bt).sum()
    bt['abs'] = (bt[t] - bt[c]) / len(df)
    bt['rel'] = bt[t] / bt[c] - 1
    alpha = 1 - conf
    return bt['abs'].quantile([alpha / 2, 1 - alpha / 2])

bootstrap('xt', 'xc', df, 100)

0.05    0.861569
0.95    1.023290
Name: abs, dtype: float64

In [139]:
np.arange(10 * 3) % 3

array([0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0,
       1, 2, 0, 1, 2, 0, 1, 2])