# New bootstrap in SciPy stats module

See https://github.com/scipy/scipy/pull/13371


In [3]:
import numpy as np
from scipy.stats import norm, bootstrap

## Part 1: using vectorized code

In [5]:
# Sizes of sample 1 and sample 2.
n1, n2 = 100, 120
# Number of bootstrap resamples used to form each confidence interval.
n_resamples = 1000
# Confidence level of the interval (0.9, i.e. 90%).
confidence_level = 0.9

# Seed NumPy's global random number generator so the random draws made
# after this point are repeatable from run to run.
np.random.seed(1)

# The statistic we're interested in is the difference in means
def my_stat(data1, data2, axis=-1):
    """Return the difference in means, ``mean(data1) - mean(data2)``.

    Parameters
    ----------
    data1, data2 : array_like
        The two samples.
    axis : int, optional
        Axis along which each mean is taken. Defaults to the last axis.

    Returns
    -------
    The mean of `data1` along `axis` minus the mean of `data2` along
    `axis`.
    """
    return np.mean(data1, axis=axis) - np.mean(data2, axis=axis)

# Two unit-variance normal populations with means 0 and 0.1, so the true
# value of the statistic (mean of sample 1 minus mean of sample 2) is -0.1.
dist1, dist2 = norm(loc=0, scale=1), norm(loc=0.1, scale=1)
stat_true = dist1.mean() - dist2.mean()

# Draw one sample from each population; sample 1 is drawn first.
data1, data2 = dist1.rvs(size=n1), dist2.rvs(size=n2)

# 'basic' bootstrap confidence interval for the difference in means,
# resampling along the last axis of each sample.
ci = bootstrap(
    (data1, data2),
    my_stat,
    method='basic',
    n_resamples=n_resamples,
    confidence_level=confidence_level,
    axis=-1,
).confidence_interval

# Recorded output: low=-0.3625395759148521, high=0.060649487174101846,
# an interval that contains the true value of -0.1.
print(ci)


ConfidenceInterval(low=-0.3625395759148521, high=0.060649487174101846)


In [None]:
# Repeat the experiment n_replications times in one fully vectorized call:
# each row of data1 / data2 is one replication, and the resampling runs
# along the last axis.
n_replications = 1000
data1, data2 = (dist1.rvs(size=(n_replications, n1)),
                dist2.rvs(size=(n_replications, n2)))
ci = bootstrap(
    (data1, data2),
    my_stat,
    method='basic',
    n_resamples=n_resamples,
    confidence_level=confidence_level,
    axis=-1,
).confidence_interval

# ci.low and ci.high are vectors of lower and upper confidence interval
# bounds; count the replications whose interval strictly brackets the true
# value of the statistic.
ci_contains_true = ((ci.low < stat_true) & (stat_true < ci.high)).sum()
print(f"The {confidence_level*100}% confidence interval contained the true "
      f"value of the statistic in {ci_contains_true} out of {n_replications} "
      "replications.")

## Part 2: using an un-vectorized function

In [7]:
# Sizes of sample 1 and sample 2.
n1, n2 = 100, 120
# Number of bootstrap resamples used to form each confidence interval.
n_resamples = 1000
# Confidence level of the interval (0.9, i.e. 90%).
confidence_level = 0.9

# Seed NumPy's global random number generator so the random draws made
# after this point are repeatable from run to run.
np.random.seed(1)


# The statistic we're interested in is the difference in means
def my_stat(data1, data2):
    """Return the difference in means, ``mean(data1) - mean(data2)``.

    Each mean is taken over all elements of its sample; there is no
    ``axis`` parameter.
    """
    return np.mean(data1) - np.mean(data2)


# Two unit-variance normal populations with means 0 and 0.1, so the true
# value of the statistic (mean of sample 1 minus mean of sample 2) is -0.1.
dist1, dist2 = norm(loc=0, scale=1), norm(loc=0.1, scale=1)
stat_true = dist1.mean() - dist2.mean()

# Draw one sample from each population; sample 1 is drawn first.
data1, data2 = dist1.rvs(size=n1), dist2.rvs(size=n2)

# my_stat has no axis parameter here, so vectorized=False tells bootstrap
# that the statistic is not vectorized.
res = bootstrap(
    (data1, data2),
    my_stat,
    vectorized=False,
    method='basic',
    n_resamples=n_resamples,
    confidence_level=confidence_level,
)
CI = res.confidence_interval

# Recorded output: low=-0.3625395759148521, high=0.060649487174101846,
# an interval that contains the true value of -0.1.
print(CI)


ConfidenceInterval(low=-0.3625395759148521, high=0.060649487174101846)
