Hypothesis Testing
==================

In [None]:
from dataclasses import dataclass
from abc import ABC, abstractmethod
from typing import List, Tuple, Callable
from functools import partial
from collections import Counter

In [None]:
import numpy as np
import pandas as pd
import scipy.stats

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import sys
sys.path.append('lib')

import nsfg
from pmf import Pmf

In [None]:
from IPython.core.pylabtools import figsize
# apply seaborn's default theme to the figures drawn below
sns.set_theme()
# default figure size for the notebook, given as (width, height)
figsize(11, 5)

# some nicer colors from http://colorbrewer2.org/
# NOTE(review): these constants are not referenced in the visible part of
# the notebook - confirm they are still needed
COLOR1 = '#7fc97f'
COLOR2 = '#beaed4'
COLOR3 = '#fdc086'
COLOR4 = '#ffff99'
COLOR5 = '#386cb0'

In [None]:
# r2(x) is shorthand for np.round(x, decimals=2); used when displaying results
r2 = partial(np.round, decimals=2)

## Framework

In [None]:
class HypothesisTest(ABC):
    '''
    A class that represents the structure of a classical hypothesis test
    '''
    
    def __init__(self, data: List[float]):
        self.data = data
        self.make_model()
        self.actual = self.test_statistic(data)
    
    @abstractmethod
    def test_statistic(self, data: List[float]) -> float:
        '''
        Provides the test statistic of interest
        '''
        
    @abstractmethod
    def make_model(self):
        '''
        Sets up the test
        '''
        
    @abstractmethod
    def run_model(self) -> List[float]:
        '''
        Runs the test - generates the data to pass to test_statistic
        '''
        
    def p_value(self, iters=1000):
        '''
        Computes the p-value
        '''
        self.test_stats = np.array([
            self.test_statistic(self.run_model()) for _ in range(iters)
        ])
        # proportion of stats greater than the actual value
        return sum(self.test_stats >= self.actual) / iters

As a simple example, suppose we toss a coin 250 times and see 140 heads and 110 tails. Based on this result, we might suspect that the coin is biased; that is, more likely to land heads. To test this hypothesis, we compute the probability of seeing such a difference if the coin is actually fair:

## Testing Proportions

In [None]:
class CoinTest(HypothesisTest):
    '''
    Tests observed (heads, tails) counts against the model of a fair coin
    '''

    def test_statistic(self, data):
        # absolute gap between the number of heads and the number of tails
        n_heads, n_tails = data
        return abs(n_heads - n_tails)

    def make_model(self):
        # no set-up is needed for this test
        pass

    def run_model(self):
        '''
        Simulates the same number of tosses as observed, using a fair coin
        '''
        # total number of tosses = heads + tails
        total = sum(self.data)
        # one Bernoulli(0.5) draw per toss; 1 is counted as heads, 0 as tails
        tosses = np.random.binomial(1, 0.5, total)
        heads = sum(tosses == 1)
        tails = sum(tosses == 0)
        return (heads, tails)

In [None]:
# draw from {0, 1} 10 times, each value having probability 0.5
np.random.binomial(1, 0.5, 10)

In [None]:
# observed data from the example: 140 heads and 110 tails
ct = CoinTest((140, 110))
print(f'P value: {ct.p_value(iters=1000):0.2f}')

The result is about 0.07, which means that if the coin is fair, we expect to see a difference as big as 30 about 7% of the time.

How should we interpret this result? By convention, 5% is the threshold of statistical significance. If the p-value is less than 5%, the effect is considered significant; otherwise it is not.

But the choice of 5% is arbitrary, and (as we will see later) the p-value depends on the choice of the test statistics and the model of the null hypothesis. So p-values should not be considered precise measurements.

I recommend interpreting p-values according to their order of magnitude: if the p-value is less than 1%, the effect is unlikely to be due to chance; if it is greater than 10%, the effect can plausibly be explained by chance. P-values between 1% and 10% should be considered borderline. So in this example I conclude that the data do not provide strong evidence that the coin is biased or not.

Here's an example that tests whether the outcome of rolling a six-sided die is suspicious, where the test statistic is the total absolute difference between the observed outcomes and the expected long-term averages.

In [None]:
class DiceTest(HypothesisTest):
    '''
    Tests observed face counts of a six-sided die against a fair die, using
    the total absolute deviation from the expected counts as the statistic
    '''

    FACES = [1, 2, 3, 4, 5, 6]

    def test_statistic(self, data):
        # data holds the observed count for each face
        total = sum(data)
        # a fair die is expected to show each face total / 6 times
        expected = np.ones(6) * total / 6
        deviations = abs(data - expected)
        return sum(deviations)

    def make_model(self):
        pass

    def run_model(self):
        # roll a fair die as many times as there are observations
        total = sum(self.data)
        rolls = np.random.choice(self.FACES, total, replace=True)
        counts = Counter(rolls)
        # report the simulated counts in face order 1-6
        return np.array([counts[face] for face in self.FACES])

Here's an example using the data from the book:

In [None]:
# observed counts, one entry per face (DiceTest treats them as faces 1-6 in order)
data = np.array([8, 9, 19, 5, 8, 11])
dt = DiceTest(data)
pvalue = dt.p_value(iters=10000)
pvalue

The observed deviance from the expected values is not statistically significant.

By convention, it is more common to test data like this using the chi-squared statistic:

In [None]:
class DiceChiTest(DiceTest):
    '''
    Same test as DiceTest, scored with the chi-squared statistic
    '''

    def test_statistic(self, data):
        total = sum(data)
        # expected count per face for a fair die
        expected = np.ones(6) * total / 6
        squared_dev = (data - expected)**2
        return sum(squared_dev / expected)

In [None]:
# re-run on the same `data` array, now scored with the chi-squared statistic
dt = DiceChiTest(data)
pvalue = dt.p_value(iters=10000)
pvalue

Taking this result at face value, we might consider the data statistically significant, but considering the results of both tests, I would not draw any strong conclusions.

## Testing a difference in means

One of the most common effects to test is a difference in mean between two groups. In the NSFG data, the mean pregnancy length for first babies is slightly longer, and the mean birth weight is slightly smaller. Now we will see if those effects are statistically significant.

For these examples, the null hypothesis is that the distributions for the two groups are the same. One way to model the null hypothesis is by permutation; that is, we can take values for first babies and others and shuffle them, treating the two groups as one big group:

Suppose you observe an apparent difference between two groups and you want to check whether it might be due to chance.

As an example, we'll look at differences between first babies and others.  The `nsfg` module provides code to read data from the National Survey of Family Growth (NSFG).

In [None]:
# read pregnancy data - live births only
live = nsfg.read_live_fem_preg()

In [None]:
live.shape

In [None]:
live.birthord.value_counts()

In [None]:
# categorize as firsts and others
live.birthcat.value_counts()

In [None]:
# interested in prglngth
live.prglngth.describe()

In [None]:
# no missing values
live.prglngth.isna().sum()

In [None]:
# partition pregnancy lengths (prglngth) into first babies and others,
# using the birthcat column
firsts = live[live.birthcat == 'firsts'].prglngth
others = live[live.birthcat == 'others'].prglngth

In [None]:
def weeks2hours(weeks: np.float64) -> np.float64:
    '''
    Express a duration given in weeks as a number of hours
    '''
    days = weeks * 7
    return days * 24

We'll look at a couple of variables, including pregnancy length and birth weight.  The effect size we'll consider is the difference in the means.

Other examples might include a correlation between variables or a coefficient in a linear regression.  The number that quantifies the size of the effect is called the "test statistic".

In [None]:
class DiffMeansPermute(HypothesisTest):
    '''
    Two-sided permutation test for a difference in means between two groups.

    data is a pair (group1, group2). The null hypothesis is modelled by
    pooling both groups, shuffling the pool and splitting it back into two
    groups of the original sizes.
    '''

    def test_statistic(self, data):
        # absolute difference between the two group means
        group1, group2 = data
        test_stat = abs(group1.mean() - group2.mean())
        return test_stat

    def make_model(self):
        # remember the group sizes and build the pooled sample once
        group1, group2 = self.data
        self.n, self.m = len(group1), len(group2)
        self.pool = np.hstack((group1, group2))

    def run_model(self):
        # shuffle in place, then split at n so that the simulated groups have
        # sizes n and m and together contain every pooled value exactly once
        np.random.shuffle(self.pool)
        data = self.pool[:self.n], self.pool[self.n:]
        return data

In [None]:
# two-sided permutation test on pregnancy length; p_value() uses its
# default of 1000 iterations
ht = DiffMeansPermute((firsts, others,))
pvalue = ht.p_value()
print(f'p value: {pvalue:0.2f}')

In [None]:
class DiffMeansOneSided(DiffMeansPermute):
    '''
    One-sided variant of DiffMeansPermute: the statistic keeps its sign
    '''

    def test_statistic(self, data):
        # signed difference: mean of the first group minus mean of the second
        first, second = data
        return first.mean() - second.mean()

In [None]:
# one-sided version of the same test (signed difference firsts - others)
ht = DiffMeansOneSided((firsts, others,))
pvalue = ht.p_value()
print(f'p value: {pvalue:0.2f}')

In [None]:
weeks2hours(ht.actual)

In [None]:
del ht, pvalue

In [None]:
# hold our two groups

@dataclass
class GroupPair:
    """
    Holds the two samples that a test compares.

    The fields are annotated as np.ndarray; the properties only require
    that each group supports len() and .mean().
    """

    # np.ndarray is the array type; np.array is only the factory function
    group1: np.ndarray
    group2: np.ndarray

    @property
    def lengths(self) -> Tuple[int, int]:
        # sizes of (group1, group2)
        return (len(self.group1), len(self.group2))

    @property
    def means(self) -> Tuple[np.float64, np.float64]:
        # means of (group1, group2)
        return (self.group1.mean(), self.group2.mean())
    
# group stats
def mean_diff(gp: GroupPair) -> np.float64:
    """
    Two-sided test statistic: absolute difference between the group means
    """
    mean1 = gp.group1.mean()
    mean2 = gp.group2.mean()
    return abs(mean1 - mean2)

For the first example, I extract the pregnancy length for first babies and others.  The results are pandas Series objects.

In [None]:
data = GroupPair(
    firsts.values,
    others.values
)

In [None]:
actual = mean_diff(data)
print(f'Actual difference: {weeks2hours(actual):0.2f}')

The actual difference in the means is 0.078 weeks, which is only 13 hours.

The null hypothesis is that there is no difference between the groups.  We can model that by forming a pooled sample that includes first babies and others.

In [None]:
def pooled_sample(data: GroupPair) -> GroupPair:
    """
    Simulate the null hypothesis: pool both groups, shuffle the pool and
    split it into two groups with the same sizes as the originals
    """
    size1, _ = data.lengths
    # hstack builds a new array, so the caller's groups are not shuffled
    combined = np.hstack((data.group1, data.group2))
    np.random.shuffle(combined)
    first, second = combined[:size1], combined[size1:]
    return GroupPair(first, second)

Then we can simulate the null hypothesis by shuffling the pool and dividing it into two groups, using the same sizes as the actual sample.

The result of running the model is two NumPy arrays with the shuffled pregnancy lengths:

Then we compute the same test statistic using the simulated data:

In [None]:
sample_data = pooled_sample(data)
r2(weeks2hours(mean_diff(sample_data)))

In [None]:
del sample_data

If we run the model 1000 times and compute the test statistic, we can see how much the test statistic varies under the null hypothesis.

In [None]:
def run_model(data: GroupPair, test_stat: Callable, sampler: Callable, niters: int = 1000) -> np.ndarray:
    """
    Draw niters simulated data sets with sampler(data) and return the
    test_stat value of each one as an array
    """
    stats = []
    for _ in range(niters):
        simulated = sampler(data)
        stats.append(test_stat(simulated))
    return np.array(stats)

In [None]:
def p_value(test_stats: np.ndarray, actual: np.float64) -> np.float64:
    '''
    the proportion of simulated statistics that are greater than or equal
    to the observed one (ties count towards the p-value)
    '''
    n_extreme = sum(test_stats >= actual)
    return n_extreme / len(test_stats)

In [None]:
def plot_hist(test_stats: np.ndarray, actual: np.float64, label=None, title=None, bins=None):
    """
    Draws a histogram of test_stats with a dashed vertical line at actual.

    label: x-axis label; falls back to 'test statistic' when empty or None
    title: optional text placed in front of the 'Actual: ...' title
    bins: passed through to seaborn's histplot when given
    """
    hist_kwargs = {} if bins is None else {'bins': bins}
    ax = sns.histplot(x=test_stats, **hist_kwargs)
    # mark the observed value
    ax.axvline(actual, linewidth=2, color='darkred', linestyle='--')
    ax.set(xlabel=label or 'test statistic', ylabel='count')
    if title:
        heading = f'{title}: Actual: {actual:.2f}'
    else:
        heading = f'Actual: {actual:.2f}'
    ax.set(title=heading)

In [None]:
# from the top: observed statistic, 1000 simulated statistics under the
# pooled-sample null model, then the resulting p-value
actual = mean_diff(data)
test_stats = run_model(data, mean_diff, sampler=pooled_sample, niters = 1000)
p_val = p_value(test_stats, actual)
print(f'P val: {p_val:0.2f}')

Here's the sampling distribution of the test statistic under the null hypothesis, with the actual difference in means indicated by a dashed dark-red line.

In [None]:
plot_hist(
    test_stats,
    actual,
    'Difference in means (weeks)',
    title=f'p value: {p_val:0.2f}'
)

The p-value is the probability that the test statistic under the null hypothesis exceeds the actual value.

In this case the result is about 15-17%, which means that even if there is no difference between the groups, it is plausible that we could see a sample difference as big as 0.078 weeks.

We conclude that the apparent effect might be due to chance, so we are not confident that it would appear in the general population, or in another sample from the same population.

## Other test statistics

Choosing the best test statistic depends on what question you are trying to address. For example, if the relevant question is whether pregnancy lengths are different for first babies, then it makes sense to test the absolute difference in means, as we did in the previous section.

If we had some reason to think that first babies are likely to be late, then we would not take the absolute value of the difference; instead we would use this test statistic:

In [None]:
def mean_diff_one_sided(gp: GroupPair) -> np.float64:
    """
    One-sided test statistic: mean of group1 minus mean of group2 (signed)
    """
    mean1 = gp.group1.mean()
    mean2 = gp.group2.mean()
    return mean1 - mean2

The only difference from `mean_diff` is that `mean_diff_one_sided` does not take the absolute value of the difference. This kind of test is called one-sided because it only counts one side of the distribution of differences. The previous test, using both sides, is _two-sided_.

In [None]:
# same procedure as the two-sided test, with the signed statistic;
# run_model uses its default of 1000 iterations
actual = mean_diff_one_sided(data)
test_stats = run_model(data, mean_diff_one_sided, sampler=pooled_sample)
p_val = p_value(test_stats, actual)
print(f'One sided p value is: {p_val:0.2f}')

For this version of the test, the p-value is 0.09. In general the p-value for a one-sided test is about half the p-value for a two-sided test, depending on the shape of the distribution.

The one-sided hypothesis, that first babies are born late, is more specific than the two-sided hypothesis, so the p-value is smaller. But even for the stronger hypothesis, the difference is not statistically significant.

We can use the same framework to test for a difference in standard deviation. There is some evidence that first babies are more likely to be early or late, and less likely to be on time. (TODO: show this)

In [None]:
p = sns.histplot(
    data=live,
    x='prglngth',
    hue='birthcat',
    multiple='dodge'
);
p.set(
    xlabel='Pregnancy length (weeks)',
    ylabel='Count'
);

In [None]:
p = sns.kdeplot(
    data=live,
    x='prglngth',
    hue='birthcat'
);
p.set(
    xlabel='Pregnancy length (weeks)',
    ylabel='Density'
);

In [None]:
# standard deviation of pregnancy length within each birth category;
# the groupby .std() method is called directly because passing np.std to
# .agg() is deprecated in recent pandas releases
live.groupby('birthcat')['prglngth'].std()

So we might hypothesize that the standard deviation is higher. Here’s how we can test that:

In [None]:
def std_diff_one_sided(gp: GroupPair) -> np.float64:
    """
    One-sided test statistic: std of group1 minus std of group2 (signed)
    """
    std1 = gp.group1.std()
    std2 = gp.group2.std()
    return std1 - std2

In [None]:
# permutation test for the signed difference in standard deviation
actual = std_diff_one_sided(data)
test_stats = run_model(data, std_diff_one_sided, pooled_sample)
p_val = p_value(test_stats, actual)
print(f'One sided p value for std is: {p_val:0.2f}')

In [None]:
plot_hist(
    test_stats, actual, title=f'p value is {p_val:0.2f}')

This is a one-sided test because the hypothesis is that the standard deviation for first babies is higher, not just different. The p-value is 0.09, which is not statistically significant.

## Testing a correlation

This framework can also test correlations. For example, in the NSFG data set, the correlation between birth weight and mother’s age is about 0.07. It seems like older mothers have heavier babies. But could this effect be due to chance?

For the test statistic, I use Pearson’s correlation, but Spearman’s would work as well. If we had reason to expect positive correlation, we would do a one-sided test. But since we have no such reason, I’ll do a two-sided test using the absolute value of correlation.

The null hypothesis is that there is no correlation between mother’s age and birth weight. By shuffling the observed values, we can simulate a world where the distributions of age and birth weight are the same, but where the variables are unrelated:

In [None]:
live.agepreg.describe()

In [None]:
live.loc[:, ['caseid', 'prglngth','totalwgt_lb', 'agepreg']].apply(lambda col: np.sum(col.isna()))

In [None]:
live.totalwgt_lb.describe()

In [None]:
# drop rows with a missing birth weight; inplace=True modifies `live`
# itself, so every later cell works on the reduced frame
live.dropna(subset=['totalwgt_lb'], inplace=True)

In [None]:
np.corrcoef(live.agepreg.values, live.totalwgt_lb.values)

In [None]:
def cov(xs: np.array, ys: np.array) -> np.float64:
    """
    Covariance of xs and ys: the sum of the products of the deviations from
    the respective means, divided by len(xs)
    """
    x_dev = xs - xs.mean()
    y_dev = ys - ys.mean()
    return np.dot(x_dev, y_dev) / len(xs)

In [None]:
np.cov(live.loc[:, ['agepreg', 'totalwgt_lb']].to_numpy(), rowvar=False)

In [None]:
cov(live.agepreg, live.totalwgt_lb)

In [None]:
def cor(xs: np.ndarray, ys: np.ndarray) -> np.float64:
    """
    Correlation of xs and ys: their covariance divided by the square root of
    the product of their variances
    """
    scale = np.sqrt(xs.var() * ys.var())
    return cov(xs, ys) / scale

In [None]:
cor(live.agepreg, live.totalwgt_lb)

In [None]:
# test stat
def cor_test_stat(gp: GroupPair) -> np.float64:
    """
    Two-sided test statistic: absolute value of the correlation between the
    two groups
    """
    correlation = cor(gp.group1, gp.group2)
    return abs(correlation)

# null hypothesis
def cor_sample(gp: GroupPair):
    """
    Simulate the null hypothesis of no correlation: pair a random
    permutation of group1 with the unchanged group2
    """
    shuffled = np.random.permutation(gp.group1)
    return GroupPair(shuffled, gp.group2)

In [None]:
data = GroupPair(
    live.agepreg.values,
    live.totalwgt_lb.values
)
# the observed statistic must come from the same function as the simulated
# ones (absolute correlation); otherwise a negative observed correlation
# would be compared against non-negative simulated values
actual = cor_test_stat(data)
test_stats = run_model(data, cor_test_stat, cor_sample, niters=10000)
p_val = p_value(test_stats, actual)
print(f'P value: {p_val:0.3f}, Maximum simulated correlation is {max(test_stats):0.3}')

The actual correlation is 0.07. The computed p-value is 0; after 10000 iterations the largest simulated correlation is 0.04. So although the observed correlation is small, it is statistically significant.

This example is a reminder that “statistically significant” does not always mean that an effect is important, or significant in practice. It only means that it is unlikely to have occurred by chance.

In [None]:
plot_hist(
    test_stats,
    actual,
    label='correlation',
    title=f'p value for correlation test between pregancy age and birth weight is {p_val:0.2f}'
);

In [None]:
# remove global variables
del data, actual, test_stats, p_val

In this case, after 10000 attempts, we never see a sample difference as big as the observed difference, so we conclude that the apparent effect is unlikely under the null hypothesis.  Under normal circumstances, we can also make the inference that the apparent effect is unlikely to be caused by random sampling.

One final note: in this case I would report that the p-value is less than 1/10000 or less than 0.0001, because the test ran 10000 iterations.  I would not report p=0, because the apparent effect is not impossible under the null hypothesis; just unlikely.

## Errors

In this section, we'll explore the dangers of p-hacking by running multiple tests until we find one that's statistically significant.

Suppose we want to compare IQs for two groups of people.  And suppose that, in fact, the two groups are statistically identical; that is, their IQs are drawn from a normal distribution with mean 100 and standard deviation 15.

I'll use `numpy.random.normal` to generate fake data I might get from running such an experiment:

In [None]:
groups = GroupPair(
    np.random.normal(100, 15, size=100),
    np.random.normal(100, 15, size=100)
)

We expect the mean in both groups to be near 100, but just by random chance, it might be higher or lower.

In [None]:
r2(groups.means)

We can use the same permutation test as before (`run_model` with `mean_diff` and `pooled_sample`) to compute the p-value for this fake data, which is the probability that we would see a difference between the groups as big as what we saw, just by chance.

In [None]:
# two fresh samples of 100 drawn from the same normal(100, 15)
# distribution, so any observed difference in means is due to chance
groups = GroupPair(
    np.random.normal(100, 15, size=100),
    np.random.normal(100, 15, size=100)
)
actual = mean_diff(groups)
test_stats = run_model(groups, mean_diff, pooled_sample, niters=1000)
p_val = p_value(test_stats, actual)

plot_hist(
    test_stats,
    actual,
    label='difference in means',
    title=f'p value for difference in means between two randomly generated datasets is {p_val:0.2f}'
);

Just keep running it until you get a significant result

You can probably see where this is going.  If we play this game over and over (or if many researchers play it in parallel), the false positive rate can be as high as 100%.

To see this more clearly, let's simulate 100 researchers playing this game.  I'll take the code we have so far and wrap it in a function:

In [None]:
# delete global variables
del groups, actual, test_stats, p_val

In [None]:
def run_experiment(sample_size: int = 200, niters: int = 1000):
    """Simulate one experiment on two identically distributed groups.

    Both groups are drawn from normal(100, 15); the difference in means is
    then tested with the pooled-sample permutation model.

    sample_size: number of values drawn for each group
    niters: number of permutations used to estimate the p-value

    returns: p-value
    """
    first = np.random.normal(100, 15, size=sample_size)
    second = np.random.normal(100, 15, size=sample_size)
    groups = GroupPair(first, second)
    observed = mean_diff(groups)
    simulated = run_model(groups, mean_diff, pooled_sample, niters=niters)
    return p_value(simulated, observed)

Let's run it 100 times:

In [None]:
# 100 independent experiments (500 values per group, 100 permutations each);
# the displayed value is how many of them fall below the 5% threshold
p_vals = np.array([run_experiment(sample_size=500, niters=100) for _ in range(100)])
sum(p_vals < 0.05)

On average, we expect to get a false positive about 5 times out of 100.  To see why, let's plot the histogram of the p-values we got.

In [None]:
plot_hist(
    p_vals,
    0.05,
    label='p-value',
    title='A sample of p-values obtained through muliple iterations of the same experiment',
    # increments of 0.05
    bins=np.linspace(0, 1, 21)
);

The distribution of p-values is uniform from 0 to 1.  So it falls below 5% about 5% of the time.

If the threshold for statistical significance is 5%, the probability of a false positive is 5%.  You might hope that things would get better with larger sample sizes, but they don't.  Run this experiment again with a larger sample size, and see for yourself.

## Chi-square test of pregnancy length

In [None]:
class PregLengthTest:
    """
    Chi-squared statistic comparing each group's pregnancy-length counts
    with the counts expected from the pooled distribution.

    Only the values 35 through 43 are considered.
    """

    def __init__(self, gp: GroupPair):
        # size of the first group
        self.n = len(gp.group1)
        pooled = np.hstack((gp.group1, gp.group2))
        pmf = Pmf.from_seq(pooled)
        self.values = np.arange(35, 44)
        # presumably pmf.prob(v) is the pooled relative frequency of v -
        # TODO confirm against the Pmf implementation
        self.expected_probs = np.array([pmf.prob(week) for week in self.values])

    def test_statistic(self, gp: GroupPair):
        # total chi-squared: first group plus second group
        first_part = self.chi_squared(gp.group1)
        second_part = self.chi_squared(gp.group2)
        return first_part + second_part

    def chi_squared(self, lengths):
        counts = Counter(lengths)
        observed = np.array([counts[week] for week in self.values])
        # scale the expected probabilities to this group's size
        expected = self.expected_probs * len(lengths)
        terms = (observed - expected)**2 / expected
        return sum(terms)

If we specifically test the deviations of first babies and others from the expected number of births in each week of pregnancy, the results are statistically significant with a very small p-value.  But at this point we have run so many tests, we should not be surprised to find at least one that seems significant.

In [None]:
# observed chi-squared statistic for first babies vs. others
groups = GroupPair(
    firsts.values,
    others.values
)
ht = PregLengthTest(groups)
actual = ht.test_statistic(groups)
print(f'actual: {actual}')

In [None]:
# 1000 simulated statistics under the pooled-sample null model; the largest
# simulated value is printed for comparison with the observed one
test_stats = run_model(groups, ht.test_statistic, pooled_sample, niters=1000)
p_val = p_value(test_stats, actual)
print(f'p-value: {p_val}')
print(f'ts max: {np.max(test_stats)}')

## Statistical Power

In the previous section, we computed the false positive rate, which is the probability of seeing a "statistically significant" result, even if there is no statistical difference between groups.

Now let's ask the complementary question: if there really is a difference between groups, what is the chance of seeing a "statistically significant" result?

The answer to this question is called the "power" of the test.  It depends on the sample size (unlike the false positive rate), and it also depends on how big the actual difference is.

We can estimate the power of a test by running simulations similar to the ones in the previous section.  Here's a version that takes the actual difference between groups as a parameter:

In [None]:
def run_experiment_with_difference(actual_diff: int, sample_size=100, niters = 200) -> float:
    """
    Simulate one experiment on two groups whose true means differ.

    The groups are drawn from normal(100, 15) and normal(100 + actual_diff, 15);
    the difference in sample means is then tested with the pooled-sample
    permutation model.

    actual_diff: The actual difference between the group means.
    sample_size: number of values drawn for each group
    niters: number of permutations used to estimate the p-value

    returns: p-value
    """
    baseline = np.random.normal(100, 15, size=sample_size)
    shifted = np.random.normal(100 + actual_diff, 15, size=sample_size)
    groups = GroupPair(baseline, shifted)
    observed = mean_diff(groups)
    simulated = run_model(groups, mean_diff, pooled_sample, niters=niters)
    return p_value(simulated, observed)

Now let's run it 100 times with an actual difference of 5:

In [None]:
# 100 experiments with a true difference of 5 (default sample size and
# iteration count); the displayed value is how many reach p < 0.05
p_values = np.array([run_experiment_with_difference(5) for i in range(100)])
sum(p_values < 0.05)

With sample size 100 and an actual difference of 5, the power of the test is approximately 65%.  That means if we ran this hypothetical experiment 100 times, we'd expect a statistically significant result about 65 times.

That's pretty good, but it also means we would NOT get a statistically significant result about 35 times, which is a lot.

Again, let's look at the distribution of p-values:

In [None]:
plot_hist(
    p_values,
    0.05,
    label='p-value',
    title='A sample of p-values with an effect size of 5',
    bins=np.linspace(0, 1, 21)
);

## Power

Here's the function that estimates the probability of a non-significant p-value even if there really is a difference between the groups.

In [None]:
def resample(xs: np.array) -> np.array:
    """
    Draw len(xs) values from xs at random, with replacement
    """
    size = len(xs)
    return np.random.choice(xs, size, replace=True)

def group_resampler(data: GroupPair) -> GroupPair:
    """
    Build a new pair by resampling each group independently, with
    replacement, keeping the group sizes
    """
    first = resample(data.group1)
    second = resample(data.group2)
    return GroupPair(first, second)

In [None]:
# NOTE: the groups are passed as they are here, without .values as in the
# earlier cell
data = GroupPair(firsts, others)
actual = mean_diff(data)
# actual difference is 0.078 weeks
print(f'actual: {actual:.4f}')

In [None]:
test_stats = run_model(data, mean_diff, group_resampler, niters=1000)

In [None]:
print(f'p-value: {p_value(test_stats, actual)}')

In [None]:
def false_negative_rate(data: GroupPair, num_runs=100):
    """Computes the chance of a false negative based on resampling.

    Each run simulates a new experiment by resampling both groups with
    replacement, then tests the resampled pair with the pooled-sample
    permutation test (100 permutations). A run is a false negative when
    its p-value is greater than 0.05.

    data: pair of sequences
    num_runs: how many experiments to simulate

    returns: float false negative rate
    """
    misses = 0
    for _ in range(num_runs):
        # a simulated experiment drawn from the observed groups
        sample = group_resampler(data)
        # null distribution and p-value for this simulated experiment
        simulated = run_model(sample, mean_diff, pooled_sample, niters=100)
        if p_value(simulated, mean_diff(sample)) > 0.05:
            # not significant although the groups were treated as different
            misses += 1
    return misses / num_runs

In [None]:
false_negative_rate(data, num_runs=100)

In this example, the false negative rate is 70%, which means that the power of the test (probability of statistical significance if the actual difference is 0.078 weeks) is only 30%.

Here's the point of this example: if you get a negative result (no statistical significance), that is not always strong evidence that there is no difference between the groups.  It is also possible that the power of the test was too low; that is, that it was unlikely to produce a positive result, even if there is a difference between the groups.

**Exercise:** As sample size increases, the power of a hypothesis test increases, which means it is more likely to be positive if the effect is real. Conversely, as sample size decreases, the test is less likely to be positive even if the effect is real.

To investigate this behavior, run the tests in this chapter with different subsets of the NSFG data.

What happens to the p-values of these tests as sample size decreases? What is the smallest sample size that yields a positive test?

In [None]:
# make sure we have no null values in our features of interest
live.loc[:, ['prglngth', 'totalwgt_lb']].apply(lambda col: sum(col.isna()))

In [None]:
# repeatedly divide a starting value until you can divide no more
def subdivide(n, by=2, lower_limit = 1) -> List[int]:
    """
    Repeatedly floor-divide n by `by`, collecting the values that are still
    greater than lower_limit, and finish the list with lower_limit itself.

    n: starting value
    by: divisor applied at each step; must be greater than 1
    lower_limit: smallest value in the result; must not be negative

    returns: decreasing list from n down to lower_limit, or an empty list
             when n is already below lower_limit

    raises: ValueError if by <= 1 or lower_limit < 0 (the loop would not
            terminate)
    """
    if by <= 1:
        raise ValueError('by must be greater than 1')
    if lower_limit < 0:
        raise ValueError('lower_limit must not be negative')
    if n < lower_limit:
        return []
    vals = []
    while n > lower_limit:
        vals.append(n)
        n //= by
    # every collected value is greater than lower_limit, so it always goes last
    vals.append(lower_limit)
    return vals

In [None]:
subdivide(len(live))

In [None]:
subdivide(len(live), lower_limit=50)

In [None]:
def sample_rows(df: pd.DataFrame, nrows: int, replace=False) -> pd.DataFrame:
    """Pick nrows index labels at random and return the matching rows.

    df: DataFrame
    nrows: number of rows
    replace: whether to sample with replacement

    returns: DataFrame
    """
    picked = np.random.choice(df.index, nrows, replace=replace)
    return df.loc[picked]


def resample_rows(df: pd.DataFrame) -> pd.DataFrame:
    """Draw as many rows as df has, at random and with replacement.

    df: DataFrame

    returns: DataFrame
    """
    n_rows = len(df)
    return sample_rows(df, n_rows, replace=True)

In [None]:
# Solution

def run_tests(live: pd.DataFrame, iters=1000):
    """Runs the difference-in-means tests from Chapter 9 on a subset of the data.

    live: DataFrame
    iters: how many iterations to run

    returns: list of two p-values, for 'prglngth' and 'totalwgt_lb' in that
             order
    """
    # indicator vectors
    firsts = live.birthcat == 'firsts'
    others = live.birthcat == 'others'
    pvals = []
    # test differences of means
    # TODO: test for correlation and chi-square
    for col in ('prglngth', 'totalwgt_lb'):
        values = live[col].values
        data = GroupPair(values[firsts], values[others])
        estimates = run_model(
            data,
            mean_diff,
            pooled_sample,
            niters=iters
        )
        pvals.append(p_value(estimates, mean_diff(data)))
    return pvals

In [None]:
# run the tests on random subsets of decreasing size (down to 50 rows) and
# tabulate the p-values, one row per sample size
pvals = []
sample_sizes = subdivide(len(live), lower_limit=50)
for n in sample_sizes:
    pvals.append(run_tests(sample_rows(live, n), iters=100))
pd.DataFrame(
    np.vstack(pvals),
    index=sample_sizes,
    columns=['diff_mean_pregancy_length', 'diff_mean_birth_weight']
)

Conclusion: As expected, tests that are positive with large sample sizes become negative as we take away data.  But the pattern is erratic, with some positive tests even at small sample sizes.