## Pooled variances considered harmful

This answer on Math Stack Exchange explains in detail why we don't want
to use the t-test with pooled variances:
https://math.stackexchange.com/a/3442883/46349


In [3]:
import numpy as np
from scipy.stats import ttest_ind
from scipy.stats import norm


In [12]:
# Pathological scenario for the pooled-variance t-test: the small sample is
# drawn from the population with the much larger spread
# (sigma1 >> sigma2 while n1 << n2). Both populations share the same mean,
# so the null hypothesis of equal means is true by construction.
mu1, mu2 = 100, 100      # population means
sigma1, sigma2 = 20, 5   # population standard deviations
n1, n2 = 10, 40          # sample sizes

In [13]:
# Monte Carlo estimate of the false-positive rate of the pooled-variance
# (Student's) t-test. The parameter cell sets mu1 == mu2, so every rejection
# counted below is a type I error.
np.random.seed(1119)  # seeds NumPy's global RNG, which scipy's rvs() uses by default

N = 5000      # number of simulated experiments
alpha = 0.05  # nominal significance level

# Freeze the two normal distributions once: building them inside the loop is
# loop-invariant work, and hoisting it leaves the random stream unchanged.
rv1 = norm(mu1, sigma1)
rv2 = norm(mu2, sigma2)

pvalues_pooled = np.zeros(N)
for i in range(N):
    x1 = rv1.rvs(n1)
    x2 = rv2.rvs(n2)
    res = ttest_ind(x1, x2, equal_var=True)  # pooled-variances t-test
    pvalues_pooled[i] = res.pvalue

# Fraction of simulations that reject H0 at level alpha; a well-calibrated
# test would give a value close to alpha.
np.mean(pvalues_pooled < alpha)

0.2926

In [11]:
# Monte Carlo estimate of the false-positive rate of Welch's t-test (no
# equal-variance assumption). The parameter cell sets mu1 == mu2, so every
# rejection counted below is a type I error.
np.random.seed(1119)  # seeds NumPy's global RNG, which scipy's rvs() uses by default

N = 5000      # number of simulated experiments
alpha = 0.05  # nominal significance level

# Freeze the two normal distributions once: building them inside the loop is
# loop-invariant work, and hoisting it leaves the random stream unchanged.
rv1 = norm(mu1, sigma1)
rv2 = norm(mu2, sigma2)

pvalues_welch = np.zeros(N)
for i in range(N):
    x1 = rv1.rvs(n1)
    x2 = rv2.rvs(n2)
    res = ttest_ind(x1, x2, equal_var=False)  # Welch's t-test
    pvalues_welch[i] = res.pvalue

# Fraction of simulations that reject H0 at level alpha; a well-calibrated
# test would give a value close to alpha.
np.mean(pvalues_welch < alpha)

0.0508