# Analytic Methods

In [None]:
from typing import List, Tuple, Callable
from functools import partial

In [None]:
import numpy as np
import pandas as pd
import random

In [None]:
from scipy import stats

In [None]:
import sys
sys.path.append('lib')

In [None]:
import compstats
import nsfg
import hypothesis

from cdf import Cdf
from normal import Normal

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from IPython.core.pylabtools import figsize
sns.set_theme()
figsize(9, 5)

In [None]:
# Shorthand used throughout the notebook: round to two decimal places.
r2 = partial(np.round, decimals=2)

## Normal distributions

As a motivating example, let’s review the problem from [Estimation](08_Estimation.ipynb)

Suppose you are a scientist studying gorillas in a wildlife preserve. Having weighed 9 gorillas, you find sample mean $\bar{x} = 90kg$ and sample standard deviation, $S = 7.5 kg$. If you use $\bar{x}$ to estimate the population mean, what is the standard error of the estimate?

If we know the parameters of the sampling distribution, we can compute confidence intervals and p-values analytically, which is computationally faster than resampling.

In [None]:
def qnorm(p, mu=0, sigma=1):
    """Evaluates the quantile function (inverse CDF) of a normal distribution.

    p: probability, or sequence of probabilities
    mu: mean of the distribution
    sigma: standard deviation of the distribution

    returns: the quantile(s) corresponding to p
    """
    quantile = stats.norm(loc=mu, scale=sigma).ppf
    return quantile(p)

Here's the confidence interval for the estimated mean.

In [None]:
qnorm((0.05, 0.95), mu=90, sigma=2.5)

`normal.py` provides a `Normal` class that encapsulates what we know about arithmetic operations on normal distributions.

In [None]:
from normal import Normal

In [None]:
dist = Normal(90, 7.5**2)
dist

We can use it to compute the sampling distribution of the mean with sample size 9.

In [None]:
dist_xbar = dist.sum(9) / 9
dist_xbar.sigma

And then compute a confidence interval.

In [None]:
dist_xbar.percentile((0.05, 0.95))

## Central Limit Theorem

As we saw in the previous sections, if we add values drawn from normal distributions, the distribution of the sum is normal. Most other distributions don’t have this property; if we add values drawn from other distributions, the sum does not generally have an analytic distribution.

But if we add up n values from almost any distribution, the distribution of the sum converges to normal as n increases.

More specifically, if the distribution of the values has mean and standard deviation μ and σ, the distribution of the sum is approximately $\mathcal{N}(n \mu, n \sigma^2)$.

This result is the Central Limit Theorem (CLT). It is one of the most useful tools for statistical analysis, but it comes with caveats:

- The values have to be drawn independently. If they are correlated, the CLT doesn’t apply (although this is seldom a problem in practice).
- The values have to come from the same distribution (although this requirement can be relaxed).
- The values have to be drawn from a distribution with finite mean and variance. So most Pareto distributions are out.
- The rate of convergence depends on the skewness of the distribution. Sums from an exponential distribution converge for small n. Sums from a lognormal distribution require larger sizes.

The Central Limit Theorem explains the prevalence of normal distributions in the natural world. Many characteristics of living things are affected by genetic and environmental factors whose effect is additive. The characteristics we measure are the sum of a large number of small effects, so their distribution tends to be normal.

## Testing the CLT

The following function draws repeated samples of a given size from an exponential distribution and returns the sum of each sample.

In [None]:
def make_expo_sample(sample_size: int, beta=2.0, iters=1000):
    """Draws repeated exponential samples and sums each one.

    sample_size: number of values drawn for each sum
    beta: scale parameter passed to np.random.exponential
    iters: number of sums to generate

    returns: NumPy array with one sum per iteration
    """
    sums = []
    for _ in range(iters):
        draws = np.random.exponential(beta, sample_size)
        sums.append(np.sum(draws))
    return np.array(sums)
        

In [None]:
# 1000 sums of random exponential samples of size 100
samples = make_expo_sample(100, iters=1000)

In [None]:
samples[:10]

The mean should be close to 100*2.0

In [None]:
r2(samples.mean())

And the values should be normally distributed

In [None]:
p = sns.ecdfplot(
    x = samples
);
p.set(
    xlabel = 'Sums',
    ylabel = 'CDF',
    title = 'CDF of 1000 sums of exponential samples of size 100'
);

In [None]:
def normal_qq_plot(values: np.array, label: str, title: str, legend=True, trim=True):
    """Draws a normal probability plot of values with a fitted reference line.

    values: NumPy array of observations
    label: y-axis label
    title: plot title
    legend: whether to add a legend
    trim: if True, estimate the mean and variance with
          compstats.trimmed_mean_var(values, p=0.01); otherwise use the
          plain mean and variance of values
    """
    z_range = (-5, 5)
    if trim:
        mu, var = compstats.trimmed_mean_var(values, p=0.01)
    else:
        mu, var = values.mean(), values.var()
    zs, observed = compstats.normal_qq(values)
    # reference line for a normal model with the estimated parameters
    # (presumably intercept mu and slope sigma -- confirm against compstats.fit_line)
    model_ys = compstats.fit_line(np.array(z_range), mu, np.sqrt(var))
    plt.plot(zs, observed, label='data')
    plt.plot(z_range, model_ys, label='fitted', linestyle='dashed')
    plt.xlabel('z')
    plt.xlim(z_range)
    plt.ylabel(label)
    plt.title(title)
    if legend:
        plt.legend(loc='upper left')

In [None]:
# y-axis label spelling fixed ("exponental" -> "exponential")
normal_qq_plot(samples, label = 'Sum of exponential values', title='Sample size = 100')

Let's take a look at how the distribution of the sums converges as the sample size n increases.

In [None]:
# Sample sizes to compare; one subplot per size.
sample_sizes = [1, 10, 100]
fig, axs = plt.subplots(
    nrows=1,
    ncols=len(sample_sizes),
    figsize=(13, 5,)
)
for i, sample_size in enumerate(sample_sizes):
    # sums of exponential samples of this size (rebinds the global `samples`)
    samples = make_expo_sample(sample_size)
    # empirical CDF of the sums, drawn on the i-th axis
    p = sns.ecdfplot(
        x=samples,
        ax=axs[i]
    )
    p.set(
        xlabel = 'Sums',
        ylabel = 'CDF',
        title = f'Sample size = {sample_size}'
    );

The following plot shows how the sum of exponential variates converges to normal as sample size increases.

In [None]:
# One normal probability plot per sample size, side by side.
fig, axs = plt.subplots(nrows=1, ncols=len(sample_sizes), figsize=(13, 5))
for i, sample_size in enumerate(sample_sizes):
    # normal_qq_plot draws through the pyplot interface, so select the
    # panel first (subplot indices start at 1)
    plt.subplot(1, len(sample_sizes), i+1)
    normal_qq_plot(
        values = make_expo_sample(sample_size),
        label = '',
        title = f'Sample size = {sample_size}',
        legend=False
    )
fig.suptitle('Sums of exponential values');

In [None]:
def sample_qq_plots(sample_sizes: List[int], sampler: Callable, title: str, trim=True):
    """Draws a row of normal probability plots, one per sample size.

    sample_sizes: sample sizes to plot, left to right
    sampler: function that takes a sample size and returns the values to plot
    title: overall figure title
    trim: passed through to normal_qq_plot
    """
    n_panels = len(sample_sizes)
    fig, _ = plt.subplots(nrows=1, ncols=n_panels, figsize=(13, 5))
    for panel, size in enumerate(sample_sizes, start=1):
        # normal_qq_plot draws on the current axes, so select the panel first
        plt.subplot(1, n_panels, panel)
        normal_qq_plot(
            values=sampler(size),
            label='',
            title=f'Sample size = {size}',
            legend=False,
            trim=trim,
        )
    fig.suptitle(title)

In [None]:
sample_qq_plots(sample_sizes, make_expo_sample, 'Sums of exponential values');

The lognormal distribution has higher variance, so it requires a larger sample size before it converges to normal.

In [None]:
def make_lognormal_sample(sample_size: int, mu=1.0, sigma=1.0, iters=1000):
    """Draws repeated lognormal samples and sums each one.

    sample_size: number of values drawn for each sum
    mu: first parameter passed to np.random.lognormal
    sigma: second parameter passed to np.random.lognormal
    iters: number of sums to generate

    returns: NumPy array with one sum per iteration
    """
    def draw_sum():
        return np.sum(np.random.lognormal(mu, sigma, sample_size))

    return np.array([draw_sum() for _ in range(iters)])

In [None]:
sample_qq_plots(sample_sizes, make_lognormal_sample, 'Sums of lognormal values');

Depending on its parameters, the Pareto distribution can have infinite variance (when α ≤ 2) and even infinite mean (when α ≤ 1).  In those cases it violates the requirements of the CLT and its sums do not generally converge to normal.

In [None]:
def make_pareto_sample(sample_size: int, alpha=1.0, iters=1000):
    """Draws repeated Pareto samples and sums each one.

    sample_size: number of values drawn for each sum
    alpha: shape parameter passed to np.random.pareto
    iters: number of sums to generate

    returns: NumPy array with one sum per iteration
    """
    totals = []
    for _ in range(iters):
        variates = np.random.pareto(alpha, sample_size)
        totals.append(np.sum(variates))
    return np.array(totals)

In [None]:
sample_qq_plots(sample_sizes, make_pareto_sample, 'Sums of pareto values');

If the random variates are correlated, that also violates the CLT, so the sums don't generally converge.

To generate correlated values, we generate correlated normal values and then transform to whatever distribution we want.

In [None]:
def generate_correlated(rho, n):
    """Generates a sequence of correlated values from a standard normal dist.

    Each value after the first is drawn from a normal distribution centered
    at rho times its predecessor, with standard deviation sqrt(1 - rho**2).

    rho: coefficient of correlation between successive values
    n: length of sequence; nothing is generated when n <= 0

    returns: iterator
    """
    # without this guard the first value would be yielded even for n <= 0
    if n <= 0:
        return
    x = random.gauss(0, 1)
    yield x
    # each subsequent value depends on its predecessor
    sigma = np.sqrt(1 - rho**2)
    for _ in range(n - 1):
        # random.gauss takes the standard deviation as its second argument,
        # not the variance
        x = random.gauss(x * rho, sigma)
        yield x

In [None]:
def generate_expo_correlated(rho: float, n: int):
    """Generates a sequence of correlated values from an exponential dist.

    The values are produced by generating correlated standard normal values
    and mapping them through the normal CDF and then the inverse
    exponential CDF.

    rho: coefficient of correlation of the underlying normal values
    n: length of sequence

    returns: NumPy array
    """
    zs = list(generate_correlated(rho, n))
    # normal CDF maps the values to uniform; the inverse exponential CDF
    # then maps the uniform values to exponential
    return stats.expon.ppf(stats.norm.cdf(zs))

In [None]:
def make_correlated_sample(sample_size: int, rho=0.9, iters=1000):
    """Sums repeated sequences of correlated exponential values.

    sample_size: number of correlated values in each sum
    rho: correlation
    iters: number of sums to generate

    returns: NumPy array with one sum per iteration
    """
    def draw_sum():
        return np.sum(generate_expo_correlated(rho, sample_size))

    return np.array([draw_sum() for _ in range(iters)])
        

In [None]:
sample_qq_plots(
    sample_sizes,
    partial(make_correlated_sample, rho=0.9, iters=1000),
    'Sum of correlated exponential values'
);

## Applying the CLT

Let's use analytic methods to compute a CI and p-value for an observed difference in means.

The distribution of pregnancy length is not normal, but it has finite mean and variance, so the sum (or mean) of a few thousand samples is very close to normal.

In [None]:
live = nsfg.read_live_fem_preg()

In [None]:
firsts = live.birthcat == 'firsts'
others = live.birthcat == 'others'

To see why the Central Limit Theorem is useful, let’s get back to the example in Section 9.3: testing the apparent difference in mean pregnancy length for first babies and others. As we’ve seen, the apparent difference is about 0.078 weeks:

In [None]:
delta = np.abs(np.diff(live.groupby('birthcat')['prglngth'].mean()).item())
np.round(delta, 3)

The following function computes the sampling distribution of the mean for a set of values and a given sample size.

In [None]:
def sampling_dist_mean(data: np.array, n: int) -> Normal:
    """Computes the sampling distribution of the mean.

    Builds a Normal from the mean and variance of data, sums n copies of it
    and divides the result by n.

    data: sequence of values representing the population
    n: sample size

    returns: Normal object

    NOTE(review): data.var() uses the default of whatever container is
    passed; NumPy arrays and pandas Series use different ddof defaults --
    confirm which one is intended.
    """
    population = Normal(data.mean(), data.var())
    total = population.sum(n)
    return total / n

Here are the sampling distributions for the means of the two groups under the null hypothesis.

In [None]:
dist1 = sampling_dist_mean(live.prglngth, np.sum(firsts))
dist2 = sampling_dist_mean(live.prglngth, np.sum(others))

And the sampling distribution for the difference in means.

In [None]:
dist = dist1 - dist2

In [None]:
dist

Under the null hypothesis, here's the chance of exceeding the observed difference.

In [None]:
r2(1 - dist.prob(delta))

And the chance of falling below the negated difference.

In [None]:
r2(dist.prob(-delta))

The sum of these probabilities is the two-sided p-value.

In [None]:
r2(2 * dist.prob(-delta))

which is consistent with the estimate in [Hypothesis Testing](09_Hypothesis_Testing.ipynb), which was 0.17

## Testing a correlation

Under the null hypothesis (that there is no correlation), the sampling distribution of the observed correlation (suitably transformed) is a "Student t" distribution.

The method is based on this mathematical result: given two variables that are normally distributed and uncorrelated, if we generate a sample with size n, compute Pearson’s correlation, r, and then compute the transformed correlation

$$
t = r \sqrt{\frac{n-2}{1-r^2}}
$$

the distribution of t is Student’s t-distribution with parameter n − 2. The t-distribution is an analytic distribution; the CDF can be computed efficiently using gamma functions.

We can use this result to compute the sampling distribution of correlation under the null hypothesis; that is, if we generate uncorrelated sequences of normal values, what is the distribution of their correlation? `student_cdf` takes the sample size, n, and returns the sampling distribution of correlation:

In [None]:
def compute_t(r: float, n: int) -> float:
    """Transforms a correlation coefficient into the corresponding t statistic.

    r: Pearson's correlation
    n: sample size

    returns: r * sqrt((n-2) / (1-r**2))
    """
    dof = n - 2
    unexplained = 1 - r**2
    return r * np.sqrt(dof / unexplained)

To get from ts to the correlation coefficients, rs, we apply the inverse transform,

$$
r = \frac{t}{\sqrt{n-2+t^2}}
$$

The result is the sampling distribution of r under the null hypothesis.

In [None]:
def student_cdf(n: int) -> Cdf:
    """Computes the CDF of correlations from uncorrelated variables.

    Evaluates the Student t CDF with n-2 degrees of freedom on a grid of
    101 t values from -3 to 3 and maps each t back to a correlation.

    n: sample size

    returns: Cdf
    """
    dof = n - 2
    ts = np.linspace(-3, 3, 101)
    # inverse of t = r * sqrt((n-2) / (1-r**2))
    rs = ts / np.sqrt(dof + ts**2)
    ps = stats.t.cdf(ts, df=dof)
    return Cdf(rs, ps)

The following is a `HypothesisTest` that uses permutation to estimate the sampling distribution of a correlation. 

In [None]:
def cor_test_stat(gp: hypothesis.GroupPair) -> np.float64:
    """Test statistic: correlation coefficient between gp.group1 and gp.group2."""
    corr_matrix = np.corrcoef(gp.group1, gp.group2)
    # the off-diagonal entry is the correlation between the two groups
    return corr_matrix[0, 1]

In [None]:
# pair up the two columns whose correlation is being tested
data = hypothesis.GroupPair(
    live.agepreg.values,
    live.totalwgt_lb.values
)
# observed correlation
actual = cor_test_stat(data)
# test statistics from 10000 runs driven by hypothesis.permutation_sampler
test_stats = hypothesis.run_model(
    data,
    cor_test_stat,
    hypothesis.permutation_sampler,
    niters=10000
)
# p-value of the observed correlation relative to the simulated statistics
p_val = hypothesis.p_value(test_stats, actual)

In [None]:
# cdf of the null hypothesis (no-correlation)
model_cdf = student_cdf(len(live))
# our sample distribution
sample_cdf = Cdf.from_seq(test_stats)

Now we can estimate the sampling distribution by permutation and compare it to the Student t distribution.

In [None]:
figsize(8, 6)
plt.plot(
    model_cdf.xs,
    model_cdf.ps,
    color='darkred',
    alpha=0.5,
    label='Student t'
)
sns.ecdfplot(
    x=test_stats,
    alpha=0.5,
    label='sample'
)
# plt.plot(
#     sample_cdf.xs,
#     sample_cdf.ps,
#     color='royalblue',
#     alpha=0.5,
#     label='sample'
# )
plt.xlabel('correlation')
plt.ylabel('CDF')
plt.legend(loc='lower right');

They are nearly identical. Although the actual distributions are not normal, Pearson’s coefficient of correlation is based on sample means and variances. By the Central Limit Theorem, these moment-based statistics are normally distributed even if the data are not.

From the above figure, we can see that the observed correlation, 0.07, is unlikely to occur if the variables are actually uncorrelated. Using the analytic distribution, we can compute just how unlikely:

In [None]:
n = len(live)
# compute the t that corresponds to the observed correlation (`actual`)
t_stat = compute_t(actual, n)
# upper-tail probability of the t distribution with n-2 degrees of freedom;
# the survival function equals 1 - CDF but does not round to zero when the
# p-value is very small
p_val = stats.t.sf(t_stat, df=n-2)
print(f'r: {actual:0.2f}, t: {t_stat:0.2f}, pval: {p_val:0.4f}')

This example demonstrates an advantage of the analytic method: we can compute very small p-values. But in practice it usually doesn’t matter.

##  Chi-squared test

The reason the chi-squared statistic is useful is that we can compute its distribution under the null hypothesis analytically.

In [None]:
def chi_squared_cdf(n):
    """Discrete approximation of the chi-squared CDF with df=n-1.

    The CDF is evaluated on a grid of 101 points from 0 to 25.

    n: number of categories (the distribution has n-1 degrees of freedom)

    returns: Cdf
    """
    dof = n - 1
    grid = np.linspace(0, 25, 101)
    return Cdf(grid, stats.chi2.cdf(grid, df=dof))

Again, we can confirm the analytic result by comparing values generated by simulation with the analytic distribution.

In [None]:
data = [8, 9, 19, 5, 8, 11]
dt = hypothesis.DiceChiTest(data)
p_value = dt.p_value(iters=1000)
print(f'pval: {p_value}')

In [None]:
n = len(data)
model_cdf = chi_squared_cdf(n)

In [None]:
plt.plot(
    model_cdf.xs,
    model_cdf.ps,
    color='darkred',
    alpha=0.5,
    label='Chi squared'
)
sns.ecdfplot(
    x=dt.test_stats,
    alpha=0.5,
    label='sample'
)
plt.xlabel('chi-squared statistic')
plt.ylabel('CDF')
plt.legend(loc='lower right');

And then we can use the analytic distribution to compute p-values.

In [None]:
chi2 = dt.actual
# the survival function equals 1 - CDF but stays accurate for very small
# p-values, where 1 - cdf would round to zero
p_val = stats.chi2.sf(chi2, df=n-1)
print(f'chi2: {chi2}, p-value: {p_val:.4f}')

## Exercises

**Exercise:**    In Section 5.4, we saw that the distribution of adult weights is approximately lognormal. One possible explanation is that the weight a person gains each year is proportional to their current weight. In that case, adult weight is the product of a large number of multiplicative factors:

$$
w = w_0 f_1 f_2 \cdots f_n  
$$

where w is adult weight, $w_0$ is birth weight, and $f_i$ is the weight gain factor for year i.

The log of a product is the sum of the logs of the factors:

$$
log(w) = log(w_0) + log(f_1) + log(f_2) + \cdots + log(f_n) 
$$

So by the Central Limit Theorem, the distribution of log(w) is approximately normal for large n, which implies that the distribution of w is lognormal.

To model this phenomenon, choose a distribution for f that seems reasonable, then generate a sample of adult weights by choosing a random value from the distribution of birth weights, choosing a sequence of factors from the distribution of f, and computing the product. What value of n is needed to converge to a lognormal distribution?

In [None]:
def generate_adult_weight(birth_weights: np.array, n: int):
    """Generate a random adult weight by simulating annual gain.

    Picks a random birth weight and multiplies it by n yearly growth
    factors drawn from a normal distribution with mean 1.09 and standard
    deviation 0.03.

    birth_weights: sequence of birth weights in lbs
    n: number of years to simulate

    returns: adult weight in lbs
    """
    start_weight = random.choice(birth_weights)
    yearly_factors = np.random.normal(1.09, 0.03, n)
    return start_weight * np.prod(yearly_factors)

In [None]:
birth_weights = live.totalwgt_lb.values
aws = [generate_adult_weight(birth_weights, 40) for _ in range(1000)]
log_aws = np.log10(aws)

In [None]:
normal_qq_plot(
    log_aws,
    label = 'adult weight (log10 lbs)',
    title='Log normal weight distribution after 40 years'
)

1. With n=40 the distribution is approximately lognormal except for the lowest weights.
1. Actual distribution might deviate from lognormal because it is a mixture of people at different ages, or because annual weight gains are correlated.

**Exercise:** In Section 14.6 we used the Central Limit Theorem to find the sampling distribution of the difference in means, δ, under the null hypothesis that both samples are drawn from the same population.

We can also use this distribution to find the standard error of the estimate and confidence intervals, but that would only be approximately correct. To be more precise, we should compute the sampling distribution of δ under the alternate hypothesis that the samples are drawn from different populations.

Compute this distribution and use it to calculate the standard error and a 90% confidence interval for the difference in means.

In [None]:
delta

In [None]:
firsts_mean, others_mean = live.groupby('birthcat')['prglngth'].mean()

In [None]:
# Here's the observed difference in means
delta = firsts_mean - others_mean
delta

Under the null hypothesis, both sampling distributions are based on all live births.

In [None]:
dist1 = sampling_dist_mean(live.prglngth, firsts.sum())
dist2 = sampling_dist_mean(live.prglngth, others.sum())
dist_diff_null = dist1 - dist2
print(f'null hypothesis: {dist_diff_null}')
print(f'{dist_diff_null.prob(-delta):0.4f}, {1 - dist_diff_null.prob(delta):0.4f}')

Under the alternate hypothesis, each sampling distribution is based on the observed parameters.

In [None]:
# `firsts` and `others` are boolean masks over all of `live`, so len() would
# give the total number of rows; the group sizes are the counts of True values.
dist1 = sampling_dist_mean(live.prglngth[firsts].values, firsts.sum())
dist2 = sampling_dist_mean(live.prglngth[others].values, others.sum())
dist_diff_alt = dist1 - dist2
print(f'estimated params: {dist_diff_alt}')
# 90% confidence interval for the difference in means
print(f'{dist_diff_alt.percentile(0.05):0.4f}, {dist_diff_alt.percentile(0.95):0.4f}')

In [None]:
def plot_normal_dist(dist: Normal, label=None):
    """Plots the curve returned by dist.render() as a line.

    dist: Normal object
    label: optional legend label for the line
    """
    xs, ys = dist.render()
    # only pass a label when one was given
    opts = {'label': label} if label else {}
    sns.lineplot(x=xs, y=ys, **opts)

In [None]:
plot_normal_dist(dist_diff_null, label='null hypothesis');
plot_normal_dist(dist_diff_alt, label = 'estimated params');
# axis label fixed: the closing parenthesis was missing
plt.xlabel('Difference in means (weeks)');
plt.ylabel('CDF');
plt.xlim([-0.20, 0.25]);
plt.legend(loc='lower right');

**Exercise:** [In a recent paper](http://ieeexplore.ieee.org/document/7044435/), Stein et al. investigate the effects of an intervention intended to mitigate gender-stereotypical task allocation within student engineering teams.

Before and after the intervention, students responded to a survey that asked them to rate their contribution to each aspect of class projects on a 7-point scale.

Before the intervention, male students reported higher scores for the programming aspect of the project than female students; on average men reported a score of 3.57 with standard error 0.28. Women reported 1.91, on average, with standard error 0.32.

Compute the sampling distribution of the gender gap (the difference in means), and test whether it is statistically significant. Because you are given standard errors for the estimated means, you don’t need to know the sample size to figure out the sampling distributions.

After the intervention, the gender gap was smaller: the average score for men was 3.44 (SE 0.16); the average score for women was 3.18 (SE 0.16). Again, compute the sampling distribution of the gender gap and test it.

Finally, estimate the change in gender gap; what is the sampling distribution of this change, and is it statistically significant?

In [None]:
def plot_dist(dist: Normal, invert=True):
    """Prints a summary of a sampling distribution.

    Reports the mean, a p-value relative to zero, the 5th and 95th
    percentiles, and the standard error.

    dist: Normal object
    invert: if True the p-value is 1 - dist.prob(0); otherwise it is
            dist.prob(0)
    """
    below_zero = dist.prob(0)
    pval = 1 - below_zero if invert else below_zero
    print(f'mean   : {dist.mu:0.2f}, p-value: {pval:0.4f}')
    lower, upper = dist.percentile(0.05), dist.percentile(0.95)
    print(f'CI     : ({lower:0.2f}, {upper:0.2f})')
    print(f'stderr : {dist.sigma:0.2f}')

In [None]:
# these are sampling distributions
male_before = Normal(3.57, 0.28**2)
male_after = Normal(3.44, 0.16**2)

female_before = Normal(1.91, 0.32**2)
female_after = Normal(3.18, 0.16**2)

In [None]:
# before
diff_before = female_before - male_before
plot_dist(diff_before)

In [None]:
# after
diff_after = female_after - male_after
plot_dist(diff_after)

In [None]:
# difference
diff = diff_after - diff_before
plot_dist(diff, invert=False)

1. Gender gap before intervention was 1.66 points (p-value 5e-5)
1. Gender gap after was 0.26 points (p-value 0.13, not significant)
1. Change in gender gap was 1.4 points (p-value 0.002, significant).