In [None]:
import math
from nbmetalog import nbmetalog as nbm
import numpy as np
import pandas as pd
import random
from scipy import stats
import typing

# Seed Python's stdlib RNG so the simulated experiments further down reproduce on a fresh Restart & Run All.
random.seed(1)


In [None]:
# Print notebook metadata via nbmetalog (presumably run context for provenance -- TODO confirm against nbmetalog docs).
nbm.print_metadata()


# Goal

As a preliminary exercise, derive a confidence interval for estimation of population size $n$ from a single observation of fixed gene magnitude $x$.


# Strategy

Recall from [gene_drive_scenario.ipynb](gene_drive_scenario.ipynb), we have the probability density function for fixed gene magnitude given $n$ as,

$$p(x) = nx^{n-1}.$$

In order to achieve a $c\%$ confidence interval, we must capture all $n$ under which the observed $\hat{x}$ falls within the inner $c\%$ of probability mass.
In other words, for all $n$ *outside* the confidence interval, less than $(1-c)\%$ of probability mass should be associated with outcomes as or more extreme than the one observed.
(Under such $n$, an outcome as extreme as the one observed would be generated with less than $(1-c)\%$ probability.)

We will take an even-tailed approach, so each tail of as-or-more-extreme outcomes for $n$ rejected from the confidence interval should contain no more than $(1-c)/2\%$ probability mass.
(In the formulas that follow, $c$ is treated as a fraction, e.g., $c = 0.95$ for 95% confidence.)


# Lower Bound

As the lower bound for our confidence interval estimating $n$, we will find the $n_\mathrm{lb}$ where integrating the upper tail of probability density beyond $\hat{x}$ contains only $(1-c)/2\%$ probability mass.

We find,

$\begin{align*}
(1-c)/2
&= \int_{\hat{x}}^1 n_\mathrm{lb}x^{n_\mathrm{lb}-1} \, \mathrm{d}x\\
&= x^{n_\mathrm{lb}} \Big|_{\hat{x}}^1 \\
&= 1^{n_\mathrm{lb}} - \hat{x}^{n_\mathrm{lb}}\\
&= 1 - \hat{x}^{n_\mathrm{lb}}\\
(-1-c)/2
&= - \hat{x}^{n_\mathrm{lb}}\\
(c+1)/2
&= \hat{x}^{n_\mathrm{lb}}\\
n_\mathrm{lb}
&= \log_{\hat{x}}\Big( \frac{1+c}{2} \Big)\\
&= \frac{\log \Big( \frac{1+c}{2} \Big)}{\log\hat{x}}.
\end{align*}$


# Upper Bound

As the upper bound for our confidence interval estimating $n$, we will find the $n_\mathrm{ub}$ where integrating the lower tail of probability density below $\hat{x}$ contains only $(1-c)/2\%$ probability mass.

We find,

$\begin{align*}
(1-c)/2
&= \int_0^{\hat{x}} n_\mathrm{ub}x^{n_\mathrm{ub}-1} \, \mathrm{d}x\\
&= x^{n_\mathrm{ub}} \Big|_0^{\hat{x}} \\
&= \hat{x}^{n_\mathrm{ub}} - 0^{n_\mathrm{ub}}\\
&= \hat{x}^{n_\mathrm{ub}} - 0\\
(1-c)/2
&= \hat{x}^{n_\mathrm{ub}}\\
n_\mathrm{ub}
&= \log_{\hat{x}}\Big( \frac{1-c}{2} \Big)\\
&= \frac{\log \Big( \frac{1-c}{2} \Big)}{\log\hat{x}}.
\end{align*}$


# Simulated Experiments


In [None]:
def sample_observation(
    true_popsize: int,
) -> float:
    """Draw one simulated fixed-gene magnitude for a population.

    Makes `true_popsize` independent `random.random()` draws (one per
    population member) and returns the largest of them.
    """
    gene_magnitudes = [random.random() for __ in range(true_popsize)]
    return max(gene_magnitudes)

def calculate_popsize_confidence_interval(
    observation: float,
    confidence: float,
) -> typing.Tuple[float, float]:
    r"""Calculate confidence interval for true population size $n$
    given observed fixed gene magnitude $\hat{x}$.

    Applies the even-tailed bounds
    $n_\mathrm{lb} = \log((1+c)/2) / \log\hat{x}$ and
    $n_\mathrm{ub} = \log((1-c)/2) / \log\hat{x}$.

    Parameters
    ----------
    observation : float
        Observed fixed gene magnitude $\hat{x}$. Must be positive and not
        equal to 1 for the logarithm and division to be defined.
    confidence : float
        Confidence level $c$ expressed as a fraction (e.g., 0.95).

    Returns
    -------
    tuple of (float, float)
        `(lb, ub)`, the lower and upper bounds on population size.

    Raises
    ------
    ValueError
        Propagated from `math.log` when `observation <= 0`,
        `confidence >= 1`, or `confidence <= -1`.
    ZeroDivisionError
        When `observation == 1`, because `log(1) == 0`.
    """
    # NOTE: the docstring is a raw string so that `\hat` and `\mathrm` are not
    # parsed as (invalid) escape sequences.

    # both bounds share the same denominator, so evaluate it once
    log_observation = math.log(observation)

    lb = math.log((1.0 + confidence) / 2.0) / log_observation
    ub = math.log((1.0 - confidence) / 2.0) / log_observation

    return (lb, ub,)

def sample_popsize_confidence_interval(
    true_popsize: int,
    confidence: float,
) -> typing.Tuple[float, float]:
    """Simulate one observation and estimate population size from it.

    Samples the largest gene from a population of `true_popsize`, then
    returns the confidence interval for population size, at level
    `confidence`, computed from that single sampled value.
    """
    observed_magnitude = sample_observation(true_popsize)
    return calculate_popsize_confidence_interval(observed_magnitude, confidence)


In [None]:
# Simulate gene drive within populations, then estimate population size from
# the magnitude of the fixed gene and tally how often the resulting confidence
# interval contains the true population size.
TRUE_POPSIZES = (10, 1000)
CONFIDENCE_LEVELS = (0.8, 0.95, 0.99)
NUM_REPLICATES = 1000  # simulated observations per (popsize, confidence) pair

records = []
for true_popsize in TRUE_POPSIZES:
    for confidence in CONFIDENCE_LEVELS:
        sampled_cis = [
            sample_popsize_confidence_interval(
                true_popsize,
                confidence,
            )
            for __ in range(NUM_REPLICATES)
        ]

        # chained comparison already yields a bool, which sums as 0/1
        num_confidence_intervals_containing_true_popsize = sum(
            lb_ <= true_popsize <= ub_
            for lb_, ub_ in sampled_cis
        )

        records.append({
            'True Population Size' : true_popsize,
            'Confidence' : confidence,
            'Mean Normalized Confidence Interval width' : np.mean([ub_ - lb_ for lb_, ub_ in sampled_cis]) / true_popsize,
            'Fraction Estimates within Confidence Interval'
                : num_confidence_intervals_containing_true_popsize / len(sampled_cis),
            # binomial CDF: probability of this many or fewer intervals
            # containing the true population size if the per-trial coverage
            # probability equals `confidence`
            'p As Many Estimates Outside Confidence Interval'
                : stats.binom.cdf(num_confidence_intervals_containing_true_popsize, len(sampled_cis), confidence),
        })

res_df = pd.DataFrame.from_records(records)
res_df.round(5)


Simulated experiments show the expected fractions of estimates falling within calculated confidence intervals.


# Result

For a single observation of fixed gene magnitude $\hat{x}$, the population size $n$ can be estimated with $c\%$ confidence to fall within the interval

$\begin{align*}
\Big(
\frac{\log \Big( \frac{1+c}{2} \Big)}{\log\hat{x}},
\frac{\log \Big( \frac{1-c}{2} \Big)}{\log\hat{x}}
\Big).
\end{align*}$

Simulated experiments support the validity of this result.
