In [32]:
import math
from nbmetalog import nbmetalog as nbm
import numpy as np
import pandas as pd
import random
from scipy import stats
import sympy
import typing

random.seed(1)


In [2]:
# Record execution provenance (host, interpreter, notebook path, package
# versions) in the cell output so that results can be traced to an environment.
nbm.print_metadata()


context: local
hostname: thinkpad
interpreter: 3.8.10 (default, Nov 26 2021, 20:14:08)  [GCC 9.3.0]
nbcellexec: 2
nbname: mildest_extrema_popsize_estimator
nbpath: /home/mmore500/2022-01-29/hereditary-stratigraph-concept/binder/popsize/mildest_extrema_popsize_estimator.ipynb
revision: null
session: 03c16371-de5a-4eb0-a87b-583e063f3a07
timestamp: 2022-02-08T11:57:53Z00:00


IPython==7.13.0
keyname==0.4.1
yaml==5.3.1
nbmetalog==0.2.5
re==2.2.1
ipython_genutils==0.2.0
logging==0.5.1.2
zmq==18.1.1
json==2.0.9
six==1.16.0
ipykernel==5.2.0


# Goal

Derive a median-unbiased estimator $\hat{n}_\mathrm{mumee}$ that predicts the size of the population that generated $k$ independent fixed-gene magnitudes, based solely on the least extreme (i.e., smallest) fixed-gene magnitude observed.


# Strategy


Use the confidence intervals derived in [mildest_extrema_popsize_estimator_confidence_interval.ipynb](mildest_extrema_popsize_estimator_confidence_interval.ipynb), setting confidence to zero.


# Calculating from CI Lower Bound

Beginning from the derived expression for the confidence interval lower bound given $k$ observations and a mildest extreme observation $\hat{x}$, setting confidence to zero,

$\begin{align*}
\hat{n}_\mathrm{lb} 
& =\frac{
    \log \Big( - \big( \frac{c+1}{2} \big)^{1/k} + 1 \Big)
}{\log( \hat{x} )}\\
\hat{n}_\mathrm{mumee}
& =\frac{
    \log \Big( - \big( \frac{0+1}{2} \big)^{1/k} + 1 \Big)
}{\log( \hat{x} )}\\
& =\frac{
    \log \Big( - \big( \frac{1}{2} \big)^{1/k} + 1 \Big)
}{\log( \hat{x} )}.
\end{align*}$


# Calculating from CI Upper Bound

As a sanity check, we should arrive at the same result when deriving $\hat{n}_\mathrm{mumee}$ from the upper confidence bound,

$\begin{align*}
\hat{n}_\mathrm{ub} 
& =\frac{
    \log \Big( - \big( \frac{1-c}{2} \big)^{1/k} + 1 \Big)
}{\log( \hat{x} )}\\
\hat{n}_\mathrm{mumee}
& =\frac{
    \log \Big( - \big( \frac{1-0}{2} \big)^{1/k} + 1 \Big)
}{\log( \hat{x} )}\\
& \stackrel{\checkmark}{=}\frac{
    \log \Big( - \big( \frac{1}{2} \big)^{1/k} + 1 \Big)
}{\log( \hat{x} )}.
\end{align*}$


# Simulated Experiments

Test whether the estimator is median-unbiased, i.e., whether it overestimates the true population size in about half of all simulated trials.


In [26]:
def sample_observations(true_popsize: int, num_observations: int) -> typing.List[float]:
    """Draw `num_observations` independent largest-gene samples.

    Each sample is the maximum of `true_popsize` values drawn with
    `random.random()`, standing in for the gene that reaches fixation within a
    population of that size. Samples are returned in the order generated.
    """
    observations = []
    for _replicate in range(num_observations):
        # one fresh population per observation; only its largest gene is kept
        population = (random.random() for _member in range(true_popsize))
        observations.append(max(population))
    return observations

def estimate_popsize(observations: typing.List[float]) -> float:
    """Estimate population size from the mildest extreme among `observations`.

    Applies the median-unbiased mildest-extrema estimator (not a maximum
    likelihood estimator),

        n_hat = log(1 - (1/2)**(1/k)) / log(min(observations)),

    where `k` is the number of observations.

    Parameters
    ----------
    observations : list of float
        Fixed-gene magnitudes; the formula is meaningful for values strictly
        between 0 and 1.

    Returns
    -------
    float
        Estimated size of the population that generated each observation.

    Raises
    ------
    ValueError
        If `observations` is empty, or if the smallest observation is not
        positive (outside the domain of `math.log`).
    ZeroDivisionError
        If the smallest observation equals 1.0, whose logarithm is zero.
    """
    k = len(observations)
    if k == 0:
        raise ValueError(
            "cannot estimate population size from zero observations"
        )
    least_extreme = min(observations)
    # 1 - 0.5**(1/k) is rewritten as -expm1(log(0.5) / k): for large k the
    # direct form subtracts two nearly equal numbers (and eventually rounds to
    # exactly 0, making log() fail), whereas expm1 keeps full precision.
    log_numerator = math.log(-math.expm1(math.log(0.5) / k))
    return log_numerator / math.log(least_extreme)

def sample_popsize_estimate(true_popsize: int, num_observations: int) -> float:
    """Simulate one experiment and return its population size estimate.

    Draws `num_observations` largest-gene samples from populations of size
    `true_popsize` via `sample_observations`, then passes them to
    `estimate_popsize` to recover an estimate of `true_popsize`.
    """
    observations = sample_observations(true_popsize, num_observations)
    return estimate_popsize(observations)


In [34]:
# Monte Carlo check of the estimator: for each combination of observation
# count and true population size, simulate gene drive within populations 500
# times and estimate the population size from the fixed gene magnitudes.
records = []
for num_observations in (1, 2, 4, 8, 10, 20, 100, 423):
    for true_popsize in (10, 1000):
        sampled_estimates = [
            sample_popsize_estimate(true_popsize, num_observations)
            for __ in range(500)
        ]

        # tally how many replicate estimates exceed the true population size
        num_overestimates = len(
            [est for est in sampled_estimates if est > true_popsize]
        )

        records.append({
            'Num Observations': num_observations,
            'True Population Size': true_popsize,
            # mean absolute error as a fraction of the true population size
            'Mean Normalized Error': (
                np.abs(np.subtract(sampled_estimates, true_popsize)).mean()
                / true_popsize
            ),
            'Median Median-Unbiased Estimate': np.median(sampled_estimates),
            'Num Overestimates': num_overestimates,
            # lower-tail binomial probability of observing at most this many
            # overestimates when each replicate overestimates with chance 0.5
            'p As Many Overestimates': stats.binom.cdf(
                num_overestimates, len(sampled_estimates), 0.5,
            ),
        })

res_df = pd.DataFrame.from_records(records)
res_df.round(5)


Unnamed: 0,Num Observations,True Population Size,Mean Normalized Error,Median Median-Unbiased Estimate,Num Overestimates,p As Many Overestimates
0,1,10,5.90238,9.62942,246,0.37714
1,1,1000,5.53942,987.99298,246,0.37714
2,2,10,1.08372,9.86799,245,0.34368
3,2,1000,0.93212,1021.0434,258,0.77644
4,4,10,0.57252,9.18518,224,0.01123
5,4,1000,0.5953,1006.18413,256,0.71949
6,8,10,0.34763,9.85764,244,0.3114
7,8,1000,0.40793,1047.47632,271,0.97282
8,10,10,0.349,10.06507,255,0.6886
9,10,1000,0.35179,992.6164,247,0.41155


# Result

For $k$ observations of fixed gene magnitude $\hat{x}_i$, we have derived a median-unbiased estimator of population size $\hat{n}_\mathrm{mumee}$ based on the mildest extreme observation $\min_i \hat{x}_i$ as

$\begin{align*}
\hat{n}_\mathrm{mumee} 
& =\frac{
    \log \Big( - \big( \frac{1}{2} \big)^{1/k} + 1 \Big)
}{\log \big( \min_{1 \leq i \leq k} \hat{x}_i \big)}.
\end{align*}$

Numerical simulations support this result.
