In [1]:
import math
import random
import typing

from nbmetalog import nbmetalog as nbm
import numpy as np
import pandas as pd
from scipy import special
from scipy import stats
import sympy

# seed the stdlib RNG so the simulated experiments are repeatable across runs
random.seed(1)


In [2]:
# print execution context and package versions for provenance
nbm.print_metadata()


context: ci
hostname: 94e59296655e
interpreter: 3.8.10 (default, May 26 2023, 14:05:08)  [GCC 9.4.0]
nbcellexec: 2
nbname: maximum_likelihood_median_unbiased_popsize_estimator
nbpath: /opt/hereditary-stratigraph-concept/binder/popsize/maximum_likelihood_median_unbiased_popsize_estimator.ipynb
revision: null
session: ae340cc9-d3f5-4ff1-a38b-a0e35c636530
timestamp: 2023-09-24T01:03:08Z00:00


IPython==7.16.1
keyname==0.4.1
yaml==5.3.1
nbmetalog==0.2.6
numpy==1.21.5
pandas==2.0.3
scipy==1.5.4
sympy==1.5.1
re==2.2.1
ipython_genutils==0.2.0
logging==0.5.1.2
zmq==22.3.0
json==2.0.9
ipykernel==5.5.3


# Goal

Suppose the scenario described in [gene_drive_scenario.ipynb](gene_drive_scenario.ipynb).

Here, we will derive an estimator for population size $\hat{n}_\mathrm{mumle}$ that is median-unbiased.
That is, half of estimates will be overestimates and half of estimates will be underestimates.


# Strategy

Use the confidence intervals derived in [maximum_likelihood_popsize_estimator_confidence_interval.ipynb](maximum_likelihood_popsize_estimator_confidence_interval.ipynb), setting confidence to zero.


# Calculating from CI Lower Bound

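Beginning from the derived expression for the confidence interval lower bound $n_\mathrm{lb}$ at confidence level $c$, where $k$ is the number of observations and $\hat{x}$ is their product, set confidence to zero ($c = 0$),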

$\begin{align*}
0
&= 2\Gamma(k, -n_\mathrm{lb}\log(\hat{x})) - (c+1)\Gamma(k)\\
&= 2\Gamma(k, -\hat{n}_\mathrm{mumle}\log(\hat{x})) - \Gamma(k).
\end{align*}$

Beyond this point, we must solve numerically.


In [3]:
def solve_mumle(
    observations: typing.List[float],
) -> float:
    """Compute the median-unbiased population size estimate.

    Solves `2 * Gamma(k, -n * log(hat_x)) == Gamma(k)` for `n`, where `k` is
    the number of observations, `hat_x` is their product, and `Gamma(k, y)`
    is the upper incomplete gamma function.

    Dividing through by `Gamma(k)` turns the condition into
    `Q(k, y) == 0.5` with `y = -n * log(hat_x)` and `Q` the regularized
    upper incomplete gamma function, so `y` is obtained directly from the
    inverse function `scipy.special.gammainccinv` instead of from an
    unverified iterative root search.

    Parameters
    ----------
    observations : list of float
        Observed fixed gene magnitudes. Each must be strictly positive and
        their product must be less than one.

    Returns
    -------
    float
        The estimate `hat_n_mumle`.

    Raises
    ------
    ValueError
        If `observations` is empty, contains a value that is not strictly
        positive, or has a product that is not less than one (in which case
        no positive solution exists).
    """

    k = len(observations)
    if k == 0:
        raise ValueError("at least one observation is required")
    if any(not x > 0 for x in observations):
        raise ValueError("observations must be strictly positive")

    # accumulate log(hat_x) as a sum of logs: the raw product underflows to
    # 0.0 for long observation lists, which would make its log undefined
    log_hat_x = math.fsum(math.log(x) for x in observations)
    if not log_hat_x < 0:
        raise ValueError("product of observations must be less than one")

    # y such that the regularized upper incomplete gamma Q(k, y) equals 1/2
    y_median = special.gammainccinv(k, 0.5)

    return float(-y_median / log_hat_x)


# Calculating from CI Upper Bound

As a sanity check, we should arrive at the same result when deriving $\hat{n}_\mathrm{mumle}$ from the upper confidence bound,

$\begin{align*}
0
&= 2\Gamma(k, -n_\mathrm{ub}\log(\hat{x})) - (1-c)\Gamma(k)\\
&\stackrel{\checkmark}{=} 2\Gamma(k, -\hat{n}_\mathrm{mumle}\log(\hat{x})) - \Gamma(k).
\end{align*}$


# Simulated Experiments

Test whether the estimator is median-unbiased.


In [4]:
def sample_observations(true_popsize: int, num_observations: int) -> typing.List[float]:
    """Draw `num_observations` simulated observations, each being the largest
    of `true_popsize` uniform random gene values from `random.random()`."""

    largest_genes = []
    for _observation in range(num_observations):
        # one population's worth of gene values; only the largest is kept
        gene_values = [random.random() for _member in range(true_popsize)]
        largest_genes.append(max(gene_values))

    return largest_genes

def sample_popsize_estimate(true_popsize: int, num_observations: int) -> float:
    """Simulate `num_observations` largest-gene samples from a population of
    `true_popsize` and return the `solve_mumle` estimate computed from them."""

    simulated_observations = sample_observations(true_popsize, num_observations)
    return solve_mumle(simulated_observations)


In [5]:
# simulate gene drive within populations and then subsequent estimates of population size from magnitude of fixed genes
# For every (number of observations, true population size) combination, run 50
# replicate simulations and summarize how the resulting estimates compare
# against the true population size.
records = []
for num_observations in (1, 10, 100):
    for true_popsize in (10, 100):
        sampled_estimates = []
        for replicate in range(50):
            sampled_estimates.append(
                sample_popsize_estimate(true_popsize, num_observations)
            )

        # count of replicates whose estimate exceeded the true population size
        num_overestimates = sum(
            1 for est in sampled_estimates if est > true_popsize
        )

        absolute_errors = [abs(est - true_popsize) for est in sampled_estimates]
        mean_normalized_error = np.mean(absolute_errors) / true_popsize

        # binomial CDF at the observed overestimate count, with success
        # probability 0.5 per replicate
        p_as_many_overestimates = stats.binom.cdf(
            num_overestimates, len(sampled_estimates), 0.5
        )

        records.append({
            'Num Observations' : num_observations,
            'True Population Size' : true_popsize,
            'Mean Normalized Error' : mean_normalized_error,
            'Median Median-Unbiased Estimate' : np.median(sampled_estimates),
            'Num Overestimates' : num_overestimates,
            'p As Many Overestimates' : p_as_many_overestimates,
        })

res_df = pd.DataFrame.from_records(records)
res_df.round(5)


Unnamed: 0,Num Observations,True Population Size,Mean Normalized Error,Median Median-Unbiased Estimate,Num Overestimates,p As Many Overestimates
0,1,10,178.35206,6.8074,21,0.16112
1,1,100,6.43428,121.7442,27,0.76006
2,10,10,0.28677,10.47776,26,0.66409
3,10,100,0.2612,92.25523,20,0.10132
4,100,10,0.08589,9.9976,25,0.55614
5,100,100,0.07632,100.16721,27,0.76006


# Result

We have derived the median-unbiased estimator for population size $n$ given $k$ independent observations of fixed gene magnitude $\hat{x}_1, \hat{x}_2, \ldots, \hat{x}_k$, based on the maximum-likelihood estimator, as the solution $\hat{n}_\mathrm{mumle}$ to

$\begin{align*}
0
&= 2\Gamma(k, -\hat{n}_\mathrm{mumle}\log(\prod_{i=1}^k\hat{x}_i)) - \Gamma(k).
\end{align*}$

Simulated experiments support the validity of this result.
