## Parameter estimator algorithm

**Note:** I accidentally deleted the algorithm before it was committed, so this notebook might not run properly.  
At some point in the future I may invest some time in restoring it.

In [None]:
from py_scripts.normal_parameter_estimator import ParameterEstimatorRaw, ParameterEstimatorSummary
from scipy.stats import norm
import numpy as np

# NOTE(review): `s` is not referenced anywhere else in the visible notebook —
# confirm whether it is still needed.
s = 101
# Seed NumPy's global RNG so the sample drawn below is reproducible.
np.random.seed(10)
# Draw 100 values from a normal distribution with mean 10 and standard deviation 3.
sample = norm(10, 3).rvs(100)
# Run the raw-data estimator on the sample (its implementation lives in
# py_scripts.normal_parameter_estimator and is not shown in this notebook).
ParameterEstimatorRaw(sample).run()

In [None]:
# Run the second estimator on the `sample` array created in the previous cell.
# NOTE(review): the class name suggests it works from summary statistics, yet it
# is handed the raw sample here — confirm the expected input.
ParameterEstimatorSummary(sample).run()


## Test that the assumption for summary statistics is right

In [None]:
from scipy.stats import gaussian_kde
import seaborn as sns

# Population parameters and the size of each simulated sample.
mu, sigma, n = 42, 17, 20
# Simulate 1000 samples (rows), each made of n draws (columns).
sample_test = norm(mu, sigma).rvs((1000, n))

# Per-sample mean and standard deviation (one value per row).
# NOTE(review): `sts` is not used again in this cell.
stm = sample_test.mean(axis=1)
sts = sample_test.std(axis=1)

# Evaluation grid: 101 points spanning mu +/- 1.5 * sigma.
low, high = mu - sigma * 1.5, mu + sigma * 1.5
pdf_range = np.linspace(low, high, 101)

# Kernel density estimate of the simulated sample means, and the density of
# Normal(mu, sigma / sqrt(n)) that it is being compared against.
stm_distribution = gaussian_kde(stm).pdf(pdf_range)
algorithm_mean = norm(mu, sigma / np.sqrt(n)).pdf(pdf_range)

# Overlay both curves; the trailing semicolons suppress the notebook's text output.
sns.lineplot(x=pdf_range, y=stm_distribution, label="KDE of sample test");
sns.lineplot(x=pdf_range, y=algorithm_mean, label="algorithm pdf");


## How to compute the distribution of a difference

In [None]:
def distribution_difference(a, b, p_a=None, p_b=None):
    """
    Compute the distribution of the difference b-a.

    Every pair (a_i, b_j) contributes its joint probability to the value
    b_j - a_i; pairs that yield the same difference are added together.

    Parameters:
        a, b: 1d array-likes holding the possible values of each quantity.
        p_a, p_b: optional 1d array-likes with the probability (or any
            non-negative weight) of each entry of `a` / `b`. When omitted,
            every value is treated as equally likely.
    Returns:
        A pandas DataFrame indexed by the diffs ("diff") with a single column
        "joint" holding the probability of each diff (the column sums to 1).
    Raises:
        ValueError: if a weight array does not have one entry per value.
    """
    # Imported locally: the notebook only imports pandas in a later cell, so
    # relying on a global `pd` fails when the cells are run in order.
    import pandas as pd

    a = np.asarray(a)
    b = np.asarray(b)
    p_a = np.ones(a.size) if p_a is None else np.asarray(p_a, dtype=float)
    p_b = np.ones(b.size) if p_b is None else np.asarray(p_b, dtype=float)
    if p_a.size != a.size or p_b.size != b.size:
        raise ValueError("p_a and p_b must have one entry per value in a and b")

    # get the cartesian grid of pairs (values and their probabilities)
    vx, vy = np.meshgrid(b, a)
    px, py = np.meshgrid(p_b, p_a)

    # Get the joint probability of each pair and normalise it. The joint is the
    # product of the *probabilities*, not of the values themselves.
    joint = (px * py).ravel()
    joint = joint / joint.sum()

    # Get the difference between pairs
    difference = (vx - vy).ravel()

    # Assemble a pandas dataframe to be able to group by the difference. We then can add
    # up the probabilities as they contribute to the difference value.
    result = pd.DataFrame(
        {"diff": difference, "joint": joint}
    )
    return result.groupby('diff').sum()

# Small worked example: a takes the values 10..14 and b the values 7..8.
a, b = np.arange(10, 15), np.arange(7, 9)

# Display the resulting table of differences b - a in the notebook.
distribution_difference(a, b)

## Cranky science exercise

In [None]:
from scipy.stats import norm, chi2
import numpy as np
import pandas as pd


class GenerateComparisonDistribution:
    """
    Grid-based estimate of the ratio sigma / mu from summary statistics.

    Given a sample mean `m`, a sample standard deviation `s` and a sample size
    `n`, a grid of candidate (mu, sigma) pairs is built around (m, s), a
    posterior is computed on that grid starting from a flat prior, and `run`
    returns the expected value of sigma / mu under the marginal posteriors.
    """

    # Number of grid points used for each of the two parameters.
    hs_size = 100

    def __init__(self, m, s, n):
        """
        Parameters:
            m: observed sample mean.
            s: observed sample standard deviation.
            n: sample size.
        """
        self.m = m
        self.s = s
        self.n = n
        self.mu_range, self.sigma_range = self.compute_ranges()
        self.cartesian_product = self.compute_cartesian_product()

    def compute_ranges(self, factor=.1):
        """
        Get the ranges for the pdfs.

        Parameters:
            factor: half-width of the interval placed around `m` and around `s`.
        Returns:
            (mu_range, sigma_range), each an array of `hs_size` evenly spaced points.
        """
        # NOTE(review): if s <= factor the sigma grid reaches non-positive
        # values, which are not valid scales — confirm inputs stay above it.
        sigma_low = self.s - factor
        sigma_high = self.s + factor
        sigma_range = np.linspace(sigma_low, sigma_high, self.hs_size)

        # Compute mu range
        mu_range = np.linspace(self.m - factor, self.m + factor, self.hs_size)
        return mu_range, sigma_range

    def compute_cartesian_product(self):
        """Return the (mu, sigma) grids; rows follow sigma, columns follow mu."""
        return np.meshgrid(self.mu_range, self.sigma_range)

    def get_clean_prior(self):
        """Return a flat prior over the grid that sums to 1."""
        return np.ones((self.hs_size, self.hs_size)) / self.hs_size**2

    def compute_likes_m(self):
        """Density of the observed mean under Normal(mu, sigma / sqrt(n)) per grid cell."""
        mm, ss = self.cartesian_product
        return norm(mm, ss/np.sqrt(self.n)).pdf(self.m)

    def compute_likes_s(self):
        """Density of n * (s / sigma)**2 under a chi-squared with n - 1 degrees of freedom."""
        _, ss = self.cartesian_product
        t = self.n * (self.s / ss)**2
        return chi2(self.n - 1).pdf(t)

    def compute_posterior(self):
        """Return the unnormalised posterior: prior times both likelihoods."""
        return self.get_clean_prior() * self.compute_likes_m() * self.compute_likes_s()

    @staticmethod
    def div_dist(marginal_m, marginal_s, mu_values=None, sigma_values=None):
        """
        Compute the distribution of the ratio sigma / mu.

        Parameters:
            marginal_m, marginal_s: 1d probabilities (or unnormalised weights)
                of each mu / sigma grid point.
            mu_values, sigma_values: the mu / sigma values those probabilities
                belong to. When omitted, the marginals themselves are used as
                the values; this legacy behaviour is kept only for backward
                compatibility and yields a ratio of probabilities.
        Returns:
            A pandas Series indexed by the ratio ("div") whose values are the
            normalised probability of each ratio.
        Raises:
            ValueError: if the values and marginals differ in length.
        """
        if mu_values is None:
            mu_values = marginal_m
        if sigma_values is None:
            sigma_values = marginal_s

        prob_m, prob_s = np.meshgrid(marginal_m, marginal_s)
        val_m, val_s = np.meshgrid(mu_values, sigma_values)
        if val_m.shape != prob_m.shape:
            raise ValueError("values and marginals must have matching lengths")

        # get joint probabilities and normalise them
        joint = (prob_m * prob_s).ravel()
        joint = joint / joint.sum()

        # get the division between the parameter values (not their probabilities)
        div = (val_s / val_m).ravel()

        df = pd.DataFrame({
            "div": div,
            "p": joint,
        })
        return df.groupby("div").p.sum()

    def run(self):
        """Return the expected value of sigma / mu under the marginal posteriors."""
        posterior = self.compute_posterior()
        # Rows follow sigma and columns follow mu, so summing over axis 0
        # leaves the mu marginal and summing over axis 1 the sigma marginal.
        marginal_m = posterior.sum(axis=0)
        marginal_s = posterior.sum(axis=1)
        div = self.div_dist(
            marginal_m,
            marginal_s,
            mu_values=self.mu_range,
            sigma_values=self.sigma_range,
        )
        return np.sum(np.asarray(div.index) * div.values)

# Summary statistics (mean, standard deviation, sample size) for each group.
male_m, male_s, male_n = 178, 8.27, 154_407
female_m, female_s, female_n = 163, 7.75, 254_772

# Run the estimator once per group.
male_estimation = GenerateComparisonDistribution(male_m, male_s, male_n).run()
female_estimation = GenerateComparisonDistribution(female_m, female_s, female_n).run()

# Display both estimates side by side in the notebook.
male_estimation, female_estimation