In [None]:
import ergo
from ergo.scale import Scale
import ergo.distributions as dist

import seaborn
import matplotlib.pyplot as plt

import jax.numpy as np

import os
from dotenv import load_dotenv

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

Two convenience functions for debugging:

In [None]:
def graph(dist, bins=100):
    """Plot the pdf of `dist` with matplotlib.

    The density is evaluated at `bins` evenly spaced points between the
    distribution's 1st and 99th percentiles (`ppf(0.01)` and `ppf(0.99)`).
    """
    lower = dist.ppf(0.01)
    upper = dist.ppf(0.99)
    grid = np.linspace(lower, upper, bins)
    # Evaluate point by point, as pdf() is only called with scalars here.
    densities = [dist.pdf(point) for point in grid]
    plt.plot(grid, densities)
    
def display_params(mixture):
    """Print one summary line per component of a logistic mixture.

    Each line shows the component's location and scale (the `true_*`
    attribute first, then the plain attribute in parentheses) followed by
    the matching entry of `mixture.probs`.  A component that exposes a
    `base_dist` attribute is unwrapped before its parameters are read.
    Not especially robust, but good enough for here.
    """
    for index, component in enumerate(mixture.components):
        inner = getattr(component, "base_dist", component)
        loc_text = f"Loc: {inner.true_loc:3g} \t({inner.loc:3g})"
        scale_text = f"Scale: {inner.true_s:3g} \t({inner.s:3g})"
        prob_text = f"Prob: {mixture.probs[index]:3g}"
        print(loc_text, scale_text, prob_text, sep="\t")

In [None]:
# Load Metaculus data using Ought credentials

def get_metaculus():
    """Build an `ergo.Metaculus` client from credentials in the environment.

    Calls `load_dotenv()` and then reads METACULUS_USERNAME,
    METACULUS_PASSWORD and METACULUS_USER_ID via `os.getenv`.

    Returns:
        The `ergo.Metaculus` object constructed from the username and
        password.

    Raises:
        ValueError: if any of the three variables is unset, or if
            METACULUS_USER_ID cannot be parsed as an integer.
        AssertionError: if the client's `user_id` does not match
            METACULUS_USER_ID.
    """
    load_dotenv()
    # Keep the raw os.getenv() results: wrapping them in str() would turn a
    # missing variable into the string "None" and the check below could
    # never fire.
    uname = os.getenv("METACULUS_USERNAME")
    pwd = os.getenv("METACULUS_PASSWORD")
    user_id_str = os.getenv("METACULUS_USER_ID")
    if uname is None or pwd is None or user_id_str is None:
        raise ValueError(
            ".env is missing METACULUS_USERNAME, METACULUS_PASSWORD, or METACULUS_USER_ID"
        )
    user_id = int(user_id_str)
    metaculus = ergo.Metaculus(uname, pwd)
    # Sanity check that the account matches the configured user id.
    assert metaculus.user_id == user_id
    return metaculus

# We use the "continuous linear open question" from the test suite

# Build the Metaculus client once; `question` is reused by the cells below.
metaculus = get_metaculus()
# Fetch question 3962 through the client.
question = metaculus.get_question(3962)
# Show which question was loaded and the scale its distributions must use.
print("Question Name: ", question.name)
print("Question Scale: ", question.scale)

We make two test functions.  The first directly uses the LogisticMixture from_samples constructor.  The second uses the metaculus question's `get_submission_from_samples()` function, which:
1. normalizes the samples
2. calls the `from_samples` constructor
3. prepares the logistic mixture, including clipping and enforcing the scale to be (0,1)

In [None]:
def test_fit(lm):
    """Fit a 3-component LogisticMixture to samples of `lm` and compare them.

    Draws 5000 samples from `lm`, fits a mixture with
    `dist.LogisticMixture.from_samples`, plots the pdf of both
    distributions and prints the parameters of each.

    Returns:
        The fitted mixture.
    """
    samples = np.array([lm.sample() for _ in range(5000)])
    fit = dist.LogisticMixture.from_samples(
        samples,
        fixed_params={"num_components": 3},
        init_tries=200,
        opt_tries=3,
    )

    graph(lm)
    graph(fit)

    # graph() plots densities (dist.pdf), so the curves are labelled as pdfs.
    plt.legend(["original pdf", "fit pdf"])

    print("original parameters:")
    display_params(lm)
    print("fit parameters:")
    display_params(fit)

    return fit
    
def test_question_fit(question, lm):
    """Fit `lm` through the question's submission path and compare them.

    Draws 5000 samples from `lm`, passes them to
    `question.get_submission_from_samples`, then plots the pdf of the
    normalized original next to the returned fit and prints the parameters
    of each.

    Returns:
        The distribution returned by `get_submission_from_samples`.

    Raises:
        AssertionError: if `lm.scale` differs from `question.scale`.
    """
    assert lm.scale == question.scale

    samples = np.array([lm.sample() for _ in range(5000)])
    fit = question.get_submission_from_samples(samples)

    # Compare against the normalized form of the original distribution.
    normalized_lm = lm.normalize()

    graph(normalized_lm)
    graph(fit)

    # graph() plots densities (dist.pdf), so the curves are labelled as pdfs.
    plt.legend(["original pdf (normalized)", "fit pdf (normalized)"])

    print("original parameters:")
    display_params(normalized_lm)
    print("fit parameters:")
    display_params(fit)

    return fit

The fitting works well in most cases:

In [None]:
# Two-component mixture on the question's scale, weighted 0.8 / 0.2.
lm_1 = dist.LogisticMixture(
    components=[
        dist.Logistic(loc=400_000, s=100_000, scale=question.scale),
        dist.Logistic(loc=700_000, s=50_000, scale=question.scale),
    ],
    probs=[0.8, 0.2],
)

# Fit directly with LogisticMixture.from_samples and plot/print the comparison.
test_fit(lm_1);

It fails in certain edge cases, however -- including when there are components with very narrow distributions.

In [None]:
# Same mixture as lm_1 except that the second component's s is 5_000
# instead of 50_000, i.e. a much narrower component.
lm_2 = dist.LogisticMixture(
    components=[
        dist.Logistic(loc=400_000, s=100_000, scale=question.scale),
        dist.Logistic(loc=700_000, s=5_000, scale=question.scale),
    ],
    probs=[0.8, 0.2],
)

# Fit directly with LogisticMixture.from_samples and plot/print the comparison.
test_fit(lm_2);

I believe the specific bug above is due to the fact that the `logistic_mixture.from_params()` method clips the scale between 0.01 and 0.5 (when normalized).  I'm not sure why this is, but it at least seems like something that should be documented or raise a warning instead of happening implicitly.

# Question Interface

In [None]:
# Fit lm_1 through the question's submission path; `fit` is reused below.
fit = test_question_fit(question, lm_1)

This clearly doesn't work as well -- seemingly because of a scale issue.  As an alternative way of checking, we can take samples from the new distribution and denormalize them to compare to the original distribution:

In [None]:
# Draw 1000 samples from the fitted distribution
# (presumably on the normalized scale -- confirm against get_submission_from_samples).
normalized_samples = np.array([fit.sample() for _ in range(1000)])
# Map the samples back with the question's denormalize_samples().
denormalized_samples = question.denormalize_samples(normalized_samples)

# Overlay the original pdf with the distribution of the denormalized samples.
graph(lm_1)
# NOTE(review): seaborn.distplot is deprecated in newer seaborn releases --
# confirm the pinned seaborn version before upgrading.
seaborn.distplot(denormalized_samples)

I think this problem is due to the fact that `prepare_logistic` is passed distributions which aren't truly normalized, just made to fit normalized samples -- but then assumes that they are normalized anyway.