In [1]:
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sktree.stats import HyppoForestRegressor, PermutationForest

seed = 12345

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
def linear_model_ancova(sigma_factor=2.0, seed=None):
    r"""Test MIGHT using MSE from linear model simulation.

    See https://arxiv.org/pdf/1904.07830.pdf Figure 1.

    Y = Beta * X_1 + Beta * I(X_6 = 2) + \epsilon
    """
    beta = 10.0
    sigma = 10.0 / sigma_factor
    n_samples = 2200
    n_estimators = 125
    test_size = 0.1

    rng = np.random.default_rng(seed)

    # sample covariates
    X_15 = rng.uniform(0, 1, size=(n_samples, 5))
    X_610 = np.zeros((n_samples, 5))
    for idx in range(5):
        buff = np.argwhere(
            rng.multinomial(1, [1.0 / 3, 1.0 / 3, 1.0 / 3], size=n_samples)
        )[:, 1]

        X_610[:, idx] = buff

    X = np.concatenate((X_15, X_610), axis=1)
    assert X_15.shape == (n_samples, 5)
    assert X_610.shape == (n_samples, 5)
    assert X.shape == (n_samples, 10)

    # sample noise
    epsilon = rng.normal(size=n_samples, loc=0.0, scale=sigma)

    # compute final y of (n_samples,)
    y = beta * X[:, 0] + (beta * (X[:, 5] - 2)) + epsilon

    # initialize hypothesis tester
    est = PermutationForest(
        max_features=1.0,
        random_state=seed,
        n_estimators=n_estimators,
        n_jobs=-1,
        # bootstrap=True,
        # max_samples=subsample_size
    )
    pvalue_dict = {}

    # test for X_1
    stat, pvalue = est.test(X.copy(), y.copy(), [0], n_repeats=100, test_size=test_size)
    print("X1: ", pvalue)
    pvalue_dict["X1"] = pvalue
    # assert pvalue < 0.05, f"pvalue: {pvalue}"

    # test for X_6
    stat, pvalue = est.test(X.copy(), y.copy(), [5], n_repeats=100, test_size=test_size)
    print("X6: ", pvalue)
    pvalue_dict["X6"] = pvalue
    # assert pvalue < 0.05, f"pvalue: {pvalue}"

    # test for a few unimportant other X
    for name, covariate_index in zip(["X2", "X7"], [1, 6]):
        # test for X_2, X_7
        stat, pvalue = est.test(
            X.copy(), y.copy(), [covariate_index], n_repeats=100, test_size=test_size
        )
        print("X2/7: ", pvalue)
        pvalue_dict[name] = pvalue
        # assert pvalue > 0.05, f"pvalue: {pvalue}"

    return pvalue_dict


def linear_model_mars():
    pass


def correlated_logit_model():
    pass


def random_forest_model():
    pass

In [4]:
pvalue_dict = defaultdict(list)
rng = np.random.default_rng(seed)

j_space = np.linspace(0.005, 2.25, 9)

for sigma_factor in j_space:
    for idx in range(5):
        new_seed = rng.integers(0, np.iinfo(np.uint32).max, dtype=np.uint32)

        elements_dict = linear_model_ancova(sigma_factor, new_seed)
        for key, value in elements_dict.items():
            pvalue_dict[key].append(value)
        pvalue_dict["sigma_factor"].append(sigma_factor)

df = pd.DataFrame(pvalue_dict)

X1:  0.7623762376237624
X6:  0.0891089108910891
X2/7:  0.8316831683168316
X2/7:  0.8712871287128713
X1:  0.9504950495049505
X6:  1.0
X2/7:  1.0
X2/7:  1.0
X1:  0.693069306930693
X6:  0.6039603960396039
X2/7:  0.39603960396039606
X2/7:  0.594059405940594
X1:  0.9306930693069307
X6:  0.9900990099009901
X2/7:  0.9801980198019802
X2/7:  1.0
X1:  0.36633663366336633
X6:  0.039603960396039604
X2/7:  0.7623762376237624
X2/7:  0.9603960396039604
X1:  0.21782178217821782
X6:  0.43564356435643564
X2/7:  0.6237623762376238
X2/7:  0.24752475247524752
X1:  0.42574257425742573
X6:  0.7425742574257426
X2/7:  0.44554455445544555
X2/7:  0.1188118811881188
X1:  0.36633663366336633
X6:  0.0297029702970297
X2/7:  0.13861386138613863
X2/7:  0.1485148514851485
X1:  0.9405940594059405
X6:  1.0
X2/7:  0.9207920792079208
X2/7:  1.0
X1:  0.504950495049505
X6:  0.009900990099009901
X2/7:  0.5742574257425742
X2/7:  1.0
X1:  0.7128712871287128
X6:  0.2079207920792079
X2/7:  0.9306930693069307
X2/7:  1.0
X1:  0.267

In [5]:
print("done")

done
