# A comparison of CoMIGHT against KSG and Linear Regression Based Estimates of Conditional Independence

CoMIGHT is a fully non-parametric method for i) estimating the conditional mutual information (CMI) and ii) providing a p-value that indicates the statistical significance of the estimated CMI against the null hypothesis that the CMI is 0. The p-value is obtained for a given dimensionality and sample size using permutation principles.

Another non-parametric method for estimating CMI is the kNN-based KSG estimator, which can also be used along with a permutation test to obtain a p-value.

Finally, a parametric method for estimating CMI is the linear-regression method, which computes the partial correlation among variables under the assumption that the data arise from Gaussian distributions.

In [1]:
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.special import expit
from sklearn.datasets import (
    make_blobs,
    make_classification,
    make_sparse_spd_matrix,
    make_spd_matrix,
)

from sktree import HonestForestClassifier, RandomForestClassifier, RandomForestRegressor
from sktree.datasets.multiview import make_gaussian_mixture, make_joint_factor_model
from sktree.stats import (
    FeatureImportanceForestClassifier,
    FeatureImportanceForestRegressor,
    PermutationForestRegressor,
)
from sktree.tree import DecisionTreeClassifier, MultiViewDecisionTreeClassifier

# A single fixed seed drives the notebook-level generator so that every
# simulation below can be reproduced exactly.
seed = 12345
rng = np.random.default_rng(seed=seed)

# Define Dataset Generators

## Confounder Y <- X1 -> X2

In [3]:
def make_confounder(n_samples, n_features, noise_dims, class_probs, seed):
    """Simulate the confounder setting ``Y <- X1 -> X2``.

    Two mixture components are built with the *same* center but different
    random SPD covariance matrices, and ``make_gaussian_mixture`` is asked to
    draw ``n_samples`` points from them. The two returned views are then
    stacked column-wise, second view first.

    Parameters
    ----------
    n_samples : int
        Number of samples requested from ``make_gaussian_mixture``.
    n_features : int
        Dimensionality of the shared center and of each covariance matrix.
    noise_dims : int
        Forwarded as ``noise_dims`` to ``make_gaussian_mixture``.
    class_probs : array-like
        Forwarded as ``class_probs`` to ``make_gaussian_mixture``.
    seed : int
        Seed from which every random draw in this function is derived.

    Returns
    -------
    signal_X : ndarray
        ``np.hstack((Xs[1], Xs[0]))`` of the views returned by
        ``make_gaussian_mixture``.
    y : ndarray
        Labels returned by ``make_gaussian_mixture``.

    Notes
    -----
    Earlier revisions seeded the mixture with ``seed + idx`` and built an
    unused ``n_features_ends`` list from ``n_features_2``; both names were
    undeclared globals, so the function failed with ``NameError`` unless the
    caller had defined them. The result now depends only on the arguments.
    """
    rng = np.random.default_rng(seed)

    # Both components share one center, so they differ only in covariance.
    fixed_center = rng.standard_normal(size=(n_features,))
    centers = [fixed_center, fixed_center]

    # Distinct seeds give the two components different covariance structure.
    covariances = [
        make_spd_matrix(n_dim=n_features, random_state=seed),
        make_spd_matrix(n_dim=n_features, random_state=seed + 123),
    ]

    Xs, y = make_gaussian_mixture(
        centers,
        covariances,
        n_samples=n_samples,
        noise=1.0,
        noise_dims=noise_dims,
        shuffle=True,
        class_probs=class_probs,
        random_state=seed,
    )

    # Second view first, matching the original column layout.
    signal_X = np.hstack((Xs[1], Xs[0]))

    return signal_X, y

In [None]:
def make_collider():
    """Simulate a collider-style dataset from two classification problems.

    Two independent ``make_classification`` datasets are generated with the
    same shape but different class separation and random state. Their feature
    matrices are stacked column-wise, and only the rows on which both
    datasets assign the same label are kept.

    The function takes no arguments; it reads ``n_samples``, ``n_features_1``,
    ``noise_dims``, ``seed`` and ``idx`` from the enclosing (notebook) scope.

    Returns
    -------
    signal_X : ndarray of shape (n_kept, 2 * (n_features_1 + noise_dims))
        Stacked features of the rows where both label vectors agree.
    y : ndarray of shape (n_kept,)
        The common label of each kept row.

    Notes
    -----
    Rows are selected with a boolean mask. The previous implementation
    indexed with ``np.argwhere`` and then called ``.squeeze()``, which also
    dropped the sample axis whenever exactly one row survived the filter.
    """
    # Settings shared by both datasets; only class_sep and random_state differ.
    # NOTE(review): scikit-learn requires n_informative + n_redundant +
    # n_repeated <= n_features, i.e. noise_dims must be at least 10 here.
    shared_kwargs = dict(
        n_samples=n_samples,
        n_features=n_features_1 + noise_dims,
        n_informative=n_features_1,
        n_redundant=10,
        n_repeated=0,
        n_classes=2,
        flip_y=0.02,
        shuffle=False,
    )

    signal_X_1, y1 = make_classification(
        class_sep=1.0,
        random_state=seed + idx,
        **shared_kwargs,
    )
    # NOTE(review): with idx == 0 both calls receive the same random_state;
    # confirm this is intended.
    signal_X_2, y2 = make_classification(
        class_sep=0.75,
        random_state=seed + idx + idx,
        **shared_kwargs,
    )

    # Keep only the samples whose labels coincide in both datasets.
    agree = y1 == y2
    signal_X = np.hstack((signal_X_1, signal_X_2))[agree]

    return signal_X, y2[agree]

In [None]:
def make_mediator():
    Xs, U, _ = make_joint_factor_model(
        n_views,
        n_features,
        n_samples=n_samples,
        joint_rank=joint_rank,
        noise_std=10.0,
        m=0.25,
        random_state=seed + idx,
        return_decomp=True,
    )

    signal_X = np.hstack(Xs)
    signal_X += rng.standard_normal(size=signal_X.shape)
    signal_X = np.hstack((signal_X, rng.standard_normal(size=(n_samples, noise_dims))))
    n_features_1 = signal_X.shape[1]
    print(signal_X.shape)
    y = rng.binomial(n=1, p=expit(signal_X[:, :n_features].sum(axis=1)), size=n_samples)

    U += rng.standard_normal(size=U.shape)  # * 2.0
    U = np.hstack(
        (U, rng.standard_normal(size=(signal_X.shape[0], n_features_1 - joint_rank)))
    )

    n_features_ends = [n_features_1, signal_X.shape[1]]
    # print(n_features_ends)
    print(U.shape, signal_X.shape, y.shape, [x.shape for x in Xs])