# Generate Datasets for Figure 2 (Co)MIGHT

Here, we would like to generate the following datasets that contain the tuple (X1, X2, y), where X1, X2 and y have different structural and functional relationships. The structural relationships are there to help determine what relationships each variable set has with respect to another. The functional relationships determine what sort of functional dependencies each has on another. One can interpret these graphical model relationships "causally", but for our purposes, we simply utilize their probabilistic constraints. 

In this figure, we will specifically address the questions:

- Can we detect when X2 has additional information about y conditioned on X1?
- Can we capture the signal X2 provides about y conditioned on X1?

1. (X1 -> y <- X2): This demonstrates that X1 and X2 both provide information on y, but are themselves independent. The arrowhead edge can have different functional relationships. In this case, we will use X1 -> y as "linear" and X2 -> y as "log", meaning X1 has a linear relationship with y while X2 has a logarithmic (nonlinear) one.

2. (X1 -> y <- X2; X1 -> X2): This means X1 and X2 both provide information on y, but the information provided by X1 is both direct and mediated by X2. Here, the relationships of X1 and X2 with y are "linear", while X2 is a noisy nonlinear transformation of X1. X2 is conditionally dependent on y given X1.

3. (X2 <- X1 -> y): Each relationship is linear. In this case, X2 is conditionally independent of y given X1.

4. (X1 -> y; X2): Each relationship is linear. In this case, again X2 is conditionally independent of y given X1.

Here, we present different ways that X1, X2 and y are structurally related, such that the null hypothesis of `X2 \perp y | X1` is tested. We also change the relationship of X2 in various settings. However, the important part is generating data with different conditional independence relationships with respect to y.

In [1]:
# IPython magics: reload imported modules before each cell executes, so edits
# to local packages are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import math
import os
from collections import defaultdict
from itertools import product

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from hyppo.conditional import ConditionalDcorr
from joblib import Parallel, delayed
from scipy.special import expit
from scipy.stats import ortho_group
from sklearn.datasets import (
    make_blobs,
    make_classification,
    make_sparse_spd_matrix,
    make_spd_matrix,
)
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import (
    StratifiedKFold,
    StratifiedShuffleSplit,
    cross_val_score,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sktree import HonestForestClassifier, RandomForestClassifier, RandomForestRegressor
from sktree.datasets.multiview import make_gaussian_mixture, make_joint_factor_model
from sktree.stats import (
    FeatureImportanceForestClassifier,
    FeatureImportanceForestRegressor,
    PermutationForestRegressor,
    PermutationTest,
)
from sktree.stats.utils import (
    METRIC_FUNCTIONS,
    POSITIVE_METRICS,
    POSTERIOR_FUNCTIONS,
    REGRESSOR_METRICS,
    _compute_null_distribution_coleman,
    _non_nan_samples,
)
from sktree.tree import DecisionTreeClassifier, MultiViewDecisionTreeClassifier

# Base seed for the notebook; the dataset generators below derive per-repeat
# seeds from it as `seed * idx`.
seed = 12345
# NOTE(review): this module-level generator is shadowed by a local `rng` in
# every function visible in this notebook, and the `np.random.*` helpers do not
# use it either, so it does not appear to influence any generated data.
rng = np.random.default_rng(seed)

In [3]:
def linear(n, p, noise=False, coeffs=None):
    """Simulate a per-dimension linear relationship ``y = coeffs * x``.

    Parameters
    ----------
    n : int
        Number of samples (rows).
    p : int
        Number of dimensions (columns).
    noise : bool or float, default=False
        Multiplier applied to the standard-normal noise term; ``False``
        yields a noiseless ``y``.
    coeffs : array-like of shape (p,), optional
        Per-dimension weights. Defaults to ``exp(-0.0022 * i)`` for
        dimension ``i``.

    Returns
    -------
    x : ndarray of shape (n, p)
        Standard-normal draws from the global ``np.random`` state.
    y : ndarray of shape (n, p)
        ``x * coeffs + noise * eps``.
    """
    shape = (n, p)
    # The signal is drawn before the noise; keep this order so that runs
    # seeded through ``np.random.seed`` stay comparable.
    x = np.random.normal(size=shape)
    eps = np.random.normal(size=shape)

    weights = coeffs
    if weights is None:
        weights = np.array([np.exp(-0.0022 * dim) for dim in range(p)])

    return x, x * weights + noise * eps


def exponential(n, p, noise=False, coeffs=None):
    """Simulate ``y = exp(coeffs * x) - 1`` with optional additive noise.

    Parameters
    ----------
    n, p : int
        Number of samples and number of dimensions.
    noise : bool or float, default=False
        Multiplier applied to the noise term.
    coeffs : array-like of shape (p,), optional
        Per-dimension weights; defaults to ``exp(-0.022 * (i + 52))``.

    Returns
    -------
    x, y : ndarray of shape (n, p)
        ``x`` is drawn from ``N(0, 3**2)`` using the global ``np.random``
        state; ``y`` is the transformed (and optionally noised) copy.
    """

    def _draw():
        # Both the signal and the noise use a standard deviation of 3.
        return np.random.normal(scale=3, size=(n, p))

    x = _draw()
    eps = _draw()

    weights = coeffs
    if weights is None:
        weights = np.array([np.exp(-0.022 * (dim + 52)) for dim in range(p)])

    y = np.exp(x * weights) - 1 + noise * eps
    return x, y


def cubic(n, p, noise=False, coeffs=None):
    """Simulate the cubic polynomial ``y = z**3 + z**2 + z`` with ``z = coeffs * x``.

    Parameters
    ----------
    n, p : int
        Number of samples and number of dimensions.
    noise : bool or float, default=False
        Multiplier applied to the standard-normal noise term.
    coeffs : array-like of shape (p,), optional
        Per-dimension weights; defaults to ``exp(-0.031 * (i + 25))``.

    Returns
    -------
    x, y : ndarray of shape (n, p)
        Standard-normal ``x`` (global ``np.random`` state) and its transform.
    """
    shape = (n, p)
    x = np.random.normal(size=shape)
    eps = np.random.normal(size=shape)

    weights = coeffs
    if weights is None:
        weights = np.array([np.exp(-0.031 * (dim + 25)) for dim in range(p)])

    scaled = x * weights
    polynomial = scaled**3 + scaled**2 + scaled
    return x, polynomial + noise * eps


def step(n, p, noise=False, coeffs=None):
    """Simulate a step function: ``y = 1`` where ``coeffs * x > 0.5``, else ``0``.

    Parameters
    ----------
    n, p : int
        Number of samples and number of dimensions.
    noise : bool or float, default=False
        Multiplier applied to the standard-normal noise term.
    coeffs : array-like of shape (p,), optional
        Per-dimension weights; defaults to ``exp(-0.0457 * (i + 10))``.

    Returns
    -------
    x, y : ndarray of shape (n, p)
        Standard-normal ``x`` (global ``np.random`` state) and the thresholded,
        optionally noised, response.
    """
    shape = (n, p)
    x = np.random.normal(size=shape)

    weights = coeffs
    if weights is None:
        weights = np.array([np.exp(-0.0457 * (dim + 10)) for dim in range(p)])

    eps = np.random.normal(size=shape)

    # 0/1 indicator of the scaled input exceeding the 0.5 threshold.
    indicator = (x * weights > 0.5).astype(int)
    return x, indicator + noise * eps


def quadratic(n, p, noise=False, coeffs=None):
    """Simulate ``y = (coeffs * x)**2`` with optional additive noise.

    Parameters
    ----------
    n, p : int
        Number of samples and number of dimensions.
    noise : bool or float, default=False
        Multiplier applied to the standard-normal noise term.
    coeffs : array-like of shape (p,), optional
        Per-dimension weights; defaults to ``exp(-0.0325 * (i + 24))``.

    Returns
    -------
    x, y : ndarray of shape (n, p)
        Standard-normal ``x`` (global ``np.random`` state) and its transform.
    """
    shape = (n, p)
    x = np.random.normal(size=shape)

    weights = coeffs
    if weights is None:
        weights = np.array([np.exp(-0.0325 * (dim + 24)) for dim in range(p)])

    eps = np.random.normal(size=shape)

    scaled = x * weights
    return x, scaled**2 + noise * eps


def w_shaped(n, p, noise=False, coeffs=None):
    """Simulate the W-shaped quartic ``y = z**4 - 7 * z**2`` with ``z = coeffs * x``.

    Parameters
    ----------
    n, p : int
        Number of samples and number of dimensions.
    noise : bool or float, default=False
        Multiplier applied to the noise term.
    coeffs : array-like of shape (p,), optional
        Per-dimension weights; defaults to ``exp(-0.2735 * (i + 10))``.

    Returns
    -------
    x, y : ndarray of shape (n, p)
        ``x`` is drawn from ``N(0, 30**2)`` using the global ``np.random``
        state; ``y`` is the transformed (and optionally noised) copy.
    """
    shape = (n, p)
    x = np.random.normal(scale=30, size=shape)
    # The original implementation drew a second array here that never reached
    # the output. The draw is kept (and discarded) so the global random stream,
    # and therefore `eps` under a fixed `np.random.seed`, is unchanged.
    np.random.normal(scale=30, size=shape)
    if coeffs is None:
        coeffs = np.array([np.exp(-0.2735 * (i + 10)) for i in range(p)])
    eps = np.random.normal(scale=30, size=shape)

    x_coeffs = x * coeffs
    y = ((x_coeffs**4) - 7 * x_coeffs**2) + noise * eps

    return x, y


def logarithmic(n, p, noise=False, coeffs=None, seed=None):
    """Simulate ``y = log((coeffs * x + 1)**2)`` with optional additive noise.

    Parameters
    ----------
    n, p : int
        Number of samples and number of dimensions.
    noise : bool or float, default=False
        Multiplier applied to the standard-normal noise term.
    coeffs : array-like of shape (p,), optional
        Per-dimension weights; defaults to ``exp(-0.072 * i)``.
    seed : int, array-like of int, or None, default=None
        Seed for the function-local ``numpy.random.Generator``. ``None``
        (the default) keeps the previous behaviour of drawing from fresh OS
        entropy, which is not reproducible.

    Returns
    -------
    x, y : ndarray of shape (n, p)
        Standard-normal ``x`` and its log-transformed copy.
    """
    # Unlike most sibling helpers this one uses its own Generator rather than
    # the global `np.random` state, so `np.random.seed` does not affect it.
    rng = np.random.default_rng(seed)
    if coeffs is None:
        coeffs = np.array([np.exp(-0.072 * i) for i in range(p)])

    x = rng.standard_normal(size=(n, p))
    eps = rng.standard_normal(size=(n, p))

    y = np.log((x * coeffs + 1) ** 2) + noise * eps

    return x, y


def fourth_root(n, p, noise=False, coeffs=None):
    """Simulate ``y = 10 * |coeffs * x| ** 0.25`` with optional additive noise.

    Parameters
    ----------
    n, p : int
        Number of samples and number of dimensions.
    noise : bool or float, default=False
        Multiplier applied to the standard-normal noise term.
    coeffs : array-like of shape (p,), optional
        Per-dimension weights; defaults to ``exp(-0.25 * (i + 50))``.

    Returns
    -------
    x, y : ndarray of shape (n, p)
        Standard-normal ``x`` (global ``np.random`` state) and its transform.
    """
    shape = (n, p)
    x = np.random.normal(size=shape)
    eps = np.random.normal(size=shape)

    weights = coeffs
    if weights is None:
        weights = np.array([np.exp(-0.25 * (dim + 50)) for dim in range(p)])

    magnitude = np.abs(x * weights)
    return x, 10 * magnitude**0.25 + noise * eps


def _sin(n, p, noise=False, period=4 * np.pi, coeffs=None):
    """Simulate ``y = sin(coeffs * x * period)`` with optional additive noise.

    Parameters
    ----------
    n, p : int
        Number of samples and number of dimensions.
    noise : bool or float, default=False
        Multiplier applied to the standard-normal noise term.
    period : float, default=4 * pi
        Factor multiplied into the sine argument. Default coefficients exist
        only for ``4 * pi`` and ``16 * pi``.
    coeffs : array-like of shape (p,), optional
        Per-dimension weights. When omitted, ``exp(-0.0095 * (i + 50))`` is
        used for ``period == 4 * pi`` and ``exp(-0.015 * (i + 50))`` for
        ``period == 16 * pi``.

    Returns
    -------
    x, y : ndarray of shape (n, p)
        Standard-normal ``x`` and its sinusoidal transform.

    Raises
    ------
    ValueError
        If ``coeffs`` is omitted and ``period`` has no default coefficients.
        (Previously this case failed later with an opaque ``TypeError`` from
        multiplying by ``None``.)
    """
    # Function-local, unseeded generator: results are not reproducible and are
    # independent of the global `np.random` state.
    rng = np.random.default_rng()

    if coeffs is None:
        # Tolerant comparison: `period` may have been computed rather than
        # written as the exact literal expression.
        if np.isclose(period, 4 * np.pi):
            decay = 0.0095
        elif np.isclose(period, 16 * np.pi):
            decay = 0.015
        else:
            raise ValueError(
                "coeffs must be given when period is not 4*pi or 16*pi; "
                f"got period={period!r}"
            )
        coeffs = np.array([np.exp(-decay * (i + 50)) for i in range(p)])

    x = rng.normal(size=(n, p))
    eps = rng.normal(size=(n, p))

    y = np.sin(x * coeffs * period) + noise * eps

    return x, y


def sin_four_pi(n, p, noise=False, coeffs=None):
    """Sine simulation with the sine argument scaled by ``4 * pi``; see ``_sin``."""
    four_pi = 4 * np.pi
    return _sin(n, p, noise=noise, period=four_pi, coeffs=coeffs)


def sin_sixteen_pi(n, p, noise=False, coeffs=None):
    """Sine simulation with the sine argument scaled by ``16 * pi``; see ``_sin``."""
    sixteen_pi = 16 * np.pi
    return _sin(n, p, noise=noise, period=sixteen_pi, coeffs=coeffs)


def _square_diamond(n, p, noise=False, low=-1, high=1, period=-np.pi / 2, coeffs=None):
    """Simulate a rotated uniform square.

    Two independent uniform arrays ``u`` and ``v`` are combined through a
    rotation by the angle ``period`` to form ``x`` and ``y``.

    Parameters
    ----------
    n, p : int
        Number of samples and number of dimensions.
    noise : bool or float, default=False
        Multiplier applied to the uniform noise term added to ``y``.
    low, high : float, default=-1, 1
        Bounds of the uniform distribution used for ``u``, ``v`` and the noise.
    period : float, default=-pi / 2
        Rotation angle in radians (the name is kept for compatibility).
    coeffs : array-like of shape (p,), optional
        Per-dimension weights applied to ``y`` only; defaults to
        ``exp(-0.0042 * (i + 10))``.

    Returns
    -------
    x, y : ndarray of shape (n, p)
        ``x = u cos(t) + v sin(t)`` and
        ``y = -coeffs u sin(t) + coeffs v cos(t) + noise * eps``.
    """
    shape = (n, p)
    # Draw order (u, v, eps) is kept so runs seeded through `np.random.seed`
    # produce the same values as before.
    u = np.random.uniform(low, high, size=shape)
    v = np.random.uniform(low, high, size=shape)
    eps = np.random.uniform(low, high, size=shape)
    if coeffs is None:
        coeffs = np.array([np.exp(-0.0042 * (i + 10)) for i in range(p)])

    x = u * np.cos(period) + v * np.sin(period)
    y = -u * coeffs * np.sin(period) + v * coeffs * np.cos(period) + eps * noise

    return x, y


def square(n, p, noise=False, low=-1, high=1, coeffs=None):
    """Rotated-square simulation using an angle of ``-pi / 8``; see ``_square_diamond``."""
    angle = -np.pi / 8
    return _square_diamond(
        n, p, noise=noise, low=low, high=high, period=angle, coeffs=coeffs
    )


def two_parabolas(n, p, noise=False, prob=0.5, coeffs=None):
    """Simulate two mirrored parabolas: ``y = (coeffs * x)**2 * (u - 0.5)``.

    ``u`` is one Bernoulli(``prob``) draw per sample, so each row lands on
    either the upward (``+0.5``) or the downward (``-0.5``) parabola.

    Parameters
    ----------
    n, p : int
        Number of samples and number of dimensions.
    noise : bool or float, default=False
        Multiplier applied to the standard-normal noise term.
    prob : float, default=0.5
        Success probability of the per-sample Bernoulli draw.
    coeffs : array-like of shape (p,), optional
        Per-dimension weights; defaults to ``exp(-0.00145 * (i + 50))``.

    Returns
    -------
    x, y : ndarray of shape (n, p)
        Standard-normal ``x`` (global ``np.random`` state) and its transform.
    """
    shape = (n, p)
    x = np.random.normal(size=shape)

    weights = coeffs
    if weights is None:
        weights = np.array([np.exp(-0.00145 * (dim + 50)) for dim in range(p)])

    # One branch indicator per row, broadcast across the p columns.
    branch = np.random.binomial(1, prob, size=(n, 1))
    eps = np.random.normal(size=shape)

    scaled = x * weights
    return x, (scaled**2) * (branch - 0.5) + noise * eps


def diamond(n, p, noise=False, low=-1, high=1, coeffs=None):
    """Rotated-square simulation using an angle of ``-pi / 4``; see ``_square_diamond``."""
    angle = -np.pi / 4
    return _square_diamond(
        n, p, noise=noise, low=low, high=high, period=angle, coeffs=coeffs
    )


def multimodal_independence(n, p, prob=0.5, sep1=3, sep2=2):
    """Simulate two independent multimodal variables.

    Each output is a Gaussian draw divided by ``sep1`` plus a Bernoulli draw
    scaled by ``sep2``, shifted by ``-1``. ``x`` and ``y`` are built from
    separate draws of the same generator.

    Parameters
    ----------
    n, p : int
        Number of samples and number of dimensions.
    prob : float, default=0.5
        Success probability of the Bernoulli component.
    sep1 : float, default=3
        Divisor applied to the Gaussian component.
    sep2 : float, default=2
        Multiplier applied to the Bernoulli component.

    Returns
    -------
    x, y : ndarray of shape (n, p)
    """
    # Function-local, unseeded generator: results are not reproducible.
    generator = np.random.default_rng()
    identity_cov = np.identity(p)

    gauss_x = generator.multivariate_normal(
        np.zeros(p), identity_cov, size=n, method="cholesky"
    )
    gauss_y = generator.multivariate_normal(
        np.zeros(p), identity_cov, size=n, method="cholesky"
    )
    mode_x = generator.binomial(1, prob, size=(n, p))
    mode_y = generator.binomial(1, prob, size=(n, p))

    def _mix(gauss, mode):
        return gauss / sep1 + sep2 * mode - 1

    return _mix(gauss_x, mode_x), _mix(gauss_y, mode_y)

In [4]:
def make_log_collider(
    n_samples,
    n_features,
    n_features_2,
    noise_dims,
    noise_dims_2,
    seed,
    noise_1=False,
    noise_2=False,
):
    r"""Generate a two-view collider dataset (X1 -> y <- X2).

    Each view is produced by a helper that returns a raw draw and a
    transformed copy of it: ``linear`` for the first view and ``logarithmic``
    for the second. The raw draws are stacked on top of the transformed
    copies, so the output has ``2 * n_samples`` rows; the first half is
    labelled 0 and the second half 1. Standard-normal noise columns are
    appended to each view.

    Parameters
    ----------
    n_samples : int
        Number of rows per class (the output has twice as many rows).
    n_features : int
        Number of signal columns in the first view.
    n_features_2 : int
        Number of signal columns in the second view.
    noise_dims : int
        Number of N(0, 1) noise columns appended to the first view.
    noise_dims_2 : int
        Number of N(0, 1) noise columns appended to the second view.
    seed : int
        Accepted for interface compatibility but not used: no random number
        generator is seeded from it, so the output is not reproducible from
        this value.
    noise_1, noise_2 : bool or float, default=False
        Noise multipliers forwarded to ``linear`` and ``logarithmic``.

    Returns
    -------
    x : ndarray of shape (2 * n_samples, n_features + noise_dims + n_features_2 + noise_dims_2)
        First view (signal, then noise) followed by the second view.
    y : ndarray of shape (2 * n_samples,)
        Class labels: ``n_samples`` zeros followed by ``n_samples`` ones.
    """
    # First view: raw draws (class 0) stacked above their linear transform
    # (class 1), then padded with pure-noise columns.
    x_1, x1y = linear(n_samples, n_features, noise=noise_1)
    x_1 = np.vstack((x_1, x1y))
    x_1 = np.hstack((x_1, np.random.standard_normal((len(x_1), noise_dims))))

    # Second view: drawn independently of the first, with a logarithmic
    # transform separating the two classes.
    x_2, x2y = logarithmic(n_samples, n_features_2, noise=noise_2)
    x_2 = np.vstack((x_2, x2y))
    x_2 = np.hstack((x_2, np.random.standard_normal((len(x_2), noise_dims_2))))

    x = np.hstack((x_1, x_2))

    # Labels follow the stacking order above: raw rows first, transformed rows
    # second.
    y = np.repeat([0, 1], n_samples)
    return x, y

In [5]:
def make_directindirect(
    n_samples, n_features, n_features_2, noise_dims, class_probs, seed
):
    """Generate a two-view dataset from a two-component Gaussian mixture.

    Both mixture components share one mean vector and differ only in their
    covariance matrices. The first view keeps the first ``n_features`` columns
    of the first array returned by ``make_gaussian_mixture`` and is followed
    by ``noise_dims`` standard-normal columns and then by the second returned
    array.

    Parameters
    ----------
    n_samples : int
        Number of samples requested from ``make_gaussian_mixture``; also the
        number of rows of the appended noise block.
    n_features : int
        Number of leading columns kept from the first view.
    n_features_2 : int
        Dimensionality of the mixture mean and covariances.
    noise_dims : int
        Number of noise columns inserted between the two views.
    class_probs : sequence of float
        Forwarded to ``make_gaussian_mixture``.
    seed : int
        Seeds the local generator, both covariance matrices (``seed`` and
        ``seed + 123``) and ``make_gaussian_mixture``.

    Returns
    -------
    signal_X : ndarray
        Horizontal stack of the truncated first view, the noise block and the
        second view.
    y : ndarray
        Labels returned by ``make_gaussian_mixture``.
    """
    generator = np.random.default_rng(seed)

    # One shared mean for both components: only the covariances differ.
    shared_mean = generator.standard_normal(size=(n_features_2,))
    first_cov = make_spd_matrix(n_dim=n_features_2, random_state=seed)
    second_cov = make_spd_matrix(n_dim=n_features_2, random_state=seed + 123)

    views, labels = make_gaussian_mixture(
        [shared_mean, shared_mean],
        [first_cov, second_cov],
        n_samples=n_samples,
        noise=1.0,
        noise_dims=0,
        shuffle=True,
        class_probs=class_probs,
        random_state=seed,
    )

    first_view = views[0][:, :n_features]
    second_view = views[1]
    filler = generator.standard_normal(size=(n_samples, noise_dims))

    return np.hstack((first_view, filler, second_view)), labels

In [6]:
# def make_directindirect(
#     n_samples,
#     n_features,
#     n_features_2,
#     noise_dims,
#     noise_dims_2,
#     seed,
#     noise_1=False,
#     noise_2=False,
# ):
#     """
#     TODO
#     """
#     # generate X1 with linear relationship to y
#     x_1, x1y = linear(n_samples, n_features, noise=noise_1)

#     # generate X2 as a function of X1
#     x = np.random.normal(size=(n, p))
#     eps = np.random.normal(size=(n_samples, n_features_2))
#     if coeffs is None:
#         coeffs = np.array([np.exp(-0.0022 * (i + 10)) for i in range(n_features_2)])
#     x_21 = x1y * coeffs + noise_2 * eps

#     x_2, x2y = logarithmic(n_samples, n_features_2, noise=noise_2)

#     x_1 = np.vstack((x_1, x1y))
#     # now add noise dimensions for x_1
#     x_1 = np.hstack((x_1, np.random.standard_normal((len(x_1), noise_dims))))

#     x_2 = np.vstack((x_2, x2y))
#     # now add noise dimensions for x_2
#     x_2 = np.hstack((x_2, np.random.standard_normal((len(x_2), noise_dims_2))))

#     # stack them together
#     x = np.hstack((x_1, x_2))

#     # now generate y, which is a function of both X1 and X2
#     y = np.array([0] * (n_samples // 2) + [1] * (n_samples // 2)).reshape(-1, 1).ravel()
#     return x, y

In [7]:
def make_confounder(
    n_samples,
    n_features,
    n_features_2,
    noise_dims,
    noise_dims_2,
    seed,
    noise_1=False,
    noise_2=False,
):
    r"""Generate a two-view dataset where X1 confounds X2 and y (X2 <- X1 -> y).

    The first view stacks the raw draws from ``linear`` (labelled 0) above
    their linear transform (labelled 1) and appends N(0, 1) noise columns.
    The second view is a random linear map of the first view's signal columns
    plus optional noise, so it relates to ``y`` only through X1.

    The linear map is ``U @ S @ V`` with random orthogonal ``U``
    (``n_features x n_features``) and ``V`` (``n_features_2 x n_features_2``),
    and a rectangular diagonal ``S`` whose entries are ``exp(-0.022 * i)``.

    Parameters
    ----------
    n_samples : int
        Number of rows per class (the output has twice as many rows).
    n_features : int
        Number of signal columns in the first view.
    n_features_2 : int
        Number of signal columns in the second view.
    noise_dims : int
        Number of N(0, 1) noise columns appended to the first view.
    noise_dims_2 : int
        Number of N(0, 1) noise columns appended to the second view.
    seed : int
        Accepted for interface compatibility but not used: no random number
        generator is seeded from it.
    noise_1 : bool or float, default=False
        Noise multiplier forwarded to ``linear``.
    noise_2 : bool or float, default=False
        Multiplier of the N(0, 1) noise added to the second view's signal.

    Returns
    -------
    x : ndarray of shape (2 * n_samples, n_features + noise_dims + n_features_2 + noise_dims_2)
        First view (signal, then noise) followed by the second view.
    y : ndarray of shape (2 * n_samples,)
        Class labels: ``n_samples`` zeros followed by ``n_samples`` ones.
    """
    # First view: raw draws (class 0) above their linear transform (class 1),
    # padded with pure-noise columns.
    x_1, x1y = linear(n_samples, n_features, noise=noise_1)
    x_1 = np.vstack((x_1, x1y))
    x_1 = np.hstack((x_1, np.random.standard_normal((len(x_1), noise_dims))))

    # Additive noise for the second view, one row per stacked sample.
    eps = np.random.standard_normal(size=(n_samples * 2, n_features_2))

    # Random mixing matrix built from its singular value decomposition.
    rand_U = ortho_group.rvs(n_features)  # n_features x n_features
    rand_V = ortho_group.rvs(n_features_2).T  # n_features_2 x n_features_2
    svals = np.zeros((n_features, n_features_2))
    np.fill_diagonal(svals, [np.exp(-0.022 * i) for i in range(n_features)])
    coeff_arr = rand_U @ svals @ rand_V  # n_features x n_features_2

    # Second view depends only on the first view's signal columns.
    x_2 = x_1[:, :n_features] @ coeff_arr + noise_2 * eps
    x_2 = np.hstack((x_2, np.random.standard_normal((len(x_2), noise_dims_2))))

    x = np.hstack((x_1, x_2))

    # Labels follow the stacking order of the first view.
    y = np.repeat([0, 1], n_samples)
    return x, y

# Generate Datasets

In [8]:
# Dataset layout shared by the generators below.
# number of features in the first view
n_features = 10
# noise columns appended to the first view (first view is 10 + 90 = 100 wide)
noise_dims = 90
# signal columns in the second view
n_features_2 = 32
# noise columns padding the second view so that both views total 4096 columns
noise_dims_2 = 4096 - n_features_2 - (n_features + noise_dims)
# NOTE(review): not referenced by any cell visible in this notebook.
n_features_2_full = 4096

# Value passed as `n_samples` to the generators (evaluates to 1024).
n_samples_full = 2048 // 2

In [9]:
# Number of independently generated datasets (and test repetitions) per simulation.
n_repeats = 100

In [10]:
# Sanity check of the column layout used by the generators.
print(n_features, noise_dims, n_features_2, noise_dims_2)

10 90 32 3964


## Log (Collider)

In [235]:
def generate_log_collider_dataset(
    n_samples,
    n_features,
    n_features_2,
    noise_dims,
    noise_dims_2,
    n_repeats,
    output_dir="/Users/adam2392/Desktop/cancer/data/log_collider",
):
    """Generate ``n_repeats`` log-collider datasets and save each as an ``.npz`` file.

    Dataset ``idx`` is written to ``<output_dir>/log_collider_<idx>.npz`` with
    arrays ``X`` and ``y``, both as returned by ``make_log_collider`` with
    ``noise_1=True`` and ``noise_2=True``.

    Parameters
    ----------
    n_samples, n_features, n_features_2, noise_dims, noise_dims_2 : int
        Forwarded unchanged to ``make_log_collider``.
    n_repeats : int
        Number of datasets to generate.
    output_dir : str, optional
        Directory receiving the files; created if missing. Defaults to the
        location previously hard-coded in this function.

    Notes
    -----
    The per-dataset seed is ``seed * idx``, where ``seed`` is the
    notebook-level global.
    """
    os.makedirs(output_dir, exist_ok=True)

    for idx in range(n_repeats):
        X, y = make_log_collider(
            n_samples,
            n_features,
            n_features_2,
            noise_dims,
            noise_dims_2,
            seed * idx,
            noise_1=True,
            noise_2=True,
        )
        np.savez(os.path.join(output_dir, f"log_collider_{idx}.npz"), X=X, y=y)

In [236]:
# Sanity check of the sample count passed to the generators.
print(n_samples_full)

1024


In [237]:
# Write the log-collider datasets to disk, one .npz file per repeat.
generate_log_collider_dataset(
    n_samples_full,
    n_features,
    n_features_2,
    noise_dims,
    noise_dims_2,
    n_repeats,
)

## Confounder

In [233]:
def generate_confounder_dataset(
    n_samples,
    n_features,
    n_features_2,
    noise_dims,
    noise_dims_2,
    n_repeats,
    output_dir="/Users/adam2392/Desktop/cancer/data/confounder",
):
    """Generate ``n_repeats`` confounder datasets and save each as an ``.npz`` file.

    Dataset ``idx`` is written to ``<output_dir>/confounder_<idx>.npz`` with
    arrays ``X`` and ``y``, both as returned by ``make_confounder`` with
    ``noise_1=True`` and ``noise_2=True``.

    Parameters
    ----------
    n_samples, n_features, n_features_2, noise_dims, noise_dims_2 : int
        Forwarded unchanged to ``make_confounder``.
    n_repeats : int
        Number of datasets to generate.
    output_dir : str, optional
        Directory receiving the files; created if missing. Defaults to the
        location previously hard-coded in this function.

    Notes
    -----
    The per-dataset seed is ``seed * idx``, where ``seed`` is the
    notebook-level global.

    An earlier version also built a full log-collider dataset on every
    iteration and threw it away; that call was removed because its result was
    never used.
    """
    os.makedirs(output_dir, exist_ok=True)

    for idx in range(n_repeats):
        X, y = make_confounder(
            n_samples,
            n_features,
            n_features_2,
            noise_dims,
            noise_dims_2,
            seed * idx,
            noise_1=True,
            noise_2=True,
        )
        np.savez(os.path.join(output_dir, f"confounder_{idx}.npz"), X=X, y=y)

In [234]:
# Write the confounder datasets to disk, one .npz file per repeat.
generate_confounder_dataset(
    n_samples_full,
    n_features,
    n_features_2,
    noise_dims,
    noise_dims_2,
    n_repeats,
)

## Direct/Indirect Effects

In [221]:
def generate_directindirect_dataset(
    n_samples,
    n_features,
    n_features_2,
    noise_dims,
    noise_dims_2,
    n_repeats,
    seed,
    output_dir="/Users/adam2392/Desktop/cancer/data/direct-indirect",
):
    """Generate ``n_repeats`` direct/indirect datasets and save each as an ``.npz`` file.

    Each dataset is the output of ``make_directindirect`` followed by
    ``noise_dims_2`` standard-normal columns, written to
    ``<output_dir>/direct-indirect_<idx>.npz`` with arrays ``X`` and ``y``.

    Parameters
    ----------
    n_samples : int
        Number of rows per dataset.
    n_features : int
        Number of signal columns kept in the first view.
    n_features_2 : int
        Number of signal columns in the second view.
    noise_dims : int
        Number of noise columns in the first view. (Previously this argument
        was ignored and ``100 - n_features`` was used instead; the two agree
        for the values used in this notebook.)
    noise_dims_2 : int
        Number of trailing noise columns appended after the second view.
    n_repeats : int
        Number of datasets to generate.
    seed : int
        Base seed; dataset ``idx`` uses ``seed * idx``.
    output_dir : str, optional
        Directory receiving the files; created if missing. Defaults to the
        location previously hard-coded in this function.

    Notes
    -----
    Reads ``class_probs`` from the notebook-level globals, so that cell must
    have been executed first.
    """
    os.makedirs(output_dir, exist_ok=True)

    for idx in range(n_repeats):
        signal_X, y = make_directindirect(
            n_samples=n_samples,
            n_features=n_features,
            n_features_2=n_features_2,
            noise_dims=noise_dims,
            class_probs=class_probs,
            seed=seed * idx,
        )

        # `make_directindirect` seeds its own generator with `seed * idx`. A
        # second generator built from that same seed would replay the values
        # it already drew, so the trailing noise would duplicate numbers
        # already in the dataset. Adding a second seed word gives this
        # generator an independent, still deterministic, stream.
        rng = np.random.default_rng([seed * idx, 1])
        X = np.hstack(
            (
                signal_X,
                rng.standard_normal(size=(n_samples, noise_dims_2)),
            )
        )

        # Column index where each view ends; printed for inspection only.
        n_features_ends = [n_features + noise_dims, X.shape[1]]
        print(signal_X.shape, X.shape, n_features_ends)
        np.savez(os.path.join(output_dir, f"direct-indirect_{idx}.npz"), X=X, y=y)

In [222]:
# Write the direct/indirect datasets to disk, one .npz file per repeat.
# The generator reads the notebook-level `class_probs`, which is defined in the
# "hard-coded parameters" cell further down; run that cell first.
generate_directindirect_dataset(
    n_samples_full,
    n_features,
    n_features_2,
    noise_dims,
    noise_dims_2,
    n_repeats,
    seed,
)

## Independent View

In [214]:
def generate_independent_dataset(
    n_samples,
    n_features,
    n_features_2,
    noise_dims,
    n_repeats,
    seed,
    output_dir="/Users/adam2392/Desktop/cancer/data/independent",
):
    """Generate ``n_repeats`` datasets whose second view is pure noise.

    The first view comes from ``sklearn.datasets.make_classification`` with
    ``n_features + noise_dims`` columns, of which ``n_features`` are
    informative. The second view is ``n_features_2`` standard-normal columns
    drawn independently of the first view and of ``y``. Dataset ``idx`` is
    written to ``<output_dir>/independent_<idx>.npz`` with arrays ``X`` and
    ``y``.

    Parameters
    ----------
    n_samples : int
        Number of rows per dataset.
    n_features : int
        Number of informative columns in the first view.
    n_features_2 : int
        Width of the second (noise) view. Previously this argument was ignored
        and the global ``n_features_2_list[-1]`` was used instead. Pass the
        full second-view width here (the notebook's recorded run shows 4096).
    noise_dims : int
        Number of additional first-view columns beyond the informative ones.
    n_repeats : int
        Number of datasets to generate.
    seed : int
        Seeds the noise generator once; ``make_classification`` for dataset
        ``idx`` uses ``random_state=seed * idx``.
    output_dir : str, optional
        Directory receiving the files; created if missing. Defaults to the
        location previously hard-coded in this function.
    """
    os.makedirs(output_dir, exist_ok=True)

    # One generator for all repeats, so each dataset gets fresh noise.
    rng = np.random.default_rng(seed)
    for idx in range(n_repeats):
        signal_X, y = make_classification(
            n_samples=n_samples,
            n_features=n_features + noise_dims,
            n_redundant=0,
            shuffle=True,
            n_informative=n_features,
            random_state=seed * idx,
        )
        X = np.hstack(
            (signal_X, rng.standard_normal(size=(n_samples, n_features_2)))
        )
        np.savez(os.path.join(output_dir, f"independent_{idx}.npz"), X=X, y=y)

In [218]:
# Sanity check of the values passed to the independent-view generator.
# NOTE(review): the recorded output shows n_features_2 == 4096 here, not the 32
# assigned in the dimensions cell near the top, so this cell was evidently run
# after n_features_2 had been reassigned further down.
print(n_features_2, noise_dims, n_features)

4096 90 10


In [216]:
# Write the independent-view datasets to disk, one .npz file per repeat.
generate_independent_dataset(
    n_samples_full, n_features, n_features_2, noise_dims, n_repeats, seed
)

# Run Co-MIGHT

In [18]:
def _run_parallel_comight(
    idx, n_samples, seed, n_features_2, test_size, sim_type, rootdir, output_folder
):
    """Run one Co-MIGHT test on a pre-generated dataset and save the result.

    Loads ``<rootdir>/data/<sim_type>/<sim_type>_<idx>.npz``, keeps the first
    ``100 + n_features_2`` columns, optionally subsamples rows, tests the
    second feature set with ``FeatureImportanceForestClassifier.test`` and
    writes the outputs to
    ``<rootdir>/output/<output_folder>/<sim_type>/comight_<n_samples>_<n_features_2>_<idx>.npz``.

    Parameters
    ----------
    idx : int
        The index of the pre-generated dataset, stored as npz file.
    n_samples : int
        The number of samples to keep. Must not exceed the number of rows in
        the stored dataset.
    seed : int
        The random seed, used for the forest, the test wrapper and the
        stratified subsample.
    n_features_2 : int
        The number of dimensions to keep in feature set 2.
    test_size : float
        The size of the test set to use for predictive-model based tests.
    sim_type : str
        The simulation type; it selects the data sub-directory and file
        prefix. The notebook uses 'log_collider', 'confounder',
        'direct-indirect' and 'independent'.
    rootdir : str
        The root directory where 'data/' and 'output/' will be.
    output_folder : str
        Sub-directory of 'output/' for this experiment, e.g.
        'varying-dimensionality' or 'varying-samples'.

    Notes
    -----
    Reads ``n_estimators`` and ``max_features`` from the notebook-level
    globals.
    """
    # Each job runs its forest single-threaded; parallelism comes from the
    # caller dispatching many jobs.
    n_jobs = 1
    # The first feature set occupies the first 100 columns of every stored
    # dataset; the end of the second set is filled in once X is trimmed.
    n_features_ends = [100, None]

    # set output directory to save npz files
    output_dir = os.path.join(rootdir, f"output/{output_folder}/{sim_type}/")
    os.makedirs(output_dir, exist_ok=True)
    output_fname = os.path.join(
        output_dir, f"comight_{n_samples}_{n_features_2}_{idx}.npz"
    )

    # load data
    npy_data = np.load(os.path.join(rootdir, f"data/{sim_type}/{sim_type}_{idx}.npz"))

    X = npy_data["X"]
    y = npy_data["y"]

    X = X[:, : 100 + n_features_2]
    if n_samples < X.shape[0]:
        # Class-balanced subsample. Seeding the splitter makes the selected
        # rows reproducible; previously they changed on every run despite the
        # `seed` argument.
        cv = StratifiedShuffleSplit(
            n_splits=1, train_size=n_samples, random_state=seed
        )
        train_idx, _ = next(cv.split(X, y))
        X = X[train_idx, :]
        y = y[train_idx, ...].squeeze()
    assert len(X) == len(y)
    assert len(y) == n_samples
    n_features_ends[1] = X.shape[1]

    est = FeatureImportanceForestClassifier(
        estimator=HonestForestClassifier(
            n_estimators=n_estimators,
            tree_estimator=MultiViewDecisionTreeClassifier(
                # NOTE(review): with the notebook's max_features = 0.3 the
                # second entry evaluates to min(n_features_2, 30.0); confirm
                # how sktree interprets that value.
                max_features=[max_features, min(n_features_2, max_features * 100)],
                feature_set_ends=n_features_ends,
                apply_max_features_per_feature_set=True,
            ),
            random_state=seed,
            honest_fraction=0.5,
            n_jobs=n_jobs,
        ),
        random_state=seed,
        test_size=test_size,
        sample_dataset_per_tree=False,
    )

    # now compute the pvalue when shuffling X2
    covariate_index = np.arange(n_features_ends[0], n_features_ends[1])

    # Estimate CMI with
    mi_rf, pvalue = est.test(
        X,
        y,
        covariate_index=covariate_index,
        return_posteriors=True,
        metric="mi",
    )
    comight_posteriors_x2 = est.observe_posteriors_
    comight_null_posteriors_x2 = est.permute_posteriors_
    print(sim_type, pvalue)
    samples = est.observe_samples_

    # The posteriors of the samples reported by the estimator must be NaN-free.
    assert np.isnan(comight_posteriors_x2[:, samples, :]).sum() == 0
    print(f"Saving to {output_fname}.")
    np.savez(
        output_fname,
        n_samples=n_samples,
        n_features_2=n_features_2,
        y_true=y,
        comight_pvalue=pvalue,
        comight_mi=mi_rf,
        comight_posteriors_x2=comight_posteriors_x2,
        comight_null_posteriors_x2=comight_null_posteriors_x2,
    )

In [12]:
# hard-coded parameters
# `n_estimators` and `max_features` are read as globals by
# `_run_parallel_comight`; `test_size` is passed to it explicitly.
n_estimators = 500
max_features = 0.3
test_size = 0.2
n_jobs = -1

# Defaults used when the other quantity is being swept.
n_samples = 512
n_features_2 = 4096

max_fpr = 0.1

# number of features in the second view
pows = np.arange(2, 13, dtype=int)
# `exponent` rather than `pow`, to avoid shadowing the builtin.
n_features_2_list = [2**exponent for exponent in pows]
print(n_features_2_list)

# n_samples_list = [2**x for x in range(6, 12)]
n_samples_list = [2**x for x in range(6, 11)]
print(n_samples_list)
class_probs = [0.5, 0.5]

[4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096]
[64, 128, 256, 512, 1024]


## Collider

In [13]:
# Root directory containing the 'data/' inputs and receiving the 'output/' results.
rootdir = "/Users/adam2392/Desktop/cancer/"

In [14]:
# Sanity check of the fixed values used while the other quantity is swept.
print(n_samples, n_features_2)

512 4096


In [15]:
# Log-collider, varying dimensionality: one job per (repeat, n_features_2)
# pair with n_samples held fixed. Every job receives the same seed (seed + 1).
Parallel(n_jobs=-1)(
    delayed(_run_parallel_comight)(
        idx_,
        n_samples,
        seed + 1,
        n_features_2_,
        test_size,
        "log_collider",
        rootdir,
        "varying-dimensionality",
    )
    for (idx_, n_features_2_) in product(range(n_repeats), n_features_2_list)
)

print("done")

done


In [241]:
# Log-collider, varying sample size: one job per (repeat, n_samples) pair with
# n_features_2 held fixed. Every job receives the same seed (seed + 1).
Parallel(n_jobs=-1)(
    delayed(_run_parallel_comight)(
        idx_,
        n_samples_,
        seed + 1,
        n_features_2,
        test_size,
        "log_collider",
        rootdir,
        "varying-samples",
    )
    for (idx_, n_samples_) in product(range(n_repeats), n_samples_list)
)
print("done")

done


In [139]:
# Completion marker left over from interactive use.
print("done")

done


## Confounder

In [16]:
# Same root directory as in the collider section; re-assigned so this section
# can be run on its own.
rootdir = "/Users/adam2392/Desktop/cancer/"

In [17]:
# Sanity check of the fixed sample size used for the dimensionality sweep.
print(n_samples)

512


In [18]:
# Confounder, varying dimensionality: one job per (repeat, n_features_2) pair
# with n_samples held fixed. Every job receives the same seed (seed + 1).
Parallel(n_jobs=-1)(
    delayed(_run_parallel_comight)(
        idx_,
        n_samples,
        seed + 1,
        n_features_2_,
        test_size,
        "confounder",
        rootdir,
        "varying-dimensionality",
    )
    for (idx_, n_features_2_) in product(range(n_repeats), n_features_2_list)
)

print("done")

done


In [197]:
# Fix the second-view width for the sample-size sweep below.
n_features_2 = 4096

In [198]:
# Confounder, varying sample size: one job per (repeat, n_samples) pair with
# n_features_2 held fixed. Every job receives the same seed (seed + 1).
Parallel(n_jobs=-1)(
    delayed(_run_parallel_comight)(
        idx_,
        n_samples_,
        seed + 1,
        n_features_2,
        test_size,
        "confounder",
        rootdir,
        "varying-samples",
    )
    for (idx_, n_samples_) in product(range(n_repeats), n_samples_list)
)
print("done")

done


In [24]:
# Completion marker left over from interactive use.
print("done")

done


## Direct-Indirect (Linear)

In [19]:
# Direct/indirect, varying dimensionality: one job per (repeat, n_features_2)
# pair with n_samples held fixed. Every job receives the same seed (seed + 1).
Parallel(n_jobs=-1)(
    delayed(_run_parallel_comight)(
        idx_,
        n_samples,
        seed + 1,
        n_features_2_,
        test_size,
        "direct-indirect",
        rootdir,
        "varying-dimensionality",
    )
    for (idx_, n_features_2_) in product(range(n_repeats), n_features_2_list)
)

print("done")

done
direct-indirect 0.000999000999000999
Saving to /Users/adam2392/Desktop/cancer/output/varying-dimensionality/direct-indirect/comight_512_256_0.npz.
direct-indirect 0.000999000999000999
Saving to /Users/adam2392/Desktop/cancer/output/varying-dimensionality/direct-indirect/comight_512_32_1.npz.
direct-indirect 0.000999000999000999
Saving to /Users/adam2392/Desktop/cancer/output/varying-dimensionality/direct-indirect/comight_512_1024_1.npz.
direct-indirect 0.000999000999000999
Saving to /Users/adam2392/Desktop/cancer/output/varying-dimensionality/direct-indirect/comight_512_128_2.npz.
direct-indirect 0.000999000999000999
Saving to /Users/adam2392/Desktop/cancer/output/varying-dimensionality/direct-indirect/comight_512_16_3.npz.
direct-indirect 0.000999000999000999
Saving to /Users/adam2392/Desktop/cancer/output/varying-dimensionality/direct-indirect/comight_512_2048_3.npz.
direct-indirect 0.000999000999000999
Saving to /Users/adam2392/Desktop/cancer/output/varying-dimensionality/direc

In [22]:
# Fix the second-view width for the sample-size sweep below.
n_features_2 = 4096

# Spot-check one saved result file: list the stored arrays and print its p-value.
test = np.load('/Users/adam2392/Desktop/cancer/output/varying-dimensionality/direct-indirect/comight_512_128_22.npz')

print(dict(test).keys())
print(test['comight_pvalue'])

dict_keys(['n_samples', 'n_features_2', 'y_true', 'comight_pvalue', 'comight_mi', 'comight_posteriors_x2', 'comight_null_posteriors_x2'])
0.000999000999000999


In [None]:
# Direct/indirect, varying sample size: one job per (repeat, n_samples) pair
# with n_features_2 held fixed. Every job receives the same seed (seed + 1).
# NOTE(review): this cell has no execution count or output in the saved
# notebook, so there is no recorded run of it.
Parallel(n_jobs=-1)(
    delayed(_run_parallel_comight)(
        idx_,
        n_samples_,
        seed + 1,
        n_features_2,
        test_size,
        "direct-indirect",
        rootdir,
        "varying-samples",
    )
    for (idx_, n_samples_) in product(range(n_repeats), n_samples_list)
)
print("done")

In [None]:
# Completion marker left over from interactive use.
print("done")

## Independent

In [21]:
# Independent view, varying dimensionality: one job per (repeat, n_features_2)
# pair with n_samples held fixed. Every job receives the same seed (seed + 1).
# NOTE(review): the recorded output of this cell is a FileNotFoundError on an
# output .npz path, so this sweep did not complete in the saved run; the cause
# cannot be determined from the notebook alone.
Parallel(n_jobs=-1)(
    delayed(_run_parallel_comight)(
        idx_,
        n_samples,
        seed + 1,
        n_features_2_,
        test_size,
        "independent",
        rootdir,
        "varying-dimensionality",
    )
    for (idx_, n_features_2_) in product(range(n_repeats), n_features_2_list)
)

print("done")

FileNotFoundError: [Errno 2] No such file or directory: '/Users/adam2392/Desktop/cancer/output/varying-dimensionality/independent/comight_512_512_22.npz'

In [None]:
# Fix the second-view width for the sample-size sweep below.
n_features_2 = 4096

In [None]:
# Independent view, varying sample size: one job per (repeat, n_samples) pair
# with n_features_2 held fixed. Every job receives the same seed (seed + 1).
# NOTE(review): this cell has no execution count or output in the saved
# notebook, so there is no recorded run of it.
Parallel(n_jobs=-1)(
    delayed(_run_parallel_comight)(
        idx_,
        n_samples_,
        seed + 1,
        n_features_2,
        test_size,
        "independent",
        rootdir,
        "varying-samples",
    )
    for (idx_, n_samples_) in product(range(n_repeats), n_samples_list)
)
print("done")

In [None]:
# Completion marker left over from interactive use.
print("done")