# A comparison of permuting per tree vs. the regular Coleman method

In [5]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell is executed.
%load_ext autoreload
%autoreload 2

In [6]:
import math
import os
from collections import defaultdict
from itertools import product

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from hyppo.conditional import ConditionalDcorr
from joblib import Parallel, delayed
from scipy.special import expit
from scipy.stats import ortho_group
from sklearn.datasets import (
    make_blobs,
    make_classification,
    make_sparse_spd_matrix,
    make_spd_matrix,
)
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import (
    StratifiedKFold,
    StratifiedShuffleSplit,
    cross_val_score,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sktree import HonestForestClassifier, RandomForestClassifier, RandomForestRegressor
from sktree.datasets.multiview import make_gaussian_mixture, make_joint_factor_model
from sktree.stats import (
    FeatureImportanceForestClassifier,
    FeatureImportanceForestRegressor,
    PermutationForestRegressor,
    PermutationTest,
)
from sktree.stats.utils import (
    METRIC_FUNCTIONS,
    POSITIVE_METRICS,
    POSTERIOR_FUNCTIONS,
    REGRESSOR_METRICS,
    _compute_null_distribution_coleman,
    _non_nan_samples,
)
from sktree.tree import DecisionTreeClassifier, MultiViewDecisionTreeClassifier

# Base random seed; the experiment cells below pass `seed + 1` to every job.
seed = 12345
# NumPy random generator seeded with the same base seed.
rng = np.random.default_rng(seed)

# Run Co-MIGHT

In [7]:
def _run_parallel_comight(
    idx, n_samples, seed, n_features_2, test_size, sim_type, rootdir, output_folder
):
    """Run Co-MIGHT on one pre-generated dataset and save the results as npz.

    The dataset is read from ``<rootdir>/data/<sim_type>/<sim_type>_<idx>.npz``
    and the results are written to
    ``<rootdir>/output/<output_folder>/<sim_type>/comight_<n_samples>_<n_features_2>_<idx>.npz``.

    Parameters
    ----------
    idx : int
        The index of the pre-generated dataset, stored as npz file.
    n_samples : int
        The number of samples to keep. When the dataset holds more samples,
        a stratified random subsample of this size is drawn.
    seed : int
        The random seed. Used for the forest, the test wrapper and the
        stratified subsample.
    n_features_2 : int
        The number of dimensions to keep in feature set 2.
    test_size : float
        The size of the test set to use for predictive-model based tests.
    sim_type : str
        The simulation type. Either 'independent', 'collider', 'confounder',
        or 'direct-indirect'.
    rootdir : str
        The root directory where 'data/' and 'output/' will be.
    output_folder : str
        Name of the sub-folder of ``<rootdir>/output/`` that receives the
        result files.

    Returns
    -------
    None
        Results are only written to disk.

    Notes
    -----
    Reads ``n_estimators`` and ``max_features`` from the enclosing (notebook)
    namespace; both must be defined before this function is called.
    """
    n_jobs = 1
    # Feature set 1 is taken to be the leading 100 columns of X.
    n_features_1 = 100

    # set output directory to save npz files
    output_dir = os.path.join(rootdir, f"output/{output_folder}/{sim_type}/")
    os.makedirs(output_dir, exist_ok=True)

    # load data; the context manager closes the underlying npz archive
    data_path = os.path.join(rootdir, f"data/{sim_type}/{sim_type}_{idx}.npz")
    with np.load(data_path) as npy_data:
        X = npy_data["X"]
        y = npy_data["y"]

    # keep feature set 1 plus the first `n_features_2` columns that follow it
    X = X[:, : n_features_1 + n_features_2]
    if n_samples < X.shape[0]:
        # Seed the splitter so the same subsample is drawn on every re-run.
        cv = StratifiedShuffleSplit(
            n_splits=1, train_size=n_samples, random_state=seed
        )
        train_idx, _ = next(cv.split(X, y))
        X = X[train_idx, :]
        y = y[train_idx, ...].squeeze()
    assert len(X) == len(y)
    assert len(y) == n_samples
    n_features_ends = [n_features_1, X.shape[1]]

    est = FeatureImportanceForestClassifier(
        estimator=HonestForestClassifier(
            n_estimators=n_estimators,
            tree_estimator=MultiViewDecisionTreeClassifier(
                max_features=[max_features, min(n_features_2, max_features * 100)],
                feature_set_ends=n_features_ends,
                apply_max_features_per_feature_set=True,
            ),
            random_state=seed,
            honest_fraction=0.5,
            n_jobs=n_jobs,
        ),
        random_state=seed,
        test_size=test_size,
        sample_dataset_per_tree=False,
        permute_forest_fraction=1.0 / n_estimators,
    )

    # now compute the pvalue when shuffling X2
    covariate_index = np.arange(n_features_ends[0], n_features_ends[1])

    # Run the test with the "mi" metric on the second feature set.
    mi_rf, pvalue = est.test(
        X,
        y,
        covariate_index=covariate_index,
        return_posteriors=True,
        metric="mi",
    )
    comight_posteriors_x2 = est.observe_posteriors_
    comight_null_posteriors_x2 = est.permute_posteriors_

    # Sanity check: the posteriors at the observed sample indices hold no NaN.
    samples = est.observe_samples_
    assert np.isnan(comight_posteriors_x2[:, samples, :]).sum() == 0

    np.savez(
        os.path.join(output_dir, f"comight_{n_samples}_{n_features_2}_{idx}.npz"),
        n_samples=n_samples,
        n_features_2=n_features_2,
        y_true=y,
        comight_pvalue=pvalue,
        comight_mi=mi_rf,
        comight_posteriors_x2=comight_posteriors_x2,
        comight_null_posteriors_x2=comight_null_posteriors_x2,
    )

In [8]:
# Hard-coded experiment parameters.
# NOTE(review): the sweep cells further down also use `n_repeats`, which is
# not assigned in this cell -- confirm it is defined elsewhere before running.
n_estimators = 500
max_features = 0.3
test_size = 0.2
n_jobs = -1

# Values held fixed while the other quantity is swept.
n_samples = 512
n_features_2 = 4096

max_fpr = 0.1

# Dimensionalities of the second view: 2**2 up to 2**12.
pows = np.arange(2, 13, dtype=int)
n_features_2_list = list(2**pows)
print(n_features_2_list)

# Sample sizes: 2**6 up to 2**10 (range(6, 12) would extend this to 2**11).
n_samples_list = [1 << shift for shift in range(6, 11)]
print(n_samples_list)
class_probs = [0.5, 0.5]

[4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096]
[64, 128, 256, 512, 1024]


## Collider

In [3]:
# Root directory holding the 'data/' and 'output/' folders used by the jobs.
# NOTE(review): machine-specific absolute path -- adjust before running elsewhere.
rootdir = "/Users/adam2392/Desktop/cancer/"

In [4]:
# Show the fixed sample size and second-view dimensionality currently in effect.
print(n_samples, n_features_2)

512 4096


In [240]:
# Collider ("log_collider" data): sweep the second-view dimensionality at the
# fixed sample size, one job per (dataset index, dimensionality) pair.
Parallel(n_jobs=-1)(
    delayed(_run_parallel_comight)(
        idx=repeat_idx,
        n_samples=n_samples,
        seed=seed + 1,
        n_features_2=n_dims,
        test_size=test_size,
        sim_type="log_collider",
        rootdir=rootdir,
        output_folder="varying-dimensionality-permute-per-tree",
    )
    for repeat_idx in range(n_repeats)
    for n_dims in n_features_2_list
)

print("done")

done


In [241]:
# Collider ("log_collider" data): sweep the sample size at the fixed
# second-view dimensionality, one job per (dataset index, sample size) pair.
Parallel(n_jobs=-1)(
    delayed(_run_parallel_comight)(
        idx=repeat_idx,
        n_samples=sample_count,
        seed=seed + 1,
        n_features_2=n_features_2,
        test_size=test_size,
        sim_type="log_collider",
        rootdir=rootdir,
        output_folder="varying-samples",
    )
    for repeat_idx in range(n_repeats)
    for sample_count in n_samples_list
)
print("done")

done


In [139]:
# Stand-alone completion marker; prints only once execution reaches this cell.
print("done")

done


## Confounder

In [224]:
# Root directory holding the 'data/' and 'output/' folders used by the jobs.
# NOTE(review): machine-specific absolute path -- adjust before running elsewhere.
rootdir = "/Users/adam2392/Desktop/cancer/"

In [225]:
# Show the fixed sample size used by the dimensionality sweep below.
print(n_samples)

512


In [226]:
# Confounder: sweep the second-view dimensionality at the fixed sample size,
# one job per (dataset index, dimensionality) pair.
Parallel(n_jobs=-1)(
    delayed(_run_parallel_comight)(
        idx=repeat_idx,
        n_samples=n_samples,
        seed=seed + 1,
        n_features_2=n_dims,
        test_size=test_size,
        sim_type="confounder",
        rootdir=rootdir,
        output_folder="varying-dimensionality",
    )
    for repeat_idx in range(n_repeats)
    for n_dims in n_features_2_list
)

print("done")

done


In [197]:
# Pin the second-view dimensionality used by the sample-size sweep below.
n_features_2 = 4096

In [198]:
# Confounder: sweep the sample size at the fixed second-view dimensionality,
# one job per (dataset index, sample size) pair.
Parallel(n_jobs=-1)(
    delayed(_run_parallel_comight)(
        idx=repeat_idx,
        n_samples=sample_count,
        seed=seed + 1,
        n_features_2=n_features_2,
        test_size=test_size,
        sim_type="confounder",
        rootdir=rootdir,
        output_folder="varying-samples",
    )
    for repeat_idx in range(n_repeats)
    for sample_count in n_samples_list
)
print("done")

done


In [199]:
# Stand-alone completion marker; prints only once execution reaches this cell.
print("done")

done


## Direct-Indirect (Linear)

In [227]:
# Direct-indirect: sweep the second-view dimensionality at the fixed sample
# size, one job per (dataset index, dimensionality) pair.
Parallel(n_jobs=-1)(
    delayed(_run_parallel_comight)(
        idx=repeat_idx,
        n_samples=n_samples,
        seed=seed + 1,
        n_features_2=n_dims,
        test_size=test_size,
        sim_type="direct-indirect",
        rootdir=rootdir,
        output_folder="varying-dimensionality",
    )
    for repeat_idx in range(n_repeats)
    for n_dims in n_features_2_list
)

print("done")

done


In [228]:
# Pin the second-view dimensionality used by the sample-size sweep below.
n_features_2 = 4096

In [None]:
# Direct-indirect: sweep the sample size at the fixed second-view
# dimensionality, one job per (dataset index, sample size) pair.
Parallel(n_jobs=-1)(
    delayed(_run_parallel_comight)(
        idx=repeat_idx,
        n_samples=sample_count,
        seed=seed + 1,
        n_features_2=n_features_2,
        test_size=test_size,
        sim_type="direct-indirect",
        rootdir=rootdir,
        output_folder="varying-samples",
    )
    for repeat_idx in range(n_repeats)
    for sample_count in n_samples_list
)
print("done")

In [None]:
# Stand-alone completion marker; prints only once execution reaches this cell.
print("done")

## Independent

In [229]:
# Independent: sweep the second-view dimensionality at the fixed sample size,
# one job per (dataset index, dimensionality) pair.
Parallel(n_jobs=-1)(
    delayed(_run_parallel_comight)(
        idx=repeat_idx,
        n_samples=n_samples,
        seed=seed + 1,
        n_features_2=n_dims,
        test_size=test_size,
        sim_type="independent",
        rootdir=rootdir,
        output_folder="varying-dimensionality",
    )
    for repeat_idx in range(n_repeats)
    for n_dims in n_features_2_list
)

print("done")

done


In [230]:
# Pin the second-view dimensionality used by the sample-size sweep below.
n_features_2 = 4096

In [None]:
# Independent: sweep the sample size at the fixed second-view dimensionality,
# one job per (dataset index, sample size) pair.
Parallel(n_jobs=-1)(
    delayed(_run_parallel_comight)(
        idx=repeat_idx,
        n_samples=sample_count,
        seed=seed + 1,
        n_features_2=n_features_2,
        test_size=test_size,
        sim_type="independent",
        rootdir=rootdir,
        output_folder="varying-samples",
    )
    for repeat_idx in range(n_repeats)
    for sample_count in n_samples_list
)
print("done")

In [None]:
# Stand-alone completion marker; prints only once execution reaches this cell.
print("done")