# Old Scripts for Running Different Models

In [468]:
def make_mediator(
    n_samples, n_features, noise_dims, n_features_1, n_views, joint_rank, seed
):
    """Simulate a mediator-style dataset ``(X, y)`` from a joint factor model.

    The views returned by ``make_joint_factor_model`` are stacked column-wise,
    perturbed with Gaussian noise, and used to draw binary labels. The second
    array returned by the factor model (``U``) is perturbed, padded with
    Gaussian noise columns up to ``n_features_1`` columns, and placed in front
    of the stacked views.

    Parameters
    ----------
    n_samples : int
        Number of rows to generate.
    n_features : int
        Forwarded to ``make_joint_factor_model``; also the number of leading
        columns of the stacked views that drive the labels.
    noise_dims : int
        Accepted for call compatibility; not used by this function.
    n_features_1 : int
        Total number of columns of the first block (``U`` plus padding).
        Must be at least ``U.shape[1]``.
    n_views : int
        Forwarded to ``make_joint_factor_model``.
    joint_rank : int
        Forwarded to ``make_joint_factor_model``.
    seed : int
        Seed for the factor model and for every random draw made here.

    Returns
    -------
    X : ndarray
        ``[U | padding | stacked views]`` stacked column-wise.
    y : ndarray of shape (n_samples,)
        Binary labels.

    Raises
    ------
    ValueError
        If ``n_features_1`` is smaller than the number of columns of ``U``.
    """
    # Use a generator derived from ``seed`` instead of a notebook-global ``rng``
    # so the output is reproducible from the arguments alone. A spawned child
    # stream is used so that these draws do not replay the stream of a caller
    # that builds its own ``default_rng(seed)`` with the same seed.
    rng = np.random.default_rng(np.random.SeedSequence(seed).spawn(1)[0])

    # presumably U is the shared (joint) score matrix of the views - TODO confirm
    Xs, U, _ = make_joint_factor_model(
        n_views,
        n_features,
        n_samples=n_samples,
        joint_rank=joint_rank,
        noise_std=10.0,
        m=0.5,
        random_state=seed,
        return_decomp=True,
    )

    signal_X = np.hstack(Xs)
    signal_X += rng.standard_normal(size=signal_X.shape) * 2.0
    print(signal_X.shape)

    # labels: Bernoulli draw whose probability is the logistic function of the
    # row-sum over the first ``n_features`` stacked columns
    y = rng.binomial(n=1, p=expit(signal_X[:, :n_features].sum(axis=1)), size=n_samples)

    # make signal view (i.e. X1) higher-dimensional
    U += rng.standard_normal(size=U.shape)
    n_pad = n_features_1 - U.shape[1]
    if n_pad < 0:
        raise ValueError(
            f"n_features_1={n_features_1} is smaller than the {U.shape[1]} "
            "columns already present in U"
        )
    U = np.hstack((U, rng.standard_normal(size=(signal_X.shape[0], n_pad))))

    # first view is X1, which generates X2
    X = np.hstack((U, signal_X))

    return X, y

## Mediator

In [469]:
def generate_mediator_dataset(
    n_samples, n_features, class_probs, test_size, max_fpr, n_repeats, seed
):
    """Generate ``n_repeats`` mediator datasets and save each as an ``.npz`` file.

    Each repeat calls ``make_mediator``, appends standard-normal noise columns,
    and writes ``X`` and ``y`` to ``./mediator/mediator_{idx}.npz``.

    Besides its arguments, this function reads the module-level names
    ``noise_dims`` and ``n_features_2_list``.

    Parameters
    ----------
    n_samples : int
        Number of rows per dataset.
    n_features : int
        Half of this value is forwarded to ``make_mediator`` as ``n_features``;
        the full value is subtracted from ``n_features_2_list[-1]`` to get the
        number of appended noise columns.
    class_probs, test_size, max_fpr
        Accepted for call compatibility; not used by this function.
    n_repeats : int
        Number of datasets to write.
    seed : int
        Base seed; repeat ``idx`` uses ``seed + idx``.
    """
    from pathlib import Path

    n_views = 2
    # width of the first block; also the end index of the first feature set
    n_features_1 = 100
    rng = np.random.default_rng(seed)

    # np.savez does not create missing directories
    out_dir = Path("./mediator")
    out_dir.mkdir(parents=True, exist_ok=True)

    for idx in range(n_repeats):
        signal_X, y = make_mediator(
            n_samples=n_samples,
            n_features=n_features // 2,
            noise_dims=noise_dims // 2,
            n_features_1=n_features_1,
            n_views=n_views,
            joint_rank=5,
            # ``seed + idx`` (as in the other simulation cells) rather than
            # ``seed * idx``: the product is 0 for the first repeat whatever the
            # seed, and 0 for every repeat when ``seed == 0``.
            seed=seed + idx,
        )
        n_features_ends = [n_features_1, None]

        # pad with pure-noise columns
        X = np.hstack(
            (
                signal_X,
                rng.standard_normal(
                    size=(n_samples, n_features_2_list[-1] - n_features)
                ),
            )
        )
        n_features_ends[1] = X.shape[1]

        print(X.shape, signal_X.shape, y.shape, n_features_ends)
        np.savez(out_dir / f"mediator_{idx}.npz", X=X, y=y)

In [470]:
# Write the mediator datasets to disk using the notebook-level settings.
generate_mediator_dataset(
    n_samples=n_samples,
    n_features=n_features,
    class_probs=class_probs,
    test_size=test_size,
    max_fpr=max_fpr,
    n_repeats=n_repeats,
    seed=seed,
)

(256, 10)
(256, 4196) (256, 110) (256,) [100, 4196]
(256, 10)
(256, 4196) (256, 110) (256,) [100, 4196]
(256, 10)
(256, 4196) (256, 110) (256,) [100, 4196]
(256, 10)
(256, 4196) (256, 110) (256,) [100, 4196]
(256, 10)
(256, 4196) (256, 110) (256,) [100, 4196]
(256, 10)
(256, 4196) (256, 110) (256,) [100, 4196]
(256, 10)
(256, 4196) (256, 110) (256,) [100, 4196]
(256, 10)
(256, 4196) (256, 110) (256,) [100, 4196]
(256, 10)
(256, 4196) (256, 110) (256,) [100, 4196]
(256, 10)
(256, 4196) (256, 110) (256,) [100, 4196]
(256, 10)
(256, 4196) (256, 110) (256,) [100, 4196]
(256, 10)
(256, 4196) (256, 110) (256,) [100, 4196]
(256, 10)
(256, 4196) (256, 110) (256,) [100, 4196]
(256, 10)
(256, 4196) (256, 110) (256,) [100, 4196]
(256, 10)
(256, 4196) (256, 110) (256,) [100, 4196]
(256, 10)
(256, 4196) (256, 110) (256,) [100, 4196]
(256, 10)
(256, 4196) (256, 110) (256,) [100, 4196]
(256, 10)
(256, 4196) (256, 110) (256,) [100, 4196]
(256, 10)
(256, 4196) (256, 110) (256,) [100, 4196]
(256, 10)
(2

## Collider Results

In [None]:
# Run the collider simulation once for every (repeat index, second-view width)
# pair, dispatched through joblib with n_jobs=-1.
# Note that n_features_2_list[1:] skips the first entry of the width list.
# NOTE(review): _run_parallel_sim is not defined in this part of the notebook;
# the positional argument order below is assumed to match its signature.
_results_collider = Parallel(n_jobs=-1)(
    delayed(_run_parallel_sim)(
        idx_,
        n_samples,
        n_features,
        class_probs,
        seed,
        n_features_2_,
        test_size,
        max_fpr,
        "collider",
    )
    for (idx_, n_features_2_) in product(range(n_repeats), n_features_2_list[1:])
)

In [90]:
# Collider experiment: for each repeat, draw one dataset from make_collider,
# then for every width in n_features_2_list append that many standard-normal
# columns and record p-values / partial-AUC for a multi-view honest forest,
# a kNN classifier and a logistic regression. One row of `results` is
# appended per (repeat, width) pair.
results = defaultdict(list)

for idx in range(n_repeats):
    # NOTE: n_features_begin is assigned but never read in this cell
    n_features_begin = 0
    signal_X, y = make_collider(
        n_samples=n_samples,
        n_features=n_features,
        noise_dims=noise_dims,
        seed=seed + idx,
    )
    # end index of each feature set; the second entry is filled in below
    n_features_ends = [n_features + noise_dims, None]

    for n_features_2_ in n_features_2_list:
        # noise columns come from the notebook-level `rng`, so they depend on
        # how many draws were made before this point
        _X = np.hstack((signal_X, rng.standard_normal(size=(n_samples, n_features_2_))))
        X = _X.copy()
        n_features_ends[1] = X.shape[1]

        print(X.shape, y.shape)
        print(n_features_ends)

        # feature_set_ends receives [end of first block, total number of columns]
        est = FeatureImportanceForestClassifier(
            estimator=HonestForestClassifier(
                n_estimators=n_estimators,
                max_features=max_features,
                tree_estimator=MultiViewDecisionTreeClassifier(
                    feature_set_ends=n_features_ends,
                    apply_max_features_per_feature_set=True,
                ),
                random_state=seed,
                honest_fraction=0.5,
                n_jobs=n_jobs,
            ),
            random_state=seed,
            test_size=test_size,
            sample_dataset_per_tree=False,
        )

        # compute the statistic
        # also compute the pvalue when shuffling X1
        # (X1 = the first n_features_ends[0] columns of X)
        covariate_index = np.arange(0, n_features_ends[0])
        stat, pvalue = est.test(
            X, y, covariate_index=covariate_index, metric="auc", max_fpr=max_fpr
        )
        results["mvrf_pvalue_x1"].append(pvalue)
        # get the actual partial-AUC of the unpermuted dataset for the forest
        # (this replaces the `stat` returned by est.test above)
        stat = est.observe_stat_
        results["mvrf_pauc"].append(stat)

        # now compute the same relevant quantities using kNN
        # (neighbourhood size is floor(sqrt(total number of columns)) + 1)
        neigh = KNeighborsClassifier(n_neighbors=int(np.sqrt(X.shape[1])) + 1)
        # compute pvalue for kNN based job
        permest = PermutationTest(
            neigh, n_repeats=100, random_state=seed + idx + n_features_2_
        )
        pauc, pvalue = permest.test(
            X, y, covariate_index=covariate_index, metric="auc", max_fpr=max_fpr
        )

        results["knn_pauc"].append(pauc)
        results["knn_pvalue_x1"].append(pvalue)

        # also compute the relevant quantities using logistic regression
        lr = LogisticRegression(random_state=seed + idx + n_features_2_)
        permest = PermutationTest(
            lr, n_repeats=100, random_state=seed + idx + n_features_2_
        )
        pauc, pvalue = permest.test(
            X, y, covariate_index=covariate_index, metric="auc", max_fpr=max_fpr
        )
        results["lr_pvalue_x1"].append(pvalue)
        results["lr_pauc"].append(pauc)

        # now compute the pvalue when shuffling X2
        # (X2 = columns from n_features_ends[0] to the end of X; the same `est`
        # object is reused for this second test)
        covariate_index = np.arange(n_features_ends[0], n_features_ends[1])
        _, pvalue = est.test(
            X, y, covariate_index=covariate_index, metric="auc", max_fpr=max_fpr
        )
        results["mvrf_pvalue_x2"].append(pvalue)

        # now compute the same relevant quantities using kNN
        # compute pvalue for kNN based job
        permest = PermutationTest(
            neigh, n_repeats=100, random_state=seed + idx + n_features_2_
        )
        _, pvalue = permest.test(
            X, y, covariate_index=covariate_index, metric="auc", max_fpr=max_fpr
        )
        results["knn_pvalue_x2"].append(pvalue)

        # also compute the relevant quantities using logistic regression
        permest = PermutationTest(
            lr, n_repeats=100, random_state=seed + idx + n_features_2_
        )
        _, pvalue = permest.test(
            X, y, covariate_index=covariate_index, metric="auc", max_fpr=max_fpr
        )
        results["lr_pvalue_x2"].append(pvalue)

        # bookkeeping columns identifying this run
        results["n_samples"].append(n_samples)
        results["n_features_2"].append(n_features_2_)
        results["noise_dims"].append(noise_dims)

(500, 200) (500,)
[100, 200]
(500, 1000) (500,)
[100, 1000]
(500, 2000) (500,)
[100, 2000]
(500, 3000) (500,)
[100, 3000]
(500, 4000) (500,)
[100, 4000]
(500, 5000) (500,)
[100, 5000]
(500, 6000) (500,)
[100, 6000]
(500, 7000) (500,)
[100, 7000]
(500, 8000) (500,)
[100, 8000]
(500, 9000) (500,)
[100, 9000]


KeyboardInterrupt: 

In [None]:
# Tabulate the collected collider metrics and persist them as CSV.
df = pd.DataFrame(data=results)
df.to_csv(path_or_buf="./cv_comight_mv_vs_knn_vs_lr_collider_model.csv")

# quick look at what was written: column names, then (rows, columns)
print(df.columns, df.shape, sep="\n")

## Mediator Results

In [17]:
# Settings for the mediator experiment.
n_features, noise_dims, n_views = 5, 90, 2

In [19]:
# Mediator experiment: for each repeat, draw one dataset from make_mediator,
# then for every width in n_features_2_list append that many standard-normal
# columns and record p-values / partial-AUC for a multi-view honest forest,
# a kNN classifier and a logistic regression. One row of `results` is
# appended per (repeat, width) pair.
# NOTE(review): the make_mediator definition at the top of this notebook takes
# a required `n_features_1` argument that is not passed here - confirm which
# version of the function this cell was run against.
results = defaultdict(list)

for idx in range(n_repeats):
    # NOTE: n_features_begin is assigned but never read in this cell
    n_features_begin = 0
    signal_X, y = make_mediator(
        n_samples=n_samples,
        n_features=n_features,
        noise_dims=noise_dims,
        n_views=n_views,
        joint_rank=5,
        seed=seed + idx,
    )
    # end index of each feature set; the second entry is filled in below
    n_features_ends = [n_views * n_features + noise_dims, None]

    for n_features_2_ in n_features_2_list:
        # noise columns come from the notebook-level `rng`, so they depend on
        # how many draws were made before this point
        _X = np.hstack((signal_X, rng.standard_normal(size=(n_samples, n_features_2_))))
        X = _X.copy()
        n_features_ends[1] = X.shape[1]

        print(X.shape, y.shape)
        print(n_features_ends)

        # feature_set_ends receives [end of first block, total number of columns]
        est = FeatureImportanceForestClassifier(
            estimator=HonestForestClassifier(
                n_estimators=n_estimators,
                max_features=max_features,
                tree_estimator=MultiViewDecisionTreeClassifier(
                    feature_set_ends=n_features_ends,
                    apply_max_features_per_feature_set=True,
                ),
                random_state=seed,
                honest_fraction=0.5,
                n_jobs=n_jobs,
            ),
            random_state=seed,
            test_size=test_size,
            sample_dataset_per_tree=False,
        )

        # compute the statistic
        # also compute the pvalue when shuffling X1
        # (X1 = the first n_features_ends[0] columns of X)
        covariate_index = np.arange(0, n_features_ends[0])
        stat, pvalue = est.test(
            X, y, covariate_index=covariate_index, metric="auc", max_fpr=max_fpr
        )
        results["mvrf_pvalue_x1"].append(pvalue)
        # get the actual partial-AUC of the unpermuted dataset for the forest
        # (this replaces the `stat` returned by est.test above)
        stat = est.observe_stat_
        results["mvrf_pauc"].append(stat)

        # now compute the same relevant quantities using kNN
        # (neighbourhood size is floor(sqrt(total number of columns)) + 1)
        neigh = KNeighborsClassifier(n_neighbors=int(np.sqrt(X.shape[1])) + 1)
        # compute pvalue for kNN based job
        permest = PermutationTest(
            neigh, n_repeats=100, random_state=seed + idx + n_features_2_
        )
        pauc, pvalue = permest.test(
            X, y, covariate_index=covariate_index, metric="auc", max_fpr=max_fpr
        )

        results["knn_pauc"].append(pauc)
        results["knn_pvalue_x1"].append(pvalue)

        # also compute the relevant quantities using logistic regression
        lr = LogisticRegression(random_state=seed + idx + n_features_2_)
        permest = PermutationTest(
            lr, n_repeats=100, random_state=seed + idx + n_features_2_
        )
        pauc, pvalue = permest.test(
            X, y, covariate_index=covariate_index, metric="auc", max_fpr=max_fpr
        )
        results["lr_pvalue_x1"].append(pvalue)
        results["lr_pauc"].append(pauc)

        # now compute the pvalue when shuffling X2
        # (X2 = columns from n_features_ends[0] to the end of X; the same `est`
        # object is reused for this second test)
        covariate_index = np.arange(n_features_ends[0], n_features_ends[1])
        _, pvalue = est.test(
            X, y, covariate_index=covariate_index, metric="auc", max_fpr=max_fpr
        )
        results["mvrf_pvalue_x2"].append(pvalue)

        # now compute the same relevant quantities using kNN
        # compute pvalue for kNN based job
        permest = PermutationTest(
            neigh, n_repeats=100, random_state=seed + idx + n_features_2_
        )
        _, pvalue = permest.test(
            X, y, covariate_index=covariate_index, metric="auc", max_fpr=max_fpr
        )
        results["knn_pvalue_x2"].append(pvalue)

        # also compute the relevant quantities using logistic regression
        permest = PermutationTest(
            lr, n_repeats=100, random_state=seed + idx + n_features_2_
        )
        _, pvalue = permest.test(
            X, y, covariate_index=covariate_index, metric="auc", max_fpr=max_fpr
        )
        results["lr_pvalue_x2"].append(pvalue)

        # bookkeeping columns identifying this run
        results["n_samples"].append(n_samples)
        results["n_features_2"].append(n_features_2_)
        results["noise_dims"].append(noise_dims)

(500, 200) (500,)
[100, 200]


KeyboardInterrupt: 

In [None]:
# Tabulate the collected mediator metrics and persist them as CSV.
df = pd.DataFrame(data=results)
df.to_csv(path_or_buf="./cv_comight_mv_vs_knn_vs_lr_mediator_model.csv")

# quick look at what was written: column names, then (rows, columns)
print(df.columns, df.shape, sep="\n")

## Direct/Indirect Effect Results

In [21]:
# Settings for the direct/indirect-effect experiment.
n_features, noise_dims = 10, 90

In [22]:
# Direct/indirect-effect experiment: for each repeat, draw one dataset from
# make_direct_indirect_effects, then for every width in n_features_2_list
# append that many standard-normal columns and record p-values / partial-AUC
# for a multi-view honest forest, a kNN classifier and a logistic regression.
# One row of `results` is appended per (repeat, width) pair.
# NOTE(review): the output recorded for this cell shows y with shape (500, 1)
# rather than (500,) - confirm the downstream estimators accept that.
results = defaultdict(list)

for idx in range(n_repeats):
    # NOTE: n_features_begin is assigned but never read in this cell
    n_features_begin = 0
    signal_X, y = make_direct_indirect_effects(
        n_samples=n_samples,
        n_features=n_features,
        noise_dims=noise_dims,
        class_probs=class_probs,
        seed=seed + idx,
    )
    # end index of each feature set; the second entry is filled in below
    n_features_ends = [n_features + noise_dims, None]

    for n_features_2_ in n_features_2_list:
        # noise columns come from the notebook-level `rng`, so they depend on
        # how many draws were made before this point
        _X = np.hstack((signal_X, rng.standard_normal(size=(n_samples, n_features_2_))))
        X = _X.copy()
        n_features_ends[1] = X.shape[1]

        print(X.shape, y.shape)
        print(n_features_ends)

        # feature_set_ends receives [end of first block, total number of columns]
        est = FeatureImportanceForestClassifier(
            estimator=HonestForestClassifier(
                n_estimators=n_estimators,
                max_features=max_features,
                tree_estimator=MultiViewDecisionTreeClassifier(
                    feature_set_ends=n_features_ends,
                    apply_max_features_per_feature_set=True,
                ),
                random_state=seed,
                honest_fraction=0.5,
                n_jobs=n_jobs,
            ),
            random_state=seed,
            test_size=test_size,
            sample_dataset_per_tree=False,
        )

        # compute the statistic
        # also compute the pvalue when shuffling X1
        # (X1 = the first n_features_ends[0] columns of X)
        covariate_index = np.arange(0, n_features_ends[0])
        stat, pvalue = est.test(
            X, y, covariate_index=covariate_index, metric="auc", max_fpr=max_fpr
        )
        results["mvrf_pvalue_x1"].append(pvalue)
        # get the actual partial-AUC of the unpermuted dataset for the forest
        # (this replaces the `stat` returned by est.test above)
        stat = est.observe_stat_
        results["mvrf_pauc"].append(stat)

        # now compute the same relevant quantities using kNN
        # (neighbourhood size is floor(sqrt(total number of columns)) + 1)
        neigh = KNeighborsClassifier(n_neighbors=int(np.sqrt(X.shape[1])) + 1)
        # compute pvalue for kNN based job
        permest = PermutationTest(
            neigh, n_repeats=100, random_state=seed + idx + n_features_2_
        )
        pauc, pvalue = permest.test(
            X, y, covariate_index=covariate_index, metric="auc", max_fpr=max_fpr
        )

        results["knn_pauc"].append(pauc)
        results["knn_pvalue_x1"].append(pvalue)

        # also compute the relevant quantities using logistic regression
        lr = LogisticRegression(random_state=seed + idx + n_features_2_)
        permest = PermutationTest(
            lr, n_repeats=100, random_state=seed + idx + n_features_2_
        )
        pauc, pvalue = permest.test(
            X, y, covariate_index=covariate_index, metric="auc", max_fpr=max_fpr
        )
        results["lr_pvalue_x1"].append(pvalue)
        results["lr_pauc"].append(pauc)

        # now compute the pvalue when shuffling X2
        # (X2 = columns from n_features_ends[0] to the end of X; the same `est`
        # object is reused for this second test)
        covariate_index = np.arange(n_features_ends[0], n_features_ends[1])
        _, pvalue = est.test(
            X, y, covariate_index=covariate_index, metric="auc", max_fpr=max_fpr
        )
        results["mvrf_pvalue_x2"].append(pvalue)

        # now compute the same relevant quantities using kNN
        # compute pvalue for kNN based job
        permest = PermutationTest(
            neigh, n_repeats=100, random_state=seed + idx + n_features_2_
        )
        _, pvalue = permest.test(
            X, y, covariate_index=covariate_index, metric="auc", max_fpr=max_fpr
        )
        results["knn_pvalue_x2"].append(pvalue)

        # also compute the relevant quantities using logistic regression
        permest = PermutationTest(
            lr, n_repeats=100, random_state=seed + idx + n_features_2_
        )
        _, pvalue = permest.test(
            X, y, covariate_index=covariate_index, metric="auc", max_fpr=max_fpr
        )
        results["lr_pvalue_x2"].append(pvalue)

        # bookkeeping columns identifying this run
        results["n_samples"].append(n_samples)
        results["n_features_2"].append(n_features_2_)
        results["noise_dims"].append(noise_dims)

(250, 200)
(250, 200)
[(250, 100), (250, 100)]
(500, 200) (500, 1)
[100, 200]


KeyboardInterrupt: 

In [None]:
# Tabulate the collected direct/indirect-effect metrics and persist them as CSV.
df = pd.DataFrame(data=results)
df.to_csv(path_or_buf="./cv_comight_mv_vs_knn_vs_lr_directindirecteffects_model.csv")

# quick look at what was written: column names, then (rows, columns)
print(df.columns, df.shape, sep="\n")

## Independent View Results

In [30]:
# Settings for the independent-view experiment.
n_samples = 500
n_features, noise_dims = 20, 80
max_features = 0.3
test_size = 0.2
max_fpr = 0.1
n_jobs = -1

# Widths of the appended noise block: 100 first, then ten evenly spaced
# integer values from 900 up to 9900.
n_features_2_list = np.concatenate(
    ([100], np.linspace(900, 10_000 - 100, 10, dtype=int))
)
print(n_features_2_list)

[ 100  900 1900 2900 3900 4900 5900 6900 7900 8900 9900]


In [31]:
# Independent-view experiment: for each repeat, draw one dataset from
# make_classification (n_features informative columns out of
# n_features + noise_dims), then for every width in n_features_2_list append
# that many standard-normal columns and record p-values / partial-AUC for a
# multi-view honest forest, a kNN classifier and a logistic regression.
# One row of `results` is appended per (repeat, width) pair.
results = defaultdict(list)

for idx in range(n_repeats):
    # NOTE: n_features_begin is assigned but never read in this cell
    n_features_begin = 0
    signal_X, y = make_classification(
        n_samples=n_samples,
        n_features=n_features + noise_dims,
        n_informative=n_features,
        random_state=seed + idx,
    )
    # end index of each feature set; the second entry is filled in below
    n_features_ends = [n_features + noise_dims, None]

    for n_features_2_ in n_features_2_list:
        # noise columns come from the notebook-level `rng`, so they depend on
        # how many draws were made before this point
        _X = np.hstack((signal_X, rng.standard_normal(size=(n_samples, n_features_2_))))
        X = _X.copy()
        n_features_ends[1] = X.shape[1]

        print(X.shape, y.shape)
        print(n_features_ends)

        # feature_set_ends receives [end of first block, total number of columns]
        est = FeatureImportanceForestClassifier(
            estimator=HonestForestClassifier(
                n_estimators=n_estimators,
                max_features=max_features,
                tree_estimator=MultiViewDecisionTreeClassifier(
                    feature_set_ends=n_features_ends,
                    apply_max_features_per_feature_set=True,
                ),
                random_state=seed,
                honest_fraction=0.5,
                n_jobs=n_jobs,
            ),
            random_state=seed,
            test_size=test_size,
            sample_dataset_per_tree=False,
        )

        # compute the statistic
        # also compute the pvalue when shuffling X1
        # (X1 = the first n_features_ends[0] columns of X)
        covariate_index = np.arange(0, n_features_ends[0])
        stat, pvalue = est.test(
            X, y, covariate_index=covariate_index, metric="auc", max_fpr=max_fpr
        )
        results["mvrf_pvalue_x1"].append(pvalue)
        # get the actual partial-AUC of the unpermuted dataset for the forest
        # (this replaces the `stat` returned by est.test above)
        stat = est.observe_stat_
        results["mvrf_pauc"].append(stat)

        # now compute the same relevant quantities using kNN
        # (neighbourhood size is floor(sqrt(total number of columns)) + 1)
        neigh = KNeighborsClassifier(n_neighbors=int(np.sqrt(X.shape[1])) + 1)
        # compute pvalue for kNN based job
        permest = PermutationTest(
            neigh, n_repeats=100, random_state=seed + idx + n_features_2_
        )
        pauc, pvalue = permest.test(
            X, y, covariate_index=covariate_index, metric="auc", max_fpr=max_fpr
        )

        results["knn_pauc"].append(pauc)
        results["knn_pvalue_x1"].append(pvalue)

        # also compute the relevant quantities using logistic regression
        lr = LogisticRegression(random_state=seed + idx + n_features_2_)
        permest = PermutationTest(
            lr, n_repeats=100, random_state=seed + idx + n_features_2_
        )
        pauc, pvalue = permest.test(
            X, y, covariate_index=covariate_index, metric="auc", max_fpr=max_fpr
        )
        results["lr_pvalue_x1"].append(pvalue)
        results["lr_pauc"].append(pauc)

        # now compute the pvalue when shuffling X2
        # (X2 = columns from n_features_ends[0] to the end of X; the same `est`
        # object is reused for this second test)
        covariate_index = np.arange(n_features_ends[0], n_features_ends[1])
        _, pvalue = est.test(
            X, y, covariate_index=covariate_index, metric="auc", max_fpr=max_fpr
        )
        results["mvrf_pvalue_x2"].append(pvalue)

        # now compute the same relevant quantities using kNN
        # compute pvalue for kNN based job
        permest = PermutationTest(
            neigh, n_repeats=100, random_state=seed + idx + n_features_2_
        )
        _, pvalue = permest.test(
            X, y, covariate_index=covariate_index, metric="auc", max_fpr=max_fpr
        )
        results["knn_pvalue_x2"].append(pvalue)

        # also compute the relevant quantities using logistic regression
        permest = PermutationTest(
            lr, n_repeats=100, random_state=seed + idx + n_features_2_
        )
        _, pvalue = permest.test(
            X, y, covariate_index=covariate_index, metric="auc", max_fpr=max_fpr
        )
        results["lr_pvalue_x2"].append(pvalue)

        # bookkeeping columns identifying this run
        results["n_samples"].append(n_samples)
        results["n_features_2"].append(n_features_2_)
        results["noise_dims"].append(noise_dims)

(500, 200) (500,)
[100, 200]


KeyboardInterrupt: 

In [None]:
# Tabulate the collected independent-view metrics and persist them as CSV.
df = pd.DataFrame(data=results)
df.to_csv(path_or_buf="./cv_comight_mv_vs_knn_vs_lr_independentview_model.csv")

# quick look at what was written: column names, then (rows, columns)
print(df.columns, df.shape, sep="\n")

In [1]:
# Reload the confounder-model results; the first CSV column is used as the
# index and the first row as the header.
df = pd.read_csv(
    filepath_or_buffer="./cv_comight_mv_vs_knn_vs_lr_confounder_model.csv",
    header=0,
    index_col=0,
)

# preview the first five rows
display(df.head(n=5))

NameError: name 'pd' is not defined