In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.special import expit
from sklearn.datasets import (
    make_blobs,
    make_classification,
    make_sparse_spd_matrix,
    make_spd_matrix,
)
from sklearn.tree import DecisionTreeClassifier as skDecisionTreeClassifier

from sktree import HonestForestClassifier, RandomForestClassifier, RandomForestRegressor
from sktree.datasets.multiview import make_gaussian_mixture, make_joint_factor_model
from sktree.stats import (
    FeatureImportanceForestClassifier,
    FeatureImportanceForestRegressor,
    PermutationForestRegressor,
)
from sktree.tree import DecisionTreeClassifier, MultiViewDecisionTreeClassifier

# Single source of randomness for the notebook: `seed` is handed to the
# estimators and the data simulation as `random_state`, and `rng` is a NumPy
# Generator initialised from the same value.
seed = 12345
rng = np.random.default_rng(seed=seed)

In [3]:
# Marker cell: prints a fixed string (presumably to confirm that the import
# cell above completed — TODO confirm it is still needed).
print("done")

done


In [8]:
# --- Forest hyperparameters (forwarded to HonestForestClassifier) ---
n_estimators = 500
max_features = "sqrt"
n_jobs = -1

# --- Hypothesis-test settings ---
# `test_size` is forwarded to FeatureImportanceForestClassifier.
test_size = 0.2
# `max_fpr` is currently only referenced from commented-out `.test(...)` arguments.
max_fpr = 0.1

# Data-generating model with a copy of the feature-set

Here we set $X = (X_1, X_1)$: the second feature-set is an exact copy of the first, and $X_1$ is a feature-set that may be informative of $Y$.

In [9]:
# Sample size and number of noise dimensions for the simulation.
n_samples = 200
noise_dims = 80

# Feature counts left over once the noise dimensions are removed from a
# total of 100 and of 10000 columns, respectively.
n_features, n_features_2 = (total - noise_dims for total in (100, 10000))

# Alternative setting kept for reference (disabled): max_features = 0.3

n_repeats = 5

In [10]:
# Settings for the single simulated view. `shuffle=False` is passed so the
# generated columns are not shuffled.
simulation_kwargs = dict(
    n_samples=n_samples,
    n_features=100,
    n_informative=10,
    n_redundant=5,
    n_repeated=0,
    n_classes=2,
    class_sep=2.0,
    flip_y=0.05,
    shuffle=False,
    random_state=seed,
)
X_view, y = make_classification(**simulation_kwargs)

# Report the provisional split point, the shape of one view, and the
# fraction of samples labelled 1.
n_features_ends = [20, X_view.shape[1]]
print(n_features_ends)
print(X_view.shape, y.sum() / n_samples)

# Build X = (X1, X1): place the view next to an exact copy of itself.
X = np.hstack((X_view, X_view))
print(X.shape)

# The two feature-sets are the two halves of the stacked matrix; this
# replaces the provisional value printed above.
total_columns = X.shape[1]
n_features_ends = [total_columns // 2, total_columns]
print(n_features_ends)

[20, 100]
(200, 100) 0.505
(200, 200)
[100, 200]


In [11]:
def _make_importance_test(tree_estimator):
    """Wrap ``tree_estimator`` in an honest forest and a feature-importance test.

    All hyperparameters other than the base tree are shared by the estimators
    compared in this cell, so they are set once here from the notebook-level
    configuration (``n_estimators``, ``max_features``, ``n_jobs``,
    ``test_size``, ``seed``).

    Parameters
    ----------
    tree_estimator : estimator instance
        The base tree passed to ``HonestForestClassifier``.

    Returns
    -------
    FeatureImportanceForestClassifier
        An unfitted test object wrapping the honest forest.
    """
    honest_forest = HonestForestClassifier(
        tree_estimator=tree_estimator,
        n_estimators=n_estimators,
        max_features=max_features,
        honest_fraction=0.5,
        random_state=seed,
        n_jobs=n_jobs,
    )
    return FeatureImportanceForestClassifier(
        estimator=honest_forest,
        test_size=test_size,
        permute_per_tree=False,
        sample_dataset_per_tree=False,
        random_state=seed,
    )


# Multi-view trees with apply_max_features_per_feature_set=True.
est = _make_importance_test(
    MultiViewDecisionTreeClassifier(
        feature_set_ends=n_features_ends,
        apply_max_features_per_feature_set=True,
    )
)

# Multi-view trees with apply_max_features_per_feature_set=False.
est_mv_old = _make_importance_test(
    MultiViewDecisionTreeClassifier(
        feature_set_ends=n_features_ends,
        apply_max_features_per_feature_set=False,
    )
)

# Single-view baseline on sktree's DecisionTreeClassifier
# (disabled alternative: skDecisionTreeClassifier()).
est_rf = _make_importance_test(DecisionTreeClassifier())

# Column indices of the first feature-set: these are the columns permuted
# when computing the statistic and p-value for the multi-view estimators.
covariate_index = np.arange(n_features_ends[0])

# Arguments common to every `.test(...)` call below.
# (`max_fpr=max_fpr` is left disabled.)
shared_test_kwargs = {"metric": "mi", "n_repeats": 1000}

stat, pvalue = est.test(
    X, y, covariate_index=covariate_index, **shared_test_kwargs
)
stat_old, pvalue_old = est_mv_old.test(
    X, y, covariate_index=covariate_index, **shared_test_kwargs
)
# NOTE(review): unlike the two calls above, no `covariate_index` is passed
# here — confirm this difference is intended before comparing the results.
stat_rf, pvalue_rf = est_rf.test(X, y, **shared_test_kwargs)

# Printed order is (est, est_rf, est_mv_old) on both lines.
print(stat, stat_rf, stat_old)
print(pvalue, pvalue_rf, pvalue_old)

0.10085299540085102 0.08304276713486813 0.13011074840498627
0.000999000999000999 0.000999000999000999 0.005994005994005994
