In [2]:
import pickle
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sktree.ensemble import HonestForestClassifier
from sktree.stats import build_hyppo_oob_forest
from sklearn.metrics import (
    balanced_accuracy_score,
    mean_absolute_error,
    mean_squared_error,
    roc_auc_score,
    roc_curve,
    accuracy_score
)
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score, StratifiedKFold, cross_val_predict

In [10]:
from scipy.stats import multivariate_normal, entropy
from scipy.integrate import nquad

def _autoregressive_cov(n_dim, rho):
    # Create a meshgrid of indices
    i, j = np.meshgrid(np.arange(1, n_dim + 1), np.arange(1, n_dim + 1), indexing="ij")

    # Calculate the covariance matrix using the corrected formula
    cov_matrix = rho ** np.abs(i - j)

    return cov_matrix


def _moving_avg_cov(n_dim, rho):
    # Create a meshgrid of indices
    i, j = np.meshgrid(np.arange(1, n_dim + 1), np.arange(1, n_dim + 1), indexing="ij")

    # Calculate the covariance matrix using the corrected formula
    cov_matrix = rho ** np.abs(i - j)

    # Apply the banding condition
    cov_matrix[abs(i - j) > 1] = 0
    return cov_matrix


def make_trunk_classification(
    n_samples,
    n_dim=4096,
    n_informative=1,
    simulation: str = "trunk",
    mu_0: float = 0,
    mu_1: float = 1,
    rho: int = 0,
    band_type: str = "ma",
    return_params: bool = False,
    mix: float = 0.75,
    seed=None,
):
    if n_dim < n_informative:
        raise ValueError(
            f"Number of informative dimensions {n_informative} must be less than number "
            f"of dimensions, {n_dim}"
        )
    rng = np.random.default_rng(seed=seed)

    mu_0 = np.array([mu_0 / np.sqrt(i) for i in range(1, n_informative + 1)])
    mu_1 = np.array([mu_1 / np.sqrt(i) for i in range(1, n_informative + 1)])

    if rho != 0:
        if band_type == "ma":
            cov = _moving_avg_cov(n_informative, rho)
        elif band_type == "ar":
            cov = _autoregressive_cov(n_informative, rho)
        else:
            raise ValueError(f'Band type {band_type} must be one of "ma", or "ar".')
    else:
        cov = np.identity(n_informative)

    if mix < 0 or mix > 1:
        raise ValueError("Mix must be between 0 and 1.")

    # speed up computations for large multivariate normal matrix with SVD approximation
    if n_informative > 1000:
        method = "cholesky"
    else:
        method = "svd"

    if simulation == "trunk":
        X = np.vstack(
            (
                rng.multivariate_normal(mu_0, cov, n_samples // 2, method=method),
                rng.multivariate_normal(mu_1, cov, n_samples // 2, method=method),
            )
        )
    elif simulation == "trunk_overlap":
        mixture_idx = rng.choice(
            2, n_samples // 2, replace=True, shuffle=True, p=[mix, 1 - mix]
        )
        norm_params = [[mu_0, cov], [mu_1, cov]]
        X_mixture = np.fromiter(
            (
                rng.multivariate_normal(*(norm_params[i]), size=1, method=method)
                for i in mixture_idx
            ),
            dtype=np.dtype((float, n_informative)),
        )
        X_mixture_2 = np.fromiter(
            (
                rng.multivariate_normal(*(norm_params[i]), size=1, method=method)
                for i in mixture_idx
            ),
            dtype=np.dtype((float, n_informative)),
        )

        X = np.vstack(
            (
                X_mixture_2.reshape(n_samples // 2, n_informative),
                X_mixture.reshape(n_samples // 2, n_informative),
            )
        )
    elif simulation == "trunk_mix":
        mixture_idx = rng.choice(
            2, n_samples // 2, replace=True, shuffle=True, p=[mix, 1 - mix]
        )
        norm_params = [[mu_0, cov], [mu_1, cov]]
        X_mixture = np.fromiter(
            (
                rng.multivariate_normal(*(norm_params[i]), size=1, method=method)
                for i in mixture_idx
            ),
            dtype=np.dtype((float, n_informative)),
        )

        X = np.vstack(
            (
                rng.multivariate_normal(
                    np.zeros(n_informative), cov, n_samples // 2, method=method
                ),
                X_mixture.reshape(n_samples // 2, n_informative),
            )
        )
    else:
        raise ValueError(f"Simulation must be: trunk, trunk_overlap, trunk_mix")

    if n_dim > n_informative:
        X = np.hstack(
            (X, rng.normal(loc=0, scale=1, size=(X.shape[0], n_dim - n_informative)))
        )

    y = np.concatenate((np.zeros(n_samples // 2), np.ones(n_samples // 2)))

    if return_params:
        returns = [X, y]
        if simulation == "trunk":
            returns += [[mu_0, mu_1], [cov, cov]]
        elif simulation == "trunk-overlap":
            returns += [[np.zeros(n_informative), np.zeros(n_informative)], [cov, cov]]
        elif simulation == "trunk-mix":
            returns += [*list(zip(*norm_params)), X_mixture]
        return returns
    return X, y

In [4]:
def Calculate_SA98(y_true, y_pred_proba, max_fpr=0.02) -> float:
    if y_true.squeeze().ndim != 1:
        raise ValueError(f"y_true must be 1d, not {y_true.shape}")
    if 0 in y_true or -1 in y_true:
        fpr, tpr, thresholds = roc_curve(
            y_true, y_pred_proba[:, 1], pos_label=1, drop_intermediate=False
        )
    else:
        fpr, tpr, thresholds = roc_curve(
            y_true, y_pred_proba[:, 1], pos_label=2, drop_intermediate=False
        )
    s98 = max([tpr for (fpr, tpr) in zip(fpr, tpr) if fpr <= max_fpr])
    return s98


def Calculate_MI(y_true, y_pred_proba):
    H_YX = np.mean(entropy(y_pred_proba, base=np.exp(1), axis=1))
    # empirical count of each class (n_classes)
    _, counts = np.unique(y_true, return_counts=True)
    H_Y = entropy(counts, base=np.exp(1))
    return H_Y - H_YX

In [6]:
from random import seed


SAVE_PATH  = './output/S@98_n'

def POS_RF(dim,samp,ite):
    X, y = make_trunk_classification(
            n_samples=samp, n_dim=dim, n_informative=1, seed=ite,
        )
    model = RandomForestClassifier(n_estimators = 1200,
                                            max_samples = 1,
                                            max_features = 0.3,
                                            random_state=ite)
    cv = StratifiedKFold(n_splits=5, shuffle=True)
    k=0
    for train_ix, test_ix in cv.split(X,y):
        X_train, X_test = X[train_ix, :], X[test_ix, :]
        y_train, y_test = y[train_ix], y[test_ix]
        model.fit(X_train,y_train)
        observe_proba = model.predict_proba(X_test)
        # save the observed probability and the true label
        result = np.hstack((y_test.astype('int').reshape(-1, 1),observe_proba))
        print(result.shape)
        np.savetxt(SAVE_PATH + "/POS_RF_dim{}_samp{}_reps{}_cv{}.csv".format(dim,samp,ite,k), result, delimiter=",")
        k +=1
    
    

In [7]:
from sklearn.ensemble import RandomForestClassifier
from joblib import Parallel, delayed
SAMPLE_SIZES = [256, 512, 1024, 2048, 4096]
FIXED_DIM = 4096
N_ITR = 10
SAVE_PATH  = './output/S@98_n'
Parallel(n_jobs=516)(delayed(POS_RF)(FIXED_DIM,j,i)for i in range(N_ITR) for j in SAMPLE_SIZES )


(52, 3)
(103, 3)
(52, 3)
(52, 3)
(103, 3)
(52, 3)
(52, 3)
(52, 3)
(410, 3)
(52, 3)
(103, 3)
(205, 3)
(52, 3)
(103, 3)
(52, 3)
(103, 3)
(52, 3)
(820, 3)
(205, 3)
(103, 3)
(205, 3)
(205, 3)
(205, 3)
(103, 3)
(410, 3)
(103, 3)
(205, 3)
(205, 3)
(205, 3)
(103, 3)
(205, 3)
(205, 3)
(103, 3)
(410, 3)
(410, 3)
(410, 3)
(410, 3)
(410, 3)
(103, 3)
(51, 3)
(410, 3)
(51, 3)
(410, 3)
(51, 3)
(820, 3)
(410, 3)
(103, 3)
(51, 3)
(820, 3)
(820, 3)
(820, 3)
(820, 3)
(51, 3)
(820, 3)
(103, 3)
(51, 3)
(820, 3)
(51, 3)
(51, 3)
(820, 3)
(820, 3)
(205, 3)
(205, 3)
(51, 3)
(103, 3)
(103, 3)
(51, 3)
(205, 3)
(103, 3)
(410, 3)
(205, 3)
(205, 3)
(205, 3)
(103, 3)
(205, 3)
(103, 3)
(103, 3)
(410, 3)
(205, 3)
(205, 3)
(103, 3)
(819, 3)
(51, 3)
(205, 3)
(51, 3)
(410, 3)
(410, 3)
(51, 3)
(410, 3)
(410, 3)
(102, 3)
(51, 3)
(410, 3)
(410, 3)
(410, 3)
(102, 3)
(410, 3)
(51, 3)
(819, 3)
(102, 3)
(51, 3)
(51, 3)
(51, 3)
(102, 3)
(51, 3)
(102, 3)
(102, 3)
(205, 3)
(819, 3)
(51, 3)
(819, 3)
(205, 3)
(205, 3)
(102, 3)
(819

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [36]:
DIMENSIONS = [256, 512, 1024, 2048, 4096]
FIXED_N_SAMPLE = 4096
N_ITR = 10
SAVE_PATH  = './output/S@98_dim'
Parallel(n_jobs=516)(delayed(POS_RF)(j,FIXED_N_SAMPLE,i) for i in range(N_ITR) for j in DIMENSIONS)

(820, 3)
(820, 3)
(820, 3)
(820, 3)
(820, 3)
(820, 3)
(820, 3)
(820, 3)
(820, 3)
(820, 3)
(820, 3)
(820, 3)
(820, 3)
(820, 3)
(820, 3)
(820, 3)
(820, 3)
(820, 3)
(820, 3)
(820, 3)
(820, 3)
(820, 3)
(820, 3)
(820, 3)
(820, 3)
(820, 3)
(820, 3)
(820, 3)
(820, 3)
(820, 3)
(819, 3)
(820, 3)
(819, 3)
(820, 3)
(820, 3)
(820, 3)
(820, 3)
(820, 3)
(820, 3)
(820, 3)
(820, 3)
(820, 3)
(820, 3)
(819, 3)
(819, 3)
(820, 3)
(819, 3)
(819, 3)
(819, 3)
(820, 3)
(820, 3)
(819, 3)
(820, 3)
(819, 3)
(819, 3)
(820, 3)
(819, 3)
(819, 3)
(820, 3)
(819, 3)
(819, 3)
(819, 3)
(819, 3)
(819, 3)
(820, 3)
(819, 3)
(819, 3)
(820, 3)
(819, 3)
(819, 3)
(819, 3)
(820, 3)
(819, 3)
(819, 3)
(819, 3)
(819, 3)
(819, 3)
(819, 3)
(819, 3)
(819, 3)
(819, 3)
(819, 3)
(819, 3)
(819, 3)
(819, 3)
(819, 3)
(819, 3)
(819, 3)
(819, 3)
(819, 3)
(819, 3)
(819, 3)
(819, 3)
(819, 3)
(819, 3)
(819, 3)
(819, 3)
(819, 3)
(819, 3)
(819, 3)
(819, 3)
(819, 3)
(819, 3)
(819, 3)
(819, 3)
(819, 3)
(819, 3)
(819, 3)
(819, 3)
(819, 3)
(819, 3)
(

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [23]:
# plot the results
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import glob

SAVE_PATH  = './output/S@98_dim'
files = glob.glob(SAVE_PATH + '/*.csv')
print(files)
data = []
for file in files:
    df = pd.read_csv(file, header=None)
    data.append(df)
data
# def plot_results(SAVE_PATH, title):
#     files = glob.glob(SAVE_PATH + '/*.csv')
#     print(files)
#     data = []
#     for file in files:
#         df = pd.read_csv(file, header=None)
#         data.append(df)
#     data = np.array(data)
#     print(data.shape)
#     data = np.mean(data, axis=0)
#     print(data.shape)
#     print(data)
#     sns.set(style="whitegrid")
#     plt.figure(figsize=(10, 6))
#     plt.plot(data)
#     plt.title(title)
#     plt.xlabel('False Positive Rate')
#     plt.ylabel('True Positive Rate')
#     plt.show()

['./output/S@98_dim/POS_RF_dim4096_samp4096_reps0_cv0.csv', './output/S@98_dim/POS_RF_dim4096_samp4096_reps0_cv1.csv', './output/S@98_dim/POS_RF_dim4096_samp4096_reps0_cv2.csv', './output/S@98_dim/POS_RF_dim4096_samp4096_reps0_cv3.csv', './output/S@98_dim/POS_RF_dim4096_samp4096_reps2_cv0.csv', './output/S@98_dim/POS_RF_dim4096_samp4096_reps1_cv0.csv', './output/S@98_dim/POS_RF_dim4096_samp4096_reps6_cv0.csv', './output/S@98_dim/POS_RF_dim4096_samp4096_reps7_cv0.csv', './output/S@98_dim/POS_RF_dim4096_samp4096_reps5_cv0.csv', './output/S@98_dim/POS_RF_dim4096_samp4096_reps3_cv0.csv', './output/S@98_dim/POS_RF_dim4096_samp4096_reps8_cv0.csv', './output/S@98_dim/POS_RF_dim4096_samp4096_reps0_cv4.csv', './output/S@98_dim/POS_RF_dim4096_samp4096_reps9_cv0.csv', './output/S@98_dim/POS_RF_dim4096_samp4096_reps2_cv1.csv', './output/S@98_dim/POS_RF_dim4096_samp4096_reps4_cv0.csv', './output/S@98_dim/POS_RF_dim4096_samp4096_reps1_cv1.csv', './output/S@98_dim/POS_RF_dim4096_samp4096_reps7_cv1.cs

[            0         1
 0    0.484167  0.515833
 1    0.484167  0.515833
 2    0.484167  0.515833
 3    0.484167  0.515833
 4    0.484167  0.515833
 ..        ...       ...
 815  0.484167  0.515833
 816  0.484167  0.515833
 817  0.484167  0.515833
 818  0.484167  0.515833
 819  0.484167  0.515833
 
 [820 rows x 2 columns],
             0         1
 0    0.484167  0.515833
 1    0.484167  0.515833
 2    0.484167  0.515833
 3    0.484167  0.515833
 4    0.484167  0.515833
 ..        ...       ...
 814  0.484167  0.515833
 815  0.484167  0.515833
 816  0.484167  0.515833
 817  0.484167  0.515833
 818  0.484167  0.515833
 
 [819 rows x 2 columns],
             0         1
 0    0.484167  0.515833
 1    0.484167  0.515833
 2    0.484167  0.515833
 3    0.484167  0.515833
 4    0.484167  0.515833
 ..        ...       ...
 814  0.484167  0.515833
 815  0.484167  0.515833
 816  0.484167  0.515833
 817  0.484167  0.515833
 818  0.484167  0.515833
 
 [819 rows x 2 columns],
             0     