In [1]:
import pickle
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sktree.ensemble import HonestForestClassifier
from sktree.stats import build_hyppo_oob_forest
from sklearn.metrics import (
    balanced_accuracy_score,
    mean_absolute_error,
    mean_squared_error,
    roc_auc_score,
    roc_curve,
    accuracy_score
)
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score, StratifiedKFold, cross_val_predict

In [2]:
from scipy.stats import multivariate_normal, entropy
from scipy.integrate import nquad


def make_trunk_classification(
    n_samples,
    n_dim=4096,
    n_informative=1,
    simulation: str = "trunk",
    mu_0: float = 0,
    mu_1: float = 1,
    rho: int = 0,
    band_type: str = "ma",
    return_params: bool = False,
    mix: float = 0.5,
    seed=None,
):
    if n_dim < n_informative:
        raise ValueError(
            f"Number of informative dimensions {n_informative} must be less than number "
            f"of dimensions, {n_dim}"
        )
    rng = np.random.default_rng(seed=seed)

    mu_0 = np.array([mu_0 / np.sqrt(i) for i in range(1, n_informative + 1)])
    mu_1 = np.array([mu_1 / np.sqrt(i) for i in range(1, n_informative + 1)])

    if rho != 0:
        if band_type == "ma":
            cov = _moving_avg_cov(n_informative, rho)
        elif band_type == "ar":
            cov = _autoregressive_cov(n_informative, rho)
        else:
            raise ValueError(f'Band type {band_type} must be one of "ma", or "ar".')
    else:
        cov = np.identity(n_informative)

    if mix < 0 or mix > 1:
        raise ValueError("Mix must be between 0 and 1.")

    # speed up computations for large multivariate normal matrix with SVD approximation
    if n_informative > 1000:
        method = "cholesky"
    else:
        method = "svd"

    if simulation == "trunk":
        X = np.vstack(
            (
                rng.multivariate_normal(mu_0, cov, n_samples // 2, method=method),
                rng.multivariate_normal(mu_1, cov, n_samples // 2, method=method),
            )
        )
    elif simulation == "trunk_overlap":
        mixture_idx = rng.choice(
            2, n_samples // 2, replace=True, shuffle=True, p=[mix, 1 - mix]
        )
        norm_params = [[mu_0, cov], [mu_1, cov]]
        X_mixture = np.fromiter(
            (
                rng.multivariate_normal(*(norm_params[i]), size=1, method=method)
                for i in mixture_idx
            ),
            dtype=np.dtype((float, n_informative)),
        )
        X_mixture_2 = np.fromiter(
            (
                rng.multivariate_normal(*(norm_params[i]), size=1, method=method)
                for i in mixture_idx
            ),
            dtype=np.dtype((float, n_informative)),
        )

        X = np.vstack(
            (
                X_mixture_2.reshape(n_samples // 2, n_informative),
                X_mixture.reshape(n_samples // 2, n_informative),
            )
        )
    elif simulation == "trunk_mix":
        mixture_idx = rng.choice(
            2, n_samples // 2, replace=True, shuffle=True, p=[mix, 1 - mix]
        )
        norm_params = [[mu_0, cov], [mu_1, cov]]
        X_mixture = np.fromiter(
            (
                rng.multivariate_normal(*(norm_params[i]), size=1, method=method)
                for i in mixture_idx
            ),
            dtype=np.dtype((float, n_informative)),
        )

        X = np.vstack(
            (
                rng.multivariate_normal(
                    np.zeros(n_informative), cov, n_samples // 2, method=method
                ),
                X_mixture.reshape(n_samples // 2, n_informative),
            )
        )
    else:
        raise ValueError(f"Simulation must be: trunk, trunk_overlap, trunk_mix")

    if n_dim > n_informative:
        X = np.hstack(
            (X, rng.normal(loc=0, scale=1, size=(X.shape[0], n_dim - n_informative)))
        )

    y = np.concatenate((np.zeros(n_samples // 2), np.ones(n_samples // 2)))

    if return_params:
        returns = [X, y]
        if simulation == "trunk":
            returns += [[mu_0, mu_1], [cov, cov]]
        elif simulation == "trunk-overlap":
            returns += [[np.zeros(n_informative), np.zeros(n_informative)], [cov, cov]]
        elif simulation == "trunk-mix":
            returns += [*list(zip(*norm_params)), X_mixture]
        return returns
    return X, y

In [3]:
def Calculate_SA98(y_true, y_pred_proba, max_fpr=0.02) -> float:
    if y_true.squeeze().ndim != 1:
        raise ValueError(f"y_true must be 1d, not {y_true.shape}")
    if 0 in y_true or -1 in y_true:
        fpr, tpr, thresholds = roc_curve(
            y_true, y_pred_proba[:, 1], pos_label=1, drop_intermediate=False
        )
    else:
        fpr, tpr, thresholds = roc_curve(
            y_true, y_pred_proba[:, 1], pos_label=2, drop_intermediate=False
        )
    s98 = max([tpr for (fpr, tpr) in zip(fpr, tpr) if fpr <= max_fpr])
    return s98


def Calculate_MI(y_true, y_pred_proba):
    H_YX = np.mean(entropy(y_pred_proba, base=np.exp(1), axis=1))
    # empirical count of each class (n_classes)
    _, counts = np.unique(y_true, return_counts=True)
    H_Y = entropy(counts, base=np.exp(1))
    return H_Y - H_YX

In [4]:
SAMPLE_SIZES = [256, 512, 1024, 2048, 4096]
FIXED_SIZE = 4096
N_ITR = 10
SAVE_PATH  = './output/trunk'

observe_probas = []
# null_probas = []
for i in range(N_ITR):
    print(i)
    observe_probas.append([])
    # null_probas.append([])
    for j in SAMPLE_SIZES:
        print(j)
        dim = FIXED_SIZE
        X, y = make_trunk_classification(
            n_samples=j, n_dim=dim, n_informative=1, seed=i+j
        )
        model = SVC(probability=True,kernel = 'rbf')  # Set probability=True to enable probability estimates
        cv = StratifiedKFold(n_splits=5, shuffle=True)
        observe_proba_ij = []
        k=0
        for train_ix, test_ix in cv.split(X,y):
            X_train, X_test = X[train_ix, :], X[test_ix, :]
            y_train, y_test = y[train_ix], y[test_ix]
            model.fit(X_train,y_train)
            observe_proba = model.predict_proba(X_test)
            observe_proba = np.hstack((y_test.reshape(-1, 1),observe_proba))
            np.savetxt(SAVE_PATH + "/POS_SVMrbf_dim4096_samp{}_reps{}_cv{}.csv".format(j,i,k), observe_proba, delimiter=",")


0
256
512
1024


KeyboardInterrupt: 

In [18]:
from sklearn.linear_model import LogisticRegression
SAMPLE_SIZES = [4096]
FIXED_SIZE = 4096
N_ITR = 10
SAVE_PATH  = './output/trunk/'

observe_probas = []
# null_probas = []
for i in N_ITR:
    print(i)
    for j in SAMPLE_SIZES:
        print(j)
        dim = FIXED_SIZE
        X, y = make_trunk_classification(
            n_samples=j, n_dim=dim, n_informative=1, seed=i+j
        )
        clf = LogisticRegression(penalty='l1',solver = 'liblinear')  # Set probability=True to enable probability estimates
        cv = StratifiedKFold(n_splits=5, shuffle=True)
        k=0
        for train_ix, test_ix in cv.split(X,y):
            X_train, X_test = X[train_ix, :], X[test_ix, :]
            y_train, y_test = y[train_ix], y[test_ix]
            clf.fit(X_train,y_train)
            observe_proba = model.predict_proba(X_test)
            observe_proba = np.hstack((y_test.reshape(-1, 1),observe_proba))
            np.savetxt(SAVE_PATH + "/POS_LGliblin_dim4096_samp{}_reps{}_cv{}.csv".format(j,i,k), observe_proba, delimiter=",")
            k +=1


9
4096
2


In [8]:
from sklearn.neighbors import KNeighborsClassifier
SAMPLE_SIZES = [256, 512, 1024, 2048, 4096]
FIXED_SIZE = 4096
N_ITR = 10
# SAVE_PATH  = '/Users/baiyuxin/Desktop/JHU/NDD/Cancer/POS_MIXGaussian0221'

for i in range(N_ITR):
    print(i)
    # null_probas.append([])
    for j in SAMPLE_SIZES:
        print(j)
        dim = FIXED_SIZE
        X, y = make_trunk_classification(
            n_samples=j, n_dim=dim, n_informative=1, seed=i+j
        )
        model =  KNeighborsClassifier(n_neighbors=int(np.sqrt(j))+1)
        cv = StratifiedKFold(n_splits=5, shuffle=True)
        k=0
        for train_ix, test_ix in cv.split(X,y):
            X_train, X_test = X[train_ix, :], X[test_ix, :]
            y_train, y_test = y[train_ix], y[test_ix]
            model.fit(X_train,y_train)
            observe_proba = model.predict_proba(X_test)
            observe_proba = np.hstack((y_test.reshape(-1, 1),observe_proba))
            np.savetxt(SAVE_PATH + "/POS_KNN_dim4096_samp{}_reps{}_cv{}.csv".format(j,i,k), observe_proba, delimiter=",")
            k +=1
        # print(len(observe_probas))


0
256
512
1024
2048
4096
1
256
512
1024
2048
4096
2
256
512
1024
2048
4096
3
256
512
1024
2048
4096
4
256
512
1024
2048
4096
5
256
512
1024
2048
4096
6
256
512
1024
2048
4096
7
256
512
1024
2048
4096
8
256
512
1024
2048
4096
9
256
512
1024
2048
4096


In [6]:
from sklearn.ensemble import RandomForestClassifier
SAMPLE_SIZES = [256, 512, 1024, 2048, 4096]
FIXED_SIZE = 4096
N_ITR = 10
SAVE_PATH  = '/Users/baiyuxin/Desktop/JHU/NDD/Cancer/POS_MIXGaussian0221'

observe_probas = []
# null_probas = []
for i in range(N_ITR):
    print(i)
    observe_probas.append([])
    # null_probas.append([])
    for j in SAMPLE_SIZES:
        print(j)
        dim = FIXED_SIZE
        X, y = make_trunk_classification(
            n_samples=j, n_dim=dim, n_informative=1, seed=i+j
        )
        model = RandomForestClassifier(n_estimators = 1200,
                                             max_samples = 1.0,
                                             max_features = 0.3,
                                             random_state=i)
        cv = StratifiedKFold(n_splits=5, shuffle=True)
        observe_proba_ij = []
        k=0
        for train_ix, test_ix in cv.split(X,y):
            X_train, X_test = X[train_ix, :], X[test_ix, :]
            y_train, y_test = y[train_ix], y[test_ix]
            model.fit(X_train,y_train)
            observe_proba = model.predict_proba(X_test)
            observe_proba_ij.append(observe_proba)
            # print(observe_proba.shape)
            np.savetxt(SAVE_PATH + "/POS_RF_dim4096_samp{}_reps{}_cv{}.csv".format(j,i,k), observe_proba, delimiter=",")
            k +=1
        observe_probas.append(observe_proba_ij)
        print(len(observe_probas))


0
256
2
512
3
1024


  rng.multivariate_normal(


4
2048
5
4096


KeyboardInterrupt: 