Copyright (c) 2024 Microsoft Corporation.

Licensed under the MIT License

Experimenting with the EuroSAT dataset under 11 settings:
- Semantic Shift (10 settings): Leave-one-class-out. Train on 9 classes and test on all 10 classes
- Covariate Shift (1 setting): Longitude-wise split. Train on West and test on East

For each, we perform: 
- activation extraction
- downsample benchmarking
- layer benchmarking
- g training and evaluation
- g_hat training and evaluation
- g and g_hat statistical significance test
- g benchmark
- clustering benchmark
- num_cluster vs. g and g_hat performance investigation
- activation space visualization    

In [None]:
import os
import sys

import torch

sys.path.append("..")

from src.tardis.eurosat_xview_utils import *
from src.tardis.utils import set_seed

# Report the CUDA environment. The per-device queries are only issued when a
# GPU is present, because current_device()/get_device_name() raise otherwise.
print("CUDA available:", torch.cuda.is_available())
print("Number of GPUs:", torch.cuda.device_count())
if torch.cuda.is_available():
    print("Current GPU:", torch.cuda.current_device())
    print("GPU Name:", torch.cuda.get_device_name(0))
else:
    print("Current GPU:", None)
    print("GPU Name:", None)

%load_ext autoreload
%autoreload 2

# Fix the seed used throughout the notebook and hand it to the project helper.
# NOTE(review): which RNGs set_seed covers (python/numpy/torch) is not visible here.
fixed_seed = 31
set_seed(fixed_seed)

In [None]:
# Root directories for trained-model checkpoints and experiment configs.
# Both are relative paths, so they resolve against the current working directory.
ckpt_main_dir = "./exp_data/main_tardis/eurosat_exp_logs"
config_main_dir = "./geospatial-ood-detection/configs"


# Define a function to construct the paths dynamically
def construct_paths(main_dir, sub_dir, filename):
    """Return ``main_dir/sub_dir/filename`` joined with the OS path separator."""
    components = (main_dir, sub_dir, filename)
    return os.path.join(*components)


# Define subdirectories and filenames for each configuration and checkpoint
# Config/checkpoint locations for every experiment, generated from one table of
# (experiment key, config file, checkpoint run directory, checkpoint file).
# Ten leave-one-class-out runs come first, followed by the spatial split.
paths = {
    experiment: {
        "config": construct_paths(config_main_dir, "eurosat", config_file),
        "ckpt": construct_paths(ckpt_main_dir, run_dir, ckpt_file),
    }
    for experiment, config_file, run_dir, ckpt_file in (
        (
            "forest",
            "eurosat_holdout_forest.yaml",
            "Holdout_Forest_resnet50_0066",
            "epoch=21-step=4972.ckpt",
        ),
        (
            "herb_veg",
            "eurosat_holdout_herbaceousvegetation.yaml",
            "Holdout_HerbaceousVegetation_resnet50_0066",
            "epoch=27-step=6328.ckpt",
        ),
        (
            "highway",
            "eurosat_holdout_highway.yaml",
            "Holdout_Highway_resnet50_0066",
            "epoch=28-step=6670.ckpt",
        ),
        (
            "industrial",
            "eurosat_holdout_industrial.yaml",
            "Holdout_Industrial_resnet50_0066",
            "epoch=32-step=7590.ckpt",
        ),
        (
            "pasture",
            "eurosat_holdout_pasture.yaml",
            "Holdout_Pasture_resnet50_0066",
            "epoch=34-step=8225.ckpt",
        ),
        (
            "permanentcrop",
            "eurosat_holdout_permanentcrop.yaml",
            "Holdout_PermanentCrop_resnet50_0066",
            "epoch=26-step=6210.ckpt",
        ),
        (
            "residential",
            "eurosat_holdout_residential.yaml",
            "Holdout_Residential_resnet50_0066",
            "epoch=49-step=11250.ckpt",
        ),
        (
            "river",
            "eurosat_holdout_river.yaml",
            "Holdout_River_resnet50_0066",
            "epoch=31-step=7392.ckpt",
        ),
        (
            "sealake",
            "eurosat_holdout_sealake.yaml",
            "Holdout_SeaLake_resnet50_0066",
            "epoch=34-step=7875.ckpt",
        ),
        (
            "annualcrop",
            "eurosat_holdout_annualcrop.yaml",
            "Holdout_AnnualCrop_resnet50_0066",
            "epoch=15-step=3616.ckpt",
        ),
        (
            "spatial_split",
            "eurosat_spatial_config.yaml",
            "eurosat_spatial_0776",
            "epoch=18-step=4826.ckpt",
        ),
    )
}

# Activation-extraction parameters
layer = ["conv1"]
downsample_method = "avg_pool"
getitem_keys = ["image", "label"]
device = "cuda" if torch.cuda.is_available() else "cpu"
n_batches_to_process = 2

# Downsampling methods and benchmark result holders.
# Each method appears once; "avg_pool" was previously listed twice.
downsample_methods = ["avg_pool", "mean_std", "max_pool", "nodownsample"]
downsample_benchmark = {}
layer_benchmark = {}

# Classifier / data-split parameters
collect_activations_from_layers = ["conv1"]
verbose = False
test_size = 0.2
n_estimators = 100
split_seed = 31
fixed_classifier_seed = 31

# Hyper-parameter search bounds
n_optuna_trials = 20
min_cluster = 2
max_cluster_ratio = 0.2
min_fraction = 0.01
max_fraction = 0.2
# Kept so this cell can be re-run on its own; same value as the seed set at the top.
fixed_seed = 31

# Echo every resolved config/checkpoint path so a wrong location is spotted early
for key, path in paths.items():
    print(
        f"{key} config path: {path['config']}",
        f"{key} ckpt path: {path['ckpt']}",
        sep="\n",
    )

# Forest

### Start

In [None]:
# Forwarded to get_X_y_arrays below.
# NOTE(review): presumably requests the augmented model inputs as well — confirm in the helper.
collect_aug_input = True

# Load the "forest" hold-out experiment through the project helper. It returns
# ten values that are unpacked in order; the positional arguments must follow
# the helper's signature, which is not visible in this notebook.
(
    X,
    y,
    model,
    datamodule,
    train_dataloader,
    val_dataloader,
    test_dataloader,
    cfg_dict,
    x_sample_train_tensor,
    x_sample_test_tensor,
) = get_X_y_arrays(
    paths["forest"]["config"],
    paths["forest"]["ckpt"],
    layer,
    downsample_method,
    getitem_keys,
    device,
    n_batches_to_process,
    mode="holdout",
    verbose=False,
    collect_aug_input=collect_aug_input,
)

In [None]:
# Move the sampled inputs to the configured device (which falls back to CPU)
# rather than a hard-coded "cuda", which raises on machines without a GPU.
x_sample_train_tensor = torch.Tensor(x_sample_train_tensor).to(device)
x_sample_test_tensor = torch.Tensor(x_sample_test_tensor).to(device)
print(x_sample_train_tensor.shape, x_sample_test_tensor.shape)

### External Benchmark -- WIP

In [None]:
# Arguments for the project helper run_g_hat_experiment.
# NOTE(review): the meaning of k and M is defined by that helper — confirm there.
k = int(0.3 * len(X))  # 30% of the number of rows in X
M = 0.1
fixed_seed = 31
i = 0
test_size = 0.2

# fixed_seed is passed twice and i + 1 last; the helper's signature is not visible here.
metrics, X_samples, y_ground_truth, y_surrogate, train_indices, baseline_indices = (
    run_g_hat_experiment(X, y, test_size, k, M, fixed_seed, fixed_seed, i + 1)
)

# Random 0/1 labels with one entry per ground-truth label, used later as a chance-level reference
y_random = np.random.randint(2, size=len(y_ground_truth))
y_ground_truth.shape, y_surrogate.shape, y_random.shape

In [None]:
# Display the shapes of the two index arrays returned by run_g_hat_experiment
train_indices.shape, baseline_indices.shape

In [None]:
# Distinct values contained in train_indices (np.unique returns them sorted)
unique_samples = np.unique(train_indices)
unique_samples

In [None]:
# Row positions of the samples whose ground-truth label is 0 and 1 respectively
idx_0, idx_1 = (np.where(y_ground_truth == label)[0] for label in (0, 1))

# Label-0 rows form the "train" group, label-1 rows the "test" group
X_train, X_test = X_samples[idx_0], X_samples[idx_1]

# Tensor copies consumed by the torch-based scoring functions
X_train_tensor, X_test_tensor = torch.Tensor(X_train), torch.Tensor(X_test)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)

In [None]:
# Surrogate labels of the two ground-truth groups
y_surrogate_train = y_surrogate[idx_0]
y_surrogate_test = y_surrogate[idx_1]
print("y_surrogate_train shape:", y_surrogate_train.shape)
print("y_surrogate_test shape:", y_surrogate_test.shape)
# Print the surrogate-label balance of each group explicitly: a bare expression
# that is not the last statement of a cell is evaluated and then discarded.
print("y_surrogate_train counts:", np.unique(y_surrogate_train, return_counts=True))
print("y_surrogate_test counts:", np.unique(y_surrogate_test, return_counts=True))

# Group-0 labels first, then group-1 labels
y_surrogate_combined_labels = np.concatenate([y_surrogate_train, y_surrogate_test])
print("y_surrogate_combined_labels shape:", y_surrogate_combined_labels.shape)

#### Apply ReAct

In [None]:
def react(logits, threshold):
    """
    Cap every element of ``logits`` at ``threshold`` (ReAct-style clipping).

    Parameters:
    - logits: Tensor of values to clip.
    - threshold: Upper bound; larger elements are replaced by this value.

    Returns:
    - New tensor of the same shape in which no element exceeds ``threshold``.
    """
    clipped = logits.clamp(max=threshold)
    return clipped


# Evaluation Functions
def evaluate_react(id_logits, ood_logits, replace_labels, threshold):
    """
    Score ID/OOD samples with ReAct clipping and compute detection metrics.

    Each sample's score is the maximum of its clipped values along dim 1.

    Parameters:
    - id_logits: 2-D tensor of in-distribution samples (one row per sample).
    - ood_logits: 2-D tensor of out-of-distribution samples.
    - replace_labels: Optional label array for the concatenated [ID, OOD] scores.
      When None, ID rows are labelled 1 and OOD rows 0.
    - threshold: Clipping threshold handed to ``react``.

    Returns:
    - Tuple ``(auroc, fpr95, accuracy)``.
    """
    # Per-sample score: largest clipped value
    id_scores = torch.max(react(id_logits, threshold), dim=1)[0].cpu().numpy()
    ood_scores = torch.max(react(ood_logits, threshold), dim=1)[0].cpu().numpy()
    combined_scores = np.concatenate([id_scores, ood_scores])

    if replace_labels is None:
        # Ground truth: 1 for ID, 0 for OOD, in the same order as the scores
        combined_labels = np.concatenate(
            [np.ones_like(id_scores), np.zeros_like(ood_scores)]
        )
    else:
        combined_labels = replace_labels

    auroc = roc_auc_score(combined_labels, combined_scores)

    # FPR at the first operating point whose TPR reaches 95%
    fpr, tpr, _ = roc_curve(combined_labels, combined_scores)
    fpr95 = fpr[np.where(tpr >= 0.95)[0][0]]

    # The scores are unbounded, so a fixed 0.5 cut is not a meaningful decision
    # boundary. Cut at the mean score, as the ASH/MSP/Energy evaluators do.
    predicted_labels = (combined_scores >= combined_scores.mean()).astype(int)
    accuracy = accuracy_score(combined_labels, predicted_labels)

    return auroc, fpr95, accuracy


def find_optimal_threshold_react(
    id_logits, ood_logits, threshold_range, replace_labels, metric="auroc"
):
    """
    Grid-search the ReAct clipping threshold that optimises ``metric``.

    Parameters:
    - id_logits / ood_logits: Tensors forwarded to ``evaluate_react``.
    - threshold_range: Iterable of candidate thresholds.
    - replace_labels: Optional label override forwarded to ``evaluate_react``.
    - metric: 'auroc' or 'accuracy' (maximised) or 'fpr95' (minimised).

    Returns:
    - The best threshold, or None when ``threshold_range`` is empty.

    Raises:
    - ValueError: if ``metric`` is not one of the supported names.
    """
    metric_position = {"auroc": 0, "fpr95": 1, "accuracy": 2}
    # Validate before any evaluation so a typo fails immediately,
    # including when the candidate range is empty.
    if metric not in metric_position:
        raise ValueError(
            "Invalid metric specified. Choose from 'auroc', 'fpr95', or 'accuracy'."
        )
    lower_is_better = metric == "fpr95"

    best_threshold = None
    best_metric_value = float("inf") if lower_is_better else -float("inf")

    for threshold in threshold_range:
        auroc, fpr95, accuracy = evaluate_react(
            id_logits, ood_logits, replace_labels, threshold
        )
        print(
            f"Threshold: {threshold}, AUROC: {auroc}, FPR95: {fpr95}, Accuracy: {accuracy}"
        )

        current_metric_value = (auroc, fpr95, accuracy)[metric_position[metric]]
        if lower_is_better:
            improved = current_metric_value < best_metric_value
        else:
            improved = current_metric_value > best_metric_value

        # Strict comparison: on ties the earliest candidate is kept
        if improved:
            best_metric_value = current_metric_value
            best_threshold = threshold

    print(
        f"Optimal Threshold: {best_threshold} with {metric.upper()}: {best_metric_value}"
    )
    return best_threshold

In [None]:
# Metric optimised by the threshold search below
observe_metric_search = "accuracy"
results = {}
# Define a range of thresholds to test for ReAct
threshold_range = np.linspace(0.1, 1500.0, num=50)

# Label sets to score against: None -> evaluator-generated ground truth (ID=1, OOD=0),
# "surr" -> surrogate labels, "rand" -> random 0/1 labels.
# NOTE(review): confirm the surrogate labels use the same polarity as the ground-truth labelling.
labels = {"gt": None, "surr": y_surrogate_combined_labels, "rand": y_random}

for key, value in labels.items():
    replace_labels = value
    print(f"Replace Labels: {key}")
    # Find the optimal threshold for ReAct
    optimal_threshold_react = find_optimal_threshold_react(
        X_train_tensor,
        X_test_tensor,
        threshold_range,
        replace_labels=replace_labels,
        metric=observe_metric_search,
    )
    # Evaluate ReAct with the optimal threshold (same data the threshold was selected on)
    auroc_react, fpr95_react, accuracy_react = evaluate_react(
        X_train_tensor,
        X_test_tensor,
        replace_labels=replace_labels,
        threshold=optimal_threshold_react,
    )
    print(
        f"ReAct - AUROC: {auroc_react}, FPR95: {fpr95_react}, Accuracy: {accuracy_react}"
    )

    # Only the accuracy is kept per label set
    results[key] = {"Accuracy": accuracy_react}

#### Apply ASH

In [None]:
# ASH Functions


def ash_p(logits, alpha):
    """
    Zero every element below ``alpha`` and keep the remaining elements unchanged.

    NOTE(review): named "ASH-P" in this notebook; confirm the intended
    correspondence with the variants defined in the ASH paper.

    Parameters:
    - logits: Tensor of raw values.
    - alpha: Cut-off; elements ``>= alpha`` survive.

    Returns:
    - Tensor of the same shape with the sub-threshold elements set to zero.
    """
    keep_mask = (logits >= alpha).float()
    return logits * keep_mask


def ash_s(logits, alpha):
    """
    Zero every element whose magnitude is below ``alpha``; keep the others unchanged.

    NOTE(review): named "ASH-S" in this notebook; confirm the intended
    correspondence with the variants defined in the ASH paper.

    Parameters:
    - logits: Tensor of raw values.
    - alpha: Magnitude cut-off; elements with ``|x| >= alpha`` survive.

    Returns:
    - Tensor of the same shape with the small-magnitude elements set to zero.
    """
    magnitude = logits.abs()
    keep_mask = (magnitude >= alpha).float()
    return logits * keep_mask


def ash_b(logits, alpha):
    """
    Keep only elements lying at least ``alpha`` standard deviations from their
    column mean; everything else is set to zero.

    Mean and standard deviation are computed over dim 0 of ``logits`` itself.

    NOTE(review): named "ASH-B" in this notebook; confirm the intended
    correspondence with the variants defined in the ASH paper.

    Parameters:
    - logits: Tensor of raw values, one row per sample.
    - alpha: Number of standard deviations used as the cut-off.

    Returns:
    - Tensor of the same shape with the near-mean elements set to zero.
    """
    center = logits.mean(dim=0, keepdim=True)
    spread = logits.std(dim=0, keepdim=True)
    deviation = (logits - center).abs()
    keep_mask = (deviation >= alpha * spread).float()
    return logits * keep_mask


def _ash_detection_metrics(shaped_id_logits, shaped_ood_logits, replace_labels):
    """Compute (auroc, fpr95, accuracy) from already-shaped ID and OOD tensors.

    A sample's score is the maximum of its row. When ``replace_labels`` is None
    the labels are 1 for ID rows and 0 for OOD rows, in that order; otherwise
    ``replace_labels`` is used as the label vector for the concatenated scores.
    """
    id_scores = torch.max(shaped_id_logits, dim=1)[0].cpu().numpy()
    ood_scores = torch.max(shaped_ood_logits, dim=1)[0].cpu().numpy()
    combined_scores = np.concatenate([id_scores, ood_scores])

    if replace_labels is None:
        combined_labels = np.concatenate(
            [np.ones_like(id_scores), np.zeros_like(ood_scores)]
        )
    else:
        combined_labels = replace_labels

    auroc = roc_auc_score(combined_labels, combined_scores)

    # FPR at the first operating point whose TPR reaches 95%
    fpr, tpr, _ = roc_curve(combined_labels, combined_scores)
    fpr95 = fpr[np.where(tpr >= 0.95)[0][0]]

    # Accuracy of the rule "score >= mean score => positive"
    predicted_labels = (combined_scores >= combined_scores.mean()).astype(int)
    accuracy = accuracy_score(combined_labels, predicted_labels)

    return auroc, fpr95, accuracy


def evaluate_ash_p(id_logits, ood_logits, replace_labels, alpha):
    """Apply ``ash_p`` with ``alpha`` to both sets and return (auroc, fpr95, accuracy)."""
    return _ash_detection_metrics(
        ash_p(id_logits, alpha), ash_p(ood_logits, alpha), replace_labels
    )


def evaluate_ash_s(id_logits, ood_logits, replace_labels, alpha):
    """Apply ``ash_s`` with ``alpha`` to both sets and return (auroc, fpr95, accuracy)."""
    return _ash_detection_metrics(
        ash_s(id_logits, alpha), ash_s(ood_logits, alpha), replace_labels
    )


def evaluate_ash_b(id_logits, ood_logits, replace_labels, alpha):
    """Apply ``ash_b`` with ``alpha`` to both sets and return (auroc, fpr95, accuracy).

    ``ash_b`` is applied to the ID and OOD tensors separately, so each set is
    shaped with its own mean and standard deviation.
    """
    return _ash_detection_metrics(
        ash_b(id_logits, alpha), ash_b(ood_logits, alpha), replace_labels
    )


def _search_best_value(candidates, evaluate, metric, label):
    """Return the candidate whose ``evaluate(candidate)`` result optimises ``metric``.

    ``evaluate`` must return ``(auroc, fpr95, accuracy)``. 'auroc' and 'accuracy'
    are maximised and 'fpr95' is minimised; on ties the earliest candidate wins.
    ``label`` ("Threshold" / "Alpha") is only used in the progress messages.
    Returns None when ``candidates`` is empty and raises ValueError for an
    unknown ``metric`` before anything is evaluated.
    """
    metric_position = {"auroc": 0, "fpr95": 1, "accuracy": 2}
    if metric not in metric_position:
        raise ValueError(
            "Invalid metric specified. Choose from 'auroc', 'fpr95', or 'accuracy'."
        )
    lower_is_better = metric == "fpr95"

    best_value = None
    best_metric_value = float("inf") if lower_is_better else -float("inf")

    for candidate in candidates:
        auroc, fpr95, accuracy = evaluate(candidate)
        print(
            f"{label}: {candidate}, AUROC: {auroc}, FPR95: {fpr95}, Accuracy: {accuracy}"
        )

        current_metric_value = (auroc, fpr95, accuracy)[metric_position[metric]]
        if lower_is_better:
            improved = current_metric_value < best_metric_value
        else:
            improved = current_metric_value > best_metric_value

        if improved:
            best_metric_value = current_metric_value
            best_value = candidate

    print(f"Optimal {label}: {best_value} with {metric.upper()}: {best_metric_value}")
    return best_value


def find_optimal_threshold_react(
    id_logits, ood_logits, threshold_range, replace_labels, metric="auroc"
):
    """Grid-search the ReAct threshold that optimises ``metric``.

    Returns the best threshold, or None when ``threshold_range`` is empty.
    """
    return _search_best_value(
        threshold_range,
        lambda threshold: evaluate_react(
            id_logits, ood_logits, replace_labels, threshold
        ),
        metric,
        "Threshold",
    )


def find_optimal_alpha_ash_p(
    id_logits, ood_logits, alpha_range, replace_labels, metric="auroc"
):
    """Grid-search the ``alpha`` of ``evaluate_ash_p`` that optimises ``metric``.

    Returns the best alpha, or None when ``alpha_range`` is empty.
    """
    return _search_best_value(
        alpha_range,
        lambda alpha: evaluate_ash_p(id_logits, ood_logits, replace_labels, alpha),
        metric,
        "Alpha",
    )


def find_optimal_alpha_ash_b(
    id_logits, ood_logits, alpha_range, replace_labels, metric="auroc"
):
    """Grid-search the ``alpha`` of ``evaluate_ash_b`` that optimises ``metric``.

    Returns the best alpha, or None when ``alpha_range`` is empty.
    """
    return _search_best_value(
        alpha_range,
        lambda alpha: evaluate_ash_b(id_logits, ood_logits, replace_labels, alpha),
        metric,
        "Alpha",
    )


def find_optimal_alpha_ash_s(
    id_logits, ood_logits, alpha_range, replace_labels, metric="auroc"
):
    """Grid-search the ``alpha`` of ``evaluate_ash_s`` that optimises ``metric``.

    Returns the best alpha, or None when ``alpha_range`` is empty.
    """
    return _search_best_value(
        alpha_range,
        lambda alpha: evaluate_ash_s(id_logits, ood_logits, replace_labels, alpha),
        metric,
        "Alpha",
    )

In [None]:
# Metric optimised by the alpha search, and the grid of alpha candidates
observe_metric_search = "accuracy"
results = {}
alpha_range = np.linspace(0.1, 10, num=100)

# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

# Label sets to score against: None -> evaluator-generated ground truth,
# "surr" -> surrogate labels, "rand" -> random labels
labels = {"gt": None, "surr": y_surrogate_combined_labels, "rand": y_random}

for key, value in labels.items():
    replace_labels = value
    print(f"Replace Labels: {key}")
    # Pick the best alpha for ash_p, then re-evaluate once with it
    optimal_alpha_p_ash_p = find_optimal_alpha_ash_p(
        X_train_tensor,
        X_test_tensor,
        alpha_range,
        replace_labels=replace_labels,
        metric=observe_metric_search,
    )
    auroc, fpr95, accuracy = evaluate_ash_p(
        X_train_tensor,
        X_test_tensor,
        replace_labels=replace_labels,
        alpha=optimal_alpha_p_ash_p,
    )
    print(f"AUROC: {auroc}, FPR95: {fpr95}, Accuracy: {accuracy}")

    # Only the accuracy is kept per label set
    results[key] = {"Accuracy": accuracy}

# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

In [None]:
# Display the accuracies collected in `results`
results

In [None]:
# Remove these names from the notebook namespace before the next run
del results, fpr95, auroc, accuracy

In [None]:
# Metric optimised by the alpha search, and the grid of alpha candidates
observe_metric_search = "accuracy"
results = {}
alpha_range = np.linspace(0.1, 10, num=100)

# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

# Label sets to score against: None -> evaluator-generated ground truth,
# "surr" -> surrogate labels, "rand" -> random labels
labels = {"gt": None, "surr": y_surrogate_combined_labels, "rand": y_random}

for key, value in labels.items():
    replace_labels = value
    print(f"Replace Labels: {key}")
    # Pick the best alpha for ash_s, then re-evaluate once with it
    optimal_alpha_s_ash_s = find_optimal_alpha_ash_s(
        X_train_tensor,
        X_test_tensor,
        alpha_range,
        replace_labels=replace_labels,
        metric=observe_metric_search,
    )
    auroc, fpr95, accuracy = evaluate_ash_s(
        X_train_tensor,
        X_test_tensor,
        replace_labels=replace_labels,
        alpha=optimal_alpha_s_ash_s,
    )
    print(f"AUROC: {auroc}, FPR95: {fpr95}, Accuracy: {accuracy}")

    # Only the accuracy is kept per label set
    results[key] = {"Accuracy": accuracy}

# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

In [None]:
# Display the accuracies collected in `results`
results

From here on, the methods need the input images (which are passed through the model), not the extracted activation samples.

#### ODIN

The ID set is limited to 1000 samples to avoid out-of-memory (OOM) errors.

In [None]:
import numpy as np
import torch
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve


# ODIN Function
def odin(model, inputs, temperature, epsilon):
    """
    Return temperature-scaled model outputs for ODIN-perturbed inputs.

    The inputs are moved by ``epsilon`` in the sign direction of the gradient of
    the temperature-scaled maximum output, clamped to [0, 1], and passed through
    the model again.

    Parameters:
    - model: Callable mapping a batch of inputs to a 2-D tensor of outputs.
    - inputs: Batch tensor. It is not modified: the gradient is taken on a
      detached copy.
    - temperature: Divisor applied to the model outputs.
    - epsilon: Perturbation magnitude.

    Returns:
    - Tensor ``model(perturbed_inputs) / temperature``, detached from autograd.

    NOTE(review): the clamp assumes the inputs live in [0, 1] — confirm against
    the datamodule's normalisation.
    """
    # Work on a private leaf copy so the caller's tensor keeps its requires_grad
    # flag and no gradient accumulates on it across repeated calls.
    probe = inputs.detach().clone().requires_grad_(True)
    outputs = model(probe) / temperature
    max_logit = torch.max(outputs, dim=1)[0]

    # Gradient w.r.t. the inputs only; unlike backward(), this leaves the
    # .grad fields of the model parameters untouched.
    (input_grad,) = torch.autograd.grad(max_logit.sum(), probe)

    with torch.no_grad():
        # The second forward pass needs no autograd graph, which saves memory.
        perturbation = epsilon * input_grad.sign()
        perturbed_inputs = torch.clamp(probe.detach() + perturbation, 0, 1)
        perturbed_outputs = model(perturbed_inputs) / temperature

    return perturbed_outputs


# Evaluation Function for ODIN
def evaluate_odin(model, id_inputs, ood_inputs, temperature, replace_labels, epsilon):
    """
    Score ID/OOD inputs with ODIN and compute detection metrics.

    Each sample's score is the maximum of its ``odin`` output row.

    Parameters:
    - model: Model forwarded to ``odin``.
    - id_inputs / ood_inputs: Input batches for the ID and OOD sets.
    - temperature: Temperature forwarded to ``odin``.
    - replace_labels: Optional label array for the concatenated [ID, OOD] scores.
      When None, ID rows are labelled 1 and OOD rows 0.
    - epsilon: Perturbation magnitude forwarded to ``odin``.

    Returns:
    - Tuple ``(auroc, fpr95, accuracy)``.
    """
    odin_id_logits = odin(model, id_inputs, temperature, epsilon)
    odin_ood_logits = odin(model, ood_inputs, temperature, epsilon)

    id_scores = torch.max(odin_id_logits, dim=1)[0].detach().cpu().numpy()
    ood_scores = torch.max(odin_ood_logits, dim=1)[0].detach().cpu().numpy()
    combined_scores = np.concatenate([id_scores, ood_scores])

    if replace_labels is None:
        # Ground truth: 1 for ID, 0 for OOD, in the same order as the scores
        combined_labels = np.concatenate(
            [np.ones_like(id_scores), np.zeros_like(ood_scores)]
        )
    else:
        combined_labels = replace_labels

    auroc = roc_auc_score(combined_labels, combined_scores)

    # FPR at the first operating point whose TPR reaches 95%
    fpr, tpr, _ = roc_curve(combined_labels, combined_scores)
    fpr95 = fpr[np.where(tpr >= 0.95)[0][0]]

    # Temperature-scaled outputs are unbounded, so a fixed 0.5 cut is not a
    # meaningful decision boundary. Cut at the mean score, as the
    # ASH/MSP/Energy evaluators do.
    predicted_labels = (combined_scores >= combined_scores.mean()).astype(int)
    accuracy = accuracy_score(combined_labels, predicted_labels)

    return auroc, fpr95, accuracy


# Find Optimal Temperature and Epsilon for ODIN
def find_optimal_odin_params(
    model,
    id_inputs,
    ood_inputs,
    temperature_range,
    epsilon_range,
    replace_labels,
    metric="auroc",
):
    """
    Grid-search the ODIN temperature and epsilon that optimise ``metric``.

    Parameters:
    - model, id_inputs, ood_inputs, replace_labels: Forwarded to ``evaluate_odin``.
    - temperature_range / epsilon_range: Candidates; every pair is evaluated.
      ``epsilon_range`` is iterated once per temperature, so it must be re-iterable.
    - metric: 'auroc' or 'accuracy' (maximised) or 'fpr95' (minimised).

    Returns:
    - ``(best_temperature, best_epsilon)``; both None when no pair was evaluated.

    Raises:
    - ValueError: if ``metric`` is not one of the supported names.
    """
    metric_position = {"auroc": 0, "fpr95": 1, "accuracy": 2}
    # Validate before the (expensive) search so a typo fails immediately
    if metric not in metric_position:
        raise ValueError(
            "Invalid metric specified. Choose from 'auroc', 'fpr95', or 'accuracy'."
        )
    lower_is_better = metric == "fpr95"

    best_temperature = None
    best_epsilon = None
    best_metric_value = float("inf") if lower_is_better else -float("inf")

    for temperature in temperature_range:
        for epsilon in epsilon_range:
            auroc, fpr95, accuracy = evaluate_odin(
                model, id_inputs, ood_inputs, temperature, replace_labels, epsilon
            )
            print(
                f"Temperature: {temperature}, Epsilon: {epsilon}, AUROC: {auroc}, FPR95: {fpr95}, Accuracy: {accuracy}"
            )

            current_metric_value = (auroc, fpr95, accuracy)[metric_position[metric]]
            if lower_is_better:
                improved = current_metric_value < best_metric_value
            else:
                improved = current_metric_value > best_metric_value

            # Strict comparison: on ties the earliest pair is kept
            if improved:
                best_metric_value = current_metric_value
                best_temperature = temperature
                best_epsilon = epsilon

    print(
        f"Optimal Temperature: {best_temperature}, Optimal Epsilon: {best_epsilon} with {metric.upper()}: {best_metric_value}"
    )
    return best_temperature, best_epsilon

Can I set different temperature and epsilon for ID and OOD?

In [None]:
temperature_range = np.linspace(1.0, 10.0, num=10)
epsilon_range = np.linspace(0.0, 0.1, num=10)

monitor_metric = "accuracy"
results = {}

# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

# NOTE(review): the "surr" and "rand" label arrays were built for the activation
# samples; confirm they have one entry per input image before using them here.
labels = {"gt": None, "surr": y_surrogate_combined_labels, "rand": y_random}

for key, value in labels.items():
    replace_labels = value
    print(f"Replace Labels: {key}")

    # Find the optimal temperature and epsilon for ODIN based on `monitor_metric`
    optimal_temperature, optimal_epsilon = find_optimal_odin_params(
        model,
        x_sample_train_tensor,
        x_sample_test_tensor,
        temperature_range,
        epsilon_range,
        replace_labels=replace_labels,
        metric=monitor_metric,
    )
    #  2.0 0.03333333333333333
    # Evaluate ODIN with the optimal temperature and epsilon.
    # evaluate_odin's parameter is named `epsilon`; passing `optimal_epsilon=`
    # raised a TypeError.
    auroc_odin, fpr95_odin, accuracy_odin = evaluate_odin(
        model,
        x_sample_train_tensor,
        x_sample_test_tensor,
        optimal_temperature,
        replace_labels=replace_labels,
        epsilon=optimal_epsilon,
    )
    print(f"ODIN - AUROC: {auroc_odin}, FPR95: {fpr95_odin}, Accuracy: {accuracy_odin}")

    # Store the ODIN metrics. They were previously overwritten with `accuracy`,
    # a leftover variable from the ASH cells.
    results[key] = {
        "AUROC": auroc_odin,
        "FPR at 95% TPR": fpr95_odin,
        "Accuracy": accuracy_odin,
    }

# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

In [None]:
# Display the metrics collected in `results`
results

#### MSP, Energy, Softmax

In [None]:
import numpy as np
import torch
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve

# Define all the evaluation functions


# Maximum Softmax Probability (MSP)
def evaluate_msp(model, id_inputs, ood_inputs, replace_labels=None):
    """
    Score ID/OOD inputs with the Maximum Softmax Probability and compute metrics.

    Puts ``model`` in eval mode and runs it without gradient tracking.

    Parameters:
    - model: Model mapping a batch of inputs to a 2-D tensor of logits.
    - id_inputs / ood_inputs: Input batches for the ID and OOD sets.
    - replace_labels: Optional label array for the concatenated [ID, OOD] scores.
      When None (default), ID rows are labelled 1 and OOD rows 0. This used to
      be read from a global variable and the override branch referenced an
      undefined name.

    Returns:
    - Tuple ``(auroc, fpr95, accuracy)``.
    """
    model.eval()
    with torch.no_grad():
        id_outputs = model(id_inputs)
        ood_outputs = model(ood_inputs)

        id_scores = torch.max(torch.softmax(id_outputs, dim=1), dim=1)[0].cpu().numpy()
        ood_scores = (
            torch.max(torch.softmax(ood_outputs, dim=1), dim=1)[0].cpu().numpy()
        )

    combined_scores = np.concatenate([id_scores, ood_scores])

    if replace_labels is None:
        # Ground truth: 1 for ID, 0 for OOD, in the same order as the scores
        combined_labels = np.concatenate(
            [np.ones_like(id_scores), np.zeros_like(ood_scores)]
        )
    else:
        combined_labels = replace_labels

    auroc = roc_auc_score(combined_labels, combined_scores)

    # FPR at the first operating point whose TPR reaches 95%
    fpr, tpr, _ = roc_curve(combined_labels, combined_scores)
    fpr95 = fpr[np.where(tpr >= 0.95)[0][0]]

    # Accuracy of the rule "score >= mean score => positive"
    predicted_labels = (combined_scores >= combined_scores.mean()).astype(int)
    accuracy = accuracy_score(combined_labels, predicted_labels)

    return auroc, fpr95, accuracy


def _get_energy_score(logits, temperature=2):
    scores = -(temperature * torch.logsumexp(logits / temperature, dim=1)).cpu().numpy()
    return scores


def evaluate_energy(model, id_inputs, ood_inputs, temperature=2, replace_labels=None):
    """
    Score ID/OOD inputs with the energy score and compute detection metrics.

    Puts ``model`` in eval mode and runs it without gradient tracking.

    Parameters:
    - model: Model mapping a batch of inputs to a 2-D tensor of logits.
    - id_inputs / ood_inputs: Input batches for the ID and OOD sets.
    - temperature: Temperature forwarded to ``_get_energy_score``.
    - replace_labels: Optional label array for the concatenated [ID, OOD] scores.
      When None (default), ID rows are labelled 1 and OOD rows 0.

    Returns:
    - Tuple ``(auroc, fpr95, accuracy)``.
    """
    model.eval()
    with torch.no_grad():
        id_outputs = model(id_inputs)
        ood_outputs = model(ood_inputs)

        # _get_energy_score returns the energy -T*logsumexp(f/T). The negated
        # energy is the score that should be higher for in-distribution data,
        # which matches the ID=1 labelling below. Using the raw energy inverts
        # the ranking.
        id_scores = -_get_energy_score(id_outputs, temperature)
        ood_scores = -_get_energy_score(ood_outputs, temperature)

    combined_scores = np.concatenate([id_scores, ood_scores])

    if replace_labels is None:
        combined_labels = np.concatenate(
            [np.ones_like(id_scores), np.zeros_like(ood_scores)]
        )
    else:
        combined_labels = replace_labels

    auroc = roc_auc_score(combined_labels, combined_scores)

    # FPR at the first operating point whose TPR reaches 95%
    fpr, tpr, _ = roc_curve(combined_labels, combined_scores)
    fpr95 = fpr[np.where(tpr >= 0.95)[0][0]]

    # Accuracy of the rule "score >= mean score => positive"
    predicted_labels = (combined_scores >= combined_scores.mean()).astype(int)
    accuracy = accuracy_score(combined_labels, predicted_labels)

    return auroc, fpr95, accuracy


def find_optimal_temperature(
    model, id_inputs, ood_inputs, temperature_range, metric="auroc"
):
    """
    Grid-search the energy-score temperature that optimises ``metric``.

    Parameters:
    - model, id_inputs, ood_inputs: Forwarded to ``evaluate_energy``.
    - temperature_range: Iterable of candidate temperatures.
    - metric: 'auroc' or 'accuracy' (maximised) or 'fpr95' (minimised).

    Returns:
    - The best temperature, or None when ``temperature_range`` is empty.

    Raises:
    - ValueError: if ``metric`` is not one of the supported names.
    """
    metric_position = {"auroc": 0, "fpr95": 1, "accuracy": 2}
    # Validate before any model evaluation so a typo fails immediately
    if metric not in metric_position:
        raise ValueError(
            "Invalid metric specified. Choose from 'auroc', 'fpr95', or 'accuracy'."
        )
    lower_is_better = metric == "fpr95"

    best_temperature = None
    best_metric_value = float("inf") if lower_is_better else -float("inf")

    for temperature in temperature_range:
        auroc, fpr95, accuracy = evaluate_energy(
            model, id_inputs, ood_inputs, temperature
        )
        print(
            f"Temperature: {temperature}, AUROC: {auroc}, FPR95: {fpr95}, Accuracy: {accuracy}"
        )

        current_metric_value = (auroc, fpr95, accuracy)[metric_position[metric]]
        if lower_is_better:
            improved = current_metric_value < best_metric_value
        else:
            improved = current_metric_value > best_metric_value

        # Strict comparison: on ties the earliest candidate is kept
        if improved:
            best_metric_value = current_metric_value
            best_temperature = temperature

    print(
        f"Optimal Temperature: {best_temperature} with {metric.upper()}: {best_metric_value}"
    )
    return best_temperature


# Softmax Method
def evaluate_softmax(model, id_inputs, ood_inputs):
    """Score ID-vs-OOD separation from the model's softmax outputs.

    The model is put in eval mode and run without gradients on both input
    batches. ID samples are labelled 1 and OOD samples 0.

    Returns:
        A tuple ``(mean_auroc, fpr95, accuracy)`` where
        - ``mean_auroc`` averages the AUROC obtained when each softmax column
          in turn is used as the ID-vs-OOD score,
        - ``fpr95`` is the false-positive rate at the first ROC point whose
          TPR is at least 0.95, using the max softmax probability as score,
        - ``accuracy`` thresholds the max softmax probability at its mean over
          all (ID + OOD) samples.
    """
    model.eval()
    with torch.no_grad():
        id_logits = model(id_inputs)
        ood_logits = model(ood_inputs)
        id_probs = torch.softmax(id_logits, dim=1).cpu().numpy()
        ood_probs = torch.softmax(ood_logits, dim=1).cpu().numpy()

    probs = np.concatenate([id_probs, ood_probs])
    labels = np.concatenate([np.ones(len(id_probs)), np.zeros(len(ood_probs))])

    # One AUROC per softmax column, then the plain average over columns.
    num_classes = probs.shape[1]
    mean_auroc = np.mean(
        [roc_auc_score(labels, probs[:, cls]) for cls in range(num_classes)]
    )

    # Max softmax probability drives FPR95 and the thresholded accuracy.
    max_prob = np.max(probs, axis=1)

    fpr, tpr, thresholds = roc_curve(labels, max_prob)
    first_at_95 = np.where(tpr >= 0.95)[0][0]
    fpr95 = fpr[first_at_95]

    cutoff = max_prob.mean()
    is_id_pred = (max_prob >= cutoff).astype(int)
    accuracy = accuracy_score(labels, is_id_pred)

    return mean_auroc, fpr95, accuracy

In [None]:
# Compare the logit-based OOD baselines (MSP, energy, softmax) on the same
# ID/OOD inputs and collect their metrics in `results`.
model.eval()

id_inputs_ = id_inputs  # _subset
ood_inputs_ = ood_inputs

results = {}

# Evaluate MSP
auroc_msp, fpr95_msp, accuracy_msp = evaluate_msp(model, id_inputs_, ood_inputs_)
print(f"MSP - AUROC: {auroc_msp}, FPR95: {fpr95_msp}, Accuracy: {accuracy_msp}")
results["MSP"] = {
    "AUROC": auroc_msp,
    "FPR at 95% TPR": fpr95_msp,
    "Accuracy": accuracy_msp,
}

# Evaluate Energy with a fixed temperature.
# To search for the temperature instead of hard-coding it:
# temperature_range = np.linspace(1, 50.0, num=10)
# optimal_temperature = find_optimal_temperature(model, id_inputs, ood_inputs, temperature_range, metric='accuracy')
auroc_energy, fpr95_energy, accuracy_energy = evaluate_energy(
    model, id_inputs_, ood_inputs_, temperature=39
)
print(
    f"Energy - AUROC: {auroc_energy}, FPR95: {fpr95_energy}, Accuracy: {accuracy_energy}"
)
# Store the energy metrics alongside the other baselines; previously they were
# printed but missing from `results`.
results["Energy"] = {
    "AUROC": auroc_energy,
    "FPR at 95% TPR": fpr95_energy,
    "Accuracy": accuracy_energy,
}

# Evaluate Softmax Method
auroc_softmax, fpr95_softmax, accuracy_softmax = evaluate_softmax(
    model, id_inputs_, ood_inputs_
)
print(
    f"Softmax - AUROC: {auroc_softmax}, FPR95: {fpr95_softmax}, Accuracy: {accuracy_softmax}"
)
results["Softmax"] = {
    "AUROC": auroc_softmax,
    "FPR at 95% TPR": fpr95_softmax,
    "Accuracy": accuracy_softmax,
}

print("-----------results", results)

### Downsample Benchmark

In [None]:
# Downsample benchmark: rebuild the feature matrix once per downsampling method
# and store the matching g-experiment result in `downsample_benchmark`.
# NOTE(review): the loop rebinds the notebook-level X and y, so any X, y computed
# in an earlier cell are overwritten and later cells see the arrays of the last
# iteration — confirm this is intended.
for downsample in downsample_methods:
    print(f"Running experiments for {downsample} downsample method.")
    X, y, test_property_lengths = create_feature_matrix_and_labels(
        model=model,
        dm=datamodule,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        layer_names=collect_activations_from_layers,
        device=device,
        getitem_keys=getitem_keys,
        n_batches_to_process=n_batches_to_process,
        downsample_method=downsample,
        verbose=verbose,
    )
    print("Run g experiment for downsample method:", downsample)
    g_benchmark = run_g_experiment(
        X, y, split_seed, test_size, n_estimators, fixed_classifier_seed, clf=None
    )
    # Key results by "<class_name>_<downsample method>".
    dict_key = datamodule.class_name + "_" + downsample
    downsample_benchmark[dict_key] = g_benchmark

In [None]:
# Plot the results accumulated in `downsample_benchmark`; the datamodule's
# class name is passed as the second argument.
plot_downsample_benchmark(downsample_benchmark, datamodule.class_name)

### Layer Benchmark

In [None]:
# Names of all layers of the current model; filtered for conv layers below and
# also passed to plot_layer_benchmark.
all_layer_names = get_all_layer_names(model)


def pick_random_layers(layers, n, name_filter="conv"):
    """Pick the first, the last and a random sample of the layers in between.

    Only layers whose name contains ``name_filter`` are considered. The
    function works on the ``layers`` argument itself instead of relying on a
    module-level ``conv_layers`` list being defined before the call.

    Args:
        layers: Ordered sequence of layer names.
        n: Target number of layers. At most ``n - 2`` middle layers are sampled
            with ``random.sample``; the first and last candidates are always
            included.
        name_filter: Substring a layer name must contain to be a candidate.

    Returns:
        A list ``[first, *sampled_middle, last]``. If only one layer matches,
        a one-element list is returned rather than that layer twice.

    Raises:
        ValueError: If no layer matches ``name_filter``, or if ``n < 2`` while
            at least two layers match (raised by ``random.sample``).
    """
    candidates = [name for name in layers if name_filter in name]
    if not candidates:
        raise ValueError(f"No layer name contains {name_filter!r}.")
    if len(candidates) == 1:
        # First and last coincide; avoid benchmarking the same layer twice.
        return list(candidates)

    middle_layers = candidates[1:-1]
    random_layers = random.sample(middle_layers, min(n - 2, len(middle_layers)))
    return [candidates[0]] + random_layers + [candidates[-1]]


# Keep only layers whose name contains "conv", pick 10 of them, then wrap each
# name in its own list because the layer loop passes each entry as `layer_names`.
conv_layers = [layer for layer in all_layer_names if "conv" in layer]
selected_layers = pick_random_layers(all_layer_names, 10)
selected_layers = [[layer] for layer in selected_layers]

In [None]:
# Layer benchmark: rebuild the feature matrix from one selected layer at a time
# and store the matching g-experiment result in `layer_benchmark`.
# NOTE(review): the loop rebinds the notebook-level X and y, so later cells see
# the arrays of the last selected layer — confirm this is intended.
for lyr in selected_layers:
    print(f"Running experiments for layer {lyr}")
    X, y, test_property_lengths = create_feature_matrix_and_labels(
        model=model,
        dm=datamodule,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        layer_names=lyr,
        device=device,
        getitem_keys=getitem_keys,
        n_batches_to_process=n_batches_to_process,
        downsample_method=downsample_method,
        verbose=verbose,
    )

    g_benchmark = run_g_experiment(
        X, y, split_seed, test_size, n_estimators, fixed_classifier_seed, clf=None
    )
    # Key results by "<class_name>_<str(lyr)>".
    dict_key = datamodule.class_name + "_" + str(lyr)

    layer_benchmark[dict_key] = g_benchmark

In [None]:
# Plot the results accumulated in `layer_benchmark`; the full list of layer
# names and the datamodule's class name are passed alongside.
plot_layer_benchmark(layer_benchmark, all_layer_names, datamodule.class_name)

### g confidence interval

In [None]:
# Run the multi-run g experiment on the current X, y. The result feeds the
# confidence-interval and g vs. g_hat comparison cells below.
g_benchmark = run_multiple_experiments_g(
    X, y, test_size, n_estimators, random_seed=True
)

In [None]:
# Calculate confidence intervals for the specified columns
columns_of_interest = ["baseline_accuracy", "baseline_fpr95", "baseline_roc_auc"]
confidence_intervals_g = calculate_confidence_intervals(
    g_benchmark, columns_of_interest
)
# Bare name: displayed as the notebook cell output.
confidence_intervals_g

### g_hat

In [None]:
# Run the Optuna study for g_hat on the current X, y. The result is converted
# to records in the next cell, where "M" is read from it.
results_df_g_hat = run_optuna_study(
    X,
    y,
    n_optuna_trials,
    test_size,
    min_cluster,
    max_cluster_ratio,
    min_fraction,
    max_fraction,
    n_estimators,
    fixed_seed,
)

In [None]:
# N is passed to run_multiple_experiments_g_hat below (presumably the number of
# repeated runs — TODO confirm).
N = 10
results_dict = results_df_g_hat.to_dict(orient="records")
# k is fixed to 30% of the number of samples rather than taken from the study.
k = int(len(X) * 0.3)
# M comes from the first record of the study results (presumably the best
# trial — TODO confirm the ordering).
M = results_dict[0]["M"]
print("k, M, len(X), k/len(X)", k, M, len(X), k / len(X))

g_hat_benchmark = run_multiple_experiments_g_hat(X, y, test_size, k, M, N, fixed_seed)

### g g_hat mean std + ttest

In [None]:
# Compare the g and g_hat benchmark results; the bare name displays the
# outcome as the notebook cell output.
t_tests_results = perform_benchmark_analysis(g_benchmark, g_hat_benchmark)
t_tests_results

### Benchmark g

In [None]:
# Candidate classifiers keyed by name. All seeded estimators use
# random_state=42; class_weight="balanced" is set where it is passed below.
# NOTE(review): the key "RandomForestUnblanaced" is misspelled; it is a runtime
# label, so it is left unchanged here.
classifiers = {
    "RandomForestUnblanaced": RandomForestClassifier(n_estimators=100, random_state=42),
    "RandomForest": RandomForestClassifier(
        n_estimators=100, class_weight="balanced", random_state=42
    ),
    "SVC": SVC(probability=True, class_weight="balanced", random_state=42),
    "KNeighbors": KNeighborsClassifier(),
    "LogisticRegression": LogisticRegression(
        class_weight="balanced", max_iter=500, random_state=42
    ),
    "DecisionTree": DecisionTreeClassifier(class_weight="balanced", random_state=42),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "ExtraTrees": ExtraTreesClassifier(
        n_estimators=100, class_weight="balanced", random_state=42
    ),
    "GaussianNB": GaussianNB(),
}

# Benchmark every classifier on the current X, y with the k and M set earlier.
classifier_benchmark_df = benchmark_classifiers(
    X, y, test_size, k, M, classifiers, fixed_seed
)

### Benchmark clustering

In [None]:
# Clustering methods to compare, keyed by name: one KMeans configuration and
# DBSCAN at three eps values.
clustering_methods = {
    "KMeans": KMeans(n_clusters=3, init="k-means++", random_state=42),
    "DBSCAN_eps_0.1": DBSCAN(eps=0.1, min_samples=5),
    "DBSCAN_eps_0.2": DBSCAN(eps=0.2, min_samples=5),
    "DBSCAN_eps_0.5": DBSCAN(eps=0.5, min_samples=5),
}

# A fixed classifier
classifier = RandomForestClassifier(
    n_estimators=100, class_weight="balanced", random_state=42
)

# Benchmark every clustering method with the same classifier, k and M.
clustering_benchmark_df = benchmark_clustering_methods(
    X, y, test_size, k, M, clustering_methods, classifier
)

### Investigate k wrt g-g_hat

In [None]:
# Investigate g_hat for varying k, passing in the g confidence intervals
# computed earlier. The plot is saved, with the configured class name as `fname`.
print("confidence_intervals_g", confidence_intervals_g)

df_results = benchmark_kmeans_with_varying_k_condidence_g_hat(
    X,
    y,
    M,
    test_size=0.2,
    n_runs=10,
    confidence_level=0.95,
    confidence_intervals_g=confidence_intervals_g,
    clf=None,
    save_plot=True,
    fname=cfg_dict["data"]["class_name"],
)

### Feature Level Visualization: Understand how clustering alters the structure of the feature space

In [None]:
# Run a single g_hat experiment (last argument is i + 1 = 1) and keep its
# training features, labels and cluster labels for the t-SNE plot.
i = 0
_, X_train_cluster, y_train_cluster, y_clusters = run_g_hat_experiment(
    X, y, test_size, k, M, fixed_seed, fixed_seed, i + 1
)
print("Plotting")
plot_tsne_with_label_changes(
    X_train_cluster,
    y_train_cluster,
    y_clusters,
    class_name=cfg_dict["data"]["class_name"],
    save_plot=True,
)

# HerbaceousVegetation

### Start

In [None]:
# Load the HerbaceousVegetation hold-out config and checkpoint. This rebinds
# the notebook-level model, datamodule, dataloaders, cfg_dict and X, y that the
# cells below use.
X, y, model, datamodule, train_dataloader, val_dataloader, test_dataloader, cfg_dict = (
    get_X_y_arrays(
        paths["herb_veg"]["config"],
        paths["herb_veg"]["ckpt"],
        layer,
        downsample_method,
        getitem_keys,
        device,
        n_batches_to_process,
        mode="holdout",
        verbose=False,
    )
)

### Downsample Benchmark

In [None]:
# Downsample benchmark: rebuild the feature matrix once per downsampling method
# and store the matching g-experiment result in `downsample_benchmark`.
# NOTE(review): the loop rebinds the notebook-level X and y, so the arrays from
# the "Start" cell are overwritten and later cells see the last iteration's
# arrays — confirm this is intended.
for downsample in downsample_methods:
    print(f"Running experiments for {downsample} downsample method.")
    X, y, test_property_lengths = create_feature_matrix_and_labels(
        model=model,
        dm=datamodule,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        layer_names=collect_activations_from_layers,
        device=device,
        getitem_keys=getitem_keys,
        n_batches_to_process=n_batches_to_process,
        downsample_method=downsample,
        verbose=verbose,
    )
    print("Run g experiment for downsample method:", downsample)
    g_benchmark = run_g_experiment(
        X, y, split_seed, test_size, n_estimators, fixed_classifier_seed, clf=None
    )
    # Key results by "<class_name>_<downsample method>".
    dict_key = datamodule.class_name + "_" + downsample
    downsample_benchmark[dict_key] = g_benchmark

In [None]:
# Plot the results accumulated in `downsample_benchmark`; the datamodule's
# class name is passed as the second argument.
plot_downsample_benchmark(downsample_benchmark, datamodule.class_name)

### Layer Benchmark

In [None]:
# Layer benchmark: rebuild the feature matrix from one selected layer at a time
# and store the matching g-experiment result in `layer_benchmark`.
# NOTE(review): `selected_layers` is not recomputed in this section, and the
# loop rebinds the notebook-level X and y so later cells see the arrays of the
# last selected layer — confirm both are intended.
for lyr in selected_layers:
    print(f"Running experiments for layer {lyr}")
    X, y, test_property_lengths = create_feature_matrix_and_labels(
        model=model,
        dm=datamodule,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        layer_names=lyr,
        device=device,
        getitem_keys=getitem_keys,
        n_batches_to_process=n_batches_to_process,
        downsample_method=downsample_method,
        verbose=verbose,
    )

    g_benchmark = run_g_experiment(
        X, y, split_seed, test_size, n_estimators, fixed_classifier_seed, clf=None
    )
    # Key results by "<class_name>_<str(lyr)>".
    dict_key = datamodule.class_name + "_" + str(lyr)

    layer_benchmark[dict_key] = g_benchmark

In [None]:
# Plot the results accumulated in `layer_benchmark`; the full list of layer
# names and the datamodule's class name are passed alongside.
plot_layer_benchmark(layer_benchmark, all_layer_names, datamodule.class_name)

### g confidence interval

In [None]:
# Run the multi-run g experiment on the current X, y. The result feeds the
# confidence-interval and g vs. g_hat comparison cells below.
g_benchmark = run_multiple_experiments_g(
    X, y, test_size, n_estimators, random_seed=True
)

In [None]:
# Calculate confidence intervals for the specified columns
columns_of_interest = ["baseline_accuracy", "baseline_fpr95", "baseline_roc_auc"]
confidence_intervals_g = calculate_confidence_intervals(
    g_benchmark, columns_of_interest
)
# Bare name: displayed as the notebook cell output.
confidence_intervals_g

### g_hat

In [None]:
# Run the Optuna study for g_hat on the current X, y. The result is converted
# to records in the next cell, where "M" is read from it.
results_df_g_hat = run_optuna_study(
    X,
    y,
    n_optuna_trials,
    test_size,
    min_cluster,
    max_cluster_ratio,
    min_fraction,
    max_fraction,
    n_estimators,
    fixed_seed,
)

In [None]:
# N is passed to run_multiple_experiments_g_hat below (presumably the number of
# repeated runs — TODO confirm).
N = 10
results_dict = results_df_g_hat.to_dict(orient="records")
# k is fixed to 30% of the number of samples rather than taken from the study.
k = int(len(X) * 0.3)
# M comes from the first record of the study results (presumably the best
# trial — TODO confirm the ordering).
M = results_dict[0]["M"]
print("k, M, len(X), k/len(X)", k, M, len(X), k / len(X))

g_hat_benchmark = run_multiple_experiments_g_hat(X, y, test_size, k, M, N, fixed_seed)

### g g_hat mean std + ttest

In [None]:
# Compare the g and g_hat benchmark results; the bare name displays the
# outcome as the notebook cell output.
t_tests_results = perform_benchmark_analysis(g_benchmark, g_hat_benchmark)
t_tests_results

### Benchmark g

In [None]:
# Candidate classifiers keyed by name. All seeded estimators use
# random_state=42; class_weight="balanced" is set where it is passed below.
# NOTE(review): the key "RandomForestUnblanaced" is misspelled; it is a runtime
# label, so it is left unchanged here.
classifiers = {
    "RandomForestUnblanaced": RandomForestClassifier(n_estimators=100, random_state=42),
    "RandomForest": RandomForestClassifier(
        n_estimators=100, class_weight="balanced", random_state=42
    ),
    "SVC": SVC(probability=True, class_weight="balanced", random_state=42),
    "KNeighbors": KNeighborsClassifier(),
    "LogisticRegression": LogisticRegression(
        class_weight="balanced", max_iter=500, random_state=42
    ),
    "DecisionTree": DecisionTreeClassifier(class_weight="balanced", random_state=42),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "ExtraTrees": ExtraTreesClassifier(
        n_estimators=100, class_weight="balanced", random_state=42
    ),
    "GaussianNB": GaussianNB(),
}

# Benchmark every classifier on the current X, y with the k and M set earlier.
classifier_benchmark_df = benchmark_classifiers(
    X, y, test_size, k, M, classifiers, fixed_seed
)

### Benchmark clustering

In [None]:
# Clustering methods to compare, keyed by name: one KMeans configuration and
# DBSCAN at three eps values.
clustering_methods = {
    "KMeans": KMeans(n_clusters=3, init="k-means++", random_state=42),
    "DBSCAN_eps_0.1": DBSCAN(eps=0.1, min_samples=5),
    "DBSCAN_eps_0.2": DBSCAN(eps=0.2, min_samples=5),
    "DBSCAN_eps_0.5": DBSCAN(eps=0.5, min_samples=5),
}

# A fixed classifier
classifier = RandomForestClassifier(
    n_estimators=100, class_weight="balanced", random_state=42
)
# Benchmark every clustering method with the same classifier, k and M.
clustering_benchmark_df = benchmark_clustering_methods(
    X, y, test_size, k, M, clustering_methods, classifier
)

### Investigate k wrt g-g_hat

In [None]:
# Investigate g_hat for varying k, passing in the g confidence intervals
# computed earlier. The plot is saved, with the configured class name as `fname`.
print("confidence_intervals_g: ", confidence_intervals_g)

df_results = benchmark_kmeans_with_varying_k_condidence_g_hat(
    X,
    y,
    M,
    test_size=0.2,
    n_runs=10,
    confidence_level=0.95,
    confidence_intervals_g=confidence_intervals_g,
    clf=None,
    save_plot=True,
    fname=cfg_dict["data"]["class_name"],
)

### Feature Level Visualization: Understand how clustering alters the structure of the feature space

In [None]:
# Run a single g_hat experiment (last argument is i + 1 = 1) and keep its
# training features, labels and cluster labels for the t-SNE plot.
i = 0
_, X_train_cluster, y_train_cluster, y_clusters = run_g_hat_experiment(
    X, y, test_size, k, M, fixed_seed, fixed_seed, i + 1
)
print("Plotting")
plot_tsne_with_label_changes(
    X_train_cluster,
    y_train_cluster,
    y_clusters,
    # Subscript access, as in every other cell that reads cfg_dict; attribute
    # access (cfg_dict.data) fails when cfg_dict is a plain dict.
    class_name=cfg_dict["data"]["class_name"],
    save_plot=True,
)

# Highway

### Start

In [None]:
# Load the Highway hold-out config and checkpoint. This rebinds the
# notebook-level model, datamodule, dataloaders, cfg_dict and X, y that the
# cells below use.
X, y, model, datamodule, train_dataloader, val_dataloader, test_dataloader, cfg_dict = (
    get_X_y_arrays(
        paths["highway"]["config"],
        paths["highway"]["ckpt"],
        layer,
        downsample_method,
        getitem_keys,
        device,
        n_batches_to_process,
        mode="holdout",
        verbose=False,
    )
)

### Downsample Benchmark

In [None]:
# Downsample benchmark: rebuild the feature matrix once per downsampling method
# and store the matching g-experiment result in `downsample_benchmark`.
# NOTE(review): the loop rebinds the notebook-level X and y, so the arrays from
# the "Start" cell are overwritten and later cells see the last iteration's
# arrays — confirm this is intended.
for downsample in downsample_methods:
    print(f"Running experiments for {downsample} downsample method.")
    X, y, test_property_lengths = create_feature_matrix_and_labels(
        model=model,
        dm=datamodule,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        layer_names=collect_activations_from_layers,
        device=device,
        getitem_keys=getitem_keys,
        n_batches_to_process=n_batches_to_process,
        downsample_method=downsample,
        verbose=verbose,
    )
    print("Run g experiment for downsample method:", downsample)
    g_benchmark = run_g_experiment(
        X, y, split_seed, test_size, n_estimators, fixed_classifier_seed, clf=None
    )
    # Key results by "<class_name>_<downsample method>".
    dict_key = datamodule.class_name + "_" + downsample
    downsample_benchmark[dict_key] = g_benchmark

In [None]:
# Plot the results accumulated in `downsample_benchmark`; the datamodule's
# class name is passed as the second argument.
plot_downsample_benchmark(downsample_benchmark, datamodule.class_name)

### Layer Benchmark

In [None]:
# Layer benchmark: rebuild the feature matrix from one selected layer at a time
# and store the matching g-experiment result in `layer_benchmark`.
# NOTE(review): `selected_layers` is not recomputed in this section, and the
# loop rebinds the notebook-level X and y so later cells see the arrays of the
# last selected layer — confirm both are intended.
for lyr in selected_layers:
    print(f"Running experiments for layer {lyr}")
    X, y, test_property_lengths = create_feature_matrix_and_labels(
        model=model,
        dm=datamodule,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        layer_names=lyr,
        device=device,
        getitem_keys=getitem_keys,
        n_batches_to_process=n_batches_to_process,
        downsample_method=downsample_method,
        verbose=verbose,
    )

    g_benchmark = run_g_experiment(
        X, y, split_seed, test_size, n_estimators, fixed_classifier_seed, clf=None
    )
    # Key results by "<class_name>_<str(lyr)>".
    dict_key = datamodule.class_name + "_" + str(lyr)

    layer_benchmark[dict_key] = g_benchmark

In [None]:
# Plot the results accumulated in `layer_benchmark`; the full list of layer
# names and the datamodule's class name are passed alongside.
plot_layer_benchmark(layer_benchmark, all_layer_names, datamodule.class_name)

### g confidence interval

In [None]:
# Run the multi-run g experiment on the current X, y. The result feeds the
# confidence-interval and g vs. g_hat comparison cells below.
g_benchmark = run_multiple_experiments_g(
    X, y, test_size, n_estimators, random_seed=True
)

In [None]:
# Calculate confidence intervals for the specified columns
columns_of_interest = ["baseline_accuracy", "baseline_fpr95", "baseline_roc_auc"]
confidence_intervals_g = calculate_confidence_intervals(
    g_benchmark, columns_of_interest
)
# Bare name: displayed as the notebook cell output.
confidence_intervals_g

### g_hat

In [None]:
# Run the Optuna study for g_hat on the current X, y. The result is converted
# to records in the next cell, where "M" is read from it.
results_df_g_hat = run_optuna_study(
    X,
    y,
    n_optuna_trials,
    test_size,
    min_cluster,
    max_cluster_ratio,
    min_fraction,
    max_fraction,
    n_estimators,
    fixed_seed,
)

In [None]:
# N is passed to run_multiple_experiments_g_hat below (presumably the number of
# repeated runs — TODO confirm).
N = 10
results_dict = results_df_g_hat.to_dict(orient="records")
# k is fixed to 30% of the number of samples rather than taken from the study.
k = int(len(X) * 0.3)
# M comes from the first record of the study results (presumably the best
# trial — TODO confirm the ordering).
M = results_dict[0]["M"]
print("k, M, len(X), k/len(X)", k, M, len(X), k / len(X))

g_hat_benchmark = run_multiple_experiments_g_hat(X, y, test_size, k, M, N, fixed_seed)

### g g_hat mean std + ttest

In [None]:
# Compare the g and g_hat benchmark results; the bare name displays the
# outcome as the notebook cell output.
t_tests_results = perform_benchmark_analysis(g_benchmark, g_hat_benchmark)
t_tests_results

### Benchmark g

In [None]:
# Candidate classifiers keyed by name. All seeded estimators use
# random_state=42; class_weight="balanced" is set where it is passed below.
# NOTE(review): the key "RandomForestUnblanaced" is misspelled; it is a runtime
# label, so it is left unchanged here.
classifiers = {
    "RandomForestUnblanaced": RandomForestClassifier(n_estimators=100, random_state=42),
    "RandomForest": RandomForestClassifier(
        n_estimators=100, class_weight="balanced", random_state=42
    ),
    "SVC": SVC(probability=True, class_weight="balanced", random_state=42),
    "KNeighbors": KNeighborsClassifier(),
    "LogisticRegression": LogisticRegression(
        class_weight="balanced", max_iter=500, random_state=42
    ),
    "DecisionTree": DecisionTreeClassifier(class_weight="balanced", random_state=42),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "ExtraTrees": ExtraTreesClassifier(
        n_estimators=100, class_weight="balanced", random_state=42
    ),
    "GaussianNB": GaussianNB(),
}

# Benchmark every classifier on the current X, y with the k and M set earlier.
classifier_benchmark_df = benchmark_classifiers(
    X, y, test_size, k, M, classifiers, fixed_seed
)

### Benchmark clustering

In [None]:
# Clustering methods to compare, keyed by name: one KMeans configuration and
# DBSCAN at three eps values.
clustering_methods = {
    "KMeans": KMeans(n_clusters=3, init="k-means++", random_state=42),
    "DBSCAN_eps_0.1": DBSCAN(eps=0.1, min_samples=5),
    "DBSCAN_eps_0.2": DBSCAN(eps=0.2, min_samples=5),
    "DBSCAN_eps_0.5": DBSCAN(eps=0.5, min_samples=5),
}

# A fixed classifier
classifier = RandomForestClassifier(
    n_estimators=100, class_weight="balanced", random_state=42
)

# Benchmark every clustering method with the same classifier, k and M.
clustering_benchmark_df = benchmark_clustering_methods(
    X, y, test_size, k, M, clustering_methods, classifier
)

### Investigate k wrt g-g_hat

In [None]:
# Investigate g_hat for varying k, passing in the g confidence intervals
# computed earlier. The plot is saved, with the configured class name as `fname`.
print("confidence_intervals_g: ", confidence_intervals_g)

df_results = benchmark_kmeans_with_varying_k_condidence_g_hat(
    X,
    y,
    M,
    test_size=0.2,
    n_runs=10,
    confidence_level=0.95,
    confidence_intervals_g=confidence_intervals_g,
    clf=None,
    save_plot=True,
    fname=cfg_dict["data"]["class_name"],
)

### Feature Level Visualization: Understand how clustering alters the structure of the feature space

In [None]:
# Run a single g_hat experiment (last argument is i + 1 = 1) and keep its
# training features, labels and cluster labels for the t-SNE plot.
i = 0
_, X_train_cluster, y_train_cluster, y_clusters = run_g_hat_experiment(
    X, y, test_size, k, M, fixed_seed, fixed_seed, i + 1
)
print("Plotting")
plot_tsne_with_label_changes(
    X_train_cluster,
    y_train_cluster,
    y_clusters,
    # Subscript access, as in every other cell that reads cfg_dict; attribute
    # access (cfg_dict.data) fails when cfg_dict is a plain dict.
    class_name=cfg_dict["data"]["class_name"],
    save_plot=True,
)

# Industrial

### Start

In [None]:
# Load the Industrial hold-out config and checkpoint. This rebinds the
# notebook-level model, datamodule, dataloaders, cfg_dict and X, y that the
# cells below use.
X, y, model, datamodule, train_dataloader, val_dataloader, test_dataloader, cfg_dict = (
    get_X_y_arrays(
        paths["industrial"]["config"],
        paths["industrial"]["ckpt"],
        layer,
        downsample_method,
        getitem_keys,
        device,
        n_batches_to_process,
        mode="holdout",
        verbose=False,
    )
)

### Downsample Benchmark

In [None]:
# Downsample benchmark: rebuild the feature matrix once per downsampling method
# and store the matching g-experiment result in `downsample_benchmark`.
# NOTE(review): the loop rebinds the notebook-level X and y, so the arrays from
# the "Start" cell are overwritten and later cells see the last iteration's
# arrays — confirm this is intended.
for downsample in downsample_methods:
    print(f"Running experiments for {downsample} downsample method.")
    X, y, test_property_lengths = create_feature_matrix_and_labels(
        model=model,
        dm=datamodule,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        layer_names=collect_activations_from_layers,
        device=device,
        getitem_keys=getitem_keys,
        n_batches_to_process=n_batches_to_process,
        downsample_method=downsample,
        verbose=verbose,
    )
    print("Run g experiment for downsample method:", downsample)
    g_benchmark = run_g_experiment(
        X, y, split_seed, test_size, n_estimators, fixed_classifier_seed, clf=None
    )
    # Key results by "<class_name>_<downsample method>".
    dict_key = datamodule.class_name + "_" + downsample
    downsample_benchmark[dict_key] = g_benchmark

In [None]:
# Plot the results accumulated in `downsample_benchmark`; the datamodule's
# class name is passed as the second argument.
plot_downsample_benchmark(downsample_benchmark, datamodule.class_name)

### Layer Benchmark

In [None]:
# Layer benchmark: rebuild the feature matrix from one selected layer at a time
# and store the matching g-experiment result in `layer_benchmark`.
# NOTE(review): `selected_layers` is not recomputed in this section, and the
# loop rebinds the notebook-level X and y so later cells see the arrays of the
# last selected layer — confirm both are intended.
for lyr in selected_layers:
    print(f"Running experiments for layer {lyr}")
    X, y, test_property_lengths = create_feature_matrix_and_labels(
        model=model,
        dm=datamodule,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        layer_names=lyr,
        device=device,
        getitem_keys=getitem_keys,
        n_batches_to_process=n_batches_to_process,
        downsample_method=downsample_method,
        verbose=verbose,
    )

    g_benchmark = run_g_experiment(
        X, y, split_seed, test_size, n_estimators, fixed_classifier_seed, clf=None
    )
    # Key results by "<class_name>_<str(lyr)>".
    dict_key = datamodule.class_name + "_" + str(lyr)

    layer_benchmark[dict_key] = g_benchmark

In [None]:
# Plot the results accumulated in `layer_benchmark`; the full list of layer
# names and the datamodule's class name are passed alongside.
plot_layer_benchmark(layer_benchmark, all_layer_names, datamodule.class_name)

### g confidence interval

In [None]:
# Run the multi-run g experiment on the current X, y. The result feeds the
# confidence-interval and g vs. g_hat comparison cells below.
g_benchmark = run_multiple_experiments_g(
    X, y, test_size, n_estimators, random_seed=True
)

In [None]:
# Calculate confidence intervals for the specified columns
columns_of_interest = ["baseline_accuracy", "baseline_fpr95", "baseline_roc_auc"]
confidence_intervals_g = calculate_confidence_intervals(
    g_benchmark, columns_of_interest
)
# Bare name: displayed as the notebook cell output.
confidence_intervals_g

### g_hat

In [None]:
# Run the Optuna study for g_hat on the current X, y. The result is converted
# to records in the next cell, where "M" is read from it.
results_df_g_hat = run_optuna_study(
    X,
    y,
    n_optuna_trials,
    test_size,
    min_cluster,
    max_cluster_ratio,
    min_fraction,
    max_fraction,
    n_estimators,
    fixed_seed,
)

In [None]:
# N is passed to run_multiple_experiments_g_hat below (presumably the number of
# repeated runs — TODO confirm).
N = 10
results_dict = results_df_g_hat.to_dict(orient="records")
# k is fixed to 30% of the number of samples rather than taken from the study.
k = int(len(X) * 0.3)
# M comes from the first record of the study results (presumably the best
# trial — TODO confirm the ordering).
M = results_dict[0]["M"]
print("k, M, len(X), k/len(X)", k, M, len(X), k / len(X))

g_hat_benchmark = run_multiple_experiments_g_hat(X, y, test_size, k, M, N, fixed_seed)

### g g_hat mean std + ttest

In [None]:
# Compare the g and g_hat benchmark results; the bare name displays the
# outcome as the notebook cell output.
t_tests_results = perform_benchmark_analysis(g_benchmark, g_hat_benchmark)
t_tests_results

### Benchmark g

In [None]:
# Candidate classifiers keyed by name. All seeded estimators use
# random_state=42; class_weight="balanced" is set where it is passed below.
# NOTE(review): the key "RandomForestUnblanaced" is misspelled; it is a runtime
# label, so it is left unchanged here.
classifiers = {
    "RandomForestUnblanaced": RandomForestClassifier(n_estimators=100, random_state=42),
    "RandomForest": RandomForestClassifier(
        n_estimators=100, class_weight="balanced", random_state=42
    ),
    "SVC": SVC(probability=True, class_weight="balanced", random_state=42),
    "KNeighbors": KNeighborsClassifier(),
    "LogisticRegression": LogisticRegression(
        class_weight="balanced", max_iter=500, random_state=42
    ),
    "DecisionTree": DecisionTreeClassifier(class_weight="balanced", random_state=42),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "ExtraTrees": ExtraTreesClassifier(
        n_estimators=100, class_weight="balanced", random_state=42
    ),
    "GaussianNB": GaussianNB(),
}

# Benchmark every classifier on the current X, y with the k and M set earlier.
classifier_benchmark_df = benchmark_classifiers(
    X, y, test_size, k, M, classifiers, fixed_seed
)

### Benchmark clustering

In [None]:
# Clustering methods to compare, keyed by name: one KMeans configuration and
# DBSCAN at three eps values.
clustering_methods = {
    "KMeans": KMeans(n_clusters=3, init="k-means++", random_state=42),
    "DBSCAN_eps_0.1": DBSCAN(eps=0.1, min_samples=5),
    "DBSCAN_eps_0.2": DBSCAN(eps=0.2, min_samples=5),
    "DBSCAN_eps_0.5": DBSCAN(eps=0.5, min_samples=5),
}

# A fixed classifier
classifier = RandomForestClassifier(
    n_estimators=100, class_weight="balanced", random_state=42
)

# Benchmark every clustering method with the same classifier, k and M.
clustering_benchmark_df = benchmark_clustering_methods(
    X, y, test_size, k, M, clustering_methods, classifier
)

### Investigate k wrt g-g_hat

In [None]:
# Investigate g_hat for varying k, passing in the g confidence intervals
# computed earlier. The plot is saved, with the configured class name as `fname`.
print("confidence_intervals_g: ", confidence_intervals_g)

df_results = benchmark_kmeans_with_varying_k_condidence_g_hat(
    X,
    y,
    M,
    test_size=0.2,
    n_runs=10,
    confidence_level=0.95,
    confidence_intervals_g=confidence_intervals_g,
    clf=None,
    save_plot=True,
    fname=cfg_dict["data"]["class_name"],
)

### Feature Level Visualization: Understand how clustering alters the structure of the feature space

In [None]:
# Run a single g_hat experiment (last argument is i + 1 = 1) and keep its
# training features, labels and cluster labels for the t-SNE plot.
i = 0
_, X_train_cluster, y_train_cluster, y_clusters = run_g_hat_experiment(
    X, y, test_size, k, M, fixed_seed, fixed_seed, i + 1
)
print("Plotting")
plot_tsne_with_label_changes(
    X_train_cluster,
    y_train_cluster,
    y_clusters,
    # Subscript access, as in every other cell that reads cfg_dict; attribute
    # access (cfg_dict.data) fails when cfg_dict is a plain dict.
    class_name=cfg_dict["data"]["class_name"],
    save_plot=True,
)

# Pasture

### Start

In [None]:
# Load the Pasture hold-out config and checkpoint. This rebinds the
# notebook-level model, datamodule, dataloaders, cfg_dict and X, y that the
# cells below use.
X, y, model, datamodule, train_dataloader, val_dataloader, test_dataloader, cfg_dict = (
    get_X_y_arrays(
        paths["pasture"]["config"],
        paths["pasture"]["ckpt"],
        layer,
        downsample_method,
        getitem_keys,
        device,
        n_batches_to_process,
        mode="holdout",
        verbose=False,
    )
)

### Downsample Benchmark

In [None]:
# Downsample benchmark: rebuild the feature matrix once per downsampling method
# and store the matching g-experiment result in `downsample_benchmark`.
# NOTE(review): the loop rebinds the notebook-level X and y, so the arrays from
# the "Start" cell are overwritten and later cells see the last iteration's
# arrays — confirm this is intended.
for downsample in downsample_methods:
    print(f"Running experiments for {downsample} downsample method.")
    X, y, test_property_lengths = create_feature_matrix_and_labels(
        model=model,
        dm=datamodule,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        layer_names=collect_activations_from_layers,
        device=device,
        getitem_keys=getitem_keys,
        n_batches_to_process=n_batches_to_process,
        downsample_method=downsample,
        verbose=verbose,
    )
    print("Run g experiment for downsample method:", downsample)
    g_benchmark = run_g_experiment(
        X, y, split_seed, test_size, n_estimators, fixed_classifier_seed, clf=None
    )
    # Key results by "<class_name>_<downsample method>".
    dict_key = datamodule.class_name + "_" + downsample
    downsample_benchmark[dict_key] = g_benchmark

In [None]:
# Plot the results accumulated in `downsample_benchmark`; the datamodule's
# class name is passed as the second argument.
plot_downsample_benchmark(downsample_benchmark, datamodule.class_name)

### Layer Benchmark

In [None]:
# Layer benchmark: rebuild the feature matrix from one selected layer at a time
# and store the matching g-experiment result in `layer_benchmark`.
# NOTE(review): `selected_layers` is not recomputed in this section, and the
# loop rebinds the notebook-level X and y so later cells see the arrays of the
# last selected layer — confirm both are intended.
for lyr in selected_layers:
    print(f"Running experiments for layer {lyr}")
    X, y, test_property_lengths = create_feature_matrix_and_labels(
        model=model,
        dm=datamodule,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        layer_names=lyr,
        device=device,
        getitem_keys=getitem_keys,
        n_batches_to_process=n_batches_to_process,
        downsample_method=downsample_method,
        verbose=verbose,
    )

    g_benchmark = run_g_experiment(
        X, y, split_seed, test_size, n_estimators, fixed_classifier_seed, clf=None
    )
    # Key results by "<class_name>_<str(lyr)>".
    dict_key = datamodule.class_name + "_" + str(lyr)

    layer_benchmark[dict_key] = g_benchmark

In [None]:
# Plot the results accumulated in `layer_benchmark`; the full list of layer
# names and the datamodule's class name are passed alongside.
plot_layer_benchmark(layer_benchmark, all_layer_names, datamodule.class_name)

### g confidence interval

In [None]:
# Run the multi-run g experiment on the current X, y. The result feeds the
# confidence-interval and g vs. g_hat comparison cells below.
g_benchmark = run_multiple_experiments_g(
    X, y, test_size, n_estimators, random_seed=True
)

In [None]:
# Calculate confidence intervals for the specified columns
columns_of_interest = ["baseline_accuracy", "baseline_fpr95", "baseline_roc_auc"]
confidence_intervals_g = calculate_confidence_intervals(
    g_benchmark, columns_of_interest
)
# Bare name: displayed as the notebook cell output.
confidence_intervals_g

### g_hat

In [None]:
# Run the Optuna study for g_hat on the current X, y. The result is converted
# to records in the next cell, where "M" is read from it.
results_df_g_hat = run_optuna_study(
    X,
    y,
    n_optuna_trials,
    test_size,
    min_cluster,
    max_cluster_ratio,
    min_fraction,
    max_fraction,
    n_estimators,
    fixed_seed,
)

In [None]:
# N is passed to run_multiple_experiments_g_hat below (presumably the number of
# repeated runs — TODO confirm).
N = 10
results_dict = results_df_g_hat.to_dict(orient="records")
# k is fixed to 30% of the number of samples rather than taken from the study.
k = int(len(X) * 0.3)
# M comes from the first record of the study results (presumably the best
# trial — TODO confirm the ordering).
M = results_dict[0]["M"]
print("k, M, len(X), k/len(X)", k, M, len(X), k / len(X))

g_hat_benchmark = run_multiple_experiments_g_hat(X, y, test_size, k, M, N, fixed_seed)

### g g_hat mean std + ttest

In [None]:
# Compare the g and g_hat benchmark results; the bare name displays the
# outcome as the notebook cell output.
t_tests_results = perform_benchmark_analysis(g_benchmark, g_hat_benchmark)
t_tests_results

### Benchmark g

In [None]:
# Candidate classifiers keyed by name. All seeded estimators use
# random_state=42; class_weight="balanced" is set where it is passed below.
# NOTE(review): the key "RandomForestUnblanaced" is misspelled; it is a runtime
# label, so it is left unchanged here.
classifiers = {
    "RandomForestUnblanaced": RandomForestClassifier(n_estimators=100, random_state=42),
    "RandomForest": RandomForestClassifier(
        n_estimators=100, class_weight="balanced", random_state=42
    ),
    "SVC": SVC(probability=True, class_weight="balanced", random_state=42),
    "KNeighbors": KNeighborsClassifier(),
    "LogisticRegression": LogisticRegression(
        class_weight="balanced", max_iter=500, random_state=42
    ),
    "DecisionTree": DecisionTreeClassifier(class_weight="balanced", random_state=42),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "ExtraTrees": ExtraTreesClassifier(
        n_estimators=100, class_weight="balanced", random_state=42
    ),
    "GaussianNB": GaussianNB(),
}

# Benchmark every classifier on the current X, y with the k and M set earlier.
classifier_benchmark_df = benchmark_classifiers(
    X, y, test_size, k, M, classifiers, fixed_seed
)

### Benchmark clustering

In [None]:
# Clustering methods to compare, keyed by name: one KMeans configuration and
# DBSCAN at three eps values.
clustering_methods = {
    "KMeans": KMeans(n_clusters=3, init="k-means++", random_state=42),
    "DBSCAN_eps_0.1": DBSCAN(eps=0.1, min_samples=5),
    "DBSCAN_eps_0.2": DBSCAN(eps=0.2, min_samples=5),
    "DBSCAN_eps_0.5": DBSCAN(eps=0.5, min_samples=5),
}

# A fixed classifier
classifier = RandomForestClassifier(
    n_estimators=100, class_weight="balanced", random_state=42
)

# Benchmark every clustering method with the same classifier, k and M.
clustering_benchmark_df = benchmark_clustering_methods(
    X, y, test_size, k, M, clustering_methods, classifier
)

### Investigate k wrt g-g_hat

In [None]:
# Investigate g_hat for varying k, passing in the g confidence intervals
# computed earlier. The plot is saved, with the configured class name as `fname`.
print("confidence_intervals_g", confidence_intervals_g)

df_results = benchmark_kmeans_with_varying_k_condidence_g_hat(
    X,
    y,
    M,
    test_size=0.2,
    n_runs=10,
    confidence_level=0.95,
    confidence_intervals_g=confidence_intervals_g,
    clf=None,
    save_plot=True,
    fname=cfg_dict["data"]["class_name"],
)

### Feature Level Visualization: Understand how clustering alters the structure of the feature space

In [None]:
# Run a single g_hat experiment (last argument is i + 1 = 1) and keep its
# training features, labels and cluster labels for the t-SNE plot.
i = 0
_, X_train_cluster, y_train_cluster, y_clusters = run_g_hat_experiment(
    X, y, test_size, k, M, fixed_seed, fixed_seed, i + 1
)
print("Plotting")
plot_tsne_with_label_changes(
    X_train_cluster,
    y_train_cluster,
    y_clusters,
    class_name=cfg_dict["data"]["class_name"],
    save_plot=True,
)

# Permanent Crop

### Start

In [None]:
# Load the Permanent Crop hold-out config and checkpoint. This rebinds the
# notebook-level model, datamodule, dataloaders, cfg_dict and X, y that the
# cells below use.
X, y, model, datamodule, train_dataloader, val_dataloader, test_dataloader, cfg_dict = (
    get_X_y_arrays(
        paths["permanentcrop"]["config"],
        paths["permanentcrop"]["ckpt"],
        layer,
        downsample_method,
        getitem_keys,
        device,
        n_batches_to_process,
        mode="holdout",
        verbose=False,
    )
)

### Downsample Benchmark

In [None]:
# Downsample benchmark: rebuild the feature matrix once per downsampling method
# and store the matching g-experiment result in `downsample_benchmark`.
# NOTE(review): the loop rebinds the notebook-level X and y, so the arrays from
# the "Start" cell are overwritten and later cells see the last iteration's
# arrays — confirm this is intended.
for downsample in downsample_methods:
    print(f"Running experiments for {downsample} downsample method.")
    X, y, test_property_lengths = create_feature_matrix_and_labels(
        model=model,
        dm=datamodule,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        layer_names=collect_activations_from_layers,
        device=device,
        getitem_keys=getitem_keys,
        n_batches_to_process=n_batches_to_process,
        downsample_method=downsample,
        verbose=verbose,
    )
    print("Run g experiment for downsample method:", downsample)
    g_benchmark = run_g_experiment(
        X, y, split_seed, test_size, n_estimators, fixed_classifier_seed, clf=None
    )
    # Key results by "<class_name>_<downsample method>".
    dict_key = datamodule.class_name + "_" + downsample
    downsample_benchmark[dict_key] = g_benchmark

In [None]:
# Plot the results accumulated in `downsample_benchmark`; the datamodule's
# class name is passed as the second argument.
plot_downsample_benchmark(downsample_benchmark, datamodule.class_name)

### Layer Benchmark

In [None]:
# Layer benchmark: rebuild the feature matrix from one selected layer at a time
# and store the matching g-experiment result in `layer_benchmark`.
# NOTE(review): `selected_layers` is not recomputed in this section, and the
# loop rebinds the notebook-level X and y so later cells see the arrays of the
# last selected layer — confirm both are intended.
for lyr in selected_layers:
    print(f"Running experiments for layer {lyr}")
    X, y, test_property_lengths = create_feature_matrix_and_labels(
        model=model,
        dm=datamodule,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        layer_names=lyr,
        device=device,
        getitem_keys=getitem_keys,
        n_batches_to_process=n_batches_to_process,
        downsample_method=downsample_method,
        verbose=verbose,
    )

    g_benchmark = run_g_experiment(
        X, y, split_seed, test_size, n_estimators, fixed_classifier_seed, clf=None
    )
    # Key results by "<class_name>_<str(lyr)>".
    dict_key = datamodule.class_name + "_" + str(lyr)

    layer_benchmark[dict_key] = g_benchmark

In [None]:
# Plot the collected layer benchmark; all_layer_names is defined outside this
# section and datamodule.class_name matches the key prefix used in the loop above.
plot_layer_benchmark(layer_benchmark, all_layer_names, datamodule.class_name)

### g confidence interval

In [None]:
# Run the repeated g experiment (random_seed=True) on the current X, y.
# This overwrites the g_benchmark value left behind by the benchmark loops.
g_benchmark = run_multiple_experiments_g(
    X, y, test_size, n_estimators, random_seed=True
)

In [None]:
# Confidence intervals of the g baseline metrics listed below; the bare name on
# the last line makes the notebook display the result.
columns_of_interest = [
    "baseline_accuracy",
    "baseline_fpr95",
    "baseline_roc_auc",
]
confidence_intervals_g = calculate_confidence_intervals(g_benchmark, columns_of_interest)
confidence_intervals_g

### g_hat

In [None]:
# Run the Optuna study for g_hat on the current X, y. The trial count and the
# cluster/fraction limits are variables defined outside this cell (presumably
# the search bounds — verify against run_optuna_study).
results_df_g_hat = run_optuna_study(
    X,
    y,
    n_optuna_trials,
    test_size,
    min_cluster,
    max_cluster_ratio,
    min_fraction,
    max_fraction,
    n_estimators,
    fixed_seed,
)

In [None]:
# Settings for the repeated g_hat runs (Permanent Crop).
# M comes from the first record of the Optuna results; k is fixed at 30% of
# len(X) rather than read from the study; N is forwarded to
# run_multiple_experiments_g_hat (presumably the number of repetitions — verify).
results_dict = results_df_g_hat.to_dict(orient="records")
M = results_dict[0]["M"]
k = int(0.3 * len(X))
N = 10
print("k, M, len(X), k/len(X)", k, M, len(X), k / len(X))

g_hat_benchmark = run_multiple_experiments_g_hat(
    X, y, test_size, k, M, N, fixed_seed
)

### g g_hat mean std + ttest

In [None]:
# Compare the repeated g and g_hat results with perform_benchmark_analysis;
# the bare name on the last line displays the outcome in the notebook.
t_tests_results = perform_benchmark_analysis(g_benchmark, g_hat_benchmark)
t_tests_results

### Benchmark g

In [None]:
# Candidate classifiers for the g benchmark (Permanent Crop). Every estimator
# that takes a seed uses random_state=42.
# NOTE(review): the key "RandomForestUnblanaced" is misspelled; it is a runtime
# label and is left untouched here.
classifiers = {
    "RandomForestUnblanaced": RandomForestClassifier(
        n_estimators=100,
        random_state=42,
    ),
    "RandomForest": RandomForestClassifier(
        n_estimators=100,
        random_state=42,
        class_weight="balanced",
    ),
    "SVC": SVC(
        probability=True,
        random_state=42,
        class_weight="balanced",
    ),
    "KNeighbors": KNeighborsClassifier(),
    "LogisticRegression": LogisticRegression(
        max_iter=500,
        random_state=42,
        class_weight="balanced",
    ),
    "DecisionTree": DecisionTreeClassifier(
        random_state=42,
        class_weight="balanced",
    ),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "ExtraTrees": ExtraTreesClassifier(
        n_estimators=100,
        random_state=42,
        class_weight="balanced",
    ),
    "GaussianNB": GaussianNB(),
}

# k and M are the values set in the g_hat cell above.
classifier_benchmark_df = benchmark_classifiers(X, y, test_size, k, M, classifiers, fixed_seed)

### Benchmark clustering

In [None]:
# Clustering methods to compare (Permanent Crop): one KMeans configuration plus
# DBSCAN at three eps values with min_samples fixed at 5. The comprehension
# yields the keys "DBSCAN_eps_0.1", "DBSCAN_eps_0.2", "DBSCAN_eps_0.5" in order.
clustering_methods = {
    "KMeans": KMeans(n_clusters=3, init="k-means++", random_state=42),
    **{
        f"DBSCAN_eps_{eps}": DBSCAN(eps=eps, min_samples=5)
        for eps in (0.1, 0.2, 0.5)
    },
}

# One fixed classifier is passed alongside all clustering methods.
classifier = RandomForestClassifier(
    random_state=42,
    class_weight="balanced",
    n_estimators=100,
)

clustering_benchmark_df = benchmark_clustering_methods(
    X,
    y,
    test_size,
    k,
    M,
    clustering_methods,
    classifier,
)

### Investigate k wrt g-g_hat

In [None]:
# Echo the g confidence intervals, then run the varying-k KMeans benchmark with
# the current M; the intervals are forwarded to the helper as a keyword argument.
print("confidence_intervals_g: ", confidence_intervals_g)

df_results = benchmark_kmeans_with_varying_k(
    X, y, M, confidence_intervals_g=confidence_intervals_g
)

### Feature Level Visualization: Understand how clustering alters the structure of the feature space

In [None]:
# Re-run a single g_hat experiment to get the clustered training data and
# labels, then plot them with plot_tsne_with_label_changes.
# The class name is read from cfg_dict, the config dict unpacked from
# get_X_y_arrays in this section's Start cell. No `cfg` name is bound in the
# visible cells, so `cfg.data.class_name` would raise a NameError.
i = 0
_, X_train_cluster, y_train_cluster, y_clusters = run_g_hat_experiment(
    X, y, test_size, k, M, fixed_seed, fixed_seed, i + 1
)
print("Plotting")
plot_tsne_with_label_changes(
    X_train_cluster,
    y_train_cluster,
    y_clusters,
    class_name=cfg_dict["data"]["class_name"],
)

# Residential

### Start

In [None]:
# Residential hold-out: call get_X_y_arrays with this class's config and
# checkpoint and unpack its eight return values. Every later cell in this
# section reads these names. Note that verbose is passed as a literal False here.
(
    X,
    y,
    model,
    datamodule,
    train_dataloader,
    val_dataloader,
    test_dataloader,
    cfg_dict,
) = get_X_y_arrays(
    paths["residential"]["config"],
    paths["residential"]["ckpt"],
    layer,
    downsample_method,
    getitem_keys,
    device,
    n_batches_to_process,
    mode="holdout",
    verbose=False,
)

### Downsample Benchmark

In [None]:
# Downsample benchmark (Residential): extract features once per entry in
# downsample_methods, run the g experiment on them and store the result in
# downsample_benchmark under "<class_name>_<method>".
# NOTE(review): X and y are rebound on every pass, so later cells use the
# arrays from the final iteration rather than those returned by
# get_X_y_arrays — confirm that is intended.
for downsample in downsample_methods:
    print(f"Running experiments for {downsample} downsample method.")
    X, y, test_property_lengths = create_feature_matrix_and_labels(
        model=model,
        dm=datamodule,
        device=device,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        getitem_keys=getitem_keys,
        layer_names=collect_activations_from_layers,
        downsample_method=downsample,
        n_batches_to_process=n_batches_to_process,
        verbose=verbose,
    )

    print("Run g experiment for downsample method:", downsample)
    g_benchmark = run_g_experiment(
        X,
        y,
        split_seed,
        test_size,
        n_estimators,
        fixed_classifier_seed,
        clf=None,
    )
    dict_key = "_".join((datamodule.class_name, downsample))
    downsample_benchmark[dict_key] = g_benchmark

In [None]:
# Plot the collected downsample benchmark; datamodule.class_name is the same
# prefix the loop above used for its dictionary keys.
plot_downsample_benchmark(downsample_benchmark, datamodule.class_name)

### Layer Benchmark

In [None]:
# Layer benchmark (Residential): extract features from each entry of
# selected_layers using the section-wide downsample_method, run the g
# experiment and store the result in layer_benchmark under "<class_name>_<layer>".
# NOTE(review): X and y are rebound on every pass, so later cells use the
# arrays from the final layer processed — confirm that is intended.
for lyr in selected_layers:
    print(f"Running experiments for layer {lyr}")
    X, y, test_property_lengths = create_feature_matrix_and_labels(
        model=model,
        dm=datamodule,
        device=device,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        getitem_keys=getitem_keys,
        layer_names=lyr,
        downsample_method=downsample_method,
        n_batches_to_process=n_batches_to_process,
        verbose=verbose,
    )
    g_benchmark = run_g_experiment(
        X,
        y,
        split_seed,
        test_size,
        n_estimators,
        fixed_classifier_seed,
        clf=None,
    )

    dict_key = "_".join((datamodule.class_name, str(lyr)))
    layer_benchmark[dict_key] = g_benchmark

In [None]:
# Plot the collected layer benchmark; all_layer_names is defined outside this
# section and datamodule.class_name matches the key prefix used in the loop above.
plot_layer_benchmark(layer_benchmark, all_layer_names, datamodule.class_name)

### g confidence interval

In [None]:
# Run the repeated g experiment (random_seed=True) on the current X, y.
# This overwrites the g_benchmark value left behind by the benchmark loops.
g_benchmark = run_multiple_experiments_g(
    X, y, test_size, n_estimators, random_seed=True
)

In [None]:
# Confidence intervals of the g baseline metrics listed below; the bare name on
# the last line makes the notebook display the result.
columns_of_interest = [
    "baseline_accuracy",
    "baseline_fpr95",
    "baseline_roc_auc",
]
confidence_intervals_g = calculate_confidence_intervals(g_benchmark, columns_of_interest)
confidence_intervals_g

### g_hat

In [None]:
# Run the Optuna study for g_hat on the current X, y. The trial count and the
# cluster/fraction limits are variables defined outside this cell (presumably
# the search bounds — verify against run_optuna_study).
results_df_g_hat = run_optuna_study(
    X,
    y,
    n_optuna_trials,
    test_size,
    min_cluster,
    max_cluster_ratio,
    min_fraction,
    max_fraction,
    n_estimators,
    fixed_seed,
)

In [None]:
# Settings for the repeated g_hat runs (Residential).
# M comes from the first record of the Optuna results; k is fixed at 30% of
# len(X) rather than read from the study; N is forwarded to
# run_multiple_experiments_g_hat (presumably the number of repetitions — verify).
results_dict = results_df_g_hat.to_dict(orient="records")
M = results_dict[0]["M"]
k = int(0.3 * len(X))
N = 10
print("k, M, len(X), k/len(X)", k, M, len(X), k / len(X))

g_hat_benchmark = run_multiple_experiments_g_hat(
    X, y, test_size, k, M, N, fixed_seed
)

### g g_hat mean std + ttest

In [None]:
# Compare the repeated g and g_hat results with perform_benchmark_analysis;
# the bare name on the last line displays the outcome in the notebook.
t_tests_results = perform_benchmark_analysis(g_benchmark, g_hat_benchmark)
t_tests_results

### Benchmark g

In [None]:
# Candidate classifiers for the g benchmark (Residential). Every estimator
# that takes a seed uses random_state=42.
# NOTE(review): the key "RandomForestUnblanaced" is misspelled; it is a runtime
# label and is left untouched here.
classifiers = {
    "RandomForestUnblanaced": RandomForestClassifier(
        n_estimators=100,
        random_state=42,
    ),
    "RandomForest": RandomForestClassifier(
        n_estimators=100,
        random_state=42,
        class_weight="balanced",
    ),
    "SVC": SVC(
        probability=True,
        random_state=42,
        class_weight="balanced",
    ),
    "KNeighbors": KNeighborsClassifier(),
    "LogisticRegression": LogisticRegression(
        max_iter=500,
        random_state=42,
        class_weight="balanced",
    ),
    "DecisionTree": DecisionTreeClassifier(
        random_state=42,
        class_weight="balanced",
    ),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "ExtraTrees": ExtraTreesClassifier(
        n_estimators=100,
        random_state=42,
        class_weight="balanced",
    ),
    "GaussianNB": GaussianNB(),
}

# k and M are the values set in the g_hat cell above.
classifier_benchmark_df = benchmark_classifiers(X, y, test_size, k, M, classifiers, fixed_seed)

### Benchmark clustering

In [None]:
# Clustering methods to compare (Residential): one KMeans configuration plus
# DBSCAN at three eps values with min_samples fixed at 5. The comprehension
# yields the keys "DBSCAN_eps_0.1", "DBSCAN_eps_0.2", "DBSCAN_eps_0.5" in order.
clustering_methods = {
    "KMeans": KMeans(n_clusters=3, init="k-means++", random_state=42),
    **{
        f"DBSCAN_eps_{eps}": DBSCAN(eps=eps, min_samples=5)
        for eps in (0.1, 0.2, 0.5)
    },
}

# One fixed classifier is passed alongside all clustering methods.
classifier = RandomForestClassifier(
    random_state=42,
    class_weight="balanced",
    n_estimators=100,
)

clustering_benchmark_df = benchmark_clustering_methods(
    X,
    y,
    test_size,
    k,
    M,
    clustering_methods,
    classifier,
)

### Investigate k wrt g-g_hat

In [None]:
# Echo the g confidence intervals, then run the varying-k KMeans benchmark with
# the current M; the intervals are forwarded to the helper as a keyword argument.
print("confidence_intervals_g: ", confidence_intervals_g)

df_results = benchmark_kmeans_with_varying_k(
    X, y, M, confidence_intervals_g=confidence_intervals_g
)

### Feature Level Visualization: Understand how clustering alters the structure of the feature space

In [None]:
# Re-run a single g_hat experiment to get the clustered training data and
# labels, then plot them with plot_tsne_with_label_changes.
# The class name is read from cfg_dict, the config dict unpacked from
# get_X_y_arrays in this section's Start cell. No `cfg` name is bound in the
# visible cells, so `cfg.data.class_name` would raise a NameError.
i = 0
_, X_train_cluster, y_train_cluster, y_clusters = run_g_hat_experiment(
    X, y, test_size, k, M, fixed_seed, fixed_seed, i + 1
)
print("Plotting")
plot_tsne_with_label_changes(
    X_train_cluster,
    y_train_cluster,
    y_clusters,
    class_name=cfg_dict["data"]["class_name"],
)

# River

### Start

In [None]:
# River hold-out: call get_X_y_arrays with this class's config and checkpoint
# and unpack its eight return values. Every later cell in this section reads
# these names. Note that verbose is passed as a literal False here.
(
    X,
    y,
    model,
    datamodule,
    train_dataloader,
    val_dataloader,
    test_dataloader,
    cfg_dict,
) = get_X_y_arrays(
    paths["river"]["config"],
    paths["river"]["ckpt"],
    layer,
    downsample_method,
    getitem_keys,
    device,
    n_batches_to_process,
    mode="holdout",
    verbose=False,
)

### Downsample Benchmark

In [None]:
# Downsample benchmark (River): extract features once per entry in
# downsample_methods, run the g experiment on them and store the result in
# downsample_benchmark under "<class_name>_<method>".
# NOTE(review): X and y are rebound on every pass, so later cells use the
# arrays from the final iteration rather than those returned by
# get_X_y_arrays — confirm that is intended.
for downsample in downsample_methods:
    print(f"Running experiments for {downsample} downsample method.")
    X, y, test_property_lengths = create_feature_matrix_and_labels(
        model=model,
        dm=datamodule,
        device=device,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        getitem_keys=getitem_keys,
        layer_names=collect_activations_from_layers,
        downsample_method=downsample,
        n_batches_to_process=n_batches_to_process,
        verbose=verbose,
    )

    print("Run g experiment for downsample method:", downsample)
    g_benchmark = run_g_experiment(
        X,
        y,
        split_seed,
        test_size,
        n_estimators,
        fixed_classifier_seed,
        clf=None,
    )
    dict_key = "_".join((datamodule.class_name, downsample))
    downsample_benchmark[dict_key] = g_benchmark

In [None]:
# Plot the collected downsample benchmark; datamodule.class_name is the same
# prefix the loop above used for its dictionary keys.
plot_downsample_benchmark(downsample_benchmark, datamodule.class_name)

### Layer Benchmark

In [None]:
# Layer benchmark (River): extract features from each entry of selected_layers
# using the section-wide downsample_method, run the g experiment and store the
# result in layer_benchmark under "<class_name>_<layer>".
# NOTE(review): X and y are rebound on every pass, so later cells use the
# arrays from the final layer processed — confirm that is intended.
for lyr in selected_layers:
    print(f"Running experiments for layer {lyr}")
    X, y, test_property_lengths = create_feature_matrix_and_labels(
        model=model,
        dm=datamodule,
        device=device,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        getitem_keys=getitem_keys,
        layer_names=lyr,
        downsample_method=downsample_method,
        n_batches_to_process=n_batches_to_process,
        verbose=verbose,
    )
    g_benchmark = run_g_experiment(
        X,
        y,
        split_seed,
        test_size,
        n_estimators,
        fixed_classifier_seed,
        clf=None,
    )

    dict_key = "_".join((datamodule.class_name, str(lyr)))
    layer_benchmark[dict_key] = g_benchmark

In [None]:
# Plot the collected layer benchmark; all_layer_names is defined outside this
# section and datamodule.class_name matches the key prefix used in the loop above.
plot_layer_benchmark(layer_benchmark, all_layer_names, datamodule.class_name)

### g confidence interval

In [None]:
# Run the repeated g experiment (random_seed=True) on the current X, y.
# This overwrites the g_benchmark value left behind by the benchmark loops.
g_benchmark = run_multiple_experiments_g(
    X, y, test_size, n_estimators, random_seed=True
)

In [None]:
# Confidence intervals of the g baseline metrics listed below; the bare name on
# the last line makes the notebook display the result.
columns_of_interest = [
    "baseline_accuracy",
    "baseline_fpr95",
    "baseline_roc_auc",
]
confidence_intervals_g = calculate_confidence_intervals(g_benchmark, columns_of_interest)
confidence_intervals_g

### g_hat

In [None]:
# Run the Optuna study for g_hat on the current X, y. The trial count and the
# cluster/fraction limits are variables defined outside this cell (presumably
# the search bounds — verify against run_optuna_study).
results_df_g_hat = run_optuna_study(
    X,
    y,
    n_optuna_trials,
    test_size,
    min_cluster,
    max_cluster_ratio,
    min_fraction,
    max_fraction,
    n_estimators,
    fixed_seed,
)

In [None]:
# Display the Optuna results; the next cell reads "M" from the first record.
results_df_g_hat

In [None]:
# Settings for the repeated g_hat runs (River).
# M comes from the first record of the Optuna results; k is fixed at 30% of
# len(X) rather than read from the study; N is forwarded to
# run_multiple_experiments_g_hat (presumably the number of repetitions — verify).
results_dict = results_df_g_hat.to_dict(orient="records")
M = results_dict[0]["M"]
k = int(0.3 * len(X))
N = 10
print("k, M, len(X), k/len(X)", k, M, len(X), k / len(X))

g_hat_benchmark = run_multiple_experiments_g_hat(
    X, y, test_size, k, M, N, fixed_seed
)

### g g_hat mean std + ttest

In [None]:
# Compare the repeated g and g_hat results with perform_benchmark_analysis;
# the bare name on the last line displays the outcome in the notebook.
t_tests_results = perform_benchmark_analysis(g_benchmark, g_hat_benchmark)
t_tests_results

### Benchmark g

In [None]:
# Candidate classifiers for the g benchmark (River). Every estimator that
# takes a seed uses random_state=42.
# NOTE(review): the key "RandomForestUnblanaced" is misspelled; it is a runtime
# label and is left untouched here.
classifiers = {
    "RandomForestUnblanaced": RandomForestClassifier(
        n_estimators=100,
        random_state=42,
    ),
    "RandomForest": RandomForestClassifier(
        n_estimators=100,
        random_state=42,
        class_weight="balanced",
    ),
    "SVC": SVC(
        probability=True,
        random_state=42,
        class_weight="balanced",
    ),
    "KNeighbors": KNeighborsClassifier(),
    "LogisticRegression": LogisticRegression(
        max_iter=500,
        random_state=42,
        class_weight="balanced",
    ),
    "DecisionTree": DecisionTreeClassifier(
        random_state=42,
        class_weight="balanced",
    ),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "ExtraTrees": ExtraTreesClassifier(
        n_estimators=100,
        random_state=42,
        class_weight="balanced",
    ),
    "GaussianNB": GaussianNB(),
}

# k and M are the values set in the g_hat cell above.
classifier_benchmark_df = benchmark_classifiers(X, y, test_size, k, M, classifiers, fixed_seed)

### Benchmark clustering

In [None]:
# Clustering methods to compare (River): one KMeans configuration plus DBSCAN
# at three eps values with min_samples fixed at 5. The comprehension yields the
# keys "DBSCAN_eps_0.1", "DBSCAN_eps_0.2", "DBSCAN_eps_0.5" in order.
clustering_methods = {
    "KMeans": KMeans(n_clusters=3, init="k-means++", random_state=42),
    **{
        f"DBSCAN_eps_{eps}": DBSCAN(eps=eps, min_samples=5)
        for eps in (0.1, 0.2, 0.5)
    },
}

# One fixed classifier is passed alongside all clustering methods.
classifier = RandomForestClassifier(
    random_state=42,
    class_weight="balanced",
    n_estimators=100,
)

clustering_benchmark_df = benchmark_clustering_methods(
    X,
    y,
    test_size,
    k,
    M,
    clustering_methods,
    classifier,
)

### Investigate k wrt g-g_hat

In [None]:
# Echo the g confidence intervals, then run the varying-k KMeans benchmark with
# the current M; the intervals are forwarded to the helper as a keyword argument.
print("confidence_intervals_g: ", confidence_intervals_g)

df_results = benchmark_kmeans_with_varying_k(
    X, y, M, confidence_intervals_g=confidence_intervals_g
)

### Feature Level Visualization: Understand how clustering alters the structure of the feature space

In [None]:
# Re-run a single g_hat experiment to get the clustered training data and
# labels, then plot them with plot_tsne_with_label_changes.
# The class name is read from cfg_dict, the config dict unpacked from
# get_X_y_arrays in this section's Start cell. No `cfg` name is bound in the
# visible cells, so `cfg.data.class_name` would raise a NameError.
i = 0
_, X_train_cluster, y_train_cluster, y_clusters = run_g_hat_experiment(
    X, y, test_size, k, M, fixed_seed, fixed_seed, i + 1
)
print("Plotting")
plot_tsne_with_label_changes(
    X_train_cluster,
    y_train_cluster,
    y_clusters,
    class_name=cfg_dict["data"]["class_name"],
)

# SeaLake

### Start

In [None]:
# SeaLake hold-out: call get_X_y_arrays with this class's config and checkpoint
# and unpack its eight return values. Every later cell in this section reads
# these names. Note that verbose is passed as a literal False here.
(
    X,
    y,
    model,
    datamodule,
    train_dataloader,
    val_dataloader,
    test_dataloader,
    cfg_dict,
) = get_X_y_arrays(
    paths["sealake"]["config"],
    paths["sealake"]["ckpt"],
    layer,
    downsample_method,
    getitem_keys,
    device,
    n_batches_to_process,
    mode="holdout",
    verbose=False,
)

### Downsample Benchmark

In [None]:
# Downsample benchmark (SeaLake): extract features once per entry in
# downsample_methods, run the g experiment on them and store the result in
# downsample_benchmark under "<class_name>_<method>".
# NOTE(review): X and y are rebound on every pass, so later cells use the
# arrays from the final iteration rather than those returned by
# get_X_y_arrays — confirm that is intended.
for downsample in downsample_methods:
    print(f"Running experiments for {downsample} downsample method.")
    X, y, test_property_lengths = create_feature_matrix_and_labels(
        model=model,
        dm=datamodule,
        device=device,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        getitem_keys=getitem_keys,
        layer_names=collect_activations_from_layers,
        downsample_method=downsample,
        n_batches_to_process=n_batches_to_process,
        verbose=verbose,
    )

    print("Run g experiment for downsample method:", downsample)
    g_benchmark = run_g_experiment(
        X,
        y,
        split_seed,
        test_size,
        n_estimators,
        fixed_classifier_seed,
        clf=None,
    )
    dict_key = "_".join((datamodule.class_name, downsample))
    downsample_benchmark[dict_key] = g_benchmark

In [None]:
# Plot the collected downsample benchmark; datamodule.class_name is the same
# prefix the loop above used for its dictionary keys.
plot_downsample_benchmark(downsample_benchmark, datamodule.class_name)

### Layer Benchmark

In [None]:
# Layer benchmark (SeaLake): extract features from each entry of
# selected_layers using the section-wide downsample_method, run the g
# experiment and store the result in layer_benchmark under "<class_name>_<layer>".
# NOTE(review): X and y are rebound on every pass, so later cells use the
# arrays from the final layer processed — confirm that is intended.
for lyr in selected_layers:
    print(f"Running experiments for layer {lyr}")
    X, y, test_property_lengths = create_feature_matrix_and_labels(
        model=model,
        dm=datamodule,
        device=device,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        getitem_keys=getitem_keys,
        layer_names=lyr,
        downsample_method=downsample_method,
        n_batches_to_process=n_batches_to_process,
        verbose=verbose,
    )
    g_benchmark = run_g_experiment(
        X,
        y,
        split_seed,
        test_size,
        n_estimators,
        fixed_classifier_seed,
        clf=None,
    )

    dict_key = "_".join((datamodule.class_name, str(lyr)))
    layer_benchmark[dict_key] = g_benchmark

In [None]:
# Plot the collected layer benchmark; all_layer_names is defined outside this
# section and datamodule.class_name matches the key prefix used in the loop above.
plot_layer_benchmark(layer_benchmark, all_layer_names, datamodule.class_name)

### g confidence interval

In [None]:
# Run the repeated g experiment (random_seed=True) on the current X, y.
# This overwrites the g_benchmark value left behind by the benchmark loops.
g_benchmark = run_multiple_experiments_g(
    X, y, test_size, n_estimators, random_seed=True
)

In [None]:
# Confidence intervals of the g baseline metrics listed below; the bare name on
# the last line makes the notebook display the result.
columns_of_interest = [
    "baseline_accuracy",
    "baseline_fpr95",
    "baseline_roc_auc",
]
confidence_intervals_g = calculate_confidence_intervals(g_benchmark, columns_of_interest)
confidence_intervals_g

### g_hat

In [None]:
# Run the Optuna study for g_hat on the current X, y. The trial count and the
# cluster/fraction limits are variables defined outside this cell (presumably
# the search bounds — verify against run_optuna_study).
results_df_g_hat = run_optuna_study(
    X,
    y,
    n_optuna_trials,
    test_size,
    min_cluster,
    max_cluster_ratio,
    min_fraction,
    max_fraction,
    n_estimators,
    fixed_seed,
)

In [None]:
# Settings for the repeated g_hat runs (SeaLake).
# M comes from the first record of the Optuna results; k is fixed at 30% of
# len(X) rather than read from the study; N is forwarded to
# run_multiple_experiments_g_hat (presumably the number of repetitions — verify).
results_dict = results_df_g_hat.to_dict(orient="records")
M = results_dict[0]["M"]
k = int(0.3 * len(X))
N = 10
print("k, M, len(X), k/len(X)", k, M, len(X), k / len(X))

g_hat_benchmark = run_multiple_experiments_g_hat(
    X, y, test_size, k, M, N, fixed_seed
)

### g g_hat mean std + ttest

In [None]:
# Compare the repeated g and g_hat results with perform_benchmark_analysis;
# the bare name on the last line displays the outcome in the notebook.
t_tests_results = perform_benchmark_analysis(g_benchmark, g_hat_benchmark)
t_tests_results

### Benchmark g

In [None]:
# Candidate classifiers for the g benchmark (SeaLake). Every estimator that
# takes a seed uses random_state=42.
# NOTE(review): the key "RandomForestUnblanaced" is misspelled; it is a runtime
# label and is left untouched here.
classifiers = {
    "RandomForestUnblanaced": RandomForestClassifier(
        n_estimators=100,
        random_state=42,
    ),
    "RandomForest": RandomForestClassifier(
        n_estimators=100,
        random_state=42,
        class_weight="balanced",
    ),
    "SVC": SVC(
        probability=True,
        random_state=42,
        class_weight="balanced",
    ),
    "KNeighbors": KNeighborsClassifier(),
    "LogisticRegression": LogisticRegression(
        max_iter=500,
        random_state=42,
        class_weight="balanced",
    ),
    "DecisionTree": DecisionTreeClassifier(
        random_state=42,
        class_weight="balanced",
    ),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "ExtraTrees": ExtraTreesClassifier(
        n_estimators=100,
        random_state=42,
        class_weight="balanced",
    ),
    "GaussianNB": GaussianNB(),
}

# k and M are the values set in the g_hat cell above.
classifier_benchmark_df = benchmark_classifiers(X, y, test_size, k, M, classifiers, fixed_seed)

### Benchmark clustering

In [None]:
# Clustering methods to compare (SeaLake): one KMeans configuration plus DBSCAN
# at three eps values with min_samples fixed at 5. The comprehension yields the
# keys "DBSCAN_eps_0.1", "DBSCAN_eps_0.2", "DBSCAN_eps_0.5" in order.
clustering_methods = {
    "KMeans": KMeans(n_clusters=3, init="k-means++", random_state=42),
    **{
        f"DBSCAN_eps_{eps}": DBSCAN(eps=eps, min_samples=5)
        for eps in (0.1, 0.2, 0.5)
    },
}

# One fixed classifier is passed alongside all clustering methods.
classifier = RandomForestClassifier(
    random_state=42,
    class_weight="balanced",
    n_estimators=100,
)

clustering_benchmark_df = benchmark_clustering_methods(
    X,
    y,
    test_size,
    k,
    M,
    clustering_methods,
    classifier,
)

### Investigate k wrt g-g_hat

In [None]:
# Echo the g confidence intervals, then run the varying-k KMeans benchmark with
# the current M; the intervals are forwarded to the helper as a keyword argument.
print("confidence_intervals_g: ", confidence_intervals_g)

df_results = benchmark_kmeans_with_varying_k(
    X, y, M, confidence_intervals_g=confidence_intervals_g
)

### Feature Level Visualization: Understand how clustering alters the structure of the feature space

In [None]:
# Re-run a single g_hat experiment to get the clustered training data and
# labels, then plot them with plot_tsne_with_label_changes.
# The class name is read from cfg_dict, the config dict unpacked from
# get_X_y_arrays in this section's Start cell. No `cfg` name is bound in the
# visible cells, so `cfg.data.class_name` would raise a NameError.
i = 0
_, X_train_cluster, y_train_cluster, y_clusters = run_g_hat_experiment(
    X, y, test_size, k, M, fixed_seed, fixed_seed, i + 1
)
print("Plotting")
plot_tsne_with_label_changes(
    X_train_cluster,
    y_train_cluster,
    y_clusters,
    class_name=cfg_dict["data"]["class_name"],
)

# Annual Crop

### Start

In [None]:
# Annual Crop hold-out: call get_X_y_arrays with this class's config and
# checkpoint and unpack its eight return values. Every later cell in this
# section reads these names. Note that verbose is passed as a literal False here.
(
    X,
    y,
    model,
    datamodule,
    train_dataloader,
    val_dataloader,
    test_dataloader,
    cfg_dict,
) = get_X_y_arrays(
    paths["annualcrop"]["config"],
    paths["annualcrop"]["ckpt"],
    layer,
    downsample_method,
    getitem_keys,
    device,
    n_batches_to_process,
    mode="holdout",
    verbose=False,
)

### Downsample Benchmark

In [None]:
# Downsample benchmark (Annual Crop): extract features once per entry in
# downsample_methods, run the g experiment on them and store the result in
# downsample_benchmark under "<class_name>_<method>".
# NOTE(review): X and y are rebound on every pass, so later cells use the
# arrays from the final iteration rather than those returned by
# get_X_y_arrays — confirm that is intended.
for downsample in downsample_methods:
    print(f"Running experiments for {downsample} downsample method.")
    X, y, test_property_lengths = create_feature_matrix_and_labels(
        model=model,
        dm=datamodule,
        device=device,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        getitem_keys=getitem_keys,
        layer_names=collect_activations_from_layers,
        downsample_method=downsample,
        n_batches_to_process=n_batches_to_process,
        verbose=verbose,
    )

    print("Run g experiment for downsample method:", downsample)
    g_benchmark = run_g_experiment(
        X,
        y,
        split_seed,
        test_size,
        n_estimators,
        fixed_classifier_seed,
        clf=None,
    )
    dict_key = "_".join((datamodule.class_name, downsample))
    downsample_benchmark[dict_key] = g_benchmark

In [None]:
# Plot the collected downsample benchmark; datamodule.class_name is the same
# prefix the loop above used for its dictionary keys.
plot_downsample_benchmark(downsample_benchmark, datamodule.class_name)

### Layer Benchmark

In [None]:
# Layer benchmark (Annual Crop): extract features from each entry of
# selected_layers using the section-wide downsample_method, run the g
# experiment and store the result in layer_benchmark under "<class_name>_<layer>".
# NOTE(review): X and y are rebound on every pass, so later cells use the
# arrays from the final layer processed — confirm that is intended.
for lyr in selected_layers:
    print(f"Running experiments for layer {lyr}")
    X, y, test_property_lengths = create_feature_matrix_and_labels(
        model=model,
        dm=datamodule,
        device=device,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        getitem_keys=getitem_keys,
        layer_names=lyr,
        downsample_method=downsample_method,
        n_batches_to_process=n_batches_to_process,
        verbose=verbose,
    )
    g_benchmark = run_g_experiment(
        X,
        y,
        split_seed,
        test_size,
        n_estimators,
        fixed_classifier_seed,
        clf=None,
    )

    dict_key = "_".join((datamodule.class_name, str(lyr)))
    layer_benchmark[dict_key] = g_benchmark

In [None]:
# Plot the collected layer benchmark; all_layer_names is defined outside this
# section and datamodule.class_name matches the key prefix used in the loop above.
plot_layer_benchmark(layer_benchmark, all_layer_names, datamodule.class_name)

### g confidence interval

In [None]:
# Run the repeated g experiment (random_seed=True) on the current X, y.
# This overwrites the g_benchmark value left behind by the benchmark loops.
g_benchmark = run_multiple_experiments_g(
    X, y, test_size, n_estimators, random_seed=True
)

In [None]:
# Confidence intervals of the g baseline metrics listed below; the bare name on
# the last line makes the notebook display the result.
columns_of_interest = [
    "baseline_accuracy",
    "baseline_fpr95",
    "baseline_roc_auc",
]
confidence_intervals_g = calculate_confidence_intervals(g_benchmark, columns_of_interest)
confidence_intervals_g

### g_hat

In [None]:
# Run the Optuna study for g_hat on the current X, y. The trial count and the
# cluster/fraction limits are variables defined outside this cell (presumably
# the search bounds — verify against run_optuna_study).
results_df_g_hat = run_optuna_study(
    X,
    y,
    n_optuna_trials,
    test_size,
    min_cluster,
    max_cluster_ratio,
    min_fraction,
    max_fraction,
    n_estimators,
    fixed_seed,
)

In [None]:
# Settings for the repeated g_hat runs (Annual Crop).
# M comes from the first record of the Optuna results; k is fixed at 30% of
# len(X) rather than read from the study; N is forwarded to
# run_multiple_experiments_g_hat (presumably the number of repetitions — verify).
results_dict = results_df_g_hat.to_dict(orient="records")
M = results_dict[0]["M"]
k = int(0.3 * len(X))
N = 10
print("k, M, len(X), k/len(X)", k, M, len(X), k / len(X))

g_hat_benchmark = run_multiple_experiments_g_hat(
    X, y, test_size, k, M, N, fixed_seed
)

### g g_hat mean std + ttest

In [None]:
# Compare the repeated g and g_hat results with perform_benchmark_analysis;
# the bare name on the last line displays the outcome in the notebook.
t_tests_results = perform_benchmark_analysis(g_benchmark, g_hat_benchmark)
t_tests_results

### Benchmark g

In [None]:
# Candidate classifiers for the g benchmark (Annual Crop). Every estimator
# that takes a seed uses random_state=42.
# NOTE(review): the key "RandomForestUnblanaced" is misspelled; it is a runtime
# label and is left untouched here.
classifiers = {
    "RandomForestUnblanaced": RandomForestClassifier(
        n_estimators=100,
        random_state=42,
    ),
    "RandomForest": RandomForestClassifier(
        n_estimators=100,
        random_state=42,
        class_weight="balanced",
    ),
    "SVC": SVC(
        probability=True,
        random_state=42,
        class_weight="balanced",
    ),
    "KNeighbors": KNeighborsClassifier(),
    "LogisticRegression": LogisticRegression(
        max_iter=500,
        random_state=42,
        class_weight="balanced",
    ),
    "DecisionTree": DecisionTreeClassifier(
        random_state=42,
        class_weight="balanced",
    ),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "ExtraTrees": ExtraTreesClassifier(
        n_estimators=100,
        random_state=42,
        class_weight="balanced",
    ),
    "GaussianNB": GaussianNB(),
}

# k and M are the values set in the g_hat cell above.
classifier_benchmark_df = benchmark_classifiers(X, y, test_size, k, M, classifiers, fixed_seed)

### Benchmark clustering

In [None]:
# Clustering methods to compare (Annual Crop): one KMeans configuration plus
# DBSCAN at three eps values with min_samples fixed at 5. The comprehension
# yields the keys "DBSCAN_eps_0.1", "DBSCAN_eps_0.2", "DBSCAN_eps_0.5" in order.
clustering_methods = {
    "KMeans": KMeans(n_clusters=3, init="k-means++", random_state=42),
    **{
        f"DBSCAN_eps_{eps}": DBSCAN(eps=eps, min_samples=5)
        for eps in (0.1, 0.2, 0.5)
    },
}

# One fixed classifier is passed alongside all clustering methods.
classifier = RandomForestClassifier(
    random_state=42,
    class_weight="balanced",
    n_estimators=100,
)

clustering_benchmark_df = benchmark_clustering_methods(
    X,
    y,
    test_size,
    k,
    M,
    clustering_methods,
    classifier,
)

### Investigate k wrt g-g_hat

In [None]:
# Echo the g confidence intervals, then run the varying-k benchmark.
# Unlike the other sections, this one calls the
# benchmark_kmeans_with_varying_k_condidence_g_hat helper and passes literal
# settings (test_size=0.2, n_runs=10, confidence_level=0.95) instead of the
# notebook-level test_size variable.
print("confidence_intervals_g", confidence_intervals_g)

df_results = benchmark_kmeans_with_varying_k_condidence_g_hat(
    X,
    y,
    M,
    test_size=0.2,
    n_runs=10,
    confidence_level=0.95,
    confidence_intervals_g=confidence_intervals_g,
    clf=None,
)

### Feature Level Visualization: Understand how clustering alters the structure of the feature space

In [None]:
# Re-run a single g_hat experiment to get the clustered training data and
# labels, then plot them with plot_tsne_with_label_changes.
# The class name is read from cfg_dict, the config dict unpacked from
# get_X_y_arrays in this section's Start cell. No `cfg` name is bound in the
# visible cells, so `cfg.data.class_name` would raise a NameError.
i = 0
_, X_train_cluster, y_train_cluster, y_clusters = run_g_hat_experiment(
    X, y, test_size, k, M, fixed_seed, fixed_seed, i + 1
)
print("Plotting")
plot_tsne_with_label_changes(
    X_train_cluster,
    y_train_cluster,
    y_clusters,
    class_name=cfg_dict["data"]["class_name"],
)

# Spatial Split

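### Start

_Note: unlike the hold-out sections above, this section has no cell that calls `get_X_y_arrays`. As written, `model`, `datamodule`, and the dataloaders still hold whatever the previous section (Annual Crop) loaded; load the spatial-split config and checkpoint here before running the cells below._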

### Downsample Benchmark

In [None]:
# Downsample benchmark (Spatial Split): extract features once per entry in
# downsample_methods, run the g experiment on them and store the result in
# downsample_benchmark under "<datamodule class name>_<method>".
# NOTE(review): X and y are rebound on every pass, so later cells use the
# arrays from the final iteration — confirm that is intended.
for downsample in downsample_methods:
    print(f"Running experiments for {downsample} downsample method.")
    X, y, test_property_lengths = create_feature_matrix_and_labels(
        model=model,
        dm=datamodule,
        device=device,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        getitem_keys=getitem_keys,
        layer_names=collect_activations_from_layers,
        downsample_method=downsample,
        n_batches_to_process=n_batches_to_process,
        verbose=verbose,
    )

    print("Run g experiment for downsample method:", downsample)
    g_benchmark = run_g_experiment(
        X,
        y,
        split_seed,
        test_size,
        n_estimators,
        fixed_classifier_seed,
        clf=None,
    )
    dict_key = "_".join((datamodule.__class__.__name__, downsample))
    downsample_benchmark[dict_key] = g_benchmark

In [None]:
# Plot the collected downsample benchmark. The loop above keys its results by
# datamodule.__class__.__name__ in this section (not datamodule.class_name), so
# the same identifier is passed here to keep the plot argument and keys in sync.
plot_downsample_benchmark(downsample_benchmark, datamodule.__class__.__name__)

### Layer Benchmark

In [None]:
# Layer benchmark (Spatial Split): extract features from each entry of
# selected_layers using the section-wide downsample_method, run the g
# experiment and store the result in layer_benchmark under
# "<datamodule class name>_<layer>".
# NOTE(review): X and y are rebound on every pass, so later cells use the
# arrays from the final layer processed — confirm that is intended.
for lyr in selected_layers:
    print(f"Running experiments for layer {lyr}")
    X, y, test_property_lengths = create_feature_matrix_and_labels(
        model=model,
        dm=datamodule,
        device=device,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        getitem_keys=getitem_keys,
        layer_names=lyr,
        downsample_method=downsample_method,
        n_batches_to_process=n_batches_to_process,
        verbose=verbose,
    )
    g_benchmark = run_g_experiment(
        X,
        y,
        split_seed,
        test_size,
        n_estimators,
        fixed_classifier_seed,
        clf=None,
    )

    dict_key = "_".join((datamodule.__class__.__name__, str(lyr)))
    layer_benchmark[dict_key] = g_benchmark

In [None]:
# Plot the collected layer benchmark. The loop above keys its results by
# datamodule.__class__.__name__ in this section (not datamodule.class_name), so
# the same identifier is passed here to keep the plot argument and keys in sync.
plot_layer_benchmark(layer_benchmark, all_layer_names, datamodule.__class__.__name__)

### g confidence interval

In [None]:
# Run the repeated g experiment (random_seed=True) on the current X, y.
# This overwrites the g_benchmark value left behind by the benchmark loops.
g_benchmark = run_multiple_experiments_g(
    X, y, test_size, n_estimators, random_seed=True
)

In [None]:
# Confidence intervals of the g baseline metrics listed below; the bare name on
# the last line makes the notebook display the result.
columns_of_interest = [
    "baseline_accuracy",
    "baseline_fpr95",
    "baseline_roc_auc",
]
confidence_intervals_g = calculate_confidence_intervals(g_benchmark, columns_of_interest)
confidence_intervals_g

### g_hat

In [None]:
# Run the Optuna study for g_hat on the current X, y. The trial count and the
# cluster/fraction limits are variables defined outside this cell (presumably
# the search bounds — verify against run_optuna_study).
results_df_g_hat = run_optuna_study(
    X,
    y,
    n_optuna_trials,
    test_size,
    min_cluster,
    max_cluster_ratio,
    min_fraction,
    max_fraction,
    n_estimators,
    fixed_seed,
)

In [None]:
# Settings for the repeated g_hat runs (Spatial Split).
# N is set explicitly here, as in every other section; the original cell used
# N without assigning it and so silently depended on an earlier section's cell
# having been run.
# M comes from the first record of the Optuna results; k is fixed at 30% of
# len(X) rather than read from the study.
N = 10
results_dict = results_df_g_hat.to_dict(orient="records")
k = int(len(X) * 0.3)
M = results_dict[0]["M"]
print("k, M, len(X), k/len(X)", k, M, len(X), k / len(X))

g_hat_benchmark = run_multiple_experiments_g_hat(X, y, test_size, k, M, N, fixed_seed)

### g g_hat mean std + ttest

In [None]:
# Compare the repeated g and g_hat results with perform_benchmark_analysis;
# the bare name on the last line displays the outcome in the notebook.
t_tests_results = perform_benchmark_analysis(g_benchmark, g_hat_benchmark)
t_tests_results

### Benchmark g

In [None]:
# Candidate classifiers for the g benchmark (Spatial Split). Every estimator
# that takes a seed uses random_state=42.
# NOTE(review): the key "RandomForestUnblanaced" is misspelled; it is a runtime
# label and is left untouched here.
classifiers = {
    "RandomForestUnblanaced": RandomForestClassifier(
        n_estimators=100,
        random_state=42,
    ),
    "RandomForest": RandomForestClassifier(
        n_estimators=100,
        random_state=42,
        class_weight="balanced",
    ),
    "SVC": SVC(
        probability=True,
        random_state=42,
        class_weight="balanced",
    ),
    "KNeighbors": KNeighborsClassifier(),
    "LogisticRegression": LogisticRegression(
        max_iter=500,
        random_state=42,
        class_weight="balanced",
    ),
    "DecisionTree": DecisionTreeClassifier(
        random_state=42,
        class_weight="balanced",
    ),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "ExtraTrees": ExtraTreesClassifier(
        n_estimators=100,
        random_state=42,
        class_weight="balanced",
    ),
    "GaussianNB": GaussianNB(),
}

# k and M are the values set in the g_hat cell above.
classifier_benchmark_df = benchmark_classifiers(X, y, test_size, k, M, classifiers, fixed_seed)

### Benchmark clustering

In [None]:
# Clustering methods to compare (Spatial Split): one KMeans configuration plus
# DBSCAN at three eps values with min_samples fixed at 5. The comprehension
# yields the keys "DBSCAN_eps_0.1", "DBSCAN_eps_0.2", "DBSCAN_eps_0.5" in order.
clustering_methods = {
    "KMeans": KMeans(n_clusters=3, init="k-means++", random_state=42),
    **{
        f"DBSCAN_eps_{eps}": DBSCAN(eps=eps, min_samples=5)
        for eps in (0.1, 0.2, 0.5)
    },
}

# One fixed classifier is passed alongside all clustering methods.
classifier = RandomForestClassifier(
    random_state=42,
    class_weight="balanced",
    n_estimators=100,
)

clustering_benchmark_df = benchmark_clustering_methods(
    X,
    y,
    test_size,
    k,
    M,
    clustering_methods,
    classifier,
)

### Investigate k wrt g-g_hat

In [None]:
# Echo the g confidence intervals, then run the varying-k KMeans benchmark with
# the current M; the intervals are forwarded to the helper as a keyword argument.
print("confidence_intervals_g: ", confidence_intervals_g)

df_results = benchmark_kmeans_with_varying_k(
    X, y, M, confidence_intervals_g=confidence_intervals_g
)

### Feature Level Visualization: Understand how clustering alters the structure of the feature space

In [None]:
# Re-run a single g_hat experiment to get the clustered training data and
# labels, then plot them with plot_tsne_with_label_changes.
# NOTE(review): `cfg` is not bound in any visible cell (get_X_y_arrays unpacks
# into `cfg_dict`), and this section has no loading cell of its own — confirm
# which class_name the spatial-split plot should receive.
i = 0
_, X_train_cluster, y_train_cluster, y_clusters = run_g_hat_experiment(
    X, y, test_size, k, M, fixed_seed, fixed_seed, i + 1
)
print("Plotting")
plot_tsne_with_label_changes(
    X_train_cluster, y_train_cluster, y_clusters, class_name=cfg.data.class_name
)