Copyright (c) 2024 Microsoft Corporation.

Licensed under the MIT License

Experimenting with EuroSAT dataset under 11 settings:
- Semantic Shift: Leave-one-class-out. Train on 9 classes and test on all 10 classes
- Covariate Shift: Longitude-wise split. Train on West and test on East 

For each, we perform: 
- activation extraction
- downsample benchmarking
- layer benchmarking
- g training and evaluation
- g_hat training and evaluation
- g and g_hat statistical significance test
- g benchmark
- clustering benchmark
- num_cluster vs. g and g_hat performance investigation
- activation space visualization    

In [4]:
import os
import sys

import torch

sys.path.append("..")
from src.tardis.eurosat_xbd_utils import *
from src.tardis.utils import *

# Report the accelerator setup. The per-device queries raise when no CUDA
# device is present, so they are only issued when CUDA is available.
print("CUDA available:", torch.cuda.is_available())
print("Number of GPUs:", torch.cuda.device_count())
if torch.cuda.is_available():
    print("Current GPU:", torch.cuda.current_device())
    print("GPU Name:", torch.cuda.get_device_name(0))

%load_ext autoreload
%autoreload 2

# Seed value shared by the experiments below; set_seed comes from the
# project's star imports.
fixed_seed = 31
set_seed(fixed_seed)

CUDA available: True
Number of GPUs: 2
Current GPU: 0
GPU Name: NVIDIA A100 80GB PCIe


In [None]:
# Root directories for the trained checkpoints and the experiment configs
ckpt_main_dir = "./exp_data/main_tardis/eurosat_exp_logs"
config_main_dir = "./geospatial-ood-detection/configs"


def construct_paths(main_dir, sub_dir, filename):
    """Join a root directory, a sub-directory and a file name into one path."""
    parts = (main_dir, sub_dir, filename)
    return os.path.join(*parts)


# One row per experiment:
# (key, config file, checkpoint run directory, checkpoint file)
_experiment_specs = [
    ("forest", "eurosat_holdout_forest.yaml",
     "Holdout_Forest_resnet50_0066", "epoch=21-step=4972.ckpt"),
    ("herb_veg", "eurosat_holdout_herbaceousvegetation.yaml",
     "Holdout_HerbaceousVegetation_resnet50_0066", "epoch=27-step=6328.ckpt"),
    ("highway", "eurosat_holdout_highway.yaml",
     "Holdout_Highway_resnet50_0066", "epoch=28-step=6670.ckpt"),
    ("industrial", "eurosat_holdout_industrial.yaml",
     "Holdout_Industrial_resnet50_0066", "epoch=32-step=7590.ckpt"),
    ("pasture", "eurosat_holdout_pasture.yaml",
     "Holdout_Pasture_resnet50_0066", "epoch=34-step=8225.ckpt"),
    ("permanentcrop", "eurosat_holdout_permanentcrop.yaml",
     "Holdout_PermanentCrop_resnet50_0066", "epoch=26-step=6210.ckpt"),
    ("residential", "eurosat_holdout_residential.yaml",
     "Holdout_Residential_resnet50_0066", "epoch=49-step=11250.ckpt"),
    ("river", "eurosat_holdout_river.yaml",
     "Holdout_River_resnet50_0066", "epoch=31-step=7392.ckpt"),
    ("sealake", "eurosat_holdout_sealake.yaml",
     "Holdout_SeaLake_resnet50_0066", "epoch=34-step=7875.ckpt"),
    ("annualcrop", "eurosat_holdout_annualcrop.yaml",
     "Holdout_AnnualCrop_resnet50_0066", "epoch=15-step=3616.ckpt"),
    ("spatial_split", "eurosat_spatial_config.yaml",
     "eurosat_spatial_0776", "epoch=18-step=4826.ckpt"),
]

# Config and checkpoint path for every experiment, keyed by experiment name
paths = {
    name: {
        "config": construct_paths(config_main_dir, "eurosat", config_file),
        "ckpt": construct_paths(ckpt_main_dir, run_dir, ckpt_file),
    }
    for name, config_file, run_dir, ckpt_file in _experiment_specs
}

# Activation-extraction settings passed to get_X_y_arrays
layer = ["conv1"]
downsample_method = "avg_pool"
getitem_keys = ["image", "label"]
device = "cuda" if torch.cuda.is_available() else "cpu"
n_batches_to_process = 2

# Downsampling methods and benchmarks.
# Each method is listed exactly once: a repeated entry would re-run the same
# seeded experiment and overwrite the same "<class>_<method>" result key.
downsample_methods = ["avg_pool", "mean_std", "max_pool", "nodownsample"]
downsample_benchmark = {}
layer_benchmark = {}

# Settings for the benchmark loops and the g experiments
collect_activations_from_layers = ["conv1"]
verbose = False
test_size = 0.2
n_estimators = 100
split_seed = 31
fixed_classifier_seed = 31

# Settings for the Optuna study used for g_hat
n_optuna_trials = 20
min_cluster = 2
max_cluster_ratio = 0.3
min_fraction = 0.01
max_fraction = 0.2
fixed_seed = 31

# Echo every resolved config/checkpoint path so typos are easy to spot
for key, path in paths.items():
    for kind in ("config", "ckpt"):
        print(f"{key} {kind} path: {path[kind]}")

# Forest

### Start

In [None]:
# Call get_X_y_arrays with the Forest hold-out config and checkpoint. The
# returned tuple supplies X, y, the model, datamodule, dataloaders and config
# used by the cells of this section.
# With collect_aug_input=True the result is unpacked into ten values; the two
# extra ones are x_sample_train_tensor and x_sample_test_tensor.
collect_aug_input = True

(
    X,
    y,
    model,
    datamodule,
    train_dataloader,
    val_dataloader,
    test_dataloader,
    cfg_dict,
    x_sample_train_tensor,
    x_sample_test_tensor,
) = get_X_y_arrays(
    paths["forest"]["config"],
    paths["forest"]["ckpt"],
    layer,
    downsample_method,
    getitem_keys,
    device,
    n_batches_to_process,
    mode="holdout",
    verbose=False,
    collect_aug_input=collect_aug_input,
)

In [None]:
# Convert the augmented-input samples to tensors on the configured device.
# Using `device` instead of a hard-coded "cuda" keeps this cell runnable on
# CPU-only machines, consistent with how `device` is chosen above.
x_sample_train_tensor = torch.Tensor(x_sample_train_tensor).to(device)
x_sample_test_tensor = torch.Tensor(x_sample_test_tensor).to(device)
print(x_sample_train_tensor.shape, x_sample_test_tensor.shape)

### Downsample Benchmark

In [None]:
# Benchmark g once per downsampling method: rebuild the feature matrix with
# that method, run run_g_experiment with the fixed split/classifier seeds and
# store the result under "<class_name>_<method>".
# NOTE(review): X and y are reassigned on every iteration, so after this loop
# they hold the features of the last entry in downsample_methods, not the ones
# returned by get_X_y_arrays — confirm that later cells are meant to use them.
for downsample in downsample_methods:
    print(f"Running experiments for {downsample} downsample method.")
    X, y, test_property_lengths = create_feature_matrix_and_labels(
        model=model,
        dm=datamodule,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        layer_names=collect_activations_from_layers,
        device=device,
        getitem_keys=getitem_keys,
        n_batches_to_process=n_batches_to_process,
        downsample_method=downsample,
        verbose=verbose,
    )
    print("Run g experiment for downsample method:", downsample)
    g_benchmark = run_g_experiment(
        X, y, split_seed, test_size, n_estimators, fixed_classifier_seed, clf=None
    )
    dict_key = datamodule.class_name + "_" + downsample
    downsample_benchmark[dict_key] = g_benchmark

In [None]:
# Plot the collected downsample benchmark results for the current class.
plot_downsample_benchmark(downsample_benchmark, datamodule.class_name)

### Layer Benchmark

In [None]:
# Collect the model's layer names and pick 10 of them (presumably at random,
# per the helper's name) for the layer benchmark.
all_layer_names = get_all_layer_names(model)

# NOTE(review): conv_layers is computed but not used in this cell; the pick
# below draws from all_layer_names — confirm whether conv_layers was intended.
conv_layers = [layer for layer in all_layer_names if "conv" in layer]
selected_layers = pick_random_layers(all_layer_names, 10)
# Wrap each name in a list because the benchmark loop passes it as layer_names.
selected_layers = [[layer] for layer in selected_layers]

In [None]:
# Benchmark g once per selected layer (each entry of selected_layers is a
# one-element list of layer names), always with downsample_method, and store
# the result under "<class_name>_<str(layer list)>".
# NOTE(review): X and y are reassigned on every iteration, so the cells below
# operate on the features of the last layer in selected_layers — confirm this
# is intended.
for lyr in selected_layers:
    print(f"Running experiments for layer {lyr}")
    X, y, test_property_lengths = create_feature_matrix_and_labels(
        model=model,
        dm=datamodule,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        layer_names=lyr,
        device=device,
        getitem_keys=getitem_keys,
        n_batches_to_process=n_batches_to_process,
        downsample_method=downsample_method,
        verbose=verbose,
    )

    g_benchmark = run_g_experiment(
        X, y, split_seed, test_size, n_estimators, fixed_classifier_seed, clf=None
    )
    dict_key = datamodule.class_name + "_" + str(lyr)

    layer_benchmark[dict_key] = g_benchmark

In [None]:
# Plot the collected layer benchmark results for the current class.
plot_layer_benchmark(layer_benchmark, all_layer_names, datamodule.class_name)

### g confidence interval

In [None]:
# Repeated g experiments on the current X, y; this replaces the g_benchmark
# left over from the benchmark loops. random_seed=True presumably varies the
# seed between runs — confirm in run_multiple_experiments_g.
g_benchmark = run_multiple_experiments_g(
    X, y, test_size, n_estimators, random_seed=True
)

In [None]:
# Calculate confidence intervals for the specified columns
columns_of_interest = ["baseline_accuracy", "baseline_fpr95", "baseline_roc_auc"]
confidence_intervals_g = calculate_confidence_intervals(
    g_benchmark, columns_of_interest
)
# Bare expression: displayed as the cell output.
confidence_intervals_g

### g_hat

In [None]:
# Run the Optuna study on X, y with the trial count and cluster/fraction
# bounds from the parameters cell. The result is consumed below through
# DataFrame.to_dict(orient="records").
results_df_g_hat = run_optuna_study(
    X,
    y,
    n_optuna_trials,
    test_size,
    min_cluster,
    max_cluster_ratio,
    min_fraction,
    max_fraction,
    n_estimators,
    fixed_seed,
)

In [None]:
# N is passed to run_multiple_experiments_g_hat (presumably the number of
# repetitions — confirm).
N = 10
results_dict = results_df_g_hat.to_dict(orient="records")
# k is fixed at 30% of the samples; it is not taken from the Optuna results.
k = int(len(X) * 0.3)
# NOTE(review): M comes from the first result row — presumably the best trial;
# confirm that results_df_g_hat is sorted accordingly.
M = results_dict[0]["M"]
print("k, M, len(X), k/len(X)", k, M, len(X), k / len(X))

g_hat_benchmark = run_multiple_experiments_g_hat(X, y, test_size, k, M, N, fixed_seed)

### g g_hat mean std + ttest

In [None]:
# Compare the g and g_hat benchmark results; the bare name on the last line
# is displayed as the cell output.
t_tests_results = perform_benchmark_analysis(g_benchmark, g_hat_benchmark)
t_tests_results

### Benchmark g

In [None]:
# Candidate classifiers for g. Models that accept class weighting share the
# same balanced/seeded keyword set.
balanced_seeded = {"class_weight": "balanced", "random_state": 42}
classifiers = {
    "RandomForestUnblanaced": RandomForestClassifier(n_estimators=100, random_state=42),
    "RandomForest": RandomForestClassifier(n_estimators=100, **balanced_seeded),
    "SVC": SVC(probability=True, **balanced_seeded),
    "KNeighbors": KNeighborsClassifier(),
    "LogisticRegression": LogisticRegression(max_iter=500, **balanced_seeded),
    "DecisionTree": DecisionTreeClassifier(**balanced_seeded),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "ExtraTrees": ExtraTreesClassifier(n_estimators=100, **balanced_seeded),
    "GaussianNB": GaussianNB(),
}

# Benchmark every candidate with the k and M chosen in the g_hat cell
classifier_benchmark_df = benchmark_classifiers(
    X, y, test_size, k, M, classifiers, fixed_seed
)

### Benchmark clustering

In [None]:
# Clustering candidates: one KMeans baseline plus DBSCAN at three eps values
clustering_methods = {
    "KMeans": KMeans(n_clusters=3, init="k-means++", random_state=42),
    **{
        f"DBSCAN_eps_{eps}": DBSCAN(eps=eps, min_samples=5)
        for eps in (0.1, 0.2, 0.5)
    },
}

# The classifier is held fixed so that only the clustering method varies
classifier = RandomForestClassifier(
    n_estimators=100, class_weight="balanced", random_state=42
)

clustering_benchmark_df = benchmark_clustering_methods(
    X, y, test_size, k, M, clustering_methods, classifier
)

### Investigate k wrt g-g_hat

In [None]:
# Show the g confidence intervals, then benchmark KMeans-based g_hat over
# varying k against them; the plot is saved under the class name.
print("confidence_intervals_g", confidence_intervals_g)

# NOTE(review): test_size is hard-coded to 0.2 here instead of using the
# test_size variable (currently also 0.2).
df_results = benchmark_kmeans_with_varying_k_condidence_g_hat(
    X,
    y,
    M,
    test_size=0.2,
    n_runs=10,
    confidence_level=0.95,
    confidence_intervals_g=confidence_intervals_g,
    clf=None,
    save_plot=True,
    fname=cfg_dict["data"]["class_name"],
)

### Feature Level Visualization: Understand how clustering alters the structure of the feature space

In [None]:
# Run a single g_hat experiment and plot a t-SNE of its training features.
# i is only used to derive the last positional argument (i + 1 == 1).
i = 0
_, X_train_cluster, y_train_cluster, y_clusters = run_g_hat_experiment(
    X, y, test_size, k, M, fixed_seed, fixed_seed, i + 1
)
print("Plotting")
plot_tsne_with_label_changes(
    X_train_cluster,
    y_train_cluster,
    y_clusters,
    class_name=cfg_dict["data"]["class_name"],
    save_plot=True,
)

# HerbaceousVegetation

### Start

In [None]:
# Call get_X_y_arrays with the HerbaceousVegetation hold-out config and
# checkpoint; this rebinds X, y, model, datamodule, the dataloaders and
# cfg_dict for the cells of this section.
# NOTE(review): collect_aug_input is not passed and eight values are unpacked
# (the Forest cell unpacks ten) — presumably the return arity depends on that
# flag; confirm against get_X_y_arrays.
X, y, model, datamodule, train_dataloader, val_dataloader, test_dataloader, cfg_dict = (
    get_X_y_arrays(
        paths["herb_veg"]["config"],
        paths["herb_veg"]["ckpt"],
        layer,
        downsample_method,
        getitem_keys,
        device,
        n_batches_to_process,
        mode="holdout",
        verbose=False,
    )
)

### Downsample Benchmark

In [None]:
# Benchmark g once per downsampling method: rebuild the feature matrix with
# that method, run run_g_experiment with the fixed split/classifier seeds and
# store the result under "<class_name>_<method>".
# NOTE(review): X and y are reassigned on every iteration, so after this loop
# they hold the features of the last entry in downsample_methods, not the ones
# returned by get_X_y_arrays — confirm that later cells are meant to use them.
for downsample in downsample_methods:
    print(f"Running experiments for {downsample} downsample method.")
    X, y, test_property_lengths = create_feature_matrix_and_labels(
        model=model,
        dm=datamodule,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        layer_names=collect_activations_from_layers,
        device=device,
        getitem_keys=getitem_keys,
        n_batches_to_process=n_batches_to_process,
        downsample_method=downsample,
        verbose=verbose,
    )
    print("Run g experiment for downsample method:", downsample)
    g_benchmark = run_g_experiment(
        X, y, split_seed, test_size, n_estimators, fixed_classifier_seed, clf=None
    )
    dict_key = datamodule.class_name + "_" + downsample
    downsample_benchmark[dict_key] = g_benchmark

In [None]:
# Plot the downsample benchmark for the current class. downsample_benchmark
# also still holds the entries of previously processed classes (keys are
# prefixed with the class name).
plot_downsample_benchmark(downsample_benchmark, datamodule.class_name)

### Layer Benchmark

In [None]:
# Benchmark g once per selected layer, always with downsample_method, and
# store the result under "<class_name>_<str(layer list)>".
# NOTE(review): selected_layers was built from the Forest model's layer names
# in an earlier cell — presumably all checkpoints share the same layer names;
# verify.
# NOTE(review): X and y are reassigned on every iteration, so the cells below
# operate on the features of the last layer in selected_layers — confirm this
# is intended.
for lyr in selected_layers:
    print(f"Running experiments for layer {lyr}")
    X, y, test_property_lengths = create_feature_matrix_and_labels(
        model=model,
        dm=datamodule,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        layer_names=lyr,
        device=device,
        getitem_keys=getitem_keys,
        n_batches_to_process=n_batches_to_process,
        downsample_method=downsample_method,
        verbose=verbose,
    )

    g_benchmark = run_g_experiment(
        X, y, split_seed, test_size, n_estimators, fixed_classifier_seed, clf=None
    )
    dict_key = datamodule.class_name + "_" + str(lyr)

    layer_benchmark[dict_key] = g_benchmark

In [None]:
# Plot the layer benchmark for the current class; all_layer_names is the list
# computed in the Forest section.
plot_layer_benchmark(layer_benchmark, all_layer_names, datamodule.class_name)

### g confidence interval

In [None]:
# Repeated g experiments on the current X, y; this replaces the g_benchmark
# left over from the benchmark loops. random_seed=True presumably varies the
# seed between runs — confirm in run_multiple_experiments_g.
g_benchmark = run_multiple_experiments_g(
    X, y, test_size, n_estimators, random_seed=True
)

In [None]:
# Calculate confidence intervals for the specified columns
columns_of_interest = ["baseline_accuracy", "baseline_fpr95", "baseline_roc_auc"]
confidence_intervals_g = calculate_confidence_intervals(
    g_benchmark, columns_of_interest
)
# Bare expression: displayed as the cell output.
confidence_intervals_g

### g_hat

In [None]:
# Run the Optuna study on X, y with the trial count and cluster/fraction
# bounds from the parameters cell. The result is consumed below through
# DataFrame.to_dict(orient="records").
results_df_g_hat = run_optuna_study(
    X,
    y,
    n_optuna_trials,
    test_size,
    min_cluster,
    max_cluster_ratio,
    min_fraction,
    max_fraction,
    n_estimators,
    fixed_seed,
)

In [None]:
# N is passed to run_multiple_experiments_g_hat (presumably the number of
# repetitions — confirm).
N = 10
results_dict = results_df_g_hat.to_dict(orient="records")
# k is fixed at 30% of the samples; it is not taken from the Optuna results.
k = int(len(X) * 0.3)
# NOTE(review): M comes from the first result row — presumably the best trial;
# confirm that results_df_g_hat is sorted accordingly.
M = results_dict[0]["M"]
print("k, M, len(X), k/len(X)", k, M, len(X), k / len(X))

g_hat_benchmark = run_multiple_experiments_g_hat(X, y, test_size, k, M, N, fixed_seed)

### g g_hat mean std + ttest

In [None]:
# Compare the g and g_hat benchmark results; the bare name on the last line
# is displayed as the cell output.
t_tests_results = perform_benchmark_analysis(g_benchmark, g_hat_benchmark)
t_tests_results

### Benchmark g

In [None]:
# Candidate classifiers for g. Models that accept class weighting share the
# same balanced/seeded keyword set.
balanced_seeded = {"class_weight": "balanced", "random_state": 42}
classifiers = {
    "RandomForestUnblanaced": RandomForestClassifier(n_estimators=100, random_state=42),
    "RandomForest": RandomForestClassifier(n_estimators=100, **balanced_seeded),
    "SVC": SVC(probability=True, **balanced_seeded),
    "KNeighbors": KNeighborsClassifier(),
    "LogisticRegression": LogisticRegression(max_iter=500, **balanced_seeded),
    "DecisionTree": DecisionTreeClassifier(**balanced_seeded),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "ExtraTrees": ExtraTreesClassifier(n_estimators=100, **balanced_seeded),
    "GaussianNB": GaussianNB(),
}

# Benchmark every candidate with the k and M chosen in the g_hat cell
classifier_benchmark_df = benchmark_classifiers(
    X, y, test_size, k, M, classifiers, fixed_seed
)

### Benchmark clustering

In [None]:
# Clustering candidates: one KMeans baseline plus DBSCAN at three eps values
clustering_methods = {
    "KMeans": KMeans(n_clusters=3, init="k-means++", random_state=42),
    **{
        f"DBSCAN_eps_{eps}": DBSCAN(eps=eps, min_samples=5)
        for eps in (0.1, 0.2, 0.5)
    },
}

# The classifier is held fixed so that only the clustering method varies
classifier = RandomForestClassifier(
    n_estimators=100, class_weight="balanced", random_state=42
)

clustering_benchmark_df = benchmark_clustering_methods(
    X, y, test_size, k, M, clustering_methods, classifier
)

### Investigate k wrt g-g_hat

In [None]:
# Show the g confidence intervals, then benchmark KMeans-based g_hat over
# varying k against them; the plot is saved under the class name.
print("confidence_intervals_g: ", confidence_intervals_g)

# NOTE(review): test_size is hard-coded to 0.2 here instead of using the
# test_size variable (currently also 0.2).
df_results = benchmark_kmeans_with_varying_k_condidence_g_hat(
    X,
    y,
    M,
    test_size=0.2,
    n_runs=10,
    confidence_level=0.95,
    confidence_intervals_g=confidence_intervals_g,
    clf=None,
    save_plot=True,
    fname=cfg_dict["data"]["class_name"],
)

### Feature Level Visualization: Understand how clustering alters the structure of the feature space

In [None]:
# Run a single g_hat experiment and plot a t-SNE of its training features.
# i is only used to derive the last positional argument (i + 1 == 1).
i = 0
_, X_train_cluster, y_train_cluster, y_clusters = run_g_hat_experiment(
    X, y, test_size, k, M, fixed_seed, fixed_seed, i + 1
)
print("Plotting")
# Subscript access matches every other cfg_dict lookup in this notebook and
# also works when cfg_dict is a plain dict (attribute access would fail).
plot_tsne_with_label_changes(
    X_train_cluster,
    y_train_cluster,
    y_clusters,
    class_name=cfg_dict["data"]["class_name"],
    save_plot=True,
)

# Highway

### Start

In [None]:
# Call get_X_y_arrays with the Highway hold-out config and checkpoint; this
# rebinds X, y, model, datamodule, the dataloaders and cfg_dict for the cells
# of this section.
# NOTE(review): collect_aug_input is not passed and eight values are unpacked
# (the Forest cell unpacks ten) — presumably the return arity depends on that
# flag; confirm against get_X_y_arrays.
X, y, model, datamodule, train_dataloader, val_dataloader, test_dataloader, cfg_dict = (
    get_X_y_arrays(
        paths["highway"]["config"],
        paths["highway"]["ckpt"],
        layer,
        downsample_method,
        getitem_keys,
        device,
        n_batches_to_process,
        mode="holdout",
        verbose=False,
    )
)

### Downsample Benchmark

In [None]:
# Benchmark g once per downsampling method: rebuild the feature matrix with
# that method, run run_g_experiment with the fixed split/classifier seeds and
# store the result under "<class_name>_<method>".
# NOTE(review): X and y are reassigned on every iteration, so after this loop
# they hold the features of the last entry in downsample_methods, not the ones
# returned by get_X_y_arrays — confirm that later cells are meant to use them.
for downsample in downsample_methods:
    print(f"Running experiments for {downsample} downsample method.")
    X, y, test_property_lengths = create_feature_matrix_and_labels(
        model=model,
        dm=datamodule,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        layer_names=collect_activations_from_layers,
        device=device,
        getitem_keys=getitem_keys,
        n_batches_to_process=n_batches_to_process,
        downsample_method=downsample,
        verbose=verbose,
    )
    print("Run g experiment for downsample method:", downsample)
    g_benchmark = run_g_experiment(
        X, y, split_seed, test_size, n_estimators, fixed_classifier_seed, clf=None
    )
    dict_key = datamodule.class_name + "_" + downsample
    downsample_benchmark[dict_key] = g_benchmark

In [None]:
# Plot the downsample benchmark for the current class. downsample_benchmark
# also still holds the entries of previously processed classes (keys are
# prefixed with the class name).
plot_downsample_benchmark(downsample_benchmark, datamodule.class_name)

### Layer Benchmark

In [None]:
# Benchmark g once per selected layer, always with downsample_method, and
# store the result under "<class_name>_<str(layer list)>".
# NOTE(review): selected_layers was built from the Forest model's layer names
# in an earlier cell — presumably all checkpoints share the same layer names;
# verify.
# NOTE(review): X and y are reassigned on every iteration, so the cells below
# operate on the features of the last layer in selected_layers — confirm this
# is intended.
for lyr in selected_layers:
    print(f"Running experiments for layer {lyr}")
    X, y, test_property_lengths = create_feature_matrix_and_labels(
        model=model,
        dm=datamodule,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        layer_names=lyr,
        device=device,
        getitem_keys=getitem_keys,
        n_batches_to_process=n_batches_to_process,
        downsample_method=downsample_method,
        verbose=verbose,
    )

    g_benchmark = run_g_experiment(
        X, y, split_seed, test_size, n_estimators, fixed_classifier_seed, clf=None
    )
    dict_key = datamodule.class_name + "_" + str(lyr)

    layer_benchmark[dict_key] = g_benchmark

In [None]:
# Plot the layer benchmark for the current class; all_layer_names is the list
# computed in the Forest section.
plot_layer_benchmark(layer_benchmark, all_layer_names, datamodule.class_name)

### g confidence interval

In [None]:
# Repeated g experiments on the current X, y; this replaces the g_benchmark
# left over from the benchmark loops. random_seed=True presumably varies the
# seed between runs — confirm in run_multiple_experiments_g.
g_benchmark = run_multiple_experiments_g(
    X, y, test_size, n_estimators, random_seed=True
)

In [None]:
# Calculate confidence intervals for the specified columns
columns_of_interest = ["baseline_accuracy", "baseline_fpr95", "baseline_roc_auc"]
confidence_intervals_g = calculate_confidence_intervals(
    g_benchmark, columns_of_interest
)
# Bare expression: displayed as the cell output.
confidence_intervals_g

### g_hat

In [None]:
# Run the Optuna study on X, y with the trial count and cluster/fraction
# bounds from the parameters cell. The result is consumed below through
# DataFrame.to_dict(orient="records").
results_df_g_hat = run_optuna_study(
    X,
    y,
    n_optuna_trials,
    test_size,
    min_cluster,
    max_cluster_ratio,
    min_fraction,
    max_fraction,
    n_estimators,
    fixed_seed,
)

In [None]:
# N is passed to run_multiple_experiments_g_hat (presumably the number of
# repetitions — confirm).
N = 10
results_dict = results_df_g_hat.to_dict(orient="records")
# k is fixed at 30% of the samples; it is not taken from the Optuna results.
k = int(len(X) * 0.3)
# NOTE(review): M comes from the first result row — presumably the best trial;
# confirm that results_df_g_hat is sorted accordingly.
M = results_dict[0]["M"]
print("k, M, len(X), k/len(X)", k, M, len(X), k / len(X))

g_hat_benchmark = run_multiple_experiments_g_hat(X, y, test_size, k, M, N, fixed_seed)

### g g_hat mean std + ttest

In [None]:
# Compare the g and g_hat benchmark results; the bare name on the last line
# is displayed as the cell output.
t_tests_results = perform_benchmark_analysis(g_benchmark, g_hat_benchmark)
t_tests_results

### Benchmark g

In [None]:
# Candidate classifiers for g. Models that accept class weighting share the
# same balanced/seeded keyword set.
balanced_seeded = {"class_weight": "balanced", "random_state": 42}
classifiers = {
    "RandomForestUnblanaced": RandomForestClassifier(n_estimators=100, random_state=42),
    "RandomForest": RandomForestClassifier(n_estimators=100, **balanced_seeded),
    "SVC": SVC(probability=True, **balanced_seeded),
    "KNeighbors": KNeighborsClassifier(),
    "LogisticRegression": LogisticRegression(max_iter=500, **balanced_seeded),
    "DecisionTree": DecisionTreeClassifier(**balanced_seeded),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "ExtraTrees": ExtraTreesClassifier(n_estimators=100, **balanced_seeded),
    "GaussianNB": GaussianNB(),
}

# Benchmark every candidate with the k and M chosen in the g_hat cell
classifier_benchmark_df = benchmark_classifiers(
    X, y, test_size, k, M, classifiers, fixed_seed
)

### Benchmark clustering

In [None]:
# Clustering candidates: one KMeans baseline plus DBSCAN at three eps values
clustering_methods = {
    "KMeans": KMeans(n_clusters=3, init="k-means++", random_state=42),
    **{
        f"DBSCAN_eps_{eps}": DBSCAN(eps=eps, min_samples=5)
        for eps in (0.1, 0.2, 0.5)
    },
}

# The classifier is held fixed so that only the clustering method varies
classifier = RandomForestClassifier(
    n_estimators=100, class_weight="balanced", random_state=42
)

clustering_benchmark_df = benchmark_clustering_methods(
    X, y, test_size, k, M, clustering_methods, classifier
)

### Investigate k wrt g-g_hat

In [None]:
# Show the g confidence intervals, then benchmark KMeans-based g_hat over
# varying k against them; the plot is saved under the class name.
print("confidence_intervals_g: ", confidence_intervals_g)

# NOTE(review): test_size is hard-coded to 0.2 here instead of using the
# test_size variable (currently also 0.2).
df_results = benchmark_kmeans_with_varying_k_condidence_g_hat(
    X,
    y,
    M,
    test_size=0.2,
    n_runs=10,
    confidence_level=0.95,
    confidence_intervals_g=confidence_intervals_g,
    clf=None,
    save_plot=True,
    fname=cfg_dict["data"]["class_name"],
)

### Feature Level Visualization: Understand how clustering alters the structure of the feature space

In [None]:
# Run a single g_hat experiment and plot a t-SNE of its training features.
# i is only used to derive the last positional argument (i + 1 == 1).
i = 0
_, X_train_cluster, y_train_cluster, y_clusters = run_g_hat_experiment(
    X, y, test_size, k, M, fixed_seed, fixed_seed, i + 1
)
print("Plotting")
# Subscript access matches every other cfg_dict lookup in this notebook and
# also works when cfg_dict is a plain dict (attribute access would fail).
plot_tsne_with_label_changes(
    X_train_cluster,
    y_train_cluster,
    y_clusters,
    class_name=cfg_dict["data"]["class_name"],
    save_plot=True,
)

# Industrial

### Start

In [None]:
# Call get_X_y_arrays with the Industrial hold-out config and checkpoint; this
# rebinds X, y, model, datamodule, the dataloaders and cfg_dict for the cells
# of this section.
# NOTE(review): collect_aug_input is not passed and eight values are unpacked
# (the Forest cell unpacks ten) — presumably the return arity depends on that
# flag; confirm against get_X_y_arrays.
X, y, model, datamodule, train_dataloader, val_dataloader, test_dataloader, cfg_dict = (
    get_X_y_arrays(
        paths["industrial"]["config"],
        paths["industrial"]["ckpt"],
        layer,
        downsample_method,
        getitem_keys,
        device,
        n_batches_to_process,
        mode="holdout",
        verbose=False,
    )
)

### Downsample Benchmark

In [None]:
# Benchmark g once per downsampling method: rebuild the feature matrix with
# that method, run run_g_experiment with the fixed split/classifier seeds and
# store the result under "<class_name>_<method>".
# NOTE(review): X and y are reassigned on every iteration, so after this loop
# they hold the features of the last entry in downsample_methods, not the ones
# returned by get_X_y_arrays — confirm that later cells are meant to use them.
for downsample in downsample_methods:
    print(f"Running experiments for {downsample} downsample method.")
    X, y, test_property_lengths = create_feature_matrix_and_labels(
        model=model,
        dm=datamodule,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        layer_names=collect_activations_from_layers,
        device=device,
        getitem_keys=getitem_keys,
        n_batches_to_process=n_batches_to_process,
        downsample_method=downsample,
        verbose=verbose,
    )
    print("Run g experiment for downsample method:", downsample)
    g_benchmark = run_g_experiment(
        X, y, split_seed, test_size, n_estimators, fixed_classifier_seed, clf=None
    )
    dict_key = datamodule.class_name + "_" + downsample
    downsample_benchmark[dict_key] = g_benchmark

In [None]:
# Plot the downsample benchmark for the current class. downsample_benchmark
# also still holds the entries of previously processed classes (keys are
# prefixed with the class name).
plot_downsample_benchmark(downsample_benchmark, datamodule.class_name)

### Layer Benchmark

In [None]:
# Benchmark g once per selected layer, always with downsample_method, and
# store the result under "<class_name>_<str(layer list)>".
# NOTE(review): selected_layers was built from the Forest model's layer names
# in an earlier cell — presumably all checkpoints share the same layer names;
# verify.
# NOTE(review): X and y are reassigned on every iteration, so the cells below
# operate on the features of the last layer in selected_layers — confirm this
# is intended.
for lyr in selected_layers:
    print(f"Running experiments for layer {lyr}")
    X, y, test_property_lengths = create_feature_matrix_and_labels(
        model=model,
        dm=datamodule,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        layer_names=lyr,
        device=device,
        getitem_keys=getitem_keys,
        n_batches_to_process=n_batches_to_process,
        downsample_method=downsample_method,
        verbose=verbose,
    )

    g_benchmark = run_g_experiment(
        X, y, split_seed, test_size, n_estimators, fixed_classifier_seed, clf=None
    )
    dict_key = datamodule.class_name + "_" + str(lyr)

    layer_benchmark[dict_key] = g_benchmark

In [None]:
# Plot the layer benchmark for the current class; all_layer_names is the list
# computed in the Forest section.
plot_layer_benchmark(layer_benchmark, all_layer_names, datamodule.class_name)

### g confidence interval

In [None]:
# Repeated g experiments on the current X, y; this replaces the g_benchmark
# left over from the benchmark loops. random_seed=True presumably varies the
# seed between runs — confirm in run_multiple_experiments_g.
g_benchmark = run_multiple_experiments_g(
    X, y, test_size, n_estimators, random_seed=True
)

In [None]:
# Calculate confidence intervals for the specified columns
columns_of_interest = ["baseline_accuracy", "baseline_fpr95", "baseline_roc_auc"]
confidence_intervals_g = calculate_confidence_intervals(
    g_benchmark, columns_of_interest
)
# Bare expression: displayed as the cell output.
confidence_intervals_g

### g_hat

In [None]:
# Run the Optuna study on X, y with the trial count and cluster/fraction
# bounds from the parameters cell. The result is consumed below through
# DataFrame.to_dict(orient="records").
results_df_g_hat = run_optuna_study(
    X,
    y,
    n_optuna_trials,
    test_size,
    min_cluster,
    max_cluster_ratio,
    min_fraction,
    max_fraction,
    n_estimators,
    fixed_seed,
)

In [None]:
# N is passed to run_multiple_experiments_g_hat (presumably the number of
# repetitions — confirm).
N = 10
results_dict = results_df_g_hat.to_dict(orient="records")
# k is fixed at 30% of the samples; it is not taken from the Optuna results.
k = int(len(X) * 0.3)
# NOTE(review): M comes from the first result row — presumably the best trial;
# confirm that results_df_g_hat is sorted accordingly.
M = results_dict[0]["M"]
print("k, M, len(X), k/len(X)", k, M, len(X), k / len(X))

g_hat_benchmark = run_multiple_experiments_g_hat(X, y, test_size, k, M, N, fixed_seed)

### g g_hat mean std + ttest

In [None]:
# Compare the g and g_hat benchmark results; the bare name on the last line
# is displayed as the cell output.
t_tests_results = perform_benchmark_analysis(g_benchmark, g_hat_benchmark)
t_tests_results

### Benchmark g

In [None]:
# Candidate classifiers for g. Models that accept class weighting share the
# same balanced/seeded keyword set.
balanced_seeded = {"class_weight": "balanced", "random_state": 42}
classifiers = {
    "RandomForestUnblanaced": RandomForestClassifier(n_estimators=100, random_state=42),
    "RandomForest": RandomForestClassifier(n_estimators=100, **balanced_seeded),
    "SVC": SVC(probability=True, **balanced_seeded),
    "KNeighbors": KNeighborsClassifier(),
    "LogisticRegression": LogisticRegression(max_iter=500, **balanced_seeded),
    "DecisionTree": DecisionTreeClassifier(**balanced_seeded),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "ExtraTrees": ExtraTreesClassifier(n_estimators=100, **balanced_seeded),
    "GaussianNB": GaussianNB(),
}

# Benchmark every candidate with the k and M chosen in the g_hat cell
classifier_benchmark_df = benchmark_classifiers(
    X, y, test_size, k, M, classifiers, fixed_seed
)

### Benchmark clustering

In [None]:
# Clustering candidates: one KMeans baseline plus DBSCAN at three eps values
clustering_methods = {
    "KMeans": KMeans(n_clusters=3, init="k-means++", random_state=42),
    **{
        f"DBSCAN_eps_{eps}": DBSCAN(eps=eps, min_samples=5)
        for eps in (0.1, 0.2, 0.5)
    },
}

# The classifier is held fixed so that only the clustering method varies
classifier = RandomForestClassifier(
    n_estimators=100, class_weight="balanced", random_state=42
)

clustering_benchmark_df = benchmark_clustering_methods(
    X, y, test_size, k, M, clustering_methods, classifier
)

### Investigate k wrt g-g_hat

In [None]:
# Show the g confidence intervals, then benchmark KMeans-based g_hat over
# varying k against them; the plot is saved under the class name.
print("confidence_intervals_g: ", confidence_intervals_g)

# NOTE(review): test_size is hard-coded to 0.2 here instead of using the
# test_size variable (currently also 0.2).
df_results = benchmark_kmeans_with_varying_k_condidence_g_hat(
    X,
    y,
    M,
    test_size=0.2,
    n_runs=10,
    confidence_level=0.95,
    confidence_intervals_g=confidence_intervals_g,
    clf=None,
    save_plot=True,
    fname=cfg_dict["data"]["class_name"],
)

### Feature Level Visualization: Understand how clustering alters the structure of the feature space

In [None]:
# Run a single g_hat experiment and plot a t-SNE of its training features.
# i is only used to derive the last positional argument (i + 1 == 1).
i = 0
_, X_train_cluster, y_train_cluster, y_clusters = run_g_hat_experiment(
    X, y, test_size, k, M, fixed_seed, fixed_seed, i + 1
)
print("Plotting")
# Subscript access matches every other cfg_dict lookup in this notebook and
# also works when cfg_dict is a plain dict (attribute access would fail).
plot_tsne_with_label_changes(
    X_train_cluster,
    y_train_cluster,
    y_clusters,
    class_name=cfg_dict["data"]["class_name"],
    save_plot=True,
)

# Pasture

### Start

In [None]:
# Call get_X_y_arrays with the Pasture hold-out config and checkpoint; this
# rebinds X, y, model, datamodule, the dataloaders and cfg_dict for the cells
# of this section.
# NOTE(review): collect_aug_input is not passed and eight values are unpacked
# (the Forest cell unpacks ten) — presumably the return arity depends on that
# flag; confirm against get_X_y_arrays.
X, y, model, datamodule, train_dataloader, val_dataloader, test_dataloader, cfg_dict = (
    get_X_y_arrays(
        paths["pasture"]["config"],
        paths["pasture"]["ckpt"],
        layer,
        downsample_method,
        getitem_keys,
        device,
        n_batches_to_process,
        mode="holdout",
        verbose=False,
    )
)

### Downsample Benchmark

In [None]:
# Benchmark g once per downsampling method: rebuild the feature matrix with
# that method, run run_g_experiment with the fixed split/classifier seeds and
# store the result under "<class_name>_<method>".
# NOTE(review): X and y are reassigned on every iteration, so after this loop
# they hold the features of the last entry in downsample_methods, not the ones
# returned by get_X_y_arrays — confirm that later cells are meant to use them.
for downsample in downsample_methods:
    print(f"Running experiments for {downsample} downsample method.")
    X, y, test_property_lengths = create_feature_matrix_and_labels(
        model=model,
        dm=datamodule,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        layer_names=collect_activations_from_layers,
        device=device,
        getitem_keys=getitem_keys,
        n_batches_to_process=n_batches_to_process,
        downsample_method=downsample,
        verbose=verbose,
    )
    print("Run g experiment for downsample method:", downsample)
    g_benchmark = run_g_experiment(
        X, y, split_seed, test_size, n_estimators, fixed_classifier_seed, clf=None
    )
    dict_key = datamodule.class_name + "_" + downsample
    downsample_benchmark[dict_key] = g_benchmark

In [None]:
# Plot the downsample benchmark for the current class. downsample_benchmark
# also still holds the entries of previously processed classes (keys are
# prefixed with the class name).
plot_downsample_benchmark(downsample_benchmark, datamodule.class_name)

### Layer Benchmark

In [None]:
# Benchmark g once per selected layer, always with downsample_method, and
# store the result under "<class_name>_<str(layer list)>".
# NOTE(review): selected_layers was built from the Forest model's layer names
# in an earlier cell — presumably all checkpoints share the same layer names;
# verify.
# NOTE(review): X and y are reassigned on every iteration, so the cells below
# operate on the features of the last layer in selected_layers — confirm this
# is intended.
for lyr in selected_layers:
    print(f"Running experiments for layer {lyr}")
    X, y, test_property_lengths = create_feature_matrix_and_labels(
        model=model,
        dm=datamodule,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        layer_names=lyr,
        device=device,
        getitem_keys=getitem_keys,
        n_batches_to_process=n_batches_to_process,
        downsample_method=downsample_method,
        verbose=verbose,
    )

    g_benchmark = run_g_experiment(
        X, y, split_seed, test_size, n_estimators, fixed_classifier_seed, clf=None
    )
    dict_key = datamodule.class_name + "_" + str(lyr)

    layer_benchmark[dict_key] = g_benchmark

In [None]:
# Plot the layer benchmark for the current class; all_layer_names is the list
# computed in the Forest section.
plot_layer_benchmark(layer_benchmark, all_layer_names, datamodule.class_name)

### g confidence interval

In [None]:
# Repeated g experiments on the current X, y; this replaces the g_benchmark
# left over from the benchmark loops. random_seed=True presumably varies the
# seed between runs — confirm in run_multiple_experiments_g.
g_benchmark = run_multiple_experiments_g(
    X, y, test_size, n_estimators, random_seed=True
)

In [None]:
# Calculate confidence intervals for the specified columns
columns_of_interest = ["baseline_accuracy", "baseline_fpr95", "baseline_roc_auc"]
confidence_intervals_g = calculate_confidence_intervals(
    g_benchmark, columns_of_interest
)
# Bare expression: displayed as the cell output.
confidence_intervals_g

### g_hat

In [None]:
# Run the Optuna study on X, y with the trial count and cluster/fraction
# bounds from the parameters cell. The result is consumed below through
# DataFrame.to_dict(orient="records").
results_df_g_hat = run_optuna_study(
    X,
    y,
    n_optuna_trials,
    test_size,
    min_cluster,
    max_cluster_ratio,
    min_fraction,
    max_fraction,
    n_estimators,
    fixed_seed,
)

In [None]:
# N is passed to run_multiple_experiments_g_hat (presumably the number of
# repetitions — confirm).
N = 10
results_dict = results_df_g_hat.to_dict(orient="records")
# k is fixed at 30% of the samples; it is not taken from the Optuna results.
k = int(len(X) * 0.3)
# NOTE(review): M comes from the first result row — presumably the best trial;
# confirm that results_df_g_hat is sorted accordingly.
M = results_dict[0]["M"]
print("k, M, len(X), k/len(X)", k, M, len(X), k / len(X))

g_hat_benchmark = run_multiple_experiments_g_hat(X, y, test_size, k, M, N, fixed_seed)

### g g_hat mean std + ttest

In [None]:
# Compare the g and g_hat benchmark results; the bare name on the last line
# is displayed as the cell output.
t_tests_results = perform_benchmark_analysis(g_benchmark, g_hat_benchmark)
t_tests_results

### Benchmark g

In [None]:
# Candidate classifiers for g. Models that accept class weighting share the
# same balanced/seeded keyword set.
balanced_seeded = {"class_weight": "balanced", "random_state": 42}
classifiers = {
    "RandomForestUnblanaced": RandomForestClassifier(n_estimators=100, random_state=42),
    "RandomForest": RandomForestClassifier(n_estimators=100, **balanced_seeded),
    "SVC": SVC(probability=True, **balanced_seeded),
    "KNeighbors": KNeighborsClassifier(),
    "LogisticRegression": LogisticRegression(max_iter=500, **balanced_seeded),
    "DecisionTree": DecisionTreeClassifier(**balanced_seeded),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "ExtraTrees": ExtraTreesClassifier(n_estimators=100, **balanced_seeded),
    "GaussianNB": GaussianNB(),
}

# Benchmark every candidate with the k and M chosen in the g_hat cell
classifier_benchmark_df = benchmark_classifiers(
    X, y, test_size, k, M, classifiers, fixed_seed
)

### Benchmark clustering

In [None]:
# Clustering methods to benchmark: one KMeans configuration and DBSCAN at
# three eps values, keyed by a display label.
clustering_methods = {
    "KMeans": KMeans(n_clusters=3, init="k-means++", random_state=42),
    "DBSCAN_eps_0.1": DBSCAN(eps=0.1, min_samples=5),
    "DBSCAN_eps_0.2": DBSCAN(eps=0.2, min_samples=5),
    "DBSCAN_eps_0.5": DBSCAN(eps=0.5, min_samples=5),
}

# The same random forest is passed for every clustering method.
classifier = RandomForestClassifier(
    n_estimators=100, class_weight="balanced", random_state=42
)

clustering_benchmark_df = benchmark_clustering_methods(
    X, y, test_size, k, M, clustering_methods, classifier
)

### Investigate k wrt g-g_hat

In [None]:
print("confidence_intervals_g", confidence_intervals_g)

# Benchmark KMeans for varying k against the g confidence intervals computed
# earlier. test_size, n_runs and confidence_level are given literally here
# instead of reusing notebook-level variables; clf=None is passed explicitly.
# The plot is saved, with the class name from cfg_dict passed as fname.
df_results = benchmark_kmeans_with_varying_k_condidence_g_hat(
    X,
    y,
    M,
    test_size=0.2,
    n_runs=10,
    confidence_level=0.95,
    confidence_intervals_g=confidence_intervals_g,
    clf=None,
    save_plot=True,
    fname=cfg_dict["data"]["class_name"],
)

### Feature Level Visualization: Understand how clustering alters the structure of the feature space

In [None]:
# Run a single g_hat experiment and keep its clustered training split for
# plotting; the first returned value is discarded. i + 1 (= 1) is passed as
# the final positional argument.
i = 0
_, X_train_cluster, y_train_cluster, y_clusters = run_g_hat_experiment(
    X, y, test_size, k, M, fixed_seed, fixed_seed, i + 1
)
print("Plotting")
# The class name comes from cfg_dict; save_plot=True is passed explicitly.
plot_tsne_with_label_changes(
    X_train_cluster,
    y_train_cluster,
    y_clusters,
    class_name=cfg_dict["data"]["class_name"],
    save_plot=True,
)

# Permanent Crop

### Start

In [None]:
# Load the config and checkpoint registered under paths["permanentcrop"] and
# unpack the eight values get_X_y_arrays returns (X, y, model, datamodule, the
# train/val/test dataloaders and cfg_dict) into notebook-level names.
# verbose is hard-coded to False here, whereas the benchmark loops pass the
# notebook-level `verbose` variable.
X, y, model, datamodule, train_dataloader, val_dataloader, test_dataloader, cfg_dict = (
    get_X_y_arrays(
        paths["permanentcrop"]["config"],
        paths["permanentcrop"]["ckpt"],
        layer,
        downsample_method,
        getitem_keys,
        device,
        n_batches_to_process,
        mode="holdout",
        verbose=False,
    )
)

### Downsample Benchmark

In [None]:
# Benchmark g once per downsampling method: rebuild X, y with that method and
# store the run_g_experiment result in downsample_benchmark under
# "<class_name>_<method>".
# NOTE(review): each iteration rebinds the notebook-level X and y, so later
# cells that read X, y see the arrays from the most recent rebuild rather than
# the ones returned by get_X_y_arrays — confirm this is intended.
for downsample in downsample_methods:
    print(f"Running experiments for {downsample} downsample method.")
    # test_property_lengths is not used in this cell.
    X, y, test_property_lengths = create_feature_matrix_and_labels(
        model=model,
        dm=datamodule,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        layer_names=collect_activations_from_layers,
        device=device,
        getitem_keys=getitem_keys,
        n_batches_to_process=n_batches_to_process,
        downsample_method=downsample,
        verbose=verbose,
    )
    print("Run g experiment for downsample method:", downsample)
    g_benchmark = run_g_experiment(
        X, y, split_seed, test_size, n_estimators, fixed_classifier_seed, clf=None
    )
    dict_key = datamodule.class_name + "_" + downsample
    downsample_benchmark[dict_key] = g_benchmark

In [None]:
# Plot the results gathered in downsample_benchmark, passing the class_name of
# the current datamodule.
plot_downsample_benchmark(downsample_benchmark, datamodule.class_name)

### Layer Benchmark

In [None]:
# Benchmark g once per selected layer: rebuild X, y from that layer (with the
# notebook-level downsample_method) and store the run_g_experiment result in
# layer_benchmark under "<class_name>_<layer>".
# NOTE(review): each iteration rebinds the notebook-level X and y, so the cells
# after this loop operate on the features of the last layer in selected_layers
# — confirm this is intended.
for lyr in selected_layers:
    print(f"Running experiments for layer {lyr}")
    # test_property_lengths is not used in this cell.
    X, y, test_property_lengths = create_feature_matrix_and_labels(
        model=model,
        dm=datamodule,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        layer_names=lyr,
        device=device,
        getitem_keys=getitem_keys,
        n_batches_to_process=n_batches_to_process,
        downsample_method=downsample_method,
        verbose=verbose,
    )

    g_benchmark = run_g_experiment(
        X, y, split_seed, test_size, n_estimators, fixed_classifier_seed, clf=None
    )
    dict_key = datamodule.class_name + "_" + str(lyr)

    layer_benchmark[dict_key] = g_benchmark

In [None]:
# Plot the results gathered in layer_benchmark, passing the list of all layer
# names and the class_name of the current datamodule.
plot_layer_benchmark(layer_benchmark, all_layer_names, datamodule.class_name)

### g confidence interval

In [None]:
# Build the g benchmark results from repeated g experiments on the current X, y.
# random_seed=True is passed explicitly; presumably each run then uses its own
# seed — TODO confirm against run_multiple_experiments_g.
g_benchmark = run_multiple_experiments_g(
    X, y, test_size, n_estimators, random_seed=True
)

In [None]:
# Calculate confidence intervals for the three baseline metric columns of
# g_benchmark.
columns_of_interest = ["baseline_accuracy", "baseline_fpr95", "baseline_roc_auc"]
confidence_intervals_g = calculate_confidence_intervals(
    g_benchmark, columns_of_interest
)
# Bare expression so the notebook displays the result as the cell output.
confidence_intervals_g

### g_hat

In [None]:
# Run the Optuna study for g_hat on X, y. The remaining positional arguments
# are notebook-level settings (number of trials, test split size, cluster and
# fraction bounds, number of estimators) followed by the fixed seed; their
# exact meaning is defined by run_optuna_study.
results_df_g_hat = run_optuna_study(
    X,
    y,
    n_optuna_trials,
    test_size,
    min_cluster,
    max_cluster_ratio,
    min_fraction,
    max_fraction,
    n_estimators,
    fixed_seed,
)

In [None]:
# N is handed to run_multiple_experiments_g_hat below (presumably the number
# of repeated runs — TODO confirm).
N = 10
# One dict per row of the Optuna results.
results_dict = results_df_g_hat.to_dict(orient="records")
# k is set to 30% of the number of samples in X.
k = int(len(X) * 0.3)
# M is read from the first row of the Optuna results.
M = results_dict[0]["M"]
print("k, M, len(X), k/len(X)", k, M, len(X), k / len(X))

g_hat_benchmark = run_multiple_experiments_g_hat(X, y, test_size, k, M, N, fixed_seed)

### g g_hat mean std + ttest

In [None]:
# Run the benchmark analysis that compares the g and g_hat results (the section
# heading describes it as mean/std plus t-test).
t_tests_results = perform_benchmark_analysis(g_benchmark, g_hat_benchmark)
# Bare expression so the notebook displays the result.
t_tests_results

### Benchmark g

In [None]:
# Candidate classifiers to benchmark, keyed by a display label. Several entries
# use class_weight="balanced"; the first entry is a random forest without it.
# NOTE(review): the key "RandomForestUnblanaced" is misspelled; it is a runtime
# label, so it is left untouched here.
classifiers = {
    "RandomForestUnblanaced": RandomForestClassifier(n_estimators=100, random_state=42),
    "RandomForest": RandomForestClassifier(
        n_estimators=100, class_weight="balanced", random_state=42
    ),
    "SVC": SVC(probability=True, class_weight="balanced", random_state=42),
    "KNeighbors": KNeighborsClassifier(),
    "LogisticRegression": LogisticRegression(
        class_weight="balanced", max_iter=500, random_state=42
    ),
    "DecisionTree": DecisionTreeClassifier(class_weight="balanced", random_state=42),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "ExtraTrees": ExtraTreesClassifier(
        n_estimators=100, class_weight="balanced", random_state=42
    ),
    "GaussianNB": GaussianNB(),
}

# The whole dict is benchmarked with the k and M chosen in the g_hat cell.
classifier_benchmark_df = benchmark_classifiers(
    X, y, test_size, k, M, classifiers, fixed_seed
)

### Benchmark clustering

In [None]:
# Clustering methods to benchmark: one KMeans configuration and DBSCAN at
# three eps values, keyed by a display label.
clustering_methods = {
    "KMeans": KMeans(n_clusters=3, init="k-means++", random_state=42),
    "DBSCAN_eps_0.1": DBSCAN(eps=0.1, min_samples=5),
    "DBSCAN_eps_0.2": DBSCAN(eps=0.2, min_samples=5),
    "DBSCAN_eps_0.5": DBSCAN(eps=0.5, min_samples=5),
}

# The same random forest is passed for every clustering method.
classifier = RandomForestClassifier(
    n_estimators=100, class_weight="balanced", random_state=42
)

clustering_benchmark_df = benchmark_clustering_methods(
    X, y, test_size, k, M, clustering_methods, classifier
)

### Investigate k wrt g-g_hat

In [None]:
print("confidence_intervals_g: ", confidence_intervals_g)

# Benchmark KMeans for varying k against the g confidence intervals computed
# earlier; every other setting is left to the helper's defaults.
# NOTE(review): other sections of this notebook call
# benchmark_kmeans_with_varying_k_condidence_g_hat with explicit test_size,
# n_runs and confidence_level for this step — confirm the shorter helper used
# here is the intended one.
df_results = benchmark_kmeans_with_varying_k(
    X, y, M, confidence_intervals_g=confidence_intervals_g
)

### Feature Level Visualization: Understand how clustering alters the structure of the feature space

In [None]:
# Run a single g_hat experiment and keep its clustered training split for
# plotting; the first returned value is discarded. i + 1 (= 1) is passed as
# the final positional argument.
i = 0
_, X_train_cluster, y_train_cluster, y_clusters = run_g_hat_experiment(
    X, y, test_size, k, M, fixed_seed, fixed_seed, i + 1
)
print("Plotting")
# The class name is read from cfg_dict, the config returned by get_X_y_arrays
# in this section's loading cell; no `cfg` object is created in this section.
plot_tsne_with_label_changes(
    X_train_cluster,
    y_train_cluster,
    y_clusters,
    class_name=cfg_dict["data"]["class_name"],
)

# Residential

### Start

In [None]:
# Load the config and checkpoint registered under paths["residential"] and
# unpack the eight values get_X_y_arrays returns (X, y, model, datamodule, the
# train/val/test dataloaders and cfg_dict) into notebook-level names.
# verbose is hard-coded to False here, whereas the benchmark loops pass the
# notebook-level `verbose` variable.
X, y, model, datamodule, train_dataloader, val_dataloader, test_dataloader, cfg_dict = (
    get_X_y_arrays(
        paths["residential"]["config"],
        paths["residential"]["ckpt"],
        layer,
        downsample_method,
        getitem_keys,
        device,
        n_batches_to_process,
        mode="holdout",
        verbose=False,
    )
)

### Downsample Benchmark

In [None]:
# Benchmark g once per downsampling method: rebuild X, y with that method and
# store the run_g_experiment result in downsample_benchmark under
# "<class_name>_<method>".
# NOTE(review): each iteration rebinds the notebook-level X and y, so later
# cells that read X, y see the arrays from the most recent rebuild rather than
# the ones returned by get_X_y_arrays — confirm this is intended.
for downsample in downsample_methods:
    print(f"Running experiments for {downsample} downsample method.")
    # test_property_lengths is not used in this cell.
    X, y, test_property_lengths = create_feature_matrix_and_labels(
        model=model,
        dm=datamodule,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        layer_names=collect_activations_from_layers,
        device=device,
        getitem_keys=getitem_keys,
        n_batches_to_process=n_batches_to_process,
        downsample_method=downsample,
        verbose=verbose,
    )
    print("Run g experiment for downsample method:", downsample)
    g_benchmark = run_g_experiment(
        X, y, split_seed, test_size, n_estimators, fixed_classifier_seed, clf=None
    )
    dict_key = datamodule.class_name + "_" + downsample
    downsample_benchmark[dict_key] = g_benchmark

In [None]:
# Plot the results gathered in downsample_benchmark, passing the class_name of
# the current datamodule.
plot_downsample_benchmark(downsample_benchmark, datamodule.class_name)

### Layer Benchmark

In [None]:
# Benchmark g once per selected layer: rebuild X, y from that layer (with the
# notebook-level downsample_method) and store the run_g_experiment result in
# layer_benchmark under "<class_name>_<layer>".
# NOTE(review): each iteration rebinds the notebook-level X and y, so the cells
# after this loop operate on the features of the last layer in selected_layers
# — confirm this is intended.
for lyr in selected_layers:
    print(f"Running experiments for layer {lyr}")
    # test_property_lengths is not used in this cell.
    X, y, test_property_lengths = create_feature_matrix_and_labels(
        model=model,
        dm=datamodule,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        layer_names=lyr,
        device=device,
        getitem_keys=getitem_keys,
        n_batches_to_process=n_batches_to_process,
        downsample_method=downsample_method,
        verbose=verbose,
    )

    g_benchmark = run_g_experiment(
        X, y, split_seed, test_size, n_estimators, fixed_classifier_seed, clf=None
    )
    dict_key = datamodule.class_name + "_" + str(lyr)

    layer_benchmark[dict_key] = g_benchmark

In [None]:
# Plot the results gathered in layer_benchmark, passing the list of all layer
# names and the class_name of the current datamodule.
plot_layer_benchmark(layer_benchmark, all_layer_names, datamodule.class_name)

### g confidence interval

In [None]:
# Build the g benchmark results from repeated g experiments on the current X, y.
# random_seed=True is passed explicitly; presumably each run then uses its own
# seed — TODO confirm against run_multiple_experiments_g.
g_benchmark = run_multiple_experiments_g(
    X, y, test_size, n_estimators, random_seed=True
)

In [None]:
# Calculate confidence intervals for the three baseline metric columns of
# g_benchmark.
columns_of_interest = ["baseline_accuracy", "baseline_fpr95", "baseline_roc_auc"]
confidence_intervals_g = calculate_confidence_intervals(
    g_benchmark, columns_of_interest
)
# Bare expression so the notebook displays the result as the cell output.
confidence_intervals_g

### g_hat

In [None]:
# Run the Optuna study for g_hat on X, y. The remaining positional arguments
# are notebook-level settings (number of trials, test split size, cluster and
# fraction bounds, number of estimators) followed by the fixed seed; their
# exact meaning is defined by run_optuna_study.
results_df_g_hat = run_optuna_study(
    X,
    y,
    n_optuna_trials,
    test_size,
    min_cluster,
    max_cluster_ratio,
    min_fraction,
    max_fraction,
    n_estimators,
    fixed_seed,
)

In [None]:
# N is handed to run_multiple_experiments_g_hat below (presumably the number
# of repeated runs — TODO confirm).
N = 10
# One dict per row of the Optuna results.
results_dict = results_df_g_hat.to_dict(orient="records")
# k is set to 30% of the number of samples in X.
k = int(len(X) * 0.3)
# M is read from the first row of the Optuna results.
M = results_dict[0]["M"]
print("k, M, len(X), k/len(X)", k, M, len(X), k / len(X))

g_hat_benchmark = run_multiple_experiments_g_hat(X, y, test_size, k, M, N, fixed_seed)

### g g_hat mean std + ttest

In [None]:
# Run the benchmark analysis that compares the g and g_hat results (the section
# heading describes it as mean/std plus t-test).
t_tests_results = perform_benchmark_analysis(g_benchmark, g_hat_benchmark)
# Bare expression so the notebook displays the result.
t_tests_results

### Benchmark g

In [None]:
# Candidate classifiers to benchmark, keyed by a display label. Several entries
# use class_weight="balanced"; the first entry is a random forest without it.
# NOTE(review): the key "RandomForestUnblanaced" is misspelled; it is a runtime
# label, so it is left untouched here.
classifiers = {
    "RandomForestUnblanaced": RandomForestClassifier(n_estimators=100, random_state=42),
    "RandomForest": RandomForestClassifier(
        n_estimators=100, class_weight="balanced", random_state=42
    ),
    "SVC": SVC(probability=True, class_weight="balanced", random_state=42),
    "KNeighbors": KNeighborsClassifier(),
    "LogisticRegression": LogisticRegression(
        class_weight="balanced", max_iter=500, random_state=42
    ),
    "DecisionTree": DecisionTreeClassifier(class_weight="balanced", random_state=42),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "ExtraTrees": ExtraTreesClassifier(
        n_estimators=100, class_weight="balanced", random_state=42
    ),
    "GaussianNB": GaussianNB(),
}

# The whole dict is benchmarked with the k and M chosen in the g_hat cell.
classifier_benchmark_df = benchmark_classifiers(
    X, y, test_size, k, M, classifiers, fixed_seed
)

### Benchmark clustering

In [None]:
# Clustering methods to benchmark: one KMeans configuration and DBSCAN at
# three eps values, keyed by a display label.
clustering_methods = {
    "KMeans": KMeans(n_clusters=3, init="k-means++", random_state=42),
    "DBSCAN_eps_0.1": DBSCAN(eps=0.1, min_samples=5),
    "DBSCAN_eps_0.2": DBSCAN(eps=0.2, min_samples=5),
    "DBSCAN_eps_0.5": DBSCAN(eps=0.5, min_samples=5),
}

# The same random forest is passed for every clustering method.
classifier = RandomForestClassifier(
    n_estimators=100, class_weight="balanced", random_state=42
)

clustering_benchmark_df = benchmark_clustering_methods(
    X, y, test_size, k, M, clustering_methods, classifier
)

### Investigate k wrt g-g_hat

In [None]:
print("confidence_intervals_g: ", confidence_intervals_g)

# Benchmark KMeans for varying k against the g confidence intervals computed
# earlier; every other setting is left to the helper's defaults.
# NOTE(review): other sections of this notebook call
# benchmark_kmeans_with_varying_k_condidence_g_hat with explicit test_size,
# n_runs and confidence_level for this step — confirm the shorter helper used
# here is the intended one.
df_results = benchmark_kmeans_with_varying_k(
    X, y, M, confidence_intervals_g=confidence_intervals_g
)

### Feature Level Visualization: Understand how clustering alters the structure of the feature space

In [None]:
# Run a single g_hat experiment and keep its clustered training split for
# plotting; the first returned value is discarded. i + 1 (= 1) is passed as
# the final positional argument.
i = 0
_, X_train_cluster, y_train_cluster, y_clusters = run_g_hat_experiment(
    X, y, test_size, k, M, fixed_seed, fixed_seed, i + 1
)
print("Plotting")
# The class name is read from cfg_dict, the config returned by get_X_y_arrays
# in this section's loading cell; no `cfg` object is created in this section.
plot_tsne_with_label_changes(
    X_train_cluster,
    y_train_cluster,
    y_clusters,
    class_name=cfg_dict["data"]["class_name"],
)

# River

### Start

In [None]:
# Load the config and checkpoint registered under paths["river"] and unpack
# the eight values get_X_y_arrays returns (X, y, model, datamodule, the
# train/val/test dataloaders and cfg_dict) into notebook-level names.
# verbose is hard-coded to False here, whereas the benchmark loops pass the
# notebook-level `verbose` variable.
X, y, model, datamodule, train_dataloader, val_dataloader, test_dataloader, cfg_dict = (
    get_X_y_arrays(
        paths["river"]["config"],
        paths["river"]["ckpt"],
        layer,
        downsample_method,
        getitem_keys,
        device,
        n_batches_to_process,
        mode="holdout",
        verbose=False,
    )
)

### Downsample Benchmark

In [None]:
# Benchmark g once per downsampling method: rebuild X, y with that method and
# store the run_g_experiment result in downsample_benchmark under
# "<class_name>_<method>".
# NOTE(review): each iteration rebinds the notebook-level X and y, so later
# cells that read X, y see the arrays from the most recent rebuild rather than
# the ones returned by get_X_y_arrays — confirm this is intended.
for downsample in downsample_methods:
    print(f"Running experiments for {downsample} downsample method.")
    # test_property_lengths is not used in this cell.
    X, y, test_property_lengths = create_feature_matrix_and_labels(
        model=model,
        dm=datamodule,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        layer_names=collect_activations_from_layers,
        device=device,
        getitem_keys=getitem_keys,
        n_batches_to_process=n_batches_to_process,
        downsample_method=downsample,
        verbose=verbose,
    )
    print("Run g experiment for downsample method:", downsample)
    g_benchmark = run_g_experiment(
        X, y, split_seed, test_size, n_estimators, fixed_classifier_seed, clf=None
    )
    dict_key = datamodule.class_name + "_" + downsample
    downsample_benchmark[dict_key] = g_benchmark

In [None]:
# Plot the results gathered in downsample_benchmark, passing the class_name of
# the current datamodule.
plot_downsample_benchmark(downsample_benchmark, datamodule.class_name)

### Layer Benchmark

In [None]:
# Benchmark g once per selected layer: rebuild X, y from that layer (with the
# notebook-level downsample_method) and store the run_g_experiment result in
# layer_benchmark under "<class_name>_<layer>".
# NOTE(review): each iteration rebinds the notebook-level X and y, so the cells
# after this loop operate on the features of the last layer in selected_layers
# — confirm this is intended.
for lyr in selected_layers:
    print(f"Running experiments for layer {lyr}")
    # test_property_lengths is not used in this cell.
    X, y, test_property_lengths = create_feature_matrix_and_labels(
        model=model,
        dm=datamodule,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        layer_names=lyr,
        device=device,
        getitem_keys=getitem_keys,
        n_batches_to_process=n_batches_to_process,
        downsample_method=downsample_method,
        verbose=verbose,
    )

    g_benchmark = run_g_experiment(
        X, y, split_seed, test_size, n_estimators, fixed_classifier_seed, clf=None
    )
    dict_key = datamodule.class_name + "_" + str(lyr)

    layer_benchmark[dict_key] = g_benchmark

In [None]:
# Plot the results gathered in layer_benchmark, passing the list of all layer
# names and the class_name of the current datamodule.
plot_layer_benchmark(layer_benchmark, all_layer_names, datamodule.class_name)

### g confidence interval

In [None]:
# Build the g benchmark results from repeated g experiments on the current X, y.
# random_seed=True is passed explicitly; presumably each run then uses its own
# seed — TODO confirm against run_multiple_experiments_g.
g_benchmark = run_multiple_experiments_g(
    X, y, test_size, n_estimators, random_seed=True
)

In [None]:
# Calculate confidence intervals for the three baseline metric columns of
# g_benchmark.
columns_of_interest = ["baseline_accuracy", "baseline_fpr95", "baseline_roc_auc"]
confidence_intervals_g = calculate_confidence_intervals(
    g_benchmark, columns_of_interest
)
# Bare expression so the notebook displays the result as the cell output.
confidence_intervals_g

### g_hat

In [None]:
# Run the Optuna study for g_hat on X, y. The remaining positional arguments
# are notebook-level settings (number of trials, test split size, cluster and
# fraction bounds, number of estimators) followed by the fixed seed; their
# exact meaning is defined by run_optuna_study.
results_df_g_hat = run_optuna_study(
    X,
    y,
    n_optuna_trials,
    test_size,
    min_cluster,
    max_cluster_ratio,
    min_fraction,
    max_fraction,
    n_estimators,
    fixed_seed,
)

In [None]:
# N is handed to run_multiple_experiments_g_hat below (presumably the number
# of repeated runs — TODO confirm).
N = 10
# One dict per row of the Optuna results.
results_dict = results_df_g_hat.to_dict(orient="records")
# k is set to 30% of the number of samples in X.
k = int(len(X) * 0.3)
# M is read from the first row of the Optuna results.
M = results_dict[0]["M"]
print("k, M, len(X), k/len(X)", k, M, len(X), k / len(X))

g_hat_benchmark = run_multiple_experiments_g_hat(X, y, test_size, k, M, N, fixed_seed)

### g g_hat mean std + ttest

In [None]:
# Run the benchmark analysis that compares the g and g_hat results (the section
# heading describes it as mean/std plus t-test).
t_tests_results = perform_benchmark_analysis(g_benchmark, g_hat_benchmark)
# Bare expression so the notebook displays the result.
t_tests_results

### Benchmark g

In [None]:
# Candidate classifiers to benchmark, keyed by a display label. Several entries
# use class_weight="balanced"; the first entry is a random forest without it.
# NOTE(review): the key "RandomForestUnblanaced" is misspelled; it is a runtime
# label, so it is left untouched here.
classifiers = {
    "RandomForestUnblanaced": RandomForestClassifier(n_estimators=100, random_state=42),
    "RandomForest": RandomForestClassifier(
        n_estimators=100, class_weight="balanced", random_state=42
    ),
    "SVC": SVC(probability=True, class_weight="balanced", random_state=42),
    "KNeighbors": KNeighborsClassifier(),
    "LogisticRegression": LogisticRegression(
        class_weight="balanced", max_iter=500, random_state=42
    ),
    "DecisionTree": DecisionTreeClassifier(class_weight="balanced", random_state=42),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "ExtraTrees": ExtraTreesClassifier(
        n_estimators=100, class_weight="balanced", random_state=42
    ),
    "GaussianNB": GaussianNB(),
}

# The whole dict is benchmarked with the k and M chosen in the g_hat cell.
classifier_benchmark_df = benchmark_classifiers(
    X, y, test_size, k, M, classifiers, fixed_seed
)

### Benchmark clustering

In [None]:
# Clustering methods to benchmark: one KMeans configuration and DBSCAN at
# three eps values, keyed by a display label.
clustering_methods = {
    "KMeans": KMeans(n_clusters=3, init="k-means++", random_state=42),
    "DBSCAN_eps_0.1": DBSCAN(eps=0.1, min_samples=5),
    "DBSCAN_eps_0.2": DBSCAN(eps=0.2, min_samples=5),
    "DBSCAN_eps_0.5": DBSCAN(eps=0.5, min_samples=5),
}

# The same random forest is passed for every clustering method.
classifier = RandomForestClassifier(
    n_estimators=100, class_weight="balanced", random_state=42
)

clustering_benchmark_df = benchmark_clustering_methods(
    X, y, test_size, k, M, clustering_methods, classifier
)

### Investigate k wrt g-g_hat

In [None]:
print("confidence_intervals_g: ", confidence_intervals_g)

# Benchmark KMeans for varying k against the g confidence intervals computed
# earlier; every other setting is left to the helper's defaults.
# NOTE(review): other sections of this notebook call
# benchmark_kmeans_with_varying_k_condidence_g_hat with explicit test_size,
# n_runs and confidence_level for this step — confirm the shorter helper used
# here is the intended one.
df_results = benchmark_kmeans_with_varying_k(
    X, y, M, confidence_intervals_g=confidence_intervals_g
)

### Feature Level Visualization: Understand how clustering alters the structure of the feature space

In [None]:
# Run a single g_hat experiment and keep its clustered training split for
# plotting; the first returned value is discarded. i + 1 (= 1) is passed as
# the final positional argument.
i = 0
_, X_train_cluster, y_train_cluster, y_clusters = run_g_hat_experiment(
    X, y, test_size, k, M, fixed_seed, fixed_seed, i + 1
)
print("Plotting")
# The class name is read from cfg_dict, the config returned by get_X_y_arrays
# in this section's loading cell; no `cfg` object is created in this section.
plot_tsne_with_label_changes(
    X_train_cluster,
    y_train_cluster,
    y_clusters,
    class_name=cfg_dict["data"]["class_name"],
)

# SeaLake

### Start

In [None]:
# Load the config and checkpoint registered under paths["sealake"] and unpack
# the eight values get_X_y_arrays returns (X, y, model, datamodule, the
# train/val/test dataloaders and cfg_dict) into notebook-level names.
# verbose is hard-coded to False here, whereas the benchmark loops pass the
# notebook-level `verbose` variable.
X, y, model, datamodule, train_dataloader, val_dataloader, test_dataloader, cfg_dict = (
    get_X_y_arrays(
        paths["sealake"]["config"],
        paths["sealake"]["ckpt"],
        layer,
        downsample_method,
        getitem_keys,
        device,
        n_batches_to_process,
        mode="holdout",
        verbose=False,
    )
)

### Downsample Benchmark

In [None]:
# Benchmark g once per downsampling method: rebuild X, y with that method and
# store the run_g_experiment result in downsample_benchmark under
# "<class_name>_<method>".
# NOTE(review): each iteration rebinds the notebook-level X and y, so later
# cells that read X, y see the arrays from the most recent rebuild rather than
# the ones returned by get_X_y_arrays — confirm this is intended.
for downsample in downsample_methods:
    print(f"Running experiments for {downsample} downsample method.")
    # test_property_lengths is not used in this cell.
    X, y, test_property_lengths = create_feature_matrix_and_labels(
        model=model,
        dm=datamodule,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        layer_names=collect_activations_from_layers,
        device=device,
        getitem_keys=getitem_keys,
        n_batches_to_process=n_batches_to_process,
        downsample_method=downsample,
        verbose=verbose,
    )
    print("Run g experiment for downsample method:", downsample)
    g_benchmark = run_g_experiment(
        X, y, split_seed, test_size, n_estimators, fixed_classifier_seed, clf=None
    )
    dict_key = datamodule.class_name + "_" + downsample
    downsample_benchmark[dict_key] = g_benchmark

In [None]:
# Plot the results gathered in downsample_benchmark, passing the class_name of
# the current datamodule.
plot_downsample_benchmark(downsample_benchmark, datamodule.class_name)

### Layer Benchmark

In [None]:
# Benchmark g once per selected layer: rebuild X, y from that layer (with the
# notebook-level downsample_method) and store the run_g_experiment result in
# layer_benchmark under "<class_name>_<layer>".
# NOTE(review): each iteration rebinds the notebook-level X and y, so the cells
# after this loop operate on the features of the last layer in selected_layers
# — confirm this is intended.
for lyr in selected_layers:
    print(f"Running experiments for layer {lyr}")
    # test_property_lengths is not used in this cell.
    X, y, test_property_lengths = create_feature_matrix_and_labels(
        model=model,
        dm=datamodule,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        layer_names=lyr,
        device=device,
        getitem_keys=getitem_keys,
        n_batches_to_process=n_batches_to_process,
        downsample_method=downsample_method,
        verbose=verbose,
    )

    g_benchmark = run_g_experiment(
        X, y, split_seed, test_size, n_estimators, fixed_classifier_seed, clf=None
    )
    dict_key = datamodule.class_name + "_" + str(lyr)

    layer_benchmark[dict_key] = g_benchmark

In [None]:
# Plot the results gathered in layer_benchmark, passing the list of all layer
# names and the class_name of the current datamodule.
plot_layer_benchmark(layer_benchmark, all_layer_names, datamodule.class_name)

### g confidence interval

In [None]:
# Build the g benchmark results from repeated g experiments on the current X, y.
# random_seed=True is passed explicitly; presumably each run then uses its own
# seed — TODO confirm against run_multiple_experiments_g.
g_benchmark = run_multiple_experiments_g(
    X, y, test_size, n_estimators, random_seed=True
)

In [None]:
# Calculate confidence intervals for the three baseline metric columns of
# g_benchmark.
columns_of_interest = ["baseline_accuracy", "baseline_fpr95", "baseline_roc_auc"]
confidence_intervals_g = calculate_confidence_intervals(
    g_benchmark, columns_of_interest
)
# Bare expression so the notebook displays the result as the cell output.
confidence_intervals_g

### g_hat

In [None]:
# Run the Optuna study for g_hat on X, y. The remaining positional arguments
# are notebook-level settings (number of trials, test split size, cluster and
# fraction bounds, number of estimators) followed by the fixed seed; their
# exact meaning is defined by run_optuna_study.
results_df_g_hat = run_optuna_study(
    X,
    y,
    n_optuna_trials,
    test_size,
    min_cluster,
    max_cluster_ratio,
    min_fraction,
    max_fraction,
    n_estimators,
    fixed_seed,
)

In [None]:
# N is handed to run_multiple_experiments_g_hat below (presumably the number
# of repeated runs — TODO confirm).
N = 10
# One dict per row of the Optuna results.
results_dict = results_df_g_hat.to_dict(orient="records")
# k is set to 30% of the number of samples in X.
k = int(len(X) * 0.3)
# M is read from the first row of the Optuna results.
M = results_dict[0]["M"]
print("k, M, len(X), k/len(X)", k, M, len(X), k / len(X))

g_hat_benchmark = run_multiple_experiments_g_hat(X, y, test_size, k, M, N, fixed_seed)

### g g_hat mean std + ttest

In [None]:
# Run the benchmark analysis that compares the g and g_hat results (the section
# heading describes it as mean/std plus t-test).
t_tests_results = perform_benchmark_analysis(g_benchmark, g_hat_benchmark)
# Bare expression so the notebook displays the result.
t_tests_results

### Benchmark g

In [None]:
# Candidate classifiers to benchmark, keyed by a display label. Several entries
# use class_weight="balanced"; the first entry is a random forest without it.
# NOTE(review): the key "RandomForestUnblanaced" is misspelled; it is a runtime
# label, so it is left untouched here.
classifiers = {
    "RandomForestUnblanaced": RandomForestClassifier(n_estimators=100, random_state=42),
    "RandomForest": RandomForestClassifier(
        n_estimators=100, class_weight="balanced", random_state=42
    ),
    "SVC": SVC(probability=True, class_weight="balanced", random_state=42),
    "KNeighbors": KNeighborsClassifier(),
    "LogisticRegression": LogisticRegression(
        class_weight="balanced", max_iter=500, random_state=42
    ),
    "DecisionTree": DecisionTreeClassifier(class_weight="balanced", random_state=42),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "ExtraTrees": ExtraTreesClassifier(
        n_estimators=100, class_weight="balanced", random_state=42
    ),
    "GaussianNB": GaussianNB(),
}

# The whole dict is benchmarked with the k and M chosen in the g_hat cell.
classifier_benchmark_df = benchmark_classifiers(
    X, y, test_size, k, M, classifiers, fixed_seed
)

### Benchmark clustering

In [None]:
# Clustering methods to benchmark: one KMeans configuration and DBSCAN at
# three eps values, keyed by a display label.
clustering_methods = {
    "KMeans": KMeans(n_clusters=3, init="k-means++", random_state=42),
    "DBSCAN_eps_0.1": DBSCAN(eps=0.1, min_samples=5),
    "DBSCAN_eps_0.2": DBSCAN(eps=0.2, min_samples=5),
    "DBSCAN_eps_0.5": DBSCAN(eps=0.5, min_samples=5),
}

# The same random forest is passed for every clustering method.
classifier = RandomForestClassifier(
    n_estimators=100, class_weight="balanced", random_state=42
)

clustering_benchmark_df = benchmark_clustering_methods(
    X, y, test_size, k, M, clustering_methods, classifier
)

### Investigate k wrt g-g_hat

In [None]:
print("confidence_intervals_g: ", confidence_intervals_g)

# Benchmark KMeans for varying k against the g confidence intervals computed
# earlier; every other setting is left to the helper's defaults.
# NOTE(review): other sections of this notebook call
# benchmark_kmeans_with_varying_k_condidence_g_hat with explicit test_size,
# n_runs and confidence_level for this step — confirm the shorter helper used
# here is the intended one.
df_results = benchmark_kmeans_with_varying_k(
    X, y, M, confidence_intervals_g=confidence_intervals_g
)

### Feature Level Visualization: Understand how clustering alters the structure of the feature space

In [None]:
# Run a single g_hat experiment and keep its clustered training split for
# plotting; the first returned value is discarded. i + 1 (= 1) is passed as
# the final positional argument.
i = 0
_, X_train_cluster, y_train_cluster, y_clusters = run_g_hat_experiment(
    X, y, test_size, k, M, fixed_seed, fixed_seed, i + 1
)
print("Plotting")
# The class name is read from cfg_dict, the config returned by get_X_y_arrays
# in this section's loading cell; no `cfg` object is created in this section.
plot_tsne_with_label_changes(
    X_train_cluster,
    y_train_cluster,
    y_clusters,
    class_name=cfg_dict["data"]["class_name"],
)

# Annual Crop

### Start

In [None]:
# Load the config and checkpoint registered under paths["annualcrop"] and
# unpack the eight values get_X_y_arrays returns (X, y, model, datamodule, the
# train/val/test dataloaders and cfg_dict) into notebook-level names.
# verbose is hard-coded to False here, whereas the benchmark loops pass the
# notebook-level `verbose` variable.
X, y, model, datamodule, train_dataloader, val_dataloader, test_dataloader, cfg_dict = (
    get_X_y_arrays(
        paths["annualcrop"]["config"],
        paths["annualcrop"]["ckpt"],
        layer,
        downsample_method,
        getitem_keys,
        device,
        n_batches_to_process,
        mode="holdout",
        verbose=False,
    )
)

### Downsample Benchmark

In [None]:
# Benchmark g once per downsampling method: rebuild X, y with that method and
# store the run_g_experiment result in downsample_benchmark under
# "<class_name>_<method>".
# NOTE(review): each iteration rebinds the notebook-level X and y, so later
# cells that read X, y see the arrays from the most recent rebuild rather than
# the ones returned by get_X_y_arrays — confirm this is intended.
for downsample in downsample_methods:
    print(f"Running experiments for {downsample} downsample method.")
    # test_property_lengths is not used in this cell.
    X, y, test_property_lengths = create_feature_matrix_and_labels(
        model=model,
        dm=datamodule,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        layer_names=collect_activations_from_layers,
        device=device,
        getitem_keys=getitem_keys,
        n_batches_to_process=n_batches_to_process,
        downsample_method=downsample,
        verbose=verbose,
    )
    print("Run g experiment for downsample method:", downsample)
    g_benchmark = run_g_experiment(
        X, y, split_seed, test_size, n_estimators, fixed_classifier_seed, clf=None
    )
    dict_key = datamodule.class_name + "_" + downsample
    downsample_benchmark[dict_key] = g_benchmark

In [None]:
# Plot the results gathered in downsample_benchmark, passing the class_name of
# the current datamodule.
plot_downsample_benchmark(downsample_benchmark, datamodule.class_name)

### Layer Benchmark

In [None]:
# Benchmark g once per selected layer: rebuild X, y from that layer (with the
# notebook-level downsample_method) and store the run_g_experiment result in
# layer_benchmark under "<class_name>_<layer>".
# NOTE(review): each iteration rebinds the notebook-level X and y, so the cells
# after this loop operate on the features of the last layer in selected_layers
# — confirm this is intended.
for lyr in selected_layers:
    print(f"Running experiments for layer {lyr}")
    # test_property_lengths is not used in this cell.
    X, y, test_property_lengths = create_feature_matrix_and_labels(
        model=model,
        dm=datamodule,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        layer_names=lyr,
        device=device,
        getitem_keys=getitem_keys,
        n_batches_to_process=n_batches_to_process,
        downsample_method=downsample_method,
        verbose=verbose,
    )

    g_benchmark = run_g_experiment(
        X, y, split_seed, test_size, n_estimators, fixed_classifier_seed, clf=None
    )
    dict_key = datamodule.class_name + "_" + str(lyr)

    layer_benchmark[dict_key] = g_benchmark

In [None]:
# Plot the results gathered in layer_benchmark, passing the list of all layer
# names and the class_name of the current datamodule.
plot_layer_benchmark(layer_benchmark, all_layer_names, datamodule.class_name)

### g confidence interval

In [None]:
# Build the g benchmark results from repeated g experiments on the current X, y.
# random_seed=True is passed explicitly; presumably each run then uses its own
# seed — TODO confirm against run_multiple_experiments_g.
g_benchmark = run_multiple_experiments_g(
    X, y, test_size, n_estimators, random_seed=True
)

In [None]:
# Calculate confidence intervals for the three baseline metric columns of
# g_benchmark.
columns_of_interest = ["baseline_accuracy", "baseline_fpr95", "baseline_roc_auc"]
confidence_intervals_g = calculate_confidence_intervals(
    g_benchmark, columns_of_interest
)
# Bare expression so the notebook displays the result as the cell output.
confidence_intervals_g

### g_hat

In [None]:
# Run the Optuna study for g_hat on X, y. The remaining positional arguments
# are notebook-level settings (number of trials, test split size, cluster and
# fraction bounds, number of estimators) followed by the fixed seed; their
# exact meaning is defined by run_optuna_study.
results_df_g_hat = run_optuna_study(
    X,
    y,
    n_optuna_trials,
    test_size,
    min_cluster,
    max_cluster_ratio,
    min_fraction,
    max_fraction,
    n_estimators,
    fixed_seed,
)

In [None]:
# N is handed to run_multiple_experiments_g_hat below (presumably the number
# of repeated runs — TODO confirm).
N = 10
# One dict per row of the Optuna results.
results_dict = results_df_g_hat.to_dict(orient="records")
# k is set to 30% of the number of samples in X.
k = int(len(X) * 0.3)
# M is read from the first row of the Optuna results.
M = results_dict[0]["M"]
print("k, M, len(X), k/len(X)", k, M, len(X), k / len(X))

g_hat_benchmark = run_multiple_experiments_g_hat(X, y, test_size, k, M, N, fixed_seed)

### g g_hat mean std + ttest

In [None]:
# Run the benchmark analysis that compares the g and g_hat results (the section
# heading describes it as mean/std plus t-test).
t_tests_results = perform_benchmark_analysis(g_benchmark, g_hat_benchmark)
# Bare expression so the notebook displays the result.
t_tests_results

### Benchmark g

In [None]:
# Candidate classifiers to benchmark, keyed by a display label. Several entries
# use class_weight="balanced"; the first entry is a random forest without it.
# NOTE(review): the key "RandomForestUnblanaced" is misspelled; it is a runtime
# label, so it is left untouched here.
classifiers = {
    "RandomForestUnblanaced": RandomForestClassifier(n_estimators=100, random_state=42),
    "RandomForest": RandomForestClassifier(
        n_estimators=100, class_weight="balanced", random_state=42
    ),
    "SVC": SVC(probability=True, class_weight="balanced", random_state=42),
    "KNeighbors": KNeighborsClassifier(),
    "LogisticRegression": LogisticRegression(
        class_weight="balanced", max_iter=500, random_state=42
    ),
    "DecisionTree": DecisionTreeClassifier(class_weight="balanced", random_state=42),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "ExtraTrees": ExtraTreesClassifier(
        n_estimators=100, class_weight="balanced", random_state=42
    ),
    "GaussianNB": GaussianNB(),
}

# The whole dict is benchmarked with the k and M chosen in the g_hat cell.
classifier_benchmark_df = benchmark_classifiers(
    X, y, test_size, k, M, classifiers, fixed_seed
)

### Benchmark clustering

In [None]:
# Clustering methods to benchmark: one KMeans configuration and DBSCAN at
# three eps values, keyed by a display label.
clustering_methods = {
    "KMeans": KMeans(n_clusters=3, init="k-means++", random_state=42),
    "DBSCAN_eps_0.1": DBSCAN(eps=0.1, min_samples=5),
    "DBSCAN_eps_0.2": DBSCAN(eps=0.2, min_samples=5),
    "DBSCAN_eps_0.5": DBSCAN(eps=0.5, min_samples=5),
}

# The same random forest is passed for every clustering method.
classifier = RandomForestClassifier(
    n_estimators=100, class_weight="balanced", random_state=42
)

clustering_benchmark_df = benchmark_clustering_methods(
    X, y, test_size, k, M, clustering_methods, classifier
)

### Investigate k wrt g-g_hat

In [None]:
print("confidence_intervals_g", confidence_intervals_g)

# Benchmark KMeans for varying k against the g confidence intervals computed
# earlier. test_size, n_runs and confidence_level are given literally here
# instead of reusing notebook-level variables; clf=None is passed explicitly.
# No save_plot or fname argument is passed in this section.
df_results = benchmark_kmeans_with_varying_k_condidence_g_hat(
    X,
    y,
    M,
    test_size=0.2,
    n_runs=10,
    confidence_level=0.95,
    confidence_intervals_g=confidence_intervals_g,
    clf=None,
)

### Feature Level Visualization: Understand how clustering alters the structure of the feature space

In [None]:
# Run a single g_hat experiment and keep its clustered training split for
# plotting; the first returned value is discarded. i + 1 (= 1) is passed as
# the final positional argument.
i = 0
_, X_train_cluster, y_train_cluster, y_clusters = run_g_hat_experiment(
    X, y, test_size, k, M, fixed_seed, fixed_seed, i + 1
)
print("Plotting")
# The class name is read from cfg_dict, the config returned by get_X_y_arrays
# in this section's loading cell; no `cfg` object is created in this section.
plot_tsne_with_label_changes(
    X_train_cluster,
    y_train_cluster,
    y_clusters,
    class_name=cfg_dict["data"]["class_name"],
)

# Spatial Split

### Start

### Downsample Benchmark

In [None]:
# Benchmark g once per downsampling method: rebuild X, y with that method and
# store the run_g_experiment result in downsample_benchmark under
# "<datamodule class name>_<method>".
# NOTE(review): this section has no cell that calls get_X_y_arrays, so model,
# datamodule and the dataloaders are whatever the previously executed section
# left behind — confirm whether a loading cell is missing under "Start".
# NOTE(review): each iteration rebinds the notebook-level X and y, so later
# cells that read X, y see the arrays from the most recent rebuild.
for downsample in downsample_methods:
    print(f"Running experiments for {downsample} downsample method.")
    # test_property_lengths is not used in this cell.
    X, y, test_property_lengths = create_feature_matrix_and_labels(
        model=model,
        dm=datamodule,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        layer_names=collect_activations_from_layers,
        device=device,
        getitem_keys=getitem_keys,
        n_batches_to_process=n_batches_to_process,
        downsample_method=downsample,
        verbose=verbose,
    )

    print("Run g experiment for downsample method:", downsample)
    g_benchmark = run_g_experiment(
        X, y, split_seed, test_size, n_estimators, fixed_classifier_seed, clf=None
    )
    # Unlike the hold-out sections, the key prefix is the datamodule's class
    # name rather than datamodule.class_name.
    dict_key = datamodule.__class__.__name__ + "_" + downsample
    downsample_benchmark[dict_key] = g_benchmark

In [None]:
# Plot the results gathered in downsample_benchmark. The label passed here is
# the datamodule's class name, the same prefix this section's benchmark loop
# uses when it builds the dictionary keys.
plot_downsample_benchmark(downsample_benchmark, datamodule.__class__.__name__)

### Layer Benchmark

In [None]:
# Benchmark g once per selected layer: rebuild X, y from that layer (with the
# notebook-level downsample_method) and store the run_g_experiment result in
# layer_benchmark under "<datamodule class name>_<layer>".
# NOTE(review): each iteration rebinds the notebook-level X and y, so the cells
# after this loop operate on the features of the last layer in selected_layers
# — confirm this is intended.
for lyr in selected_layers:
    print(f"Running experiments for layer {lyr}")
    # test_property_lengths is not used in this cell.
    X, y, test_property_lengths = create_feature_matrix_and_labels(
        model=model,
        dm=datamodule,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        layer_names=lyr,
        device=device,
        getitem_keys=getitem_keys,
        n_batches_to_process=n_batches_to_process,
        downsample_method=downsample_method,
        verbose=verbose,
    )

    g_benchmark = run_g_experiment(
        X, y, split_seed, test_size, n_estimators, fixed_classifier_seed, clf=None
    )
    # Unlike the hold-out sections, the key prefix is the datamodule's class
    # name rather than datamodule.class_name.
    dict_key = datamodule.__class__.__name__ + "_" + str(lyr)

    layer_benchmark[dict_key] = g_benchmark

In [None]:
# Plot the results gathered in layer_benchmark. The label passed here is the
# datamodule's class name, the same prefix this section's benchmark loop uses
# when it builds the dictionary keys.
plot_layer_benchmark(layer_benchmark, all_layer_names, datamodule.__class__.__name__)

### g confidence interval

In [None]:
# Build the g benchmark results from repeated g experiments on the current X, y.
# random_seed=True is passed explicitly; presumably each run then uses its own
# seed — TODO confirm against run_multiple_experiments_g.
g_benchmark = run_multiple_experiments_g(
    X, y, test_size, n_estimators, random_seed=True
)

In [None]:
# Calculate confidence intervals for the three baseline metric columns of
# g_benchmark.
columns_of_interest = ["baseline_accuracy", "baseline_fpr95", "baseline_roc_auc"]
confidence_intervals_g = calculate_confidence_intervals(
    g_benchmark, columns_of_interest
)
# Bare expression so the notebook displays the result as the cell output.
confidence_intervals_g

### g_hat

In [None]:
# Run the Optuna study for g_hat on X, y. The remaining positional arguments
# are notebook-level settings (number of trials, test split size, cluster and
# fraction bounds, number of estimators) followed by the fixed seed; their
# exact meaning is defined by run_optuna_study.
results_df_g_hat = run_optuna_study(
    X,
    y,
    n_optuna_trials,
    test_size,
    min_cluster,
    max_cluster_ratio,
    min_fraction,
    max_fraction,
    n_estimators,
    fixed_seed,
)

In [None]:
# N is set in this cell, as in the hold-out sections, so that the cell does
# not depend on an earlier section having been executed. It is handed to
# run_multiple_experiments_g_hat below (presumably the number of repeated runs
# — TODO confirm).
N = 10
# One dict per row of the Optuna results.
results_dict = results_df_g_hat.to_dict(orient="records")
# k is set to 30% of the number of samples in X.
k = int(len(X) * 0.3)
# M is read from the first row of the Optuna results.
M = results_dict[0]["M"]
print("k, M, len(X), k/len(X)", k, M, len(X), k / len(X))

g_hat_benchmark = run_multiple_experiments_g_hat(X, y, test_size, k, M, N, fixed_seed)

### g g_hat mean std + ttest

In [None]:
# Run the benchmark analysis that compares the g and g_hat results (the section
# heading describes it as mean/std plus t-test).
t_tests_results = perform_benchmark_analysis(g_benchmark, g_hat_benchmark)
# Bare expression so the notebook displays the result.
t_tests_results

### Benchmark g

In [None]:
# Candidate classifiers to benchmark, keyed by a display label. Several entries
# use class_weight="balanced"; the first entry is a random forest without it.
# NOTE(review): the key "RandomForestUnblanaced" is misspelled; it is a runtime
# label, so it is left untouched here.
classifiers = {
    "RandomForestUnblanaced": RandomForestClassifier(n_estimators=100, random_state=42),
    "RandomForest": RandomForestClassifier(
        n_estimators=100, class_weight="balanced", random_state=42
    ),
    "SVC": SVC(probability=True, class_weight="balanced", random_state=42),
    "KNeighbors": KNeighborsClassifier(),
    "LogisticRegression": LogisticRegression(
        class_weight="balanced", max_iter=500, random_state=42
    ),
    "DecisionTree": DecisionTreeClassifier(class_weight="balanced", random_state=42),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "ExtraTrees": ExtraTreesClassifier(
        n_estimators=100, class_weight="balanced", random_state=42
    ),
    "GaussianNB": GaussianNB(),
}

# The whole dict is benchmarked with the k and M chosen in the g_hat cell.
classifier_benchmark_df = benchmark_classifiers(
    X, y, test_size, k, M, classifiers, fixed_seed
)

### Benchmark clustering

In [None]:
# Clustering methods to benchmark: one KMeans configuration and DBSCAN at
# three eps values, keyed by a display label.
clustering_methods = {
    "KMeans": KMeans(n_clusters=3, init="k-means++", random_state=42),
    "DBSCAN_eps_0.1": DBSCAN(eps=0.1, min_samples=5),
    "DBSCAN_eps_0.2": DBSCAN(eps=0.2, min_samples=5),
    "DBSCAN_eps_0.5": DBSCAN(eps=0.5, min_samples=5),
}

# The same random forest is passed for every clustering method.
classifier = RandomForestClassifier(
    n_estimators=100, class_weight="balanced", random_state=42
)

clustering_benchmark_df = benchmark_clustering_methods(
    X, y, test_size, k, M, clustering_methods, classifier
)

### Investigate k wrt g-g_hat

In [None]:
print("confidence_intervals_g: ", confidence_intervals_g)

# Benchmark KMeans for varying k against the g confidence intervals computed
# earlier; every other setting is left to the helper's defaults.
# NOTE(review): other sections of this notebook call
# benchmark_kmeans_with_varying_k_condidence_g_hat with explicit test_size,
# n_runs and confidence_level for this step — confirm the shorter helper used
# here is the intended one.
df_results = benchmark_kmeans_with_varying_k(
    X, y, M, confidence_intervals_g=confidence_intervals_g
)

### Feature Level Visualization: Understand how clustering alters the structure of the feature space

In [None]:
# Run a single g_hat experiment and keep its clustered training split for
# plotting; the first returned value is discarded. i + 1 (= 1) is passed as
# the final positional argument.
i = 0
_, X_train_cluster, y_train_cluster, y_clusters = run_g_hat_experiment(
    X, y, test_size, k, M, fixed_seed, fixed_seed, i + 1
)
print("Plotting")
# No `cfg` object is created in this section. The label passed here is the
# datamodule's class name, the same label this section's benchmark loops use
# for their dictionary keys.
# NOTE(review): confirm this is the label wanted on the plot for the spatial
# split.
plot_tsne_with_label_changes(
    X_train_cluster,
    y_train_cluster,
    y_clusters,
    class_name=datamodule.__class__.__name__,
)