Copyright (c) 2024 Microsoft Corporation.

Licensed under the MIT License

Experimenting with xBD dataset under 5 settings:
- Same Type Far: nepal_flood_post_midwest_flood_post
- Same Type Close: santa-rosa-wildfire-post_woolsey-fire-post
- Different Type Far: hurricane-matthew_post_nepal_flood_post
- Different Type Close: hurricane-matthew_mexico-earthquake
- Pre Post: portugal-wildfire-pre-post

For each, we perform: 
- layer benchmarking
- activation extraction
- g training and evaluation
- g_hat training and evaluation
- g and g_hat statistical significance test
- g benchmark
- clustering benchmark
- num_cluster vs. g and g_hat performance investigation
- activation space visualization    


In [None]:
import os
import sys

import torch

sys.path.append("..")
from src.tardis.eurosat_xbd_utils import *
from src.tardis.utils import *

# Report the CUDA environment. The device-specific queries are only issued when
# CUDA is present: torch.cuda.current_device() and torch.cuda.get_device_name()
# raise on CPU-only hosts, which would abort this cell even though the notebook
# itself supports a "cpu" fallback when choosing `device`.
cuda_available = torch.cuda.is_available()
print("CUDA available:", cuda_available)
print("Number of GPUs:", torch.cuda.device_count())
if cuda_available:
    current_gpu = torch.cuda.current_device()
    print("Current GPU:", current_gpu)
    # Name the device that is actually current rather than always device 0.
    print("GPU Name:", torch.cuda.get_device_name(current_gpu))

# IPython magics: with autoreload mode 2, imported modules are reloaded before
# each cell runs, so edits to the project sources are picked up without a restart.
%load_ext autoreload
%autoreload 2

# Single seed used throughout the notebook. set_seed comes from the project
# star-imports above — presumably it seeds torch/numpy/random; TODO confirm.
fixed_seed = 31
set_seed(fixed_seed)

In [None]:
# Folder that holds the experiment YAML files (trailing slash is relied on below)
base_dir = "/ws/geospatial-ood-detection/configs/xview/"

# One YAML configuration per experimental setting
same_far_config_path = base_dir + "xview_config_samedisaster_distant.yaml"
same_close_config_path = base_dir + "xview_config_samedisaster_close.yaml"
different_far_config_path = base_dir + "xview_config_differentdisaster_distant.yaml"
different_close_config_path = base_dir + "xview_config_differentdisaster_close.yaml"
pre_post_config_path = base_dir + "xview_config_prepost.yaml"

# Use the GPU when torch can see one, otherwise fall back to the CPU
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Default arguments for the activation / feature extraction calls
verbose = False
downsample_method = "avg_pool"
n_batches_to_process = 10
getitem_keys = ["image", "mask"]
collect_activations_from_layers = ["encoder.layer2.0.conv1"]

# Downsample methods compared in the downsample benchmark. Each method appears
# exactly once: a repeated entry would only re-run the same extraction and g
# experiment and overwrite the same "<nametag>_<method>" key in
# downsample_benchmark.
downsample_methods = ["avg_pool", "mean_std", "max_pool", "nodownsample"]
# Benchmark results keyed by "<nametag>_<method>"; filled by the per-setting loops.
downsample_benchmark = {}

# Arguments shared by the g benchmark runs
test_size = 0.2
n_estimators = 100
# The data split and the classifier use the same seed value
split_seed = fixed_classifier_seed = 31
# Per-layer benchmark results keyed by "<nametag>_<layer list>"
layer_benchmark = {}

# Selected layers
# get_model_config is called as (config_path, device) at every other call site
# in this notebook; the extra positional base_dir previously passed here put
# base_dir in the position where the other calls pass `device`.
model, _ = get_model_config(same_far_config_path, device)
all_layer_names = get_all_layer_names(model)
# Each sampled name is wrapped in a one-element list because the extraction
# call receives it through its list-valued `layer_names` argument.
selected_layers = [[layer] for layer in pick_random_layers(all_layer_names, 10)]
print("Selected 10 layers:", selected_layers)

# Arguments forwarded to run_optuna_study in every setting
n_optuna_trials = 20
min_cluster, max_cluster_ratio = 2, 0.3
min_fraction, max_fraction = 0.01, 0.2
# Same value as the notebook-wide seed set in the first cell
fixed_seed = 31

# Same type - Far

In [None]:
# Load the model and config for the "Same type - Far" setting, then build the
# datamodule and its split objects (the train and test ones are passed to the
# extraction calls below as train_dataloader / test_dataloader).
model, cfg = get_model_config(same_far_config_path, device)
datamodule, datamodule_train, datamodule_val, datamodule_test = prepare_datamodule(cfg)

### Downsample Benchmark

In [None]:
# Tag for this setting, built from the first two cfg.id_ood_disaster entries:
# "<disaster_name>-<pre-post>_<disaster_name>-<pre-post>".
nametag = "_".join(
    "-".join((entry["disaster_name"], entry["pre-post"]))
    for entry in (cfg.id_ood_disaster[0], cfg.id_ood_disaster[1])
)

# Re-extract the features with every downsample method and run the g experiment on each.
for method in downsample_methods:
    print(f"Running experiments for {method} downsample method.")
    extraction_kwargs = {
        "model": model,
        "dm": datamodule,
        "train_dataloader": datamodule_train,
        "test_dataloader": datamodule_test,
        "layer_names": collect_activations_from_layers,
        "device": device,
        "getitem_keys": getitem_keys,
        "n_batches_to_process": n_batches_to_process,
        "downsample_method": method,
        "verbose": verbose,
    }
    X, y, test_property_lengths = create_feature_matrix_and_labels(**extraction_kwargs)
    print("Run g experiment for downsample method:", method)
    g_benchmark = run_g_experiment(
        X, y, split_seed, test_size, n_estimators, fixed_classifier_seed, clf=None
    )
    # Results accumulate in the notebook-wide dict under "<nametag>_<method>".
    downsample_benchmark[f"{nametag}_{method}"] = g_benchmark

In [None]:
# Plot the downsample benchmark results collected so far.
# NOTE(review): downsample_benchmark is created once for the whole notebook and
# never reset, so it can also hold entries from other settings — confirm the
# plot handles that as intended.
plot_downsample_benchmark(downsample_benchmark, nametag)

### Layer Benchmark

In [None]:
# Run the g experiment on activations taken from each sampled layer in turn.
for layer_group in selected_layers:
    print(f"Running experiments for layer {layer_group}")
    features = create_feature_matrix_and_labels(
        model=model,
        dm=datamodule,
        train_dataloader=datamodule_train,
        test_dataloader=datamodule_test,
        layer_names=layer_group,
        device=device,
        getitem_keys=getitem_keys,
        n_batches_to_process=n_batches_to_process,
        downsample_method=downsample_method,
        verbose=verbose,
    )
    X, y, test_property_lengths = features
    g_benchmark = run_g_experiment(
        X, y, split_seed, test_size, n_estimators, fixed_classifier_seed, clf=None
    )
    # Key combines the setting tag with the printed form of the layer list.
    layer_benchmark[f"{nametag}_{layer_group}"] = g_benchmark

In [None]:
# Plot the per-layer benchmark results collected so far.
# NOTE(review): layer_benchmark is created once for the whole notebook and never
# reset, so it can also hold entries from other settings — confirm this is intended.
plot_layer_benchmark(layer_benchmark, all_layer_names, nametag)

### Property extraction

In [None]:
# Final feature extraction for this setting with the default layer and
# downsample method; X and y from here feed all remaining cells of the section
# (they were overwritten repeatedly by the benchmark loops above).
X, y, test_property_lengths = create_feature_matrix_and_labels(
    model=model,
    dm=datamodule,
    train_dataloader=datamodule_train,
    test_dataloader=datamodule_test,
    layer_names=collect_activations_from_layers,
    device=device,
    getitem_keys=getitem_keys,
    n_batches_to_process=n_batches_to_process,
    downsample_method=downsample_method,
    verbose=verbose,
)

### g confidence interval

In [None]:
# Repeated g experiments; the result feeds the confidence intervals and the
# comparison against g_hat in the cells below.
# NOTE(review): random_seed=True presumably draws a different seed per run —
# confirm in run_multiple_experiments_g.
g_benchmark = run_multiple_experiments_g(
    X, y, test_size, n_estimators, random_seed=True
)

In [None]:
# Calculate confidence intervals for the specified columns of the g benchmark
# table; the bare name on the last line displays the result in the notebook.
columns_of_interest = ["baseline_accuracy", "baseline_fpr95", "baseline_roc_auc"]
confidence_intervals_g = calculate_confidence_intervals(
    g_benchmark, columns_of_interest
)
confidence_intervals_g

### g_hat

In [None]:
# Optuna search for the g_hat hyper-parameters on this setting's X, y, using the
# trial budget and search bounds from the "Optuna settings" in the setup cell.
# The result is used as a DataFrame below (it has a column "M").
results_df_g_hat = run_optuna_study(
    X,
    y,
    n_optuna_trials,
    test_size,
    min_cluster,
    max_cluster_ratio,
    min_fraction,
    max_fraction,
    n_estimators,
    fixed_seed,
)

In [None]:
# N is forwarded to run_multiple_experiments_g_hat — presumably the number of
# repeated runs; TODO confirm.
N = 10
results_dict = results_df_g_hat.to_dict(orient="records")
# k is fixed to 30% of the samples instead of being read from the study results.
# NOTE(review): 0.3 equals max_cluster_ratio from the Optuna settings — confirm
# whether that variable should be used here.
k = int(len(X) * 0.3)
# M is taken from the first row of the Optuna results — assumes the rows are
# ordered best-first; TODO confirm.
M = results_dict[0]["M"]
print("k, M, len(X), k/len(X)", k, M, len(X), k / len(X))

g_hat_benchmark = run_multiple_experiments_g_hat(X, y, test_size, k, M, N, fixed_seed)

### g g_hat mean std + ttest

In [None]:
# Compare the repeated g and g_hat runs (per the section heading: mean/std and
# t-tests); the bare name displays the result in the notebook.
t_tests_results = perform_benchmark_analysis(g_benchmark, g_hat_benchmark)
t_tests_results

### Benchmark g

In [None]:
# Candidate classifiers for the benchmark, keyed by a display label. The first
# entry is a random forest without class weighting; most others use
# class_weight="balanced". Every seeded estimator uses random_state=42.
# NOTE(review): the key "RandomForestUnblanaced" is misspelled ("Unbalanced");
# it is a runtime label, so confirm nothing depends on it before renaming.
classifiers = {
    "RandomForestUnblanaced": RandomForestClassifier(n_estimators=100, random_state=42),
    "RandomForest": RandomForestClassifier(
        n_estimators=100, class_weight="balanced", random_state=42
    ),
    "SVC": SVC(probability=True, class_weight="balanced", random_state=42),
    "KNeighbors": KNeighborsClassifier(),
    "LogisticRegression": LogisticRegression(
        class_weight="balanced", max_iter=500, random_state=42
    ),
    "DecisionTree": DecisionTreeClassifier(class_weight="balanced", random_state=42),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "ExtraTrees": ExtraTreesClassifier(
        n_estimators=100, class_weight="balanced", random_state=42
    ),
    "GaussianNB": GaussianNB(),
}

# Benchmark every classifier on the same X, y with the k and M chosen above.
classifier_benchmark_df = benchmark_classifiers(
    X, y, test_size, k, M, classifiers, fixed_seed
)

### Benchmark clustering

In [None]:
# Clustering algorithms to compare: one KMeans configuration and DBSCAN at three
# eps values (min_samples fixed to 5).
clustering_methods = {
    "KMeans": KMeans(n_clusters=3, init="k-means++", random_state=42),
    "DBSCAN_eps_0.1": DBSCAN(eps=0.1, min_samples=5),
    "DBSCAN_eps_0.2": DBSCAN(eps=0.2, min_samples=5),
    "DBSCAN_eps_0.5": DBSCAN(eps=0.5, min_samples=5),
}

# A fixed classifier
# (the same estimator object is handed to the benchmark for all clustering methods)
classifier = RandomForestClassifier(
    n_estimators=100, class_weight="balanced", random_state=42
)

clustering_benchmark_df = benchmark_clustering_methods(
    X, y, test_size, k, M, clustering_methods, classifier
)

### Investigate k wrt g-g_hat

In [None]:
# Rebuild the setting tag "<disaster_name>-<pre-post>_<disaster_name>-<pre-post>"
# so this cell can run without the downsample-benchmark cell.
nametag = "_".join(
    "-".join((entry["disaster_name"], entry["pre-post"]))
    for entry in (cfg.id_ood_disaster[0], cfg.id_ood_disaster[1])
)

print("confidence_intervals_g", confidence_intervals_g)

# Sweep the number of clusters for g_hat, passing the g confidence intervals for
# reference; the plot is saved under the setting tag.
k_sweep_kwargs = {
    "test_size": 0.2,
    "n_runs": 10,
    "confidence_level": 0.95,
    "confidence_intervals_g": confidence_intervals_g,
    "clf": None,
    "save_plot": True,
    "fname": nametag,
}
df_results = benchmark_kmeans_with_varying_k_condidence_g_hat(X, y, M, **k_sweep_kwargs)

### Feature Level Visualization: Understand how clustering alters the structure of the feature space

In [None]:
# Run one g_hat experiment (index i = 0) and keep its training features, labels
# and cluster labels for plotting; the first return value is discarded.
# NOTE(review): fixed_seed is passed twice — presumably as split seed and
# classifier seed; confirm against run_g_hat_experiment's signature.
i = 0
_, X_train_cluster, y_train_cluster, y_clusters = run_g_hat_experiment(
    X, y, test_size, k, M, fixed_seed, fixed_seed, i
)
print("Plotting")
# t-SNE view of the training features labelled with the setting tag; plot is saved.
plot_tsne_with_label_changes(
    X_train_cluster, y_train_cluster, y_clusters, class_name=nametag, save_plot=True
)

# Same type - Close

In [None]:
# Load the model and config for the "Same type - Close" setting, then build the
# datamodule and its split objects (the train and test ones are passed to the
# extraction calls below as train_dataloader / test_dataloader).
model, cfg = get_model_config(same_close_config_path, device)
datamodule, datamodule_train, datamodule_val, datamodule_test = prepare_datamodule(cfg)

### Downsample Benchmark

In [None]:
# Tag for this setting, built from the first two cfg.id_ood_disaster entries:
# "<disaster_name>-<pre-post>_<disaster_name>-<pre-post>".
nametag = "_".join(
    "-".join((entry["disaster_name"], entry["pre-post"]))
    for entry in (cfg.id_ood_disaster[0], cfg.id_ood_disaster[1])
)

# Re-extract the features with every downsample method and run the g experiment on each.
for method in downsample_methods:
    print(f"Running experiments for {method} downsample method.")
    extraction_kwargs = {
        "model": model,
        "dm": datamodule,
        "train_dataloader": datamodule_train,
        "test_dataloader": datamodule_test,
        "layer_names": collect_activations_from_layers,
        "device": device,
        "getitem_keys": getitem_keys,
        "n_batches_to_process": n_batches_to_process,
        "downsample_method": method,
        "verbose": verbose,
    }
    X, y, test_property_lengths = create_feature_matrix_and_labels(**extraction_kwargs)
    print("Run g experiment for downsample method:", method)
    g_benchmark = run_g_experiment(
        X, y, split_seed, test_size, n_estimators, fixed_classifier_seed, clf=None
    )
    # Results accumulate in the notebook-wide dict under "<nametag>_<method>".
    downsample_benchmark[f"{nametag}_{method}"] = g_benchmark

In [None]:
# Plot the downsample benchmark results collected so far.
# NOTE(review): downsample_benchmark is created once for the whole notebook and
# never reset, so it can also hold entries from other settings — confirm the
# plot handles that as intended.
plot_downsample_benchmark(downsample_benchmark, nametag)

### Layer Benchmark

In [None]:
# Run the g experiment on activations taken from each sampled layer in turn.
for layer_group in selected_layers:
    print(f"Running experiments for layer {layer_group}")
    features = create_feature_matrix_and_labels(
        model=model,
        dm=datamodule,
        train_dataloader=datamodule_train,
        test_dataloader=datamodule_test,
        layer_names=layer_group,
        device=device,
        getitem_keys=getitem_keys,
        n_batches_to_process=n_batches_to_process,
        downsample_method=downsample_method,
        verbose=verbose,
    )
    X, y, test_property_lengths = features
    g_benchmark = run_g_experiment(
        X, y, split_seed, test_size, n_estimators, fixed_classifier_seed, clf=None
    )
    # Key combines the setting tag with the printed form of the layer list.
    layer_benchmark[f"{nametag}_{layer_group}"] = g_benchmark

In [None]:
# Plot the per-layer benchmark results collected so far.
# NOTE(review): layer_benchmark is created once for the whole notebook and never
# reset, so it can also hold entries from other settings — confirm this is intended.
plot_layer_benchmark(layer_benchmark, all_layer_names, nametag)

### Property extraction

In [None]:
# Refresh the layer-name list from the model loaded for this setting.
# NOTE(review): the layer-benchmark plot earlier in this section already used
# all_layer_names as computed before this refresh — confirm the ordering is intended.
all_layer_names = get_all_layer_names(model)

In [None]:
# Reset the extraction parameters to the notebook defaults before the final
# feature extraction for this setting. n_batches_to_process simply keeps its
# current value (a self-assignment of it would be a no-op).
collect_activations_from_layers = ["encoder.layer2.0.conv1"]
device = "cuda" if torch.cuda.is_available() else "cpu"
getitem_keys = ["image", "mask"]
downsample_method = "avg_pool"
verbose = False

# X and y from here feed all remaining cells of the section.
X, y, test_property_lengths = create_feature_matrix_and_labels(
    model=model,
    dm=datamodule,
    train_dataloader=datamodule_train,
    test_dataloader=datamodule_test,
    layer_names=collect_activations_from_layers,
    device=device,
    getitem_keys=getitem_keys,
    n_batches_to_process=n_batches_to_process,
    downsample_method=downsample_method,
    verbose=verbose,
)

### g confidence interval

In [None]:
# Repeated g experiments; the result feeds the confidence intervals and the
# comparison against g_hat in the cells below.
# NOTE(review): random_seed=True presumably draws a different seed per run —
# confirm in run_multiple_experiments_g.
g_benchmark = run_multiple_experiments_g(
    X, y, test_size, n_estimators, random_seed=True
)

In [None]:
# Calculate confidence intervals for the specified columns of the g benchmark
# table; the bare name on the last line displays the result in the notebook.
columns_of_interest = ["baseline_accuracy", "baseline_fpr95", "baseline_roc_auc"]
confidence_intervals_g = calculate_confidence_intervals(
    g_benchmark, columns_of_interest
)
confidence_intervals_g

### g_hat

In [None]:
# Optuna search for the g_hat hyper-parameters on this setting's X, y, using the
# trial budget and search bounds from the "Optuna settings" in the setup cell.
# The result is used as a DataFrame below (it has a column "M").
results_df_g_hat = run_optuna_study(
    X,
    y,
    n_optuna_trials,
    test_size,
    min_cluster,
    max_cluster_ratio,
    min_fraction,
    max_fraction,
    n_estimators,
    fixed_seed,
)

In [None]:
# N is forwarded to run_multiple_experiments_g_hat — presumably the number of
# repeated runs; TODO confirm.
N = 10
results_dict = results_df_g_hat.to_dict(orient="records")
# k is fixed to 30% of the samples instead of being read from the study results.
# NOTE(review): 0.3 equals max_cluster_ratio from the Optuna settings — confirm
# whether that variable should be used here.
k = int(len(X) * 0.3)
# M is taken from the first row of the Optuna results — assumes the rows are
# ordered best-first; TODO confirm.
M = results_dict[0]["M"]
print("k, M, len(X), k/len(X)", k, M, len(X), k / len(X))

g_hat_benchmark = run_multiple_experiments_g_hat(X, y, test_size, k, M, N, fixed_seed)

### g g_hat mean std + ttest

In [None]:
# Compare the repeated g and g_hat runs (per the section heading: mean/std and
# t-tests); the bare name displays the result in the notebook.
t_tests_results = perform_benchmark_analysis(g_benchmark, g_hat_benchmark)
t_tests_results

### Benchmark g

In [None]:
# Candidate classifiers for the benchmark, keyed by a display label. The first
# entry is a random forest without class weighting; most others use
# class_weight="balanced". Every seeded estimator uses random_state=42.
# NOTE(review): the key "RandomForestUnblanaced" is misspelled ("Unbalanced");
# it is a runtime label, so confirm nothing depends on it before renaming.
classifiers = {
    "RandomForestUnblanaced": RandomForestClassifier(n_estimators=100, random_state=42),
    "RandomForest": RandomForestClassifier(
        n_estimators=100, class_weight="balanced", random_state=42
    ),
    "SVC": SVC(probability=True, class_weight="balanced", random_state=42),
    "KNeighbors": KNeighborsClassifier(),
    "LogisticRegression": LogisticRegression(
        class_weight="balanced", max_iter=500, random_state=42
    ),
    "DecisionTree": DecisionTreeClassifier(class_weight="balanced", random_state=42),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "ExtraTrees": ExtraTreesClassifier(
        n_estimators=100, class_weight="balanced", random_state=42
    ),
    "GaussianNB": GaussianNB(),
}

# Benchmark every classifier on the same X, y with the k and M chosen above.
classifier_benchmark_df = benchmark_classifiers(
    X, y, test_size, k, M, classifiers, fixed_seed
)

### Benchmark clustering

In [None]:
# Clustering algorithms to compare: one KMeans configuration and DBSCAN at three
# eps values (min_samples fixed to 5).
clustering_methods = {
    "KMeans": KMeans(n_clusters=3, init="k-means++", random_state=42),
    "DBSCAN_eps_0.1": DBSCAN(eps=0.1, min_samples=5),
    "DBSCAN_eps_0.2": DBSCAN(eps=0.2, min_samples=5),
    "DBSCAN_eps_0.5": DBSCAN(eps=0.5, min_samples=5),
}

# A fixed classifier
# (the same estimator object is handed to the benchmark for all clustering methods)
classifier = RandomForestClassifier(
    n_estimators=100, class_weight="balanced", random_state=42
)

clustering_benchmark_df = benchmark_clustering_methods(
    X, y, test_size, k, M, clustering_methods, classifier
)

### Investigate k wrt g-g_hat

In [None]:
print("confidence_intervals_g", confidence_intervals_g)

# Sweep the number of clusters for g_hat, passing the g confidence intervals for
# reference.
# NOTE(review): unlike the "Same type - Far" and "Different type - Far" sections,
# no fname (and no save_plot) is passed here, so the callee's defaults apply —
# confirm outputs of different settings cannot collide.
df_results = benchmark_kmeans_with_varying_k_condidence_g_hat(
    X,
    y,
    M,
    test_size=0.2,
    n_runs=10,
    confidence_level=0.95,
    confidence_intervals_g=confidence_intervals_g,
    clf=None,
)

### Feature Level Visualization: Understand how clustering alters the structure of the feature space

In [None]:
# Run one g_hat experiment (index i = 0) and keep its training features, labels
# and cluster labels for plotting; the first return value is discarded.
# NOTE(review): fixed_seed is passed twice — presumably as split seed and
# classifier seed; confirm against run_g_hat_experiment's signature.
i = 0
_, X_train_cluster, y_train_cluster, y_clusters = run_g_hat_experiment(
    X, y, test_size, k, M, fixed_seed, fixed_seed, i
)
print("Plotting")
# NOTE(review): this section labels the plot with the generic "Disaster" while
# the "Same type - Far" section passes class_name=nametag — confirm which is intended.
plot_tsne_with_label_changes(
    X_train_cluster, y_train_cluster, y_clusters, class_name="Disaster"
)

# Different type - Far

In [None]:
# Load the model and config for the "Different type - Far" setting, then build
# the datamodule and its split objects (the train and test ones are passed to
# the extraction calls below as train_dataloader / test_dataloader).
model, cfg = get_model_config(different_far_config_path, device)
datamodule, datamodule_train, datamodule_val, datamodule_test = prepare_datamodule(cfg)

### Downsample Benchmark

In [None]:
# Tag for this setting, built from the first two cfg.id_ood_disaster entries:
# "<disaster_name>-<pre-post>_<disaster_name>-<pre-post>".
nametag = "_".join(
    "-".join((entry["disaster_name"], entry["pre-post"]))
    for entry in (cfg.id_ood_disaster[0], cfg.id_ood_disaster[1])
)

# Re-extract the features with every downsample method and run the g experiment on each.
for method in downsample_methods:
    print(f"Running experiments for {method} downsample method.")
    extraction_kwargs = {
        "model": model,
        "dm": datamodule,
        "train_dataloader": datamodule_train,
        "test_dataloader": datamodule_test,
        "layer_names": collect_activations_from_layers,
        "device": device,
        "getitem_keys": getitem_keys,
        "n_batches_to_process": n_batches_to_process,
        "downsample_method": method,
        "verbose": verbose,
    }
    X, y, test_property_lengths = create_feature_matrix_and_labels(**extraction_kwargs)
    print("Run g experiment for downsample method:", method)
    g_benchmark = run_g_experiment(
        X, y, split_seed, test_size, n_estimators, fixed_classifier_seed, clf=None
    )
    # Results accumulate in the notebook-wide dict under "<nametag>_<method>".
    downsample_benchmark[f"{nametag}_{method}"] = g_benchmark

In [None]:
# Plot the downsample benchmark results collected so far.
# NOTE(review): downsample_benchmark is created once for the whole notebook and
# never reset, so it can also hold entries from other settings — confirm the
# plot handles that as intended.
plot_downsample_benchmark(downsample_benchmark, nametag)

### Layer Benchmark

In [None]:
# Run the g experiment on activations taken from each sampled layer in turn.
for layer_group in selected_layers:
    print(f"Running experiments for layer {layer_group}")
    features = create_feature_matrix_and_labels(
        model=model,
        dm=datamodule,
        train_dataloader=datamodule_train,
        test_dataloader=datamodule_test,
        layer_names=layer_group,
        device=device,
        getitem_keys=getitem_keys,
        n_batches_to_process=n_batches_to_process,
        downsample_method=downsample_method,
        verbose=verbose,
    )
    X, y, test_property_lengths = features
    g_benchmark = run_g_experiment(
        X, y, split_seed, test_size, n_estimators, fixed_classifier_seed, clf=None
    )
    # Key combines the setting tag with the printed form of the layer list.
    layer_benchmark[f"{nametag}_{layer_group}"] = g_benchmark

In [None]:
# Plot the per-layer benchmark results collected so far.
# NOTE(review): layer_benchmark is created once for the whole notebook and never
# reset, so it can also hold entries from other settings — confirm this is intended.
plot_layer_benchmark(layer_benchmark, all_layer_names, nametag)

### Property extraction

In [None]:
# Refresh the layer-name list from the model loaded for this setting.
# NOTE(review): the layer-benchmark plot earlier in this section already used
# all_layer_names as computed before this refresh — confirm the ordering is intended.
all_layer_names = get_all_layer_names(model)

In [None]:
# Reset the extraction parameters to the notebook defaults before the final
# feature extraction for this setting. n_batches_to_process simply keeps its
# current value (a self-assignment of it would be a no-op).
collect_activations_from_layers = ["encoder.layer2.0.conv1"]
device = "cuda" if torch.cuda.is_available() else "cpu"
getitem_keys = ["image", "mask"]
downsample_method = "avg_pool"
verbose = False

# X and y from here feed all remaining cells of the section.
X, y, test_property_lengths = create_feature_matrix_and_labels(
    model=model,
    dm=datamodule,
    train_dataloader=datamodule_train,
    test_dataloader=datamodule_test,
    layer_names=collect_activations_from_layers,
    device=device,
    getitem_keys=getitem_keys,
    n_batches_to_process=n_batches_to_process,
    downsample_method=downsample_method,
    verbose=verbose,
)

### g confidence interval

In [None]:
# Repeated g experiments; the result feeds the confidence intervals and the
# comparison against g_hat in the cells below.
# NOTE(review): random_seed=True presumably draws a different seed per run —
# confirm in run_multiple_experiments_g.
g_benchmark = run_multiple_experiments_g(
    X, y, test_size, n_estimators, random_seed=True
)

In [None]:
# Calculate confidence intervals for the specified columns of the g benchmark
# table; the bare name on the last line displays the result in the notebook.
columns_of_interest = ["baseline_accuracy", "baseline_fpr95", "baseline_roc_auc"]
confidence_intervals_g = calculate_confidence_intervals(
    g_benchmark, columns_of_interest
)
confidence_intervals_g

### g_hat

In [None]:
# Optuna search for the g_hat hyper-parameters on this setting's X, y, using the
# trial budget and search bounds from the "Optuna settings" in the setup cell.
# The result is used as a DataFrame below (it has a column "M").
results_df_g_hat = run_optuna_study(
    X,
    y,
    n_optuna_trials,
    test_size,
    min_cluster,
    max_cluster_ratio,
    min_fraction,
    max_fraction,
    n_estimators,
    fixed_seed,
)

In [None]:
# N is forwarded to run_multiple_experiments_g_hat — presumably the number of
# repeated runs; TODO confirm.
N = 10
results_dict = results_df_g_hat.to_dict(orient="records")
# k is fixed to 30% of the samples instead of being read from the study results.
# NOTE(review): 0.3 equals max_cluster_ratio from the Optuna settings — confirm
# whether that variable should be used here.
k = int(len(X) * 0.3)
# M is taken from the first row of the Optuna results — assumes the rows are
# ordered best-first; TODO confirm.
M = results_dict[0]["M"]
print("k, M, len(X), k/len(X)", k, M, len(X), k / len(X))

g_hat_benchmark = run_multiple_experiments_g_hat(X, y, test_size, k, M, N, fixed_seed)

### g g_hat mean std + ttest

In [None]:
# Compare the repeated g and g_hat runs (per the section heading: mean/std and
# t-tests); the bare name displays the result in the notebook.
t_tests_results = perform_benchmark_analysis(g_benchmark, g_hat_benchmark)
t_tests_results

### Benchmark g

In [None]:
# Candidate classifiers for the benchmark, keyed by a display label. The first
# entry is a random forest without class weighting; most others use
# class_weight="balanced". Every seeded estimator uses random_state=42.
# NOTE(review): the key "RandomForestUnblanaced" is misspelled ("Unbalanced");
# it is a runtime label, so confirm nothing depends on it before renaming.
classifiers = {
    "RandomForestUnblanaced": RandomForestClassifier(n_estimators=100, random_state=42),
    "RandomForest": RandomForestClassifier(
        n_estimators=100, class_weight="balanced", random_state=42
    ),
    "SVC": SVC(probability=True, class_weight="balanced", random_state=42),
    "KNeighbors": KNeighborsClassifier(),
    "LogisticRegression": LogisticRegression(
        class_weight="balanced", max_iter=500, random_state=42
    ),
    "DecisionTree": DecisionTreeClassifier(class_weight="balanced", random_state=42),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "ExtraTrees": ExtraTreesClassifier(
        n_estimators=100, class_weight="balanced", random_state=42
    ),
    "GaussianNB": GaussianNB(),
}

# Benchmark every classifier on the same X, y with the k and M chosen above.
classifier_benchmark_df = benchmark_classifiers(
    X, y, test_size, k, M, classifiers, fixed_seed
)

### Benchmark clustering

In [None]:
# Clustering algorithms to compare: one KMeans configuration and DBSCAN at three
# eps values (min_samples fixed to 5).
clustering_methods = {
    "KMeans": KMeans(n_clusters=3, init="k-means++", random_state=42),
    "DBSCAN_eps_0.1": DBSCAN(eps=0.1, min_samples=5),
    "DBSCAN_eps_0.2": DBSCAN(eps=0.2, min_samples=5),
    "DBSCAN_eps_0.5": DBSCAN(eps=0.5, min_samples=5),
}

# A fixed classifier
# (the same estimator object is handed to the benchmark for all clustering methods)
classifier = RandomForestClassifier(
    n_estimators=100, class_weight="balanced", random_state=42
)

clustering_benchmark_df = benchmark_clustering_methods(
    X, y, test_size, k, M, clustering_methods, classifier
)

### Investigate k wrt g-g_hat

In [None]:
# Rebuild the setting tag "<disaster_name>-<pre-post>_<disaster_name>-<pre-post>"
# so this cell can run without the downsample-benchmark cell.
nametag = "_".join(
    "-".join((entry["disaster_name"], entry["pre-post"]))
    for entry in (cfg.id_ood_disaster[0], cfg.id_ood_disaster[1])
)

print("confidence_intervals_g", confidence_intervals_g)

# Sweep the number of clusters for g_hat, passing the g confidence intervals for
# reference; the setting tag is passed as fname.
k_sweep_kwargs = {
    "test_size": 0.2,
    "n_runs": 10,
    "confidence_level": 0.95,
    "confidence_intervals_g": confidence_intervals_g,
    "clf": None,
    "fname": nametag,
}
df_results = benchmark_kmeans_with_varying_k_condidence_g_hat(X, y, M, **k_sweep_kwargs)

### Feature Level Visualization: Understand how clustering alters the structure of the feature space

In [None]:
# Run one g_hat experiment (index i = 0) and keep its training features, labels
# and cluster labels for plotting; the first return value is discarded.
# NOTE(review): fixed_seed is passed twice — presumably as split seed and
# classifier seed; confirm against run_g_hat_experiment's signature.
i = 0
_, X_train_cluster, y_train_cluster, y_clusters = run_g_hat_experiment(
    X, y, test_size, k, M, fixed_seed, fixed_seed, i
)
print("Plotting")
# t-SNE view labelled with the setting tag.
# NOTE(review): save_plot is not passed here although the "Same type - Far"
# section passes save_plot=True — confirm which is intended.
plot_tsne_with_label_changes(
    X_train_cluster, y_train_cluster, y_clusters, class_name=nametag
)

# Different type - Close

In [None]:
# Load the model and config for the "Different type - Close" setting, then build
# the datamodule and its split objects (the train and test ones are passed to
# the extraction calls below as train_dataloader / test_dataloader).
model, cfg = get_model_config(different_close_config_path, device)
datamodule, datamodule_train, datamodule_val, datamodule_test = prepare_datamodule(cfg)

### Downsample Benchmark

In [None]:
# Tag for this setting, built from the first two cfg.id_ood_disaster entries:
# "<disaster_name>-<pre-post>_<disaster_name>-<pre-post>".
nametag = "_".join(
    "-".join((entry["disaster_name"], entry["pre-post"]))
    for entry in (cfg.id_ood_disaster[0], cfg.id_ood_disaster[1])
)

# Re-extract the features with every downsample method and run the g experiment on each.
for method in downsample_methods:
    print(f"Running experiments for {method} downsample method.")
    extraction_kwargs = {
        "model": model,
        "dm": datamodule,
        "train_dataloader": datamodule_train,
        "test_dataloader": datamodule_test,
        "layer_names": collect_activations_from_layers,
        "device": device,
        "getitem_keys": getitem_keys,
        "n_batches_to_process": n_batches_to_process,
        "downsample_method": method,
        "verbose": verbose,
    }
    X, y, test_property_lengths = create_feature_matrix_and_labels(**extraction_kwargs)
    print("Run g experiment for downsample method:", method)
    g_benchmark = run_g_experiment(
        X, y, split_seed, test_size, n_estimators, fixed_classifier_seed, clf=None
    )
    # Results accumulate in the notebook-wide dict under "<nametag>_<method>".
    downsample_benchmark[f"{nametag}_{method}"] = g_benchmark

In [None]:
# Plot the downsample benchmark results collected so far.
# NOTE(review): downsample_benchmark is created once for the whole notebook and
# never reset, so it can also hold entries from other settings — confirm the
# plot handles that as intended.
plot_downsample_benchmark(downsample_benchmark, nametag)

### Layer Benchmark

In [None]:
# Run the g experiment on activations taken from each sampled layer in turn.
for layer_group in selected_layers:
    print(f"Running experiments for layer {layer_group}")
    features = create_feature_matrix_and_labels(
        model=model,
        dm=datamodule,
        train_dataloader=datamodule_train,
        test_dataloader=datamodule_test,
        layer_names=layer_group,
        device=device,
        getitem_keys=getitem_keys,
        n_batches_to_process=n_batches_to_process,
        downsample_method=downsample_method,
        verbose=verbose,
    )
    X, y, test_property_lengths = features
    g_benchmark = run_g_experiment(
        X, y, split_seed, test_size, n_estimators, fixed_classifier_seed, clf=None
    )
    # Key combines the setting tag with the printed form of the layer list.
    layer_benchmark[f"{nametag}_{layer_group}"] = g_benchmark

In [None]:
# Plot the per-layer benchmark results collected so far.
# NOTE(review): layer_benchmark is created once for the whole notebook and never
# reset, so it can also hold entries from other settings — confirm this is intended.
plot_layer_benchmark(layer_benchmark, all_layer_names, nametag)

### Property extraction

In [None]:
# Refresh the layer-name list from the model loaded for this setting.
# NOTE(review): the layer-benchmark plot earlier in this section already used
# all_layer_names as computed before this refresh — confirm the ordering is intended.
all_layer_names = get_all_layer_names(model)

In [None]:
# Reset the extraction parameters to the notebook defaults before the final
# feature extraction for this setting. n_batches_to_process simply keeps its
# current value (a self-assignment of it would be a no-op).
collect_activations_from_layers = ["encoder.layer2.0.conv1"]
device = "cuda" if torch.cuda.is_available() else "cpu"
getitem_keys = ["image", "mask"]
downsample_method = "avg_pool"
verbose = False

# X and y from here feed all remaining cells of the section.
X, y, test_property_lengths = create_feature_matrix_and_labels(
    model=model,
    dm=datamodule,
    train_dataloader=datamodule_train,
    test_dataloader=datamodule_test,
    layer_names=collect_activations_from_layers,
    device=device,
    getitem_keys=getitem_keys,
    n_batches_to_process=n_batches_to_process,
    downsample_method=downsample_method,
    verbose=verbose,
)

### g confidence interval

In [None]:
# Repeated g experiments; the result feeds the confidence intervals and the
# comparison against g_hat in the cells below.
# NOTE(review): random_seed=True presumably draws a different seed per run —
# confirm in run_multiple_experiments_g.
g_benchmark = run_multiple_experiments_g(
    X, y, test_size, n_estimators, random_seed=True
)

In [None]:
# Calculate confidence intervals for the specified columns of the g benchmark
# table; the bare name on the last line displays the result in the notebook.
columns_of_interest = ["baseline_accuracy", "baseline_fpr95", "baseline_roc_auc"]
confidence_intervals_g = calculate_confidence_intervals(
    g_benchmark, columns_of_interest
)
confidence_intervals_g

### g_hat

In [None]:
# Optuna search for the g_hat hyper-parameters on this setting's X, y, using the
# trial budget and search bounds from the "Optuna settings" in the setup cell.
# The result is used as a DataFrame below (it has a column "M").
results_df_g_hat = run_optuna_study(
    X,
    y,
    n_optuna_trials,
    test_size,
    min_cluster,
    max_cluster_ratio,
    min_fraction,
    max_fraction,
    n_estimators,
    fixed_seed,
)

In [None]:
# N is forwarded to run_multiple_experiments_g_hat — presumably the number of
# repeated runs; TODO confirm.
N = 10
results_dict = results_df_g_hat.to_dict(orient="records")
# k is fixed to 30% of the samples instead of being read from the study results.
# NOTE(review): 0.3 equals max_cluster_ratio from the Optuna settings — confirm
# whether that variable should be used here.
k = int(len(X) * 0.3)
# M is taken from the first row of the Optuna results — assumes the rows are
# ordered best-first; TODO confirm.
M = results_dict[0]["M"]
print("k, M, len(X), k/len(X)", k, M, len(X), k / len(X))

g_hat_benchmark = run_multiple_experiments_g_hat(X, y, test_size, k, M, N, fixed_seed)

### g g_hat mean std + ttest

In [None]:
# Compare the repeated g and g_hat runs (per the section heading: mean/std and
# t-tests); the bare name displays the result in the notebook.
t_tests_results = perform_benchmark_analysis(g_benchmark, g_hat_benchmark)
t_tests_results

### Benchmark g

In [None]:
# Candidate classifiers for the benchmark, keyed by a display label. The first
# entry is a random forest without class weighting; most others use
# class_weight="balanced". Every seeded estimator uses random_state=42.
# NOTE(review): the key "RandomForestUnblanaced" is misspelled ("Unbalanced");
# it is a runtime label, so confirm nothing depends on it before renaming.
classifiers = {
    "RandomForestUnblanaced": RandomForestClassifier(n_estimators=100, random_state=42),
    "RandomForest": RandomForestClassifier(
        n_estimators=100, class_weight="balanced", random_state=42
    ),
    "SVC": SVC(probability=True, class_weight="balanced", random_state=42),
    "KNeighbors": KNeighborsClassifier(),
    "LogisticRegression": LogisticRegression(
        class_weight="balanced", max_iter=500, random_state=42
    ),
    "DecisionTree": DecisionTreeClassifier(class_weight="balanced", random_state=42),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "ExtraTrees": ExtraTreesClassifier(
        n_estimators=100, class_weight="balanced", random_state=42
    ),
    "GaussianNB": GaussianNB(),
}

# Benchmark every classifier on the same X, y with the k and M chosen above.
classifier_benchmark_df = benchmark_classifiers(
    X, y, test_size, k, M, classifiers, fixed_seed
)

### Benchmark clustering

In [None]:
# Clustering algorithms to compare: one KMeans configuration and DBSCAN at three
# eps values (min_samples fixed to 5).
clustering_methods = {
    "KMeans": KMeans(n_clusters=3, init="k-means++", random_state=42),
    "DBSCAN_eps_0.1": DBSCAN(eps=0.1, min_samples=5),
    "DBSCAN_eps_0.2": DBSCAN(eps=0.2, min_samples=5),
    "DBSCAN_eps_0.5": DBSCAN(eps=0.5, min_samples=5),
}

# A fixed classifier
# (the same estimator object is handed to the benchmark for all clustering methods)
classifier = RandomForestClassifier(
    n_estimators=100, class_weight="balanced", random_state=42
)

clustering_benchmark_df = benchmark_clustering_methods(
    X, y, test_size, k, M, clustering_methods, classifier
)

### Investigate k wrt g-g_hat

In [None]:
print("confidence_intervals_g", confidence_intervals_g)

# Sweep the number of clusters for g_hat, passing the g confidence intervals for
# reference.
# NOTE(review): unlike the "Same type - Far" and "Different type - Far" sections,
# no fname (and no save_plot) is passed here, so the callee's defaults apply —
# confirm outputs of different settings cannot collide.
df_results = benchmark_kmeans_with_varying_k_condidence_g_hat(
    X,
    y,
    M,
    test_size=0.2,
    n_runs=10,
    confidence_level=0.95,
    confidence_intervals_g=confidence_intervals_g,
    clf=None,
)

### Feature Level Visualization: Understand how clustering alters the structure of the feature space

In [None]:
# Run one g_hat experiment (index i = 0) and keep its training features, labels
# and cluster labels for plotting; the first return value is discarded.
# NOTE(review): fixed_seed is passed twice — presumably as split seed and
# classifier seed; confirm against run_g_hat_experiment's signature.
i = 0
_, X_train_cluster, y_train_cluster, y_clusters = run_g_hat_experiment(
    X, y, test_size, k, M, fixed_seed, fixed_seed, i
)
print("Plotting")
# NOTE(review): this section labels the plot with the generic "Disaster" while
# the "Same type - Far" section passes class_name=nametag — confirm which is intended.
plot_tsne_with_label_changes(
    X_train_cluster, y_train_cluster, y_clusters, class_name="Disaster"
)

# Pre Post

In [None]:
# Load the model and config for the "Pre Post" setting, then build the
# datamodule and its split objects (the train and test ones are passed to the
# extraction calls below as train_dataloader / test_dataloader).
model, cfg = get_model_config(pre_post_config_path, device)
datamodule, datamodule_train, datamodule_val, datamodule_test = prepare_datamodule(cfg)

### Downsample Benchmark

In [None]:
# Tag for this setting, built from the first two cfg.id_ood_disaster entries:
# "<disaster_name>-<pre-post>_<disaster_name>-<pre-post>".
nametag = "_".join(
    "-".join((entry["disaster_name"], entry["pre-post"]))
    for entry in (cfg.id_ood_disaster[0], cfg.id_ood_disaster[1])
)

# Re-extract the features with every downsample method and run the g experiment on each.
for method in downsample_methods:
    print(f"Running experiments for {method} downsample method.")
    extraction_kwargs = {
        "model": model,
        "dm": datamodule,
        "train_dataloader": datamodule_train,
        "test_dataloader": datamodule_test,
        "layer_names": collect_activations_from_layers,
        "device": device,
        "getitem_keys": getitem_keys,
        "n_batches_to_process": n_batches_to_process,
        "downsample_method": method,
        "verbose": verbose,
    }
    X, y, test_property_lengths = create_feature_matrix_and_labels(**extraction_kwargs)
    print("Run g experiment for downsample method:", method)
    g_benchmark = run_g_experiment(
        X, y, split_seed, test_size, n_estimators, fixed_classifier_seed, clf=None
    )
    # Results accumulate in the notebook-wide dict under "<nametag>_<method>".
    downsample_benchmark[f"{nametag}_{method}"] = g_benchmark

In [None]:
# Plot the downsample benchmark results collected so far.
# NOTE(review): downsample_benchmark is created once for the whole notebook and
# never reset, so it can also hold entries from other settings — confirm the
# plot handles that as intended.
plot_downsample_benchmark(downsample_benchmark, nametag)

### Layer Benchmark

In [None]:
# Run the g experiment on activations taken from each sampled layer in turn.
for layer_group in selected_layers:
    print(f"Running experiments for layer {layer_group}")
    features = create_feature_matrix_and_labels(
        model=model,
        dm=datamodule,
        train_dataloader=datamodule_train,
        test_dataloader=datamodule_test,
        layer_names=layer_group,
        device=device,
        getitem_keys=getitem_keys,
        n_batches_to_process=n_batches_to_process,
        downsample_method=downsample_method,
        verbose=verbose,
    )
    X, y, test_property_lengths = features
    g_benchmark = run_g_experiment(
        X, y, split_seed, test_size, n_estimators, fixed_classifier_seed, clf=None
    )
    # Key combines the setting tag with the printed form of the layer list.
    layer_benchmark[f"{nametag}_{layer_group}"] = g_benchmark

In [None]:
# Plot the per-layer benchmark results collected so far.
# NOTE(review): layer_benchmark is created once for the whole notebook and never
# reset, so it can also hold entries from other settings — confirm this is intended.
plot_layer_benchmark(layer_benchmark, all_layer_names, nametag)

### Property extraction

In [None]:
# Refresh the layer-name list from the model loaded for this setting.
# NOTE(review): the layer-benchmark plot earlier in this section already used
# all_layer_names as computed before this refresh — confirm the ordering is intended.
all_layer_names = get_all_layer_names(model)

In [None]:
# Reset the extraction parameters to the notebook defaults before the final
# feature extraction for this setting. n_batches_to_process simply keeps its
# current value (a self-assignment of it would be a no-op).
collect_activations_from_layers = ["encoder.layer2.0.conv1"]
device = "cuda" if torch.cuda.is_available() else "cpu"
getitem_keys = ["image", "mask"]
downsample_method = "avg_pool"
verbose = False

# X and y from here feed all remaining cells of the section.
X, y, test_property_lengths = create_feature_matrix_and_labels(
    model=model,
    dm=datamodule,
    train_dataloader=datamodule_train,
    test_dataloader=datamodule_test,
    layer_names=collect_activations_from_layers,
    device=device,
    getitem_keys=getitem_keys,
    n_batches_to_process=n_batches_to_process,
    downsample_method=downsample_method,
    verbose=verbose,
)

### g confidence interval

In [None]:
# Repeated g experiments; the result feeds the confidence intervals and the
# comparison against g_hat in the cells below.
# NOTE(review): random_seed=True presumably draws a different seed per run —
# confirm in run_multiple_experiments_g.
g_benchmark = run_multiple_experiments_g(
    X, y, test_size, n_estimators, random_seed=True
)

In [None]:
# Calculate confidence intervals for the specified columns of the g benchmark
# table; the bare name on the last line displays the result in the notebook.
columns_of_interest = ["baseline_accuracy", "baseline_fpr95", "baseline_roc_auc"]
confidence_intervals_g = calculate_confidence_intervals(
    g_benchmark, columns_of_interest
)
confidence_intervals_g

### g_hat

In [None]:
# Optuna search for the g_hat hyper-parameters on this setting's X, y, using the
# trial budget and search bounds from the "Optuna settings" in the setup cell.
# The result is used as a DataFrame below (it has a column "M").
results_df_g_hat = run_optuna_study(
    X,
    y,
    n_optuna_trials,
    test_size,
    min_cluster,
    max_cluster_ratio,
    min_fraction,
    max_fraction,
    n_estimators,
    fixed_seed,
)

In [None]:
# N is forwarded to run_multiple_experiments_g_hat — presumably the number of
# repeated runs; TODO confirm.
N = 10
results_dict = results_df_g_hat.to_dict(orient="records")
# k is fixed to 30% of the samples instead of being read from the study results.
# NOTE(review): 0.3 equals max_cluster_ratio from the Optuna settings — confirm
# whether that variable should be used here.
k = int(len(X) * 0.3)
# M is taken from the first row of the Optuna results — assumes the rows are
# ordered best-first; TODO confirm.
M = results_dict[0]["M"]
print("k, M, len(X), k/len(X)", k, M, len(X), k / len(X))

g_hat_benchmark = run_multiple_experiments_g_hat(X, y, test_size, k, M, N, fixed_seed)

### g g_hat mean std + ttest

In [None]:
# Compare the repeated g and g_hat runs (per the section heading: mean/std and
# t-tests); the bare name displays the result in the notebook.
t_tests_results = perform_benchmark_analysis(g_benchmark, g_hat_benchmark)
t_tests_results

### Benchmark g

In [None]:
# Candidate classifiers for the benchmark, keyed by a display label. The first
# entry is a random forest without class weighting; most others use
# class_weight="balanced". Every seeded estimator uses random_state=42.
# NOTE(review): the key "RandomForestUnblanaced" is misspelled ("Unbalanced");
# it is a runtime label, so confirm nothing depends on it before renaming.
classifiers = {
    "RandomForestUnblanaced": RandomForestClassifier(n_estimators=100, random_state=42),
    "RandomForest": RandomForestClassifier(
        n_estimators=100, class_weight="balanced", random_state=42
    ),
    "SVC": SVC(probability=True, class_weight="balanced", random_state=42),
    "KNeighbors": KNeighborsClassifier(),
    "LogisticRegression": LogisticRegression(
        class_weight="balanced", max_iter=500, random_state=42
    ),
    "DecisionTree": DecisionTreeClassifier(class_weight="balanced", random_state=42),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "ExtraTrees": ExtraTreesClassifier(
        n_estimators=100, class_weight="balanced", random_state=42
    ),
    "GaussianNB": GaussianNB(),
}

# Benchmark every classifier on the same X, y with the k and M chosen above.
classifier_benchmark_df = benchmark_classifiers(
    X, y, test_size, k, M, classifiers, fixed_seed
)

### Benchmark clustering

In [None]:
# Clustering algorithms to compare: one KMeans configuration and DBSCAN at three
# eps values (min_samples fixed to 5).
clustering_methods = {
    "KMeans": KMeans(n_clusters=3, init="k-means++", random_state=42),
    "DBSCAN_eps_0.1": DBSCAN(eps=0.1, min_samples=5),
    "DBSCAN_eps_0.2": DBSCAN(eps=0.2, min_samples=5),
    "DBSCAN_eps_0.5": DBSCAN(eps=0.5, min_samples=5),
}

# A fixed classifier
# (the same estimator object is handed to the benchmark for all clustering methods)
classifier = RandomForestClassifier(
    n_estimators=100, class_weight="balanced", random_state=42
)

clustering_benchmark_df = benchmark_clustering_methods(
    X, y, test_size, k, M, clustering_methods, classifier
)

### Investigate k wrt g-g_hat

In [None]:
print("confidence_intervals_g", confidence_intervals_g)

# Sweep the number of clusters for g_hat, passing the g confidence intervals for
# reference.
# NOTE(review): unlike the "Same type - Far" and "Different type - Far" sections,
# no fname (and no save_plot) is passed here, so the callee's defaults apply —
# confirm outputs of different settings cannot collide.
df_results = benchmark_kmeans_with_varying_k_condidence_g_hat(
    X,
    y,
    M,
    test_size=0.2,
    n_runs=10,
    confidence_level=0.95,
    confidence_intervals_g=confidence_intervals_g,
    clf=None,
)

### Feature Level Visualization: Understand how clustering alters the structure of the feature space

In [None]:
# Run one g_hat experiment (index i = 0) and keep its training features, labels
# and cluster labels for plotting; the first return value is discarded.
# NOTE(review): fixed_seed is passed twice — presumably as split seed and
# classifier seed; confirm against run_g_hat_experiment's signature.
i = 0
_, X_train_cluster, y_train_cluster, y_clusters = run_g_hat_experiment(
    X, y, test_size, k, M, fixed_seed, fixed_seed, i
)
print("Plotting")
# NOTE(review): this section labels the plot with the generic "Disaster" while
# the "Same type - Far" section passes class_name=nametag — confirm which is intended.
plot_tsne_with_label_changes(
    X_train_cluster, y_train_cluster, y_clusters, class_name="Disaster"
)