In [1]:
# Enable IPython's autoreload extension so that edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# switch to the project directory
# NOTE: the target is relative to the current working directory, so this cell
# is not idempotent -- re-running it without restarting the kernel moves two
# more levels up.
%cd ../..
# working directory should be ../pdi

/home/mytkom/Documents/alice/pdi


In [3]:
import sys
import os
# Absolute location of the project's `src` directory, resolved against the
# current working directory.
module_path = os.path.abspath('src')

# Register it with the import system once; the guard keeps repeated runs of
# this cell from adding duplicate entries.
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

#### How to use this notebook?
1. Train models with the desired configs, using the scripts in the `scripts` subdirectory.
2. Fill the `MODELS` dictionary: each key is a descriptive name for a run and each value is the path to that run's results directory.
3. Run the desired plot/table generation cells.

In [4]:
from pdi.constants import PART_NAME_TO_TARGET_CODE

# Runs to compare: maps a human-readable label to the results directory of a
# training run. Each directory is expected to contain a `config.json`, which
# the loading cell reads, and is also where the per-model metrics CSV is saved.
MODELS = {
    "alpha=0": "results/attention_dann_hyperparameter_tuning/best_run_alpha_0",
    "alpha=0.15": "results/attention_dann_hyperparameter_tuning/sweep_118f437672375fa45c5e417106c304a1/kaon/run_25",
}
# Target code of the particle species under study (kaon), passed to build_engine.
target_code = PART_NAME_TO_TARGET_CODE["kaon"]

# Output directory for generated reports; created up front if it does not exist.
save_dir = "reports"
os.makedirs(save_dir, exist_ok=True)

In [5]:
import json
from typing import cast
from pdi.config import Config
from pdi.engines import build_engine, DomainAdaptationEngine
from pdi.results_and_metrics import TestResults
from pdi.data.data_preparation import DataPreparation
from pdi.data.types import Split

# Load every run listed in MODELS, verify that all runs were prepared from the
# same input data, and extract features for the comparison cells below.
#
# sim_data_prep / exp_data_prep keep the data preparations of the first run
# (index 0 and index 1 of engine.get_data_prep(), respectively).
sim_data_prep: DataPreparation | None = None
exp_data_prep: DataPreparation | None = None
# Distinct (simulated, experimental) checksum pairs seen so far.
checksums = set()
# Label from MODELS -> result of engine.feature_extraction; the analysis cell
# unpacks each value as (simulated_features, experimental_features).
test_results: dict[str, tuple[list, list]] = {}
for k, v in MODELS.items():
    with open(f"{v}/config.json", 'r') as f:
        config_data = json.load(f)
    config = Config.from_dict(config_data)
    # Override run-time settings for this notebook session: CPU execution and
    # the validation loader's batch size / worker count.
    config.training.device = "cpu"
    config.validation.batch_size = 2048
    config.validation.num_workers = 2
    engine = build_engine(config, target_code, base_dir=v)
    engine = cast(DomainAdaptationEngine, engine)
    current_data_prep = engine.get_data_prep()
    if sim_data_prep is None or exp_data_prep is None:
        sim_data_prep = current_data_prep[0]
        exp_data_prep = current_data_prep[1]

    # Record the checksums of THIS run's data. The previous code read them from
    # sim_data_prep/exp_data_prep, which always belong to the first run, so the
    # set could never grow past one entry and the guard never fired.
    # A tuple keeps the two checksums separate instead of merging them with `+`.
    checksums.add(
        (current_data_prep[0]._inputs_checksum, current_data_prep[1]._inputs_checksum)
    )
    # Fail before the expensive feature extraction rather than after all runs.
    if len(checksums) > 1:
        raise RuntimeError("You shouldn't compare models trained on different datasets.")

    test_results[k] = engine.feature_extraction(model_dirpath=v)

[DataPreparation] Calculating input_paths + configuration checksum:
[DataPreparation] 	resulting checksum: 0858460c6963f0b5c7dce050440ffd05
[DataPreparation] Successfuly loaded preprocessed data! No need for from scratch preparation.
[DataPreparation] Calculating input_paths + configuration checksum:
[DataPreparation] 	resulting checksum: cb3638b8f27a952941ad38bdb37f4570
[DataPreparation] Successfuly loaded preprocessed data! No need for from scratch preparation.
Model attention_dann has been initialized:
	Number of trainable parameters: 463426


Processing Simulated Data: 100%|██████████| 290/290 [02:09<00:00,  2.23it/s]
Processing Experimental Data: 100%|██████████| 356/356 [03:05<00:00,  1.92it/s]


[DataPreparation] Calculating input_paths + configuration checksum:
[DataPreparation] 	resulting checksum: 0858460c6963f0b5c7dce050440ffd05
[DataPreparation] Successfuly loaded preprocessed data! No need for from scratch preparation.
[DataPreparation] Calculating input_paths + configuration checksum:
[DataPreparation] 	resulting checksum: cb3638b8f27a952941ad38bdb37f4570
[DataPreparation] Successfuly loaded preprocessed data! No need for from scratch preparation.
Model attention_dann has been initialized:
	Number of trainable parameters: 463426


Processing Simulated Data: 100%|██████████| 290/290 [02:11<00:00,  2.21it/s]
Processing Experimental Data: 100%|██████████| 356/356 [03:04<00:00,  1.93it/s]


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import pandas as pd
from scipy.stats import ks_2samp
from sklearn.metrics.pairwise import rbf_kernel

# Fixed seed used for both the subsampling (np.random.seed) and t-SNE
# (random_state) in the analysis loop below.
SEED = 1401

def _mean_rbf_kernel(A, B, gamma, batch_size=None):
    """Return the mean of the RBF kernel matrix k(A, B).

    With ``batch_size=None`` the full ``len(A) x len(B)`` matrix is built once
    and averaged. Otherwise the rows of ``A`` are processed in blocks of
    ``batch_size`` and only the running sum is kept.
    """
    if batch_size is None:
        return rbf_kernel(A, B, gamma=gamma).mean()
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer or None")

    n_rows = len(A)
    total = 0.0
    for start in range(0, n_rows, batch_size):
        total += rbf_kernel(A[start:start + batch_size], B, gamma=gamma).sum()
    return total / (n_rows * len(B))


def compute_mmd(X, Y, gamma=1.0, batch_size=None):
    """
    Compute the Maximum Mean Discrepancy (MMD) between two datasets X and Y.
    Uses the RBF kernel with the specified gamma parameter.

    The value returned is ``mean(k(X, X)) + mean(k(Y, Y)) - 2 * mean(k(X, Y))``
    where every mean runs over the whole kernel matrix (diagonal included).

    Parameters
    ----------
    X, Y : array-like of shape (n_samples, n_features)
        The two samples to compare; they are passed straight to ``rbf_kernel``.
    gamma : float, default=1.0
        RBF kernel coefficient forwarded to ``rbf_kernel``.
    batch_size : int or None, default=None
        If given, kernel matrices are accumulated in row blocks of this size so
        that no full N x N matrix is ever materialised. ``None`` builds each
        matrix in one call.

    Returns
    -------
    float
        The MMD estimate.

    Raises
    ------
    ValueError
        If ``batch_size`` is given and is smaller than 1.
    """
    # Reduce each kernel matrix to its mean right away: only one matrix is
    # alive at a time instead of three, which matters for large samples.
    mean_xx = _mean_rbf_kernel(X, X, gamma, batch_size)
    mean_yy = _mean_rbf_kernel(Y, Y, gamma, batch_size)
    mean_xy = _mean_rbf_kernel(X, Y, gamma, batch_size)
    return mean_xx + mean_yy - 2 * mean_xy

# Iterate over all test results. For every model: draw a reproducible subsample
# of both domains, embed it with t-SNE, plot the embedding and the normalized
# histogram difference, and save two-sample test metrics next to the model.
MAX_SAMPLES_PER_DOMAIN = 20000  # cap on points per domain fed to t-SNE / metrics
MIN_HIST_LIMIT = 100.0          # minimum half-width of the 2D histogram window
HIST_BINS = 200                 # bins per axis of the 2D histograms

for key, (simulated_features, experimental_features) in test_results.items():
    print(f"Processing key: {key}")

    # The fancy indexing below needs ndarrays; np.asarray leaves arrays as they
    # are and converts list-like containers.
    simulated_features = np.asarray(simulated_features)
    experimental_features = np.asarray(experimental_features)

    # Subsample the datasets (re-seeded per model so every model is evaluated
    # on the same row indices).
    np.random.seed(SEED)
    sim_sample_size = min(MAX_SAMPLES_PER_DOMAIN, len(simulated_features))
    exp_sample_size = min(MAX_SAMPLES_PER_DOMAIN, len(experimental_features))

    simulated_sample = simulated_features[np.random.choice(len(simulated_features), sim_sample_size, replace=False)]
    experimental_sample = experimental_features[np.random.choice(len(experimental_features), exp_sample_size, replace=False)]

    combined_features = np.vstack((simulated_sample, experimental_sample))

    # t-SNE visualization.
    # The iteration count is left at the library default (1000): the `n_iter`
    # keyword was renamed to `max_iter` in scikit-learn 1.5 and later removed,
    # so passing either name explicitly breaks on some versions.
    tsne = TSNE(n_components=2, random_state=SEED, perplexity=30)
    tsne_results = tsne.fit_transform(combined_features)

    sim_tsne = tsne_results[:len(simulated_sample)]
    exp_tsne = tsne_results[len(simulated_sample):]

    fig, ax = plt.subplots(figsize=(10, 7))
    ax.scatter(sim_tsne[:, 0], sim_tsne[:, 1], c='blue', label='Simulated', alpha=0.6)
    ax.scatter(exp_tsne[:, 0], exp_tsne[:, 1], c='orange', label='Experimental', alpha=0.6)
    ax.set_title(f't-SNE Visualization of Subsampled Simulated and Experimental Features ({key})')
    ax.set_xlabel('t-SNE Component 1')
    ax.set_ylabel('t-SNE Component 2')
    ax.legend()
    ax.grid(True)
    plt.show()
    plt.close(fig)

    # Normalized diff histogram.
    # np.histogram2d ignores points outside `range`, so widen the default
    # +/-100 window whenever the embedding extends beyond it.
    limit = max(MIN_HIST_LIMIT, float(np.abs(tsne_results).max()))
    hist_range = [[-limit, limit], [-limit, limit]]

    # Compute 2D histograms for both datasets
    hist_sim, xedges, yedges = np.histogram2d(sim_tsne[:, 0], sim_tsne[:, 1], bins=HIST_BINS, range=hist_range)
    hist_exp, _, _ = np.histogram2d(exp_tsne[:, 0], exp_tsne[:, 1], bins=HIST_BINS, range=hist_range)

    # Calculate the difference between the two histograms
    hist_diff = hist_sim - hist_exp

    # Normalize the difference to the range [-1, 1]; identical histograms give
    # an all-zero difference, which is kept as is instead of dividing by zero.
    max_abs_diff = np.max(np.abs(hist_diff))
    hist_diff_normalized = hist_diff / max_abs_diff if max_abs_diff > 0 else hist_diff

    # Plot the normalized difference as a 2D histogram. vmin/vmax pin the
    # diverging colormap to [-1, 1] so that zero sits at its neutral centre.
    fig, ax = plt.subplots(figsize=(10, 7))
    im = ax.imshow(
        hist_diff_normalized.T,
        origin='lower',
        extent=[-limit, limit, -limit, limit],
        cmap='coolwarm',
        aspect='auto',
        vmin=-1,
        vmax=1,
    )

    ax.set_title(f'Normalized Difference of 2D Histograms ({key})')
    ax.set_xlabel('t-SNE Component 1')
    ax.set_ylabel('t-SNE Component 2')

    # Add a colorbar with an extended label
    cbar = fig.colorbar(im, ax=ax)
    cbar.set_label('Normalized Difference [-1, 1]\n(Positive: sim > exp, Negative: exp > sim)')

    ax.grid(False)
    plt.show()
    plt.close(fig)

    # Compute metrics
    metrics = {}

    # Kolmogorov-Smirnov (KS): one two-sample statistic per feature column.
    ks_distances = [
        ks_2samp(simulated_sample[:, i], experimental_sample[:, i]).statistic
        for i in range(simulated_sample.shape[1])
    ]
    metrics['Kolmogorov-Smirnov Mean'] = np.mean(ks_distances)
    metrics['Kolmogorov-Smirnov Max'] = np.max(ks_distances)

    # Maximum Mean Discrepancy (MMD)
    mmd_value = compute_mmd(simulated_sample, experimental_sample, gamma=1.0)
    metrics['Maximum Mean Discrepancy (MMD)'] = mmd_value

    # Save metrics to CSV (single-row table stored in the model's results dir)
    metrics_df = pd.DataFrame([metrics])
    output_path = f'{MODELS[key]}/two_sample_test_metrics.csv'
    metrics_df.to_csv(output_path, index=False)
    print(f"Two-sample test metrics for key '{key}' calculated and saved to '{output_path}'.")

Processing key: alpha=0
