In [None]:
# Enable IPython's autoreload extension in mode 2: imported modules are reloaded
# before each cell executes, so edits to them are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# switch to the project directory
%cd ..
# working directory should be ../pdi

In [None]:
import sys
import os
# Absolute path of the `src` directory, resolved against the current working directory.
module_path = os.path.abspath('src')

# Register it on the import path only once, so re-running this cell adds no duplicates.
if not any(entry == module_path for entry in sys.path):
    sys.path.append(module_path)

#### How to use this notebook?
1. Train the models with the desired configs, using the scripts in the `scripts` subdirectory.
2. Fill in the `MODELS` dictionary: each key is the name you want to give a model, and each value is the path to the results directory of the corresponding run.
3. Run the desired plot/table generation cells.

In [None]:
from pdi.constants import PART_NAME_TO_TARGET_CODE

# Model name -> results directory of the trained run. Each directory is read
# further down via its `config.json`.
MODELS = {"Attention": "results/attention_dann_kaon_alpha_0_1"}

# Target code for the "kaon" entry of the project's particle-name lookup table.
target_code = PART_NAME_TO_TARGET_CODE["kaon"]

# Directory for generated reports; created if it does not exist yet.
save_dir = "reports"
os.makedirs(save_dir, exist_ok=True)

In [None]:
import json
from typing import cast
from pdi.config import Config
from pdi.engines import build_engine, DomainAdaptationEngine
from pdi.results_and_metrics import TestResults
from pdi.data.data_preparation import DataPreparation
from pdi.data.types import Split

# Data preparations of the first model in MODELS (simulated, experimental).
sim_data_prep: DataPreparation | None = None
exp_data_prep: DataPreparation | None = None
# Distinct input checksums seen so far; more than one entry means the models
# were built on different datasets and must not be compared.
checksums = set()
# Model name -> output of `engine.feature_extraction`; later cells unpack it as
# (simulated_features, experimental_features).
test_results: dict[str, tuple[list, list]] = {}
for k, v in MODELS.items():
    with open(f"{v}/config.json", 'r', encoding="utf-8") as f:
        config_data = json.load(f)
    config = Config.from_dict(config_data)
    # Overrides applied for this analysis run, regardless of the training config.
    config.training.device = "cpu"
    config.validation.batch_size = 2048
    config.validation.num_workers = 2
    engine = build_engine(config, target_code, base_dir=v)
    # Static-typing hint only; `cast` does not convert or check anything at runtime.
    engine = cast(DomainAdaptationEngine, engine)
    current_data_prep = engine.get_data_prep()
    if sim_data_prep is None or exp_data_prep is None:
        sim_data_prep = current_data_prep[0]
        exp_data_prep = current_data_prep[1]
    # Checksum the CURRENT model's inputs. Using the first model's stored data
    # preparations here would add the same value on every iteration, so the
    # mismatch check below could never trigger.
    checksums.add(
        current_data_prep[0]._inputs_checksum + current_data_prep[1]._inputs_checksum
    )
    # Fail before the feature extraction rather than after processing every model.
    if len(checksums) > 1:
        raise RuntimeError("You shouldn't compare models trained on different datasets.")
    test_results[k] = engine.feature_extraction(model_dirpath=v)

In [None]:
# Shape of the first (simulated) feature array extracted for the "Attention" model.
attention_simulated_features = test_results["Attention"][0]
print(attention_simulated_features.shape)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# Features of the "Attention" model for the simulated and the experimental domain.
simulated_features, experimental_features = test_results["Attention"]

# Subsample the datasets (at most 20000 rows per domain, drawn without
# replacement) with a fixed seed so the same rows are selected on every run.
np.random.seed(42)
sim_sample_size = min(20000, len(simulated_features))
exp_sample_size = min(20000, len(experimental_features))

simulated_sample = simulated_features[np.random.choice(len(simulated_features), sim_sample_size, replace=False)]
experimental_sample = experimental_features[np.random.choice(len(experimental_features), exp_sample_size, replace=False)]

# Stack both domains so that a single embedding is fitted on all points together.
combined_features = np.vstack((simulated_sample, experimental_sample))

# Domain label per stacked row: 0 = simulated, 1 = experimental.
labels = np.array([0] * len(simulated_sample) + [1] * len(experimental_sample))

# The iteration count is left at the estimator default (1000, the value that was
# previously passed explicitly). The `n_iter` keyword was renamed to `max_iter`
# in newer scikit-learn releases and then removed, so passing it raises TypeError there.
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
tsne_results = tsne.fit_transform(combined_features)

# Split the embedding back into the two domains (same row order as the stack above).
sim_tsne = tsne_results[:len(simulated_sample)]
exp_tsne = tsne_results[len(simulated_sample):]

plt.figure(figsize=(10, 7))
plt.scatter(sim_tsne[:, 0], sim_tsne[:, 1], c='blue', label='Simulated', alpha=0.6)
plt.scatter(exp_tsne[:, 0], exp_tsne[:, 1], c='orange', label='Experimental', alpha=0.6)
plt.title('t-SNE Visualization of Subsampled Simulated and Experimental Features')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
import pandas as pd
from scipy.stats import ks_2samp
from sklearn.metrics.pairwise import rbf_kernel

def compute_mmd(X, Y, gamma=1.0):
    """
    Estimate the discrepancy between two samples X and Y with an RBF kernel.

    Returns mean(k(X, X)) + mean(k(Y, Y)) - 2 * mean(k(X, Y)), where each mean
    runs over every entry of the corresponding kernel matrix (diagonal
    included). This is the biased estimate of the squared Maximum Mean
    Discrepancy (MMD).

    Parameters:
        X, Y: 2-D sample arrays accepted by `rbf_kernel`, with the same
            number of columns.
        gamma: RBF kernel coefficient forwarded to `rbf_kernel`.
    """
    # Reduce each kernel matrix to its mean right away instead of keeping all
    # three full matrices alive at once.
    within_x = rbf_kernel(X, X, gamma=gamma).mean()
    within_y = rbf_kernel(Y, Y, gamma=gamma).mean()
    between = rbf_kernel(X, Y, gamma=gamma).mean()
    return within_x + within_y - 2 * between

# Kolmogorov-Smirnov (KS): one two-sample statistic per feature column,
# summarised below by its mean and its maximum across columns.
ks_distances = [
    ks_2samp(simulated_sample[:, col], experimental_sample[:, col]).statistic
    for col in range(simulated_sample.shape[1])
]
metrics = {
    'Kolmogorov-Smirnov Mean': np.mean(ks_distances),
    'Kolmogorov-Smirnov Max': np.max(ks_distances),
}

# Maximum Mean Discrepancy (MMD), computed on the full feature vectors.
mmd_value = compute_mmd(simulated_sample, experimental_sample, gamma=1.0)
metrics['Maximum Mean Discrepancy (MMD)'] = mmd_value

# Save metrics to CSV: a single-row table written into the model's results directory.
metrics_df = pd.DataFrame([metrics])
print(metrics_df)
metrics_df.to_csv(f'{MODELS["Attention"]}/two_sample_test_metrics.csv', index=False)

print("Two-sample test metrics calculated and saved to 'two_sample_test_metrics.csv'.")