In [3]:
from src.data_generation.datasets import gummy_worm_dataset_family
import numpy as np
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
import pickle
from joblib import Parallel, delayed


In [None]:

datasets = gummy_worm_dataset_family()

# For every dataset in the family, call generate_data(10000) and split the
# result in half with a fixed seed. Each entry of `splits` is the list returned
# by train_test_split: [X_train, X_test, y_train, y_test].
splits = []
for dataset in datasets:
    samples, labels = dataset.generate_data(10000)
    splits.append(
        train_test_split(samples, labels, test_size=0.5, random_state=42)
    )

# Convert the per-dataset pieces into numpy arrays (one leading entry per dataset).
X_trains = np.array([split[0] for split in splits])
y_trains = np.array([split[2] for split in splits])
X_tests = np.array([split[1] for split in splits])
y_tests = np.array([split[3] for split in splits])

# Sanity check: report the shape of each stacked array.
for stacked in (X_trains, y_trains, X_tests, y_tests):
    print(stacked.shape)


In [None]:
# Peek at the first three entries (second axis) of the training array at index 1.
print(X_trains[1, :3 , :])

In [None]:
from src.metrics.ece import ece


def calculate_ece_on_experiments(estimators, X_tests, y_tests, sample_size):
    """Average the ECE of several estimators, each scored on its own test data.

    Estimator number ``i`` is evaluated on ``X_tests[i, :sample_size, :]``
    against ``y_tests[i, :sample_size]``.

    Args:
        estimators: iterable of objects exposing ``predict_proba``.
        X_tests: array indexed as ``[estimator index, sample, ...]`` (3 axes).
        y_tests: array indexed as ``[estimator index, sample]``.
        sample_size: upper bound of the leading slice taken along the sample
            axis; slicing past the end simply yields all available samples.

    Returns:
        ``np.mean`` over the per-estimator ECE values.
    """
    per_estimator = [
        ece(
            model.predict_proba(X_tests[idx, :sample_size, :]),
            y_tests[idx, :sample_size],
            # Third positional argument of ``ece`` (presumably the number of
            # bins — confirm against src.metrics.ece).
            15,
        )
        for idx, model in enumerate(estimators)
    ]
    return np.mean(per_estimator)

In [None]:
# Result pickles to process; uncomment entries to include more experiments.
filenames = [
"../varying_test_sample_size_dataset_family/data/Gummy Worm Dataset__SVM__Gummy Worm Dataset Family__AbsoluteValues__20250405_032919.pkl",
#"../varying_test_sample_size_dataset_family/data/Gummy Worm Dataset__Neural Network__Gummy Worm Dataset Family__AbsoluteValues__20250311_015848.pkl",
#"../varying_test_sample_size_dataset_family/data/Gummy Worm Dataset__Logistic Regression__Gummy Worm Dataset Family__AbsoluteValues__20250311_015848.pkl",
#"../varying_test_sample_size_dataset_family/data/Gummy Worm Dataset__Random Forest__Gummy Worm Dataset Family__AbsoluteValues__20250311_015848.pkl",
#"../varying_test_sample_size_dataset_family/data/Exclamation Mark Dataset__SVM__Exclamation Mark Dataset Family__AbsoluteValues__20250408_001836.pkl",
#"../varying_test_sample_size_dataset_family/data/Exclamation Mark Dataset__Neural Network__Exclamation Mark Dataset Family__AbsoluteValues__20250408_234908.pkl",
#"../varying_test_sample_size_dataset_family/data/Exclamation Mark Dataset__Logistic Regression__Exclamation Mark Dataset Family__AbsoluteValues__20250408_234908.pkl",
#"../varying_test_sample_size_dataset_family/data/Exclamation Mark Dataset__Random Forest__Exclamation Mark Dataset Family__AbsoluteValues__20250408_234908.pkl",
]

# Sample sizes used as the x-axis for both the stored metric means and the
# recomputed ECE curve.
subsample_sizes = np.linspace(100, 20000, 200, dtype=np.int64)

# calculate_ece_on_experiments slices X_tests[i, :sample_size, :]; a slice past
# the end silently returns every available row, so sizes above X_tests.shape[1]
# all evaluate the same data. Surface that instead of plotting it unnoticed.
# NOTE(review): confirm the test sets generated above are meant to match the
# sample sizes used when the pickled results were produced.
if subsample_sizes.max() > X_tests.shape[1]:
    print(
        f"Warning: largest sample size {subsample_sizes.max()} exceeds the "
        f"{X_tests.shape[1]} test samples available per dataset; larger sizes reuse the full test set."
    )

for filename in filenames:
    print(filename)

    # SECURITY: pickle.load can execute arbitrary code; only open result files
    # produced by this project. The file is closed as soon as loading finishes.
    with open(filename, 'rb') as file:
        results = pickle.load(file)

    means = results['Means']
    # Not every result file stores the fitted estimators, so look them up
    # defensively instead of failing with a KeyError.
    estimators = results.get('Estimators')

    eces = None
    if estimators is not None:
        # Recompute the ECE for every sample size; this was previously disabled
        # inside a string literal, which left `eces` undefined for the plot.
        eces = Parallel(n_jobs=-1, verbose=10)(  # n_jobs=-1 uses all available CPUs
            delayed(calculate_ece_on_experiments)(estimators, X_tests, y_tests, sample_size)
            for sample_size in subsample_sizes
        )
        # To cache the result, pickle `eces` to filename[:-4] + "ece_corrected.pkl".
    else:
        print("No 'Estimators' entry in results; skipping the 'ECE corrected' curve.")

    fig, ax = plt.subplots(figsize=(10, 6), dpi=150)

    # Plot only the stored ECE curve and any "True ECE" variants.
    for metric, values in means.items():
        if metric == "ECE" or "True ECE" in metric:
            print("Metric", metric)
            ax.plot(subsample_sizes, np.array(values), label=metric)

    if eces is not None:
        ax.plot(subsample_sizes, eces, label="ECE corrected")

    ax.set_xlabel('Sample Size', fontsize=12)
    ax.set_ylabel('Metric Values', fontsize=12)
    ax.set_title('Metrics and ECE corrected', fontsize=14, fontweight='bold')
    ax.grid(True, linestyle='--', alpha=0.6)
    ax.legend()
    # tight_layout recomputes the subplot margins, so no manual
    # subplots_adjust call is needed beforehand.
    fig.tight_layout()

    plt.show(block=False)

In [4]:
# 200 evenly spaced sample sizes from 100 to 20000, stored as int64.
subsample_sizes = np.linspace(start=100, stop=20000, num=200, dtype=np.int64)

# Result pickle for the Neural Network run on the Gummy Worm dataset family.
filename = "../varying_test_sample_size_dataset_family/data/Gummy Worm Dataset__Neural Network__Gummy Worm Dataset Family__AbsoluteValues__20250311_015848.pkl"

# Stays None if the file cannot be opened or unpickled.
results = None
with open(filename, mode='rb') as file:
    print(filename)
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    results = pickle.load(file)

../varying_test_sample_size_dataset_family/data/Gummy Worm Dataset__Neural Network__Gummy Worm Dataset Family__AbsoluteValues__20250311_015848.pkl


IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [8]:
# List the top-level keys stored in the loaded results object.
print(results.keys())

dict_keys(['True ECE Samples Dists', 'True ECE Samples Grid', 'True Probabilities Grid', 'Means', 'Std Devs'])
