In [7]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.datasets import cifar10

# Fetch CIFAR-10; the training split is discarded, only the test split is used.
(_, _), (x_test, y_test) = cifar10.load_data()

# Convert the images to float32 and divide by 255.
# NOTE(review): the per-model accuracies in the recorded output are close to
# chance; confirm this scaling matches the preprocessing used at training time.
x_test = np.divide(x_test.astype("float32"), 255.0)

# Flatten the labels to 1-D so they compare element-wise with argmax output.
y_test = y_test.flatten()
num_samples = y_test.shape[0]

print("Test samples:", num_samples)

# Load the ensemble members (adam_lr001 versions).
# The member count is defined once so the loop bound and the completion
# message below cannot drift apart when the ensemble size changes.
num_models = 5
models = []
for i in range(num_models):
    name = f"adam_lr001_model_{i}.h5"
    print("loading:", name)
    # NOTE(review): if these models are only ever used for predict(), passing
    # compile=False would skip restoring the training configuration — confirm
    # no later cell calls evaluate()/fit() before changing this.
    m = load_model(name)
    models.append(m)

# Report the number actually loaded rather than a literal.
print(f"Finished loading all {len(models)} models.")

# Collect each model's class-probability output on the full test set.
per_model_probs = []
for model_idx, member in enumerate(models):
    print("predicting with model", model_idx)
    per_model_probs.append(member.predict(x_test, verbose=0))   # (n_samples, n_classes)

# Stack the per-model outputs along a new leading axis.
all_preds = np.asarray(per_model_probs)   # (n_models, n_samples, n_classes)

# Confidence = largest class probability each model assigns to each image.
confs = all_preds.max(axis=2)    # (n_models, n_samples)

# naive MCE with tau = 0.95
# A model is dropped from a sample's ensemble when its confidence (max class
# probability) on that sample is at or above the threshold.
tau = 0.95
excluded = (confs >= tau).astype(int)   # (n_models, n_samples); 1 = exclude, 0 = keep

# ensemble before MCE: plain average over every model
ens_before = np.mean(all_preds, axis=0)
pred_before = np.argmax(ens_before, axis=1)
acc_before = np.mean(pred_before == y_test)

# ensemble after MCE: average only the kept models of each sample.
# Done for all samples at once instead of a per-sample Python loop.
keep_mask = excluded == 0                 # (n_models, n_samples)
keep_counts = keep_mask.sum(axis=0)       # kept models per sample
has_kept = keep_counts > 0

# Replace excluded models' rows with zeros before summing. np.where is used
# rather than multiplying by the mask so a NaN in an excluded row cannot leak
# into the sum.
kept_sum = np.where(keep_mask[:, :, None], all_preds, 0.0).sum(axis=0)

# Fallback if every model is excluded: a uniform distribution over the classes
# (class count taken from the predictions instead of a literal 10).
# NOTE(review): argmax of a uniform row is always index 0, so every fully
# excluded sample is scored as a class-0 prediction — confirm this is the
# intended baseline rather than, e.g., falling back to ens_before.
num_classes = all_preds.shape[2]
ens_after = np.full((all_preds.shape[1], num_classes), 1.0 / num_classes)

# Divide in the predictions' own dtype so the values match what np.mean over
# the kept subset would give.
ens_after[has_kept] = (
    kept_sum[has_kept] / keep_counts[has_kept][:, None].astype(all_preds.dtype)
)

pred_after = np.argmax(ens_after, axis=1)
acc_after = np.mean(pred_after == y_test)

# exclusion stats
num_excluded_per_sample = np.sum(excluded, axis=0)       # models dropped per sample
exclusion_rate = np.mean(num_excluded_per_sample > 0)    # fraction of samples with >= 1 drop
avg_num_excluded = np.mean(num_excluded_per_sample)      # mean drops per sample

# RESULTS
# The threshold is formatted from `tau` so the header cannot go stale when the
# threshold is changed.
# NOTE(review): the exclusion rule uses only model confidences, not labels —
# confirm "ORACLE" is the intended name for this variant.
print(f"MCE ORACLE RESULTS (τ = {tau})")
print("Accuracy before MCE:", round(acc_before, 4))
print("Accuracy after  MCE:", round(acc_after, 4))
print("Samples with exclusions:", round(exclusion_rate * 100, 2), "%")
print("Average # models excluded:", round(avg_num_excluded, 3))

# per model details
# Iterate over however many models the prediction array holds instead of a
# hard-coded 5, so the report stays correct if the ensemble size changes.
print("PER-MODEL RESULTS")
for i in range(all_preds.shape[0]):
    # Stand-alone accuracy of model i (argmax over the class axis).
    preds_i = np.argmax(all_preds[i], axis=1)
    acc_i = np.mean(preds_i == y_test)

    # How often model i was dropped by the confidence threshold.
    excl_cnt = np.sum(excluded[i])
    excl_rate = excl_cnt / num_samples

    print(f"\nModel {i}:")
    print("  Accuracy:", round(acc_i, 4))
    print(f"  Excluded: {excl_cnt}/{num_samples}")
    print("  Exclusion Rate:", round(excl_rate * 100, 2), "%")

# Spot check: show every model's confidence and exclusion flag for image 0.
print("SAMPLE 0 DETAILS")
for label, per_model_values in (("Confidences:", confs), ("Excluded   :", excluded)):
    # Column 0 holds the values for the first test image, one entry per model.
    print(label, per_model_values[:, 0])


Test samples: 10000
loading: adam_lr001_model_0.h5




loading: adam_lr001_model_1.h5




loading: adam_lr001_model_2.h5




loading: adam_lr001_model_3.h5
loading: adam_lr001_model_4.h5




Finished loading all 5 models.
predicting with model 0
predicting with model 1
predicting with model 2
predicting with model 3
predicting with model 4
MCE ORACLE RESULTS (τ = 0.95)
Accuracy before MCE: 0.2283
Accuracy after  MCE: 0.1253
Samples with exclusions: 100.0 %
Average # models excluded: 4.468
PER-MODEL RESULTS

Model 0:
  Accuracy: 0.1571
  Excluded: 9521/10000
  Exclusion Rate: 95.21 %

Model 1:
  Accuracy: 0.174
  Excluded: 9607/10000
  Exclusion Rate: 96.07 %

Model 2:
  Accuracy: 0.1717
  Excluded: 9787/10000
  Exclusion Rate: 97.87 %

Model 3:
  Accuracy: 0.2735
  Excluded: 8950/10000
  Exclusion Rate: 89.5 %

Model 4:
  Accuracy: 0.1952
  Excluded: 6819/10000
  Exclusion Rate: 68.19 %
SAMPLE 0 DETAILS
Confidences: [0.99999994 0.99999994 0.9998045  0.99999994 0.95953536]
Excluded   : [1 1 1 1 1]
