<a href="https://colab.research.google.com/github/lukas-stamm/bachelor_thesis/blob/model_dev/shuffle_generators.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

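## **Shuffle Generators Approach**

Images from several generator folders (`DALLE_dataset`, `IMAGEN_dataset`, `SD_dataset`) are pooled into one real-vs-fake dataset. An Xception classifier is trained on it and evaluated with stratified 5-fold cross-validation, first with a frozen backbone ("Basic") and then fully fine-tuned with augmentation ("Advanced").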

In [1]:
!pip install timm

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->timm)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->timm)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->timm)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch->timm)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch->timm)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch->timm)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch->tim

#### **Imports**

In [19]:
import os
import random
import numpy as np
import pandas as pd
from PIL import Image
from tqdm import tqdm

import plotly.express as px
import plotly.graph_objects as go


import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import (
    Dataset,
    DataLoader,
    Subset,
)

import timm
from torchvision import transforms
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    confusion_matrix,
)

#### **Dataset Class**

In [3]:
# Strip the watermark band from the lower edge of an image.
def crop_bottom(image, px=25):
    """Crop ``px`` pixel rows off the bottom of ``image``.

    Args:
        image: Object exposing ``size`` as ``(width, height)`` and a
            ``crop(box)`` method (a PIL image in this notebook).
        px: Number of rows to remove from the bottom edge. Defaults to 25.

    Returns:
        The result of ``image.crop`` for the box ``(0, 0, width, height - px)``.
    """
    full_width, full_height = image.size
    left, upper = 0, 0
    right, lower = full_width, full_height - px
    box = (left, upper, right, lower)
    return image.crop(box)

In [4]:
class AllGeneratorsDataset(Dataset):
    """Real-vs-fake image dataset pooled over several generator folders.

    Each directory in ``root_dirs`` must contain a ``real`` and a ``fake``
    sub-folder. Matching image files are collected into one flat list of
    ``(path, label)`` pairs with label 0 for ``real`` and 1 for ``fake``.
    """

    # Lower-cased file suffixes that count as images.
    # NOTE(review): '.jpeg' is not matched — confirm the dataset contains none.
    IMAGE_EXTENSIONS = ('.jpg', '.png', '.webp')

    def __init__(self, root_dirs, transform=None):
        """Index all image files below ``root_dirs``.

        Args:
            root_dirs: Iterable of directories, each holding ``real/`` and ``fake/``.
            transform: Optional callable applied to every cropped image in
                ``__getitem__``.

        Raises:
            OSError: Propagated from ``os.listdir`` if a ``real``/``fake``
                folder is missing or unreadable.
        """
        self.samples = []
        self.transform = transform

        for generator in root_dirs:
            for label, label_type in enumerate(('real', 'fake')):
                folder = os.path.join(generator, label_type)
                # os.listdir returns entries in arbitrary order. Sorting keeps the
                # index -> file mapping stable, so index-based splits (e.g. a
                # seeded StratifiedKFold) select the same files on every run.
                for file in sorted(os.listdir(folder)):
                    if file.lower().endswith(self.IMAGE_EXTENSIONS):
                        path = os.path.join(folder, file)
                        self.samples.append((path, label))

    def __len__(self):
        """Return the number of indexed images."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(image, label)``.

        The image is converted to RGB, its bottom strip is removed with
        ``crop_bottom`` and, if set, ``self.transform`` is applied.
        """
        path, label = self.samples[idx]
        # The context manager closes the underlying file as soon as the RGB
        # copy has been created.
        with Image.open(path) as raw:
            image = raw.convert("RGB")
        image = crop_bottom(image)  # remove the watermark strip at the bottom
        if self.transform:
            image = self.transform(image)
        return image, label

    def get_labels(self):
        """Return the labels of all samples, in sample order."""
        return [label for _, label in self.samples]

In [5]:
def train(model, loader, optimizer, criterion, device):
    """Run one training pass over ``loader`` and return the mean batch loss.

    Args:
        model: Network to optimise; switched to training mode first.
        loader: Iterable of ``(images, labels)`` batches that supports ``len``.
        optimizer: Optimizer whose ``zero_grad``/``step`` are called per batch.
        criterion: Loss called as ``criterion(model(images), labels)``.
        device: Device both tensors of every batch are moved to.

    Returns:
        Sum of the per-batch loss values divided by ``len(loader)``.
    """
    model.train()
    progress = tqdm(loader, desc="Training", leave=False)
    batch_losses = []

    for images, labels in progress:
        images = images.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        batch_loss = criterion(model(images), labels)
        batch_loss.backward()
        optimizer.step()

        # Read the scalar once and reuse it for the running total and the bar.
        loss_value = batch_loss.item()
        batch_losses.append(loss_value)
        progress.set_postfix(loss=loss_value)

    return sum(batch_losses) / len(loader)


In [6]:
def evaluate_and_report(model, loader, device, name="Evaluation", thresholds=(0.90, 0.80, 0.70, 0.60), high_conf_threshold=0.90):
    """Evaluate ``model`` on ``loader``, print a report and return summary metrics.

    The predicted class is the arg-max of the softmax taken over dim 1 of the
    model output; the "confidence" of a prediction is that maximum probability.
    Class id 0 is reported as "Real" and class id 1 as "Fake".

    Args:
        model: Network called as ``model(images)``; switched to eval mode.
        loader: Iterable of ``(images, labels)`` tensor batches.
        device: Device the image batches are moved to.
        name: Label used in the printed header and stored under ``"fold"``.
        thresholds: Confidence cut-offs; for each one the number and accuracy
            of predictions at or above the cut-off are printed.
        high_conf_threshold: Cut-off used for the ``high_conf_coverage`` and
            ``high_conf_accuracy`` entries of the result. Defaults to 0.90.

    Returns:
        dict with the keys ``fold``, ``accuracy``, ``precision``, ``recall``,
        ``f1`` (the last three are macro averages), ``mean_confidence``,
        ``high_conf_coverage`` and ``high_conf_accuracy``.
    """
    def _mean_or_nan(values):
        # An empty selection (e.g. no misclassified sample) has no mean; return
        # NaN explicitly instead of triggering NumPy's empty-slice warning.
        return values.mean() if values.size else float("nan")

    model.eval()
    predictions, targets, confs = [], [], []

    with torch.no_grad():
        for images, labels in loader:
            images = images.to(device)
            outputs = model(images)
            probs = torch.softmax(outputs, dim=1)
            max_probs, predicted = torch.max(probs, dim=1)

            predictions.extend(predicted.cpu().numpy())
            targets.extend(labels.cpu().numpy())
            confs.extend(max_probs.cpu().numpy())

    predictions, targets, confs = map(np.array, (predictions, targets, confs))
    correct = predictions == targets
    accuracy = accuracy_score(targets, predictions)

    # Pass the class ids explicitly: without `labels`, scikit-learn derives the
    # classes from the data, and `target_names` no longer matches when one class
    # is missing from both targets and predictions.
    class_ids = [0, 1]
    class_names = ["Real", "Fake"]
    report = classification_report(
        targets, predictions, labels=class_ids, output_dict=True, zero_division=0
    )

    print(f"\n📊 Results for: {name}")
    print(classification_report(
        targets, predictions, labels=class_ids, target_names=class_names, zero_division=0
    ))
    print("🔁 Confusion Matrix:")
    print(confusion_matrix(targets, predictions, labels=class_ids))
    print(f"✅ Accuracy: {accuracy:.4f}")
    print("\n📈 Confidence Stats:")
    print(f"Mean (All):       {confs.mean():.4f}")
    print(f"Mean (Correct):   {_mean_or_nan(confs[correct]):.4f}")
    print(f"Mean (Incorrect): {_mean_or_nan(confs[~correct]):.4f}")
    for t in thresholds:
        mask = confs >= t
        # Thresholds that no prediction reaches are skipped silently.
        if mask.any():
            acc = np.mean(correct[mask])
            print(f"\n🔎 Threshold ≥ {t:.2f}:")
            print(f"  Samples: {mask.sum()} ({mask.mean()*100:.2f}%)")
            print(f"  Accuracy: {acc:.4f}")

    high_conf = confs >= high_conf_threshold
    return {
        "fold": name,
        "accuracy": accuracy,
        "precision": report["macro avg"]["precision"],
        "recall": report["macro avg"]["recall"],
        "f1": report["macro avg"]["f1-score"],
        "mean_confidence": confs.mean(),
        "high_conf_coverage": high_conf.mean(),
        # Accuracy is reported as 0.0 when no prediction reaches the cut-off.
        "high_conf_accuracy": np.mean(correct[high_conf]) if high_conf.any() else 0.0
    }

In [7]:
def train_and_predict(generators, dataset_base_path, transform, unfreeze_layers=False, num_epochs=5, batch_size=32, learning_rate=1e-4, weight_decay=0.0, n_splits=5, thresholds=(0.90, 0.80, 0.70, 0.60), eval_transform=None, seed=42):
    """Run stratified k-fold training and evaluation of an Xception classifier.

    All generator folders are pooled into one ``AllGeneratorsDataset``. For
    every fold a fresh pretrained model is created, trained for ``num_epochs``
    epochs and evaluated on the held-out fold with ``evaluate_and_report``.

    Args:
        generators: Folder names below ``dataset_base_path``, one per generator.
        dataset_base_path: Directory containing the generator folders.
        transform: Transform applied to the training images. Also used for
            validation unless ``eval_transform`` is given.
        unfreeze_layers: If False only the classifier head is trained; if True
            every parameter is trained.
        num_epochs: Training epochs per fold.
        batch_size: Batch size of both data loaders.
        learning_rate: Adam learning rate.
        weight_decay: Adam weight decay.
        n_splits: Number of cross-validation folds.
        thresholds: Confidence cut-offs forwarded to ``evaluate_and_report``.
        eval_transform: Optional transform for the validation folds. Pass a
            deterministic pipeline here when ``transform`` contains random
            augmentation, so the validation images are not augmented.
            ``None`` keeps the previous behaviour (``transform`` everywhere).
        seed: Seed for the fold split and for the ``random``, NumPy and torch
            global RNGs. ``None`` leaves all of them unseeded. GPU kernels
            may still be nondeterministic.

    Returns:
        pandas.DataFrame with one row of metrics per fold.
    """
    import copy

    # Seed every RNG in use so that weight init, shuffling and random
    # augmentation repeat between runs.
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    root_dirs = [os.path.join(dataset_base_path, g) for g in generators]
    dataset = AllGeneratorsDataset(root_dirs, transform=transform)
    labels = dataset.get_labels()

    if eval_transform is None:
        eval_dataset = dataset
    else:
        # Shallow copy: shares the sample list (identical indices) but carries
        # its own transform.
        eval_dataset = copy.copy(dataset)
        eval_dataset.transform = eval_transform

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    all_metrics = []

    # The split only needs the labels; the feature argument is a placeholder.
    for fold, (train_idx, val_idx) in enumerate(skf.split(np.zeros(len(labels)), labels)):
        print(f"\n📂 Fold {fold+1}/{n_splits}")
        train_set = Subset(dataset, train_idx)
        val_set   = Subset(eval_dataset, val_idx)
        train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
        val_loader   = DataLoader(val_set, batch_size=batch_size)

        # -- Xception model setup: a fresh pretrained model per fold --
        # NOTE(review): the saved output shows a warning raised at
        # `model = create_fn(` — presumably timm's notice that the 'xception'
        # name is deprecated; confirm and switch to the current name if so.
        model = timm.create_model('xception', pretrained=True, num_classes=2)
        # Freeze the whole network unless full fine-tuning was requested
        if not unfreeze_layers:
            for p in model.parameters():
                p.requires_grad = False
        # Always train the classifier head
        # NOTE(review): train() switches the whole model to train mode, so any
        # normalisation layers in a frozen backbone may still update their
        # running statistics — confirm this is intended.
        for p in model.get_classifier().parameters():
            p.requires_grad = True

        model = model.to(device)
        # Only hand trainable parameters to the optimizer.
        optimizer = optim.Adam(
            filter(lambda p: p.requires_grad, model.parameters()),
            lr=learning_rate,
            weight_decay=weight_decay
        )
        criterion = nn.CrossEntropyLoss()

        # Training epochs
        for epoch in range(num_epochs):
            loss = train(model, train_loader, optimizer, criterion, device)
            print(f"🧪 Epoch {epoch+1}/{num_epochs}, Loss: {loss:.4f}")

        # Evaluation & metrics
        m = evaluate_and_report(model, val_loader, device, name=f"Fold {fold+1}", thresholds=thresholds)
        all_metrics.append(m)

        # Drop this fold's model before the next one is created so two copies
        # are never referenced at once.
        del model, optimizer

    return pd.DataFrame(all_metrics)

In [8]:
# Mount Google Drive at /content/drive; the dataset is copied from there in the next cell.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
!cp -r "/content/drive/MyDrive/BA_Data/DeepGuardDB_v1" /content/data/

In [15]:
# Baseline preprocessing without augmentation: resize to 299x299, convert to a
# tensor and normalise every channel with mean 0.5 / std 0.5.
transform_basic = transforms.Compose([
    transforms.Resize(size=(299, 299)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])

# Cross-validate with the function defaults (classifier head only, 5 folds, 5 epochs).
kfold_basic = train_and_predict(
    transform=transform_basic,
    dataset_base_path='/content/data/DeepGuardDB_v1',
    generators=['DALLE_dataset', 'IMAGEN_dataset', 'SD_dataset'],
)


📂 Fold 1/5


  model = create_fn(


🧪 Epoch 1/5, Loss: 0.6557




🧪 Epoch 2/5, Loss: 0.5985




🧪 Epoch 3/5, Loss: 0.5685




🧪 Epoch 4/5, Loss: 0.5450




🧪 Epoch 5/5, Loss: 0.5303

📊 Results for: Fold 1
              precision    recall  f1-score   support

        Real       0.76      0.74      0.75      1200
        Fake       0.75      0.77      0.76      1200

    accuracy                           0.76      2400
   macro avg       0.76      0.76      0.76      2400
weighted avg       0.76      0.76      0.76      2400

🔁 Confusion Matrix:
[[893 307]
 [275 925]]
✅ Accuracy: 0.7575

📈 Confidence Stats:
Mean (All):       0.6717
Mean (Correct):   0.6912
Mean (Incorrect): 0.6109

🔎 Threshold ≥ 0.90:
  Samples: 67 (2.79%)
  Accuracy: 1.0000

🔎 Threshold ≥ 0.80:
  Samples: 400 (16.67%)
  Accuracy: 0.9300

🔎 Threshold ≥ 0.70:
  Samples: 922 (38.42%)
  Accuracy: 0.8839

🔎 Threshold ≥ 0.60:
  Samples: 1611 (67.12%)
  Accuracy: 0.8367

📂 Fold 2/5


  model = create_fn(


🧪 Epoch 1/5, Loss: 0.6534




🧪 Epoch 2/5, Loss: 0.5958




🧪 Epoch 3/5, Loss: 0.5642




🧪 Epoch 4/5, Loss: 0.5427




🧪 Epoch 5/5, Loss: 0.5279

📊 Results for: Fold 2
              precision    recall  f1-score   support

        Real       0.74      0.75      0.75      1200
        Fake       0.75      0.73      0.74      1200

    accuracy                           0.74      2400
   macro avg       0.74      0.74      0.74      2400
weighted avg       0.74      0.74      0.74      2400

🔁 Confusion Matrix:
[[903 297]
 [321 879]]
✅ Accuracy: 0.7425

📈 Confidence Stats:
Mean (All):       0.6683
Mean (Correct):   0.6893
Mean (Incorrect): 0.6079

🔎 Threshold ≥ 0.90:
  Samples: 86 (3.58%)
  Accuracy: 0.9767

🔎 Threshold ≥ 0.80:
  Samples: 364 (15.17%)
  Accuracy: 0.9396

🔎 Threshold ≥ 0.70:
  Samples: 862 (35.92%)
  Accuracy: 0.8979

🔎 Threshold ≥ 0.60:
  Samples: 1583 (65.96%)
  Accuracy: 0.8238

📂 Fold 3/5


  model = create_fn(


🧪 Epoch 1/5, Loss: 0.6493




🧪 Epoch 2/5, Loss: 0.5944




🧪 Epoch 3/5, Loss: 0.5628




🧪 Epoch 4/5, Loss: 0.5427




🧪 Epoch 5/5, Loss: 0.5261

📊 Results for: Fold 3
              precision    recall  f1-score   support

        Real       0.76      0.77      0.76      1200
        Fake       0.76      0.75      0.76      1200

    accuracy                           0.76      2400
   macro avg       0.76      0.76      0.76      2400
weighted avg       0.76      0.76      0.76      2400

🔁 Confusion Matrix:
[[919 281]
 [297 903]]
✅ Accuracy: 0.7592

📈 Confidence Stats:
Mean (All):       0.6685
Mean (Correct):   0.6900
Mean (Incorrect): 0.6006

🔎 Threshold ≥ 0.90:
  Samples: 78 (3.25%)
  Accuracy: 0.9872

🔎 Threshold ≥ 0.80:
  Samples: 373 (15.54%)
  Accuracy: 0.9464

🔎 Threshold ≥ 0.70:
  Samples: 890 (37.08%)
  Accuracy: 0.9112

🔎 Threshold ≥ 0.60:
  Samples: 1586 (66.08%)
  Accuracy: 0.8525

📂 Fold 4/5


  model = create_fn(


🧪 Epoch 1/5, Loss: 0.6454




🧪 Epoch 2/5, Loss: 0.5935




🧪 Epoch 3/5, Loss: 0.5627




🧪 Epoch 4/5, Loss: 0.5436




🧪 Epoch 5/5, Loss: 0.5276

📊 Results for: Fold 4
              precision    recall  f1-score   support

        Real       0.71      0.83      0.76      1200
        Fake       0.79      0.66      0.72      1200

    accuracy                           0.74      2400
   macro avg       0.75      0.74      0.74      2400
weighted avg       0.75      0.74      0.74      2400

🔁 Confusion Matrix:
[[991 209]
 [407 793]]
✅ Accuracy: 0.7433

📈 Confidence Stats:
Mean (All):       0.6714
Mean (Correct):   0.6942
Mean (Incorrect): 0.6051

🔎 Threshold ≥ 0.90:
  Samples: 95 (3.96%)
  Accuracy: 1.0000

🔎 Threshold ≥ 0.80:
  Samples: 400 (16.67%)
  Accuracy: 0.9550

🔎 Threshold ≥ 0.70:
  Samples: 904 (37.67%)
  Accuracy: 0.9137

🔎 Threshold ≥ 0.60:
  Samples: 1583 (65.96%)
  Accuracy: 0.8275

📂 Fold 5/5


  model = create_fn(


🧪 Epoch 1/5, Loss: 0.6598




🧪 Epoch 2/5, Loss: 0.6028




🧪 Epoch 3/5, Loss: 0.5693




🧪 Epoch 4/5, Loss: 0.5473




🧪 Epoch 5/5, Loss: 0.5307

📊 Results for: Fold 5
              precision    recall  f1-score   support

        Real       0.76      0.75      0.75      1200
        Fake       0.75      0.77      0.76      1200

    accuracy                           0.76      2400
   macro avg       0.76      0.76      0.76      2400
weighted avg       0.76      0.76      0.76      2400

🔁 Confusion Matrix:
[[897 303]
 [282 918]]
✅ Accuracy: 0.7562

📈 Confidence Stats:
Mean (All):       0.6661
Mean (Correct):   0.6851
Mean (Incorrect): 0.6073

🔎 Threshold ≥ 0.90:
  Samples: 83 (3.46%)
  Accuracy: 1.0000

🔎 Threshold ≥ 0.80:
  Samples: 363 (15.12%)
  Accuracy: 0.9394

🔎 Threshold ≥ 0.70:
  Samples: 858 (35.75%)
  Accuracy: 0.9033

🔎 Threshold ≥ 0.60:
  Samples: 1542 (64.25%)
  Accuracy: 0.8294


In [16]:
# train_and_predict returns one row of metrics per fold; round to 4 decimals for display.
metrics_df = pd.DataFrame(kfold_basic)
metrics_df_basic_rounded = metrics_df.round(4)

display(metrics_df_basic_rounded)

Unnamed: 0,fold,accuracy,precision,recall,f1,mean_confidence,high_conf_coverage,high_conf_accuracy
0,Fold 1,0.7575,0.7577,0.7575,0.7575,0.6717,0.0279,1.0
1,Fold 2,0.7425,0.7426,0.7425,0.7425,0.6683,0.0358,0.9767
2,Fold 3,0.7592,0.7592,0.7592,0.7592,0.6685,0.0325,0.9872
3,Fold 4,0.7433,0.7501,0.7433,0.7416,0.6714,0.0396,1.0
4,Fold 5,0.7562,0.7563,0.7563,0.7562,0.6661,0.0346,1.0


In [11]:
# Augmented preprocessing: random horizontal flip, rotation of up to 5 degrees
# and mild brightness/contrast jitter, followed by the same resize and
# normalisation steps as transform_basic.
transform_advanced = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(degrees=5),
    transforms.ColorJitter(brightness=0.1, contrast=0.1),
    transforms.Resize(size=(299, 299)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])

# Fine-tune every layer (not only the classifier head) with a small weight decay.
# NOTE(review): train_and_predict builds one dataset with this transform and uses
# it for both the training and the validation subset, so the random
# augmentations above are applied to the validation images as well.
kfold_advanced = train_and_predict(
    transform=transform_advanced,
    dataset_base_path='/content/data/DeepGuardDB_v1',
    generators=['DALLE_dataset', 'IMAGEN_dataset', 'SD_dataset'],
    weight_decay=1e-5,
    unfreeze_layers=True,
)


📂 Fold 1/5


  model = create_fn(


🧪 Epoch 1/5, Loss: 0.2988




🧪 Epoch 2/5, Loss: 0.0938




🧪 Epoch 3/5, Loss: 0.0451




🧪 Epoch 4/5, Loss: 0.0244




🧪 Epoch 5/5, Loss: 0.0285

📊 Results for: Fold 1
              precision    recall  f1-score   support

        Real       0.93      0.97      0.95      1200
        Fake       0.97      0.93      0.95      1200

    accuracy                           0.95      2400
   macro avg       0.95      0.95      0.95      2400
weighted avg       0.95      0.95      0.95      2400

🔁 Confusion Matrix:
[[1166   34]
 [  84 1116]]
✅ Accuracy: 0.9508

📈 Confidence Stats:
Mean (All):       0.9750
Mean (Correct):   0.9823
Mean (Incorrect): 0.8345

🔎 Threshold ≥ 0.90:
  Samples: 2212 (92.17%)
  Accuracy: 0.9765

🔎 Threshold ≥ 0.80:
  Samples: 2285 (95.21%)
  Accuracy: 0.9685

🔎 Threshold ≥ 0.70:
  Samples: 2332 (97.17%)
  Accuracy: 0.9605

🔎 Threshold ≥ 0.60:
  Samples: 2367 (98.62%)
  Accuracy: 0.9544

📂 Fold 2/5


  model = create_fn(


🧪 Epoch 1/5, Loss: 0.2913




🧪 Epoch 2/5, Loss: 0.0947




🧪 Epoch 3/5, Loss: 0.0414




🧪 Epoch 4/5, Loss: 0.0344




🧪 Epoch 5/5, Loss: 0.0225

📊 Results for: Fold 2
              precision    recall  f1-score   support

        Real       0.97      0.93      0.95      1200
        Fake       0.94      0.97      0.95      1200

    accuracy                           0.95      2400
   macro avg       0.95      0.95      0.95      2400
weighted avg       0.95      0.95      0.95      2400

🔁 Confusion Matrix:
[[1119   81]
 [  34 1166]]
✅ Accuracy: 0.9521

📈 Confidence Stats:
Mean (All):       0.9735
Mean (Correct):   0.9806
Mean (Incorrect): 0.8327

🔎 Threshold ≥ 0.90:
  Samples: 2204 (91.83%)
  Accuracy: 0.9764

🔎 Threshold ≥ 0.80:
  Samples: 2285 (95.21%)
  Accuracy: 0.9676

🔎 Threshold ≥ 0.70:
  Samples: 2329 (97.04%)
  Accuracy: 0.9626

🔎 Threshold ≥ 0.60:
  Samples: 2370 (98.75%)
  Accuracy: 0.9578

📂 Fold 3/5


  model = create_fn(


🧪 Epoch 1/5, Loss: 0.2944




🧪 Epoch 2/5, Loss: 0.0950




🧪 Epoch 3/5, Loss: 0.0390




🧪 Epoch 4/5, Loss: 0.0289




🧪 Epoch 5/5, Loss: 0.0237

📊 Results for: Fold 3
              precision    recall  f1-score   support

        Real       0.94      0.96      0.95      1200
        Fake       0.96      0.94      0.95      1200

    accuracy                           0.95      2400
   macro avg       0.95      0.95      0.95      2400
weighted avg       0.95      0.95      0.95      2400

🔁 Confusion Matrix:
[[1149   51]
 [  76 1124]]
✅ Accuracy: 0.9471

📈 Confidence Stats:
Mean (All):       0.9745
Mean (Correct):   0.9823
Mean (Incorrect): 0.8356

🔎 Threshold ≥ 0.90:
  Samples: 2205 (91.88%)
  Accuracy: 0.9723

🔎 Threshold ≥ 0.80:
  Samples: 2279 (94.96%)
  Accuracy: 0.9649

🔎 Threshold ≥ 0.70:
  Samples: 2333 (97.21%)
  Accuracy: 0.9589

🔎 Threshold ≥ 0.60:
  Samples: 2365 (98.54%)
  Accuracy: 0.9535

📂 Fold 4/5


  model = create_fn(


🧪 Epoch 1/5, Loss: 0.2894




🧪 Epoch 2/5, Loss: 0.0911




🧪 Epoch 3/5, Loss: 0.0466




🧪 Epoch 4/5, Loss: 0.0333




🧪 Epoch 5/5, Loss: 0.0199

📊 Results for: Fold 4
              precision    recall  f1-score   support

        Real       0.97      0.96      0.96      1200
        Fake       0.96      0.97      0.97      1200

    accuracy                           0.96      2400
   macro avg       0.97      0.96      0.96      2400
weighted avg       0.97      0.96      0.96      2400

🔁 Confusion Matrix:
[[1153   47]
 [  37 1163]]
✅ Accuracy: 0.9650

📈 Confidence Stats:
Mean (All):       0.9770
Mean (Correct):   0.9831
Mean (Incorrect): 0.8114

🔎 Threshold ≥ 0.90:
  Samples: 2223 (92.62%)
  Accuracy: 0.9856

🔎 Threshold ≥ 0.80:
  Samples: 2297 (95.71%)
  Accuracy: 0.9791

🔎 Threshold ≥ 0.70:
  Samples: 2337 (97.38%)
  Accuracy: 0.9739

🔎 Threshold ≥ 0.60:
  Samples: 2371 (98.79%)
  Accuracy: 0.9701

📂 Fold 5/5


  model = create_fn(


🧪 Epoch 1/5, Loss: 0.3028




🧪 Epoch 2/5, Loss: 0.0948




🧪 Epoch 3/5, Loss: 0.0396




🧪 Epoch 4/5, Loss: 0.0259




🧪 Epoch 5/5, Loss: 0.0205

📊 Results for: Fold 5
              precision    recall  f1-score   support

        Real       0.98      0.90      0.94      1200
        Fake       0.91      0.98      0.94      1200

    accuracy                           0.94      2400
   macro avg       0.94      0.94      0.94      2400
weighted avg       0.94      0.94      0.94      2400

🔁 Confusion Matrix:
[[1083  117]
 [  25 1175]]
✅ Accuracy: 0.9408

📈 Confidence Stats:
Mean (All):       0.9733
Mean (Correct):   0.9822
Mean (Incorrect): 0.8316

🔎 Threshold ≥ 0.90:
  Samples: 2194 (91.42%)
  Accuracy: 0.9722

🔎 Threshold ≥ 0.80:
  Samples: 2282 (95.08%)
  Accuracy: 0.9597

🔎 Threshold ≥ 0.70:
  Samples: 2328 (97.00%)
  Accuracy: 0.9540

🔎 Threshold ≥ 0.60:
  Samples: 2363 (98.46%)
  Accuracy: 0.9471


In [12]:
# Per-fold metrics of the fine-tuned run, rounded to 4 decimals for display.
# Note: this rebinds `metrics_df`, which held the basic run's metrics before.
metrics_df = pd.DataFrame(kfold_advanced)
metrics_df_advanced_rounded = metrics_df.round(4)

display(metrics_df_advanced_rounded)

Unnamed: 0,fold,accuracy,precision,recall,f1,mean_confidence,high_conf_coverage,high_conf_accuracy
0,Fold 1,0.9508,0.9516,0.9508,0.9508,0.975,0.9217,0.9765
1,Fold 2,0.9521,0.9528,0.9521,0.9521,0.9735,0.9183,0.9764
2,Fold 3,0.9471,0.9473,0.9471,0.9471,0.9745,0.9188,0.9723
3,Fold 4,0.965,0.965,0.965,0.965,0.977,0.9262,0.9856
4,Fold 5,0.9408,0.9434,0.9408,0.9407,0.9733,0.9142,0.9722


***

#### **Visual Comparison of Runs**

In [17]:
# Tag every row with the setup it came from and stack both runs into one frame.
# .assign returns a new frame, so the per-run tables displayed in earlier cells
# are not modified by running this cell.
combined_df = pd.concat(
    [
        metrics_df_basic_rounded.assign(setup="Basic"),
        metrics_df_advanced_rounded.assign(setup="Advanced"),
    ],
    ignore_index=True,
)

#### **Accuracy by Fold**

In [20]:
# Grouped bars: one cluster per fold, one bar per setup; y axis fixed to [0, 1].
fig = px.bar(
    combined_df, x="fold", y="accuracy",
    color="setup", barmode="group",
    title="Accuracy per Fold: Basic vs Advanced",
).update_layout(yaxis_range=[0, 1], yaxis_title="Accuracy")
fig.show()

#### **Mean Confidence**

In [21]:
# Mean max-softmax confidence per fold, grouped by setup; y axis fixed to [0, 1].
fig = px.bar(
    combined_df, x="fold", y="mean_confidence",
    color="setup", barmode="group",
    title="Mean Confidence per Fold",
).update_layout(yaxis_range=[0, 1], yaxis_title="Mean Confidence")
fig.show()

#### **High-Confidence Accuracy**

In [22]:
# Accuracy restricted to predictions with confidence >= 0.90, per fold and setup.
fig = px.bar(
    combined_df, x="fold", y="high_conf_accuracy",
    color="setup", barmode="group",
    title="High-Confidence Accuracy (≥ 0.90)",
).update_layout(yaxis_range=[0, 1], yaxis_title="Accuracy")
fig.show()

#### **High-Confidence Coverage**

In [23]:
# Share of validation samples predicted with confidence >= 0.90, per fold and setup.
fig = px.bar(
    combined_df,
    x="fold",
    y="high_conf_coverage",
    color="setup",
    barmode="group",
    title="High-Confidence Coverage (≥ 0.90)"
)
# Coverage is stored as a fraction in [0, 1]; format the ticks as percentages
# so the axis values agree with the "Coverage %" label.
fig.update_layout(yaxis_range=[0, 1], yaxis_title="Coverage %", yaxis_tickformat=".0%")
fig.show()