In [None]:
import collections
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score,
    classification_report, confusion_matrix, roc_auc_score
)
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.model_selection import StratifiedKFold, cross_val_predict, LeaveOneGroupOut
from sklearn.neural_network import MLPClassifier
from sklearn.utils.class_weight import compute_class_weight

In [None]:
def _safe_auroc(y_true, y_score, multi_class="ovr", labels=None):
    """Return the AUROC for ``y_score``, or NaN when it cannot be computed.

    ``y_score`` may be ``None`` (no probabilities available), a 1-D array of
    positive-class scores, or a 2-D ``predict_proba`` matrix. A matrix with
    more than two columns is scored as a multi-class problem using
    ``multi_class`` and ``labels``; a two-column matrix is reduced to its
    second column (the positive class).
    """
    if y_score is None:
        return float("nan")

    y_score = np.asarray(y_score)
    try:
        if y_score.ndim == 2 and y_score.shape[1] > 2:
            return roc_auc_score(
                y_true, y_score, multi_class=multi_class, average="macro", labels=labels
            )
        if y_score.ndim == 2:
            # Raises IndexError when the training fold only contained one class.
            y_score = y_score[:, 1]
        return roc_auc_score(y_true, y_score)
    except (ValueError, IndexError) as exc:
        # Typical cause: only one class present in y_true (common with LOGO).
        warnings.warn(f"AUROC not computed: {exc}")
        return float("nan")


def evaluation(
    model,
    features_path: str,
    labels_path: str,
    label_name: str = "Label",
    cv_strategy=None,
    groups_path: str = None,
    multi_class="ovr"
):
    """Cross-validate ``model`` on features/labels stored as ``.npy`` files.

    For every split a fresh clone of ``model`` is fitted, per-fold metrics are
    printed, and (for all folds, or only the first five under
    ``LeaveOneGroupOut``) a confusion matrix is plotted. Under
    ``LeaveOneGroupOut`` an additional "global" evaluation over the pooled
    out-of-fold predictions is printed.

    The estimator passed in is never modified. If it exposes a
    ``class_weight`` parameter that is ``None``, balanced class weights are
    computed from each fold's training labels and applied to that fold's
    clone only.

    Parameters
    ----------
    model : scikit-learn classifier
        Unfitted estimator; cloned for every fold.
    features_path, labels_path : str
        Paths to ``.npy`` files holding the feature matrix and label vector.
    label_name : str
        Human-readable name used in the printed header.
    cv_strategy : cross-validation splitter, optional
        Defaults to ``StratifiedKFold(n_splits=5, shuffle=True, random_state=42)``.
    groups_path : str, optional
        Path to a ``.npy`` file with one group id per sample. Required when
        ``cv_strategy`` is ``LeaveOneGroupOut``.
    multi_class : str
        Forwarded to ``roc_auc_score`` when there are more than two classes.

    Returns
    -------
    pandas.DataFrame
        One row per fold/group with accuracy, precision, recall, F1 and AUROC.

    Raises
    ------
    ValueError
        If ``LeaveOneGroupOut`` is requested without ``groups_path``.
    """
    X = np.load(features_path)
    y = np.load(labels_path)
    groups = np.load(groups_path) if groups_path else None

    if cv_strategy is None:
        cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    is_logo = isinstance(cv_strategy, LeaveOneGroupOut)
    if is_logo and groups is None:
        raise ValueError("LeaveOneGroupOut requires groups_path to be provided")
    average_mode = "micro" if is_logo else "macro"

    # Fixed label set so confusion matrices and pooled probabilities line up
    # across folds even when a fold is missing a class.
    classes = np.unique(y)

    # Decide once, from the caller's untouched estimator, whether balanced
    # weights should be derived per fold.
    base_params = model.get_params()
    auto_balance = "class_weight" in base_params and base_params["class_weight"] is None

    accs, f1s, precs, recalls, aucs, names = [], [], [], [], [], []
    all_y_true, all_y_pred = [], []
    pooled_proba = []
    pooled_ok = True  # False once any fold lacks comparable probabilities

    print("Features Extracted from: ", features_path)
    print(f"\n CV {cv_strategy.__class__.__name__} Evaluation for: {label_name} using {model.__class__.__name__}")
    print("-" * 90)
    print(f"{'Fold/Group':<15}{'Acc':>8}{'F1':>8}{'Prec':>10}{'Recall':>10}{'AUROC':>10}")
    print("-" * 90)

    for i, (train_idx, val_idx) in enumerate(cv_strategy.split(X, y, groups=groups), 1):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]
        # Only LOGO guarantees a single group per validation fold; other
        # splitters are labelled by fold number.
        group_val = groups[val_idx[0]] if is_logo else i

        clf = clone(model)
        if auto_balance:
            train_classes = np.unique(y_train)
            weights = compute_class_weight(
                class_weight="balanced", classes=train_classes, y=y_train
            )
            clf.set_params(class_weight=dict(zip(train_classes, weights)))

        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_val)

        try:
            y_proba = clf.predict_proba(X_val)
        except (AttributeError, NotImplementedError):
            # Estimator does not provide probability estimates.
            y_proba = None

        fold_classes = getattr(clf, "classes_", classes)

        acc = accuracy_score(y_val, y_pred)
        f1 = f1_score(y_val, y_pred, average=average_mode, zero_division=0)
        prec = precision_score(y_val, y_pred, average=average_mode, zero_division=0)
        rec = recall_score(y_val, y_pred, average=average_mode, zero_division=0)
        auc = _safe_auroc(y_val, y_proba, multi_class=multi_class, labels=fold_classes)

        accs.append(acc)
        f1s.append(f1)
        precs.append(prec)
        recalls.append(rec)
        aucs.append(auc)
        names.append(group_val)

        all_y_true.extend(y_val)
        all_y_pred.extend(y_pred)
        # Probabilities can only be pooled when every fold's columns refer to
        # the same classes in the same order.
        if y_proba is not None and np.array_equal(fold_classes, classes):
            pooled_proba.append(np.asarray(y_proba))
        else:
            pooled_ok = False

        print(f"{str(group_val):<15}{acc:>8.3f}{f1:>8.3f}{prec:>10.3f}{rec:>10.3f}{auc:>10.3f}")

        if not is_logo or (i <= 5):
            cm = confusion_matrix(y_val, y_pred, labels=classes)
            disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=classes)
            disp.plot(cmap=plt.cm.Blues)
            plt.title(f"Confusion Matrix for Fold/Group {group_val}")
            plt.show()
            plt.close(disp.figure_)  # avoid accumulating open figures

    # np.nanmean warns on an all-NaN input, so handle that case explicitly.
    mean_auc = float(np.nanmean(aucs)) if np.isfinite(aucs).any() else float("nan")

    print("\n Average:")
    print(f"  Accuracy  = {np.mean(accs):.3f}")
    print(f"  Precision = {np.mean(precs):.3f}")
    print(f"  Recall    = {np.mean(recalls):.3f}")
    print(f"  {average_mode.title()}-F1 = {np.mean(f1s):.3f}")
    print(f"  AUROC     = {mean_auc:.3f}")

    if is_logo:
        print("\n Global Evaluation (LOGO):")
        global_acc = accuracy_score(all_y_true, all_y_pred)
        global_f1 = f1_score(all_y_true, all_y_pred, average="macro", zero_division=0)
        global_rec = recall_score(all_y_true, all_y_pred, average="macro", zero_division=0)
        global_prec = precision_score(all_y_true, all_y_pred, average="macro", zero_division=0)

        if pooled_ok and pooled_proba:
            global_auc = _safe_auroc(
                all_y_true,
                np.concatenate(pooled_proba, axis=0),
                multi_class=multi_class,
                labels=classes,
            )
        else:
            global_auc = float("nan")

        print(f"  Accuracy  = {global_acc:.3f}")
        print(f"  Precision = {global_prec:.3f}")
        print(f"  Recall    = {global_rec:.3f}")
        print(f"  Macro-F1  = {global_f1:.3f}")
        print(f"  AUROC     = {global_auc:.3f}")

    result_df = pd.DataFrame({
        "Fold_or_Group": names,
        "Accuracy": accs,
        "Precision": precs,
        "Recall": recalls,
        f"{average_mode.title()}-F1": f1s,
        "AUROC": aucs
    })

    return result_df


In [None]:
# Baseline classifier: MLP with a single hidden layer of 128 units and a fixed seed.
model = MLPClassifier(hidden_layer_sizes=(128,), max_iter=500, random_state=42)

In [None]:
base_dir = "path/to/features"
feature_path = f"{base_dir}/features/features_swin.npy"
label_path = f"{base_dir}/features/labels_callback_binary.npy"
group_path = f"{base_dir}/features/labels_patient_id.npy"

# ==== Load data ====
X = np.load(feature_path)
y = np.load(label_path)
groups = np.load(group_path)
assert len(X) == len(y) == len(groups), "features, labels and groups must be aligned"

# ==== Keep only patients with at least 3 images, capped at 120 patients ====
# np.unique returns the IDs sorted, so the cap keeps the first 120 eligible
# patients in sorted-ID order.
unique_ids, counts = np.unique(groups, return_counts=True)
valid_patients = unique_ids[counts >= 3][:120]
mask = np.isin(groups, valid_patients)

# ==== Apply mask filter ====
X_filtered = X[mask]
y_filtered = y[mask]
groups_filtered = groups[mask]

# ==== Save files ====
# Create the output directory first; np.save does not create missing folders.
os.makedirs("nonp", exist_ok=True)
np.save("nonp/filtered_features.npy", X_filtered)
np.save("nonp/filtered_labels.npy", y_filtered)
np.save("nonp/filtered_groups.npy", groups_filtered)

# ==== Cross Validation ====
logo = LeaveOneGroupOut()
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

In [None]:
# 10-fold stratified CV on the unfiltered feature set.
# NOTE(review): no patient groups are passed here, so images from the same
# patient may land in both training and validation folds — confirm intended.
evaluation(
    model=model,
    features_path=feature_path,
    labels_path=label_path,
    label_name="Callback (Binary) - KFold",
    cv_strategy=kfold,
)

In [None]:
# Leave-one-group-out CV on the filtered subset saved under nonp/,
# using the saved patient-ID array as the grouping.
evaluation(
    model=model,
    label_name="Callback (Binary) - LOGO",
    features_path="nonp/filtered_features.npy",
    labels_path="nonp/filtered_labels.npy",
    groups_path="nonp/filtered_groups.npy",
    cv_strategy=logo,
    multi_class="raise",  # binary
)