# Imports & config

In [17]:
import os, math, json, random, numpy as np, pandas as pd
from pathlib import Path

import tensorflow as tf
from tensorflow.keras import layers as L, Model

from sklearn.metrics import classification_report, f1_score, balanced_accuracy_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import StratifiedGroupKFold

import matplotlib.pyplot as plt

SEED = 42
random.seed(SEED); np.random.seed(SEED); tf.keras.utils.set_random_seed(SEED)

print("TensorFlow:", tf.__version__)
IMG_SIZE = (224, 224)     # match ImageNet pretrain for MobileNetV2/ResNet/EffNet
BATCH = 32                # adjust if you see OOM; 16–32 is fine
EPOCHS_HEAD = 8           # was 6
EPOCHS_FINE = 20          # was 15

# keep “same number per class” on VAL/TEST, but let TRAIN keep more signal:
BALANCE_MODE_TRAIN = "over"      # <-- IMPORTANT: oversample train to max per class
BALANCE_MODE_VALTEST = "down"    # keep val/test strictly equal-sized per class
N_PER_CLASS_TRAIN = None         # None => auto (min for down, max for over)
N_PER_CLASS_VALTEST = None

MODELS_TO_RUN = ["mobilenetv2", "resnet50", "efficientnetb0", "densenet121", "vit_tiny"]
OUTPUT_DIR = "./outputs_ham10000"; os.makedirs(OUTPUT_DIR, exist_ok=True)


TensorFlow: 2.18.0


# Load HAM10000 metadata & paths

In [21]:
# Kaggle paths
meta_csv = "/kaggle/input/skin-cancer-mnist-ham10000/HAM10000_metadata.csv"
image_dir1 = "/kaggle/input/skin-cancer-mnist-ham10000/HAM10000_images_part_1"
image_dir2 = "/kaggle/input/skin-cancer-mnist-ham10000/HAM10000_images_part_2"

df = pd.read_csv(meta_csv)
all_image_paths = {os.path.splitext(f)[0]: os.path.join(image_dir1, f)
                   for f in os.listdir(image_dir1) if f.endswith(".jpg")}
all_image_paths.update({os.path.splitext(f)[0]: os.path.join(image_dir2, f)
                        for f in os.listdir(image_dir2) if f.endswith(".jpg")})

df["path"]  = df["image_id"].map(all_image_paths)
df = df.dropna(subset=["path"]).reset_index(drop=True)
df["label"] = df["dx"]

assert {"image_id","lesion_id","dx","path","label"}.issubset(df.columns)
print("Raw label counts:\n", df["label"].value_counts().sort_index())


Raw label counts:
 label
akiec     327
bcc       514
bkl      1099
df        115
mel      1113
nv       6705
vasc      142
Name: count, dtype: int64


# Grouped split (lesion-wise) + balancing helpers

In [22]:
def oversample_to_n(g: pd.DataFrame, n: int, seed=SEED) -> pd.DataFrame:
    if len(g) >= n:
        return g.sample(n=n, random_state=seed)
    reps = int(math.ceil(n / len(g)))
    g_rep = pd.concat([g.sample(frac=1.0, replace=True, random_state=seed+i) for i in range(reps)], ignore_index=True)
    return g_rep.sample(n=n, random_state=seed)

def balance_split(split_df: pd.DataFrame, mode="down", n_per_class=None, seed=SEED) -> pd.DataFrame:
    counts = split_df["label"].value_counts()
    if mode == "down":
        n = counts.min() if n_per_class is None else min(n_per_class, counts.min())
        out = (split_df.groupby("label", group_keys=False)
                        .apply(lambda g: g.sample(n=n, random_state=seed))
                        .sample(frac=1.0, random_state=seed)
                        .reset_index(drop=True))
    elif mode == "over":
        n = counts.max() if n_per_class is None else n_per_class
        out = (split_df.groupby("label", group_keys=False)
                        .apply(lambda g: oversample_to_n(g, n, seed))
                        .sample(frac=1.0, random_state=seed)
                        .reset_index(drop=True))
    else:
        raise ValueError("mode should be 'down' or 'over'")
    return out

# encode labels
classes_sorted = sorted(df["label"].unique())
label2id = {c:i for i,c in enumerate(classes_sorted)}
df["y"] = df["label"].map(label2id)

# Stratified + Grouped split (5 folds -> 60/20/20)
X = df["image_id"].values
y = df["y"].values
groups = df["lesion_id"].values
sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=SEED)
folds = list(sgkf.split(X, y, groups))
(train_idx_1, _), (train_idx_2, _), (train_idx_3, _), (val_idx, _), (test_idx, _) = folds[:5]

df_train = df.iloc[np.concatenate([train_idx_1, train_idx_2, train_idx_3])].copy()
df_val   = df.iloc[val_idx].copy()
df_test  = df.iloc[test_idx].copy()

# balance per split (same # per class as requested)
df_train_b = balance_split(df_train, BALANCE_MODE_TRAIN, N_PER_CLASS_TRAIN)
df_val_b   = balance_split(df_val,   BALANCE_MODE_VALTEST, N_PER_CLASS_VALTEST)
df_test_b  = balance_split(df_test,  BALANCE_MODE_VALTEST, N_PER_CLASS_VALTEST)

for name, d in [("train", df_train_b), ("val", df_val_b), ("test", df_test_b)]:
    print(name, d["label"].value_counts().sort_index().to_dict())

NUM_CLASSES = len(classes_sorted)


train {'akiec': 16032, 'bcc': 16032, 'bkl': 16032, 'df': 16032, 'mel': 16032, 'nv': 16032, 'vasc': 16032}
val {'akiec': 96, 'bcc': 96, 'bkl': 96, 'df': 96, 'mel': 96, 'nv': 96, 'vasc': 96}
test {'akiec': 93, 'bcc': 93, 'bkl': 93, 'df': 93, 'mel': 93, 'nv': 93, 'vasc': 93}


  .apply(lambda g: oversample_to_n(g, n, seed))
  .apply(lambda g: g.sample(n=n, random_state=seed))
  .apply(lambda g: g.sample(n=n, random_state=seed))


# tf.data loaders + augmentation

In [23]:
AUTOTUNE = tf.data.AUTOTUNE

def df_to_ds(dff, batch=BATCH, shuffle=True):
    paths = dff["path"].values
    labels = dff["y"].values.astype(np.int32)
    ds = tf.data.Dataset.from_tensor_slices((paths, labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dff), seed=SEED, reshuffle_each_iteration=True)
    def _load(path, y):
        img = tf.io.read_file(path)
        img = tf.image.decode_jpeg(img, channels=3)
        img = tf.image.resize(img, IMG_SIZE)
        return img, y
    ds = ds.map(_load, num_parallel_calls=AUTOTUNE)
    return ds.batch(batch).prefetch(AUTOTUNE)

augment = tf.keras.Sequential([
    L.RandomFlip("horizontal"),
    L.RandomRotation(0.05),
    L.RandomZoom(0.08),
    L.RandomContrast(0.08),
])

def add_aug(ds):
    return ds.map(lambda x,y: (augment(x), y), num_parallel_calls=AUTOTUNE)

train_ds = add_aug(df_to_ds(df_train_b, shuffle=True))
val_ds   = df_to_ds(df_val_b, shuffle=False)
test_ds  = df_to_ds(df_test_b, shuffle=False)


# CNN backbones (MobileNetV2, ResNet50, EfficientNetB0, DenseNet121)

In [24]:
def build_cnn_backbone(name):
    name = name.lower()
    if name == "mobilenetv2":
        base = tf.keras.applications.MobileNetV2(include_top=False, weights="imagenet", input_shape=IMG_SIZE+(3,))
        preprocess = tf.keras.applications.mobilenet_v2.preprocess_input
        last_conv = "Conv_1"
    elif name == "resnet50":
        base = tf.keras.applications.ResNet50(include_top=False, weights="imagenet", input_shape=IMG_SIZE+(3,))
        preprocess = tf.keras.applications.resnet.preprocess_input
        last_conv = "conv5_block3_out"
    elif name == "efficientnetb0":
        base = tf.keras.applications.EfficientNetB0(include_top=False, weights="imagenet", input_shape=IMG_SIZE+(3,))
        preprocess = tf.keras.applications.efficientnet.preprocess_input
        last_conv = "top_conv"
    elif name == "densenet121":
        base = tf.keras.applications.DenseNet121(include_top=False, weights="imagenet", input_shape=IMG_SIZE+(3,))
        preprocess = tf.keras.applications.densenet.preprocess_input
        last_conv = None  # we'll auto-find last conv layer
    else:
        raise ValueError("Unknown CNN name")
    return base, preprocess, last_conv

def find_last_conv_layer(model):
    for layer in reversed(model.layers):
        if isinstance(layer, tf.keras.layers.Conv2D):
            return layer.name
    return None

def classifier_head_2d(x, num_classes):
    x = L.GlobalAveragePooling2D()(x)
    x = L.Dropout(0.35)(x)
    return L.Dense(num_classes, activation="softmax")(x)

def make_cnn_model(name, num_classes=NUM_CLASSES):
    base, preprocess, last_conv = build_cnn_backbone(name)
    inp = L.Input(shape=IMG_SIZE+(3,))
    x = preprocess(inp)
    feat = base(x, training=False)
    out = classifier_head_2d(feat, num_classes)
    model = Model(inp, out, name=name)
    model.base = base
    model.preprocess = preprocess
    model.last_conv_name = last_conv or find_last_conv_layer(base)
    return model


# Tiny ViT with attention-rollout helpers

In [25]:
def patch_embed(img, patch=16, dim=384):
    x = L.Conv2D(dim, kernel_size=patch, strides=patch, padding="valid", name="patch_embed")(img)
    x = L.Reshape((-1, dim))(x)  # (B, N, D)
    return x

def transformer_block(x, heads=6, dim=384, mlp_dim=768, drop=0.0, name="blk"):
    ln1 = L.LayerNormalization(name=name+"_ln1")(x)
    mha  = L.MultiHeadAttention(num_heads=heads, key_dim=dim//heads, dropout=drop, name=name+"_mha")
    attn_out, attn_scores = mha(ln1, ln1, return_attention_scores=True)
    x = L.Add(name=name+"_add1")([x, L.Dropout(drop)(attn_out)])
    ln2 = L.LayerNormalization(name=name+"_ln2")(x)
    h = L.Dense(mlp_dim, activation="gelu", name=name+"_mlp1")(ln2)
    h = L.Dropout(drop)(h)
    h = L.Dense(dim, name=name+"_mlp2")(h)
    x = L.Add(name=name+"_add2")([x, L.Dropout(drop)(h)])
    return x, attn_scores

def make_vit_tiny(num_classes=NUM_CLASSES, patch=16, dim=384, depth=6, heads=6, mlp_dim=768, drop=0.0):
    inp = L.Input(shape=IMG_SIZE+(3,), name="vit_input")
    x = (inp - 127.5) / 127.5
    tok = patch_embed(x, patch, dim)

    # CLS token via embedding of a zero index
    B_tokens = L.Lambda(lambda t: tf.shape(t)[1])(tok)
    num_tokens = (IMG_SIZE[0]//patch)*(IMG_SIZE[1]//patch) + 1
    cls_idx = L.Lambda(lambda t: tf.zeros((tf.shape(t)[0], 1), dtype=tf.int32))(tok)
    cls_tok = L.Embedding(1, dim, name="cls_token")(cls_idx)
    tok = L.Concatenate(axis=1)([cls_tok, tok])

    pos = L.Embedding(input_dim=num_tokens, output_dim=dim, name="pos_embed")(tf.range(0, num_tokens))
    tok = tok + pos  # broadcast over batch

    attn_tensors = []
    for i in range(depth):
        tok, attn = transformer_block(tok, heads, dim, mlp_dim, drop, name=f"blk{i+1}")
        attn_tensors.append(attn)

    cls = L.Lambda(lambda t: t[:,0], name="cls_slice")(tok)
    penult = L.Dropout(drop, name="pre_logits")(cls)
    out = L.Dense(num_classes, activation="softmax", name="head")(penult)

    model = Model(inp, out, name="vit_tiny")
    attn_model = Model(inp, attn_tensors, name="vit_tiny_attn")
    feat_model = Model(inp, penult, name="vit_tiny_feat")

    model.preprocess = lambda z: (z - 127.5) / 127.5
    return model, attn_model, feat_model


# Train/eval, attention, and overlay saving

In [28]:
def train_and_eval(model, train_ds, val_ds, epochs_head=EPOCHS_HEAD, epochs_fine=EPOCHS_FINE, unfreeze_ratio=0.4):
    # ----- Phase 1: train the head -----
    if hasattr(model, "base"):
        model.base.trainable = False
    model.compile(
        optimizer=tf.keras.optimizers.Adam(1e-3),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(),
        metrics=["accuracy"]
    )
    es = tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True, monitor="val_accuracy")
    model.fit(train_ds, validation_data=val_ds, epochs=epochs_head, callbacks=[es], verbose=1)

    # ----- Phase 2: fine-tune last X% of backbone -----
    if hasattr(model, "base"):
        model.base.trainable = True
        if hasattr(model.base, "layers"):
            k = int((1.0 - unfreeze_ratio) * len(model.base.layers))
            for i, layer in enumerate(model.base.layers):
                layer.trainable = (i >= k)

    # 🩹 label_smoothing fallback for older TF
    try:
        loss_fine = tf.keras.losses.SparseCategoricalCrossentropy(label_smoothing=0.1)
    except TypeError:
        loss_fine = tf.keras.losses.SparseCategoricalCrossentropy()

    model.compile(
        optimizer=tf.keras.optimizers.Adam(1e-5),
        loss=loss_fine,
        metrics=["accuracy"]
    )
    model.fit(train_ds, validation_data=val_ds, epochs=epochs_fine, callbacks=[es], verbose=1)

    # ----- Validation eval -----
    y_true, y_prob = [], []
    for xb, yb in val_ds:
        y_true.append(yb.numpy())
        y_prob.append(model.predict(xb, verbose=0))
    y_true = np.concatenate(y_true); y_prob = np.concatenate(y_prob)
    y_pred = y_prob.argmax(axis=1)

    acc = (y_pred == y_true).mean()
    bal_acc = balanced_accuracy_score(y_true, y_pred)
    f1_macro = f1_score(y_true, y_pred, average="macro")
    try:
        auroc_macro = roc_auc_score(tf.one_hot(y_true, NUM_CLASSES).numpy(), y_prob, average="macro", multi_class="ovr")
    except Exception:
        auroc_macro = float("nan")
    report = classification_report(y_true, y_pred, target_names=classes_sorted, digits=4)
    cm = confusion_matrix(y_true, y_pred)
    return dict(acc=acc, bal_acc=bal_acc, f1_macro=f1_macro, auroc_macro=auroc_macro,
                report=report, cm=cm, y_true=y_true, y_prob=y_prob, y_pred=y_pred)


In [None]:
tmp_model = make_cnn_model("mobilenetv2", NUM_CLASSES)
tmp_metrics = train_and_eval(tmp_model, train_ds, val_ds, epochs_head=EPOCHS_HEAD, epochs_fine=EPOCHS_FINE, unfreeze_ratio=0.5)
print({k: v if not isinstance(v, np.ndarray) else '...' for k,v in tmp_metrics.items() if k in ["acc","bal_acc","f1_macro","auroc_macro"]})

In [None]:
tmp_model = make_cnn_model("resnet50", NUM_CLASSES)
tmp_metrics = train_and_eval(tmp_model, train_ds, val_ds, epochs_head=EPOCHS_HEAD, epochs_fine=EPOCHS_FINE, unfreeze_ratio=0.5)
print({k: v if not isinstance(v, np.ndarray) else '...' for k,v in tmp_metrics.items() if k in ["acc","bal_acc","f1_macro","auroc_macro"]})

In [None]:
tmp_model = make_cnn_model("efficientnetb0", NUM_CLASSES)
tmp_metrics = train_and_eval(tmp_model, train_ds, val_ds, epochs_head=EPOCHS_HEAD, epochs_fine=EPOCHS_FINE, unfreeze_ratio=0.5)
print({k: v if not isinstance(v, np.ndarray) else '...' for k,v in tmp_metrics.items() if k in ["acc","bal_acc","f1_macro","auroc_macro"]})

# Train all models + save attention previews

In [None]:
results = {}

for name in MODELS_TO_RUN:
    print(f"\n========== Training {name} ==========")
    if name == "vit_tiny":
        vit_model, vit_attn_model, vit_feat_model = make_vit_tiny(NUM_CLASSES, patch=16, dim=384, depth=6, heads=6, drop=0.0)
        model = vit_model
        metrics = train_and_eval(model, train_ds, val_ds)
        save_attention_overlays(name, model, val_ds, vit_attn_model, out_dir=OUTPUT_DIR)
        results[name] = {**metrics, "feat_model": vit_feat_model, "attn_model": vit_attn_model, "model": model}
    else:
        model = make_cnn_model(name, NUM_CLASSES)
        metrics = train_and_eval(model, train_ds, val_ds)
        save_attention_overlays(name, model, val_ds, out_dir=OUTPUT_DIR)
        results[name] = {**metrics, "model": model}

print("\n=== Validation Results ===")
print("Model\tAcc\tBalAcc\tMacroF1\tMacroAUROC")
for k,v in results.items():
    au = v['auroc_macro'] if v['auroc_macro'] == v['auroc_macro'] else float('nan')
    print(f"{k}\t{v['acc']*100:.1f}\t{v['bal_acc']*100:.1f}\t{v['f1_macro']:.3f}\t{au:.3f}")

# save textual reports & confusion matrices
for k,v in results.items():
    with open(os.path.join(OUTPUT_DIR, f"{k}_VAL_report.txt"), "w") as f:
        f.write(v["report"])
    np.save(os.path.join(OUTPUT_DIR, f"{k}_VAL_cm.npy"), v["cm"])


# Soft-vote ensemble on validation

In [None]:
def soft_vote(probs_list, weights=None):
    P = np.stack(probs_list, axis=0)
    if weights is None:
        weights = np.ones((len(probs_list), 1, 1))
    else:
        weights = np.asarray(weights)[:, None, None]
    P = (P * weights).sum(axis=0) / weights.sum()
    return P

# choose top-3 by macro-F1
top3 = sorted(results.items(), key=lambda kv: kv[1]["f1_macro"], reverse=True)[:3]
ens_probs = soft_vote([kv[1]["y_prob"] for kv in top3])
y_true_val = top3[0][1]["y_true"]
ens_pred = ens_probs.argmax(axis=1)

ens_acc = (ens_pred == y_true_val).mean()
ens_bal = balanced_accuracy_score(y_true_val, ens_pred)
ens_f1  = f1_score(y_true_val, ens_pred, average="macro")
ens_auroc = roc_auc_score(tf.one_hot(y_true_val, NUM_CLASSES).numpy(), ens_probs, average="macro", multi_class="ovr")

print(f"Ensemble (top-3 soft vote) — Acc: {ens_acc:.3f}, BalAcc: {ens_bal:.3f}, MacroF1: {ens_f1:.3f}, MacroAUROC: {ens_auroc:.3f}")


# Hybrid CNN+ViT (feature fusion) and validation eval

In [None]:
def make_hybrid_cnn_vit(cnn_model_name="efficientnetb0", vit_feat_model=None, num_classes=NUM_CLASSES):
    # CNN branch
    base, preprocess, _ = build_cnn_backbone(cnn_model_name)
    inpx = L.Input(shape=IMG_SIZE+(3,))
    x_cnn = preprocess(inpx)
    feat_cnn = base(x_cnn, training=False)
    feat_cnn = L.GlobalAveragePooling2D()(feat_cnn)

    # ViT feature branch
    if vit_feat_model is None:
        vit_model, vit_attn_model, vit_feat_model = make_vit_tiny(NUM_CLASSES)
    feat_vit = vit_feat_model((inpx - 127.5)/127.5)

    fused = L.Concatenate()([feat_cnn, feat_vit])
    fused = L.Dropout(0.4)(fused)
    fused = L.Dense(512, activation="gelu")(fused)
    out = L.Dense(num_classes, activation="softmax")(fused)

    model = Model(inpx, out, name=f"hybrid_{cnn_model_name}_vit")
    model.base_cnn = base
    model.base_vit = vit_feat_model
    return model

vit_feat = results["vit_tiny"]["feat_model"]
hybrid = make_hybrid_cnn_vit("efficientnetb0", vit_feat_model=vit_feat)
hyb_val = train_and_eval(hybrid, train_ds, val_ds, unfreeze_ratio=0.3)

print("Hybrid (VAL) — Acc: {:.3f}, BalAcc: {:.3f}, MacroF1: {:.3f}, MacroAUROC: {:.3f}".format(
    hyb_val["acc"], hyb_val["bal_acc"], hyb_val["f1_macro"], hyb_val["auroc_macro"]))


# Final TEST evaluation (best single, ensemble, hybrid)

In [None]:
def eval_on_test(model, test_ds):
    y_true, y_prob = [], []
    for xb, yb in test_ds:
        y_true.append(yb.numpy()); y_prob.append(model.predict(xb, verbose=0))
    y_true = np.concatenate(y_true); y_prob = np.concatenate(y_prob); y_pred = y_prob.argmax(axis=1)
    return dict(
        acc=(y_pred==y_true).mean(),
        bal_acc=balanced_accuracy_score(y_true, y_pred),
        f1_macro=f1_score(y_true, y_pred, average="macro"),
        auroc_macro=roc_auc_score(tf.one_hot(y_true, NUM_CLASSES).numpy(), y_prob, average="macro", multi_class="ovr"),
        report=classification_report(y_true, y_pred, target_names=classes_sorted, digits=4),
        cm=confusion_matrix(y_true, y_pred)
    )

best_single = max(results.items(), key=lambda kv: kv[1]["f1_macro"])[0]
print(f"Best single on VAL by Macro-F1: {best_single}")

# (a) best single model on TEST
test_single = eval_on_test(results[best_single]["model"], test_ds)

# (b) ensemble on TEST (recompute per batch to save memory)
top3 = sorted(results.items(), key=lambda kv: kv[1]["f1_macro"], reverse=True)[:3]
y_true_test, probs_each = [], []
for xb, yb in test_ds:
    y_true_test.append(yb.numpy())
    probs_each.append(soft_vote([r["model"].predict(xb, verbose=0) for _, r in top3]))
y_true_test = np.concatenate(y_true_test); probs_ens = np.concatenate(probs_each)
y_pred_ens = probs_ens.argmax(axis=1)
test_ens = dict(
    acc=(y_pred_ens==y_true_test).mean(),
    bal_acc=balanced_accuracy_score(y_true_test, y_pred_ens),
    f1_macro=f1_score(y_true_test, y_pred_ens, average="macro"),
    auroc_macro=roc_auc_score(tf.one_hot(y_true_test, NUM_CLASSES).numpy(), probs_ens, average="macro", multi_class="ovr")
)

# (c) hybrid on TEST
test_hybrid = eval_on_test(hybrid, test_ds)

print("\n=== TEST Results ===")
print("Single(best)\tAcc\tBalAcc\tMacroF1\tMacroAUROC")
print(f"{best_single}\t{test_single['acc']*100:.1f}\t{test_single['bal_acc']*100:.1f}\t{test_single['f1_macro']:.3f}\t{test_single['auroc_macro']:.3f}")

print("\nEnsemble(top-3)\tAcc\tBalAcc\tMacroF1\tMacroAUROC")
print(f"ensemble\t{test_ens['acc']*100:.1f}\t{test_ens['bal_acc']*100:.1f}\t{test_ens['f1_macro']:.3f}\t{test_ens['auroc_macro']:.3f}")

print("\nHybrid(EffB0+ViT)\tAcc\tBalAcc\tMacroF1\tMacroAUROC")
print(f"hybrid\t{test_hybrid['acc']*100:.1f}\t{test_hybrid['bal_acc']*100:.1f}\t{test_hybrid['f1_macro']:.3f}\t{test_hybrid['auroc_macro']:.3f}")

# save reports
with open(os.path.join(OUTPUT_DIR, f"{best_single}_TEST_report.txt"), "w") as f: f.write(test_single["report"])
with open(os.path.join(OUTPUT_DIR, "ensemble_TEST_metrics.json"), "w") as f: json.dump({k: float(v) for k,v in test_ens.items()}, f, indent=2)
with open(os.path.join(OUTPUT_DIR, "hybrid_TEST_report.txt"), "w") as f: f.write(test_hybrid["report"])
