# Model Training Notebook — CLIP ViT-B (Baseline)

This notebook trains and evaluates **CLIP ViT-B (Baseline)** for the Keris image classification task.  
It has been refactored for **reproducibility** and to serve as a clean **appendix artifact** for journal submission.

## Recommended folder conventions
- **Input data**: keep dataset paths configurable (see the *Configuration* cell).
- **Outputs / artifacts**: write all run artifacts under `artifacts/06_clip_vit_b/` (created automatically below).

## Reproducibility checklist
- Fixed random seed (NumPy / framework seed)
- Best-effort deterministic operations (may vary by GPU/driver)
- Logged environment versions


In [None]:
# --- Environment & reproducibility (PyTorch) ---
# Seeds every random number generator used by this notebook and prints the
# software environment so a run can be matched to the versions it used.
import os, sys, platform, random
import numpy as np

# The seed can be overridden through the SEED environment variable.
SEED = int(os.environ.get("SEED", "42"))
# NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so this export
# only reaches interpreters launched after this point.
os.environ["PYTHONHASHSEED"] = str(SEED)

random.seed(SEED)
np.random.seed(SEED)

import torch
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Best-effort determinism: ask cuDNN for deterministic kernels and disable
# its auto-tuner, which may pick different algorithms from run to run.
# Results can still vary across GPUs and driver versions.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

print("Python   :", sys.version.split()[0])
print("Platform :", platform.platform())
print("NumPy    :", np.__version__)
print("Torch    :", torch.__version__)
print("CUDA     :", torch.version.cuda)
print("Device   :", "cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# --- Configuration (paths & artifact directory) ---
from pathlib import Path

# The project root defaults to the directory the notebook is launched from.
PROJECT_ROOT = Path.cwd()

# Input locations -- adjust these to match your local layout.
DATA_ROOT = PROJECT_ROOT.joinpath("dataset")   # dataset root
NPY_ROOT = PROJECT_ROOT.joinpath("npy")        # .npy root (if used)

# Every run artifact belongs below this directory; it is created on demand.
ARTIFACT_DIR = PROJECT_ROOT.joinpath("artifacts", "06_clip_vit_b")
ARTIFACT_DIR.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations so they end up in the notebook output.
for _label, _path in (
    ("PROJECT_ROOT :", PROJECT_ROOT),
    ("DATA_ROOT    :", DATA_ROOT),
    ("NPY_ROOT     :", NPY_ROOT),
    ("ARTIFACT_DIR :", ARTIFACT_DIR),
):
    print(_label, _path)

## Training & evaluation (original workflow)
The cells below contain the original training pipeline with minimal functional changes.


In [None]:
import sys, site
# Report which interpreter and which package directories this kernel uses.
interpreter_path = sys.executable
package_dirs = site.getsitepackages()
print("python exe:", interpreter_path)
print("site-packages:", package_dirs[:2])

In [None]:
import torch
# Show where the imported torch package lives on disk, which identifies the
# installation the kernel actually picked up.
print("torch file:", torch.__file__)

In [None]:
import transformers
from transformers.utils import is_torch_available, is_tf_available

# Log the transformers version and which backends it reports as available.
backend_status = (
    ("transformers:", transformers.__version__),
    ("is_torch_available:", is_torch_available()),
    ("is_tf_available   :", is_tf_available()),
)
for _label, _value in backend_status:
    print(_label, _value)

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import CLIPVisionModel, CLIPImageProcessor

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print("torch:", torch.__version__)
print("device:", device)

In [None]:
import torch
# Report the installed PyTorch build and the CUDA hardware it can see.
print("torch:", torch.__version__)
print("torch cuda build:", torch.version.cuda)
print("is_available:", torch.cuda.is_available())
print("device_count:", torch.cuda.device_count())

# Force CUDA initialisation so driver problems surface here rather than in
# the middle of training. Skipped when no CUDA device is available, because
# both calls below would raise there and the notebook supports a CPU fallback.
if torch.cuda.is_available():
    torch.cuda.init()
    print("GPU0:", torch.cuda.get_device_name(0))
else:
    print("GPU0:", "not available (running on CPU)")

In [None]:
import os
from glob import glob
from PIL import Image

# import keras_hub
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical # convert to one-hot-encoding

from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import layers
from tensorflow.keras import Model

from tensorflow.keras.applications import InceptionResNetV2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from tensorflow.keras import backend as K

%matplotlib inline
import matplotlib.pyplot as plt
import random

In [None]:
def seed_everything(seed: int = 42) -> None:
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and, when available,
    all CUDA devices) and exports ``PYTHONHASHSEED``. TensorFlow is seeded
    as well, but only if it can be imported.

    Args:
        seed: Value applied to every generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)

    # The training pipeline is PyTorch-based, so torch has to be seeded too.
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    # The previous body called `tf.random.set_seed`, but no `tf` name is
    # bound by the visible cells, so the call raised NameError. Import
    # locally and skip silently when TensorFlow is not installed.
    try:
        import tensorflow as tf
    except ImportError:
        return
    tf.random.set_seed(seed)

In [None]:
# Load Dataset
# Each split is stored as a pair of .npy files (inputs and labels). Judging
# by the file names, the training split uses the augmented arrays.
X_train = np.load("npyBilah/aug/x_train_aug.npy")
X_test = np.load("npyBilah/x_test.npy")
X_val = np.load("npyBilah/x_valid.npy")
y_train = np.load("npyBilah/aug/y_train_aug.npy")
y_test = np.load("npyBilah/y_test.npy")
y_val = np.load("npyBilah/y_valid.npy")

In [None]:
# Display the shape of every split (inputs first, then labels) as a sanity check.
tuple(arr.shape for arr in (X_train, X_val, X_test, y_train, y_val, y_test))

In [None]:
# Render one training sample (index 200) as a quick visual check of the data.
plt.imshow(X_train[200])

In [None]:
# Count how often each distinct label row occurs in the training labels.
classes, counts = np.unique(y_train, return_counts=True, axis=0)

# Print the distinct label rows followed by their frequencies.
for _caption, _values in (("Kelas: ", classes), ("Jumlah: ", counts)):
    print(_caption, _values)

In [None]:
# Count how often each distinct label row occurs in the test labels.
classes1, counts1 = np.unique(y_test, return_counts=True, axis=0)

# Print the distinct label rows followed by their frequencies.
for _caption, _values in (("Kelas: ", classes1), ("Jumlah: ", counts1)):
    print(_caption, _values)

In [None]:
# Count how often each distinct label row occurs in the validation labels.
classes2, counts2 = np.unique(y_val, return_counts=True, axis=0)

# Print the distinct label rows followed by their frequencies.
for _caption, _values in (("Kelas: ", classes2), ("Jumlah: ", counts2)):
    print(_caption, _values)

In [None]:
# Show the per-class sample counts of the train, test and validation labels
# side by side (in that order).
counts, counts1, counts2

In [None]:
# The label array has one column per class (it is arg-maxed along axis 1
# further down), so its second dimension gives the number of classes.
num_classes = np.shape(y_train)[1]
print("num_classes:", num_classes)

In [None]:
class NumpyCLIPDataset(Dataset):
    """Dataset that serves in-memory images through a CLIP image processor.

    Args:
        X: Indexable collection of images. The original inline comment
            describes them as (H, W, 3) float32 arrays in [0, 1] -- TODO
            confirm against the .npy files.
        y_onehot: Indexable collection of label vectors; the arg-max of each
            vector is returned as the class index.
        processor: Callable invoked as
            ``processor(images=..., return_tensors="pt")`` whose result
            exposes a ``"pixel_values"`` entry.
    """

    def __init__(self, X, y_onehot, processor):
        self.X, self.y, self.processor = X, y_onehot, processor

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        # Clamp to [0, 1] and rescale to 8-bit before handing the image to
        # the processor.
        scaled = np.clip(self.X[idx], 0, 1) * 255.0
        encoded = self.processor(images=scaled.astype(np.uint8), return_tensors="pt")

        # The label vector collapses to the index of its largest entry.
        label = int(np.argmax(self.y[idx]))

        # Drop the leading batch dimension the processor adds.
        return encoded["pixel_values"].squeeze(0), label

# Image pre-processing configuration of the pretrained CLIP ViT-B/16 checkpoint.
processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch16")

# One dataset per split, all sharing the same processor.
train_ds, val_ds, test_ds = (
    NumpyCLIPDataset(images, labels, processor)
    for images, labels in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

# Loader settings shared by all splits; only the training split is shuffled.
_loader_kwargs = dict(batch_size=32, num_workers=2, pin_memory=True)
train_loader = DataLoader(train_ds, shuffle=True, **_loader_kwargs)
val_loader = DataLoader(val_ds, shuffle=False, **_loader_kwargs)
test_loader = DataLoader(test_ds, shuffle=False, **_loader_kwargs)

In [None]:
from sklearn.utils.class_weight import compute_class_weight

# Derive "balanced" per-class weights from the training labels.
y_labels = y_train.argmax(axis=1)
classes = np.unique(y_labels)
cw = compute_class_weight(class_weight="balanced", classes=classes, y=y_labels)

# Start from a neutral weight of 1.0 so that classes absent from the
# training labels keep it, then overwrite the entries of the observed classes.
class_weights_t = torch.ones(num_classes, dtype=torch.float32)
class_weights_t[torch.as_tensor(classes, dtype=torch.long)] = torch.as_tensor(
    cw, dtype=torch.float32
)
class_weights_t = class_weights_t.to(device)

print("class_weights:", class_weights_t)

class CLIPViTB16Baseline(nn.Module):
    """CLIP vision encoder followed by a classification head.

    Args:
        num_classes: Number of output logits.
        head: ``"linear"`` for a single linear layer, or ``"shallow"`` for a
            small MLP (Linear -> GELU -> Dropout -> Linear).
        train_backbone: Value assigned to ``requires_grad`` of every backbone
            parameter; ``False`` (the baseline) freezes the backbone.
        clip_name: Checkpoint identifier handed to
            ``CLIPVisionModel.from_pretrained``.

    Raises:
        ValueError: If ``head`` is neither ``"linear"`` nor ``"shallow"``.
    """

    def __init__(self, num_classes, head="linear", train_backbone=False,
                 clip_name="openai/clip-vit-base-patch16"):
        super().__init__()

        # Validate before touching the checkpoint so that a typo fails
        # immediately instead of after the pretrained weights were loaded.
        if head not in ("linear", "shallow"):
            raise ValueError("head must be 'linear' or 'shallow'")

        self.backbone = CLIPVisionModel.from_pretrained(clip_name)

        # Freeze (or unfreeze) the whole backbone in one go.
        for p in self.backbone.parameters():
            p.requires_grad = train_backbone

        hidden = self.backbone.config.hidden_size  # typically 768

        if head == "linear":
            self.classifier = nn.Linear(hidden, num_classes)
        else:
            self.classifier = nn.Sequential(
                nn.Linear(hidden, 512),
                nn.GELU(),
                nn.Dropout(0.2),
                nn.Linear(512, num_classes),
            )

    def forward(self, pixel_values):
        """Return class logits for a batch of pre-processed images."""
        out = self.backbone(pixel_values=pixel_values)
        feat = out.pooler_output  # (B, hidden)
        return self.classifier(feat)

# Build the baseline: frozen CLIP backbone with a single linear layer on top.
clip_model = CLIPViTB16Baseline(
    num_classes=num_classes,
    head="linear",          # switch to "shallow" for the MLP head
    train_backbone=False,   # baseline: frozen backbone
)
clip_model = clip_model.to(device)

# Tally trainable and total parameter counts in a single pass.
trainable = 0
total = 0
for _param in clip_model.parameters():
    total += _param.numel()
    if _param.requires_grad:
        trainable += _param.numel()

print("Trainable params:", trainable)
print("Total params    :", total)

In [None]:
# Class-weighted cross-entropy.
# NOTE(review): `criterion` is rebound to a FocalLoss instance later in this
# cell, so this object is never used for training.
criterion = nn.CrossEntropyLoss(weight=class_weights_t)

class FocalLoss(nn.Module):
    """Multi-class focal loss with optional per-class weights.

    Per sample the loss is ``alpha * (1 - p_t) ** gamma * w_t * (-log p_t)``,
    where ``p_t`` is the softmax probability of the target class and ``w_t``
    its class weight (1 when ``weight`` is ``None``). The batch mean is
    returned.

    Args:
        alpha: Constant scale applied to every sample.
        gamma: Focusing exponent; larger values down-weight easy samples more.
        weight: Optional 1-D tensor of per-class weights, forwarded to
            ``F.cross_entropy``.
    """

    def __init__(self, alpha=0.25, gamma=2.0, weight=None):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.weight = weight

    def forward(self, logits, target):
        # The unweighted cross-entropy is exactly -log(p_t), so exp(-CE)
        # recovers the target-class probability. Deriving p_t from the
        # class-weighted loss would yield p_t ** w_t and distort the
        # focusing term.
        plain_ce = F.cross_entropy(logits, target, reduction="none")
        pt = torch.exp(-plain_ce)

        # Class weights only scale the loss magnitude.
        if self.weight is None:
            ce = plain_ce
        else:
            ce = F.cross_entropy(logits, target, weight=self.weight, reduction="none")

        fl = self.alpha * (1 - pt) ** self.gamma * ce
        return fl.mean()

# Training objective: class-weighted focal loss.
criterion = FocalLoss(weight=class_weights_t, gamma=2.0, alpha=0.25)

import copy

# Only parameters that still require gradients are handed to the optimiser.
optimizer = torch.optim.AdamW(
    filter(lambda param: param.requires_grad, clip_model.parameters()),
    lr=1e-3,
    weight_decay=1e-4,
)

# Early-stopping bookkeeping: epochs to tolerate without improvement, the
# best validation loss so far, the current streak, and the best weights.
patience, wait = 8, 0
best_val, best_state = float("inf"), None


In [None]:
def run_epoch(model, loader, train=True):
    """Run one pass over ``loader`` and return ``(mean_loss, accuracy)``.

    Uses the notebook-level ``device``, ``criterion`` and ``optimizer``.

    Args:
        model: Module mapping a batch of pixel values to class logits.
        loader: Iterable yielding ``(pixel_values, labels)`` batches.
        train: When true the model is put in training mode and the optimiser
            is stepped after every batch; otherwise the model is put in
            evaluation mode and no gradients are tracked.

    Returns:
        Tuple of the sample-weighted mean loss and the fraction of correctly
        classified samples.

    Raises:
        ZeroDivisionError: If ``loader`` yields no samples.
    """
    model.train(train)
    total_loss = 0.0
    total = 0
    correct = 0

    for pixel_values, y in loader:
        pixel_values = pixel_values.to(device, non_blocking=True)
        y = y.to(device, non_blocking=True)

        if train:
            optimizer.zero_grad(set_to_none=True)

        # Autograd bookkeeping is only needed when the weights are going to
        # be updated; disabling it during evaluation avoids building a graph.
        with torch.set_grad_enabled(train):
            logits = model(pixel_values)
            loss = criterion(logits, y)

        if train:
            loss.backward()
            optimizer.step()

        # Weight the batch loss by its size so the final mean is per sample.
        total_loss += loss.item() * y.size(0)
        total += y.size(0)
        pred = torch.argmax(logits, dim=1)
        correct += (pred == y).sum().item()

    return total_loss / total, correct / total

# Train for at most `epochs` epochs with early stopping on validation loss.
epochs = 30
for ep in range(1, epochs + 1):
    tr_loss, tr_acc = run_epoch(clip_model, train_loader, train=True)
    va_loss, va_acc = run_epoch(clip_model, val_loader, train=False)

    print(f"Epoch {ep:02d} | train loss {tr_loss:.4f} acc {tr_acc:.4f} | val loss {va_loss:.4f} acc {va_acc:.4f}")

    # An epoch counts as an improvement only if it beats the best validation
    # loss by more than 1e-4. The negated form keeps a NaN loss on the
    # "no improvement" path.
    if not (va_loss < best_val - 1e-4):
        wait += 1
        if wait >= patience:
            print("Early stopping triggered.")
            break
        continue

    # New best: remember a private copy of the weights and reset the streak.
    best_val, wait = va_loss, 0
    best_state = copy.deepcopy(clip_model.state_dict())

# Roll the model back to the best validation checkpoint, if one was recorded.
if best_state is not None:
    clip_model.load_state_dict(best_state)

In [None]:
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, roc_auc_score

# Collect softmax probabilities and ground-truth labels over the test set.
clip_model.eval()
all_prob = []
all_true = []

with torch.no_grad():
    for pixel_values, y in test_loader:
        pixel_values = pixel_values.to(device, non_blocking=True)
        logits = clip_model(pixel_values)
        all_prob.append(torch.softmax(logits, dim=1).cpu().numpy())
        all_true.append(y.numpy())

y_prob = np.concatenate(all_prob, axis=0)
y_true = np.concatenate(all_true, axis=0)
y_pred = y_prob.argmax(axis=1)  # predicted class = most probable class

print(classification_report(y_true, y_pred, digits=4))

# Macro averages treat every class equally; weighted averages scale each
# class by its support.
prec_macro, rec_macro, f1_macro = (
    metric(y_true, y_pred, average="macro", zero_division=0)
    for metric in (precision_score, recall_score, f1_score)
)
prec_w, rec_w, f1_w = (
    metric(y_true, y_pred, average="weighted", zero_division=0)
    for metric in (precision_score, recall_score, f1_score)
)

# One-vs-rest AUC is computed against one-hot encoded ground truth.
y_true_oh = np.eye(num_classes)[y_true]
auc_ovr_macro = roc_auc_score(y_true_oh, y_prob, multi_class="ovr", average="macro")

print("\n=== SKLEARN (MULTICLASS, ARGMAX) ===")
for _caption, _score in (
    ("Precision (macro)   :", prec_macro),
    ("Recall (macro)      :", rec_macro),
    ("F1 (macro)          :", f1_macro),
    ("Precision (weighted):", prec_w),
    ("Recall (weighted)   :", rec_w),
    ("F1 (weighted)       :", f1_w),
    ("AUC (OVR macro)     :", auc_ovr_macro),
):
    print(_caption, _score)

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
)

def evaluate_multiclass_torch(
    model: torch.nn.Module,
    dataloader,
    num_classes: int,
    device=None,
    criterion=None,
    topk: int = 3,
    average: str = "macro",   # "macro" or "weighted"
):
    """Evaluate a classifier on ``dataloader`` and return a metrics dict.

    Args:
        model: Module mapping a batch of pixel values to ``(B, C)`` logits.
        dataloader: Iterable yielding ``(pixel_values, labels)`` batches with
            integer class labels.
        num_classes: Number of classes, used to one-hot encode the labels
            for the AUC computation.
        device: Device to run on; defaults to CUDA when available, else CPU.
        criterion: Loss used for the reported loss; defaults to an
            unweighted ``nn.CrossEntropyLoss``.
        topk: ``k`` for top-k accuracy. Values of ``None`` or ``<= 1`` skip
            the computation (NaN is reported). A ``k`` larger than the
            number of logits is clamped to that number.
        average: Averaging mode for precision, recall and F1.

    Returns:
        Dict with keys ``loss``, ``accuracy``, ``precision``, ``recall``,
        ``f1``, ``auc_ovr_macro`` and ``top_{topk}_acc``. The AUC is NaN when
        it cannot be computed.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    if criterion is None:
        criterion = nn.CrossEntropyLoss()

    model.eval()

    want_topk = topk is not None and topk > 1

    total_loss = 0.0
    n_total = 0
    topk_correct = 0

    y_true_all = []
    y_pred_all = []
    y_prob_all = []

    with torch.no_grad():
        for pixel_values, labels in dataloader:
            pixel_values = pixel_values.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True).long()

            logits = model(pixel_values)              # (B, C)
            loss = criterion(logits, labels)

            bs = labels.size(0)
            total_loss += loss.item() * bs
            n_total += bs

            probs = F.softmax(logits, dim=1)          # (B, C)
            preds = torch.argmax(probs, dim=1)        # (B,)

            # top-k accuracy
            if want_topk:
                # torch.topk raises when k exceeds the class dimension, so
                # never ask for more entries than there are logits.
                k = min(topk, probs.size(1))
                topk_idx = torch.topk(probs, k=k, dim=1).indices  # (B, k)
                topk_correct += (topk_idx == labels.unsqueeze(1)).any(dim=1).sum().item()

            y_true_all.append(labels.detach().cpu().numpy())
            y_pred_all.append(preds.detach().cpu().numpy())
            y_prob_all.append(probs.detach().cpu().numpy())

    y_true = np.concatenate(y_true_all, axis=0)
    y_pred = np.concatenate(y_pred_all, axis=0)
    y_prob = np.concatenate(y_prob_all, axis=0)

    avg_loss = total_loss / max(n_total, 1)

    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, average=average, zero_division=0)
    rec  = recall_score(y_true, y_pred, average=average, zero_division=0)
    f1   = f1_score(y_true, y_pred, average=average, zero_division=0)

    # Multi-class one-vs-rest macro AUC (needs one-hot encoded y_true).
    auc = np.nan
    try:
        y_true_1hot = np.eye(num_classes, dtype=np.int32)[y_true]
        auc = roc_auc_score(y_true_1hot, y_prob, average="macro", multi_class="ovr")
    except (ValueError, IndexError) as e:
        # The AUC can fail when a class never appears in the evaluated set
        # (ValueError) or when a label falls outside range(num_classes)
        # (IndexError); report NaN instead of aborting the evaluation.
        print(f"[WARN] AUC OVR macro tidak bisa dihitung: {e}")

    topk_acc = np.nan
    if want_topk:
        topk_acc = topk_correct / max(n_total, 1)

    return {
        "loss": avg_loss,
        "accuracy": acc,
        "precision": prec,
        "recall": rec,
        "f1": f1,
        "auc_ovr_macro": auc,
        # Keyed by the requested k so existing lookups keep working even
        # when k was clamped.
        f"top_{topk}_acc": topk_acc,
    }

In [None]:
# Re-select the device so this cell can run on its own.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Plain (unweighted) cross-entropy is used for the reported test loss.
# NOTE(review): this rebinds the notebook-level `criterion` that the training
# helper reads, so re-running training after this cell uses this loss.
criterion = nn.CrossEntropyLoss()

metrics = evaluate_multiclass_torch(
    clip_model.to(device),
    test_loader,
    num_classes,
    device=device,
    criterion=criterion,
    topk=3,
    average="macro",
)

# Print the metrics in a fixed order.
for _caption, _key in (
    ("Loss       :", "loss"),
    ("Accuracy   :", "accuracy"),
    ("Precision  :", "precision"),
    ("Recall     :", "recall"),
    ("AUC        :", "auc_ovr_macro"),
    ("Top K        :", "top_3_acc"),
    ("F1-Score   :", "f1"),
):
    print(_caption, metrics[_key])

In [None]:
# Count how often each distinct label row occurs in the test labels.
# NOTE: this rebinds `classes` and `counts`, which earlier cells also assign.
classes, counts = np.unique(y_test, return_counts=True, axis=0)

# Print the distinct label rows followed by their frequencies.
for _caption, _values in (("Kelas: ", classes), ("Jumlah: ", counts)):
    print(_caption, _values)

In [None]:
import itertools
# Function to plot confusion matrix    
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Args:
        cm: 2-D array of counts. Rows are labelled as true classes and
            columns as predicted classes.
        classes: Tick labels, one per class.
        normalize: When true, each row is divided by its sum before plotting,
            so both the colours and the annotations show per-row fractions.
        title: Plot title.
        cmap: Matplotlib colour map used for the heat map.
    """
    cm = np.asarray(cm)

    # Normalise before drawing so that the heat map and the cell annotations
    # describe the same values.
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Rows without any samples stay all-zero instead of becoming NaN.
        cm = cm.astype('float') / np.where(row_totals == 0, 1, row_totals)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells darker than half the maximum get white text for legibility.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        value = cm[i, j]
        plt.text(j, i, f"{value:.2f}" if normalize else value,
                 horizontalalignment="center",
                 color="white" if value > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
# Persist the trained weights together with the settings needed to rebuild
# the model (class count, checkpoint name, head type, backbone mode).
checkpoint_path = "model-h5/baseline_clip_vitb16_linearhead.pt"

# torch.save does not create missing parent directories, so make sure the
# target folder exists before writing.
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

torch.save(
    {
        "state_dict": clip_model.state_dict(),
        "num_classes": num_classes,
        "clip_name": "openai/clip-vit-base-patch16",
        "head": "linear",
        "train_backbone": False,
    },
    checkpoint_path,
)
print("Saved: baseline_clip_vitb16_linearhead.pt")

In [None]:
# experiment.end()

## Notes
- Keep dataset paths and output paths configurable for reproducibility.
- If you publish this notebook, ensure no private paths or secrets are embedded.
