In [None]:
import os
import random
import numpy as np
import torch

def set_seed(seed: int = 42):
    """Seed every random number generator used by this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (the CPU generator plus the CUDA generators), puts cuDNN into
    deterministic, non-benchmarking mode and exports ``PYTHONHASHSEED``.

    Args:
        seed: Integer seed handed to every generator. Defaults to 42.
    """
    # Exported for the process environment. Python reads PYTHONHASHSEED at
    # interpreter start-up, so this does not change hashing in the kernel
    # that is already running.
    os.environ["PYTHONHASHSEED"] = str(seed)

    # CPU-side generators: standard library, NumPy, then PyTorch.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # CUDA generators: the current device first, then every device
    # (the second call covers multi-GPU setups).
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Disable cuDNN auto-tuning and request deterministic kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Call this once at the very top of your notebook, before any code that draws
# random numbers, so every generator seeded by set_seed starts from seed 42.
set_seed(42)
import random

# --- REPRODUCIBILITY BLOCK ---
SEED = 42

def seed_everything(seed):
    """Seed Python, NumPy and PyTorch and make cuDNN deterministic.

    Args:
        seed: Integer seed applied to ``random``, ``numpy.random`` and
            PyTorch (CPU and CUDA); its string form is also written to the
            ``PYTHONHASHSEED`` environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Standard-library and NumPy generators.
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch: CPU generator, current CUDA device, then every CUDA device
    # (the last call matters when several GPUs are used).
    for torch_seed_fn in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        torch_seed_fn(seed)

    # No cuDNN auto-tuning; deterministic kernels only.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

seed_everything(SEED)

In [None]:
# ===================================================================
# === 1. IMPORT LIBRARY ===
# ===================================================================
import torch
import torch.nn as nn
import torch.optim as optim
from PIL import Image
import os
import pandas as pd
import numpy as np
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import f1_score, classification_report
import cv2
import timm
from pathlib import Path
from tqdm.auto import tqdm
import imagehash

# ===================================================================
# === 2. CONFIGURATION AND SETUP ===
# ===================================================================
# Dataset directories (Kaggle input paths).
TRAIN_DIR = "/kaggle/input/semoga-ajaa/dataset fixx/train"
TEST_DIR = "/kaggle/input/semoga-ajaa/dataset fixx/test/test"

# Model and training configuration.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
IMG_SIZE = 518
BATCH_SIZE = 8
# os.cpu_count() returns None when the CPU count cannot be determined, and
# None is not a valid DataLoader worker count. The loaders in this notebook
# also request persistent_workers=True, which needs at least one worker, so
# fall back to 1.
NUM_WORKERS = os.cpu_count() or 1

# --- Hyperparameters for gradual unfreezing (epochs / learning rate per stage) ---
EPOCHS_S1 = 3
LR_S1 = 3e-4
EPOCHS_S2 = 5
LR_S2 = 5e-5
EPOCHS_S3 = 12
LR_S3 = 1.59e-06

# ImageNet channel statistics used by the Normalize transforms.
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

print(f"Menggunakan device: {DEVICE}")
print(f"Ukuran gambar: {IMG_SIZE}x{IMG_SIZE}, Batch Size: {BATCH_SIZE}")

# ===================================================================
# === 3. FUNGSI DAN KELAS HELPER (TIDAK DIUBAH) ===
# ===================================================================
def convert_path_to_df(dataset, is_test=False):
    """Index the files below ``dataset`` into a DataFrame.

    Every regular file whose name contains a dot is collected recursively.
    Rows are sorted by path so the result does not depend on the order in
    which the filesystem enumerates entries (that order feeds the seeded
    train/validation split later on).

    Args:
        dataset: Root directory to scan (string or path-like).
        is_test: When True, return only the ``Filepath`` column. When False,
            also return a ``Label`` column holding the name of each file's
            parent directory.

    Returns:
        pandas.DataFrame with a string ``Filepath`` column and, unless
        ``is_test`` is set, a ``Label`` column.
    """
    image_dir = Path(dataset)
    # The pattern also matches directories whose name contains a dot;
    # keep regular files only so they are never handed to the image loader.
    filepaths = sorted(p for p in image_dir.glob(r'**/*.*') if p.is_file())
    path_column = pd.Series(filepaths, name='Filepath').astype(str)
    if is_test:
        return pd.DataFrame({'Filepath': path_column})
    # The class label is the name of the directory containing the file.
    label_column = pd.Series([p.parts[-2] for p in filepaths], name='Label')
    return pd.concat([path_column, label_column], axis=1)

class CustomDataset(Dataset):
    """Map-style dataset that loads RGB images listed in a DataFrame column.

    Args:
        dataframe: Table with one row per image.
        image_column: Name of the column holding the image file paths.
        label_column: Optional name of the column holding the class labels.
            When omitted (or falsy) items are returned without a label.
        transform: Optional callable applied to the loaded PIL image.
    """

    def __init__(self, dataframe, image_column, label_column=None, transform=None):
        self.dataframe = dataframe
        self.image_column = image_column
        self.label_column = label_column
        self.transform = transform

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return ``image`` or ``(image, label)`` for the row at position ``idx``.

        The label is returned as a ``torch.long`` tensor.
        """
        record = self.dataframe.iloc[idx]
        image = Image.open(record[self.image_column]).convert('RGB')
        if self.transform:
            image = self.transform(image)
        # Unlabelled mode: hand back the image alone.
        if not self.label_column:
            return image
        target = torch.tensor(record[self.label_column], dtype=torch.long)
        return image, target
class DualTransformDataset(Dataset):
    """Dataset that exposes every image twice, once per augmentation pipeline.

    Even indices use ``transform_main`` and odd indices use
    ``transform_extra``; indices ``2*k`` and ``2*k + 1`` both refer to row
    ``k`` of the DataFrame.

    Args:
        dataframe: Table with one row per image.
        image_column: Name of the column holding the image file paths.
        label_column: Name of the column holding the class labels.
        transform_main: Callable applied for even indices.
        transform_extra: Callable applied for odd indices.
    """

    def __init__(self, dataframe, image_column, label_column, transform_main, transform_extra):
        self.dataframe = dataframe
        self.image_column = image_column
        self.label_column = label_column
        self.transform_main = transform_main
        self.transform_extra = transform_extra

    def __len__(self):
        """Return twice the row count (two versions per image)."""
        return 2 * len(self.dataframe)

    def __getitem__(self, idx):
        """Return ``(image, label)`` with the label as a ``torch.long`` tensor."""
        # Quotient selects the source row, remainder selects the pipeline.
        base_idx, parity = divmod(idx, 2)

        row = self.dataframe.iloc[base_idx]
        label = torch.tensor(row[self.label_column], dtype=torch.long)
        image = Image.open(row[self.image_column]).convert('RGB')

        chosen_transform = self.transform_extra if parity == 1 else self.transform_main
        return chosen_transform(image), label

class CLAHETransform:
    """Callable transform that applies CLAHE to the lightness channel of an image.

    The input is converted from RGB to Lab, OpenCV's CLAHE is applied to the
    L channel only, and the result is converted back to RGB.

    Args:
        clip_limit: Forwarded to ``cv2.createCLAHE`` as ``clipLimit``.
        tile_grid_size: Forwarded to ``cv2.createCLAHE`` as ``tileGridSize``.
    """

    def __init__(self, clip_limit=2.0, tile_grid_size=(8, 8)):
        self.clahe = cv2.createCLAHE(clipLimit=clip_limit, tileGridSize=tile_grid_size)

    def __call__(self, img):
        """Return a new PIL image with the L channel equalized.

        Args:
            img: Image convertible with ``np.array``; presumably an 8-bit RGB
                PIL image, as produced by the datasets in this notebook
                (TODO confirm for other callers).
        """
        lab = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2Lab)
        lightness, chroma_a, chroma_b = cv2.split(lab)
        # Only the lightness channel is equalized; both colour channels pass through.
        equalized_lab = cv2.merge((self.clahe.apply(lightness), chroma_a, chroma_b))
        rgb = cv2.cvtColor(equalized_lab, cv2.COLOR_Lab2RGB)
        return Image.fromarray(rgb)

class TestDataset(Dataset):
    """Unlabelled dataset that returns each image together with its file path.

    Args:
        dataframe: Table with one row per image.
        image_column: Name of the column holding the image file paths.
        transform: Optional callable applied to the loaded PIL image.
    """

    def __init__(self, dataframe, image_column, transform=None):
        self.dataframe = dataframe
        self.image_column = image_column
        self.transform = transform

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        row_count = len(self.dataframe)
        return row_count

    def __getitem__(self, idx):
        """Return ``(image, img_path)`` for the row at position ``idx``."""
        img_path = self.dataframe.iloc[idx][self.image_column]
        rgb_image = Image.open(img_path).convert('RGB')
        # The path is returned next to the image so predictions can be
        # matched back to their source file.
        if not self.transform:
            return rgb_image, img_path
        return self.transform(rgb_image), img_path

# ===================================================================
# === 4. PERSIAPAN DATA DENGAN STRATEGI "VALIDASI BERSIH" (DIUBAH) ===
# ===================================================================
def get_phash(filepath):
    """Return the perceptual hash of the image at ``filepath``.

    Best-effort: any exception raised while opening or hashing the file is
    swallowed and ``None`` is returned instead.

    Args:
        filepath: Path of the image file to hash.

    Returns:
        The value produced by ``imagehash.phash``, or ``None`` on failure.
    """
    try:
        with Image.open(filepath) as img:
            digest = imagehash.phash(img)
    except Exception:
        return None
    return digest

# --- Step 4.1: Load the training data ---
# NOTE(review): the original heading and the message printed below mention
# manual filtering, but no filtering step is applied in this section.
train_df = convert_path_to_df(TRAIN_DIR)

print(f"Jumlah data training sebelum filtering manual: {len(train_df)}")


# --- Step 4.2: Identify leakage and separate the data ---
test_df = convert_path_to_df(TEST_DIR, is_test=True)
print("\n" + "="*50)
print("Mengidentifikasi kebocoran data (train vs test)...")
# Perceptual hash of every test file; get_phash returns None for files it
# cannot hash, and that None is removed from the set afterwards.
tqdm.pandas(desc="Menghitung Hash Data Test")
test_hashes = set(test_df['Filepath'].progress_apply(get_phash))
test_hashes.discard(None)

# tqdm.pandas is registered again so the second progress bar gets its own description.
tqdm.pandas(desc="Menghitung Hash Data Train")
train_df['hash'] = train_df['Filepath'].progress_apply(get_phash)

# A training row is flagged as leaked when its hash equals one of the test hashes.
train_df['is_leak'] = train_df['hash'].isin(test_hashes)
print(f"Ditemukan {train_df['is_leak'].sum()} gambar di training set yang identik dengan gambar di test set.")

print("\nMenerapkan strategi 'Validasi Bersih'...")
# Independent copies: rows flagged as leaked versus all remaining rows.
leaked_df = train_df[train_df['is_leak']].copy()
clean_df = train_df[~train_df['is_leak']].copy()

# --- Step 4.3: Label mapping and data split ---
# Class folder name -> integer class id.
label_mapping = {
    "Ayam Bakar": 0,
    "Ayam Betutu": 1,
    "Ayam Goreng": 2,
    "Ayam Pop": 3,
    "Bakso": 4,
    "Coto Makassar": 5,
    "Gado Gado": 6,
    "Gudeg": 7,
    "Nasi Goreng": 8,
    "Pempek": 9,
    "Rawon": 10,
    "Rendang": 11,
    "Sate Madura": 12,
    "Sate Padang": 13,
    "Soto": 14
}


def _encode_labels(frame, mapping, frame_name):
    """Return ``frame['Label']`` translated to class ids via ``mapping``.

    Raises:
        ValueError: If the column holds a value that is not a key of
            ``mapping``. ``Series.map`` would silently turn such values into
            NaN, which only fails later (integer cast / tensor creation) with
            a far less helpful message.
    """
    unknown = sorted(str(value) for value in set(frame['Label'].unique()) - set(mapping))
    if unknown:
        raise ValueError(
            f"{frame_name} contains labels that are missing from label_mapping: {unknown}"
        )
    return frame['Label'].map(mapping)


clean_df['Label'] = _encode_labels(clean_df, label_mapping, "clean_df")
leaked_df['Label'] = _encode_labels(leaked_df, label_mapping, "leaked_df")

# Default validation set: an empty frame that keeps clean_df's columns. A bare
# pd.DataFrame() has no 'hash' / 'is_leak' columns, so the drop below raised
# KeyError whenever every training image was flagged as leaked.
val_split = clean_df.iloc[:0].copy()
if not clean_df.empty:
    try:
        # Preferred: stratified 80/20 split of the leak-free rows.
        clean_train_split, val_split = train_test_split(
            clean_df, test_size=0.2, random_state=42, stratify=clean_df['Label'])
    except ValueError:
        # Stratification was rejected by train_test_split; fall back to a
        # plain random split.
        print("Peringatan: Gagal stratify, menggunakan split biasa.")
        clean_train_split, val_split = train_test_split(clean_df, test_size=0.2, random_state=42)
else:
    clean_train_split = clean_df

# Leaked rows are used for training only, never for validation.
train_split = pd.concat([clean_train_split, leaked_df], ignore_index=True)
# The helper columns are no longer needed once the split is done.
train_split = train_split.drop(columns=['hash', 'is_leak'])
val_split = val_split.drop(columns=['hash', 'is_leak'])
print(f"Ukuran set training final (sisa bersih + semua bocor): {len(train_split)}")
print(f"Ukuran set validasi murni (hanya dari data bersih): {len(val_split)}")
print("="*50 + "\n")

# --- Step 4.4: Continue with the usual pipeline ---
def _crop_jitter_flip():
    """Return fresh crop, colour-jitter and horizontal-flip steps used by both training pipelines."""
    return [
        transforms.RandomResizedCrop(IMG_SIZE, scale=(0.8, 1.0)),
        transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
        transforms.RandomHorizontalFlip(p=0.5),
    ]


def _tensor_and_normalize():
    """Return the closing steps of every pipeline: ToTensor, then ImageNet normalization."""
    return [
        transforms.ToTensor(),
        transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]


# Main training pipeline: CLAHE, crop/jitter/flip, then a rotation of up to 15 degrees.
train_transform = transforms.Compose([
    CLAHETransform(),
    *_crop_jitter_flip(),
    transforms.RandomRotation(degrees=15),
    *_tensor_and_normalize(),
])

# Extra training pipeline: a fixed 90-degree rotation (left or right, chosen at
# random) is applied right after CLAHE, before the crop/jitter/flip steps.
train_transform_extra = transforms.Compose([
    CLAHETransform(),
    transforms.RandomChoice([
        transforms.RandomRotation((90, 90)),
        transforms.RandomRotation((-90, -90)),
    ]),
    *_crop_jitter_flip(),
    *_tensor_and_normalize(),
])

# Validation / test pipeline: no random steps, only CLAHE and a fixed resize.
val_test_transform = transforms.Compose([
    CLAHETransform(),
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    *_tensor_and_normalize(),
])

# Training set: every image is served twice, once per augmentation pipeline.
train_dataset = DualTransformDataset(
    train_split,
    image_column='Filepath',
    label_column='Label',
    transform_main=train_transform,
    transform_extra=train_transform_extra,
)
val_dataset = CustomDataset(
    val_split,
    'Filepath',
    'Label',
    val_test_transform,
)

# Options shared by both loaders; only the shuffling differs.
_loader_options = dict(batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, persistent_workers=True)
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **_loader_options)

# Make sure every label is an integer. train_dataset holds a reference to this
# same DataFrame object, so it sees the converted column as well.
train_split["Label"] = train_split["Label"].astype(int)

# Balanced class weights computed from the training labels, moved to DEVICE
# as a float tensor.
_train_labels = train_split["Label"]
class_weights = compute_class_weight("balanced", classes=np.unique(_train_labels), y=_train_labels)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(DEVICE)


# ===================================================================
# === 5. DEFINISI MODEL (TIDAK DIUBAH) ===
# ===================================================================
class FusionDINOv2(nn.Module):
    """Two-backbone classifier that concatenates DINOv2 ViT and ConvNeXt features.

    Both timm backbones are created with ``num_classes=0`` and the
    concatenation of their outputs is passed to a small MLP head.

    Args:
        num_classes: Number of output logits. Defaults to 5.
    """

    def __init__(self, num_classes=5):
        super().__init__()
        self.dinov2 = timm.create_model("vit_base_patch14_dinov2", pretrained=True, num_classes=0)
        self.convnext = timm.create_model("convnext_tiny", pretrained=True, num_classes=0)

        # Width of the concatenated feature vector fed to the head.
        fusion_dim = self.dinov2.num_features + self.convnext.num_features
        head_layers = [
            nn.Linear(fusion_dim, 512),
            nn.LayerNorm(512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, x):
        """Return class logits for the image batch ``x``.

        The same input is fed to both backbones and their outputs are
        concatenated along dimension 1 before classification.
        """
        branch_features = [self.dinov2(x), self.convnext(x)]
        fused = torch.cat(branch_features, dim=1)
        return self.classifier(fused)
class SingleSwinModel(nn.Module):
    """Thin wrapper around a pretrained timm Swin backbone with a classification head.

    Args:
        num_classes: Number of output logits requested from timm. Defaults to 5.
    """

    def __init__(self, num_classes=5):
        super().__init__()
        # NOTE(review): the timm model name ends in "224", which presumably is
        # its expected input resolution, while this notebook uses IMG_SIZE = 518;
        # confirm before using this class.
        self.backbone = timm.create_model(
            "swin_base_patch4_window7_224",
            pretrained=True,
            num_classes=num_classes,
        )

    def forward(self, x):
        """Return the backbone's output for the batch ``x``."""
        logits = self.backbone(x)
        return logits

# ===================================================================
# === 6. INISIALISASI MODEL, LOSS, DAN SCALER (TIDAK DIUBAH) ===
# ===================================================================
# Fusion model with one output logit per entry of label_mapping, moved to DEVICE.
model = FusionDINOv2(num_classes=len(label_mapping)).to(DEVICE)
# Cross-entropy loss weighted by the class weights computed from the training split.
criterion = nn.CrossEntropyLoss(weight=class_weights)
# Gradient scaler used together with autocast in the training loops below.
scaler = torch.amp.GradScaler()
# Best validation macro-F1 seen so far; stage 3 saves a checkpoint whenever it improves.
best_f1 = 0.0

# ==================================================================================
# === 7. STRATEGI TRAINING: GRADUAL UNFREEZING (TIDAK DIUBAH) ===
#==================================================================================
# --- STAGE 1: Train only the classifier head ---
print("\n" + "="*50 + "\nTAHAP 1: Melatih Classifier Head\n" + "="*50)
# Freeze both backbones and leave only the fusion head trainable.
model.dinov2.requires_grad_(False)
model.convnext.requires_grad_(False)
model.classifier.requires_grad_(True)
# The optimizer only receives the parameters that are currently trainable.
optimizer = optim.AdamW([p for p in model.parameters() if p.requires_grad], lr=LR_S1)
for epoch in range(EPOCHS_S1):
    model.train()
    progress_bar = tqdm(train_loader, desc=f"S1 Epoch {epoch+1}/{EPOCHS_S1}")
    for images, labels in progress_bar:
        images = images.to(DEVICE)
        labels = labels.to(DEVICE)
        optimizer.zero_grad()
        # Forward pass and loss under autocast.
        with torch.amp.autocast(device_type='cuda'):
            outputs = model(images)
            loss = criterion(outputs, labels)
        # Scaled backward pass, then optimizer step and scale update via the scaler.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

# --- STAGE 2: Train the head plus the upper half of the DINOv2 blocks ---
print("\n" + "="*50 + "\nTAHAP 2: Melatih Head + Setengah Atas DINOv2\n" + "="*50)
total_blocks = len(model.dinov2.blocks)
# Unfreeze the blocks from the midpoint to the end; nothing else changes here.
for i in range(total_blocks // 2, total_blocks):
    model.dinov2.blocks[i].requires_grad_(True)
# New optimizer over everything that is trainable now.
optimizer = optim.AdamW([p for p in model.parameters() if p.requires_grad], lr=LR_S2)
for epoch in range(EPOCHS_S2):
    model.train()
    progress_bar = tqdm(train_loader, desc=f"S2 Epoch {epoch+1}/{EPOCHS_S2}")
    for images, labels in progress_bar:
        images = images.to(DEVICE)
        labels = labels.to(DEVICE)
        optimizer.zero_grad()
        # Forward pass and loss under autocast.
        with torch.amp.autocast(device_type='cuda'):
            outputs = model(images)
            loss = criterion(outputs, labels)
        # Scaled backward pass, then optimizer step and scale update via the scaler.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()



# --- STAGE 3: Fine-tune the whole model, validating after every epoch ---
print("\n" + "="*50 + "\nTAHAP 3: Fine-tuning Seluruh Model\n" + "="*50)
for param in model.parameters(): param.requires_grad = True
optimizer = optim.AdamW(model.parameters(), weight_decay=1e-4)
# One-cycle schedule stepped once per batch; max_lr is the stage-3 learning rate.
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer, 
    max_lr=LR_S3, 
    epochs=EPOCHS_S3, 
    steps_per_epoch=len(train_loader)
)

for epoch in range(EPOCHS_S3):
    # ---- training pass ----
    model.train()
    train_loss = 0.0
    train_bar = tqdm(train_loader, desc=f"S3 Epoch {epoch+1}/{EPOCHS_S3} (Train)")
    for images, labels in train_bar:
        images, labels = images.to(DEVICE), labels.to(DEVICE)
        optimizer.zero_grad()
        with torch.amp.autocast(device_type = 'cuda'):
            outputs = model(images)
            loss = criterion(outputs, labels)
        scaler.scale(loss).backward()
        # Gradients are still multiplied by the loss scale at this point.
        # Unscale them first so max_norm=1.0 applies to the true gradient
        # norm; scaler.step() detects the earlier unscale and does not
        # repeat it.
        scaler.unscale_(optimizer)
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()
        train_loss += loss.item()
    
    # ---- validation pass ----
    model.eval()
    val_loss, all_preds, all_labels = 0.0, [], []
    val_bar = tqdm(val_loader, desc=f"S3 Epoch {epoch+1}/{EPOCHS_S3} (Val)")
    with torch.no_grad():
        for images, labels in val_bar:
            images, labels = images.to(DEVICE), labels.to(DEVICE)
            with torch.amp.autocast(device_type = 'cuda'):
                outputs = model(images)
                loss = criterion(outputs, labels)
            val_loss += loss.item()
            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
            
    # Losses are averaged over the number of batches.
    avg_train_loss = train_loss / len(train_loader)
    avg_val_loss = val_loss / len(val_loader)
    f1 = f1_score(all_labels, all_preds, average="macro")
    
    print(f"\nEpoch [{epoch+1}/{EPOCHS_S3}] | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | Macro F1: {f1:.4f}")
    
    # Pass the full list of class ids explicitly: without `labels`,
    # classification_report raises when a class is absent from both the
    # validation labels and the predictions, because target_names would no
    # longer match the number of classes found.
    report = classification_report(
        all_labels,
        all_preds,
        labels=list(label_mapping.values()),
        target_names=list(label_mapping.keys()),
        zero_division=0,
    )
    print("\n--- Laporan Klasifikasi Validasi ---")
    print(report)
    
    
    # Keep only the checkpoint with the best validation macro-F1 so far.
    if f1 > best_f1:
        best_f1 = f1
        torch.save(model.state_dict(), "best_gradual_unfreeze_model_one.pth")
        print(f"✅ Model disimpan (F1 terbaik baru: {best_f1:.4f})")



In [None]:
###################################################################################
### BLOCK C: FINAL INFERENCE & SUBMISSION (FusionDINOv2 ONLY) ###
###################################################################################
print("\n" + "#"*80)
print("### MEMULAI BLOK C: INFERENCE & SUBMISSION (FusionDINOv2 SAJA) ###")
print("#"*80 + "\n")

# C.1. Build a fresh FusionDINOv2 with the same number of classes as in training.
model_fusion = FusionDINOv2(num_classes=len(label_mapping)).to(DEVICE)

# C.2. Load the best checkpoint written by the stage-3 training loop.
MODEL_PATH = "best_gradual_unfreeze_model_one.pth"
print(f"Memuat bobot dari: {MODEL_PATH}")
# The checkpoint is a plain state_dict, so weights_only=True is sufficient and
# avoids unpickling arbitrary objects from the file.
model_fusion.load_state_dict(torch.load(MODEL_PATH, map_location=DEVICE, weights_only=True))
model_fusion.eval()

# C.3. Test loader. The resize uses IMG_SIZE (previously a hard-coded 518) so
# inference always matches the resolution used for training and validation.
final_test_transform = transforms.Compose([
    CLAHETransform(),
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
])

test_df = convert_path_to_df(TEST_DIR, is_test=True)
test_dataset = TestDataset(test_df, 'Filepath', transform=final_test_transform)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS)

# Integer class id -> class name, for writing readable predictions.
reverse_label_mapping = {v: k for k, v in label_mapping.items()}

# C.4. Inference
submission_data = []
with torch.no_grad():
    for images, paths in tqdm(test_loader, desc="Inferensi FusionDINOv2"):
        images = images.to(DEVICE)
        outputs = model_fusion(images)
        preds = torch.argmax(outputs, dim=1)

        for i, path in enumerate(paths):
            # The submission id is the file name without its extension.
            img_id = os.path.splitext(os.path.basename(path))[0]
            label_int = preds[i].item()
            label_str = reverse_label_mapping[label_int]
            # NOTE(review): the column is called 'style' although it holds the
            # predicted class name; presumably dictated by the submission
            # format, confirm.
            submission_data.append({'id': img_id, 'style': label_str})

# C.5. Save the final result, sorted by id.
print("\nMenyimpan hasil prediksi ke submission_fusion_only.csv...")
submission_df = pd.DataFrame(submission_data)
submission_df.sort_values(by='id', inplace=True)
submission_df.to_csv("submission_fusion_only.csv", index=False)

print("✅ File submission_fusion_only.csv berhasil dibuat!")
print("\n" + "="*50 + "\nPROSES SELESAI (FUSIONDINOv2 SAJA)\n" + "="*50)
