# <span><h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spcaing: 3px; background-color: #f6f5f5; color :#fe346e; border-radius: 100px 100px; text-align:center">Import Required Libraries 📚</h1></span>

In [1]:
import os
import gc
import cv2
import math
import copy
import time
import random
import glob
from matplotlib import pyplot as plt

import h5py
from PIL import Image
from io import BytesIO

# For data manipulation
import numpy as np
import pandas as pd

# Pytorch Imports
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.cuda import amp
import torchvision

# Utils
import joblib
from tqdm import tqdm
from collections import defaultdict

# Sklearn Imports
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, StratifiedGroupKFold

# For Image Models
import timm

# Albumentations for augmentations
import albumentations as A
from albumentations.pytorch import ToTensorV2

# For colored terminal text
from colorama import Fore, Back, Style
b_ = Fore.BLUE
sr_ = Style.RESET_ALL

import warnings
warnings.filterwarnings("ignore")

# For descriptive error messages
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# <span><h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spcaing: 3px; background-color: #f6f5f5; color :#fe346e; border-radius: 100px 100px; text-align:center">Training Configuration ⚙️</h1></span>

In [2]:
CONFIG = {
    "seed": 42,
    'epochs': 15,
    'n_fold': 5,
    'fold': 0,
    "img_size": 224,
    "model_name": "tf_efficientnet_b0_ns",
    'train_batch_size': 64,
    "valid_batch_size": 64,
    "device": torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
}

# <span><h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spcaing: 3px; background-color: #f6f5f5; color :#fe346e; border-radius: 100px 100px; text-align:center">Set Seed for Reproducibility</h1></span>

In [3]:
def set_seed(seed=42):
    '''Sets the seed of the entire notebook so results are the same every time we run.
    This is for REPRODUCIBILITY.'''
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed
    os.environ['PYTHONHASHSEED'] = str(seed)
    
set_seed(CONFIG['seed'])

In [4]:
ROOT_DIR = "/kaggle/input/isic-2024-challenge"
TRAIN_CSV = f'{ROOT_DIR}/train-metadata.csv'
TRAIN_HDF = f'{ROOT_DIR}/train-image.hdf5'
SAMPLE = f'{ROOT_DIR}/sample_submission.csv'

weight_files = [
    "/kaggle/input/effnetb0-1-50-ratio/pAUC0.1517_Loss0.0085_epoch22_fold0.bin",
    "/kaggle/input/effnetb0-1-50-ratio/pAUC0.1537_Loss0.0065_epoch24_fold1.bin",
    "/kaggle/input/effnetb0-1-50-ratio/pAUC0.1734_Loss0.0074_epoch10_fold2.bin",
    "/kaggle/input/effnetb0-1-50-ratio/pAUC0.1290_Loss0.0069_epoch10_fold3.bin",
    "/kaggle/input/effnetb0-1-50-ratio/pAUC0.1583_Loss0.0069_epoch24_fold4.bin"
]

# <h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spcaing: 3px; background-color: #f6f5f5; color :#fe346e; border-radius: 100px 100px; text-align:center">Read the Data 📖</h1>

In [5]:
df = pd.read_csv(TRAIN_CSV)
# df['target'] = 0 # dummy

In [6]:
df_sub = pd.read_csv(SAMPLE)
df_sub

Unnamed: 0,isic_id,target
0,ISIC_0015657,0.3
1,ISIC_0015729,0.3
2,ISIC_0015740,0.3


# <span><h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spcaing: 3px; background-color: #f6f5f5; color :#fe346e; border-radius: 100px 100px; text-align:center">Dataset Class</h1></span>

In [7]:
class ISICDataset(Dataset):
    def __init__(self, df, file_hdf, transforms=None):
        self.df = df
        self.fp_hdf = h5py.File(file_hdf, mode="r")
        self.isic_ids = df['isic_id'].values
        self.targets = df['target'].values
        self.transforms = transforms
        
    def __len__(self):
        return len(self.isic_ids)
    
    def __getitem__(self, index):
        isic_id = self.isic_ids[index]
        img = np.array( Image.open(BytesIO(self.fp_hdf[isic_id][()])) )
        target = self.targets[index]
        
        if self.transforms:
            img = self.transforms(image=img)["image"]
            
        return {
            'image': img,
            'target': target,
        }

# <span><h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spcaing: 3px; background-color: #f6f5f5; color :#fe346e; border-radius: 100px 100px; text-align:center">Augmentations</h1></span>

In [8]:
data_transforms = {
    "valid": A.Compose([
        A.Resize(CONFIG['img_size'], CONFIG['img_size']),
        A.Normalize(
                mean=[0.485, 0.456, 0.406], 
                std=[0.229, 0.224, 0.225], 
                max_pixel_value=255.0, 
                p=1.0
            ),
        ToTensorV2()], p=1.)
}

# <span><h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spcaing: 3px; background-color: #f6f5f5; color :#fe346e; border-radius: 100px 100px; text-align:center">GeM Pooling</h1></span>

In [9]:
class GeM(nn.Module):
    def __init__(self, p=3, eps=1e-6):
        super(GeM, self).__init__()
        self.p = nn.Parameter(torch.ones(1)*p)
        self.eps = eps

    def forward(self, x):
        return self.gem(x, p=self.p, eps=self.eps)
        
    def gem(self, x, p=3, eps=1e-6):
        return F.avg_pool2d(x.clamp(min=eps).pow(p), (x.size(-2), x.size(-1))).pow(1./p)
        
    def __repr__(self):
        return self.__class__.__name__ + \
                '(' + 'p=' + '{:.4f}'.format(self.p.data.tolist()[0]) + \
                ', ' + 'eps=' + str(self.eps) + ')'

# <span><h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spcaing: 3px; background-color: #f6f5f5; color :#fe346e; border-radius: 100px 100px; text-align:center">Create Model</h1></span>

In [10]:
class ISICModel(nn.Module):
    def __init__(self, model_name, num_classes=1, pretrained=True, checkpoint_path=None):
        super(ISICModel, self).__init__()
        self.model = timm.create_model(model_name, pretrained=pretrained, checkpoint_path=checkpoint_path)

        in_features = self.model.classifier.in_features
        self.model.classifier = nn.Identity()
        self.model.global_pool = nn.Identity()
        self.pooling = GeM()
        self.linear = nn.Linear(in_features, num_classes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, images):
        features = self.model(images)
        pooled_features = self.pooling(features).flatten(1)
        output = self.sigmoid(self.linear(pooled_features))
        return output
    
models = []

# Load all models
for weight_file in weight_files:
    model = ISICModel(CONFIG['model_name'], pretrained=False)
    model.load_state_dict(torch.load(weight_file))
    model.to(CONFIG['device'])
    model.eval()
    models.append(model)

<span style="color: #000508; font-family: Segoe UI; font-size: 1.5em; font-weight: 300;">Prepare Dataloaders</span>

In [11]:
def save_to_hdf5(df, input_hdf5, output_hdf5):
    # Ensure output file is not open
    if os.path.exists(output_hdf5):
        os.remove(output_hdf5)
    
    with h5py.File(input_hdf5, "r") as inp_file:
        with h5py.File(output_hdf5, "w") as out_file:
            for isic_id in df['isic_id'].values:
                img_bytes = inp_file[isic_id][()]
                out_file.create_dataset(isic_id, data=img_bytes)


VALID_HDF = 'valid.hdf5'

In [12]:
df.head()

Unnamed: 0,isic_id,target,patient_id,age_approx,sex,anatom_site_general,clin_size_long_diam_mm,image_type,tbp_tile_type,tbp_lv_A,...,lesion_id,iddx_full,iddx_1,iddx_2,iddx_3,iddx_4,iddx_5,mel_mitotic_index,mel_thick_mm,tbp_lv_dnn_lesion_confidence
0,ISIC_0015670,0,IP_1235828,60.0,male,lower extremity,3.04,TBP tile: close-up,3D: white,20.244422,...,,Benign,Benign,,,,,,,97.517282
1,ISIC_0015845,0,IP_8170065,60.0,male,head/neck,1.1,TBP tile: close-up,3D: white,31.71257,...,IL_6727506,Benign,Benign,,,,,,,3.141455
2,ISIC_0015864,0,IP_6724798,60.0,male,posterior torso,3.4,TBP tile: close-up,3D: XP,22.57583,...,,Benign,Benign,,,,,,,99.80404
3,ISIC_0015902,0,IP_4111386,65.0,male,anterior torso,3.22,TBP tile: close-up,3D: XP,14.242329,...,,Benign,Benign,,,,,,,99.989998
4,ISIC_0024200,0,IP_8313778,55.0,male,anterior torso,2.73,TBP tile: close-up,3D: white,24.72552,...,,Benign,Benign,,,,,,,70.44251


In [13]:
# N_SPLITS = 5
# gkf = StratifiedGroupKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

# df_train["fold"] = -1
# for idx, (train_idx, val_idx) in enumerate(gkf.split(df_train, df_train["target"], groups=df_train["patient_id"])):
#     df_train.loc[val_idx, "fold"] = idx
    
    

sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)
df["kfold"] = -1
for fold, (train_idx, val_idx) in enumerate(sgkf.split(df, df.target, groups=df.patient_id)):
    df.loc[val_idx, "kfold"] = fold

In [14]:
df['kfold'].tail()

401054    4
401055    3
401056    1
401057    3
401058    0
Name: kfold, dtype: int64

<span style="color: #000508; font-family: Segoe UI; font-size: 1.5em; font-weight: 300;">Start Inference</span>

In [15]:
oof_predictions = np.zeros(len(df))

for fold in range(CONFIG['n_fold']):
    print(f"Fold: {fold}")
    df_train = df[df.kfold != fold].reset_index(drop=True)
    # Preserve original indices
    df_valid = df[df.kfold == fold]
    original_indices = df_valid.index.values  # Store original indices
    
    # Reset indices for validation data to ensure DataLoader compatibility
    df_valid = df_valid.reset_index(drop=True)
    save_to_hdf5(df_valid, TRAIN_HDF, VALID_HDF)
    valid_dataset = ISICDataset(df_valid, VALID_HDF, transforms=data_transforms["valid"])
    
    valid_loader = DataLoader(valid_dataset, batch_size=CONFIG['valid_batch_size'],
                              num_workers=2, shuffle=False, pin_memory=True)
    
    print(f"Total batches in valid_loader: {len(valid_loader)}")

    
    model = models[fold]
    
    fold_predictions = []
#     fold_targets = []
    
    with torch.no_grad():
        for data in tqdm(valid_loader, total=len(valid_loader)):
            images = data['image'].to(CONFIG['device'], dtype=torch.float)
            targets = data['target'].to(CONFIG['device'], dtype=torch.float)
            
            outputs = model(images).cpu().numpy()
            fold_predictions.extend(outputs)
            
            
#             fold_targets.extend(targets.cpu().numpy())
            
        fold_predictions = np.array(fold_predictions).flatten()
#         fold_targets = np.array(fold_targets)
            # Ensure predictions match number of validation samples
        assert len(fold_predictions) == len(original_indices), "Mismatch in predictions and validation indices"

        oof_predictions[original_indices] = fold_predictions
        print(f"Validation indices for the current fold: {original_indices}")
        print(f"Assigned predictions for fold {fold}: {oof_predictions[original_indices][:10]}")
     
        total_processed = sum(len(df[df.kfold == fold]) for fold in range(CONFIG['n_fold']))
        print(f"Total processed samples across folds: {total_processed}")


# Final check for zero predictions
zero_indices = np.where(oof_predictions == 0)[0]
if zero_indices.size > 0:
    print(f"Indices with zero predictions: {zero_indices[:10]}")  # Print first few zero indices
else:
    print("All predictions updated correctly.")

df['oof_predictions_effnetb0'] = oof_predictions
df.to_csv('oof_predictions.csv', index=False)

print(len(oof_predictions))
print(len(df))

print("OOF Stacking complete. OOF predictions saved to 'oof_predictions.csv'")

Fold: 0
Total batches in valid_loader: 1112


100%|██████████| 1112/1112 [02:00<00:00,  9.22it/s]


Validation indices for the current fold: [     1      3      4 ... 401051 401052 401058]
Assigned predictions for fold 0: [0.11310272 0.07607744 0.07885528 0.12455974 0.04582517 0.09488209
 0.04011615 0.45077193 0.01951986 0.2872549 ]
Total processed samples across folds: 401059
Fold: 1
Total batches in valid_loader: 1364


100%|██████████| 1364/1364 [02:18<00:00,  9.82it/s]


Validation indices for the current fold: [    14     15     25 ... 401039 401040 401056]
Assigned predictions for fold 1: [0.04177888 0.06273213 0.09936213 0.03112837 0.25753629 0.03823307
 0.09420696 0.0574858  0.02376986 0.06771408]
Total processed samples across folds: 401059
Fold: 2
Total batches in valid_loader: 1214


100%|██████████| 1214/1214 [02:14<00:00,  9.04it/s]


Validation indices for the current fold: [     2      8     12 ... 401048 401049 401050]
Assigned predictions for fold 2: [0.04394739 0.14159714 0.18218321 0.1087203  0.11780225 0.09254287
 0.12195077 0.0874887  0.13089873 0.05417652]
Total processed samples across folds: 401059
Fold: 3
Total batches in valid_loader: 1303


100%|██████████| 1303/1303 [02:17<00:00,  9.47it/s]


Validation indices for the current fold: [     0      6     43 ... 401047 401055 401057]
Assigned predictions for fold 3: [0.09282339 0.04861762 0.1033756  0.07535781 0.14564964 0.07135081
 0.09662806 0.05274736 0.06386439 0.18290392]
Total processed samples across folds: 401059
Fold: 4
Total batches in valid_loader: 1275


100%|██████████| 1275/1275 [02:16<00:00,  9.33it/s]


Validation indices for the current fold: [     5      9     11 ... 401045 401053 401054]
Assigned predictions for fold 4: [0.13725773 0.05724898 0.07836838 0.12246684 0.04991007 0.09034625
 0.05380093 0.06132026 0.02535415 0.15617739]
Total processed samples across folds: 401059
All predictions updated correctly.
401059
401059
OOF Stacking complete. OOF predictions saved to 'oof_predictions.csv'
