# Skin Cancer Detection with 3D-TBP

## 0. Setup

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
current_dir = "/kaggle/input/isic-2024-challenge"

# List all files in the current directory (excluding subdirectories)
files_in_dir = [f for f in os.listdir(current_dir)]
for file in files_in_dir:
    print(file)

    
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

sample_submission.csv
train-metadata.csv
test-metadata.csv
test-image.hdf5
train-image
train-image.hdf5


In [2]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import io
import os 
import h5py
import cv2
import copy
import re
import optuna
import torch
from google.colab.patches import cv2_imshow
from IPython.display import display, Javascript
import plotly.express as px
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer,KNNImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, FunctionTransformer
from sklearn.feature_extraction import FeatureHasher
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score, roc_curve, auc
from sklearn.preprocessing import LabelEncoder,MinMaxScaler
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression,SGDClassifier,BayesianRidge
from sklearn.utils import resample
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.combine import *
from imblearn.under_sampling import *
from imblearn.over_sampling import *
from imblearn.pipeline import Pipeline
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import catboost as cb
from sklearn.ensemble import VotingClassifier,StackingClassifier
from torchvision import transforms
import torchvision.transforms.v2 as T
from PIL import Image
import pickle
from torch.utils.data import Dataset, DataLoader
import torchvision.models as models
from tqdm import tqdm

In [4]:
import warnings
warnings.filterwarnings('ignore')

## 1. Data Loading

In [27]:
# load all the data required for model ensembling
result_dir = "/kaggle/input/results-3"
result_dn_dir = "/kaggle/input/results4"
current_dir = "/kaggle/input/isic-2024-challenge"
#all_dir = "/kaggle/input/all-isic-data-20240629"
id_dir = "/kaggle/input/datasplit"

# predictions
#train_preds = pd.read_csv(os.path.join(result_dir, "train_predictions.csv"))
val_preds = pd.read_csv(os.path.join(result_dir, "val_predictions.csv"))
test_preds = pd.read_csv(os.path.join(result_dir, "test_predictions.csv"))
val_preds_dn = pd.read_csv(os.path.join(result_dn_dir, "val_predictions.csv"))
test_preds_dn = pd.read_csv(os.path.join(result_dn_dir, "test_predictions.csv"))
val_preds["densenet"] = val_preds_dn["densenet"]
test_preds["densenet"] = test_preds_dn["densenet"]

# metadata
isic_meta = pd.read_csv(os.path.join(current_dir,'train-metadata.csv'), low_memory=False)
#all_meta = pd.read_csv(os.path.join(all_dir, "metadata.csv"))
#augmented_meta = pd.read_csv(os.path.join(result_dir, "augmented_metadata.csv"))

# train/validation/test ISIC_IDs
train_id = pd.read_csv(os.path.join(id_dir, "train_id.csv"))
val_id = pd.read_csv(os.path.join(id_dir, "val_id.csv"))
test_id = pd.read_csv(os.path.join(id_dir, "test_id.csv"))

**Dropping Features**

Notice that the following features are only included in `train-metadata`:
- `target`: Binary class {0: benign, 1: malignant}; i.e., labels
- `lesion_id`: Unique lesion identifier. Present in lesions that were manually tagged as a lesion of interest
- `iddx_full`: Fully classified lesion diagnosis
- `iddx_1`: First level lesion diagnosis
- `iddx_2`: Second level lesion diagnosis
- `iddx_3`: Third level lesion diagnosis
- `iddx_4`: Fourth level lesion diagnosis
- `iddx_5`: Fifth level lesion diagnosis
- `mel_mitotic_index`: Mitotic index of invasive malignant melanomas
- `mel_thick_mm`: Thickness in depth of melanoma invasion
- `tbp_lv_dnn_lesion_confidence`: Lesion confidence score (0-100 scale)

We will exclude all these features but `target` to avoid target leakage, because some of these variables directly reveal the target outcome.

Besides, the following features are irrelevant to our training:
- `patient_id`: Unique patient identifier; assumed that all the images are independent
- `image_type`: Structured field of the ISIC Archive for image type; all the images have the same value of "TBP tile: close-up"
- `tbp_tile_type`: Lighting modality of the 3D TBP source image
- `attribution`: Image attribution, synonymous with image source
- `copyright_license`: Copyright license

All these features will be excluded.

In [7]:
def drop_colns(df):
    """
    Remove features causing "target leakage" and irrelavant to training 
    """
    coln_list = [
        "patient_id", "lesion_id", "iddx_full", 
        "iddx_1", "iddx_2", "iddx_3", "iddx_4", "iddx_5",
        "mel_mitotic_index", "mel_thick_mm", "tbp_lv_dnn_lesion_confidence",
        "image_type", "tbp_tile_type", "attribution", "copyright_license"
    ]
    df_new = df.drop(columns=coln_list)
    return df_new

In [8]:
def one_hot_encode(meta):
    """
    Perform one-hot encoding for all the categorical variables
    """
    # identify categorical features (excl. `isic_id` and `target`)
    cat_cols = meta.select_dtypes(["object"]).columns[1:]

    # one-hot encoding
    meta_new = pd.get_dummies(meta, columns=cat_cols)

    return meta_new

In [9]:
isic_meta_upd = one_hot_encode(drop_colns(isic_meta))
isic_meta_upd.shape

(401059, 72)

In [10]:
train_meta = isic_meta_upd[isic_meta_upd["isic_id"].isin(train_id["isic_id"].values)]
val_meta = isic_meta_upd[isic_meta_upd["isic_id"].isin(val_id["isic_id"].values)]
test_meta = isic_meta_upd[isic_meta_upd["isic_id"].isin(test_id["isic_id"].values)]

In [11]:
# proportion of missing values for each feature
missing_prop = train_meta.isna().sum() / train_meta.shape[0]
missing_prop[missing_prop > 0] # displaying only the features with missing values

age_approx    0.006903
dtype: float64

### Metadata Preprocessing

In [13]:
def data_processing_metadata(result):
    """
    Imputation + standardization

    Input
    result: Output of `data_split`, a dictionary of the form
            {"train": [meta_train, img_train], "valid": ..., "test": ...}
    """
    # unpackage "result"
    result_copy = copy.deepcopy(result)
    meta_train = result_copy["train"]
    meta_valid = result_copy["valid"]
    meta_test = result_copy["test"]

    # KNN imputation
    # fit imputer ONLY on training data (avoiding info leakage)
    knn_imputer = KNNImputer()
    knn_imputer.fit(meta_train.iloc[:, 2:]) # excl. `isic_id` and `target`
    # apply imputer to ALL data
    meta_train.iloc[:, 2:] = knn_imputer.transform(meta_train.iloc[:, 2:])
    meta_valid.iloc[:, 2:] = knn_imputer.transform(meta_valid.iloc[:, 2:])
    meta_test.iloc[:, 2:] = knn_imputer.transform(meta_test.iloc[:, 2:])

    # standardization
    # fit scaler ONLY on training data (avoiding info leakage)
    num_cols = meta_train.select_dtypes(["int64", "float64"]).columns[1:35] # before the one-hot variables
    std_scaler = StandardScaler()
    std_scaler.fit(meta_train[num_cols]) # `isic_id` is not included in "num_cols"
    # apply scaler to ALL data
    meta_train[num_cols] = std_scaler.transform(meta_train[num_cols])
    meta_valid[num_cols] = std_scaler.transform(meta_valid[num_cols])
    meta_test[num_cols] = std_scaler.transform(meta_test[num_cols])

    result_new = {
        "train": meta_train,
        "valid": meta_valid,
        "test": meta_test
    }
    return result_new

In [14]:
res = {
    "train": train_meta,
    "valid": val_meta,
    "test": test_meta
}
res_ = data_processing_metadata(res)

In [15]:
print("Proportion of positive examples in training set")
print(np.sum(res_["train"]["target"]) / res_["train"].shape[0])
print("\nProportion of positive examples in validation set")
print(np.sum(res_["valid"]["target"]) / res_["valid"].shape[0])
print("\nProportion of positive examples in test set")
print(np.sum(res_["test"]["target"]) / res_["test"].shape[0])

Proportion of positive examples in training set
0.0009807384628171298

Proportion of positive examples in validation set
0.0009807343872072342

Proportion of positive examples in test set
0.000977409863860769


**Missing Values**

As can be seen in the previous missing value summary, there are missing entries for the following features ("feature_name": missing_proportion):
- `age_approx` (Approximate age of patient at time of imaging): ~0.7%
- `sex`: ~2.9%
- `anatom_site_general` (Location of the lesion on the patient's body): ~1.4%

Instead of removing these samples (images), we will perform imputation after splitting the dataset into a training set and a test set.

### 2.4. Data Resampling

The dataset is extremely imbalanced, since there are only less than 0.1% of images are positive samples. Up-sampling and/or down-sampling are required to address this issue.

In [14]:
def data_resampling(X_train, Y_train, down=0.1, up=0.3, seed=123):
    """
    Resample the training metadata to alleviate data imbalances
    """
    X_train_new = copy.deepcopy(X_train)
    Y_train_new = copy.deepcopy(Y_train)
    #X_train = meta_train.drop(columns=["isic_id", "target"])
    #Y_train = meta_train["target"]

    # down-sample the negative samples
    down_sampler = RandomUnderSampler(sampling_strategy=down, random_state=seed)
    X_train_new, Y_train_new = down_sampler.fit_resample(X_train_new, Y_train_new)
    # up-sample the positive samples using SMOTE
    up_sampler = SMOTE(sampling_strategy=up, random_state=seed)
    X_train_new, Y_train_new = up_sampler.fit_resample(X_train_new, Y_train_new)

    return X_train_new, Y_train_new

In [28]:
# extract all the datasets
meta_train = res_["train"]
meta_valid = res_["valid"]
meta_test = res_["test"]

X_train = meta_train.drop(columns=["isic_id", "target"])
Y_train = meta_train["target"]
X_valid = meta_valid.drop(columns=["isic_id", "target"])
Y_valid = meta_valid["target"]
X_test = meta_test.drop(columns=["isic_id", "target"])
Y_test = meta_test["target"]

In [148]:
# optional
meta_test = meta_test[meta_test["isic_id"].isin(isic_meta["isic_id"])]

## 3. Hyperparameter Tuning

In [7]:
def hyperparameter_tuning(X_train, Y_train, X_valid, Y_valid, model="xgb", seed=123):
    """
    Tune the hyperparameters on the validation set
    """
    # ElasticNet, Random Forest, XGBoost
    assert model in ["xgb", "cat", "gbm"]
    
    # objective to be optimized
    def objective(trial):
        num_positive = np.sum(Y_train == 1)
        num_negative = np.sum(Y_train == 0)
        ratio = num_negative / num_positive

        # XGBoost
        if model == "xgb":
            params = {
                'lambda': trial.suggest_loguniform('lambda', 1e-8, 10.0),
                'alpha': trial.suggest_loguniform('alpha', 1e-8, 10.0),
                'max_depth': trial.suggest_int('max_depth', 3, 12),
                'eta': trial.suggest_loguniform('eta', 1e-3, 0.3),
                'gamma': trial.suggest_loguniform('gamma', 1e-8, 10.0),
                'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-8, 10.0),
                'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
                'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
                'n_estimators': trial.suggest_int('n_estimators', 50, 500),
                'scale_pos_weight': trial.suggest_categorical('scale_pos_weight', [None, ratio])
            }
            clf = xgb.XGBClassifier(**params, random_state=seed)

        # CatBoost
        elif model == "cat":
            params = {
                'iterations': trial.suggest_int('iterations', 100, 1000),
                'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
                'depth': trial.suggest_int('depth', 4, 10),
                'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1.0, 10.0),
                'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS']),
                'random_strength': trial.suggest_float('random_strength', 1e-3, 10.0, log=True),
                'border_count': trial.suggest_int('border_count', 32, 255),
                #'scale_pos_weight': trial.suggest_float('scale_pos_weight', 0.5, 5.0),
                'scale_pos_weight': trial.suggest_categorical('scale_pos_weight', [None, ratio]),
                'verbose': 0,
            }
            if params['bootstrap_type'] == 'Bayesian':
                params['bagging_temperature'] = trial.suggest_float('bagging_temperature', 0, 10)
            elif params['bootstrap_type'] == 'Bernoulli':
                params['subsample'] = trial.suggest_float('subsample', 0.5, 1.0)
            clf = cb.CatBoostClassifier(**params, random_state=seed)

        # LightGBM
        elif model == "gbm":
            params = {
                'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
                'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
                'num_leaves': trial.suggest_int('num_leaves', 16, 256),
                'max_depth': trial.suggest_int('max_depth', 3, 15),
                'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
                'subsample': trial.suggest_float('subsample', 0.5, 1.0),
                'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
                'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
                'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
                #'scale_pos_weight': trial.suggest_float('scale_pos_weight', 0.5, 5.0),
                'scale_pos_weight': trial.suggest_categorical('scale_pos_weight', [None, ratio]),
                'verbosity': -1
            }
            clf = lgb.LGBMClassifier(**params, random_state=seed)
    
        # training
        clf.fit(X_train, Y_train)
        # prediction and evaluation
        _, _, _, pAUC = evaluate_model(clf, X_valid, Y_valid)
        return pAUC

    # Optuna optimization
    study = optuna.create_study(direction='maximize') # maximize pAUC
    study.optimize(objective, n_trials=50, timeout=600)

    return study

## 4. Model Training

In [8]:
def evaluate_model(clf, X_test, Y_test, min_tpr=0.8, show_scores=False):
    # predictions
    Y_pred = clf.predict(X_test)
    Y_probs = clf.predict_proba(X_test)[:, 1]

    # AUROC
    auroc = roc_auc_score(Y_test, Y_probs)
    
    # AUPRC
    auprc = average_precision_score(Y_test, Y_probs)
    
    # F1 score
    f1 = f1_score(Y_test, Y_pred)

    # partial AUC
    v_gt = abs(np.asarray(Y_test) - 1)
    v_pred = np.array([1.0 - x for x in Y_probs])
    max_fpr = abs(1 - min_tpr)
    partial_auc_scaled = roc_auc_score(v_gt, v_pred, max_fpr=0.8)
    partial_auc = 0.5 * max_fpr**2 + (max_fpr - 0.5 * max_fpr**2) / (1.0 - 0.5) * (partial_auc_scaled - 0.5)

    if show_scores:
        print(f"AUROC: {auroc:.4f}")
        print(f"AUPRC: {auprc:.4f}")
        print(f"F1 Score: {f1:.4f}")
        print(f"pAUC at {min_tpr}: {partial_auc:.4f}")

    return auroc, auprc, f1, partial_auc

In [32]:
study_xgb = hyperparameter_tuning(X_train, Y_train, X_valid, Y_valid, model="xgb")

[I 2025-04-22 01:02:21,228] A new study created in memory with name: no-name-40c64e27-a123-48e2-bb79-63c90bee1a79
[I 2025-04-22 01:02:28,406] Trial 0 finished with value: 0.17419546518514334 and parameters: {'lambda': 0.0015019471339901681, 'alpha': 0.10102669176703379, 'max_depth': 10, 'eta': 0.046468334255844575, 'gamma': 0.024052681212082015, 'min_child_weight': 4.083995016209901e-05, 'subsample': 0.505773648653792, 'colsample_bytree': 0.6765274623725224, 'n_estimators': 116, 'scale_pos_weight': None}. Best is trial 0 with value: 0.17419546518514334.
[I 2025-04-22 01:02:31,903] Trial 1 finished with value: 0.07373623551143574 and parameters: {'lambda': 0.007511152102032316, 'alpha': 1.5371367917142942, 'max_depth': 12, 'eta': 0.008778468121347163, 'gamma': 4.340983339787114, 'min_child_weight': 6.421381699602577, 'subsample': 0.81571511270474, 'colsample_bytree': 0.9570039683159998, 'n_estimators': 61, 'scale_pos_weight': None}. Best is trial 0 with value: 0.17419546518514334.
[I 20

In [33]:
study_cat = hyperparameter_tuning(X_train, Y_train, X_valid, Y_valid, model="cat")

[I 2025-04-22 01:12:34,826] A new study created in memory with name: no-name-ec911660-3626-4306-91b4-7085f953c9ce
[I 2025-04-22 01:13:13,941] Trial 0 finished with value: 0.17812182379649732 and parameters: {'iterations': 851, 'learning_rate': 0.0037200870591674725, 'depth': 6, 'l2_leaf_reg': 4.285788495087914, 'bootstrap_type': 'Bernoulli', 'random_strength': 0.10837371707583308, 'border_count': 183, 'scale_pos_weight': None, 'subsample': 0.8237516445941734}. Best is trial 0 with value: 0.17812182379649732.
[I 2025-04-22 01:14:05,659] Trial 1 finished with value: 0.1766268789305959 and parameters: {'iterations': 923, 'learning_rate': 0.014074350365508073, 'depth': 6, 'l2_leaf_reg': 3.975103397769698, 'bootstrap_type': 'Bayesian', 'random_strength': 0.1103763905033374, 'border_count': 160, 'scale_pos_weight': 1018.6398305084746, 'bagging_temperature': 1.04533256000498}. Best is trial 0 with value: 0.17812182379649732.
[I 2025-04-22 01:14:36,819] Trial 2 finished with value: 0.157676725

In [34]:
study_gbm = hyperparameter_tuning(X_train, Y_train, X_valid, Y_valid, model="gbm")

[I 2025-04-22 01:22:57,413] A new study created in memory with name: no-name-75c79c7a-b401-43db-9483-8a6461c0bc64
[I 2025-04-22 01:23:43,717] Trial 0 finished with value: 0.17684056938999973 and parameters: {'n_estimators': 906, 'learning_rate': 0.022844961471150563, 'num_leaves': 254, 'max_depth': 9, 'min_child_samples': 10, 'subsample': 0.7323663097435247, 'colsample_bytree': 0.6524451252850628, 'reg_alpha': 0.006866653418055607, 'reg_lambda': 0.6953459817229648, 'scale_pos_weight': None}. Best is trial 0 with value: 0.17684056938999973.
[I 2025-04-22 01:24:05,522] Trial 1 finished with value: 0.17544770016074904 and parameters: {'n_estimators': 593, 'learning_rate': 0.2026221686303442, 'num_leaves': 120, 'max_depth': 6, 'min_child_samples': 18, 'subsample': 0.7709682839001166, 'colsample_bytree': 0.7451953303426103, 'reg_alpha': 0.028728391358643635, 'reg_lambda': 3.4271708585570684, 'scale_pos_weight': None}. Best is trial 0 with value: 0.17684056938999973.
[I 2025-04-22 01:24:11,4

### Prediction of Single Models

In [35]:
seed = 123

# XGBoost
best_params_xgb = study_xgb.best_params
clf_xgb = xgb.XGBClassifier(**best_params_xgb, random_state=seed)
clf_xgb.fit(X_train, Y_train)
probs_xgb = clf_xgb.predict_proba(X_test)[:, 1]

# CatBoost
best_params_cat = study_cat.best_params
clf_cat = cb.CatBoostClassifier(**best_params_cat, verbose=0, random_state=seed)
clf_cat.fit(X_train, Y_train)
probs_cat = clf_cat.predict_proba(X_test)[:, 1]

# LightGBM
best_params_gbm = study_gbm.best_params
clf_gbm = lgb.LGBMClassifier(**best_params_gbm, verbose=-1, random_state=seed)
clf_gbm.fit(X_train, Y_train)
probs_gbm = clf_gbm.predict_proba(X_test)[:, 1]

In [36]:
# results
test_preds_trees = {
    "isic_id": res_["test"]["isic_id"],
    "xgboost": probs_xgb,
    "catboost": probs_cat,
    "lightgbm": probs_gbm
}
test_preds_trees = pd.DataFrame(test_preds_trees)

val_preds_trees = {
    "isic_id": res_["valid"]["isic_id"],
    "xgboost": clf_xgb.predict_proba(X_valid)[:, 1],
    "catboost": clf_cat.predict_proba(X_valid)[:, 1],
    "lightgbm": clf_gbm.predict_proba(X_valid)[:, 1]
}
val_preds_trees = pd.DataFrame(val_preds_trees)

In [5]:
# exports
val_preds_trees = pd.read_csv("/kaggle/input/preds-trees-poster/val_preds_trees.csv")
test_preds_trees = pd.read_csv("/kaggle/input/preds-trees-poster/test_preds_trees.csv")

In [28]:
# rearrange data for CNN-based methods
#all_preds = pd.concat([train_preds, val_preds, test_preds], axis=0)
all_preds = pd.concat([val_preds, test_preds], axis=0)

val_preds_red = all_preds[all_preds["isic_id"].isin(val_preds_trees["isic_id"].values)]
test_preds_red = all_preds[all_preds["isic_id"].isin(test_preds_trees["isic_id"].values)]

all_val_preds = val_preds_red.merge(val_preds_trees, on="isic_id", how="outer").dropna()
all_test_preds = test_preds_red.merge(test_preds_trees, on="isic_id", how="outer").dropna()

all_test_preds.head()

Unnamed: 0,isic_id,densenet,efficientnet,resnet,GroundTruth,xgboost,catboost,lightgbm
0,ISIC_0015845,0.100781,0.382003,0.00975,0.0,0.07618,0.201912,0.045785
1,ISIC_0024200,0.000803,0.182278,0.000461,0.0,9.023658e-05,0.001399,0.001022
2,ISIC_0035502,0.210058,0.377963,0.044277,0.0,6.115978e-05,0.001102,0.001434
3,ISIC_0051665,0.022299,0.051352,0.013658,0.0,2.073411e-06,0.000177,0.000481
4,ISIC_0051896,0.001666,0.102308,0.000458,0.0,4.743489e-07,0.000112,0.000445


### Ensemble Model

In [21]:
def compute_pAUC(Y_probs, Y_test, min_tpr=0.8, print_out=True):
    
    # partial AUC
    v_gt = abs(np.asarray(Y_test) - 1)
    v_pred = np.array([1.0 - x for x in Y_probs])
    max_fpr = abs(1 - min_tpr)
    partial_auc_scaled = roc_auc_score(v_gt, v_pred, max_fpr=0.8)
    partial_auc = 0.5 * max_fpr**2 + (max_fpr - 0.5 * max_fpr**2) / (1.0 - 0.5) * (partial_auc_scaled - 0.5)

    if print_out:
        print(f"pAUC at {min_tpr}: {partial_auc:.4f}")

    return partial_auc

In [29]:
compute_pAUC(all_test_preds["densenet"], all_test_preds["GroundTruth"])

pAUC at 0.8: 0.1559


0.15593763597429705

In [30]:
compute_pAUC(all_val_preds["densenet"], all_val_preds["GroundTruth"])

pAUC at 0.8: 0.1727


0.17271088302428242

In [31]:
def average_ensemble(preds, models=list()):
    """
    Hyperparameter tuning is not required.
    """
    avg_preds = np.mean(preds[models], axis=1)
    return avg_preds

In [32]:
# AVG (densenet, efficientnet, resnet)
compute_pAUC(average_ensemble(all_test_preds, models=["densenet", "efficientnet", "resnet"]),
             all_test_preds["GroundTruth"])

pAUC at 0.8: 0.1637


0.16368152497364027

In [33]:
# AVG (xgboost, catboost, lightgbm)
compute_pAUC(average_ensemble(all_test_preds, models=["xgboost", "lightgbm", "lightgbm"]),
             all_test_preds["GroundTruth"])

pAUC at 0.8: 0.1710


0.17099752272145777

In [34]:
# AVG (xgboost, efficientnet)
compute_pAUC(average_ensemble(all_test_preds, models=["xgboost", "densenet"]),
             all_test_preds["GroundTruth"])

pAUC at 0.8: 0.1561


0.15610995665933866

In [53]:
def weighted_avg_ensemble(val_preds, test_preds, models=list(), seed=123):
    """
    Tune the weights.
    """
    # hyperparameter tuning
    assert len(models) >= 1
    weights = [0 for _ in range(len(models))]
    np.random.shuffle(models)
    
    def objective(trial):
        # choose weights
        params = dict()
        weighted_score = np.zeros(val_preds.shape[0])
        upper_bound = 1
        for i in range(len(weights)):
            if i < len(weights) - 1:
                if np.round(upper_bound) == 0:
                    weight_i = 0
                else:
                    weight_i = np.round(trial.suggest_float(f"weight_{models[i]}", 0, upper_bound), 3)
                upper_bound -= weight_i
            else: # i == len(weights) - 1
                weights_i = upper_bound
            
            params[f"weight_{models[i]}"] = weight_i
            weighted_score += weight_i * val_preds[models[i]].values

        # pAUC
        pAUC = compute_pAUC(weighted_score, all_val_preds["GroundTruth"], print_out=False)
        
        return pAUC

    # Optuna optimization
    study = optuna.create_study(direction='maximize') # maximize pAUC
    study.optimize(objective, n_trials=50, timeout=600)

    # weighted average
    best_weights = study.best_params
    best_weights[f"weight_{models[-1]}"] = 1 - np.sum(list(best_weights.values()))
    weighted_score = np.zeros(test_preds.shape[0])
    for weight_model in best_weights.keys():
        weight_i = best_weights[weight_model]
        model_i = weight_model.split("_")[1]
        weighted_score += weight_i * test_preds[model_i].values
    
    return weighted_score

In [74]:
# WTD_AVG (densenet, efficientnet, resnet)
compute_pAUC(weighted_avg_ensemble(all_val_preds, all_test_preds, 
                                   models=["resnet", "efficientnet", "densenet"]),
             all_test_preds["GroundTruth"])

[I 2025-04-28 02:05:56,864] A new study created in memory with name: no-name-d184f324-c211-4108-bb9e-a61944362e29
[I 2025-04-28 02:05:56,890] Trial 0 finished with value: 0.1713169814704832 and parameters: {'weight_densenet': 0.47241048629304394, 'weight_resnet': 0.1719704350806666}. Best is trial 0 with value: 0.1713169814704832.
[I 2025-04-28 02:05:56,913] Trial 1 finished with value: 0.16918382817007696 and parameters: {'weight_densenet': 0.20713652748078804, 'weight_resnet': 0.5940605442651337}. Best is trial 0 with value: 0.1713169814704832.
[I 2025-04-28 02:05:56,935] Trial 2 finished with value: 0.17052608023986332 and parameters: {'weight_densenet': 0.45680312445381943, 'weight_resnet': 0.4578857108385063}. Best is trial 0 with value: 0.1713169814704832.
[I 2025-04-28 02:05:56,956] Trial 3 finished with value: 0.16878729888003946 and parameters: {'weight_densenet': 0.12457445723786842, 'weight_resnet': 0.5057058844477487}. Best is trial 0 with value: 0.1713169814704832.
[I 2025

pAUC at 0.8: 0.1616


0.16155927175629492

In [19]:
# WTD_AVG (xgboost, catboost, lightgbm)
compute_pAUC(weighted_avg_ensemble(all_val_preds, all_test_preds, 
                                   models=["xgboost", "catboost", "lightgbm"]),
             all_test_preds["GroundTruth"])

[I 2025-04-27 23:37:56,467] A new study created in memory with name: no-name-a2b53d23-75cf-4147-9769-761797cf71df
[I 2025-04-27 23:37:56,490] Trial 0 finished with value: 0.1838838804540755 and parameters: {'weight_xgboost': 0.8986290703446821}. Best is trial 0 with value: 0.1838838804540755.
[I 2025-04-27 23:37:56,512] Trial 1 finished with value: 0.1838838804540755 and parameters: {'weight_xgboost': 0.9089224317423841}. Best is trial 0 with value: 0.1838838804540755.
[I 2025-04-27 23:37:56,533] Trial 2 finished with value: 0.1838838804540755 and parameters: {'weight_xgboost': 0.7745232659549046}. Best is trial 0 with value: 0.1838838804540755.
[I 2025-04-27 23:37:56,553] Trial 3 finished with value: 0.1838838804540755 and parameters: {'weight_xgboost': 0.9814502201018589}. Best is trial 0 with value: 0.1838838804540755.
[I 2025-04-27 23:37:56,574] Trial 4 finished with value: 0.1838838804540755 and parameters: {'weight_xgboost': 0.6137505958898807}. Best is trial 0 with value: 0.1838

pAUC at 0.8: 0.1724


0.17243012652637504

In [76]:
# WTD_AVG (xgboost, catboost, lightgbm)
compute_pAUC(weighted_avg_ensemble(all_val_preds, all_test_preds, 
                                   models=["xgboost", "densenet"]),
             all_test_preds["GroundTruth"])

[I 2025-04-28 02:06:51,171] A new study created in memory with name: no-name-677a986a-f211-441b-97a6-7f767e94532f
[I 2025-04-28 02:06:51,195] Trial 0 finished with value: 0.17285796349595509 and parameters: {'weight_densenet': 0.005467958783009319}. Best is trial 0 with value: 0.17285796349595509.
[I 2025-04-28 02:06:51,217] Trial 1 finished with value: 0.17285796349595509 and parameters: {'weight_densenet': 0.1664173013943976}. Best is trial 0 with value: 0.17285796349595509.
[I 2025-04-28 02:06:51,238] Trial 2 finished with value: 0.17285796349595509 and parameters: {'weight_densenet': 0.6320069772757612}. Best is trial 0 with value: 0.17285796349595509.
[I 2025-04-28 02:06:51,259] Trial 3 finished with value: 0.17285796349595509 and parameters: {'weight_densenet': 0.5933767714882316}. Best is trial 0 with value: 0.17285796349595509.
[I 2025-04-28 02:06:51,280] Trial 4 finished with value: 0.17285796349595509 and parameters: {'weight_densenet': 0.8795043819092149}. Best is trial 0 wi

pAUC at 0.8: 0.1617


0.16168572785258795

In [42]:
def stacked_ensemble(val_preds, test_preds, models=list(), method="logit", seed=123):
    assert method in ["logit", "rf"]
    assert len(models) >= 1

    # predicted probs
    val_preds_meta = val_preds[models]
    test_preds_meta = test_preds[models]
    
    # meta model
    if method == "logit":
        clf = LogisticRegression()
    elif method == "rf":
        clf = RandomForestClassifier(n_estimators=1000, random_state=seed)

    clf.fit(val_preds_meta, val_preds["GroundTruth"])
    meta_preds = clf.predict_proba(test_preds_meta)[:, 1]
    return meta_preds

In [26]:
# STACKED (xgboost, catboost, lightgbm) with logit
compute_pAUC(stacked_ensemble(all_val_preds, all_test_preds, models=["xgboost", "catboost", "lightgbm"]),
             all_test_preds["GroundTruth"])

pAUC at 0.8: 0.1691


0.1690748461606155

In [46]:
# STACKED (xgboost, efficientnet) with logit
compute_pAUC(stacked_ensemble(all_val_preds, all_test_preds, models=["xgboost", "densenet"]),
             all_test_preds["GroundTruth"])

pAUC at 0.8: 0.1559


0.15594530553577332

In [28]:
# STACKED (xgboost, catboost, lightgbm) with rf
compute_pAUC(stacked_ensemble(all_val_preds, all_test_preds, models=["xgboost", "catboost", "lightgbm"], method="rf"),
             all_test_preds["GroundTruth"])

pAUC at 0.8: 0.1012


0.10118268024589853

In [45]:
# STACKED (xgboost, efficientnet) with rf
compute_pAUC(stacked_ensemble(all_val_preds, all_test_preds, models=["xgboost", "densenet"], method="rf"),
             all_test_preds["GroundTruth"])

pAUC at 0.8: 0.1157


0.11565052802842768

In [44]:
# STACKED (resnet, efficientnet, densenet) with logit
compute_pAUC(stacked_ensemble(all_val_preds, all_test_preds, models=["resnet", "efficientnet", "densenet"]),
             all_test_preds["GroundTruth"])

pAUC at 0.8: 0.1635


0.16347689129552637

In [47]:
# STACKED (resnet, efficientnet, densenet) with rf
compute_pAUC(stacked_ensemble(all_val_preds, all_test_preds, models=["resnet", "efficientnet", "densenet"], method="rf"),
             all_test_preds["GroundTruth"])

pAUC at 0.8: 0.1330


0.13300303920378337

In [48]:
# AVG (all)
compute_pAUC(average_ensemble(all_test_preds, models=["xgboost", "catboost", "lightgbm",
                                                      "resnet", "efficientnet", "densenet"]),
             all_test_preds["GroundTruth"])

pAUC at 0.8: 0.1645


0.16450555977002318

In [68]:
# WTD_AVG (densenet, efficientnet, resnet)
compute_pAUC(weighted_avg_ensemble(all_val_preds, all_test_preds, 
                                   models=["xgboost", "catboost", "lightgbm", 
                                           "resnet", "efficientnet", "densenet"]),
             all_test_preds["GroundTruth"])

[I 2025-04-28 02:02:19,091] A new study created in memory with name: no-name-7748f3d5-9734-4d87-bd72-19b098041dd3
[I 2025-04-28 02:02:19,118] Trial 0 finished with value: 0.1806046669813858 and parameters: {'weight_resnet': 0.0897897936709009, 'weight_lightgbm': 0.8529070944308235}. Best is trial 0 with value: 0.1806046669813858.
[I 2025-04-28 02:02:19,141] Trial 1 finished with value: 0.16937832802879169 and parameters: {'weight_resnet': 0.9805297130734819}. Best is trial 0 with value: 0.1806046669813858.
[I 2025-04-28 02:02:19,165] Trial 2 finished with value: 0.17763784616998549 and parameters: {'weight_resnet': 0.42604989068252264, 'weight_lightgbm': 0.42456164107125477}. Best is trial 0 with value: 0.1806046669813858.
[I 2025-04-28 02:02:19,187] Trial 3 finished with value: 0.17912362542998936 and parameters: {'weight_resnet': 0.28319419563231907, 'weight_lightgbm': 0.023346010332572897, 'weight_catboost': 0.42702802342709456}. Best is trial 0 with value: 0.1806046669813858.
[I 20

pAUC at 0.8: 0.1748


0.1748079961442374

In [69]:
# STACKED (all)
compute_pAUC(stacked_ensemble(all_val_preds, all_test_preds, 
                              models=["resnet", "efficientnet", "densenet", 
                                      "xgboost", "catboost", "lightgbm"]),
             all_test_preds["GroundTruth"])

pAUC at 0.8: 0.1639


0.1638684361590999

In [70]:
# STACKED (all)
compute_pAUC(stacked_ensemble(all_val_preds, all_test_preds, 
                              models=["resnet", "efficientnet", "densenet", 
                                      "xgboost", "catboost", "lightgbm"], method="rf"),
             all_test_preds["GroundTruth"])

pAUC at 0.8: 0.1520


0.15195373268288387

## 4. Model Testing