In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import polars as pl

import math
from scipy.stats import zscore
from numpy import nanmean, nanstd


from sklearn.model_selection import StratifiedGroupKFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder


from sklearn.metrics import roc_auc_score
from sklearn.ensemble import VotingClassifier

from imblearn.under_sampling import RandomUnderSampler
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline

import lightgbm as lgb
import catboost as cb
import xgboost as xgb

import itertools

import shap
import pickle

from joblib import Parallel, delayed
from concurrent.futures import ThreadPoolExecutor



pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

import torch

import optuna

In [2]:
def set_seed(seed=42):
    '''Seed every random number generator this notebook relies on.

    Seeds Python's built-in ``random`` module, NumPy's legacy global RNG and
    PyTorch (CPU and all CUDA devices), and switches cuDNN to deterministic
    mode so repeated runs produce the same results.

    Parameters
    ----------
    seed : int, default 42
        Seed value applied to every generator.
    '''
    # Local import: `random` is not among the notebook-level imports.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Covers every visible GPU, not only the current one; a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed.
    # NOTE: the running interpreter reads PYTHONHASHSEED only at start-up, so
    # this assignment affects child processes, not the current kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)

set_seed(42)

In [3]:
# Raw competition metadata, used for the quick exploratory cells below.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids the mixed-types DtypeWarning.
df_train = pd.read_csv(
    '/kaggle/input/isic-2024-challenge/train-metadata.csv',
    low_memory=False,
)
df_train.head()

Columns (51,52) have mixed types. Specify dtype option on import or set low_memory=False.


Unnamed: 0,isic_id,target,patient_id,age_approx,sex,anatom_site_general,clin_size_long_diam_mm,image_type,tbp_tile_type,tbp_lv_A,tbp_lv_Aext,tbp_lv_B,tbp_lv_Bext,tbp_lv_C,tbp_lv_Cext,tbp_lv_H,tbp_lv_Hext,tbp_lv_L,tbp_lv_Lext,tbp_lv_areaMM2,tbp_lv_area_perim_ratio,tbp_lv_color_std_mean,tbp_lv_deltaA,tbp_lv_deltaB,tbp_lv_deltaL,tbp_lv_deltaLB,tbp_lv_deltaLBnorm,tbp_lv_eccentricity,tbp_lv_location,tbp_lv_location_simple,tbp_lv_minorAxisMM,tbp_lv_nevi_confidence,tbp_lv_norm_border,tbp_lv_norm_color,tbp_lv_perimeterMM,tbp_lv_radial_color_std_max,tbp_lv_stdL,tbp_lv_stdLExt,tbp_lv_symm_2axis,tbp_lv_symm_2axis_angle,tbp_lv_x,tbp_lv_y,tbp_lv_z,attribution,copyright_license,lesion_id,iddx_full,iddx_1,iddx_2,iddx_3,iddx_4,iddx_5,mel_mitotic_index,mel_thick_mm,tbp_lv_dnn_lesion_confidence
0,ISIC_0015670,0,IP_1235828,60.0,male,lower extremity,3.04,TBP tile: close-up,3D: white,20.244422,16.261975,26.922447,23.954773,33.684638,28.953117,53.058545,55.828924,54.367448,62.025701,3.152561,27.47617,0.0,3.982447,2.967674,-7.658253,8.360566,5.784302,0.901302,Right Leg - Upper,Right Leg,1.543016,0.002628592,7.09136,0.0,9.307003,0.0,2.036195,2.63778,0.590476,85,-182.703552,613.493652,-42.427948,Memorial Sloan Kettering Cancer Center,CC-BY,,Benign,Benign,,,,,,,97.517282
1,ISIC_0015845,0,IP_8170065,60.0,male,head/neck,1.1,TBP tile: close-up,3D: white,31.71257,25.36474,26.331,24.54929,41.21903,35.29926,39.70291,44.06404,48.86152,55.36236,0.919497,12.23529,0.0,6.34783,1.781713,-6.500838,6.839008,4.987244,0.639885,Head & Neck,Head & Neck,0.821918,1.334303e-07,2.116402,0.0,3.354148,0.0,0.853227,3.912844,0.285714,55,-0.078308,1575.687,57.1745,Memorial Sloan Kettering Cancer Center,CC-BY,IL_6727506,Benign,Benign,,,,,,,3.141455
2,ISIC_0015864,0,IP_6724798,60.0,male,posterior torso,3.4,TBP tile: close-up,3D: XP,22.57583,17.12817,37.97046,33.48541,44.17492,37.6118,59.26585,62.90973,53.96118,61.67052,3.265153,24.18462,0.0,5.447655,4.485044,-7.709336,9.092376,6.290359,0.932147,Torso Back Top Third,Torso Back,1.194905,0.0002959177,4.798335,0.0,8.886309,0.0,1.743651,1.950777,0.361905,105,123.6497,1472.01,232.9089,Memorial Sloan Kettering Cancer Center,CC-BY,,Benign,Benign,,,,,,,99.80404
3,ISIC_0015902,0,IP_4111386,65.0,male,anterior torso,3.22,TBP tile: close-up,3D: XP,14.242329,12.164757,21.448144,21.121356,25.7462,24.374023,56.414429,60.060388,18.649518,23.314841,6.07994,14.889242,0.51452,2.077572,0.326788,-4.665323,4.783413,6.400196,0.654458,Torso Front Top Half,Torso Front,2.481328,21.98945,1.975874,1.771705,9.514499,0.66469,1.258541,1.573733,0.209581,130,-141.02478,1442.185791,58.359802,ACEMID MIA,CC-0,,Benign,Benign,,,,,,,99.989998
4,ISIC_0024200,0,IP_8313778,55.0,male,anterior torso,2.73,TBP tile: close-up,3D: white,24.72552,20.05747,26.4649,25.71046,36.21798,32.60874,46.94607,52.04118,46.27631,54.85574,2.101708,19.90256,0.0,4.668053,0.754434,-8.579431,9.148495,6.531302,0.946448,Torso Front Top Half,Torso Front,0.929916,0.001378832,3.658854,0.0,6.467562,0.0,2.085409,2.480509,0.313433,20,-72.31564,1488.72,21.42896,Memorial Sloan Kettering Cancer Center,CC-BY,,Benign,Benign,,,,,,,70.44251


In [4]:
# Absolute Pearson correlation between every pair of numeric metadata columns,
# rendered as a colour-graded table.
numeric_df = df_train.select_dtypes(include='number')
corr = numeric_df.corr(method='pearson').abs()
corr.style.background_gradient(cmap='inferno')

Unnamed: 0,target,age_approx,clin_size_long_diam_mm,tbp_lv_A,tbp_lv_Aext,tbp_lv_B,tbp_lv_Bext,tbp_lv_C,tbp_lv_Cext,tbp_lv_H,tbp_lv_Hext,tbp_lv_L,tbp_lv_Lext,tbp_lv_areaMM2,tbp_lv_area_perim_ratio,tbp_lv_color_std_mean,tbp_lv_deltaA,tbp_lv_deltaB,tbp_lv_deltaL,tbp_lv_deltaLB,tbp_lv_deltaLBnorm,tbp_lv_eccentricity,tbp_lv_minorAxisMM,tbp_lv_nevi_confidence,tbp_lv_norm_border,tbp_lv_norm_color,tbp_lv_perimeterMM,tbp_lv_radial_color_std_max,tbp_lv_stdL,tbp_lv_stdLExt,tbp_lv_symm_2axis,tbp_lv_symm_2axis_angle,tbp_lv_x,tbp_lv_y,tbp_lv_z,mel_thick_mm,tbp_lv_dnn_lesion_confidence
target,1.0,0.007734,0.032682,0.019788,0.023206,0.026366,0.013711,0.009431,0.000807,0.044884,0.032671,0.004053,0.000417,0.045139,0.009624,0.024271,0.001046,0.035069,0.011256,0.012237,0.015172,0.005295,0.035757,0.013341,0.006747,0.022264,0.036188,0.025441,0.012669,0.026084,0.002583,0.000867,0.002652,0.009926,0.007896,,0.054766
age_approx,0.007734,1.0,0.01616,0.010769,0.12104,0.087802,0.062579,0.069054,0.008615,0.074416,0.168048,0.077823,0.125966,0.015647,0.18163,0.07677,0.177898,0.08239,0.196032,0.196661,0.172622,0.100439,0.021973,0.285836,0.207641,0.070458,0.033006,0.04996,0.163012,0.090737,0.198716,0.004545,0.013595,0.018804,0.023817,0.080273,0.099672
clin_size_long_diam_mm,0.032682,0.01616,1.0,0.043167,0.075178,0.11833,0.037388,0.108182,0.056407,0.069811,0.048127,0.074518,0.005368,0.902768,0.478078,0.497229,0.035069,0.206246,0.215007,0.201913,0.23833,0.119727,0.858842,0.05812,0.268672,0.49965,0.965004,0.464033,0.235598,0.180026,0.017673,0.007453,0.000696,0.02132,0.007704,0.435815,0.080093
tbp_lv_A,0.019788,0.010769,0.043167,1.0,0.760407,0.417361,0.490904,0.736131,0.6756,0.57171,0.418779,0.311349,0.367616,0.02518,0.078353,0.094474,0.497827,0.001081,0.307697,0.303209,0.111847,0.051384,0.021112,0.054253,0.083821,0.093426,0.047431,0.082424,0.191993,0.139608,0.07393,0.005565,0.007303,0.217637,0.027398,0.422305,0.145164
tbp_lv_Aext,0.023206,0.12104,0.075178,0.760407,1.0,0.24517,0.356352,0.495618,0.650936,0.511267,0.748907,0.150641,0.101165,0.096634,0.164587,0.166624,0.184699,0.136899,0.119104,0.126344,0.249228,0.099519,0.109939,0.263145,0.203346,0.154987,0.054264,0.129133,0.183322,0.162648,0.209659,0.001797,0.013792,0.202624,0.047441,0.157602,0.028886
tbp_lv_B,0.026366,0.087802,0.11833,0.417361,0.24517,1.0,0.909632,0.920575,0.827217,0.489875,0.38292,0.525166,0.51448,0.105209,0.129852,0.036423,0.304182,0.541909,0.150081,0.181451,0.066,0.015677,0.12356,0.174185,0.105317,0.036509,0.138852,0.037803,0.052532,0.020432,0.062444,0.002183,0.001574,0.312377,0.094583,0.494025,0.229783
tbp_lv_Bext,0.013711,0.062579,0.037388,0.490904,0.356352,0.909632,1.0,0.885437,0.939995,0.334485,0.33362,0.43307,0.456322,0.029186,0.075584,0.074415,0.267,0.143808,0.235789,0.248105,0.031391,0.016296,0.047276,0.134365,0.068076,0.065123,0.05607,0.055291,0.1525,0.049738,0.048411,0.002139,0.000612,0.313613,0.115279,0.117989,0.15727
tbp_lv_C,0.009431,0.069054,0.108182,0.736131,0.495618,0.920575,0.885437,1.0,0.900188,0.119916,0.116227,0.527899,0.543948,0.089963,0.132318,0.014772,0.45212,0.401652,0.244462,0.26731,1.9e-05,0.011669,0.102103,0.145763,0.116299,0.013964,0.125094,0.008137,0.12271,0.0427,0.07971,0.004079,0.004474,0.325761,0.07858,0.506807,0.230427
tbp_lv_Cext,0.000807,0.008615,0.056407,0.6756,0.650936,0.827217,0.939995,0.900188,1.0,0.084,0.000533,0.407605,0.409315,0.057112,0.002575,0.003434,0.153108,0.068875,0.151436,0.159299,0.062204,0.048911,0.076928,0.015939,0.016913,0.000548,0.064193,0.000294,0.060868,0.099963,0.034788,0.001096,0.005816,0.329296,0.107489,0.173719,0.113506
tbp_lv_H,0.044884,0.074416,0.069811,0.57171,0.511267,0.489875,0.334485,0.119916,0.084,1.0,0.75872,0.158006,0.095044,0.075774,0.041566,0.126352,0.182406,0.490281,0.163595,0.13475,0.170672,0.063493,0.095841,0.097553,0.012819,0.125971,0.084406,0.115967,0.136785,0.15807,0.017431,0.003031,0.006276,0.07904,0.05503,0.119483,0.071059


In [5]:
# !python /kaggle/input/script-5-fold-resnest101/main.py
# !mv submission.csv submission_resnest101_oof.csv

In [6]:
# Runs main.py from the script-5-fold-effnetv1b0 input dataset, then renames the
# submission.csv in the working directory to submission_effnetv1b0_oof.csv.
# NOTE(review): assumes main.py writes submission.csv into the working directory — confirm.
!python /kaggle/input/script-5-fold-effnetv1b0/main.py
!mv submission.csv submission_effnetv1b0_oof.csv

100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  1.07it/s]


In [7]:
# --- Input locations ---------------------------------------------------------
root = Path('/kaggle/input/isic-2024-challenge')
train_path, test_path, subm_path = (
    root / file_name
    for file_name in ('train-metadata.csv', 'test-metadata.csv', 'sample_submission.csv')
)

# --- Key column names --------------------------------------------------------
id_col, target_col, group_col = 'isic_id', 'target', 'patient_id'

# --- Numeric settings --------------------------------------------------------
# err: small constant added to denominators in read_data to avoid division by zero.
# sampling_ratio / seed: NOTE(review): consumed outside this section — confirm usage.
err, sampling_ratio, seed = 1e-5, 0.01, 42

# NOTE(review): presumably toggles the "ugly duckling" feature step — confirm.
do_ud = True


# Raw numeric metadata columns used as model features and as inputs to the
# engineered features built in read_data.
# NOTE(review): the trailing "+" / "++" markers look like footnote references
# carried over from the dataset's data dictionary — confirm before removing.
num_cols = [
    'age_approx',                        # Approximate age of patient at time of imaging.
    'clin_size_long_diam_mm',            # Maximum diameter of the lesion (mm).+
    'tbp_lv_A',                          # A inside  lesion.+
    'tbp_lv_Aext',                       # A outside lesion.+
    'tbp_lv_B',                          # B inside  lesion.+
    'tbp_lv_Bext',                       # B outside lesion.+
    'tbp_lv_C',                          # Chroma inside  lesion.+
    'tbp_lv_Cext',                       # Chroma outside lesion.+
    'tbp_lv_H',                          # Hue inside the lesion; calculated as the angle of A* and B* in LAB* color space. Typical values range from 25 (red) to 75 (brown).+
    'tbp_lv_Hext',                       # Hue outside lesion.+
    'tbp_lv_L',                          # L inside lesion.+
    'tbp_lv_Lext',                       # L outside lesion.+
    'tbp_lv_areaMM2',                    # Area of lesion (mm^2).+
    'tbp_lv_area_perim_ratio',           # Border jaggedness, the ratio between the lesion's perimeter and area. Circular lesions will have low values; irregularly shaped lesions will have higher values. Values range 0-10.+
    'tbp_lv_color_std_mean',             # Color irregularity, calculated as the variance of colors within the lesion's boundary.
    'tbp_lv_deltaA',                     # Average A contrast (inside vs. outside lesion).+
    'tbp_lv_deltaB',                     # Average B contrast (inside vs. outside lesion).+
    'tbp_lv_deltaL',                     # Average L contrast (inside vs. outside lesion).+
    'tbp_lv_deltaLB',                    # NOTE(review): undocumented here; presumably the un-normalized counterpart of tbp_lv_deltaLBnorm — confirm.
    'tbp_lv_deltaLBnorm',                # Contrast between the lesion and its immediate surrounding skin. Low contrast lesions tend to be faintly visible such as freckles; high contrast lesions tend to be those with darker pigment. Calculated as the average delta LB of the lesion relative to its immediate background in LAB* color space. Typical values range from 5.5 to 25.+
    'tbp_lv_eccentricity',               # Eccentricity.+
    'tbp_lv_minorAxisMM',                # Smallest lesion diameter (mm).+
    'tbp_lv_nevi_confidence',            # Nevus confidence score (0-100 scale) is a convolutional neural network classifier estimated probability that the lesion is a nevus. The neural network was trained on approximately 57,000 lesions that were classified and labeled by a dermatologist.+,++
    'tbp_lv_norm_border',                # Border irregularity (0-10 scale); the normalized average of border jaggedness and asymmetry.+
    'tbp_lv_norm_color',                 # Color variation (0-10 scale); the normalized average of color asymmetry and color irregularity.+
    'tbp_lv_perimeterMM',                # Perimeter of lesion (mm).+
    'tbp_lv_radial_color_std_max',       # Color asymmetry, a measure of asymmetry of the spatial distribution of color within the lesion. This score is calculated by looking at the average standard deviation in LAB* color space within concentric rings originating from the lesion center. Values range 0-10.+
    'tbp_lv_stdL',                       # Standard deviation of L inside  lesion.+
    'tbp_lv_stdLExt',                    # Standard deviation of L outside lesion.+
    'tbp_lv_symm_2axis',                 # Border asymmetry; a measure of asymmetry of the lesion's contour about an axis perpendicular to the lesion's most symmetric axis. Lesions with two axes of symmetry will therefore have low scores (more symmetric), while lesions with only one or zero axes of symmetry will have higher scores (less symmetric). This score is calculated by comparing opposite halves of the lesion contour over many degrees of rotation. The angle where the halves are most similar identifies the principal axis of symmetry, while the second axis of symmetry is perpendicular to the principal axis. Border asymmetry is reported as the asymmetry value about this second axis. Values range 0-10.+
    'tbp_lv_symm_2axis_angle',           # Lesion border asymmetry angle.+
    'tbp_lv_x',                          # X-coordinate of the lesion on 3D TBP.+
    'tbp_lv_y',                          # Y-coordinate of the lesion on 3D TBP.+
    'tbp_lv_z',                          # Z-coordinate of the lesion on 3D TBP.+
]

# Names of the engineered numeric features created in read_data.
# Each trailing comment gives the formula used there (err is the module-level epsilon).
new_num_cols = [
    'lesion_size_ratio',             # tbp_lv_minorAxisMM / clin_size_long_diam_mm
    'lesion_shape_index',            # tbp_lv_areaMM2 / tbp_lv_perimeterMM ** 2
    'hue_contrast',                  # abs(tbp_lv_H - tbp_lv_Hext)
    'luminance_contrast',            # abs(tbp_lv_L - tbp_lv_Lext)
    'lesion_color_difference',       # sqrt(tbp_lv_deltaA ** 2 + tbp_lv_deltaB ** 2 + tbp_lv_deltaL ** 2)
    'border_complexity',             # tbp_lv_norm_border + tbp_lv_symm_2axis
    'color_uniformity',              # tbp_lv_color_std_mean / (tbp_lv_radial_color_std_max + err)

    'position_distance_3d',          # sqrt(tbp_lv_x ** 2 + tbp_lv_y ** 2 + tbp_lv_z ** 2)
    'perimeter_to_area_ratio',       # tbp_lv_perimeterMM / tbp_lv_areaMM2
    'area_to_perimeter_ratio',       # tbp_lv_areaMM2 / tbp_lv_perimeterMM
    'lesion_visibility_score',       # tbp_lv_deltaLBnorm + tbp_lv_norm_color
    'symmetry_border_consistency',   # tbp_lv_symm_2axis * tbp_lv_norm_border
    'consistency_symmetry_border',   # tbp_lv_symm_2axis * tbp_lv_norm_border / (tbp_lv_symm_2axis + tbp_lv_norm_border)

    'color_consistency',             # tbp_lv_stdL / tbp_lv_Lext
    'consistency_color',             # tbp_lv_stdL * tbp_lv_Lext / (tbp_lv_stdL + tbp_lv_Lext)
    'size_age_interaction',          # clin_size_long_diam_mm * age_approx
    'hue_color_std_interaction',     # tbp_lv_H * tbp_lv_color_std_mean
    'lesion_severity_index',         # (tbp_lv_norm_border + tbp_lv_norm_color + tbp_lv_eccentricity) / 3
    'shape_complexity_index',        # border_complexity + lesion_shape_index
    'color_contrast_index',          # tbp_lv_deltaA + tbp_lv_deltaB + tbp_lv_deltaL + tbp_lv_deltaLBnorm

    'log_lesion_area',               # log(tbp_lv_areaMM2 + 1)
    'normalized_lesion_size',        # clin_size_long_diam_mm / age_approx
    'mean_hue_difference',           # (tbp_lv_H + tbp_lv_Hext) / 2
    'std_dev_contrast',              # sqrt((tbp_lv_deltaA ** 2 + tbp_lv_deltaB ** 2 + tbp_lv_deltaL ** 2) / 3)
    'color_shape_composite_index',   # (tbp_lv_color_std_mean + tbp_lv_area_perim_ratio + tbp_lv_symm_2axis) / 3
    'lesion_orientation_3d',         # arctan2(tbp_lv_y, tbp_lv_x)
    'overall_color_difference',      # (tbp_lv_deltaA + tbp_lv_deltaB + tbp_lv_deltaL) / 3

    'symmetry_perimeter_interaction',# tbp_lv_symm_2axis * tbp_lv_perimeterMM
    'comprehensive_lesion_index',    # (tbp_lv_area_perim_ratio + tbp_lv_eccentricity + tbp_lv_norm_color + tbp_lv_symm_2axis) / 4
    'color_variance_ratio',          # tbp_lv_color_std_mean / tbp_lv_stdLExt
    'border_color_interaction',      # tbp_lv_norm_border * tbp_lv_norm_color
    'border_color_interaction_2',    # tbp_lv_norm_border * tbp_lv_norm_color / (tbp_lv_norm_border + tbp_lv_norm_color)
    'size_color_contrast_ratio',     # clin_size_long_diam_mm / tbp_lv_deltaLBnorm
    'age_normalized_nevi_confidence',# tbp_lv_nevi_confidence / age_approx
    'age_normalized_nevi_confidence_2',  # sqrt(clin_size_long_diam_mm ** 2 + age_approx ** 2); NOTE(review): does not use nevi confidence despite the name
    'color_asymmetry_index',         # tbp_lv_radial_color_std_max * tbp_lv_symm_2axis

    'volume_approximation_3d',       # tbp_lv_areaMM2 * sqrt(tbp_lv_x ** 2 + tbp_lv_y ** 2 + tbp_lv_z ** 2)
    'color_range',                   # abs(tbp_lv_L - tbp_lv_Lext) + abs(tbp_lv_A - tbp_lv_Aext) + abs(tbp_lv_B - tbp_lv_Bext)
    'shape_color_consistency',       # tbp_lv_eccentricity * tbp_lv_color_std_mean
    'border_length_ratio',           # tbp_lv_perimeterMM / (2 * pi * sqrt(tbp_lv_areaMM2 / pi))
    'age_size_symmetry_index',       # age_approx * clin_size_long_diam_mm * tbp_lv_symm_2axis
    'index_age_size_symmetry',       # age_approx * tbp_lv_areaMM2 * tbp_lv_symm_2axis
]

# Categorical metadata columns (cast to Categorical in read_data, one-hot encoded in preprocess).
cat_cols = [
    'sex',
    'anatom_site_general',
    'tbp_tile_type',
    'tbp_lv_location',
    'tbp_lv_location_simple',
    'attribution',
]

# Every per-patient derived column is named '<base column>_<suffix>'.
_all_numeric_cols = num_cols + new_num_cols

norm_cols = [base + '_patient_norm' for base in _all_numeric_cols]
_patient_sum_ratio = [base + '_patient_sum_ratio' for base in _all_numeric_cols]
_patient_minmax = [base + '_patient_minmax' for base in _all_numeric_cols]
_patient_rank = [base + '_patient_rank' for base in _all_numeric_cols]
# Built for completeness; not part of feature_cols below.
_patient_quantile_scaled = [base + '_patient_quantile_scaled' for base in _all_numeric_cols]

special_cols = ['count_per_patient']

# Full model feature list; the concatenation order is significant.
feature_cols = [
    *num_cols,
    *new_num_cols,
    *cat_cols,
    *norm_cols,
    *special_cols,
    *_patient_rank,
    *_patient_minmax,
    *_patient_sum_ratio,
]

In [8]:
def select_features_using_corr_matrix(df, threshold=0.91):
    """Return the column names that survive a pairwise-correlation filter.

    A column is discarded when its absolute correlation with any column that
    precedes it in ``df`` exceeds ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are compared with ``DataFrame.corr``.
    threshold : float, default 0.91
        Absolute-correlation cut-off above which the later column is dropped.

    Returns
    -------
    list
        Retained column names, in the sorted order produced by ``Index.difference``.
    """
    abs_corr = df.corr().abs()

    # Keep only the strict upper triangle so each pair is inspected once and
    # the diagonal (self-correlation) is ignored.
    upper_mask = np.triu(np.ones(abs_corr.shape), k=1).astype(np.bool_)
    upper_tri = abs_corr.where(upper_mask)

    redundant = []
    for name in upper_tri.columns:
        if (upper_tri[name] > threshold).any():
            redundant.append(name)

    kept = df.columns.difference(redundant)
    return kept.tolist()

In [9]:
# Rows of the positive class (target == 1), and how many of them there are.
df_pos = df_train.loc[df_train['target'].eq(1)]
print(len(df_pos))

393


In [10]:
# 75th percentile of the lesion diameter (mm) among the positive rows.
df_pos['clin_size_long_diam_mm'].quantile(q=0.75)

7.87

In [11]:
def preprocess(df_train, df_test):
    """One-hot encode the categorical columns of both frames in place.

    The encoder is fitted on ``df_train`` only; categories unseen during
    fitting are ignored when transforming. The encoded columns are named
    ``onehot_<i>`` and stored with ``category`` dtype.

    Side effects on module-level state:
      * the original categorical names are removed from ``feature_cols`` and
        the one-hot names appended to it;
      * the one-hot names are appended to ``selected_features``
        (NOTE(review): defined outside the visible section — confirm it exists
        before this is called);
      * ``cat_cols`` is rebound to the one-hot names, so the function is not
        idempotent.

    Returns
    -------
    tuple
        ``(df_train, df_test)`` — the same objects that were passed in.
    """
    global cat_cols

    encoder = OneHotEncoder(sparse_output=False, dtype=np.int32, handle_unknown='ignore')
    encoder.fit(df_train[cat_cols])

    n_encoded = len(encoder.get_feature_names_out())
    new_cat_cols = ['onehot_' + str(idx) for idx in range(n_encoded)]

    # Train first, then test — the same treatment for both frames.
    for frame in (df_train, df_test):
        frame[new_cat_cols] = encoder.transform(frame[cat_cols])
        frame[new_cat_cols] = frame[new_cat_cols].astype('category')

    # Swap the raw categorical names for their encoded replacements.
    for raw_name in cat_cols:
        feature_cols.remove(raw_name)
    feature_cols.extend(new_cat_cols)
    selected_features.extend(new_cat_cols)

    cat_cols = new_cat_cols

    return df_train, df_test

In [12]:
def custom_metric(estimator, X, y_true):
    """Partial area under the ROC curve restricted to TPR >= 0.80.

    Labels and scores are both flipped so that the "TPR above 80%" region
    becomes "FPR below 20%", which ``roc_auc_score`` handles via ``max_fpr``.
    The standardized value it returns is then mapped back to the raw partial
    area.

    Parameters
    ----------
    estimator : object
        Fitted classifier exposing ``predict_proba``.
    X : array-like
        Feature matrix passed to ``predict_proba``.
    y_true : array-like
        Binary ground-truth labels (0 / 1).

    Returns
    -------
    float
        The un-scaled partial AUC.
    """
    min_tpr = 0.80
    max_fpr = abs(1 - min_tpr)

    positive_proba = estimator.predict_proba(X)[:, 1]

    # Flip both ground truth and scores.
    flipped_truth = abs(y_true - 1)
    flipped_score = np.array([1.0 - p for p in positive_proba])

    scaled_auc = roc_auc_score(flipped_truth, flipped_score, max_fpr=max_fpr)

    # Undo the standardization applied by roc_auc_score when max_fpr is given.
    min_area = 0.5 * max_fpr**2
    partial_auc = min_area + (max_fpr - min_area) / (1.0 - 0.5) * (scaled_auc - 0.5)

    return partial_auc

### Data Read & Feature Engineering

In [13]:

def _attach_oof_column(df, csv_path, column):
    """Add one out-of-fold prediction column from ``csv_path`` to ``df`` and register it as a feature."""
    oof = pd.read_csv(csv_path)[[column]].reset_index(drop=True)
    df = df.reset_index(drop=True)
    # NOTE(review): rows are matched by position, not by isic_id — this assumes
    # the OOF file has exactly the same row order as the metadata file.
    df[column] = oof[column]
    # Register the feature only once so re-running the cell does not leave
    # duplicate names in the global feature list.
    if column not in feature_cols:
        feature_cols.append(column)
    return df


def read_data(path, oof_path=None, oof_path2=None):
    """Load a metadata CSV and build the engineered feature frame.

    Steps, in order:
      1. read the CSV with polars and coerce ``age_approx`` to Float64;
      2. replace NaN values in Float64 columns with the column median
         (computed on the file being read);
      3. add the hand-crafted lesion features listed in ``new_num_cols``;
      4. add per-patient normalisations of every numeric column
         (``_patient_norm``, ``_patient_sum_ratio``, ``_patient_minmax``,
         ``_patient_rank``, ``_patient_quantile_scaled``) and
         ``count_per_patient``;
      5. cast ``cat_cols`` to Categorical and convert to pandas;
      6. optionally attach out-of-fold image-model predictions.

    Parameters
    ----------
    path : str or pathlib.Path
        Metadata CSV to read.
    oof_path : str or pathlib.Path, optional
        CSV containing an ``oof_predictions_effnetb0`` column.
    oof_path2 : str or pathlib.Path, optional
        CSV containing an ``oof_predictions_eva02`` column.

    Returns
    -------
    pandas.DataFrame
        The metadata with all engineered columns appended.

    Notes
    -----
    Relies on the module-level ``num_cols``, ``new_num_cols``, ``cat_cols``,
    ``err`` and ``feature_cols``. When an OOF path is given, the matching
    column name is added to ``feature_cols`` (at most once).
    ``preprocess`` rebinds ``cat_cols``, so this function must run before it.
    """
    df = (
        pl.read_csv(path)
        .with_columns(
            pl.col('age_approx').cast(pl.String).replace('NA', np.nan).cast(pl.Float64),
        )
        .with_columns(
            pl.col(pl.Float64).fill_nan(pl.col(pl.Float64).median()), # You may want to impute test data with train
        )
        .with_columns(
            lesion_size_ratio              = pl.col('tbp_lv_minorAxisMM') / pl.col('clin_size_long_diam_mm'),
            lesion_shape_index             = pl.col('tbp_lv_areaMM2') / (pl.col('tbp_lv_perimeterMM') ** 2),
            hue_contrast                   = (pl.col('tbp_lv_H') - pl.col('tbp_lv_Hext')).abs(),
            luminance_contrast             = (pl.col('tbp_lv_L') - pl.col('tbp_lv_Lext')).abs(),
            lesion_color_difference        = (pl.col('tbp_lv_deltaA') ** 2 + pl.col('tbp_lv_deltaB') ** 2 + pl.col('tbp_lv_deltaL') ** 2).sqrt(),
            border_complexity              = pl.col('tbp_lv_norm_border') + pl.col('tbp_lv_symm_2axis'),
            color_uniformity               = pl.col('tbp_lv_color_std_mean') / (pl.col('tbp_lv_radial_color_std_max') + err),
        )
        .with_columns(
            position_distance_3d           = (pl.col('tbp_lv_x') ** 2 + pl.col('tbp_lv_y') ** 2 + pl.col('tbp_lv_z') ** 2).sqrt(),
            perimeter_to_area_ratio        = pl.col('tbp_lv_perimeterMM') / pl.col('tbp_lv_areaMM2'),
            area_to_perimeter_ratio        = pl.col('tbp_lv_areaMM2') / pl.col('tbp_lv_perimeterMM'),
            lesion_visibility_score        = pl.col('tbp_lv_deltaLBnorm') + pl.col('tbp_lv_norm_color'),
            combined_anatomical_site       = pl.col('anatom_site_general') + '_' + pl.col('tbp_lv_location'),
            symmetry_border_consistency    = pl.col('tbp_lv_symm_2axis') * pl.col('tbp_lv_norm_border'),
            consistency_symmetry_border    = pl.col('tbp_lv_symm_2axis') * pl.col('tbp_lv_norm_border') / (pl.col('tbp_lv_symm_2axis') + pl.col('tbp_lv_norm_border')),
        )
        .with_columns(
            color_consistency              = pl.col('tbp_lv_stdL') / pl.col('tbp_lv_Lext'),
            consistency_color              = pl.col('tbp_lv_stdL') * pl.col('tbp_lv_Lext') / (pl.col('tbp_lv_stdL') + pl.col('tbp_lv_Lext')),
            size_age_interaction           = pl.col('clin_size_long_diam_mm') * pl.col('age_approx'),
            hue_color_std_interaction      = pl.col('tbp_lv_H') * pl.col('tbp_lv_color_std_mean'),
            lesion_severity_index          = (pl.col('tbp_lv_norm_border') + pl.col('tbp_lv_norm_color') + pl.col('tbp_lv_eccentricity')) / 3,
            # Built from two features created in the first engineered batch above.
            shape_complexity_index         = pl.col('border_complexity') + pl.col('lesion_shape_index'),
            color_contrast_index           = pl.col('tbp_lv_deltaA') + pl.col('tbp_lv_deltaB') + pl.col('tbp_lv_deltaL') + pl.col('tbp_lv_deltaLBnorm'),
        )
        .with_columns(
            log_lesion_area                = (pl.col('tbp_lv_areaMM2') + 1).log(),
            normalized_lesion_size         = pl.col('clin_size_long_diam_mm') / pl.col('age_approx'),
            mean_hue_difference            = (pl.col('tbp_lv_H') + pl.col('tbp_lv_Hext')) / 2,
            std_dev_contrast               = ((pl.col('tbp_lv_deltaA') ** 2 + pl.col('tbp_lv_deltaB') ** 2 + pl.col('tbp_lv_deltaL') ** 2) / 3).sqrt(),
            color_shape_composite_index    = (pl.col('tbp_lv_color_std_mean') + pl.col('tbp_lv_area_perim_ratio') + pl.col('tbp_lv_symm_2axis')) / 3,
            lesion_orientation_3d          = pl.arctan2(pl.col('tbp_lv_y'), pl.col('tbp_lv_x')),
            overall_color_difference       = (pl.col('tbp_lv_deltaA') + pl.col('tbp_lv_deltaB') + pl.col('tbp_lv_deltaL')) / 3,
        )
        .with_columns(
            symmetry_perimeter_interaction = pl.col('tbp_lv_symm_2axis') * pl.col('tbp_lv_perimeterMM'),
            comprehensive_lesion_index     = (pl.col('tbp_lv_area_perim_ratio') + pl.col('tbp_lv_eccentricity') + pl.col('tbp_lv_norm_color') + pl.col('tbp_lv_symm_2axis')) / 4,
            color_variance_ratio           = pl.col('tbp_lv_color_std_mean') / pl.col('tbp_lv_stdLExt'),
            border_color_interaction       = pl.col('tbp_lv_norm_border') * pl.col('tbp_lv_norm_color'),
            border_color_interaction_2     = pl.col('tbp_lv_norm_border') * pl.col('tbp_lv_norm_color') / (pl.col('tbp_lv_norm_border') + pl.col('tbp_lv_norm_color')),
            size_color_contrast_ratio      = pl.col('clin_size_long_diam_mm') / pl.col('tbp_lv_deltaLBnorm'),
            age_normalized_nevi_confidence = pl.col('tbp_lv_nevi_confidence') / pl.col('age_approx'),
            # Despite its name this is the Euclidean norm of (diameter, age).
            age_normalized_nevi_confidence_2 = (pl.col('clin_size_long_diam_mm')**2 + pl.col('age_approx')**2).sqrt(),
            color_asymmetry_index          = pl.col('tbp_lv_radial_color_std_max') * pl.col('tbp_lv_symm_2axis'),
        )
        .with_columns(
            volume_approximation_3d        = pl.col('tbp_lv_areaMM2') * (pl.col('tbp_lv_x')**2 + pl.col('tbp_lv_y')**2 + pl.col('tbp_lv_z')**2).sqrt(),
            color_range                    = (pl.col('tbp_lv_L') - pl.col('tbp_lv_Lext')).abs() + (pl.col('tbp_lv_A') - pl.col('tbp_lv_Aext')).abs() + (pl.col('tbp_lv_B') - pl.col('tbp_lv_Bext')).abs(),
            shape_color_consistency        = pl.col('tbp_lv_eccentricity') * pl.col('tbp_lv_color_std_mean'),
            border_length_ratio            = pl.col('tbp_lv_perimeterMM') / (2 * np.pi * (pl.col('tbp_lv_areaMM2') / np.pi).sqrt()),
            age_size_symmetry_index        = pl.col('age_approx') * pl.col('clin_size_long_diam_mm') * pl.col('tbp_lv_symm_2axis'),
            index_age_size_symmetry        = pl.col('age_approx') * pl.col('tbp_lv_areaMM2') * pl.col('tbp_lv_symm_2axis'),
        )
        # Per-patient z-score; err keeps the denominator non-zero.
        .with_columns(
            ((pl.col(col) - pl.col(col).mean().over('patient_id')) / (pl.col(col).std().over('patient_id') + err)).alias(f'{col}_patient_norm') for col in (num_cols + new_num_cols)
        )
        # Share of the patient's total.
        .with_columns(
            (pl.col(col) / (pl.col(col).sum().over('patient_id') + err)).alias(f'{col}_patient_sum_ratio') for col in (num_cols + new_num_cols)
        )
        # Per-patient min-max scaling.
        .with_columns(
            ((pl.col(col) - pl.col(col).min().over('patient_id')) / (pl.col(col).max().over('patient_id') - pl.col(col).min().over('patient_id') + err)).alias(f'{col}_patient_minmax') for col in (num_cols + new_num_cols)
        )
        # Ordinal rank within the patient.
        .with_columns(
            (pl.col(col).rank('ordinal').over('patient_id')).alias(f'{col}_patient_rank') for col in (num_cols + new_num_cols)
        )
        # Per-patient scaling by the inter-quartile range.
        .with_columns(
            ((pl.col(col) - pl.col(col).quantile(0.25).over('patient_id')) / (pl.col(col).quantile(0.75).over('patient_id') - pl.col(col).quantile(0.25).over('patient_id') + err)).alias(f'{col}_patient_quantile_scaled') for col in (num_cols + new_num_cols)
        )
        .with_columns(
            count_per_patient = pl.col('isic_id').count().over('patient_id'),
        )
        .with_columns(
            pl.col(cat_cols).cast(pl.Categorical),
        )
        .to_pandas()
    )

    if oof_path:
        df = _attach_oof_column(df, oof_path, 'oof_predictions_effnetb0')

    if oof_path2:
        df = _attach_oof_column(df, oof_path2, 'oof_predictions_eva02')

    return df

In [14]:
# # Open the pickle file in binary read mode
# with open('/kaggle/input/shap-75-v2/lgbm_selected_features.pkl_75', 'rb') as file:
#     # Load the data from the file
#     lgb_shap_features = pickle.load(file)
    
# # Open the pickle file in binary read mode
# with open('/kaggle/input/shap-75-v2/cb_selected_features.pkl_75', 'rb') as file:
#     # Load the data from the file
#     cb_shap_features = pickle.load(file)
    
# # Open the pickle file in binary read mode
# with open('/kaggle/input/shap-75-v2/xgb_selected_features.pkl_75', 'rb') as file:
#     # Load the data from the file
#     xgb_shap_features = pickle.load(file)
    
# lgb_shap_features += ['oof_predictions_effnetb0']
# xgb_shap_features += ['oof_predictions_effnetb0']
# cb_shap_features += ['oof_predictions_effnetb0']

In [15]:
# Usage: build the engineered train / test frames.
# Optional second OOF source, currently disabled:
# , oof_path2='/kaggle/input/384-5-fold-resnest-oof-predictions/oof_predictions_resnest101.csv'
df_train = read_data(
    path=train_path,
    oof_path='/kaggle/input/384x384-1-50-ratio-5fold-effnetb0-oof-predictions/oof_predictions.csv',
)
df_test = read_data(path=test_path)

# hog_df = pd.read_csv('/kaggle/input/hog-featuresv2/hog_features (3).csv')
# hog_df = hog_df.iloc[:, :25]
# hog_df['image_id'] = hog_df['image_id'].str.replace('.jpg', '')

# df_train = pd.merge(df_train, hog_df, how='outer', left_on='isic_id', right_on='image_id')
# df_train = df_train.drop(columns=['image_id'])

# hog_num_cols = hog_df.select_dtypes(include='number').columns.tolist()

def ugly_duckling_processing(df, num_cols, include_patient_wide_ud=False, threshold=2.0,
                             location_col='tbp_lv_location'):
    """Add "ugly duckling" outlier features computed within each patient.

    For every column in ``num_cols`` the function appends:

    * ``ud_loc_<col>``: absolute z-score of the row within its
      (``patient_id``, ``location_col``) group;
    * ``ud_<col>``: absolute z-score within the whole patient (only when
      ``include_patient_wide_ud`` is true);
    * ``ud_percentile_<col>``: percentile rank of the row within its patient;

    plus per-row summaries of the z-score columns: how many exceed
    ``threshold`` (``ud_count_*``), their maximum (``ud_max_severity_*``) and
    their standard deviation across features (``ud_consistency_*``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``patient_id``, ``location_col`` and every column in
        ``num_cols``. Results are aligned back to ``df`` by index label, so the
        index is assumed to be unique.
    num_cols : list of str
        Numeric columns to score.
    include_patient_wide_ud : bool, default False
        If False, only location-based z-scores are produced.
    threshold : float, default 2.0
        Absolute z-score above which a feature counts as an outlier.
    location_col : str, default 'tbp_lv_location'
        Column that, together with ``patient_id``, defines the location groups.

    Returns
    -------
    (pandas.DataFrame, list of str)
        A new frame with the added columns, and the names of the added columns.

    Notes
    -----
    Rows whose group key is missing are dropped by ``groupby`` and therefore
    get NaN in the corresponding derived columns.
    """
    ud_columns = list(num_cols)
    ud_num_cols = []
    counter = 0

    def _tick():
        # Progress indicator: one dot for every tenth processed group.
        nonlocal counter
        counter += 1
        if counter % 10 == 0:
            print(".", end="", flush=True)

    def calc_ugly_duckling_scores(group, prefix):
        _tick()
        # Re-wrap the zscore output in a Series carrying the group's index so
        # the row labels survive regardless of what array type zscore returns.
        z_scores = group[ud_columns].apply(
            lambda col: pd.Series(np.asarray(zscore(col, nan_policy='omit')), index=col.index)
        )
        return z_scores.abs().add_prefix(prefix)

    def calc_percentile_ud_scores(group):
        _tick()
        return group[ud_columns].rank(pct=True).add_prefix('ud_percentile_')

    loc_cols = [f'ud_loc_{col}' for col in ud_columns]
    patient_cols = [f'ud_{col}' for col in ud_columns]

    print("Analyzing ducklings", end="", flush=True)
    # group_keys=False keeps the original row labels as the result index, so no
    # reset_index on the group-key levels is needed before the concat below.
    ud_scores_loc = df.groupby(['patient_id', location_col], group_keys=False)[ud_columns].apply(
        lambda g: calc_ugly_duckling_scores(g, 'ud_loc_')
    )

    print("\nConcat ducklings")
    df = pd.concat([df, ud_scores_loc], axis=1)

    if include_patient_wide_ud:
        print("Analyzing ducklings (part 2)", end="", flush=True)
        ud_scores_patient = df.groupby('patient_id', group_keys=False)[ud_columns].apply(
            lambda g: calc_ugly_duckling_scores(g, 'ud_')
        )
        df = pd.concat([df, ud_scores_patient], axis=1)
        print()  # New line after progress indicator

    print("Extending ducklings")
    ud_num_cols.extend(loc_cols)
    if include_patient_wide_ud:
        ud_num_cols.extend(patient_cols)

    print("Enhancing ugly duckling features", end="", flush=True)

    # 1. Percentile-based ugly duckling scores (rank within patient)
    counter = 0  # Reset counter for percentile calculation
    ud_percentiles = df.groupby('patient_id', group_keys=False)[ud_columns].apply(calc_percentile_ud_scores)
    df = pd.concat([df, ud_percentiles], axis=1)
    ud_num_cols.extend([f'ud_percentile_{col}' for col in ud_columns])
    print()  # New line after progress indicator

    # The summaries below all read the same absolute z-score frames; build them once.
    loc_abs = df[loc_cols].abs()
    patient_abs = df[patient_cols].abs() if include_patient_wide_ud else None

    # 2. Ugly duckling count features
    if include_patient_wide_ud:
        df['ud_count_patient'] = (patient_abs > threshold).sum(axis=1)
        ud_num_cols.append('ud_count_patient')
    df['ud_count_location'] = (loc_abs > threshold).sum(axis=1)
    ud_num_cols.append('ud_count_location')

    # 3. Ugly duckling severity features
    if include_patient_wide_ud:
        df['ud_max_severity_patient'] = patient_abs.max(axis=1)
        ud_num_cols.append('ud_max_severity_patient')
    df['ud_max_severity_location'] = loc_abs.max(axis=1)
    ud_num_cols.append('ud_max_severity_location')

    # 4. Ugly duckling consistency features
    if include_patient_wide_ud:
        df['ud_consistency_patient'] = patient_abs.std(axis=1)
        ud_num_cols.append('ud_consistency_patient')
    df['ud_consistency_location'] = loc_abs.std(axis=1)
    ud_num_cols.append('ud_consistency_location')

    return df, ud_num_cols

# Add the ugly-duckling features to both frames. Only the column list returned
# for the training frame is kept; the test call's list is discarded.
ud_base_cols = num_cols + new_num_cols
if do_ud:
    df_train, ud_num_cols = ugly_duckling_processing(df_train.copy(), ud_base_cols)
    df_test, _ = ugly_duckling_processing(df_test.copy(), ud_base_cols)
else:
    # A later cell concatenates ud_num_cols into the feature list, so make sure
    # the name exists even when ugly-duckling features are switched off.
    ud_num_cols = []

Columns (51,52) have mixed types. Specify dtype option on import or set low_memory=False.


Analyzing ducklings



........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................






In [16]:
# category_encoder = OrdinalEncoder(
#     categories='auto',
#     dtype=int,
#     handle_unknown='use_encoded_value',
#     unknown_value=-2,
#     encoded_missing_value=-1,
# )

# X_cat = category_encoder.fit_transform(df_train[cat_cols])
# for c, cat_col in enumerate(cat_cols):
#     df_train[cat_col] = X_cat[:, c]
    
# X_cat = category_encoder.transform(df_test[cat_cols])
# for c, cat_col in enumerate(cat_cols):
#     df_test[cat_col] = X_cat[:, c]
    
# Build the full numeric candidate list by concatenating the feature-name lists
# prepared earlier in the notebook.
# NOTE(review): this rebinds num_cols in terms of itself, so re-running the cell
# without re-running the cells that define num_cols appends the lists again.
num_cols = num_cols + new_num_cols + norm_cols + special_cols + _patient_rank + ud_num_cols + _patient_minmax + _patient_sum_ratio
# _patient_quantile_scaled + _patient_minmax + lbp_num_cols + _patient_sum_ratio

# Pick features from the numeric candidates via select_features_using_corr_matrix
# (defined elsewhere in the notebook) and report how many were selected.
selected_features = select_features_using_corr_matrix(df_train[num_cols])
print(len(selected_features))
    
# preprocess is defined elsewhere in the notebook. The recorded output shows the
# length of selected_features changing from 191 to 238 across this call, so it
# presumably extends selected_features as a side effect (verify in preprocess).
df_train, df_test = preprocess(df_train, df_test)
print(len(selected_features))



191
238


In [17]:
# Register the image-model OOF prediction column as a model feature and report
# the feature count before and after.
print(len(selected_features))
# Guarded so that re-running this cell does not add the column a second time.
if 'oof_predictions_effnetb0' not in selected_features:
    selected_features += ['oof_predictions_effnetb0']
# lgb_shap_features += ['oof_predictions_effnetb0']
# xgb_shap_features += ['oof_predictions_effnetb0']
# cb_shap_features += ['oof_predictions_effnetb0']
# print(len(lgb_shap_features))

print(len(selected_features))

# Replace +/-inf with NaN in both frames.
df_train.replace([np.inf, -np.inf], np.nan, inplace=True)
df_test.replace([np.inf, -np.inf], np.nan, inplace=True)

238
239


In [18]:
# Assign every training row to one of N_SPLITS validation folds, stratified on
# the target and grouped by patient so a patient never spans two folds.
N_SPLITS = 5
gkf = StratifiedGroupKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

# The splitter yields positional row indices. Write them into a positional
# array instead of going through the label-based .loc, so the assignment stays
# correct even if df_train's index is not a plain 0..n-1 range.
fold_ids = np.full(len(df_train), -1, dtype=np.int64)
for idx, (train_idx, val_idx) in enumerate(gkf.split(df_train, df_train["target"], groups=df_train["patient_id"])):
    fold_ids[val_idx] = idx
df_train["fold"] = fold_ids
    

# def balanced_fold_splitter(df, n_splits=5, seed=42):
#     np.random.seed(seed)
#     # Initialize fold assignments
#     df['fold'] = -1
    
#     # Get unique patients
#     unique_patients = df['patient_id'].unique()
#     np.random.shuffle(unique_patients)
    
#     # Initialize fold statistics
#     fold_target_means = {i: 0 for i in range(n_splits)}
#     fold_patient_counts = {i: 0 for i in range(n_splits)}
#     fold_sample_counts = {i: 0 for i in range(n_splits)}

#     # Assign patients to folds
#     for patient in unique_patients:
#         # Get all samples belonging to this patient
#         patient_samples = df[df['patient_id'] == patient]
#         target_mean = patient_samples['target'].mean()
#         num_samples = len(patient_samples)
        
#         # Evaluate which fold to assign this patient to based on balance criteria
#         fold_scores = {}
#         for fold in range(n_splits):
#             # Calculate how balanced this assignment would be
#             new_target_mean = (fold_target_means[fold] * fold_sample_counts[fold] + target_mean * num_samples) / (fold_sample_counts[fold] + num_samples)
#             fold_scores[fold] = (
#                 abs(new_target_mean - df['target'].mean()) +  # Keep target mean close to global mean
#                 abs(fold_patient_counts[fold] + 1 - len(unique_patients) / n_splits) +  # Balance patient count
#                 abs(fold_sample_counts[fold] + num_samples - len(df) / n_splits)  # Balance sample count
#             )
        
#         # Assign this patient to the best fold based on the lowest score
#         best_fold = min(fold_scores, key=fold_scores.get)
        
#         # Update the fold assignment and statistics
#         df.loc[df['patient_id'] == patient, 'fold'] = best_fold
#         fold_target_means[best_fold] = (fold_target_means[best_fold] * fold_sample_counts[best_fold] + target_mean * num_samples) / (fold_sample_counts[best_fold] + num_samples)
#         fold_patient_counts[best_fold] += 1
#         fold_sample_counts[best_fold] += num_samples

#     return df
    
# df_train = balanced_fold_splitter(df_train, n_splits=5)

# Sanity check of the split: per fold, the target rate, the row count and the
# number of distinct patients.
fold_summary_spec = {
    'target': ['mean', 'count'],
    'patient_id': 'nunique'
}
fold_summary = df_train.groupby('fold').agg(fold_summary_spec)
print(fold_summary)

        target        patient_id
          mean  count    nunique
fold                            
0     0.001166  71164        207
1     0.000894  87294        208
2     0.000773  77645        210
3     0.001068  83361        209
4     0.001017  81595        208


In [22]:
# Show the fold assigned to the last five training rows.
df_train["fold"].tail(5)

401054    4
401055    3
401056    1
401057    3
401058    0
Name: fold, dtype: int64

### Optuna HyperParam Tuned Models

In [23]:
# # # Define transformers to select features for each model
# lgb_transformer = ColumnTransformer([('selector', 'passthrough', lgb_shap_features)], remainder='drop')
# # cb_transformer = ColumnTransformer([('selector', 'passthrough', cb_shap_features)], remainder='drop')
# xgb_transformer = ColumnTransformer([('selector', 'passthrough', xgb_shap_features)], remainder='drop')

In [24]:

# LightGBM hyper-parameters (Optuna-tuned, per the section heading above).
# The number of boosting rounds is passed as `n_estimators`, the scikit-learn
# wrapper's own argument, instead of the native alias `num_iterations`; the
# alias made LightGBM emit "Found `num_iterations` in params" on every fit.
lgb_params = {
        'objective': 'binary',
        'random_state': seed,  # Use the current seed
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'n_estimators': 241,
        'learning_rate': 0.03104273262811841,
        'num_leaves': 111,
        'min_child_samples': 100,
        'bagging_fraction': 0.9101945804571369,
        'feature_fraction': 0.34103959543345147,
#         'feature_fraction_bynode': 0.5450587760167019,
        'bagging_freq': 1,
        'lambda_l1': 3.211848972246674e-07, 
        'lambda_l2': 3.720742147032093e-07,
        'max_depth': 4,
        'scale_pos_weight': 2.785263418574575,
}


# Pipeline: random under-sampling of the training rows, then the classifier.
lgb_model = Pipeline([
#     ('feature_selection', lgb_transformer),
    ('sampler', RandomUnderSampler(sampling_strategy=sampling_ratio, random_state=seed)),
    ('classifier', lgb.LGBMClassifier(**lgb_params)),
])

In [25]:
# CatBoost hyper-parameters. The categorical columns are passed to CatBoost
# through `cat_features`.
cb_params = dict(
    loss_function='Logloss',
    iterations=200,
    verbose=False,
    random_state=seed,
    cat_features=cat_cols,
    depth=5,
    learning_rate=0.0721506164096434,
    l2_leaf_reg=7.475812134744556,
    min_data_in_leaf=59,
    scale_pos_weight=4.254188566545996,
    subsample=0.6016652239201599,
)

# Pipeline: random under-sampling of the training rows, then the classifier.
#     ('feature_selection', cb_transformer) was previously the first step.
cb_sampler = RandomUnderSampler(sampling_strategy=sampling_ratio, random_state=seed)
cb_classifier = cb.CatBoostClassifier(**cb_params)
cb_model = Pipeline([
    ('sampler', cb_sampler),
    ('classifier', cb_classifier),
])

In [26]:
# XGBoost hyper-parameters, assembled in two steps: keys that are valid Python
# identifiers first, then the two regularisation keys, of which `lambda` is a
# reserved word and cannot be a keyword argument.
xgb_params = dict(
    enable_categorical=True,
    tree_method='hist',
    random_state=seed,
    n_estimators=223,
    learning_rate=0.042310779751030335,
    max_depth=6,
    subsample=0.811997027872852,
    min_child_weight=6,
    scale_pos_weight=3.884899202547225,
)
xgb_params.update({
    'lambda': 0.0002847007250281572,
    'alpha': 4.478266669867162e-08,
})
# Column-sampling settings that are currently disabled:
#     'colsample_bytree': 0.9104030105273816,
#     'colsample_bylevel': 0.7008826964577788,
#     'colsample_bynode': 0.8981511201813944,


# xgb_params = {
#     'enable_categorical': True,
#     'tree_method':        'hist',
#     'random_state':       seed,
#     'n_estimators':       200,
#     'learning_rate': 0.06393994023577587, 
#     'subsample': 0.9461788900624941,
#     'colsample_bytree': 0.6509914045331329,
#     'colsample_bylevel': 0.5562031466017122,
#     'colsample_bynode': 0.4145102493056463, 
#     'reg_alpha': 0.08084697108107797,
#     'reg_lambda': 1.1464049506276477,
#     'max_depth': 5,
#     'scale_pos_weight': 2.709319246977388,
#     'min_child_weight': 12
# }



# Pipeline: random under-sampling of the training rows, then the classifier.
#     ('feature_selection', xgb_transformer) was previously the first step.
xgb_sampler = RandomUnderSampler(sampling_strategy=sampling_ratio, random_state=seed)
xgb_classifier = xgb.XGBClassifier(**xgb_params)
xgb_model = Pipeline([
    ('sampler', xgb_sampler),
    ('classifier', xgb_classifier),
])

In [27]:
# Soft-voting ensemble of the three boosted-tree pipelines.
ensemble_members = [
    ('lgb', lgb_model),
    ('cb', cb_model),
    ('xgb', xgb_model),
]
estimator = VotingClassifier(ensemble_members, voting='soft')

### Cross Validation

### Training

In [28]:
def comp_score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str, min_tpr: float=0.80):
    """Return the partial AUC above the ``min_tpr`` true-positive rate.

    Labels and predictions are both flipped (``|y - 1|`` and ``1 - p``), and
    ``roc_auc_score`` is evaluated with ``max_fpr = |1 - min_tpr|`` on the
    flipped problem. The standardised value it returns (range [0.5, 1.0]) is
    then mapped back linearly onto [0.5 * max_fpr**2, max_fpr].

    ``solution`` and ``submission`` only need a ``.values`` attribute.
    ``row_id_column_name`` is accepted but not used.
    """
    flipped_truth = abs(np.asarray(solution.values) - 1)
    flipped_pred = np.array([1.0 - value for value in submission.values])

    max_fpr = abs(1 - min_tpr)
    partial_auc_scaled = roc_auc_score(flipped_truth, flipped_pred, max_fpr=max_fpr)

    # Undo the [0.5, 1.0] standardisation: shift into [lower, max_fpr].
    # https://math.stackexchange.com/questions/914823/shift-numbers-into-a-different-range
    lower = 0.5 * max_fpr**2
    return lower + (max_fpr - lower) / (1.0 - 0.5) * (partial_auc_scaled - 0.5)

In [29]:
%%time
scores = []
train_scores = []
models = []
oof_df = pd.DataFrame()

for fold in range(N_SPLITS):
    _df_train = df_train[df_train["fold"] != fold].reset_index(drop=True)
    _df_valid = df_train[df_train["fold"] == fold].reset_index(drop=True)
    
    # Fit the estimator on the training data for the current fold
    estimator.fit(_df_train[selected_features], _df_train["target"])
    
    # Predict on the validation data and calculate the validation score
    preds_valid = estimator.predict_proba(_df_valid[selected_features])[:, 1]
    score_valid = comp_score(_df_valid[["target"]], pd.DataFrame(preds_valid, columns=["prediction"]), "")
    print(f"fold: {fold} - Partial AUC Score (Validation): {score_valid:.5f}")
    
    # Predict on the training data and calculate the train score
    preds_train = estimator.predict_proba(_df_train[selected_features])[:, 1]
    score_train = comp_score(_df_train[["target"]], pd.DataFrame(preds_train, columns=["prediction"]), "")
    print(f"fold: {fold} - Partial AUC Score (Train): {score_train:.5f}")
    
    # Append scores and models
    scores.append(score_valid)
    train_scores.append(score_train)
    models.append(estimator)
    
    # Collect out-of-fold predictions
    oof_single = _df_valid[["isic_id", "target"]].copy()
    oof_single["pred"] = preds_valid
    oof_df = pd.concat([oof_df, oof_single])

# Display the average train and validation scores across all folds
print(f"Average Partial AUC Score (Validation): {np.mean(scores):.5f}")
print(f"Average Partial AUC Score (Train): {np.mean(train_scores):.5f}")


Found `num_iterations` in params. Will use it instead of argument


fold: 0 - Partial AUC Score (Validation): 0.18213
fold: 0 - Partial AUC Score (Train): 0.19954


Found `num_iterations` in params. Will use it instead of argument


fold: 1 - Partial AUC Score (Validation): 0.17847
fold: 1 - Partial AUC Score (Train): 0.19956


Found `num_iterations` in params. Will use it instead of argument


fold: 2 - Partial AUC Score (Validation): 0.18807
fold: 2 - Partial AUC Score (Train): 0.19949


Found `num_iterations` in params. Will use it instead of argument


fold: 3 - Partial AUC Score (Validation): 0.18003
fold: 3 - Partial AUC Score (Train): 0.19964


Found `num_iterations` in params. Will use it instead of argument


fold: 4 - Partial AUC Score (Validation): 0.18835
fold: 4 - Partial AUC Score (Train): 0.19955
Average Partial AUC Score (Validation): 0.18341
Average Partial AUC Score (Train): 0.19956
CPU times: user 6min 49s, sys: 10.2 s, total: 6min 59s
Wall time: 2min 25s


In [30]:
# Score the pooled out-of-fold predictions of all folds in a single call.
lgbm_score = comp_score(
    solution=oof_df["target"],
    submission=oof_df["pred"],
    row_id_column_name="",
)
print(f"ENSEMBLE Score: {lgbm_score:.5f}")

# effnetb10 1/50:

    # ENSEMBLE Score: 0.17409 effnetb0 + randomundersampler (0.01) + onehotencoder
    # ENSEMBLE Score: 0.17574 (Patient-Level Range Normalization) effnetb0 + randomundersampler (0.01) + onehotencoder
    # ENSEMBLE Score: 0.17628 (Ratio of Feature to Patient-Level Feature Sum) effnetb0 + randomundersampler (0.01) + onehotencoder

    # Score: 0.17589 (Ratio of Feature to Patient-Level Feature Sum) + effnetb0_224_1/50 ratio + randomundersampler (0.01) + onehotencoder
    # Score: 0.17630 (Ratio of Feature to Patient-Level Feature Sum) + (min-max features) + effnetb0_224_1/50 ratio + randomundersampler (0.01) + onehotencoder

    # Score: 0.17637 same as above, with corr drop 0.9 (lgb, xgb feature parameters removed)
    # Score: 0.17676 same as above, with corr drop 0.91 (lgb, xgb feature parameters removed)
    # Score: 0.17691 same as above, with corr drop 0.92 (lgb, xgb feature parameters removed)
    # Score: 0.17630 same as above, with corr drop 0.93 (lgb, xgb feature parameters removed)
    # Score: 0.17644 same as above, with corr drop 0.94 (lgb, xgb feature parameters removed)

    # Score: 0.17705 same as above, with corr drop 0.91 (lgb, xgb feature parameters kept) 224x224 effnetb0
    # Score: 0.17960 same as above, with corr drop 0.91 (lgb, xgb feature parameters kept) 384x384 effnetb0

    # Score: 0.17724 same as above, with corr drop 0.92 (lgb, xgb feature parameters kept)
    # Score: 0.17788 same as above, with corr drop 0.91 (lgb, xgb feature parameters kept)
    # Score: 0.17669 same as above, with corr drop 0.9 (lgb, xgb feature parameters kept)
    # Score: 0.17741 same as above, with corr drop 0.93 (lgb, xgb feature parameters kept)

    # ENSEMBLE Score: 0.17513 (Ratio of Feature to Patient-Level Feature Sum) effnetb0 + randomundersampler (0.01) + ordinalencoder
    # ENSEMBLE Score: 0.17425 effnetb0 + no undersampling, ordinal encoder
    # ENSEMBLE Score: 0.17350 (Ratio of Feature to Patient-Level Feature Sum) effnetb0 + ordinalencoder
    # ENSEMBLE Score: 0.17305 (Ratio of Feature to Patient-Level Feature Sum) effnetb0 + ordinalencoder + cat_cols


# ENSEMBLE Score: 0.17360 no undersampling (onehotencoder)

# ENSEMBLE Score: 0.17395 no undersampling, ordinal encoder

# ENSEMBLE Score: 0.17418 no undersampling, ordinal encoder, 0.9 corr drop


# ENSEMBLE Score: 0.17690 (Ratio of Feature to Patient-Level Feature Sum) effnetb0 + randomundersampler (0.01) + onehotencoder + 100 feature selection

ENSEMBLE Score: 0.18288


### Prediction

In [31]:
# Attach the image-model predictions to the test frame under the same column
# name that the training frame uses for its OOF predictions.
# NOTE(review): the assignment aligns on the index, not on isic_id - confirm
# that the CSV rows are in the same order as df_test.
effnet_feature = "oof_predictions_effnetb0"
df_effb0_oof = pd.read_csv('submission_effnetv1b0_oof.csv')
df_test[effnet_feature] = df_effb0_oof["target"]

df_test[effnet_feature].head()

0    0.129663
1    0.041468
2    0.054609
Name: oof_predictions_effnetb0, dtype: float64

In [32]:
# df_resnest101_oof = pd.read_csv('submission_resnest101_oof.csv')
# df_test["oof_predictions_resnest101"] = df_resnest101_oof["target"]

# df_test['oof_predictions_resnest101'].head()

In [33]:
# Read the submission template from subm_path and preview its first rows.
df_subm = pd.read_csv(filepath_or_buffer=subm_path)
df_subm.head(5)

Unnamed: 0,isic_id,target
0,ISIC_0015657,0.3
1,ISIC_0015729,0.3
2,ISIC_0015740,0.3


In [34]:
# Average the positive-class probability of the stored fold models over the
# test rows, write it into the submission template and save it.
# NOTE(review): the predictions are assigned positionally, which assumes
# df_subm and df_test list the same rows in the same order - confirm.
test_features = df_test[selected_features]
test_preds = np.zeros(len(df_test))

for fold_model in models:
    fold_proba = fold_model.predict_proba(test_features)[:, 1]
    test_preds += fold_proba / N_SPLITS

df_subm['target'] = test_preds

df_subm.to_csv('submission.csv', index=False)
df_subm.head()

Unnamed: 0,isic_id,target
0,ISIC_0015657,0.294205
1,ISIC_0015729,0.180381
2,ISIC_0015740,0.214993
