In [1]:
import numpy as np
import pandas as pd
import pandas.api.types
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.model_selection import GroupKFold, StratifiedGroupKFold

import cv2
import numpy as np
import pandas as pd
import pydicom as dicom
from keras import Model, Input, Layer
from keras.src.layers import Dense, Dropout
import tensorflow as tf
import matplotlib.pyplot as plt
from keras.src.callbacks import Callback
from sklearn.metrics import roc_curve, auc, roc_auc_score
from tensorflow.python.framework import constant_op
from tensorflow.python.ops import clip_ops, math_ops
from tensorflow.keras import backend as K
from keras.src import ops
import tqdm
import lightgbm as lgb
from tabnet import TabNet, TabNetClassifier
from keras.src.layers import BatchNormalization, Activation

In [2]:
N_FOLDS = 5
base_path = "/Users/toru/PycharmProjects/isic-2024-challenge"
df_train = pd.read_csv(f"{base_path}/train-metadata.csv")
df_test = pd.read_csv(f"{base_path}/test-metadata.csv")


def feature_engineering(df):
    df["lesion_size_ratio"] = df["tbp_lv_minorAxisMM"] / df["clin_size_long_diam_mm"]
    df["lesion_shape_index"] = df["tbp_lv_areaMM2"] / (df["tbp_lv_perimeterMM"] ** 2)
    df["hue_contrast"] = (df["tbp_lv_H"] - df["tbp_lv_Hext"]).abs()
    df["luminance_contrast"] = (df["tbp_lv_L"] - df["tbp_lv_Lext"]).abs()
    df["lesion_color_difference"] = np.sqrt(
        df["tbp_lv_deltaA"] ** 2 + df["tbp_lv_deltaB"] ** 2 + df["tbp_lv_deltaL"] ** 2)
    df["border_complexity"] = df["tbp_lv_norm_border"] + df["tbp_lv_symm_2axis"]
    df["color_uniformity"] = df["tbp_lv_color_std_mean"] / df["tbp_lv_radial_color_std_max"]
    df["3d_position_distance"] = np.sqrt(
        df["tbp_lv_x"] ** 2 + df["tbp_lv_y"] ** 2 + df["tbp_lv_z"] ** 2)
    df["perimeter_to_area_ratio"] = df["tbp_lv_perimeterMM"] / df["tbp_lv_areaMM2"]
    df["lesion_visibility_score"] = df["tbp_lv_deltaLBnorm"] + df["tbp_lv_norm_color"]
    df["combined_anatomical_site"] = df["anatom_site_general"] + "_" + df["tbp_lv_location"]
    df["symmetry_border_consistency"] = df["tbp_lv_symm_2axis"] * df["tbp_lv_norm_border"]
    df["color_consistency"] = df["tbp_lv_stdL"] / df["tbp_lv_Lext"]
    df["size_age_interaction"] = df["clin_size_long_diam_mm"] * df["age_approx"]
    df["hue_color_std_interaction"] = df["tbp_lv_H"] * df["tbp_lv_color_std_mean"]
    df["lesion_severity_index"] = (df["tbp_lv_norm_border"] + df["tbp_lv_norm_color"] + df[
        "tbp_lv_eccentricity"]) / 3
    df["shape_complexity_index"] = df["border_complexity"] + df["lesion_shape_index"]
    df["color_contrast_index"] = df["tbp_lv_deltaA"] + df["tbp_lv_deltaB"] + df["tbp_lv_deltaL"] + \
                                 df["tbp_lv_deltaLBnorm"]
    df["log_lesion_area"] = np.log(df["tbp_lv_areaMM2"] + 1)
    df["normalized_lesion_size"] = df["clin_size_long_diam_mm"] / df["age_approx"]
    df["mean_hue_difference"] = (df["tbp_lv_H"] + df["tbp_lv_Hext"]) / 2
    df["std_dev_contrast"] = np.sqrt(
        (df["tbp_lv_deltaA"] ** 2 + df["tbp_lv_deltaB"] ** 2 + df["tbp_lv_deltaL"] ** 2) / 3)
    df["color_shape_composite_index"] = (df["tbp_lv_color_std_mean"] + df[
        "tbp_lv_area_perim_ratio"] + df["tbp_lv_symm_2axis"]) / 3
    df["3d_lesion_orientation"] = np.arctan2(df_train["tbp_lv_y"], df_train["tbp_lv_x"])
    df["overall_color_difference"] = (df["tbp_lv_deltaA"] + df["tbp_lv_deltaB"] + df[
        "tbp_lv_deltaL"]) / 3
    df["symmetry_perimeter_interaction"] = df["tbp_lv_symm_2axis"] * df["tbp_lv_perimeterMM"]
    df["comprehensive_lesion_index"] = (df["tbp_lv_area_perim_ratio"] + df["tbp_lv_eccentricity"] +
                                        df["tbp_lv_norm_color"] + df["tbp_lv_symm_2axis"]) / 4
    df["color_variance_ratio"] = df["tbp_lv_color_std_mean"] / df["tbp_lv_stdLExt"]
    df["border_color_interaction"] = df["tbp_lv_norm_border"] * df["tbp_lv_norm_color"]
    df["size_color_contrast_ratio"] = df["clin_size_long_diam_mm"] / df["tbp_lv_deltaLBnorm"]
    df["age_normalized_nevi_confidence"] = df["tbp_lv_nevi_confidence"] / df["age_approx"]
    df["color_asymmetry_index"] = df["tbp_lv_radial_color_std_max"] * df["tbp_lv_symm_2axis"]
    df["3d_volume_approximation"] = df["tbp_lv_areaMM2"] * np.sqrt(
        df["tbp_lv_x"] ** 2 + df["tbp_lv_y"] ** 2 + df["tbp_lv_z"] ** 2)
    df["color_range"] = (df["tbp_lv_L"] - df["tbp_lv_Lext"]).abs() + (
            df["tbp_lv_A"] - df["tbp_lv_Aext"]).abs() + (
                                df["tbp_lv_B"] - df["tbp_lv_Bext"]).abs()
    df["shape_color_consistency"] = df["tbp_lv_eccentricity"] * df["tbp_lv_color_std_mean"]
    df["border_length_ratio"] = df["tbp_lv_perimeterMM"] / (
            2 * np.pi * np.sqrt(df["tbp_lv_areaMM2"] / np.pi))
    df["age_size_symmetry_index"] = df["age_approx"] * df["clin_size_long_diam_mm"] * df[
        "tbp_lv_symm_2axis"]

    # 病変の色の標準偏差と年齢の関係
    df["color_age_interaction"] = df["tbp_lv_color_std_mean"] * df["age_approx"]

    # 病変の面積と年齢の関係
    df['area_age_interaction'] = df['tbp_lv_areaMM2'] * df['age_approx']

    # ネビス信頼度を面積で正規化
    df['nevi_confidence_area'] = df['tbp_lv_nevi_confidence'] / df['tbp_lv_areaMM2']

    # ネビス信頼度を色不規則性で割ったもの
    df['nevi_confidence_color'] = df['tbp_lv_nevi_confidence'] / df['tbp_lv_color_std_mean']

    # 病変の体積
    df['tbp_lv_areaMM3'] = df['3d_position_distance'] * df['tbp_lv_areaMM2']

    new_num_cols = [
        "lesion_size_ratio", "lesion_shape_index", "hue_contrast",
        "luminance_contrast",
        "lesion_color_difference", "border_complexity",
        "color_uniformity", "3d_position_distance", "perimeter_to_area_ratio",
        "lesion_visibility_score", "symmetry_border_consistency", "color_consistency",
        "size_age_interaction",
        "hue_color_std_interaction",
        "lesion_severity_index",
        "shape_complexity_index", "color_contrast_index", "log_lesion_area",
        "normalized_lesion_size", "mean_hue_difference", "std_dev_contrast",
        "color_shape_composite_index", "3d_lesion_orientation", "overall_color_difference",
        "symmetry_perimeter_interaction", "comprehensive_lesion_index", "color_age_interaction",
        "area_age_interaction", "color_variance_ratio", "border_color_interaction",
        "size_color_contrast_ratio",
        "age_normalized_nevi_confidence", "color_asymmetry_index", "3d_volume_approximation",
        "color_range", "shape_color_consistency", "border_length_ratio",
        "age_size_symmetry_index", "nevi_confidence_area", "nevi_confidence_color", "tbp_lv_areaMM3"
    ]
    new_cat_cols = ["combined_anatomical_site"]

    return df, new_num_cols, new_cat_cols


df_train, new_num_cols, new_cat_cols = feature_engineering(df_train.copy())
df_test, _, _ = feature_engineering(df_test.copy())

num_cols = [
               'age_approx', 'clin_size_long_diam_mm', 'tbp_lv_A', 'tbp_lv_Aext', 'tbp_lv_B',
               'tbp_lv_Bext',
               'tbp_lv_C', 'tbp_lv_Cext', 'tbp_lv_H', 'tbp_lv_Hext', 'tbp_lv_L',
               'tbp_lv_Lext', 'tbp_lv_areaMM2', 'tbp_lv_area_perim_ratio', 'tbp_lv_color_std_mean',
               'tbp_lv_deltaA', 'tbp_lv_deltaB', 'tbp_lv_deltaL', 'tbp_lv_deltaLB',
               'tbp_lv_deltaLBnorm', 'tbp_lv_eccentricity', 'tbp_lv_minorAxisMM',
               'tbp_lv_nevi_confidence', 'tbp_lv_norm_border', 'tbp_lv_norm_color',
               'tbp_lv_perimeterMM', 'tbp_lv_radial_color_std_max', 'tbp_lv_stdL',
               'tbp_lv_stdLExt', 'tbp_lv_symm_2axis', 'tbp_lv_symm_2axis_angle',
               'tbp_lv_x', 'tbp_lv_y', 'tbp_lv_z',
           ] + new_num_cols
# anatom_site_general
cat_cols = ["sex", "tbp_tile_type", "tbp_lv_location", "tbp_lv_location_simple"] + new_cat_cols
train_cols = num_cols + cat_cols

# 削除する特徴量
remove_cols = [
    "hue_color_std_interaction",
    "luminance_contrast",
    "tbp_lv_norm_color",
    "tbp_lv_color_std_mean",
    "std_dev_contrast",
    "sex",
    "border_length_ratio",
    "tbp_tile_type",
    "tbp_lv_location_simple",
    "log_lesion_area",
    "comprehensive_lesion_index",
    "symmetry_perimeter_interaction",
    "combined_anatomical_site",
    "shape_complexity_index",
    "color_shape_composite_index",
    "tbp_lv_areaMM2",
    "lesion_shape_index",
    "tbp_lv_area_perim_ratio",
    "border_color_interaction",
    "shape_color_consistency",
]

category_encoder = OrdinalEncoder(
    categories='auto',
    dtype=int,
    handle_unknown='use_encoded_value',
    unknown_value=-2,
    encoded_missing_value=-1,
)

X_cat = category_encoder.fit_transform(df_train[cat_cols])
for c, cat_col in enumerate(cat_cols):
    df_train[cat_col] = X_cat[:, c]

# 特徴量を一部削除する
# 削除する特徴量
remove_cols = [
    "hue_color_std_interaction",
    "luminance_contrast",
    "tbp_lv_norm_color",
    "tbp_lv_color_std_mean",
    "std_dev_contrast",
    "sex",
    "border_length_ratio",
    "tbp_tile_type",
    "tbp_lv_location_simple",
    "log_lesion_area",

]

df_train = df_train.drop(remove_cols, axis=1)
df_test = df_test.drop(remove_cols, axis=1)

# train_cols からも削除する
for col in remove_cols:
    if col in train_cols:
        train_cols.remove(col)

# cat_cols からも削除する
for col in remove_cols:
    if col in cat_cols:
        cat_cols.remove(col)

  df_train = pd.read_csv(f"{base_path}/train-metadata.csv")


In [3]:
# 欠損値がある特徴量を確認
df_train[train_cols].isnull().sum()

# 欠損値を埋める
for col in train_cols:
    if col in num_cols:
        df_train[col] = df_train[col].fillna(df_train[col].mean())
        df_test[col] = df_test[col].fillna(df_test[col].mean())
    elif col in cat_cols:
        df_train[col] = df_train[col].fillna(df_train[col].mode()[0])
        df_test[col] = df_test[col].fillna(df_test[col].mode()[0])


In [4]:
# 陽性と悪性の数をそれぞれ取得
n_positive = df_train["target"].sum()
n_negative = len(df_train) - n_positive

# 陽性と悪性の数が同じになるようにアンダーサンプリング
df_train = pd.concat([
    df_train[df_train["target"] == 0].sample(n=n_positive * 2, random_state=42),
    df_train[df_train["target"] == 1]
], axis=0).reset_index(drop=True)

In [5]:
# df_train の 良性と悪性の数を確認
df_train["target"].value_counts()

target
0    786
1    393
Name: count, dtype: int64

In [6]:
gkf = GroupKFold(n_splits=N_FOLDS)  # , shuffle=True, random_state=42
df_train["fold"] = -1
for idx, (train_idx, val_idx) in enumerate(
        gkf.split(df_train, df_train["target"], groups=df_train["patient_id"])):
    df_train.loc[val_idx, "fold"] = idx

In [7]:
# 隠れ層のサイズ
HIDDEN_SIZE = 100

def binary_crossentropy_balance(target, output):
    # target を output の型にキャスト
    target = tf.dtypes.cast(target, output.dtype)
    epsilon_ = constant_op.constant(0.00001, output.dtype)

    # nan を防ぐためにクリップ
    output = clip_ops.clip_by_value(output, epsilon_, 0.99999)

    # 交差エントロピーの計算
    bce = target * math_ops.log(output + epsilon_) * 2
    bce += (1.0 - target) * math_ops.log(1.0 - output + epsilon_)

    bce_sum = -K.sum(bce, axis=-1)
    return bce_sum

In [17]:



def create_model():
    """
    3層の全結合層
    :return: 
    """
    column_size = len(train_cols)
    input_shape = (column_size,)
    inputs = Input(shape=input_shape)

    x = inputs
    
    # 隠れ層
    x = Dense(HIDDEN_SIZE)(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
 
    x = Dense(HIDDEN_SIZE)(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
 
    x = Dropout(0.1)(x)

    # 出力層
    x = Dense(1, activation='sigmoid')(x)

    # optimizer : Adam
    opt = tf.keras.optimizers.AdamW(learning_rate=0.002)

    # モデルの作成
    model = Model(inputs=inputs, outputs=x, name="nn_featurea")

    # モデルのコンパイル
    model.compile(optimizer=opt, loss=binary_crossentropy_balance, metrics=['accuracy'])

    return model

In [18]:
from keras.src.callbacks import EarlyStopping


class CustomCheckpoint(Callback):
    def __init__(self, filepath):
        super(CustomCheckpoint, self).__init__()
        self.filepath = filepath
        self.best_val_accuracy = 1.0

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        val_log_loss_metrics = logs.get('val_loss')

        if val_log_loss_metrics is not None and val_log_loss_metrics < self.best_val_accuracy and val_log_loss_metrics < 0.8:
            self.best_val_accuracy = val_log_loss_metrics
            filepath = f'{self.filepath}.keras'
            self.model.save(filepath, overwrite=True)
            print(f'Saved model to {filepath} with validation accuracy: {val_log_loss_metrics:.4f}')


# カスタムチェックポイントのコールバックを作成
custom_checkpoint = CustomCheckpoint(filepath='model_dense')
Ecall = EarlyStopping(monitor='val_loss', patience=1, restore_best_weights=True)

In [19]:
def comp_score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str,
               min_tpr: float = 0.80):
    v_gt = abs(np.asarray(solution.values) - 1)
    v_pred = np.array([1.0 - x for x in submission.values])
    max_fpr = abs(1 - min_tpr)
    partial_auc_scaled = roc_auc_score(v_gt, v_pred, max_fpr=max_fpr)
    # change scale from [0.5, 1.0] to [0.5 * max_fpr**2, max_fpr]
    # https://math.stackexchange.com/questions/914823/shift-numbers-into-a-different-range
    partial_auc = 0.5 * max_fpr ** 2 + (max_fpr - 0.5 * max_fpr ** 2) / (1.0 - 0.5) * (
            partial_auc_scaled - 0.5)
    return partial_auc


BATCH_SIZE = 300
EPOCHS = 1000
STEP_SIZE_TRAIN = len(df_train) // BATCH_SIZE

scores = []
models = []
for fold in range(N_FOLDS):
    _df_train = df_train[df_train["fold"] != fold].reset_index(drop=True)
    _df_valid = df_train[df_train["fold"] == fold].reset_index(drop=True)
    
    model = create_model()
    
    x_dataset = _df_train[train_cols]
    y_dataset = _df_train["target"]

    x_valid_datasets = _df_valid[train_cols]
    y_valid_datasets = _df_valid[["target"]]

    history = model.fit(
        x=x_dataset,  # 説明変数
        y=y_dataset,  # 目的変数
        batch_size=BATCH_SIZE,  # バッチサイズ
        epochs=EPOCHS,  # エポック数
        validation_data=(x_valid_datasets, y_valid_datasets),  # 検証データ
        callbacks=[Ecall]  # コールバック
    )
    
    preds = model.predict(x_valid_datasets)
    score = comp_score(_df_valid[["target"]], pd.DataFrame(preds, columns=["prediction"]), "")
    preds_df = pd.DataFrame(preds, columns=["prediction"])
    print(f"fold: {fold} - Partial AUC Score: {score:.5f}")
    scores.append(score)
    models.append(model)

Epoch 1/1000
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 180ms/step - accuracy: 0.6675 - loss: 0.9236 - val_accuracy: 0.6653 - val_loss: 0.9252
Epoch 2/1000
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step - accuracy: 0.6645 - loss: 0.9257 - val_accuracy: 0.6653 - val_loss: 0.9252
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
fold: 0 - Partial AUC Score: 0.02000
Epoch 1/1000
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 204ms/step - accuracy: 0.5002 - loss: 0.9280 - val_accuracy: 0.3093 - val_loss: 0.9078
Epoch 2/1000
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - accuracy: 0.3376 - loss: 0.9271 - val_accuracy: 0.3093 - val_loss: 0.9080
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
fold: 1 - Partial AUC Score: 0.02000
Epoch 1/1000
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 251ms/step - accuracy: 0.6878 - loss: 0.9096 - val_accuracy: 

In [20]:
preds_df

Unnamed: 0,prediction
0,0.500332
1,0.500332
2,0.500332
3,0.500332
4,0.500332
...,...
230,0.500332
231,0.500332
232,0.500332
233,0.500332


In [21]:
np.mean(scores)

0.01999999999999999

In [26]:
# 欠損値を埋める
for col in train_cols:
    if col in num_cols:
        df_test[col] = df_test[col].fillna(df_test[col].mean())
    elif col in cat_cols:
        df_test[col] = df_test[col].fillna(df_test[col].mode()[0])
        
# 欠損値を確認
df_test[train_cols].isnull().sum()

preds = np.mean([model.predict(df_test[train_cols]) for model in models], 0)

ValueError: could not convert string to float: 'Torso Back Top Third'

In [None]:
df_sub = pd.read_csv(f"{base_path}/sample_submission.csv")
df_sub["target"] = preds
df_sub

In [None]:
df_sub.to_csv("submission.csv", index=False)