# Imports 

In [None]:
import numpy as np
import pandas as pd
import sklearn
import time 
import matplotlib
import tensorflow as tf
import random
import seaborn as sns 
import imblearn

# Report the versions of the main libraries, one per line.
print(
    f'Numpy: {np.__version__}',
    f'Pandas: {pd.__version__}',
    f'Sklearn: {sklearn.__version__}',
    f'Matplotlib: {matplotlib.__version__}',
    f'TensorFlow:{tf.__version__}',
    f'Imb-Learn:{imblearn.__version__}',
    sep='\n',
)

# Suppress every warning for the rest of the session.
import warnings
warnings.filterwarnings(action='ignore')

In [None]:
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, balanced_accuracy_score, accuracy_score

# DNN 
from sklearn.utils import class_weight
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.losses import BinaryCrossentropy, categorical_crossentropy

# Validation metrics
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import roc_curve, auc

# Sampling 
from imblearn.over_sampling import SMOTE

# Variables

In [None]:
# Kaggle input locations of the ISIC 2024 metadata CSVs (train, test).
train_path, test_path = (
    '/kaggle/input/isic-2024-challenge/train-metadata.csv',
    '/kaggle/input/isic-2024-challenge/test-metadata.csv',
)

In [None]:
# Columns present in both the train and the test metadata.
# The list was originally obtained as the intersection of the two column sets:
#   set(train_data.columns) & set(test_data.columns)
common_cols = [
    'sex', 'tbp_lv_deltaLB', 'tbp_lv_location_simple',
    'tbp_lv_areaMM2', 'tbp_lv_deltaL', 'tbp_lv_norm_border',
    'tbp_lv_A', 'tbp_lv_B', 'tbp_lv_symm_2axis',
    'tbp_lv_L', 'tbp_lv_location', 'tbp_lv_deltaLBnorm',
    'tbp_lv_radial_color_std_max', 'tbp_lv_Lext', 'tbp_lv_Bext',
    'tbp_lv_H', 'tbp_lv_perimeterMM', 'tbp_lv_stdL',
    'tbp_lv_minorAxisMM', 'tbp_lv_stdLExt', 'tbp_lv_y',
    'tbp_lv_color_std_mean', 'clin_size_long_diam_mm', 'age_approx',
    'tbp_lv_eccentricity', 'tbp_lv_deltaA', 'tbp_tile_type',
    'tbp_lv_Hext', 'tbp_lv_C', 'tbp_lv_norm_color',
    'tbp_lv_deltaB', 'anatom_site_general', 'tbp_lv_x',
    'tbp_lv_Aext', 'tbp_lv_Cext', 'tbp_lv_symm_2axis_angle',
    'tbp_lv_nevi_confidence', 'tbp_lv_z', 'tbp_lv_area_perim_ratio',
]

In [None]:
# Columns earmarked for removal:
#   patient_id        - unique patient identifier (object)
#   image_type        - structured field of the ISIC Archive for image type (object)
#   copyright_license - copyright license (object)
# NOTE(review): this list is not referenced anywhere else in the visible notebook.
to_drop = [
    'isic_id',
    'patient_id',
    'image_type',
    'copyright_license',
    'attribution',
]

In [None]:
# Names of the categorical columns, in the order used by `cat_cols` below.
cat_cols_list = [
    'sex',
    'tbp_lv_location_simple',
    'tbp_lv_location',
    'tbp_tile_type',
    'anatom_site_general',
]


def _codes(*labels):
    """Map each label to its position (0, 1, 2, ...) in the order given."""
    return {label: code for code, label in enumerate(labels)}


# Label -> integer code tables, one per categorical column.
sex = _codes('male', 'female')

tbp_lv_location_simple = _codes(
    'Head & Neck',
    'Left Arm',
    'Left Leg',
    'Right Arm',
    'Right Leg',
    'Torso Back',
    'Torso Front',
)

tbp_lv_location = _codes(
    'Head & Neck',
    'Left Arm', 'Left Arm - Lower', 'Left Arm - Upper',
    'Left Leg', 'Left Leg - Lower', 'Left Leg - Upper',
    'Right Arm', 'Right Arm - Lower', 'Right Arm - Upper',
    'Right Leg', 'Right Leg - Lower', 'Right Leg - Upper',
    'Torso Back', 'Torso Back Bottom Third', 'Torso Back Middle Third', 'Torso Back Top Third',
    'Torso Front', 'Torso Front Bottom Half', 'Torso Front Top Half',
)

tbp_tile_type = _codes('3D: XP', '3D: white')

anatom_site_general = _codes(
    'anterior torso',
    'head/neck',
    'lower extremity',
    'posterior torso',
    'upper extremity',
)

# General dictionary: column name -> its label/code table.
cat_cols = dict(zip(
    cat_cols_list,
    (sex, tbp_lv_location_simple, tbp_lv_location, tbp_tile_type, anatom_site_general),
))

The Feature Extraction section uses the Chi-Square test, the ANOVA F-score, the Fisher score and the mRMR method to identify the most relevant features. The intersection of the resulting feature sets is then taken to find the features common to all methods.

Chi 2: 27 \
ANOVA:32 \
Fisher:32 \
mRMR:22 \
Common features:16

In [None]:
# Features retained by the selection step, with the label column placed last.
intersection_cols = [
    'sex',
    'clin_size_long_diam_mm',
    'tbp_lv_A',
    'tbp_lv_Aext',
    'tbp_lv_B',
    'tbp_lv_areaMM2',
    'tbp_lv_color_std_mean',
    'tbp_lv_deltaL',
    'tbp_lv_deltaLB',
    'tbp_lv_location',
    'tbp_lv_location_simple',
    'tbp_lv_minorAxisMM',
    'tbp_lv_norm_color',
    'tbp_lv_perimeterMM',
    'tbp_lv_radial_color_std_max',
    'tbp_lv_stdL',
] + ['target']

print(f'16 selected columns +1 target  = {len(intersection_cols)}')

# Classes

In [None]:
class Data:
    """Load one metadata CSV and preprocess it into a numeric DataFrame.

    Typical use: ``read_data()`` -> ``data_prep()`` -> ``get_data()``.

    Parameters
    ----------
    path : str
        CSV file to read.
    common_columns : list of str
        Feature columns to keep. The list is never modified by this class.
    categorical_cols : dict
        ``{column name: {label: integer code}}`` used to encode the
        categorical columns.
    target_col : bool
        True when the file contains the ``'target'`` label column and it must
        be kept; False for unlabeled data.

    Attributes set by ``data_prep``
    -------------------------------
    scaler : StandardScaler or None
        The scaler fitted on this instance's data (None if nothing was scaled).
    scaled_columns : list of str
        The columns that were standardized.
    """

    # Integer-typed columns that must be standardized even though the dtype
    # based selection in data_prep would skip them.
    _EXTRA_SCALED_COLUMNS = (
        'tbp_lv_symm_2axis_angle',
        'tbp_lv_location_simple',
        'tbp_lv_location',
        'anatom_site_general',
    )

    def __init__(self, path, common_columns, categorical_cols, target_col):
        self.path = path
        self.common_columns = common_columns
        self.categorical_cols = categorical_cols
        self.target_col = target_col

    def read_data(self):
        """Read the CSV at ``self.path`` into ``self.df`` and print its shape."""
        self.df = pd.read_csv(self.path, low_memory=False)
        print(f'Data Shape:{self.df.shape}')

    def data_prep(self):
        """Select, clean, encode and standardize ``self.df`` in place.

        Steps: keep the common columns (+ 'target' when ``target_col``), drop
        rows with missing values, report duplicates, map categorical labels to
        integer codes, standardize the non-int64 columns plus a fixed set of
        encoded columns, and shuffle the rows (labeled data only).

        Raises
        ------
        KeyError
            If a requested column or a categorical column is missing.
        """
        # Work on a private list: the caller's list is often shared between the
        # train and test instances, so it must not be appended to.
        columns = list(self.common_columns)
        if self.target_col and 'target' not in columns:
            columns.append('target')

        # .copy() so the column assignments below write to our own frame and
        # not to a slice of the raw one.
        self.df = self.df[columns].copy()  # df with common cols (+ target)
        print(f'Common Cols:{len(columns)}.Data Shape: {self.df.shape}')

        # Drop rows containing missing values.
        missing_vals = self.df.isnull().sum().sum()
        if missing_vals > 0:
            self.df = self.df.dropna().reset_index(drop=True)

        print(f'Missing Vals:{missing_vals}.Data Shape: {self.df.shape}')

        # Duplicates are only reported, not removed.
        print(f'Dublicate Values:{self.df.duplicated().sum()}')

        # Encode categorical labels as integers.
        # NOTE(review): labels absent from a mapping become NaN here, after the
        # dropna above — confirm the mappings cover every label in the data.
        for key, mapping in self.categorical_cols.items():
            self.df[key] = self.df[key].map(mapping)

        # Columns to standardize: everything that is not int64, never the label.
        scaling_lst = [
            col for col in self.df.select_dtypes(exclude=['int64']).columns
            if col != 'target'
        ]
        # Add the fixed extra columns, skipping absent ones and avoiding duplicates.
        for col in self._EXTRA_SCALED_COLUMNS:
            if col in self.df.columns and col not in scaling_lst:
                scaling_lst.append(col)

        # NOTE(review): the scaler is fitted on whichever file this instance
        # loaded, so a train and a test instance get independent statistics.
        self.scaler = None
        self.scaled_columns = scaling_lst
        if scaling_lst and not self.df.empty:
            self.scaler = StandardScaler()
            self.df[scaling_lst] = self.scaler.fit_transform(self.df[scaling_lst])

        # Shuffle labeled data only: unlabeled rows must keep the file order so
        # predictions can be matched back to their ids.
        if self.target_col:
            self.df = self.df.sample(frac=1).reset_index(drop=True)
        print(f'Data Shape: {self.df.shape}')

        print(f'Final Shape: {self.df.shape}')

    def get_data(self):
        """Return the current DataFrame (preprocessed once data_prep has run)."""
        return self.df

In [None]:
class DataFrame_Split:
    """Split a labeled DataFrame into sub-datasets that share the positive rows.

    The rows with ``target == 0`` ("normal") are shuffled and divided into
    ``n_splits`` parts. Each sub-dataset is one of those parts combined with
    *all* rows with ``target == 1``, so every sub-dataset contains the same
    positive rows and (up to rounding) the same number of normal rows.

    Use: ``main()`` then ``get_sub_df()`` to obtain the list of DataFrames.
    """

    def __init__(self, df):
        self.df = df
        self.sub_dfs = []  # filled by main()

        # Boolean masks work for any index; the previous label-list + .iloc
        # approach was only correct for a default 0..n-1 index.
        self.norm_df = self.df[self.df['target'] == 0]
        self.mal_df = self.df[self.df['target'] == 1]
        print(f'Malicus df: {self.mal_df.shape} .... Normal df: {self.norm_df.shape}')

    def create_random_subsets(self, data, n_splits):
        """Shuffle ``data`` and cut it into ``n_splits`` consecutive parts.

        All parts have ``len(data) // n_splits`` rows except the last one,
        which also receives the remainder.

        Raises
        ------
        ValueError
            If ``n_splits`` is smaller than 1.
        """
        if n_splits < 1:
            raise ValueError(f'n_splits must be >= 1, got {n_splits}')

        shuffled_data = data.sample(frac=1).reset_index(drop=True)
        avg_size = len(shuffled_data) // n_splits

        subsets = []
        for i in range(n_splits):
            start_idx = i * avg_size
            # The last split takes everything that is left.
            stop_idx = None if i == n_splits - 1 else start_idx + avg_size
            subsets.append(shuffled_data[start_idx:stop_idx])

        return subsets

    def main(self, n_splits=2):
        """Build the sub-datasets and store them in ``self.sub_dfs``.

        Parameters
        ----------
        n_splits : int, default 2
            Number of sub-datasets to create.
        """
        norm_sub_dfs = self.create_random_subsets(self.norm_df, n_splits=n_splits)

        self.sub_dfs = []
        for i, norm_part in enumerate(norm_sub_dfs):
            sub_df = pd.concat([norm_part, self.mal_df], ignore_index=True)
            sub_df = sub_df.sample(frac=1).reset_index(drop=True)  # shuffle
            print(f'Data_{i}: {sub_df.shape}')
            self.sub_dfs.append(sub_df)

    def get_sub_df(self):
        """Return the list of sub-datasets (empty until main() has run)."""
        return self.sub_dfs

In [None]:
class Ratio_data():
    """Down-sample the normal class, split into train/validation, and SMOTE.

    Call order: ``data_ratio()`` -> ``split_data()`` -> ``train_smote()``;
    the getters return what those steps produced.

    Parameters
    ----------
    df : pandas.DataFrame
        Labeled data containing a ``'target'`` column (0 = normal, 1 = positive).
    ratio : float
        Fraction in [0, 1] of the normal rows to keep.
    random_state : int or None, default None
        Seed used for the down-sampling, the shuffle, the train/validation
        split and SMOTE. None keeps the previous unseeded behavior.
    """

    def __init__(self, df, ratio, random_state=None):
        self.df = df
        self.ratio = ratio  # [0, 1]
        self.random_state = random_state

    def data_ratio(self):
        """Build ``self.sub_df``: a random ``ratio`` of normal rows + all positives.

        Raises
        ------
        ValueError
            If ``ratio`` is outside [0, 1].
        """
        if not 0 <= self.ratio <= 1:
            raise ValueError(f'ratio must be in [0, 1], got {self.ratio}')

        # Row *positions* (not index labels) so .iloc is correct for any index.
        target = self.df['target'].to_numpy()
        norm_pos = np.flatnonzero(target == 0).tolist()
        mal_pos = np.flatnonzero(target == 1).tolist()

        # Randomly keep the requested share of the normal rows.
        rng = random if self.random_state is None else random.Random(self.random_state)
        norm_rand_pos = rng.sample(norm_pos, int(len(norm_pos) * self.ratio))

        train_df_norm = self.df.iloc[norm_rand_pos]
        train_df_mal = self.df.iloc[mal_pos]

        self.sub_df = pd.concat([train_df_norm, train_df_mal])
        self.sub_df = self.sub_df.sample(frac=1, random_state=self.random_state).reset_index(drop=True)  # shuffle

    def split_data(self, test_size = 0.2):
        """Stratified train/validation split of ``self.sub_df`` (needs data_ratio first)."""
        # Select the label by name rather than by "last column" position.
        self.X = self.sub_df.drop(columns='target')
        self.y = self.sub_df['target']

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=test_size, stratify=self.y,
            random_state=self.random_state)
        print(f'Xtrain:{self.X_train.shape} // ytrain:{self.y_train.shape}\nXval:{self.X_test.shape} // yval:{self.y_test.shape}')
        print(f'ytrain {self.y_train.value_counts()}')
        print(f'ytest {self.y_test.value_counts()}')

    def train_smote(self, sampling_strategy = 0.5):
        """Oversample the training split with SMOTE (validation data is untouched).

        Parameters
        ----------
        sampling_strategy : float, default 0.5
            Passed to ``SMOTE``; the desired minority/majority ratio after resampling.
        """
        sm = SMOTE(sampling_strategy = sampling_strategy, random_state = self.random_state)
        self.X_res, self.y_res = sm.fit_resample(self.X_train, self.y_train)
        print(f'Smote Data:{self.X_res.shape}\nTarget:{self.y_res.value_counts()}')

    def get_X_y(self):
        """Return the full (unsplit) features and labels."""
        return self.X, self.y

    def get_balance_data(self):
        """Return SMOTE-resampled training data and the original validation data."""
        return self.X_res, self.y_res, self.X_test, self.y_test

    def get_imbalance_data(self):
        """Return the training and validation data without resampling."""
        return self.X_train, self.y_train, self.X_test, self.y_test

In [None]:
def features_dist_plot(df, n_cols=3):
    """Plot a histogram with KDE for every column of ``df`` on one figure.

    The subplot grid grows with the number of columns, so frames with more
    than 15 columns (a fixed 5x3 grid raised for those) are handled.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are plotted, one subplot per column.
    n_cols : int, default 3
        Number of subplots per row.
    """
    n_features = len(df.columns)
    # Ceiling division; at least one row so an empty frame still yields a figure.
    n_rows = max(1, -(-n_features // n_cols))

    # 4 inches of height per row keeps the original 20x20 size for 15 features.
    plt.figure(figsize=(20, 4 * n_rows))

    for i, feature in enumerate(df.columns):
        plt.subplot(n_rows, n_cols, i + 1)
        sns.histplot(df[feature], kde=True, color='blue')
        plt.title(f'Distribution of {feature}')
        plt.xlabel(feature)
        plt.ylabel('Frequency')

    # Adjust layout to prevent overlap
    plt.tight_layout()
    plt.show()
    
# features_dist_plot(X_train)

In [None]:
class Neural_Network:
    """Small fully connected binary classifier with training and plotting helpers.

    Call order: ``model_compile()`` -> ``model_fit()`` -> ``model_history()`` /
    ``model_roc_curve()``.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features (2-D) and labels.
    X_val, y_val : array-like
        Validation features and labels.
    """

    # Default per-class loss weights; per the original note these correspond to
    # the data before balancing.
    DEFAULT_CLASS_WEIGHTS = {0: 0.5099934469200524, 1: 25.516393442622952}

    def __init__(self, X_train, y_train, X_val, y_val):
        self.X_train = X_train
        self.y_train = y_train
        self.X_val = X_val
        self.y_val = y_val

        # Size the input layer from the data instead of hard-coding 16 features.
        n_features = self.X_train.shape[1]

        # Model
        self.model = Sequential([
            Dense(128, activation= 'relu', input_shape=(n_features,)),
            tf.keras.layers.BatchNormalization(),

            Dense(128, activation= 'relu'),

            Dense(64, activation= 'relu'),
            Dense(1, activation= 'sigmoid')  # single probability output
            ])

        self.history = None  # set by model_fit()

    def model_compile(self):
        """Compile with Nesterov SGD, binary focal cross-entropy and accuracy."""
        optimizer = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9, nesterov=True)

        self.model.compile(
            optimizer = optimizer,
            loss= tf.keras.losses.BinaryFocalCrossentropy(gamma=2.0, alpha=0.25,
                                                       from_logits=False),
            metrics=['accuracy']
                            )

    def model_summary(self):
        """Print the layer summary."""
        # summary() prints by itself and returns None, so it is not wrapped in print().
        self.model.summary()

    def model_fit(self, epochs=25, batch_size=128, class_weights=None):
        """Train the model and keep the Keras history in ``self.history``.

        Parameters
        ----------
        epochs : int, default 25
        batch_size : int, default 128
        class_weights : dict or None, default None
            ``{class label: weight}``; None uses ``DEFAULT_CLASS_WEIGHTS``.
        """
        if class_weights is None:
            class_weights = self.DEFAULT_CLASS_WEIGHTS

        # np.asarray accepts DataFrames/Series as well as plain arrays.
        self.history = self.model.fit(
                    np.asarray(self.X_train), np.asarray(self.y_train),
                    validation_data=(np.asarray(self.X_val), np.asarray(self.y_val)),
                    epochs=epochs,
                    batch_size=batch_size,
                    class_weight=dict(class_weights)
                    )

    def model_history(self):
        """Plot training/validation loss and accuracy curves.

        Raises
        ------
        RuntimeError
            If called before ``model_fit``.
        """
        if self.history is None:
            raise RuntimeError('model_history() requires model_fit() to be called first')

        # Loss Curves
        plt.figure(figsize=[8,6])
        plt.plot(self.history.history['loss'],'r',linewidth=3.0)
        plt.plot(self.history.history['val_loss'],'b',linewidth=3.0)
        plt.legend(['Training loss', 'Validation Loss'],fontsize=10)
        plt.xlabel('Epochs ',fontsize=10)
        plt.ylabel('Loss',fontsize=10)
        plt.title('Loss Curves',fontsize=10)

        # Accuracy Curves
        plt.figure(figsize=[8,6])
        plt.plot(self.history.history['accuracy'],'r',linewidth=3.0)
        plt.plot(self.history.history['val_accuracy'],'b',linewidth=3.0)
        plt.legend(['Training Accuracy', 'Validation Accuracy'],fontsize=10)
        plt.xlabel('Epochs ',fontsize=10)
        plt.ylabel('Accuracy',fontsize=10)
        plt.title('Accuracy Curves',fontsize=10)

        # No second, argument-less plt.legend() here: the lines carry no
        # labels, so it would replace the legend above with an empty one.
        plt.show()

    def model_roc_curve(self):
        """Plot the ROC curve (with its AUC) on the validation data."""
        # Same array conversion as in model_fit, for consistent inputs.
        y_pred_prob = self.model.predict(np.asarray(self.X_val)).ravel()

        # Compute ROC curve and AUC
        fpr, tpr, thresholds = roc_curve(self.y_val, y_pred_prob)
        roc_auc = auc(fpr, tpr)

        # Plot the ROC curve against the chance diagonal
        plt.figure()
        plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.0])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver Operating Characteristic')
        plt.legend(loc="lower right")
        plt.show()

    def get_model(self):
        """Return the underlying Keras model."""
        return self.model

# Main

In [None]:
# Train data: read the CSV and preprocess it.
# A copy of common_cols is passed so that preprocessing can never alter the
# shared list, which is reused for the test data.
data_train = Data(path = train_path,
                 common_columns = list(common_cols),
                 categorical_cols= cat_cols,
                 target_col = True)

data_train.read_data() # read data path
data_train.data_prep() # data preprocessing

train_df = data_train.get_data()

In [None]:
# Test data (no target column): read the CSV and preprocess it.
# A copy of common_cols is passed so that preprocessing can never alter the
# shared list, which is also used for the train data.
data_test = Data(path = test_path,
                 common_columns = list(common_cols),
                 categorical_cols= cat_cols,
                 target_col = False)
data_test.read_data()
data_test.data_prep()

test_df = data_test.get_data()

Create a sub-dataset that the model has not seen, so that it can later be used to update the model's weights.

In [None]:
# Keep only the selected columns; the test frame gets everything except the
# trailing 'target' entry of intersection_cols.
train_df = train_df.loc[:, intersection_cols]
test_df = test_df.loc[:, intersection_cols[:-1]]
print(f'Train/Val Data:{train_df.shape}\nTest Data:{test_df.shape}')

In [None]:
# Split the training frame into two sub-datasets.
dfs_splits = DataFrame_Split(train_df)
dfs_splits.main()
sub_dfs = dfs_splits.get_sub_df()

# The first sub-dataset feeds the ratio/SMOTE pipeline below; the second is
# kept aside (it is not used for the training that follows).
train_df_ratio_smote, update_train_df = sub_dfs

In [None]:
# Keep 10% of the normal rows, hold out 20% for validation, then oversample
# the training part with SMOTE.
split = Ratio_data(train_df_ratio_smote, 0.1)
split.data_ratio()
split.split_data(test_size=0.2)
split.train_smote()

# Balanced (SMOTE) training data with the untouched validation data.
# For the non-resampled variant use: split.get_imbalance_data()
X_train_res, y_train_res, X_val, y_val = split.get_balance_data()

In [None]:
# Build the network on the balanced training data, validating on X_val / y_val.
nn = Neural_Network(X_train_res, y_train_res, X_val, y_val)

# Compile, inspect, train, then plot the learning curves and the ROC curve.
nn.model_compile()
nn.model_summary()
nn.model_fit()
nn.model_history()
nn.model_roc_curve()

In [None]:
# Underlying Keras model held by the Neural_Network wrapper, used for prediction below.
trained_model = nn.get_model()

Unseen Data 

In [None]:
# Take the submission ids from the test metadata file itself instead of a
# hard-coded list, so the notebook also works when the test file has a
# different number of rows.
# NOTE(review): the ids are in file order; they only line up with the
# predictions if test_df keeps every row of the file in that same order — verify.
unseen_id = pd.read_csv(test_path, usecols=['isic_id'])['isic_id'].tolist()

In [None]:
# Model outputs for the test rows (the final layer is a single sigmoid unit).
predictions = trained_model.predict(test_df)
predictions

In [None]:
# First (only) output per row, converted to float64 and collected in a list.
pred = list(predictions[:, 0].astype(np.float64))
pred

In [None]:
# Submission table: one row per id with its predicted value.
# unseen_id and pred must have the same length and order.
predictions_df = pd.DataFrame({"isic_id":unseen_id, "target":pred})

In [None]:
# Check the column dtypes and row count before writing the file.
predictions_df.info()

In [None]:
# Write the submission file to the working directory, without the index column.
predictions_df.to_csv("submission.csv", index = False)