# Install Required Packages

In [1]:
!pip install matplotlib google-cloud-storage
!pip install MissForest
!pip install dask[dataframe]
!pip install keras-cv
!pip install keras-tuner --upgrade



# Imports

In [2]:
import collections
import copy
import hashlib
import io
import os
import subprocess
import textwrap
import time
from typing import List, Text
from PIL import Image
import numpy as np
import pandas as pd
import tabulate
import tensorflow as tf
import matplotlib.pyplot as plt
from google.colab import auth
from google.cloud import storage
from sklearn.model_selection import train_test_split
import ast
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MultiLabelBinarizer
from missforest import MissForest
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.utils import resample
import pickle
import joblib
from google.colab import drive
drive.mount('/content/drive')
import cv2
from sklearn.utils import resample
from concurrent.futures import ThreadPoolExecutor
from tensorflow.keras.applications import ResNet50, MobileNet
from tensorflow.keras.preprocessing import image
from tensorflow.keras.layers import Input, Dense, Dropout, GlobalAveragePooling2D, Concatenate, BatchNormalization, RandomFlip, RandomRotation, RandomZoom
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam, AdamW, SGD
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras import regularizers
from concurrent.futures import ThreadPoolExecutor
from tensorflow.keras import backend as K
from tensorflow.keras.utils import Sequence
import torch
from sklearn.utils import resample
import keras_cv
import keras
import keras_tuner as kt
from keras_tuner import HyperModel
from keras_tuner.tuners import Hyperband, BayesianOptimization
from sklearn.utils import class_weight
import tensorflow as tf
import keras.backend as tfb
from sklearn.utils import class_weight
import random
import warnings
warnings.filterwarnings("ignore")
import logging
# Suppress only the "Connection pool is full" warnings by setting the log level
logging.getLogger("urllib3.connectionpool").setLevel(logging.ERROR)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Set the Random Seed and Deterministic Running

In [3]:
# Set the seed value for reproducibility
SEED = 42

# Set seed for Python's built-in random module
random.seed(SEED)

# Set seed for NumPy
np.random.seed(SEED)

# Set seed for TensorFlow
tf.random.set_seed(SEED)

# Set seed for Python hash function to make hash-based operations deterministic
os.environ['PYTHONHASHSEED'] = str(SEED)

# Ensure that GPU operations are deterministic
os.environ['TF_DETERMINISTIC_OPS'] = '1'
os.environ['TF_CUDNN_DETERMINISTIC'] = '1'

# Initialise Dataset in Google Colab

In [4]:
# Specify the path to file
folder_path = '/content/SCIN_Dataset/dx-scin-public-data/'

if os.path.exists(folder_path):
    print("Dataset has already been loaded to Google Colab instance")
else:
    !unzip "/content/drive/MyDrive/SCIN_Dataset.zip" -d "/content"

Dataset has already been loaded to Google Colab instance


# Initialise SCIN Dataset from Google Cloud

### Authenticate User for Google Colab Access

In [5]:
# Authenticate user for access. There will be a popup asking you to sign in with your user and approve access.
auth.authenticate_user()


### Create Global Parameters

In [6]:
class Globals:
  # GCP project with GCS bucket of interest
  gcp_project = 'dx-scin-public' #@param

  # GCS bucket with data to read
  gcs_bucket_name = 'dx-scin-public-data' #@param

  # CSV of case metadata to read
  cases_csv = 'dataset/scin_cases.csv' #@param

  # CSV of label metadata to read
  labels_csv = 'dataset/scin_labels.csv' #@param

  # Images directory
  gcs_images_dir = 'dataset/images/' #@param

  ### Key column names
  image_path_columns = ['image_1_path', 'image_2_path', 'image_3_path']
  weighted_skin_condition_label = "weighted_skin_condition_label"
  skin_condition_label = "dermatologist_skin_condition_on_label_name"

  ###### Formed during execution:

  # Client for querying GCS
  gcs_storage_client = None

  # Bucket object for loading files
  gcs_bucket = None

  # pd.DataFrame for the loaded metadata_csv
  cases_df = None

  # pd.DataFrame for the loaded labels_csv
  cases_and_labels_df = None

print(f'GCS bucket name: {Globals.gcs_bucket_name}')
print(f'cases_csv: {Globals.cases_csv}')
print(f'labels_csv: {Globals.labels_csv}')
print(f'images dir: {Globals.gcs_images_dir}')

GCS bucket name: dx-scin-public-data
cases_csv: dataset/scin_cases.csv
labels_csv: dataset/scin_labels.csv
images dir: dataset/images/


### Create Dataframe Containing Metadata and Labels

In [7]:
def list_blobs(storage_client, bucket_name):
  """Helper to list blobs in a bucket (useful for debugging)."""
  blobs = storage_client.list_blobs(bucket_name)
  for blob in blobs:
    print(blob)

def initialize_df_with_metadata(bucket, csv_path):
  """Loads the given CSV into a pd.DataFrame."""
  df = pd.read_csv(io.BytesIO(bucket.blob(csv_path).download_as_string()), dtype={'case_id': str})
  df['case_id'] = df['case_id'].astype(str)
  return df

def augment_metadata_with_labels(df, bucket, csv_path):
  """Loads the given CSV into a pd.DataFrame."""
  labels_df = pd.read_csv(io.BytesIO(bucket.blob(csv_path).download_as_string()), dtype={'case_id': str})
  labels_df['case_id'] = labels_df['case_id'].astype(str)
  merged_df = pd.merge(df, labels_df, on='case_id')
  return merged_df

Globals.gcs_storage_client = storage.Client(Globals.gcp_project)
Globals.gcs_bucket = Globals.gcs_storage_client.bucket(
    Globals.gcs_bucket_name
)
Globals.cases_df = initialize_df_with_metadata(Globals.gcs_bucket, Globals.cases_csv)
Globals.cases_and_labels_df = augment_metadata_with_labels(Globals.cases_df, Globals.gcs_bucket, Globals.labels_csv)
print(len(Globals.cases_and_labels_df))

5033


In [8]:
# Display head of DataFrame
Globals.cases_and_labels_df.head()

Unnamed: 0,case_id,source,release,year,age_group,sex_at_birth,fitzpatrick_skin_type,race_ethnicity_american_indian_or_alaska_native,race_ethnicity_asian,race_ethnicity_black_or_african_american,...,dermatologist_gradable_for_fitzpatrick_skin_type_1,dermatologist_gradable_for_fitzpatrick_skin_type_2,dermatologist_gradable_for_fitzpatrick_skin_type_3,dermatologist_fitzpatrick_skin_type_label_1,dermatologist_fitzpatrick_skin_type_label_2,dermatologist_fitzpatrick_skin_type_label_3,gradable_for_monk_skin_tone_india,gradable_for_monk_skin_tone_us,monk_skin_tone_label_india,monk_skin_tone_label_us
0,-1000600354148496558,SCIN,1.0.0,2023,AGE_UNKNOWN,OTHER_OR_UNSPECIFIED,,,,,...,YES,,,FST2,,,True,True,2.0,1.0
1,-1002039107727665188,SCIN,1.0.0,2023,AGE_UNKNOWN,OTHER_OR_UNSPECIFIED,,,,,...,YES,,,FST1,,,True,True,3.0,3.0
2,-1003358831658393077,SCIN,1.0.0,2023,AGE_18_TO_29,MALE,NONE_IDENTIFIED,,,,...,YES,,,FST4,,,True,True,3.0,4.0
3,-1003826561155964328,SCIN,1.0.0,2023,AGE_UNKNOWN,OTHER_OR_UNSPECIFIED,,,,,...,NO,,,,,,True,True,2.0,4.0
4,-1003844406100696311,SCIN,1.0.0,2023,AGE_40_TO_49,FEMALE,FST3,,,,...,YES,,,FST1,,,True,True,1.0,1.0


# Data Pre-Processing

### Get Relevant Features and Target Labels

In [9]:
# Create a DataFrame with relevant features and the target label
df = Globals.cases_and_labels_df[['case_id', 'image_1_path', 'image_2_path','image_3_path','combined_race','dermatologist_fitzpatrick_skin_type_label_1',
              'dermatologist_fitzpatrick_skin_type_label_2','dermatologist_fitzpatrick_skin_type_label_3','monk_skin_tone_label_india','monk_skin_tone_label_us',
              'weighted_skin_condition_label']]

# display top 5 records in dataframe and dataframe info
display(df.head())
df.info()

Unnamed: 0,case_id,image_1_path,image_2_path,image_3_path,combined_race,dermatologist_fitzpatrick_skin_type_label_1,dermatologist_fitzpatrick_skin_type_label_2,dermatologist_fitzpatrick_skin_type_label_3,monk_skin_tone_label_india,monk_skin_tone_label_us,weighted_skin_condition_label
0,-1000600354148496558,dataset/images/-3205742176803893704.png,,,,FST2,,,2.0,1.0,"{'Inflicted skin lesions': 0.41, 'Eczema': 0.4..."
1,-1002039107727665188,dataset/images/-4762289084741430925.png,,,,FST1,,,3.0,3.0,"{'Prurigo nodularis': 0.41, 'SCC/SCCIS': 0.41,..."
2,-1003358831658393077,dataset/images/-4027806997035329030.png,,,HISPANIC_LATINO_OR_SPANISH_ORIGIN,FST4,,,3.0,4.0,"{'Impetigo': 0.55, 'Herpes Zoster': 0.23, 'Bul..."
3,-1003826561155964328,dataset/images/-5332065579713135540.png,dataset/images/-6353431708064969797.png,dataset/images/742075435141960831.png,,,,,2.0,4.0,{}
4,-1003844406100696311,dataset/images/-3799298995660217860.png,dataset/images/-5881426422999442186.png,dataset/images/5854025080806696361.png,WHITE,FST1,,,1.0,1.0,"{'Lichen planus/lichenoid eruption': 0.33, 'Fo..."


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5033 entries, 0 to 5032
Data columns (total 11 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   case_id                                      5033 non-null   object 
 1   image_1_path                                 5033 non-null   object 
 2   image_2_path                                 3085 non-null   object 
 3   image_3_path                                 2289 non-null   object 
 4   combined_race                                2652 non-null   object 
 5   dermatologist_fitzpatrick_skin_type_label_1  4302 non-null   object 
 6   dermatologist_fitzpatrick_skin_type_label_2  634 non-null    object 
 7   dermatologist_fitzpatrick_skin_type_label_3  631 non-null    object 
 8   monk_skin_tone_label_india                   5019 non-null   float64
 9   monk_skin_tone_label_us                      5005 non-null   float64
 10  

In [10]:
# Remove row for case id 1834609919572055750 - known issue of missing images from dataset
df = df[~df['case_id'].str.contains('1834609919572055750', na=False)]

### Split Data into Train and Test

In [11]:
# separate feature and target columns
X = df.iloc[:,0:-1]
y = df.iloc[:,-1:]

# split data into train and test splits
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=SEED)

# concenate to create test and train dataframes for seperate pre-processing to prevent data leakage
train_df = pd.concat([X_train,y_train], axis=1)
test_df = pd.concat([X_test,y_test], axis=1)

In [12]:
# show info for training and testing sets
for i in [train_df, test_df]:
    i.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3522 entries, 726 to 861
Data columns (total 11 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   case_id                                      3522 non-null   object 
 1   image_1_path                                 3522 non-null   object 
 2   image_2_path                                 2156 non-null   object 
 3   image_3_path                                 1615 non-null   object 
 4   combined_race                                1846 non-null   object 
 5   dermatologist_fitzpatrick_skin_type_label_1  3015 non-null   object 
 6   dermatologist_fitzpatrick_skin_type_label_2  431 non-null    object 
 7   dermatologist_fitzpatrick_skin_type_label_3  433 non-null    object 
 8   monk_skin_tone_label_india                   3511 non-null   float64
 9   monk_skin_tone_label_us                      3501 non-null   float64
 10  weig

### Remove Rows Where Weighted_Skin_Condition_Label is Blank

In [13]:
for i in [train_df, test_df]:

    # Replace "?" with null/NaN in weighted_skin_condition_label
    i['weighted_skin_condition_label'].replace('{}', np.nan, inplace=True)

    # Remove any rows with NaN in weighted_skin_condition_label
    i.dropna(subset=["weighted_skin_condition_label"], inplace=True)

    # show info for training and testing sets
    i.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2169 entries, 726 to 3093
Data columns (total 11 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   case_id                                      2169 non-null   object 
 1   image_1_path                                 2169 non-null   object 
 2   image_2_path                                 1400 non-null   object 
 3   image_3_path                                 1049 non-null   object 
 4   combined_race                                1236 non-null   object 
 5   dermatologist_fitzpatrick_skin_type_label_1  2054 non-null   object 
 6   dermatologist_fitzpatrick_skin_type_label_2  407 non-null    object 
 7   dermatologist_fitzpatrick_skin_type_label_3  409 non-null    object 
 8   monk_skin_tone_label_india                   2164 non-null   float64
 9   monk_skin_tone_label_us                      2159 non-null   float64
 10  wei

### Remove Features Where More than Half is Missing

In [14]:
# get columns where more than half of the values are missing in the training dataframe
cols_to_drop = []

for col in train_df:
    if 'image' not in col and train_df[col].isna().sum() > len(train_df.index)/2:
        cols_to_drop.append(col)

# drop columns from the training and testing dataframes
for i in [train_df, test_df]:
    i.drop(cols_to_drop, axis=1, inplace=True)

# Display dataframe info to see remaining attributes
for i in [train_df, test_df]:
    i.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2169 entries, 726 to 3093
Data columns (total 9 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   case_id                                      2169 non-null   object 
 1   image_1_path                                 2169 non-null   object 
 2   image_2_path                                 1400 non-null   object 
 3   image_3_path                                 1049 non-null   object 
 4   combined_race                                1236 non-null   object 
 5   dermatologist_fitzpatrick_skin_type_label_1  2054 non-null   object 
 6   monk_skin_tone_label_india                   2164 non-null   float64
 7   monk_skin_tone_label_us                      2159 non-null   float64
 8   weighted_skin_condition_label                2169 non-null   object 
dtypes: float64(2), object(7)
memory usage: 169.5+ KB
<class 'pandas.core.frame.Dat

### Split Labels and Label Weights

In [15]:
for i in [train_df, test_df]:
  # Using apply and a lambda to parse the dictionary and split into keys and values
  i['label'], i['weights'] = zip(*i['weighted_skin_condition_label'].apply(lambda x: (list(ast.literal_eval(x).keys()), list(ast.literal_eval(x).values()))))

# Display dataframe head and info
for i in [train_df, test_df]:
  display(i.head())
  i.info()

Unnamed: 0,case_id,image_1_path,image_2_path,image_3_path,combined_race,dermatologist_fitzpatrick_skin_type_label_1,monk_skin_tone_label_india,monk_skin_tone_label_us,weighted_skin_condition_label,label,weights
726,-3399323628854326999,dataset/images/-1189822939611839188.png,dataset/images/-6631837338743072685.png,dataset/images/4203933931229451208.png,HISPANIC_LATINO_OR_SPANISH_ORIGIN,FST4,4.0,4.0,"{'Perioral Dermatitis': 0.67, 'Acne': 0.33}","[Perioral Dermatitis, Acne]","[0.67, 0.33]"
2472,-9110612497360538721,dataset/images/2298102274545177880.png,,,,FST2,2.0,3.0,"{'Allergic Contact Dermatitis': 0.67, 'Eczema'...","[Allergic Contact Dermatitis, Eczema]","[0.67, 0.33]"
2688,1519975931025758567,dataset/images/3390191852224494870.png,,,HISPANIC_LATINO_OR_SPANISH_ORIGIN,FST3,2.0,5.0,"{'Pigmented purpuric eruption': 0.55, 'Tinea':...","[Pigmented purpuric eruption, Tinea, Eczema]","[0.55, 0.23, 0.23]"
3966,5837921410652230755,dataset/images/-1592222786083529679.png,dataset/images/-8932329685662456128.png,dataset/images/3238583509221468864.png,HISPANIC_LATINO_OR_SPANISH_ORIGIN,FST1,2.0,1.0,"{'Eczema': 0.43, 'Pigmented purpuric eruption'...","[Eczema, Pigmented purpuric eruption, Allergic...","[0.43, 0.09, 0.18, 0.09, 0.21]"
437,-2400802139054066549,dataset/images/-974618061209306157.png,,,,FST5,5.0,6.0,"{'Pityriasis rosea': 0.55, 'Parapsoriasis': 0....","[Pityriasis rosea, Parapsoriasis, Eczema]","[0.55, 0.23, 0.23]"


<class 'pandas.core.frame.DataFrame'>
Index: 2169 entries, 726 to 3093
Data columns (total 11 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   case_id                                      2169 non-null   object 
 1   image_1_path                                 2169 non-null   object 
 2   image_2_path                                 1400 non-null   object 
 3   image_3_path                                 1049 non-null   object 
 4   combined_race                                1236 non-null   object 
 5   dermatologist_fitzpatrick_skin_type_label_1  2054 non-null   object 
 6   monk_skin_tone_label_india                   2164 non-null   float64
 7   monk_skin_tone_label_us                      2159 non-null   float64
 8   weighted_skin_condition_label                2169 non-null   object 
 9   label                                        2169 non-null   object 
 10  wei

Unnamed: 0,case_id,image_1_path,image_2_path,image_3_path,combined_race,dermatologist_fitzpatrick_skin_type_label_1,monk_skin_tone_label_india,monk_skin_tone_label_us,weighted_skin_condition_label,label,weights
3129,2975869998074215405,dataset/images/-8309623388500227648.png,dataset/images/3190471120217261671.png,dataset/images/3233353314629424627.png,WHITE,FST2,2.0,2.0,"{'Herpes Zoster': 0.5, 'Herpes Simplex': 0.5}","[Herpes Zoster, Herpes Simplex]","[0.5, 0.5]"
4862,8751100562164689123,dataset/images/-4075936252847654230.png,,,"HISPANIC_LATINO_OR_SPANISH_ORIGIN,WHITE",,3.0,3.0,"{'Insect Bite': 0.55, 'Erythema nodosum': 0.23...","[Insect Bite, Erythema nodosum, Hypersensitivity]","[0.55, 0.23, 0.23]"
734,-3426382639481624876,dataset/images/7049703890719675614.png,,,WHITE,FST1,2.0,2.0,"{'Eczema': 0.33, 'Allergic Contact Dermatitis'...","[Eczema, Allergic Contact Dermatitis, Irritant...","[0.33, 0.33, 0.33]"
2306,-8545507601606408653,dataset/images/-5228362336502809630.png,dataset/images/5066351493764841468.png,dataset/images/563329283817116517.png,,FST2,3.0,3.0,"{'Eczema': 0.67, 'Hypersensitivity': 0.33}","[Eczema, Hypersensitivity]","[0.67, 0.33]"
1375,-5499150212420647396,dataset/images/-3027327548738103934.png,dataset/images/-736897832503133263.png,dataset/images/3097647019489022813.png,BLACK_OR_AFRICAN_AMERICAN,FST4,8.0,9.0,"{'Eczema': 0.35, 'Lichen nitidus': 0.35, 'Mili...","[Eczema, Lichen nitidus, Milia, Hypersensitivity]","[0.35, 0.35, 0.15, 0.15]"


<class 'pandas.core.frame.DataFrame'>
Index: 891 entries, 3129 to 30
Data columns (total 11 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   case_id                                      891 non-null    object 
 1   image_1_path                                 891 non-null    object 
 2   image_2_path                                 587 non-null    object 
 3   image_3_path                                 419 non-null    object 
 4   combined_race                                521 non-null    object 
 5   dermatologist_fitzpatrick_skin_type_label_1  836 non-null    object 
 6   monk_skin_tone_label_india                   890 non-null    float64
 7   monk_skin_tone_label_us                      890 non-null    float64
 8   weighted_skin_condition_label                891 non-null    object 
 9   label                                        891 non-null    object 
 10  weigh

In [16]:
for i in [train_df, test_df]:
  i.drop('weighted_skin_condition_label', axis=1, inplace = True)

### Impute Data

MissForest

In [17]:
# Display dataframe info
for i in [train_df, test_df]:
  i.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2169 entries, 726 to 3093
Data columns (total 10 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   case_id                                      2169 non-null   object 
 1   image_1_path                                 2169 non-null   object 
 2   image_2_path                                 1400 non-null   object 
 3   image_3_path                                 1049 non-null   object 
 4   combined_race                                1236 non-null   object 
 5   dermatologist_fitzpatrick_skin_type_label_1  2054 non-null   object 
 6   monk_skin_tone_label_india                   2164 non-null   float64
 7   monk_skin_tone_label_us                      2159 non-null   float64
 8   label                                        2169 non-null   object 
 9   weights                                      2169 non-null   object 
dtypes: 

In [18]:
# Remove PREFER_NOT_TO_ANSWER
for i in [train_df, test_df]:
  i['combined_race'].replace('PREFER_NOT_TO_ANSWER', np.nan, inplace=True)

In [19]:
# Columns that we don't want to use imputation
exclude_columns = ['case_id','image_1_path','image_2_path','image_3_path','label','weights']

In [20]:
# Separate the columns to be imputed and the excluded columns
for i in [train_df, test_df]:
  if i is train_df:
    train_df_to_impute = train_df.drop(columns=exclude_columns)
    train_df_excluded = train_df[exclude_columns]
  else:
    test_df_to_impute = test_df.drop(columns=exclude_columns)
    test_df_excluded = test_df[exclude_columns]

In [21]:
# Initialize the imputer
imputer = MissForest()
imputer.fit(x=train_df_to_impute,categorical=['combined_race', 'dermatologist_fitzpatrick_skin_type_label_1', 'monk_skin_tone_label_india','monk_skin_tone_label_us'])

In [22]:
# Apply imputation to train and test sets
for i in [train_df_to_impute, test_df_to_impute]:
  imputed_df = imputer.transform(x=i)

  if i is train_df_to_impute:
    train_df_imputed = pd.concat([train_df_excluded, imputed_df], axis=1)
  else:
    test_df_imputed = pd.concat([test_df_excluded, imputed_df], axis=1)

100%|██████████| 5/5 [00:08<00:00,  1.63s/it]
100%|██████████| 5/5 [00:04<00:00,  1.18it/s]


In [23]:
# Assign imputed dataframes back to train and test sets
train_df = train_df_imputed[['case_id','image_1_path','image_2_path','image_3_path','combined_race',
                             'dermatologist_fitzpatrick_skin_type_label_1', 'monk_skin_tone_label_india',
                             'monk_skin_tone_label_us','label','weights']]
test_df = test_df_imputed[['case_id','image_1_path','image_2_path','image_3_path','combined_race',
                             'dermatologist_fitzpatrick_skin_type_label_1', 'monk_skin_tone_label_india',
                             'monk_skin_tone_label_us','label','weights']]

In [24]:
# Display dataframe info
for i in [train_df, test_df]:
  i.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2169 entries, 726 to 3093
Data columns (total 10 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   case_id                                      2169 non-null   object 
 1   image_1_path                                 2169 non-null   object 
 2   image_2_path                                 1400 non-null   object 
 3   image_3_path                                 1049 non-null   object 
 4   combined_race                                2169 non-null   object 
 5   dermatologist_fitzpatrick_skin_type_label_1  2169 non-null   object 
 6   monk_skin_tone_label_india                   2169 non-null   float64
 7   monk_skin_tone_label_us                      2169 non-null   float64
 8   label                                        2169 non-null   object 
 9   weights                                      2169 non-null   object 
dtypes: 

### Ordinal Encoding

In [25]:
# create a list of columnns to encode
feat_to_encode = ['combined_race','dermatologist_fitzpatrick_skin_type_label_1',
                  'monk_skin_tone_label_india','monk_skin_tone_label_us']

# create OrdinalEncoder and fit to training data
ordinal_encoder = OrdinalEncoder()
ordinal_encoder.fit(train_df[feat_to_encode])

# encode columns in training and testing dataframes using the encoder fitted to the training data
for i in [train_df,test_df]:
    i[feat_to_encode] = ordinal_encoder.transform(i[feat_to_encode])

### Check for Low Variance

In [26]:
# set variance threshold and fit to training dataframe excluding target class and age
var_threshold = VarianceThreshold(threshold = 0.25)
var_threshold.fit(train_df.iloc[:,4:-2])

# array of whether each feature has low variance (False) or high variance (True)
var_threshold.get_support()

array([ True,  True,  True,  True])

In [27]:
# get a list of columns to drop with low variance
cols_to_drop = [c for c in train_df.iloc[:,4:-2].columns
                if c not in train_df.iloc[:,4:-2].columns[var_threshold.get_support()]]

# drop any low variance features from the training and testing dataframes
for i in [train_df, test_df]:
    i.drop(cols_to_drop, axis=1, inplace=True)

# Display dataframe info to see remaining attributes
for i in [train_df, test_df]:
    i.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2169 entries, 726 to 3093
Data columns (total 10 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   case_id                                      2169 non-null   object 
 1   image_1_path                                 2169 non-null   object 
 2   image_2_path                                 1400 non-null   object 
 3   image_3_path                                 1049 non-null   object 
 4   combined_race                                2169 non-null   float64
 5   dermatologist_fitzpatrick_skin_type_label_1  2169 non-null   float64
 6   monk_skin_tone_label_india                   2169 non-null   float64
 7   monk_skin_tone_label_us                      2169 non-null   float64
 8   label                                        2169 non-null   object 
 9   weights                                      2169 non-null   object 
dtypes: 

### Decode Combined_Race

In [28]:
# Decode the combined_race column in the train and test sets
for i in [train_df,test_df]:
    i['combined_race'] = ordinal_encoder.inverse_transform(i[feat_to_encode])[:, feat_to_encode.index('combined_race')]

In [29]:
train_df.head()

Unnamed: 0,case_id,image_1_path,image_2_path,image_3_path,combined_race,dermatologist_fitzpatrick_skin_type_label_1,monk_skin_tone_label_india,monk_skin_tone_label_us,label,weights
726,-3399323628854326999,dataset/images/-1189822939611839188.png,dataset/images/-6631837338743072685.png,dataset/images/4203933931229451208.png,HISPANIC_LATINO_OR_SPANISH_ORIGIN,3.0,3.0,3.0,"[Perioral Dermatitis, Acne]","[0.67, 0.33]"
2472,-9110612497360538721,dataset/images/2298102274545177880.png,,,WHITE,1.0,1.0,2.0,"[Allergic Contact Dermatitis, Eczema]","[0.67, 0.33]"
2688,1519975931025758567,dataset/images/3390191852224494870.png,,,HISPANIC_LATINO_OR_SPANISH_ORIGIN,2.0,1.0,4.0,"[Pigmented purpuric eruption, Tinea, Eczema]","[0.55, 0.23, 0.23]"
3966,5837921410652230755,dataset/images/-1592222786083529679.png,dataset/images/-8932329685662456128.png,dataset/images/3238583509221468864.png,HISPANIC_LATINO_OR_SPANISH_ORIGIN,0.0,1.0,0.0,"[Eczema, Pigmented purpuric eruption, Allergic...","[0.43, 0.09, 0.18, 0.09, 0.21]"
437,-2400802139054066549,dataset/images/-974618061209306157.png,,,WHITE,4.0,4.0,5.0,"[Pityriasis rosea, Parapsoriasis, Eczema]","[0.55, 0.23, 0.23]"


### Get a List of All Unique Skin Conditions

In [30]:
# Get a list of all skin conditions
skin_conditions = sum(train_df['label'], [])

In [31]:
# number of all skin conditions
len(skin_conditions)

5384

In [32]:
# Remove duplicates to get a list of all unique skin diseases
skin_conditions = list(set(skin_conditions))

In [33]:
# number of all skin conditions
len(skin_conditions)

311

### One-Hot Encoding

In [34]:
# create One-Hot Encoder and fit to training data
one_hot_encoder = OneHotEncoder(sparse_output=False)
one_hot_encoder.fit(train_df[['combined_race']])

# Encode column for train_df
ohe_array_train = one_hot_encoder.transform(train_df[['combined_race']])
columns_train = one_hot_encoder.get_feature_names_out(['combined_race'])
ohe_df_train = pd.DataFrame(ohe_array_train, columns=columns_train)
train_df.reset_index(drop=True, inplace=True)
ohe_df_train.reset_index(drop=True, inplace=True)
train_df = pd.concat([train_df, ohe_df_train], axis=1)
train_df.drop('combined_race', axis=1, inplace=True)

# Encode column for test_df
ohe_array_test = one_hot_encoder.transform(test_df[['combined_race']])
columns_test = one_hot_encoder.get_feature_names_out(['combined_race'])
ohe_df_test = pd.DataFrame(ohe_array_test, columns=columns_test)
test_df.reset_index(drop=True, inplace=True)
ohe_df_test.reset_index(drop=True, inplace=True)
test_df = pd.concat([test_df, ohe_df_test], axis=1)
test_df.drop('combined_race', axis=1, inplace=True)

In [35]:
# Display heads for train and test sets
for i in [train_df, test_df]:
  display(i.head())

Unnamed: 0,case_id,image_1_path,image_2_path,image_3_path,dermatologist_fitzpatrick_skin_type_label_1,monk_skin_tone_label_india,monk_skin_tone_label_us,label,weights,combined_race_AMERICAN_INDIAN_OR_ALASKA_NATIVE,...,combined_race_BLACK_OR_AFRICAN_AMERICAN,"combined_race_BLACK_OR_AFRICAN_AMERICAN,HISPANIC_LATINO_OR_SPANISH_ORIGIN","combined_race_BLACK_OR_AFRICAN_AMERICAN,WHITE",combined_race_HISPANIC_LATINO_OR_SPANISH_ORIGIN,"combined_race_HISPANIC_LATINO_OR_SPANISH_ORIGIN,WHITE",combined_race_MIDDLE_EASTERN_OR_NORTH_AFRICAN,combined_race_NATIVE_HAWAIIAN_OR_PACIFIC_ISLANDER,combined_race_OTHER_RACE,combined_race_TWO_OR_MORE_AFTER_MITIGATION,combined_race_WHITE
0,-3399323628854326999,dataset/images/-1189822939611839188.png,dataset/images/-6631837338743072685.png,dataset/images/4203933931229451208.png,3.0,3.0,3.0,"[Perioral Dermatitis, Acne]","[0.67, 0.33]",0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-9110612497360538721,dataset/images/2298102274545177880.png,,,1.0,1.0,2.0,"[Allergic Contact Dermatitis, Eczema]","[0.67, 0.33]",0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1519975931025758567,dataset/images/3390191852224494870.png,,,2.0,1.0,4.0,"[Pigmented purpuric eruption, Tinea, Eczema]","[0.55, 0.23, 0.23]",0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5837921410652230755,dataset/images/-1592222786083529679.png,dataset/images/-8932329685662456128.png,dataset/images/3238583509221468864.png,0.0,1.0,0.0,"[Eczema, Pigmented purpuric eruption, Allergic...","[0.43, 0.09, 0.18, 0.09, 0.21]",0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-2400802139054066549,dataset/images/-974618061209306157.png,,,4.0,4.0,5.0,"[Pityriasis rosea, Parapsoriasis, Eczema]","[0.55, 0.23, 0.23]",0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


Unnamed: 0,case_id,image_1_path,image_2_path,image_3_path,dermatologist_fitzpatrick_skin_type_label_1,monk_skin_tone_label_india,monk_skin_tone_label_us,label,weights,combined_race_AMERICAN_INDIAN_OR_ALASKA_NATIVE,...,combined_race_BLACK_OR_AFRICAN_AMERICAN,"combined_race_BLACK_OR_AFRICAN_AMERICAN,HISPANIC_LATINO_OR_SPANISH_ORIGIN","combined_race_BLACK_OR_AFRICAN_AMERICAN,WHITE",combined_race_HISPANIC_LATINO_OR_SPANISH_ORIGIN,"combined_race_HISPANIC_LATINO_OR_SPANISH_ORIGIN,WHITE",combined_race_MIDDLE_EASTERN_OR_NORTH_AFRICAN,combined_race_NATIVE_HAWAIIAN_OR_PACIFIC_ISLANDER,combined_race_OTHER_RACE,combined_race_TWO_OR_MORE_AFTER_MITIGATION,combined_race_WHITE
0,2975869998074215405,dataset/images/-8309623388500227648.png,dataset/images/3190471120217261671.png,dataset/images/3233353314629424627.png,1.0,1.0,1.0,"[Herpes Zoster, Herpes Simplex]","[0.5, 0.5]",0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,8751100562164689123,dataset/images/-4075936252847654230.png,,,1.0,2.0,2.0,"[Insect Bite, Erythema nodosum, Hypersensitivity]","[0.55, 0.23, 0.23]",0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,-3426382639481624876,dataset/images/7049703890719675614.png,,,0.0,1.0,1.0,"[Eczema, Allergic Contact Dermatitis, Irritant...","[0.33, 0.33, 0.33]",0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-8545507601606408653,dataset/images/-5228362336502809630.png,dataset/images/5066351493764841468.png,dataset/images/563329283817116517.png,1.0,2.0,2.0,"[Eczema, Hypersensitivity]","[0.67, 0.33]",0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,-5499150212420647396,dataset/images/-3027327548738103934.png,dataset/images/-736897832503133263.png,dataset/images/3097647019489022813.png,3.0,7.0,8.0,"[Eczema, Lichen nitidus, Milia, Hypersensitivity]","[0.35, 0.35, 0.15, 0.15]",0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Multilabelbinarizer and Weight Mapping

In [36]:
# Function to map the corresponding weights to the labels
def map_weights(row, labels, weights):
    # Create a dictionary of label-weight pairs
    weight_dict = dict(zip(labels, weights))
    # For each label in the complete list (mlb.classes_), map its weight or set 0 if it's not in the row
    return [weight_dict.get(label, 0) for label in mlb.classes_]

In [37]:
# Create multilabelbinarizer and fit to training data
mlb = MultiLabelBinarizer()
mlb.fit(train_df['label'])

In [38]:
# Perform mlb and weight mapping on the train set

# Create a new DataFrame with encoded labels
encoded_labels_train = mlb.transform(train_df['label'])
encoded_labels_df_train = pd.DataFrame(encoded_labels_train, columns=mlb.classes_)

# Apply the weight mapping for each row
df_weights_mapped_train = train_df.apply(lambda row: map_weights(row['label'], row['label'], row['weights']), axis=1)

# Create a new DataFrame with the mapped weights
weights_df_train = pd.DataFrame(df_weights_mapped_train.tolist(), columns=mlb.classes_)

In [39]:
# Perform mlb and weight mapping on the test set

# Create a new DataFrame with encoded labels
encoded_labels_test = mlb.transform(test_df['label'])
encoded_labels_df_test = pd.DataFrame(encoded_labels_test, columns=mlb.classes_)

# Apply the weight mapping for each row
df_weights_mapped_test = test_df.apply(lambda row: map_weights(row['label'], row['label'], row['weights']), axis=1)

# Create a new DataFrame with the mapped weights
weights_df_test = pd.DataFrame(df_weights_mapped_test.tolist(), columns=mlb.classes_)

### Group and Prune Rare Diseases

In [40]:
# Reform train set
X_train = train_df.drop(columns=['label', 'weights'])
y_train = encoded_labels_df_train
train_df = pd.concat([X_train, y_train], axis=1)

# Reform test set
X_test = test_df.drop(columns=['label', 'weights'])
y_test = encoded_labels_df_test
test_df = pd.concat([X_test, y_test], axis=1)

In [41]:
# Identify label columns
label_columns = train_df.iloc[:, 21:].columns

# Calculate label frequencies (counts of positive labels for each label)
label_counts = train_df[label_columns].sum()
print(f"Label Frequencies: {label_counts}")

# Define rare labels (appear <= 1% times)
# Calculate the rare threshold as 1% of the total number of positive labels
total_positive_labels_before = label_counts.sum()
rare_threshold = total_positive_labels_before * 0.005
rare_labels = label_counts[label_counts <= rare_threshold].index.tolist()
print(f"Rare Threshold: {rare_threshold} ")
print(f"Rare Labels: {rare_labels}")

# Calculate the total number of positive labels removed
positive_labels_removed = label_counts[rare_labels].sum()
print(f"Total positive labels before removal: {total_positive_labels_before}")
print(f"Total positive labels removed: {positive_labels_removed}")

# Compute the percentage of positive labels removed
percentage_removed = (positive_labels_removed / total_positive_labels_before) * 100
print(f"Percentage of positive labels removed: {percentage_removed:.2f}%")

# Function to add 'Rare Disease' column and drop rare labels
def process_dataframe(df, rare_labels):
    # Initialize 'Rare Disease' column to 0
    df['Rare Disease'] = 0

    if rare_labels:
        # Ensure the rare_labels exist in the dataframe to avoid KeyError
        existing_rare_labels = [label for label in rare_labels if label in df.columns]

        if existing_rare_labels:
            # Set 'Rare Disease' to 1 if any rare label is present in the row
            df['Rare Disease'] = df[existing_rare_labels].max(axis=1)

            # Drop the rare label columns from the dataframe
            df.drop(columns=existing_rare_labels, inplace=True)

    return df

# Process train_df
train_df = process_dataframe(train_df, rare_labels)

# Process test_df
test_df = process_dataframe(test_df, rare_labels)

# If you decide not to use the 'Rare Disease' column, you can drop it
train_df = train_df.drop(columns=['Rare Disease'])
test_df = test_df.drop(columns=['Rare Disease'])

Label Frequencies: AKV - Acrokeratosis verruciformis                                 1
Abrasion and/or friction burn of lower limb without infection     1
Abrasion and/or friction burn of thigh without infection          1
Abrasion of wrist                                                 2
Abrasion, scrape, or scab                                        32
                                                                 ..
scurvy                                                            1
superficial hemorrhage                                            1
unilateral laterothoracic exanthem                                1
varicose vein                                                     1
wound/abrasion                                                    1
Length: 311, dtype: int64
Rare Threshold: 26.92 
Rare Labels: ['AKV - Acrokeratosis verruciformis', 'Abrasion and/or friction burn of lower limb without infection', 'Abrasion and/or friction burn of thigh without infection', 'Abrasio

In [42]:
# Remove any rows where no disease label is present after pruning
train_df = train_df[train_df.iloc[:, 21:].any(axis=1)]
test_df = test_df[test_df.iloc[:, 21:].any(axis=1)]

### Check for Independent Features

In [43]:
# set chi2 score for independence check and desired number of features
selector = SelectKBest(chi2, k=1)

# fit to training dataframe
selector.fit(train_df.iloc[:, 4:21],  train_df.iloc[:, 21:])

# array of whether each feature is independent
selector.get_support()

# get a list of dependent columns to drop
cols_to_drop = [c for c in train_df.iloc[:, 4:21].columns
                if c not in train_df.iloc[:, 4:21].columns[selector.get_support()]]

# drop the dependent features from the training and testing dataframes
for i in [train_df, test_df]:
    i.drop(cols_to_drop, axis=1, inplace=True)

# Display dataframe info to see remaining attributes
for i in [train_df, test_df]:
    i.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2009 entries, 0 to 2168
Data columns (total 43 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   case_id                           2009 non-null   object 
 1   image_1_path                      2009 non-null   object 
 2   image_2_path                      1309 non-null   object 
 3   image_3_path                      979 non-null    object 
 4   monk_skin_tone_label_us           2009 non-null   float64
 5   Abrasion, scrape, or scab         2009 non-null   int64  
 6   Abscess                           2009 non-null   int64  
 7   Acne                              2009 non-null   int64  
 8   Acute and chronic dermatitis      2009 non-null   int64  
 9   Acute dermatitis, NOS             2009 non-null   int64  
 10  Allergic Contact Dermatitis       2009 non-null   int64  
 11  CD - Contact dermatitis           2009 non-null   int64  
 12  Cellulitis 

### Power Labeling and Random Over Sampling

In [44]:
# Get label column names
labels = train_df.iloc[:, 5:].columns

# Assign weights
n = len(labels)
weights = [2**(n - 1 - i) for i in range(n)]

# Compute the powerlabel using the dot product
train_df['powerlabel'] = train_df[labels].dot(weights)

In [45]:
# Calculate class counts and the average count
class_counts = train_df['powerlabel'].value_counts()
average_count = int(train_df['powerlabel'].value_counts().unique().mean())

print("Class Counts:")
print(class_counts)
print(f"\nAverage Count per Class: {average_count}")

# Initialize a list to hold the balanced DataFrames
balanced_dfs = []

# Iterate through each class and resample
for cls, count in class_counts.items():
    df_class = train_df[train_df['powerlabel'] == cls]

    if count < average_count:
        # Upsample minority class
        df_resampled = resample(
            df_class,
            replace=True,             # Sample with replacement
            n_samples=average_count,  # To match the average count
            random_state=SEED           # For reproducibility
        )
        print(f"Upsampled class '{cls}' from {count} to {average_count}")

    elif count > average_count:
        # Downsample majority class
        df_resampled = resample(
            df_class,
            replace=False,            # Sample without replacement
            n_samples=average_count,  # To match the average count
            random_state=SEED           # For reproducibility
        )
        print(f"Downsampled class '{cls}' from {count} to {average_count}")

    else:
        # If the class already has the average count, keep it as is
        df_resampled = df_class
        print(f"Class '{cls}' already has {average_count} samples. No resampling needed.")

    # Append the resampled DataFrame to the list
    balanced_dfs.append(df_resampled)

# Combine all balanced classes into a single DataFrame
train_df_balanced = pd.concat(balanced_dfs).reset_index(drop=True)

# Shuffle the DataFrame to mix the classes
train_df = train_df_balanced.sample(frac=1, random_state=SEED).reset_index(drop=True)

# Verify the new class distribution
new_class_counts = train_df['powerlabel'].value_counts()
print("\nNew Class Counts After Balancing:")
print(new_class_counts)

train_df = train_df.drop(columns=['powerlabel'])

Class Counts:
powerlabel
268435456      115
4               67
4563402752      64
4295491584      51
4294967296      46
              ... 
8590065696       1
4298113024       1
4294971396       1
6443499524       1
17448304656      1
Name: count, Length: 599, dtype: int64

Average Count per Class: 24
Downsampled class '268435456' from 115 to 24
Downsampled class '4' from 67 to 24
Downsampled class '4563402752' from 64 to 24
Downsampled class '4295491584' from 51 to 24
Downsampled class '4294967296' from 46 to 24
Downsampled class '1048576' from 41 to 24
Downsampled class '67108864' from 37 to 24
Downsampled class '268435968' from 32 to 24
Downsampled class '512' from 28 to 24
Downsampled class '268435984' from 28 to 24
Downsampled class '4296015876' from 28 to 24
Upsampled class '34359738368' from 23 to 24
Upsampled class '268435472' from 22 to 24
Upsampled class '68157440' from 20 to 24
Upsampled class '8192' from 20 to 24
Upsampled class '4294967300' from 19 to 24
Upsampled class '85

### Assign y_train

In [46]:
# Assign y_train to later get number of labels in target
y_train = train_df.iloc[:, 5:]

### Create Validation and Test Sets

In [47]:
# separate feature and target columns
X = test_df.iloc[:,0:5]
y = test_df.iloc[:, 5:]

# split data into test and validation splits
X_test, X_val, y_test, y_val = train_test_split(X, y, test_size=0.5, random_state=SEED)

# concenate to create test and train dataframes for seperate pre-processing to prevent data leakage
test_df = pd.concat([X_test,y_test], axis=1)
val_df = pd.concat([X_val,y_val], axis=1)

# MobileNet Model for Multi Label Image Classification (Image Input Only)

In [48]:
# Confirm whether GPU is available
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [49]:
# Define constants
IMAGE_SIZE = (224, 224)
BATCH_SIZE = 64
EPOCHS = 200
NUM_CLASSES = y_train.shape[1]

In [50]:
# Compute label weights
num_classes = y_train.shape[1]
class_weights_dict = {}
for i in range(num_classes):
    y_column = y_train.iloc[:, i].values
    weights = class_weight.compute_class_weight(
        class_weight='balanced',
        classes=np.unique(y_column),
        y=y_column
    )
    class_weights_dict[i] = weights[1] if len(weights) > 1 else 1.0

print("Computed Class Weights:", class_weights_dict)

Computed Class Weights: {0: 18.71875, 1: 23.03846153846154, 2: 14.261904761904763, 3: 14.975, 4: 6.965116279069767, 5: 1.4609756097560975, 6: 7.304878048780488, 7: 11.98, 8: 5.076271186440678, 9: 1.34304932735426, 10: 17.61764705882353, 11: 3.9407894736842106, 12: 13.613636363636363, 13: 7.881578947368421, 14: 8.557142857142857, 15: 5.759615384615385, 16: 6.965116279069767, 17: 2.9362745098039214, 18: 4.047297297297297, 19: 16.63888888888889, 20: 8.557142857142857, 21: 11.092592592592593, 22: 7.4875, 23: 9.359375, 24: 23.03846153846154, 25: 14.975, 26: 9.661290322580646, 27: 9.661290322580646, 28: 4.470149253731344, 29: 23.03846153846154, 30: 9.983333333333333, 31: 13.021739130434783, 32: 11.092592592592593, 33: 6.112244897959184, 34: 14.261904761904763, 35: 4.047297297297297, 36: 18.71875, 37: 8.557142857142857}


In [51]:
# Define weights for labels and positive labels
POS_WEIGHT = 10
label_weights = tf.constant(list(class_weights_dict.values()), dtype=tf.float32)

In [52]:
# Custom weighted loss function
def custom_binary_crossentropy(POS_WEIGHT, label_weights):
  def weighted_binary_crossentropy(target, output):

    # transform back to logits
    _epsilon = tf.convert_to_tensor(tf.keras.backend.epsilon(), output.dtype.base_dtype) # Change: Use tf.convert_to_tensor instead of tfb._to_tensor
    output = tf.clip_by_value(output, _epsilon, 1 - _epsilon)
    output = tf.math.log(output / (1 - output))
    # compute weighted loss
    loss = tf.nn.weighted_cross_entropy_with_logits(labels=target, logits=output, pos_weight=POS_WEIGHT)

    weighted_loss = loss * label_weights

    return tf.reduce_mean(weighted_loss, axis=-1)
  return weighted_binary_crossentropy

In [53]:
class SCINGenerator(Sequence):
    def __init__(self, dataframe, batch_size, image_size, bucket, shuffle=True, augment=False, seed=SEED):
        self.df = dataframe.reset_index(drop=True)
        self.batch_size = batch_size
        self.image_size = image_size
        self.bucket = bucket
        self.shuffle = shuffle
        self.augment = augment
        self.seed = seed
        self.rng = np.random.default_rng(self.seed)  # Local random generator

        self.indices = np.arange(len(self.df))
        if self.shuffle:
            self.indices = self.rng.permutation(self.indices)  # Shuffle using local generator

        # Define data augmentation
        if self.augment:
            self.data_augmentation = tf.keras.Sequential([
                tf.keras.layers.RandomFlip("horizontal"),
                tf.keras.layers.RandomRotation(0.2),
                tf.keras.layers.RandomZoom(0.2),
            ])
        else:
            self.data_augmentation = None

    def __len__(self):
        return int(np.ceil(len(self.df) / self.batch_size))

    def on_epoch_end(self):
        if self.shuffle:
            self.indices = self.rng.permutation(self.indices)

    def load_image(self, image_path):

        base_path = '/content/SCIN_Dataset/dx-scin-public-data/'

        if pd.isna(image_path):
            # Return a placeholder image if the path is NaN
            img = np.zeros((*self.image_size, 3), dtype=np.float32)
            img_array = tf.keras.preprocessing.image.img_to_array(img)
            return img_array
        else:
            try:
                full_path = os.path.join(base_path, image_path)
                img = Image.open(full_path)
                img = img.resize(self.image_size)
                img_array = tf.keras.preprocessing.image.img_to_array(img)
                img_array = tf.keras.applications.mobilenet.preprocess_input(img_array)
                return img_array
            except Exception as e:
                print(f"Error loading image {image_path}: {e}")
                img = np.zeros((*self.image_size, 3), dtype=np.float32)
                img_array = tf.keras.preprocessing.image.img_to_array(img)
                return img_array

    def load_images_batch(self, image_paths1):
        with ThreadPoolExecutor() as executor:
            images1 = list(executor.map(self.load_image, image_paths1))
        return np.array(images1)

    def __getitem__(self, index):
        batch_indices = self.indices[index * self.batch_size:(index + 1) * self.batch_size]
        batch_df = self.df.iloc[batch_indices]

        # Load images
        images1 = batch_df['image_1_path'].values
        X = self.load_images_batch(images1)

        # Load labels
        y = batch_df[y_train.columns].values.astype(np.float32)

        # Apply augmentation if enabled
        if self.augment and self.data_augmentation is not None:
            X = self.data_augmentation(X)

        # Return a tuple of tensors instead of a list
        return X, y

In [54]:
# Build model and define hyper parameters for hyper parameter tuning
def build_model(hp):
    # Tune learning rate
    learning_rate = hp.Choice(
        'learning_rate',
        values=[1e-3, 1e-4, 1e-5]
    )

    # Tune Number of Units in Dense Layers
    # Tune number of units in first dense layer
    units_dense1 = hp.Choice(
        'units_dense1',
        values=[128, 256, 512]
    )

    # Tune number of units in second dense layer
    units_dense2 = hp.Choice(
        'units_dense2',
        values=[128, 256, 512]
    )

    # Tune Dropout Rates
    # Tune the dropout rate after the first dense layer
    dropout_rate1 = hp.Choice(
        'dropout_rate1',
        values=[0.2, 0.3, 0.4, 0.5]
    )

    # Tune the dropout rate after the second dense layer
    dropout_rate2 = hp.Choice(
        'dropout_rate2',
        values=[0.2, 0.3, 0.4, 0.5]
    )

    # Tune L2 regularization factor
    l2_reg = hp.Choice(
        'l2_reg',
        values=[1e-3, 1e-4, 1e-5]
    )

    # Tune Optimizer
    optimizer_name = hp.Choice(
        'optimizer',
        values=['Adam', 'AdamW']
    )

    if optimizer_name == 'Adam':
        optimizer = Adam(learning_rate=learning_rate)
    elif optimizer_name == 'AdamW':
        optimizer = AdamW(learning_rate=learning_rate)
    else:
        raise ValueError("Invalid optimizer choice")

    # Define the base MobileNet model
    base_model = MobileNet(weights='imagenet', include_top=False, input_shape=IMAGE_SIZE + (3,))
    base_model.trainable = False

    # Define input
    input = Input(shape=IMAGE_SIZE + (3,), name='input')

    # Process input through the base model
    features = base_model(input)

    # Apply Batch Normalization
    features = BatchNormalization()(features)

    # Apply Global Average Pooling
    gap = GlobalAveragePooling2D()
    features = gap(features)

    # Fully connected layers
    dense = Dense(units_dense1, activation='relu',
                  kernel_regularizer=regularizers.l2(l2_reg))(features)
    dense = Dropout(dropout_rate1)(dense)
    dense = Dense(units_dense2, activation='relu',
                  kernel_regularizer=regularizers.l2(l2_reg))(dense)
    dense = Dropout(dropout_rate2)(dense)

    # Output layer
    output = Dense(NUM_CLASSES, activation='sigmoid', name='output')(dense)

    # Create the model
    model = Model(inputs=input, outputs=output)

    # Compile the model
    model.compile(
        optimizer=optimizer,
        loss=custom_binary_crossentropy(POS_WEIGHT, label_weights),
        metrics=[
            'binary_accuracy',
            tf.keras.metrics.F1Score(name='f1_score', average='micro'),
            tf.keras.metrics.AUC(multi_label=True, curve='ROC', name='roc_auc_score'),
            tf.keras.metrics.AUC(multi_label=True, curve='PR', name='pr_auc_score'),
            tf.keras.metrics.Precision(name='precision'),
            tf.keras.metrics.Recall(name='recall')
        ]
    )

    return model

In [55]:
# Initialize the generators
train_generator = SCINGenerator(
    dataframe=train_df,
    batch_size=BATCH_SIZE,
    image_size=IMAGE_SIZE,
    bucket=Globals.gcs_bucket,
    shuffle=True,
    augment=True,   # Enable augmentation for training
    seed=SEED   # Set seed for reproducibility
)

validation_generator = SCINGenerator(
    dataframe=val_df,
    batch_size=BATCH_SIZE,
    image_size=IMAGE_SIZE,
    bucket=Globals.gcs_bucket,
    shuffle=False,
    augment=False  # No augmentation for validation
)

test_generator = SCINGenerator(
    dataframe= test_df,
    batch_size=BATCH_SIZE,
    image_size=IMAGE_SIZE,
    bucket=Globals.gcs_bucket,
    shuffle=False,
    augment=False  # No augmentation for validation
)

In [56]:
# Initialise the tuner using Hyperband
tuner = BayesianOptimization(
    build_model,
    objective=kt.Objective('val_f1_score', direction='max'),
    max_trials=30,
    executions_per_trial=2,
    seed=SEED,
    directory='/content/drive/MyDrive/Hyperparameter Tuning/Bayesian Optimisation',
    project_name='Images'
)

Reloading Tuner from /content/drive/MyDrive/Hyperparameter Tuning/Bayesian Optimisation/Images/tuner0.json


In [57]:
# Define early stopping for the tuner
stop_early = EarlyStopping(monitor='val_f1_score', mode='max', patience=5)

In [58]:
# Execute the search
tuner.search(
    train_generator,
    epochs=EPOCHS,
    validation_data=validation_generator,
    callbacks=[stop_early],
    verbose=1
)

In [59]:
# Retrieve the best hyperparameters
best_hp = tuner.get_best_hyperparameters(num_trials=1)[0]

In [60]:
print("\nBest Hyperparameters:")
print(f"Learning Rate: {best_hp.get('learning_rate')}")
print(f"Units in Dense Layer 1: {best_hp.get('units_dense1')}")
print(f"Units in Dense Layer 2: {best_hp.get('units_dense2')}")
print(f"Dropout Rate1: {best_hp.get('dropout_rate1')}")
print(f"Dropout Rate2: {best_hp.get('dropout_rate2')}")
print(f"L2 Regularization: {best_hp.get('l2_reg')}")
print(f"Optimizer: {best_hp.get('optimizer')}")


Best Hyperparameters:
Learning Rate: 1e-05
Units in Dense Layer 1: 128
Units in Dense Layer 2: 512
Dropout Rate1: 0.5
Dropout Rate2: 0.3
L2 Regularization: 0.001
Optimizer: AdamW


In [61]:
# Get the best model
best_model = tuner.get_best_models(num_models=1)[0]

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet/mobilenet_1_0_224_tf_no_top.h5
[1m17225924/17225924[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 0us/step


In [62]:
# Save the best model to Google Drive
best_model.save('/content/drive/MyDrive/best_model_images.keras')

# Save a back up to
best_model.save('/content/drive/MyDrive/best_model_images_backup.keras')

In [63]:
# Build a model with the best hyper parameters
model_train = build_model(best_hp)

In [64]:
# Train model with best hyper parameters
model_train.fit(
    train_generator,
    epochs=EPOCHS,
    validation_data=validation_generator,
    callbacks=[
        EarlyStopping(monitor='val_loss', mode='min', patience=10, restore_best_weights=True),
        ModelCheckpoint('/content/drive/MyDrive/model_images.keras', monitor='val_loss', save_best_only=True, save_weights_only=False),
        ReduceLROnPlateau(monitor='val_loss', mode='min', factor=0.1, patience=5, min_lr=1e-6)
    ],
    verbose=1
)

Epoch 1/200
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m197s[0m 807ms/step - binary_accuracy: 0.5306 - f1_score: 0.0490 - loss: 10.9574 - pr_auc_score: 0.0760 - precision: 0.0822 - recall: 0.5086 - roc_auc_score: 0.4958 - val_binary_accuracy: 0.6388 - val_f1_score: 0.0904 - val_loss: 9.0646 - val_pr_auc_score: 0.0605 - val_precision: 0.0868 - val_recall: 0.6131 - val_roc_auc_score: 0.5058 - learning_rate: 1.0000e-05
Epoch 2/200
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m172s[0m 745ms/step - binary_accuracy: 0.6264 - f1_score: 0.0814 - loss: 10.4339 - pr_auc_score: 0.0792 - precision: 0.1049 - recall: 0.5210 - roc_auc_score: 0.5095 - val_binary_accuracy: 0.7188 - val_f1_score: 0.1533 - val_loss: 8.6088 - val_pr_auc_score: 0.0658 - val_precision: 0.1118 - val_recall: 0.6227 - val_roc_auc_score: 0.5159 - learning_rate: 1.0000e-05
Epoch 3/200
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m173s[0m 748ms/step - binary_accuracy: 0.6848 - f1_score:

<keras.src.callbacks.history.History at 0x7d8210404d90>

In [65]:
# Evaluate the best model
results = model_train.evaluate(test_generator, verbose=1)
print(f"Test Loss: {results[0]}")
print(f"Test Binary Accuracy: {results[1]}")
print(f"Test F1: {results[2]}")
print(f"Test AUC ROC: {results[3]}")
print(f"Test AUC PR: {results[4]}")
print(f"Test Precision: {results[5]}")
print(f"Test Recall: {results[6]}")

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 640ms/step - binary_accuracy: 0.7590 - f1_score: 0.2204 - loss: 7.0879 - pr_auc_score: 0.1320 - precision: 0.1489 - recall: 0.6895 - roc_auc_score: 0.6539
Test Loss: 6.831550598144531
Test Binary Accuracy: 0.7579203248023987
Test F1: 0.23392713069915771
Test AUC ROC: 0.6615680456161499
Test AUC PR: 0.11598269641399384
Test Precision: 0.14783450961112976
Test Recall: 0.6951080560684204


In [66]:
# Save the best model to Google Drive
model_train.save('/content/drive/MyDrive/final_model_images.keras')

# Fine-Tune Model

In [67]:
# Define blocks to fine-tune
blocks_to_fine_tune = ['13', '12', '11', '10', '9','8','7','6','5','4','3','2','1']

# Initialise the best results
current_best_loss = results[0]
current_best_binary_accuracy = results[1]
current_best_f1 = results[2]
current_best_auc_roc = results[3]
current_best_auc_pr = results[4]
current_best_precision = results[5]
current_best_recall = results[6]

# Initialise an empty list to keep track of fine-tuned blocks
fine_tuned_blocks = []

# Retrieve the base_model once before the loop
base_model = None
for layer in model_train.layers:
    if isinstance(layer, tf.keras.Model):
        base_model = layer
        break

if base_model is None:
    raise ValueError("Base model not found within model_train.layers.")

# Fine-tune block-wise
for block in blocks_to_fine_tune:
    print(f"Fine-tuning block: {block}")

    # Add the current block to the list of fine-tuned blocks
    fine_tuned_blocks.append(block)

    # Set layers trainable based on the accumulated fine-tuned blocks
    for layer in base_model.layers:
        if any(b in layer.name for b in fine_tuned_blocks) and not isinstance(layer, tf.keras.layers.BatchNormalization):
            layer.trainable = True
        else:
            layer.trainable = False

    # Compile the model after updating the trainable layers
    model_train.compile(
        optimizer=AdamW(learning_rate=1e-7),
        loss=custom_binary_crossentropy(POS_WEIGHT, label_weights),
        metrics=[
            'binary_accuracy',
            tf.keras.metrics.F1Score(name='f1_score', average='micro'),
            tf.keras.metrics.AUC(multi_label=True, curve='ROC', name='roc_auc_score'),
            tf.keras.metrics.AUC(multi_label=True, curve='PR', name='pr_auc_score'),
            tf.keras.metrics.Precision(name='precision'),
            tf.keras.metrics.Recall(name='recall')
        ]
    )

    # Train the model
    history = model_train.fit(
        train_generator,
        epochs=EPOCHS,
        validation_data=validation_generator,
        callbacks=[
            EarlyStopping(monitor='val_loss', mode='min', patience=10, restore_best_weights=True),
            ModelCheckpoint('/content/drive/MyDrive/model_images.keras', monitor='val_loss', save_best_only=True, save_weights_only=False),
            ReduceLROnPlateau(monitor='val_loss', mode='min', factor=0.1, patience=5, min_lr=1e-8)
        ],
        verbose=1
    )

    # Evaluate the model on the test set
    results = model_train.evaluate(test_generator, verbose=1)
    print(f"Test Loss: {results[0]}")
    print(f"Test Binary Accuracy: {results[1]}")
    print(f"Test F1: {results[2]}")
    print(f"Test AUC ROC: {results[3]}")
    print(f"Test AUC PR: {results[4]}")
    print(f"Test Precision: {results[5]}")
    print(f"Test Recall: {results[6]}")

    # Check if the current model is better
    if results[0] < current_best_loss:
        current_best_loss = results[0]
        current_best_binary_accuracy = results[1]
        current_best_f1 = results[2]
        current_best_auc_roc = results[3]
        current_best_auc_pr = results[4]
        current_best_precision = results[5]
        current_best_recall = results[6]

        model_train.save('/content/drive/MyDrive/final_model_images.keras')
    else:
        print(f"No improvement in loss after fine-tuning block {block}. Stopping further fine-tuning.")
        break

Fine-tuning block: 13
Epoch 1/200
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m195s[0m 801ms/step - binary_accuracy: 0.7749 - f1_score: 0.1914 - loss: 7.3039 - pr_auc_score: 0.2344 - precision: 0.2060 - recall: 0.6863 - roc_auc_score: 0.7675 - val_binary_accuracy: 0.7642 - val_f1_score: 0.2292 - val_loss: 6.5899 - val_pr_auc_score: 0.1104 - val_precision: 0.1437 - val_recall: 0.6989 - val_roc_auc_score: 0.6555 - learning_rate: 1.0000e-07
Epoch 2/200
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m173s[0m 748ms/step - binary_accuracy: 0.7740 - f1_score: 0.1968 - loss: 7.3300 - pr_auc_score: 0.2325 - precision: 0.2039 - recall: 0.6825 - roc_auc_score: 0.7627 - val_binary_accuracy: 0.7639 - val_f1_score: 0.2292 - val_loss: 6.5915 - val_pr_auc_score: 0.1108 - val_precision: 0.1436 - val_recall: 0.6989 - val_roc_auc_score: 0.6554 - learning_rate: 1.0000e-07
Epoch 3/200
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m173s[0m 748ms/step - binary_accuracy

In [68]:
print(f"Best Test Loss: {current_best_loss}")
print(f"Best Test Binary Accuracy: {current_best_binary_accuracy}")
print(f"Best Test F1: {current_best_f1}")
print(f"Best Test AUC ROC: {current_best_auc_roc}")
print(f"Best Test AUC PR: {current_best_auc_pr}")
print(f"Best Test Precision: {current_best_precision}")
print(f"Best Test Recall: {current_best_recall}")

Best Test Loss: 6.814630508422852
Best Test Binary Accuracy: 0.7593255639076233
Best Test F1: 0.23082876205444336
Best Test AUC ROC: 0.6639871597290039
Best Test AUC PR: 0.1207294687628746
Best Test Precision: 0.14879649877548218
Best Test Recall: 0.6962457299232483
