In [None]:
# If these packages are not installed yet, uncomment and run the lines below first.
# pip install torch torchvision
# pip install pillow

In [24]:
import os
import pandas as pd
import numpy as np
from torch.utils.data import Dataset
import glob
from pathlib import Path
import torch
import torchvision.models as models
from PIL import Image
from torchvision import transforms
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
import torch.optim as optim
import torch.nn as nn
from sklearn.model_selection import GroupShuffleSplit


In [25]:
# Detect the hosting environment from the marker variables each platform exports.
IN_KAGGLE = os.environ.get("KAGGLE_KERNEL_RUN_TYPE") is not None
IN_COLAB = os.environ.get("COLAB_GPU") is not None

In [None]:
# Colab only: download and unpack the competition data.
# Before running this cell, make sure your Kaggle key and Kaggle username are saved in Colab's secrets.
if IN_COLAB:
  from google.colab import userdata
  # Export the stored credentials as environment variables before invoking the kaggle CLI below.
  os.environ["KAGGLE_KEY"] = userdata.get('KAGGLE_KEY')
  os.environ["KAGGLE_USERNAME"] = userdata.get('KAGGLE_USERNAME')
  ! kaggle competitions download isic-2024-challenge
  ! unzip isic-2024-challenge.zip
  IMG_PATH = "/content/train-image/image"
  CSV_PATH = "/content/train-metadata.csv"


In [27]:
# Kaggle only: point at the competition dataset mounted under /kaggle/input.
# NOTE(review): when neither IN_KAGGLE nor IN_COLAB is true, IMG_PATH and CSV_PATH
# are never assigned by the cells shown here — confirm they are set elsewhere for local runs.
if IN_KAGGLE:
  IMG_PATH = "/kaggle/input/isic-2024-challenge/train-image/image"
  CSV_PATH = "/kaggle/input/isic-2024-challenge/train-metadata.csv"


In [28]:
def get_train_file_path(image_id):
    """Return the path of the JPEG for ``image_id`` inside ``IMG_PATH``."""
    return "{}/{}.jpg".format(IMG_PATH, image_id)

In [None]:
# Load the metadata and attach the image path of every row.
df = pd.read_csv(CSV_PATH)

# All JPEG files that actually exist in the image folder.
train_images = sorted(glob.glob(f"{IMG_PATH}/*.jpg"))

# Derive each row's expected image path from its isic_id.
df['file_path'] = df['isic_id'].map(get_train_file_path)

# Keep only the rows whose image is present on disk.
df = df.loc[df["file_path"].isin(train_images)].reset_index(drop=True)
df.head()


In [None]:
# Optional: build a small subsample of df to check that the whole process works end to end.
# print("          df.shape, # of positive cases, # of patients")
# print("original>", df.shape, df.target.sum())
# df_positive = df[df["target"] == 1].reset_index(drop=True)
# df_negative = df[df["target"] == 0].reset_index(drop=True)
# small_df = pd.concat([df_positive, df_negative.iloc[:df_positive.shape[0]*20, :]])  # positive:negative = 1:20
# print("filtered>", small_df.shape, df.target.sum())

In [10]:
def split_by_patients(train_data_frame, target_column='target', patient_column='patient_id', train_size=0.85, drop_columns=False):
    '''
    Split a data frame into train and test parts so that no patient appears in both.

    The split is group-based only: GroupShuffleSplit is not stratified, so the share of
    positive cases in each part is whatever the random patient assignment yields. The
    printed statistics let the caller check how balanced the result turned out.

    :param train_data_frame: Data frame holding the patient IDs and targets. It is not modified.
    :param target_column: Name of the target column, 'target' is default.
    :param patient_column: Name of the patient column, 'patient_id' is default.
    :param train_size: Fraction of patients assigned to the training set, 0.85 is default.
    :param drop_columns: When True, the target and patient columns are removed from the
        returned x_train / x_test frames.
    :return: A tuple of 4: x_train, y_train, x_test, y_test (the y values are lists).
    '''
    targets = train_data_frame[target_column]
    patients = train_data_frame[patient_column]

    # Drop the columns named by the parameters (not hard-coded names), on a copy so the
    # caller's frame stays intact.
    if drop_columns:
        features = train_data_frame.drop(columns=[target_column, patient_column])
    else:
        features = train_data_frame

    # Split by patient groups; y is passed for API symmetry but does not influence the split.
    gss = GroupShuffleSplit(n_splits=1, train_size=train_size, random_state=42)
    train_idx, test_idx = next(gss.split(features, y=targets, groups=patients))

    # The splitter yields positional indices, so use iloc everywhere. Label-based
    # targets[i] would pick wrong rows (or raise) on a non-default index.
    x_train, x_test = features.iloc[train_idx], features.iloc[test_idx]
    y_train = targets.iloc[train_idx].tolist()
    y_test = targets.iloc[test_idx].tolist()

    # Print split stats
    total_rows = train_data_frame.shape[0]
    train_pct = x_train.shape[0] * 100 / total_rows
    print(f'Data split: {train_pct}, {100 - train_pct}')

    total_positives = int(targets.sum())
    if total_positives:
        positives_pct = int(sum(y_train)) * 100 / total_positives
        print(f'Positives cases split: {positives_pct}, {100 - positives_pct}')
    else:
        # Avoid dividing by zero when the data contains no positive case at all.
        print('Positives cases split: n/a (no positive cases in the data)')

    return x_train, y_train, x_test, y_test

In [11]:
# Split into train and validation sets by patient.
# The target lists are unused here; they get explicit throwaway names because assigning
# to `_` / `__` would overwrite IPython's output-history variables.
train_df, _train_targets, valid_df, _valid_targets = split_by_patients(df)

Data split: 84.60401088119204, 15.395989118807961
Positives cases split: 85.49618320610686, 14.503816793893137


In [15]:
def calculate_auc(y_test, y_pred):
    """Return the part of the ROC area that lies above a true-positive rate of 0.8.

    Computed as the full area under the ROC curve minus the area under the same curve
    with its TPR values capped at 0.8.
    """
    fpr, tpr, _thresholds = roc_curve(y_test, y_pred)
    capped_tpr = np.minimum(tpr, 0.8)
    full_area = auc(fpr, tpr)
    capped_area = auc(fpr, capped_tpr)
    return full_area - capped_area

In [16]:

class ISICDataset(Dataset):
    """Dataset that yields ``(image, target)`` pairs from a metadata frame.

    The frame must already contain a "file_path" column (image location) and a
    "target" column (label).
    """

    def __init__(self, df: pd.DataFrame, transform=None):
        self.df = df
        self.transform = transform
        # Cache both columns as arrays for direct positional access in __getitem__.
        self.file_names = df["file_path"].values
        self.targets = df["target"].values

    def __len__(self):
        # One sample per metadata row.
        return len(self.df)

    def __getitem__(self, index: int):
        # Always hand RGB images to the transform, whatever the file's mode.
        image = Image.open(self.file_names[index]).convert("RGB")
        label = self.targets[index]

        if self.transform:
            image = self.transform(image)

        return image, int(label)

In [17]:
# Side length (pixels) the images are resized to before entering the network.
IMG_SIZE = 224
# Number of samples per DataLoader batch.
BATCH_SIZE = 32
# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

def load_model(model_name):
    """Build a pretrained model with a single-logit head, plus its optimizer and loss.

    :param model_name: Architecture to load; only "vgg16" is implemented.
    :return: A tuple ``(model, optimizer, criterion)`` where the model lives on DEVICE,
        the optimizer is Adam (lr=0.001) and the criterion is BCEWithLogitsLoss.
    :raises ValueError: If ``model_name`` is not a supported architecture.
    """
    # Fail early with a clear message instead of hitting an unbound `model` below.
    if model_name != "vgg16":
        raise ValueError(f"Unsupported model_name {model_name!r}; only 'vgg16' is implemented")

    # Newer torchvision releases deprecate `pretrained=True` in favour of `weights=`;
    # fall back to the old keyword when the weights enum is not available.
    weights_enum = getattr(models, "VGG16_Weights", None)
    if weights_enum is not None:
        model = models.vgg16(weights=weights_enum.IMAGENET1K_V1)
    else:
        model = models.vgg16(pretrained=True)

    # Replace the final classifier layer with a linear layer that has one output neuron,
    # sized from the number of features produced by the previous layer.
    model.classifier[6] = torch.nn.Linear(model.classifier[6].in_features, 1)

    model = model.to(DEVICE)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.BCEWithLogitsLoss()
    return model, optimizer, criterion

In [18]:
def _resize_to_tensor():
    """Build the preprocessing pipeline: resize to IMG_SIZE x IMG_SIZE, then convert to a tensor."""
    # NOTE(review): no mean/std normalisation is applied although the model starts from
    # pretrained weights — confirm that omitting it is intentional.
    steps = [
        transforms.Resize((IMG_SIZE, IMG_SIZE)),
        transforms.ToTensor(),
    ]
    return transforms.Compose(steps)


# Datasets for the train and validation splits (each gets its own transform instance).
train_dataset = ISICDataset(train_df, transform=_resize_to_tensor())
valid_dataset = ISICDataset(valid_df, transform=_resize_to_tensor())

# Data loaders: shuffle the training data each epoch, keep validation order fixed.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
valid_loader = DataLoader(valid_dataset, shuffle=False, batch_size=BATCH_SIZE)

In [21]:
def train(model, train_loader, optimizer, criterion):
    """
    Train the model for one pass over train_loader.

    :param model: Network producing one logit per sample.
    :param train_loader: Iterable yielding (images, targets) batches.
    :param optimizer: Optimizer updating the model's parameters.
    :param criterion: Loss taking (logits, float targets of shape [batch, 1]).
    :return: A tuple (total_loss, auc): the sum of the per-batch losses and the value
        returned by calculate_auc on all targets/probabilities seen during the pass.
    """
    total_loss = 0
    all_targets = []
    all_probs = []
    model.train()
    # Each iteration yields a batch of images and the matching vector of labels.
    for batch_number, (images, targets) in enumerate(train_loader, start=1):
        images = images.to(DEVICE)
        targets = targets.to(DEVICE)

        targets = targets.unsqueeze(1)  # add a dim so the targets look like [[1],[0]..]
        targets = targets.float()  # BCEWithLogitsLoss requires float targets

        optimizer.zero_grad()
        output = model(images)
        loss = criterion(output, targets)  # loss of this batch (a scalar tensor)
        total_loss += loss.item()

        # Probabilities predicted for this batch, detached from the graph for the metric.
        probs = torch.sigmoid(output).detach().cpu().numpy()
        all_targets.extend(targets.detach().cpu().numpy().flatten())
        all_probs.extend(probs.flatten())

        loss.backward()  # compute the gradient of the loss for every parameter
        optimizer.step()  # adjust the parameters accordingly

        # Progress report every 1000 batches, printing the real number of batches done.
        if batch_number % 1000 == 0:
            print(f"{batch_number} baches already passed to the model")

    auc = calculate_auc(np.array(all_targets), np.array(all_probs))
    return total_loss, auc

def val(model, val_loader, criterion):
    """Evaluate the model on val_loader with gradients disabled.

    :return: A tuple (total_loss, auc): the sum of the per-batch losses and the value
        returned by calculate_auc on all targets/probabilities of the loader.
    """
    model.eval()
    running_loss = 0
    seen_labels, seen_probs = [], []

    with torch.no_grad():
        for images, labels in val_loader:
            images = images.to(DEVICE)
            # Shape the labels as [batch, 1] floats, as BCEWithLogitsLoss requires.
            labels = labels.to(DEVICE).unsqueeze(1).float()

            logits = model(images)
            running_loss += criterion(logits, labels).item()

            batch_probs = torch.sigmoid(logits).cpu().detach().numpy()
            seen_labels.extend(labels.cpu().detach().numpy().flatten())
            seen_probs.extend(batch_probs.flatten())

    pauc = calculate_auc(np.array(seen_labels), np.array(seen_probs))
    return running_loss, pauc



In [None]:
EXP_ID = 1
MODEL_NAME = "vgg16"
NUM_EPOCHS = 7
# Used for the experiment name only; DataLoaders created earlier keep the batch size
# they were built with.
BATCH_SIZE = 32
# NOTE="with_external_db"
EXP_NAME = "{:03}_{}_{}_{}".format(EXP_ID, MODEL_NAME, NUM_EPOCHS, BATCH_SIZE)  # you can name your experiment whatever you like
SAVE_PATH = "/kaggle/working"

# Build the model named in the config above so the experiment name and the model agree.
model, optimizer, criterion = load_model(MODEL_NAME)

## training loop
best_val_pauc = 0
for epoch in range(NUM_EPOCHS):
    print(f"start epoch:{epoch}")
    train_loss, train_pauc = train(model, train_loader, optimizer, criterion)
    print(f"Epoch {epoch} has finished")
    print(f"start to predict the validation set, the results  are:")
    valid_loss, valid_pauc = val(model, valid_loader, criterion)

    # Save the best model so far (judged by the validation metric).
    if valid_pauc > best_val_pauc:
        best_val_pauc = valid_pauc
        os.makedirs(f"{SAVE_PATH}/{EXP_NAME}", exist_ok=True)
        torch.save(model.state_dict(), f"{SAVE_PATH}/{EXP_NAME}/best_all.pth")
        # The log lines must use valid_loss / valid_pauc: val_loss and val_pauc are never defined.
        print(f"Epoch {epoch}, train_loss {train_loss:.4f}, train_pauc {train_pauc}, valid_loss {valid_loss:.4f}, valid_pauc {valid_pauc} --> Best valid_pauc {valid_pauc} at epoch {epoch}")
    else:
        print(f"Epoch {epoch}, train_loss {train_loss:.4f}, train_pauc {train_pauc:.2f}, val_loss {valid_loss:.4f}, val_pauc {valid_pauc:.2f}")


