In [1]:
!nvidia-smi -L

GPU 0: Tesla V100-SXM2-16GB (UUID: GPU-2fdfef1a-2abe-ed33-6d15-6e83dc187b98)


In [22]:
######
from PIL import Image
import imageio
######
from sklearn.model_selection import train_test_split
######
import torch
import torch.nn as nn
import torchvision.transforms as T
import torchvision.transforms.functional as TF
######
from tqdm.notebook import tqdm
import os
###### 
import time
import random
import numpy as np
import pandas as pd
# You can comment the lines below if you want to see warnings :)
import warnings 
warnings.filterwarnings("ignore")

# Load Data Locally

If you are using colab, I advise you to run these cells, it download the entire dataset and put it directly in the "virtual machine". It can speed up loading data because loading data from your google drive is much slower than loading it directly from the machine.

In [None]:
!pip install --upgrade --force-reinstall --no-deps kaggle

In [3]:
from google.colab import files
# Here you should upload your Kaggle API key (see : https://www.kaggle.com/docs/api (Authentification paragraph))
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"negalov","key":"f2d706af1447ced98029686098226cba"}'}

In [None]:
# Kaggle API identification
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets list

In [5]:
# Download the entire dataset in zip format
! kaggle competitions download -c 3md3070-dlmi

Downloading 3md3070-dlmi.zip to /content
 97% 278M/286M [00:06<00:00, 33.2MB/s]
100% 286M/286M [00:06<00:00, 43.4MB/s]


In [None]:
# Unzip file
! unzip /content/3md3070-dlmi.zip

# Utils 

In [10]:
class LymphoDataset(torch.utils.data.Dataset):
    def __init__(self, orig, img_list, ages, concentration, labels, transforms=None):
        self.transforms = transforms
        self.img_list = img_list
        self.img_dict = {idx: {'paths': [orig + name + '/' + img_name for img_name in os.listdir(orig + name)],
                               'age': ages[idx],
                               'concentration': concentration[idx],
                               'label': labels[idx],
                               'name': name} for idx, name in enumerate(img_list)}

    def load_image(self, path):
        image = imageio.imread(path)
        return image


    def __getitem__(self, image_id):   
        images = [self.load_image(path) for path in self.img_dict[image_id]['paths']]
        label = self.img_dict[image_id]['label']
        age = self.img_dict[image_id]['age']
        concentration = self.img_dict[image_id]['concentration']
        name = self.img_dict[image_id]['name']

        # Convert everything to tensor
        concentration = torch.tensor([concentration], dtype=torch.float32)
        age = torch.as_tensor([age], dtype=torch.float32)
        label = torch.as_tensor([label], dtype=torch.float32)

        if self.transforms:
            images = torch.cat([self.transforms(image)[None, :, :, :] for image in images], axis=0)
        
        data = {"age": age,
                "concentration": concentration,
                "images": images,
                "label": label,
                "name": name}
        return data

    def __len__(self):
        return len(self.img_list)

In [12]:
def get_transform(train):
    transforms = []
    # converts the image, a PIL image, into a PyTorch Tensor
    transforms.append(T.ToPILImage())
    transforms.append(T.Resize(224))
    if train:
      # You can add here some data augmentation :)
      pass # Remove this if you add data augmentation
    transforms.append(T.ToTensor())
    return T.Compose(transforms)
    
def gen_list(key, batch):
  if key == 'images':
    return [d[key][:, :, :, :] for d in batch]
  elif key =='name':
    return [d[key] for d in batch]
  else:
    return [d[key][None, :] for d in batch]

def custom_fn(batch):
  elem = batch[0].keys()
  return {key : torch.cat(gen_list(key, batch), axis=0) if key != 'name' else gen_list(key, batch) for key in elem}

In [13]:
def return_data(df, indices):
  ages = []
  concentration = []
  labels = []
  for indice in indices:
    age, conc, label = df.loc[indice, 'AGE'], df.loc[indice, 'LYMPH_COUNT'], df.loc[indice, 'LABEL']
    ages.append(age)
    concentration.append(conc)
    labels.append(label)
  return {'indice': indices, 'age': ages, 'concentration':concentration, 'label':labels}

# Example

I'm going to give an overview of the framework

In [14]:
# First, let's load the csv file that contains the patient ID, age, label & lympho count
df = pd.read_csv("/content/drive/MyDrive/DLMI_Challenge/clinical_data_clean.csv")

In [16]:
df.head(4)

Unnamed: 0,ID,LABEL,GENDER,LYMPH_COUNT,AGE
0,P26,1,1,11.2,87
1,P183,1,1,12.8,78
2,P89,1,1,9.6,85
3,P123,1,1,122.6,90


In [18]:
# Now I will consider only the training data
df = df[df.LABEL > -0.5].reset_index(drop=True) # patient ID with label = -1 are the one that belongs to the test file
df.head(4)

Unnamed: 0,ID,LABEL,GENDER,LYMPH_COUNT,AGE
0,P26,1,1,11.2,87
1,P183,1,1,12.8,78
2,P89,1,1,9.6,85
3,P123,1,1,122.6,90


In [19]:
orig = '/content/trainset/' # This is the path to the patient folders containing all the images

In [26]:
# Let's now split our dataframe into train/val
train_id_patient, val_id_patient = train_test_split(df.ID.tolist(), test_size=0.25, random_state=42)

In [28]:
# You can have a look on the output of each file
train_id_patient[:3]

['P131', 'P64', 'P147']

In [29]:
# For the train set
data_train = return_data(df.set_index(['ID']), train_id_patient)

In [31]:
# Have a look on data_train (it is a dict with 4 keys (indice, age, concentration, label))
print(data_train['indice'][:3])
print(data_train['age'][:3])
print(data_train['concentration'][:3])
print(data_train['label'][:3])

['P131', 'P64', 'P147']
[77, 61, 57]
[4.43, 4.28, 175.71]
[1, 0, 1]


In [32]:
# Now creating the train_loader
img_list, ages, concentration, labels = data_train['indice'], data_train['age'], data_train['concentration'], data_train['label']
train = LymphoDataset(orig, img_list, ages, concentration, labels, get_transform(True))
train_loader = torch.utils.data.DataLoader(
    train, batch_size=1, shuffle=True, num_workers=8, collate_fn=custom_fn)

I set batch_size to 1 because you'll have a memory issue otherwise. Indeed, there are some patient that have more than 100 images (the maximum is 198 images). but there is a walk around to work with larger batch size, you'll see :)

In [33]:
# We can do the same thing with validation 
data_valid = return_data(df.set_index(['ID']), val_id_patient)
img_list, ages, concentration, labels = data_valid['indice'], data_valid['age'], data_valid['concentration'], data_valid['label']
valid = LymphoDataset(orig, img_list, ages, concentration, labels, get_transform(False))
valid_loader = torch.utils.data.DataLoader(
    valid, batch_size=1, shuffle=False, num_workers=8, collate_fn=custom_fn)

In [37]:
### EXAMPLES ###
start = time.time()
for idx, batch in enumerate(train_loader):
  test = batch
print(f"Time for loading all the train images : {round(time.time() - start, 4)} seconds")

Time for loading all the train images : 10.5607 seconds


It will probably took 30/40 seconds if you add data augmentation :) \
Now let's have a look on one of the element returned by train_loader

In [41]:
test

{'age': tensor([[28.]]),
 'concentration': tensor([[4.3100]]),
 'images': tensor([[[[0.7608, 0.8000, 0.8667,  ..., 0.7216, 0.7137, 0.6941],
           [0.7843, 0.8196, 0.8824,  ..., 0.7490, 0.7333, 0.7098],
           [0.7882, 0.8275, 0.8863,  ..., 0.7804, 0.7569, 0.7255],
           ...,
           [0.9804, 0.9686, 0.9569,  ..., 0.9686, 0.9725, 0.9725],
           [0.9686, 0.9725, 0.9725,  ..., 0.9686, 0.9725, 0.9725],
           [1.0000, 1.0000, 0.9882,  ..., 0.9686, 0.9725, 0.9686]],
 
          [[0.6235, 0.6745, 0.7529,  ..., 0.5961, 0.5882, 0.5686],
           [0.6471, 0.6941, 0.7686,  ..., 0.6275, 0.6078, 0.5843],
           [0.6510, 0.7059, 0.7765,  ..., 0.6588, 0.6353, 0.6039],
           ...,
           [0.8784, 0.8706, 0.8588,  ..., 0.8667, 0.8706, 0.8706],
           [0.8510, 0.8549, 0.8549,  ..., 0.8745, 0.8784, 0.8784],
           [0.8627, 0.8667, 0.8588,  ..., 0.8706, 0.8745, 0.8706]],
 
          [[0.6510, 0.6784, 0.7216,  ..., 0.6392, 0.6392, 0.6275],
           [0.6706

In [None]:
# Element              # Representation # shape
#test['age']           # Age            # (1,1)
#test['concentration'] # Concentration  # (1,1)
#test['images'].       # Images         # (num_images, 3, 224, 224)
#test['name']          # name           # (1)
#test['label']         # Label          # (1,1)

Now let's imagine you want to use mini batch gradient descent (backward on multiple element (batch_size > 1)) instead of a stochastic gradient descent (batch_size = 1)), I will show you a walk around to do that :)

In [None]:
def train_walk_around(train_loader, model, epochs, batch_size, criterion, optimizer):
  for epoch_no in range(self.epochs):  
    model.train()
    total_loss = 0
    outputs = []
    labels = []
    i = 0
    for idx, batch in enumerate(train_loader):
      img, label = batch['images'].cuda(), batch['label'].cuda()
      output, _ = model.get_outputs(img)
      outputs.append(output)
      labels.append(label)
      i += 1
      if i < batch_size and idx < len(train_loader)-1: 
          continue
      outputs = torch.cat(outputs, dim=0)
      labels = torch.cat(labels, dim=0)
      loss = criterion(outputs, labels)
      loss.backward()
      optimizer.step()
      optimizer.zero_grad()

      total_loss += loss.item()

      outputs = []
      labels = []
      i = 0

How does it work ? \
Code Explanation

In [None]:
def train_walk_around(train_loader, model, epochs, batch_size, criterion, optimizer):
  for epoch_no in range(self.epochs):  
    model.train()
    total_loss = 0
    outputs = [] # List to store the outputs of the model 
    labels = [] # List to store the true label 
    i = 0 # For incrementation
    for idx, batch in enumerate(train_loader):
      img, label = batch['images'].cuda(), batch['label'].cuda() # Get the images and the label and transfer to GPU
      output = model.get_outputs(img) # Get the output of the model 
      outputs.append(output) # store it in outputs
      labels.append(label) # store it in labels
      i += 1 # Incrementation
      if i < batch_size and idx < len(train_loader)-1: 
        # Now if the number of patient (representend by bag of images) on which we ran the model
        # is less than the batch size, we won't backpropagate and we will skip the lines of code below
        # and move to the next patient
        continue
      # Now that we have ran our model on batch_size patients we can backpropagate.
      outputs = torch.cat(outputs, dim=0)
      labels = torch.cat(labels, dim=0)
      loss = criterion(outputs, labels)
      loss.backward()
      optimizer.step()
      optimizer.zero_grad()

      total_loss += loss.item()

      outputs = []
      labels = []
      i = 0

So this is the naive framework, the issue here is that you need a small model (resnet34 for instance) if you have less than 16/17 gb of VRAM (GPU RAM). \