In [None]:
## Import dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import plotly.offline as offline

import os
import pathlib
import gc
import sys
import re
import math
import random
import time
import datetime as dt
from tqdm import tqdm
from pprint import pprint

import warnings
warnings.filterwarnings('ignore')

import torch
from torch import nn
import torch.nn.functional as F
import torchvision
from torchvision import transforms
import torchvision.transforms.functional as TF
from torchvision.models import resnet18
!pip install torchinfo -q --user
from torchinfo import summary

from PIL import Image

print('import done!')

In [None]:
## For reproducible results
def seed_all(s):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's global generator and PyTorch (CPU and
    all CUDA devices), and asks cuDNN for deterministic kernels so that two
    runs with the same seed follow the same code paths.

    Args:
        s: Integer seed shared by all generators.
    """
    random.seed(s)
    np.random.seed(s)
    torch.manual_seed(s)
    # manual_seed_all covers every GPU, not only the currently selected one.
    torch.cuda.manual_seed_all(s)
    # Identical seeds are not enough if cuDNN autotunes or picks
    # non-deterministic kernels, so pin both switches.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Only influences interpreters started after this point; the hash seed of
    # the running process was fixed when it started.
    os.environ['PYTHONHASHSEED'] = str(s)
    print('Seeds setted!')

# Seed value handed to seed_all(), which applies it to all generators it covers.
global_seed = 42
seed_all(global_seed)

In [None]:
## Data loading: dataset locations first, then the two CSV files.
data_config = dict(
    train_csv_path='../input/uw-madison-gi-tract-image-segmentation/train.csv',
    train_folder_path='../input/uw-madison-gi-tract-image-segmentation/train',
    test_folder_path='../input/uw-madison-gi-tract-image-segmentation/test',
    sample_submission_path='../input/uw-madison-gi-tract-image-segmentation/sample_submission.csv',
)

# Read the training table and the sample submission from the configured paths.
train_df, submission_df = (
    pd.read_csv(data_config[path_key])
    for path_key in ('train_csv_path', 'sample_submission_path')
)

print(f'train_length: {len(train_df)}')
print(f'submission_length: {len(submission_df)}')

In [None]:
## Null value check
## DataFrame.info() writes its report to stdout and returns None, so it is
## called directly; wrapping it in print() would add a stray "None" line.
print('train_df.info()')
train_df.info()
print()

train_df.head()

In [None]:
## Split each 'id' string into its parts and store them as new id columns.
## This cell can take a couple of minutes to execute.

# {scans-folder path: {slice number: [file names]}}; filled lazily so that each
# folder is listed once instead of once per row of the training table.
_SCAN_INDEX_CACHE = {}


def _scan_index(p_folder):
    """Return a {slice number: [file names]} index for one ``scans`` folder."""
    cache_key = str(p_folder)
    if cache_key not in _SCAN_INDEX_CACHE:
        index = {}
        for p in p_folder.iterdir():
            # name[6:10] is the part of the file name that is matched against
            # the slice number taken from the id.
            index.setdefault(p.name[6:10], []).append(p.name)
        _SCAN_INDEX_CACHE[cache_key] = index
    return _SCAN_INDEX_CACHE[cache_key]


def create_id_list(text, p_train=None):
    """Split an ``id`` string into its parts and look up the matching scan file.

    Args:
        text: Id made of four '_'-separated fields; the first is expected to
            start with a 4-character prefix and the second with a 3-character
            prefix (those prefixes are stripped to form case_id / day_id).
        p_train: Root folder of the training scans. Defaults to
            ``data_config['train_folder_path']``, resolved at call time.

    Returns:
        ``[case_id, day_id, slice_id, case_folder, day_folder, slice_file]``
        followed by the name of every file in
        ``<p_train>/<case_folder>/<day_folder>/scans`` whose characters 6-9
        equal ``slice_id`` (normally exactly one; none if nothing matches).
    """
    if p_train is None:
        p_train = data_config['train_folder_path']

    t = text.split('_')

    case_id = t[0][4:]
    day_id = t[1][3:]
    slice_id = t[3]

    case_folder = t[0]
    day_folder = ('_').join([t[0], t[1]])
    slice_file = ('_').join([t[2], t[3]])

    p_folder = pathlib.Path(p_train) / case_folder / day_folder / 'scans'
    id_list = [case_id, day_id, slice_id, case_folder, day_folder, slice_file]
    # extend() copies the names, so the cached list is never handed out.
    id_list.extend(_scan_index(p_folder).get(slice_id, []))
    return id_list

def create_new_ids(dataframe, new_ids = ['case_id', 'day_id', 'slice_id', 'case_folder', 'day_folder', 'slice_file', 'file_name']):
    """Return a copy of ``dataframe`` with one new column per entry of ``new_ids``.

    Each value of the 'id' column is expanded with ``create_id_list``; element
    ``i`` of that list becomes the value of column ``new_ids[i]``.

    Args:
        dataframe: Frame with an 'id' column. It is not modified.
        new_ids: Names of the columns to create, in the order of the elements
            returned by ``create_id_list``. The default list is only read.

    Returns:
        A new DataFrame holding the original columns plus the new id columns.
    """
    id_lists = dataframe['id'].map(create_id_list)
    # Work on a copy so the caller's frame is not modified as a side effect.
    result = dataframe.copy()
    for i, item in enumerate(new_ids):
        # Bind i as a default argument so the lambda keeps this position.
        result[item] = id_lists.map(lambda x, i=i: x[i])
    return result

# Rebind train_df to the frame returned by create_new_ids, then preview it.
train_df = create_new_ids(train_df)
train_df.head()

In [None]:
## Flag rows that carry a segmentation (1: non-NaN segmentation, 0: NaN segmentation).
train_df['detection'] = train_df['segmentation'].notna().astype(int)
train_df.head()

In [None]:
# The row count is divided by 3 (presumably one row per class for every image).
total_img_n = len(train_df) // 3
print('The number of imgs: ', total_img_n)

In [None]:
## Calculate segmentation areas and img size.
def cal_pos_area(segmentation):
    """Return the number of pixels covered by a run-length encoded mask.

    The string alternates "start length start length ..."; only the lengths
    (every second field) are summed. Anything that is not a plain ``str``
    (e.g. NaN for a missing mask) counts as zero pixels.
    """
    if type(segmentation) is not str:
        return 0
    run_lengths = segmentation.split(' ')[1::2]
    return sum(int(length) for length in run_lengths)

def cal_total_area(file_name):
    """Return the pixel count encoded in a scan file name.

    The third and fourth '_'-separated fields of the name hold the two image
    dimensions, e.g. 'slice_0001_266_266_1.50_1.50.png' -> 266 * 266. The
    fields are located by splitting rather than by fixed character offsets,
    so dimensions with any number of digits are parsed correctly.

    Args:
        file_name: Scan file name with at least four '_'-separated fields.

    Returns:
        Product of the two dimensions as an int.
    """
    fields = file_name.split('_')
    first_dim = int(fields[2])
    second_dim = int(fields[3])
    return first_dim * second_dim

# Mask area and image area per row, then the mask coverage in percent.
train_df['pos_area'] = train_df['segmentation'].map(cal_pos_area)
train_df['total_area'] = train_df['file_name'].map(cal_total_area)
train_df['pos_area_percentage'] = train_df['pos_area'].div(train_df['total_area']).mul(100)

## Spot check of a few rows
train_df.iloc[1920:1930]

In [None]:
## One frame per organ class.
organ_class = train_df['class']
train_lb_df = train_df.loc[organ_class == 'large_bowel'].reset_index(drop=True)
train_sb_df = train_df.loc[organ_class == 'small_bowel'].reset_index(drop=True)
train_st_df = train_df.loc[organ_class == 'stomach'].reset_index(drop=True)

## Share of segmented pixels among all image pixels for each organ;
## whatever is left over is treated as background.
lb_area_ratio, sb_area_ratio, st_area_ratio = (
    organ_df['pos_area'].sum() / organ_df['total_area'].sum()
    for organ_df in (train_lb_df, train_sb_df, train_st_df)
)
bg_area_ratio = 1 - (lb_area_ratio + sb_area_ratio + st_area_ratio)

print(lb_area_ratio, sb_area_ratio, st_area_ratio, bg_area_ratio)

In [None]:
## Rows with a non-null 'segmentation' are positive samples, the rest negative.
has_segmentation = train_df['segmentation'].notna()
train_positive_df = train_df[has_segmentation].reset_index(drop=True)
train_negative_df = train_df[~has_segmentation].reset_index(drop=True)

# Positive samples broken down by organ class.
pos_lb_df, pos_sb_df, pos_st_df = (
    train_positive_df[train_positive_df['class'] == organ].reset_index(drop=True)
    for organ in ('large_bowel', 'small_bowel', 'stomach')
)

In [None]:
## Bar chart of the detection percentage (share of rows with a segmentation) for each class.
class_group = train_df.groupby(['class'])['detection'].mean() * 100

fig = px.bar(class_group)
# The quote of the style attribute is closed so the <span> tag is well-formed.
fig.update_layout(title = "<span style='font-size:36px;'>Detection Percentages (per total number of images) of Each Classes</span>",
                  yaxis_title = 'detection percentage')

In [None]:
## Histogram of the 'large_bowel' detection percentage per case_id.
lb_detection_mean = train_lb_df.groupby(['case_id'])['detection'].mean() * 100
fig = px.histogram(lb_detection_mean, nbins=25, marginal='box')
# The quote of the style attribute is closed so the <span> tag is well-formed.
fig.update_layout(title = "<span style='font-size:36px;'>Detection Percentage of 'large_bowel' in Each Case_ids</span>",
                  xaxis_title = 'detection percentage')

In [None]:
## Histogram of the 'small_bowel' detection percentage per case_id.
sb_detection_mean = train_sb_df.groupby(['case_id'])['detection'].mean() * 100
fig = px.histogram(sb_detection_mean, nbins=25, marginal='box')
# The quote of the style attribute is closed so the <span> tag is well-formed.
fig.update_layout(title = "<span style='font-size:36px;'>Detection Percentage of 'small_bowel' in Each Case_ids</span>",
                  xaxis_title = 'detection percentage')

In [None]:
## Train - Valid - Test split
## The split is made per case_id, so all images of one case end up in the same subset.

train_ratio = 0.85
valid_ratio = 0.10
test_ratio = 0.05

case_ids = train_df['case_id'].unique()
idxs = np.random.permutation(range(len(case_ids)))
cut_1 = int(train_ratio * len(idxs))
cut_2 = int((train_ratio + valid_ratio) * len(idxs))

# Shuffle the case ids, then cut the shuffled array at the two boundaries;
# everything after the second cut forms the test subset.
train_case_ids, valid_case_ids, test_case_ids = np.split(case_ids[idxs], [cut_1, cut_2])

case_column = train_df['case_id']
train = train_df[case_column.isin(train_case_ids)]
valid = train_df[case_column.isin(valid_case_ids)]
test = train_df[case_column.isin(test_case_ids)]

print(len(train), len(valid), len(test), len(train_df))

In [None]:
def _collect_scan_files(case_folders, train_root=None):
    """Return every scan PNG found below the given case folders.

    Args:
        case_folders: Iterable of case folder names located under ``train_root``.
        train_root: Root folder of the training scans. Defaults to
            ``data_config['train_folder_path']``.

    Returns:
        List of ``pathlib.Path`` objects matching ``**/scans/*.png``, sorted
        within each case folder.
    """
    if train_root is None:
        train_root = data_config['train_folder_path']
    p_train = pathlib.Path(train_root)

    files = []
    for case_folder in case_folders:
        # glob() yields entries in file-system order; sorting keeps the file
        # list (and therefore the sample order) the same from run to run.
        files.extend(sorted((p_train / case_folder).glob('**/scans/*.png')))
    return files


train_case_folders = train['case_folder'].unique()
train_files = _collect_scan_files(train_case_folders)

valid_case_folders = valid['case_folder'].unique()
valid_files = _collect_scan_files(valid_case_folders)

test_case_folders = test['case_folder'].unique()
test_files = _collect_scan_files(test_case_folders)

print(len(train_files), len(valid_files), len(test_files))


In [None]:
## Building Dataset and DataLoader
class UWMadison2022Dataset(torch.utils.data.Dataset):
    """Dataset of scan slices with optional one-hot segmentation labels.

    Args:
        files: Sequence of paths to scan image files laid out as
            ``.../<case_day folder>/scans/<slice file>``.
        dataframe: Optional frame with 'id', 'class' and 'segmentation'
            columns (one row per id and organ). When None, items carry no label.
        input_shape: Size passed to ``CenterCrop`` for both image and label.

    Items are ``(img, label, img_shape)`` when a dataframe is given and
    ``(img, img_shape)`` otherwise; ``label`` has 4 channels
    (background + the three organs, in ``ORGANS`` order).
    """

    # Label index i + 1 is assigned to ORGANS[i]; index 0 means "no organ".
    ORGANS = ('large_bowel', 'small_bowel', 'stomach')

    def __init__(self, files, dataframe=None, input_shape=256,):
        self.files = files
        self.df = dataframe
        self.input_shape = input_shape
        self.transforms = transforms.Compose([
            transforms.CenterCrop(self.input_shape),
            # Single-channel mean/std built by averaging three per-channel
            # values (they look like the usual ImageNet statistics).
            transforms.Normalize(mean=[(0.485+0.456+0.406)/3], std=[(0.229+0.224+0.225)/3]),
        ])

    def __len__(self):
        return len(self.files)

    @staticmethod
    def _rle_to_label_map(segmentations, height, width):
        """Decode per-organ run-length strings into a (height, width) class map.

        Args:
            segmentations: One entry per organ, in ``ORGANS`` order; either a
                "start length start length ..." string or a non-str value
                (e.g. NaN) when the organ has no mask.
            height, width: Size of the original, uncropped image.

        Returns:
            Float tensor of shape (height, width) holding 0 for background and
            ``i + 1`` for pixels of organ ``i``; later organs overwrite earlier
            ones where runs overlap.
        """
        label = torch.zeros(height * width)
        for class_idx, segmentation in enumerate(segmentations, start=1):
            if type(segmentation) is not str:
                continue
            fields = segmentation.split(' ')
            for j in range(len(fields) // 2):
                # Run starts in the competition's RLE format are 1-based;
                # subtract 1 to obtain 0-based positions in the flat tensor.
                start_idx = int(fields[j*2]) - 1
                span = int(fields[j*2 + 1])
                label[start_idx:(start_idx + span)] = class_idx
        return torch.reshape(label, (height, width))

    def __getitem__(self, idx):
        p_file = pathlib.Path(self.files[idx])
        raw = np.array(Image.open(p_file))
        img_shape = torch.tensor(raw.shape)
        # NOTE(review): the bit depth of the scans is not visible here; confirm
        # that dividing by 255 yields the intended value range.
        img = transforms.functional.to_tensor(raw) / 255.
        img = self.transforms(img)

        if self.df is None:
            return img, img_shape

        # Build the id from path components rather than fixed positions in the
        # '/'-split path, so it works for any root depth and path separator.
        case_day_id = p_file.parent.parent.name
        slice_id = p_file.name[:10]
        f_id = '_'.join([case_day_id, slice_id])
        labels_df = self.df.query('id == @f_id')

        # .item() requires exactly one row per (id, organ) pair.
        segmentations = [
            labels_df[labels_df['class'] == organ]['segmentation'].item()
            for organ in self.ORGANS
        ]
        label = self._rle_to_label_map(segmentations, raw.shape[0], raw.shape[1])
        label = transforms.CenterCrop(self.input_shape)(label)
        label = torch.nn.functional.one_hot(label.to(torch.int64), num_classes=4)
        # (H, W, C) -> (C, H, W) to match the image layout.
        label = label.permute(2, 0, 1)
        return img, label, img_shape

train_ds = UWMadison2022Dataset(train_files, train, input_shape=256)
valid_ds = UWMadison2022Dataset(valid_files, valid, input_shape=256)
test_ds = UWMadison2022Dataset(test_files, test, input_shape=256)

BATCH_SIZE = 32


def _describe_loader(title, loader, dataset):
    """Print the shapes of the first batch plus sample/batch counts; return that batch."""
    print(f'------ {title} ------')
    # The built-in next() works on any iterator; DataLoader iterators in
    # current PyTorch releases do not provide a .next() method.
    x, y, shape = next(iter(loader))
    print(f"x : {x.shape}")
    print(f"labels: {y.shape}")
    print(f"img_shapes: {shape.shape}")
    print(f"n_samples: {len(dataset)}")
    print(f"n_batches: {len(loader)}")
    print()
    return x, y, shape


## Checking dataset and dataloader
train_dl = torch.utils.data.DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
x, y, shape = _describe_loader('train_dl', train_dl, train_ds)

valid_dl = torch.utils.data.DataLoader(valid_ds, batch_size=BATCH_SIZE, shuffle=False, drop_last=True)
x, y, shape = _describe_loader('valid_dl', valid_dl, valid_ds)

test_dl = torch.utils.data.DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False, drop_last=False)
x, y, shape = _describe_loader('test_dl', test_dl, test_ds)

In [None]:
import tensorflow as tf
from tensorflow import keras

## Limit GPU memory in TensorFlow.
## By default TensorFlow reserves the full amount of available GPU memory at
## start-up; memory growth makes it allocate on demand instead.
physical_devices = tf.config.list_physical_devices('GPU')
if not physical_devices:
    print("Not enough GPU hardware devices available")
for device in physical_devices:
    tf.config.experimental.set_memory_growth(device, True)
    growth_enabled = tf.config.experimental.get_memory_growth(device)
    print('{} memory growth: {}'.format(device, growth_enabled))

In [None]:
!pip install keras-unet-collection -q -U
from keras_unet_collection import models, losses

# Swin-UNet from keras_unet_collection: 256x256 single-channel input, 4 output labels.
tf_model = models.swin_unet_2d(
    (256, 256, 1),
    filter_num_begin=64,
    n_labels=4,
    depth=4,
    stack_num_down=2,
    stack_num_up=2,
    patch_size=(4, 4),
    num_heads=[4, 8, 8, 8],
    window_size=[4, 2, 2, 2],
    num_mlp=512,
    output_activation='Softmax',
    shift_window=True,
    name='swin_unet',
)

In [None]:
# The network is built with n_labels=4 and a softmax output, i.e. one
# distribution over mutually exclusive classes per pixel, so the matching loss
# is categorical cross-entropy (binary cross-entropy would score each channel
# independently and makes Keras resolve 'accuracy' to binary accuracy).
# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
tf_model.compile(loss='categorical_crossentropy',
              optimizer=keras.optimizers.Adam(learning_rate=1e-3),
              metrics=['accuracy', losses.dice_coef])
tf_model.summary()
## To train this tf_model, we have to create TensorFlow Datasets.

In [None]:
## Focal Loss Function
class SegmentationFocalLoss(nn.Module):
    """Multi-class focal loss for dense (per-pixel) predictions.

    For every element with true class ``y`` and predicted probability ``p_y``::

        loss = weight[y] * (1 - p_y) ** gamma * (-log p_y)

    and the result is the mean over all elements. The focal factor is applied
    per element, before averaging, so that well-classified pixels are
    down-weighted individually.

    Args:
        gamma: Focusing exponent; 0 reduces the loss to (weighted) cross-entropy
            averaged over elements.
        weight: Optional 1-D tensor with one weight per class.
    """

    def __init__(self, gamma=2, weight=None):
        super().__init__()
        self.gamma = gamma
        # A buffer follows the module through .to()/.cuda(); None is allowed.
        self.register_buffer('weight', weight)

    def forward(self, pred, target):
        """Return the scalar focal loss.

        Args:
            pred: Raw class scores with the class dimension at dim 1,
                e.g. (N, C, H, W).
            target: Integer class indices with pred's shape minus dim 1.
        """
        log_prob = torch.nn.functional.log_softmax(pred, dim=1)

        weight = self.weight
        if weight is not None:
            # Keep the class weights on the same device/dtype as the scores so
            # the loss works wherever the model runs.
            weight = weight.to(device=pred.device, dtype=log_prob.dtype)

        # Per-element (optionally class-weighted) cross-entropy.
        ce_loss = torch.nn.functional.nll_loss(log_prob, target, weight=weight, reduction='none')

        # p_y comes from the unweighted log-probability; deriving it from the
        # weighted loss would give p_y ** weight instead.
        pt = torch.exp(log_prob.gather(1, target.unsqueeze(1)).squeeze(1))

        focal_loss = (1. - pt) ** self.gamma * ce_loss
        return torch.mean(focal_loss)

## Class weights for the loss: the inverse of each class's pixel ratio,
## rescaled so that the four weights add up to 5.
lb_weight, sb_weight, st_weight, bg_weight = (
    1 / ratio
    for ratio in (lb_area_ratio, sb_area_ratio, st_area_ratio, bg_area_ratio)
)
total_weight = lb_weight + sb_weight + st_weight + bg_weight

lb_weight, sb_weight, st_weight, bg_weight = (
    inverse / total_weight * 5
    for inverse in (lb_weight, sb_weight, st_weight, bg_weight)
)
# Order of the tensor entries: background, large bowel, small bowel, stomach.
weight = torch.tensor([bg_weight, lb_weight, sb_weight, st_weight], dtype=torch.float)
print(f'bg:{bg_weight}, lb:{lb_weight}, sb:{sb_weight}, st{st_weight}')

loss_fn = SegmentationFocalLoss(gamma=3, weight=weight)

In [None]:
# Step size for the Adam optimizer below.
LEARNING_RATE = 1e-4
# NOTE(review): `model` is not defined in any earlier cell visible in this
# notebook (only `tf_model` is). Confirm the PyTorch model is created before
# this cell; otherwise this line raises NameError on a fresh kernel.
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
## For the model training loop.
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

def train_fn(loader, model, optimizer, loss_fn, device=DEVICE):
    """Run one training epoch and return the mean loss per sample.

    Args:
        loader: Iterable of ``(data, targets, img_size)`` batches; ``targets``
            is expected to be one-hot along dim 1 (argmax is taken over dim 1).
        model: Module mapping ``data`` to class scores.
        optimizer: Optimizer stepping the model's parameters.
        loss_fn: Callable ``loss_fn(predictions, class_indices)`` returning a scalar tensor.
        device: Device the batches are moved to.

    Returns:
        Python float: batch losses weighted by batch size, divided by the
        number of samples seen in ``loader`` (0.0 for an empty loader).
    """
    model.train()
    total_loss = 0.
    n_samples = 0
    loop = tqdm(loader)

    for data, targets, _ in loop:
        data = data.to(device=device)
        targets = targets.to(device=device)

        predictions = model(data)
        # Convert the one-hot targets to class indices for the loss.
        targets = torch.argmax(targets, dim=1)
        loss = loss_fn(predictions, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        loop.set_postfix(loss=batch_loss)
        # Weight by the actual batch size of *this* loader instead of relying
        # on a global batch size / loader, so any loader is averaged correctly.
        total_loss += batch_loss * data.size(0)
        n_samples += data.size(0)

    return total_loss / max(n_samples, 1)

## For the model validation loop.
def valid_fn(loader, model, loss_fn, device=DEVICE):
    """Evaluate the model on ``loader`` and return the mean loss per sample.

    Args:
        loader: Iterable of ``(data, targets, img_size)`` batches; ``targets``
            is expected to be one-hot along dim 1 (argmax is taken over dim 1).
        model: Module mapping ``data`` to class scores.
        loss_fn: Callable ``loss_fn(predictions, class_indices)`` returning a scalar tensor.
        device: Device the batches are moved to.

    Returns:
        Python float: batch losses weighted by batch size, divided by the
        number of samples seen in ``loader`` (0.0 for an empty loader).
    """
    model.eval()
    total_loss = 0.
    n_samples = 0
    loop = tqdm(loader)

    with torch.no_grad():
        for data, targets, _ in loop:
            data = data.to(device=device)
            targets = targets.to(device=device)

            predictions = model(data)
            # Convert the one-hot targets to class indices for the loss.
            targets = torch.argmax(targets, dim=1)
            loss = loss_fn(predictions, targets)

            # .item() keeps the running total a plain number rather than a
            # tensor living on the compute device.
            batch_loss = loss.item()
            total_loss += batch_loss * data.size(0)
            n_samples += data.size(0)

            loop.set_postfix(loss=batch_loss)

    return total_loss / max(n_samples, 1)

In [None]:
## For the train & validation loop.
NUM_EPOCHS = 70

## DeepLabv3 model
# NOTE(review): `model` is not defined in any cell visible in this notebook;
# confirm the PyTorch model is built before this cell runs.
model.to(device=DEVICE)

# Start from +inf so the first epoch always yields a checkpoint, whatever the
# scale of the loss.
best_loss = float('inf')
for epoch in range(NUM_EPOCHS):
    print('-------------')
    print('Epoch {}/{}'.format(epoch+1, NUM_EPOCHS))
    print('-------------')

    train_loss = train_fn(train_dl, model, optimizer, loss_fn, DEVICE)
    # float() accepts both a plain number and a 0-dim tensor, so best_loss and
    # the printed value are always ordinary Python floats.
    valid_loss = float(valid_fn(valid_dl, model, loss_fn, DEVICE))

    # Keep only the weights (and optimizer state) of the best validation epoch.
    if valid_loss < best_loss:
        checkpoint = {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
        }
        torch.save(checkpoint, "./checkpoint.pth")
        print('best model saved!')
        best_loss = valid_loss

    print(f'Train Loss: {train_loss},  Valid Loss: {valid_loss}')