In [1]:
import pytorch_lightning as pl
import torch
import torch.nn as nn
import torch.optim as optim
from typing import List, Dict, Optional
import pandas as pd
import numpy as np
import os

import albumentations as albu
from albumentations.pytorch import ToTensorV2
import random
import matplotlib.pyplot as plt

from pathlib import Path
import random
import cv2



In [2]:
class AugmentationTransforms:
    """Builds the albumentations pipelines used for square image tiles.

    `image_size` is the side length, in pixels, that the augmentation
    pipelines resize their output to.
    """

    def __init__(self, image_size: int):
        self.image_size = image_size

    def get_training_augmentation(self):
        """Return the training-time augmentation pipeline.

        The up-scale factor is drawn once per call (from [1.0, 1.05]), so
        every image passed through the returned pipeline shares it.

        Return:
            transform: albumentations.Compose
        """
        side = self.image_size
        # single random draw; used for both height and width of the up-scale
        enlarged = int(side * random.uniform(1.0, 1.05))

        geometric = albu.augmentations.geometric
        pixel_ops = albu.augmentations.transforms

        # up-scale slightly, then crop back to the nominal size
        upscale = geometric.resize.Resize(enlarged, enlarged, always_apply=True)
        crop_back = albu.RandomCrop(height=side, width=side, always_apply=True)

        # each OneOf group fires with p=0.5 and then applies one of its members
        tone = albu.OneOf(
            [
                albu.CLAHE(p=1),
                albu.RandomBrightnessContrast(p=1),
                albu.RandomGamma(p=1),
            ],
            p=0.5,
        )
        focus = albu.OneOf(
            [
                pixel_ops.Sharpen(p=1),
                albu.Blur(blur_limit=3, p=1),
                albu.MotionBlur(blur_limit=3, p=1),
            ],
            p=0.5,
        )
        colour = albu.OneOf(
            [albu.RandomBrightnessContrast(p=1), albu.HueSaturationValue(p=1)],
            p=0.5,
        )

        # closing resize back to image_size x image_size
        final_resize = geometric.resize.Resize(side, side, always_apply=True)

        return albu.Compose(
            [
                albu.HorizontalFlip(p=0.5),
                upscale,
                crop_back,
                pixel_ops.GaussNoise(p=0.2),
                geometric.transforms.Perspective(p=0.5),
                tone,
                focus,
                colour,
                final_resize,
            ]
        )

    def get_validation_augmentation(self):
        """Return the validation/test pipeline: one resize to image_size x image_size.

        Return:
            transform: albumentations.Compose
        """
        side = self.image_size
        resize_only = albu.augmentations.geometric.resize.Resize(
            side, side, always_apply=True
        )
        return albu.Compose([resize_only])

    def get_preprocessing(self):
        """Return the preprocessing pipeline: normalisation, then tensor conversion.

        Takes no arguments; the per-channel mean and standard deviation are
        fixed constants (named after "ubc" -- presumably statistics of the
        competition images; confirm where they were computed).

        Return:
            transform: albumentations.Compose
        """
        channel_mean = [0.8894420586142374,0.8208752169441305,0.8864016141389351]
        channel_std = [0.10106393015358608,0.15637655015581306,0.09892687853183287]

        # Model expects input [N, C, H, W];
        # ToTensorV2 converts the HWC image to a CHW tensor
        return albu.Compose(
            [
                albu.Normalize(mean=channel_mean, std=channel_std),
                ToTensorV2(),
            ]
        )

In [3]:
# installing pyvips for offline use
# install the deb packages
!yes | dpkg -i --force-depends /kaggle/input/pyvips-python-and-deb-package-gpu/linux_packages/archives/*.deb
# install the python wrapper
!pip install pyvips -f /kaggle/input/pyvips-python-and-deb-package-gpu/python_packages/ --no-index

Selecting previously unselected package apparmor.
(Reading database ... 113818 files and directories currently installed.)
Preparing to unpack .../apparmor_3.0.4-2ubuntu2.2_amd64.deb ...
Unpacking apparmor (3.0.4-2ubuntu2.2) ...
Selecting previously unselected package autoconf.
Preparing to unpack .../autoconf_2.71-2_all.deb ...
Unpacking autoconf (2.71-2) ...
Selecting previously unselected package automake.
Preparing to unpack .../automake_13a1.16.5-1.3_all.deb ...
Unpacking automake (1:1.16.5-1.3) ...
Selecting previously unselected package autotools-dev.
Preparing to unpack .../autotools-dev_20220109.1_all.deb ...
Unpacking autotools-dev (20220109.1) ...
Selecting previously unselected package bzip2-doc.
Preparing to unpack .../bzip2-doc_1.0.8-5build1_all.deb ...
Unpacking bzip2-doc (1.0.8-5build1) ...
Selecting previously unselected package file.
Preparing to unpack .../file_13a5.41-3ubuntu0.1_amd64.deb ...
Unpacking file (1:5.41-3ubuntu0.1) ...
Selecting previo

In [4]:
!pip install /kaggle/input/cancer-detection-w-randomly-selected-tiles/*.whl &> /dev/null

In [5]:
# from https://www.kaggle.com/competitions/UBC-OCEAN/discussion/451908 &
# https://www.kaggle.com/code/jirkaborovec/cancer-subtype-lightning-torch-inference-tiles

import pyvips
import numpy as np
import random
from PIL import Image


# cuts tiles of specified size; drops tiles whose share of near-white (background) pixels reaches drop_thr
def extract_image_tiles(
    p_img,
    folder,
    size: int = 2048,
    scale: float = 0.5,
    drop_thr: float = 0.6,
    white_thr: int = 240,
    max_samples: int = 50,
) -> tuple:
    """Cut an image into square tiles and save the non-background ones as PNG.

    Args:
        p_img: path of the image to open with pyvips.
        folder: directory the tile files are written into. It is not created
            here, so it must already exist.
        size: side length of a tile in source pixels.
        scale: factor applied to ``size`` to get the side length of the
            saved (resized) tile.
        drop_thr: a tile is skipped when the fraction of its pixels counted
            as background is greater than or equal to this value.
        white_thr: a pixel counts as background when the mean over its
            channels is above this value.
        max_samples: maximum number of tiles to save. A non-``int`` value is
            treated as a fraction of the number of candidate tiles.

    Returns:
        ``(files, idxs)``: the list of saved tile paths, and the shuffled
        list of all candidate tile boxes as ``(y, y + size, x, x + size)``.
    """
    im = pyvips.Image.new_from_file(p_img)
    w = h = size
    # https://stackoverflow.com/a/47581978/4521646

    # if only one full tile fits then cut from center - else cut from top left
    if (
        (im.height < 2 * size or im.width < 2 * size)
        and im.height > size
        and im.width > size
    ):
        y = int(0.5 * im.height) - int(0.5 * size)
        x = int(0.5 * im.width) - int(0.5 * size)
        idxs = [(y, y + h, x, x + w)]
    else:
        # regular grid; the last row/column may reach past the image edge
        idxs = [
            (y, y + h, x, x + w)
            for y in range(0, im.height, h)
            for x in range(0, im.width, w)
        ]

    # random subsample: tiles are visited in shuffled order until enough are kept
    max_samples = (
        max_samples if isinstance(max_samples, int) else int(len(idxs) * max_samples)
    )
    random.shuffle(idxs)
    files = []
    # output size is the same for every tile, so compute it once
    new_size = int(size * scale), int(size * scale)

    for y, y_, x, x_ in idxs:
        # https://libvips.github.io/pyvips/vimage.html#pyvips.Image.crop
        # the crop is clamped to the image bounds; only the first 3 channels are kept
        tile = im.crop(x, y, min(w, im.width - x), min(h, im.height - y)).numpy()[
            ..., :3
        ]
        # NOTE(review): a 2-D (single-band) tile would fail at the axis=2
        # reductions below - confirm inputs always have at least 3 bands.

        if tile.shape[:2] != (h, w):
            # edge tile: zero-pad to the full (h, w) size, content at the top left
            partial = tile
            tile_size = (h, w) if tile.ndim == 2 else (h, w, tile.shape[2])
            tile = np.zeros(tile_size, dtype=tile.dtype)
            tile[: partial.shape[0], : partial.shape[1], ...] = partial

        # pixels that are zero in every channel (including the padding) become white
        black_bg = np.sum(tile, axis=2) == 0
        tile[black_bg, :] = 255
        mask_bg = np.mean(tile, axis=2) > white_thr

        # CHANGED 2023-12-15
        # skip tiles that are mostly background
        if np.sum(mask_bg) >= (np.prod(mask_bg.shape[:2]) * drop_thr):
            continue

        # file name is built from the tile's right/bottom edge divided by the tile size
        p_tile = os.path.join(folder, f"{int(x_ / w)}-{int(y_ / h)}.png")
        Image.fromarray(tile).resize(new_size, Image.LANCZOS).save(p_tile)
        files.append(p_tile)

        # need to set counter check as some empty tiles could be skipped earlier
        if len(files) >= max_samples:
            break

    return files, idxs


# creates tiles for specified image /kaggle/input/UBC-OCEAN/test_images/*.png into folder /kaggle/working/test_tiles/*.png
def extract_prune_tiles(
    path_img: str,
    folder: str,
    size: int = 2048,
    scale: float = 0.25,
    drop_thr: float = 0.6,
    max_samples: int = 30,
) -> str:
    """Tile one image into its own sub-folder and return that sub-folder.

    The sub-folder is ``folder`` joined with the image file name without its
    extension; it is created if missing. The tiling itself is delegated to
    ``extract_image_tiles`` and its return value is not used here.
    """
    print(f"processing: {path_img}")

    image_stem = os.path.splitext(os.path.basename(path_img))[0]
    tiles_dir = os.path.join(folder, image_stem)
    os.makedirs(tiles_dir, exist_ok=True)

    tiling_options = dict(
        size=size, scale=scale, drop_thr=drop_thr, max_samples=max_samples
    )
    _saved_files, _tile_boxes = extract_image_tiles(
        path_img, tiles_dir, **tiling_options
    )
    return tiles_dir


In [6]:
import os

# pyvips settings - important (handed to the library through environment variables)
# NOTE(review): `import pyvips` already ran in an earlier cell - confirm the
# library still picks these values up when they are set afterwards.
os.environ.update(
    {
        "VIPS_CONCURRENCY": "4",
        "VIPS_DISC_THRESHOLD": "15gb",
    }
)

In [7]:
from torch.utils.data import Dataset, DataLoader
from pathlib import Path
import pandas as pd
import cv2
import glob
from PIL import Image

class CancerTilesDataset(Dataset):
    """Dataset over the tile image files found directly inside one folder.

    Items are returned without labels: each one is a single tile, optionally
    run through ``augmentation`` and then ``preprocessing``.
    """

    def __init__(
        self,
        folder: str,
        image_ext: str = '.png',
        preprocessing=None,
        augmentation=None,
    ):
        # every file in `folder` whose name ends with `image_ext`
        pattern = os.path.join(folder, "*" + image_ext)
        self.imgs = glob.glob(pattern)
        self.augmentation = augmentation
        self.preprocessing = preprocessing

    def __len__(self) -> int:
        return len(self.imgs)

    def __getitem__(self, idx):
        # load the tile and keep at most the first three channels
        tile = np.array(Image.open(self.imgs[idx]))[..., :3]

        # filter background: pixels that are zero in every channel become white
        # TODO Is this needed actually here? Aren't they saved with white background
        tile[tile.sum(axis=2) == 0] = 255

        # values that look like a 0..1 range are rescaled to 0..255 uint8
        if tile.max() < 1.5:
            tile = np.clip(tile * 255, 0, 255).astype(np.uint8)

        # augmentation first, then preprocessing
        # (original author's note: not sure this is done correctly at the moment)
        for transform in (self.augmentation, self.preprocessing):
            if transform:
                tile = transform(image=tile)["image"]

        return tile

In [8]:
import onnxruntime

# Create the ONNX Runtime session once; it is reused for every batch of tiles.
ort_session = onnxruntime.InferenceSession(
    "/kaggle/input/cancer-detection-w-randomly-selected-tiles/cancer_classification_model..onnx",
    providers=["CUDAExecutionProvider"],
)
# name of the model's first input, used as the feed key when running the session
input_name = ort_session.get_inputs()[0].name

# torch device handle: first CUDA GPU when one is available, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")



In [9]:
# Inference settings. "scale" is derived so that a 2048 px tile is saved at
# "input_size" px.
config = {
    "input_size": 448,
    "tiles": 50,
    "other_threshold": 0.5,
    "drop_thr": 0.4,
}
config["scale"] =  config["input_size"] / 2048

# Map index to class: position in `classes` is the predicted class index
classes = ['HGSC', 'LGSC', 'EC', 'CC', 'MC']
idx2class = {idx: class_name for idx, class_name in enumerate(classes)}

test_df = pd.read_csv("/kaggle/input/UBC-OCEAN/test.csv")

# Temporarily placed here; would be better integrated into the dataset class
aug_transforms = AugmentationTransforms(config["input_size"])

submission = []
import time
import shutil
for _, row in test_df.iterrows():
    row = dict(row)

    # Default prediction, kept when no usable tile could be cut
    row['label'] = "HGSC"

    # prepare data - cut and load tiles
    folder_tiles = extract_prune_tiles(
        os.path.join("/kaggle/input/UBC-OCEAN/", "test_images", f"{str(row['image_id'])}.png"),
        "./test_tiles/", size=2048, scale=config["scale"], max_samples = config["tiles"], drop_thr=config["drop_thr"])

    try:
        dataset = CancerTilesDataset(folder_tiles, preprocessing = aug_transforms.get_preprocessing())

        if not len(dataset):
            print (f"seem no tiles were cut for `{folder_tiles}`")
            submission.append(row)
            continue

        dataloader = DataLoader(dataset, batch_size=4, num_workers=4, shuffle=False)

        imagePredictions = []
        maxScores = []  # top softmax score of every tile of this image

        for imgs in dataloader:
            ort_outs = ort_session.run(None, {input_name: imgs.numpy()})
            output = torch.tensor(ort_outs[0])

            # Calculate the maximum softmax score for each prediction in the batch
            softmax_scores = torch.nn.functional.softmax(output, dim=1)
            max_score, preds = torch.max(softmax_scores, dim=1)
            maxScores.extend(max_score.detach().cpu().numpy())

            preds = preds.cpu().numpy()
            imagePredictions.append(preds)

        # Check if the average max score of predictions is below the threshold
        avgMaxScore = np.mean(maxScores)
        if avgMaxScore < config["other_threshold"]:
            row['label'] = 'Other'  # Classify as 'Other'
        else:
            # If above threshold, use the most frequent prediction
            lb = np.argmax(np.bincount(np.concatenate(imagePredictions)))
            row['label'] = classes[lb]

        submission.append(row)
    finally:
        # cleaning: remove this image's tile folder on every path (including
        # the "no tiles" case) without going through a shell command
        shutil.rmtree(folder_tiles, ignore_errors=True)

processing: /kaggle/input/UBC-OCEAN/test_images/41.png


In [10]:
# Collect the per-image rows and write only the two submission columns.
df_sub = pd.DataFrame(submission)
df_sub.to_csv("submission.csv", columns=["image_id", "label"], index=False)