In [1]:
import pytorch_lightning as pl
import torch
from torchvision.models import convnext_tiny, ConvNeXt_Tiny_Weights, convnext_base, ConvNeXt_Base_Weights, convnext_small, ConvNeXt_Small_Weights, efficientnet_b4, EfficientNet_B4_Weights
import torch.nn as nn
import torch.optim as optim
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, LearningRateMonitor
from typing import List, Dict, Optional
import pandas as pd
import numpy as np
import os

import albumentations as albu
from albumentations.pytorch import ToTensorV2
import random
import matplotlib.pyplot as plt

from pathlib import Path
import random
import cv2



In [2]:
class AugmentationTransforms:
    """Factory for the albumentations pipelines used in this notebook.

    Every pipeline is built for square images whose side is ``image_size``.
    """

    def __init__(self, image_size: int):
        self.image_size = image_size

    def get_training_augmentation(self):
        """Build the training-time augmentation pipeline.

        The up-scale factor is drawn once, at the moment this method is called,
        so every sample that goes through the returned pipeline is resized to
        the same intermediate size before the random crop.

        Return:
            transform: albumentations.Compose
        """
        side = self.image_size
        # Slightly enlarged side (0-5 % larger) so the random crop has some slack.
        upscaled_side = int(side * random.uniform(1.0, 1.05))
        resize = albu.augmentations.geometric.resize.Resize

        flip = albu.HorizontalFlip(p=0.5)
        upscale = resize(upscaled_side, upscaled_side, always_apply=True)
        crop = albu.RandomCrop(height=side, width=side, always_apply=True)
        noise = albu.augmentations.transforms.GaussNoise(p=0.2)
        perspective = albu.augmentations.geometric.transforms.Perspective(p=0.5)

        # Each OneOf group fires with probability 0.5 and then applies exactly one member.
        tone = albu.OneOf(
            [
                albu.CLAHE(p=1),
                albu.RandomBrightnessContrast(p=1),
                albu.RandomGamma(p=1),
            ],
            p=0.5,
        )
        sharpness = albu.OneOf(
            [
                albu.augmentations.transforms.Sharpen(p=1),
                albu.Blur(blur_limit=3, p=1),
                albu.MotionBlur(blur_limit=3, p=1),
            ],
            p=0.5,
        )
        colour = albu.OneOf(
            [
                albu.RandomBrightnessContrast(p=1),
                albu.HueSaturationValue(p=1),
            ],
            p=0.5,
        )

        # Final resize back to the nominal size.
        final_resize = resize(side, side, always_apply=True)

        return albu.Compose(
            [
                flip,
                upscale,
                crop,
                noise,
                perspective,
                tone,
                sharpness,
                colour,
                final_resize,
            ]
        )

    def get_validation_augmentation(self):
        """Build the validation/test pipeline: a plain resize to ``image_size`` x ``image_size``.

        Return:
            transform: albumentations.Compose
        """
        side = self.image_size
        to_model_size = albu.augmentations.geometric.resize.Resize(
            side, side, always_apply=True
        )
        return albu.Compose([to_model_size])

    def get_preprocessing(self):
        """Construct the preprocessing transform applied after augmentation.

        Normalizes with fixed per-channel mean/std (the usual ImageNet
        statistics) and converts the HWC image to a CHW tensor, since the
        model expects input shaped [N, C, H, W].

        Return:
            transform: albumentations.Compose
        """
        normalize = albu.Normalize(
            mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)
        )
        to_tensor = ToTensorV2()
        return albu.Compose([normalize, to_tensor])


In [3]:
from torch.utils.data import Dataset, DataLoader
from pathlib import Path
import pandas as pd
import cv2

class CancerDataset(Dataset):
    """Image dataset that yields ``(image, label)`` pairs for the given image files.

    The image id is taken from the file name (the part of the stem before the
    first ``_``) and used to look up the label in the CSV.

    Args:
        imgs: paths of the image files that make up this dataset.
        csv: path of a CSV file with an ``image_id`` column (and a ``label``
            column unless ``return_image_id`` is set).
        preprocessing: optional callable invoked as ``preprocessing(image=img)``;
            its ``"image"`` entry replaces the image. Applied after augmentation.
        augmentation: optional callable invoked as ``augmentation(image=img)``;
            its ``"image"`` entry replaces the image.
        return_image_id: when True the second element of every item is the
            image id instead of a class index.

    Raises:
        ValueError: if the CSV contains a label that is not in ``self.classes``.
    """

    def __init__(self, imgs: List[str], csv: str, preprocessing = None, augmentation = None, return_image_id: bool = False):
        self.imgs = imgs

        self.classes = ['HGSC', 'LGSC', 'EC', 'CC', 'MC']
        self.class2idx = {label: idx for idx, label in enumerate(self.classes)}

        if return_image_id:
            # If we want to return the image_id, the "label" column simply mirrors the index
            self.labels = pd.read_csv(csv, usecols=["image_id"], index_col="image_id")
            self.labels["label"] = self.labels.index
        else:
            # Map class names to integer class indices
            self.labels = pd.read_csv(csv, usecols=["image_id", "label"], index_col="image_id")
            mapped = self.labels["label"].map(self.class2idx)
            unknown = self.labels.loc[mapped.isna(), "label"].unique().tolist()
            if unknown:
                raise ValueError(f"Unknown labels in {csv}: {unknown}")
            # Series.map returns a new Series, so the result has to be assigned back.
            self.labels["label"] = mapped.astype("int64")

        self.preprocessing = preprocessing
        self.augmentation = augmentation

    def __getitem__(self, i):
        img_name = Path(self.imgs[i])
        # Use the stem so both "<id>_thumbnail.png" and "<id>.png" resolve to <id>.
        img_id = int(img_name.stem.split("_")[0])

        img = cv2.imread(str(img_name))
        if img is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError(f"Could not read image {img_name}")
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        label = self.labels.loc[img_id, "label"]

        if self.augmentation:
            img = self.augmentation(image=img)["image"]

        if self.preprocessing:
            img = self.preprocessing(image=img)["image"]

        return img, label

    def __len__(self):
        return len(self.imgs)

In [4]:
class CancerDataModule(pl.LightningDataModule):
    """LightningDataModule that builds the train/validation/test datasets from ``data_dir``.

    Args:
        image_size: side of the square images produced by the transforms.
        batch_size: batch size of the training dataloader (validation and test
            loaders use fixed batch sizes).
        cutoff: fraction of the thumbnails that goes to the training set; the
            remainder is used for validation.
        shuffle: whether the training dataloader shuffles.
        data_dir: root folder holding ``train_thumbnails``/``train.csv`` and
            ``test_thumbnails``/``test.csv``.
    """

    def __init__(
        self,
        image_size: int,
        batch_size: int,
        cutoff: float = 0.8,
        shuffle: Optional[bool] = True,
        data_dir: str = "/kaggle/input/UBC-OCEAN",
    ):
        super().__init__()
        self.image_size = image_size
        self.train_batch_size = batch_size
        self.shuffle = shuffle
        self.data_dir = Path(data_dir)
        self.cutoff = cutoff

        aug_transforms = AugmentationTransforms(self.image_size)

        self.preprocess_transforms = aug_transforms.get_preprocessing()
        self.train_transforms = aug_transforms.get_training_augmentation()
        self.val_transforms = aug_transforms.get_validation_augmentation()

    def setup(self, stage: Optional[str] = None):
        """Create the datasets needed for ``stage``.

        ``"fit"`` and ``"validate"`` build the train/validation split, ``"test"``
        builds the test dataset, and ``None`` builds both.
        """
        if stage in (None, "fit", "validate"):
            train_data_dir = self.data_dir / "train_thumbnails"
            train_csv = self.data_dir / "train.csv"

            # glob() yields files in arbitrary filesystem order; sort so the
            # train/validation split is the same on every run.
            imgs = sorted(train_data_dir.glob("*"))
            cutoff_point = int(len(imgs) * self.cutoff)

            self.train_dataset = CancerDataset(imgs[:cutoff_point], train_csv, self.preprocess_transforms, self.train_transforms)
            self.validation_dataset = CancerDataset(imgs[cutoff_point:], train_csv, self.preprocess_transforms, self.val_transforms)

            print(f"The the training set has {len(self.train_dataset)} images")
            print(f"The the validation set has {len(self.validation_dataset)} images")

        if stage in (None, "test"):
            test_data_dir = self.data_dir / "test_thumbnails"
            test_csv = self.data_dir / "test.csv"

            imgs = sorted(test_data_dir.glob("*"))

            self.test_dataset = CancerDataset(imgs, test_csv, self.preprocess_transforms, self.val_transforms, return_image_id=True)

            # TODO: alternatively evaluate on the held-out tail of the training
            # thumbnails (imgs[cutoff_point:] with train.csv, return_image_id=True).

    def train_dataloader(self):
        """Dataloader over the training split, using the configured batch size and shuffle flag."""
        return DataLoader(
            self.train_dataset,
            batch_size=self.train_batch_size,
            shuffle=self.shuffle,
            num_workers=4,
        )

    def val_dataloader(self):
        """Dataloader over the validation split (fixed batch size 8, no shuffling)."""
        return DataLoader(
            self.validation_dataset, batch_size=8, shuffle=False, num_workers=4,
        )

    def test_dataloader(self):
        """Dataloader over the test dataset (fixed batch size 4, no shuffling)."""
        return DataLoader(
            self.test_dataset, batch_size=4, shuffle=False, num_workers=4,
        )


In [5]:
import torchmetrics


class CancerDetector(pl.LightningModule):
    """LightningModule that fine-tunes a torchvision backbone for multi-class classification.

    Args:
        lr: learning rate passed to AdamW.
        gamma: multiplicative decay factor of the ExponentialLR scheduler.
        model_name: one of ``"convnext_tiny"``, ``"convnext_small"``,
            ``"convnext_base"`` or ``"efficientnet_b4"``.
        num_classes: number of output classes of the replaced classifier head.
        init_weights: when True the backbone starts from the IMAGENET1K_V1
            weights, otherwise from random initialization.

    Raises:
        ValueError: if ``model_name`` is not one of the supported names.
    """

    def __init__(
        self,
        lr: float,
        gamma: float,
        model_name: str,
        num_classes: int = 5,
        init_weights: bool = True,
    ):
        super().__init__()
        # TODO Use model preprocessing function
        self.model = self._get_model(model_name, num_classes, init_weights)

        self.loss_fn = nn.CrossEntropyLoss()
        self.lr = lr
        self.gamma = gamma

        self.save_hyperparameters()

        # Should we use macro average? Default is micro
        self.accuracy = torchmetrics.classification.Accuracy(
            num_classes=num_classes, task="multiclass"
        )
        self.f1 = torchmetrics.classification.F1Score(
            num_classes=num_classes, task="multiclass"
        )
        self.recall = torchmetrics.classification.Recall(
            num_classes=num_classes, task="multiclass"
        )
        self.precision = torchmetrics.classification.Precision(
            num_classes=num_classes, task="multiclass"
        )

    def _get_model(self, model_name: str, num_classes: int, init_weights: bool):
        """Instantiate the requested backbone and replace its final linear layer.

        Returns the model with a fresh ``nn.Linear(in_features, num_classes)`` head.
        """
        if model_name == "convnext_tiny":
            model = convnext_tiny(
                weights=ConvNeXt_Tiny_Weights.IMAGENET1K_V1 if init_weights else None
            )
        elif model_name == "convnext_small":
            model = convnext_small(
                weights=ConvNeXt_Small_Weights.IMAGENET1K_V1 if init_weights else None
            )
        elif model_name == "convnext_base":
            model = convnext_base(
                weights=ConvNeXt_Base_Weights.IMAGENET1K_V1 if init_weights else None
            )
        elif model_name == "efficientnet_b4":
            model = efficientnet_b4(weights = EfficientNet_B4_Weights.IMAGENET1K_V1 if init_weights else None)
        else:
            raise ValueError(f"Unknown model name {model_name}")

        if "convnext" in model_name:
            # torchvision ConvNeXt classifier is (LayerNorm2d, Flatten, Linear): the head sits at index 2.
            in_features = model.classifier[2].in_features
            model.classifier[2] = nn.Linear(in_features, num_classes)
        else:
            # torchvision EfficientNet classifier is (Dropout, Linear): the head sits at index 1.
            in_features = model.classifier[1].in_features
            model.classifier[1] = nn.Linear(in_features, num_classes)

        return model

    def forward(self, imgs: torch.Tensor):
        """Run the wrapped backbone on ``imgs`` and return its raw outputs (logits)."""
        return self.model(imgs)

    def training_step(self, batch: torch.Tensor, batch_idx: int):
        """Compute the cross-entropy loss for one training batch and log the metrics."""
        x, y = batch
        output = self(x)
        loss = self.loss_fn(output, y)

        self._log_metrics(loss, output, y, "train")

        return loss

    def validation_step(self, batch: torch.Tensor, batch_idx: int):
        """Compute the cross-entropy loss for one validation batch and log the metrics."""
        x, y = batch
        output = self(x)
        loss = self.loss_fn(output, y)

        self._log_metrics(loss, output, y, "val")

        return loss

    def _log_metrics(
        self, loss: torch.Tensor, preds: torch.Tensor, target: torch.Tensor, phase: str
    ):
        """Log loss, accuracy, F1, recall and precision under the ``{phase}/`` prefix.

        All values are logged per epoch (``on_step=False, on_epoch=True``).
        """
        # NOTE(review): the same metric objects serve both the "train" and "val"
        # phases; confirm that sharing their internal state is intended.
        accuracy = self.accuracy(preds, target)
        f1 = self.f1(preds, target)
        recall = self.recall(preds, target)
        precision = self.precision(preds, target)

        self.log(f"{phase}/loss", loss, prog_bar=True, on_step=False, on_epoch=True)
        self.log(
            f"{phase}/accuracy", accuracy, prog_bar=True, on_step=False, on_epoch=True
        )
        self.log(f"{phase}/f1", f1, prog_bar=True, on_step=False, on_epoch=True)
        self.log(f"{phase}/recall", recall, prog_bar=True, on_step=False, on_epoch=True)
        self.log(
            f"{phase}/precision", precision, prog_bar=True, on_step=False, on_epoch=True
        )

    def configure_optimizers(self):
        """Return an AdamW optimizer over the backbone parameters and an ExponentialLR scheduler."""
        optimizer = optim.AdamW(self.model.parameters(), lr=self.lr)
        scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=self.gamma)
        return [optimizer], [scheduler]

In [6]:
# installing pyvips for offline use

!ls /kaggle/input/pyvips-python-and-deb-package-gpu
# install the deb packages
!yes | dpkg -i --force-depends /kaggle/input/pyvips-python-and-deb-package-gpu/linux_packages/archives/*.deb
# install the python wrapper
!pip install pyvips -f /kaggle/input/pyvips-python-and-deb-package-gpu/python_packages/ --no-index

linux_packages	python_packages
Selecting previously unselected package apparmor.
(Reading database ... 113818 files and directories currently installed.)
Preparing to unpack .../apparmor_3.0.4-2ubuntu2.2_amd64.deb ...
Unpacking apparmor (3.0.4-2ubuntu2.2) ...
Selecting previously unselected package autoconf.
Preparing to unpack .../autoconf_2.71-2_all.deb ...
Unpacking autoconf (2.71-2) ...
Selecting previously unselected package automake.
Preparing to unpack .../automake_13a1.16.5-1.3_all.deb ...
Unpacking automake (1:1.16.5-1.3) ...
Selecting previously unselected package autotools-dev.
Preparing to unpack .../autotools-dev_20220109.1_all.deb ...
Unpacking autotools-dev (20220109.1) ...
Selecting previously unselected package bzip2-doc.
Preparing to unpack .../bzip2-doc_1.0.8-5build1_all.deb ...
Unpacking bzip2-doc (1.0.8-5build1) ...
Selecting previously unselected package file.
Preparing to unpack .../file_13a5.41-3ubuntu0.1_amd64.deb ...
Unpacking file (1:5.41-3

In [7]:
# from https://www.kaggle.com/competitions/UBC-OCEAN/discussion/451908 &
# https://www.kaggle.com/code/jirkaborovec/cancer-subtype-lightning-torch-inference-tiles

import pyvips
import numpy as np
import random
from PIL import Image

# cuts tiles of the specified size; black pixels are painted white, then tiles whose share of near-white (background) pixels reaches drop_thr are skipped
def extract_image_tiles(
    p_img, folder, size: int = 2048, scale: float = 0.5,
    drop_thr: float = 0.6, white_thr: int = 240, max_samples: int = 50) -> tuple:
    """Cut an image into square tiles, drop mostly-background tiles and save the rest as PNG.

    Tiles are visited in random order. Partial tiles at the right/bottom edge
    are zero-padded to ``size`` x ``size``; fully black pixels are then painted
    white, so padding counts as background.

    Args:
        p_img: path of the image to open with pyvips.
        folder: existing folder the tile PNGs are written to.
        size: side of a tile in source-image pixels.
        scale: factor applied to ``size`` to get the saved tile side.
        drop_thr: a tile is skipped when at least this fraction of its pixels
            has a channel mean above ``white_thr``.
        white_thr: channel-mean threshold above which a pixel counts as background.
        max_samples: maximum number of tiles to save; a non-int value is
            treated as a fraction of the total number of tiles.

    Returns:
        ``(files, idxs)``: the paths of the saved tiles and the shuffled list of
        all ``(y, y + size, x, x + size)`` tile windows.
    """
    im = pyvips.Image.new_from_file(p_img)
    w = h = size
    # https://stackoverflow.com/a/47581978/4521646
    idxs = [(y, y + h, x, x + w) for y in range(0, im.height, h) for x in range(0, im.width, w)]
    # random subsample
    max_samples = max_samples if isinstance(max_samples, int) else int(len(idxs) * max_samples)
    random.shuffle(idxs)
    # The output size does not depend on the tile, so compute it once.
    new_size = int(size * scale), int(size * scale)
    files = []
    for y, y_, x, x_ in idxs:
        # Check the budget before doing any work: skipped (empty) tiles do not
        # count, and a budget of 0 must not produce a tile.
        if len(files) >= max_samples:
            break
        # https://libvips.github.io/pyvips/vimage.html#pyvips.Image.crop
        # Only the first three bands are kept (drops e.g. an alpha band).
        tile = im.crop(x, y, min(w, im.width - x), min(h, im.height - y)).numpy()[..., :3]
        if tile.shape[:2] != (h, w):
            # Edge tile: pad with zeros up to the full tile size.
            tile_ = tile
            tile_size = (h, w) if tile.ndim == 2 else (h, w, tile.shape[2])
            tile = np.zeros(tile_size, dtype=tile.dtype)
            tile[:tile_.shape[0], :tile_.shape[1], ...] = tile_
        black_bg = np.sum(tile, axis=2) == 0
        tile[black_bg, :] = 255
        mask_bg = np.mean(tile, axis=2) > white_thr
        if np.sum(mask_bg) >= (np.prod(mask_bg.shape) * drop_thr):
            continue
        # File name is "<column>-<row>.png", both 1-based (derived from the window's end coordinates).
        p_tile = os.path.join(folder, f"{int(x_ / w)}-{int(y_ / h)}.png")
        Image.fromarray(tile).resize(new_size, Image.LANCZOS).save(p_tile)
        files.append(p_tile)
    return files, idxs

# creates tiles for specified image /kaggle/input/UBC-OCEAN/test_images/*.png into folder /kaggle/working/test_tiles/*.png
def extract_prune_tiles(
    path_img: str, folder: str, size: int = 2048, scale: float = 0.25,
    drop_thr: float = 0.6, max_samples: int = 30) -> str:
    """Tile one image into its own sub-folder of ``folder`` and return that sub-folder.

    The sub-folder is named after the image file (without extension) and is
    created if missing. Tiling itself is delegated to ``extract_image_tiles``
    with the given ``size``, ``scale``, ``drop_thr`` and ``max_samples``.
    """
    print(f"processing: {path_img}")
    image_stem = os.path.splitext(os.path.basename(path_img))[0]
    tiles_dir = os.path.join(folder, image_stem)
    os.makedirs(tiles_dir, exist_ok=True)
    # Only the side effect (tiles written to disk) is needed here.
    _saved_tiles, _windows = extract_image_tiles(
        path_img,
        tiles_dir,
        size=size,
        scale=scale,
        drop_thr=drop_thr,
        max_samples=max_samples,
    )
    return tiles_dir

In [8]:
import os

# pyvips settings - important: exported through the process environment.
# NOTE(review): `import pyvips` already ran in the previous cell; confirm that
# libvips still picks up values that are set after the import.
os.environ.update(
    VIPS_CONCURRENCY='4',
    VIPS_DISC_THRESHOLD='15gb',
)

In [9]:
from torch.utils.data import Dataset, DataLoader
from pathlib import Path
import pandas as pd
import cv2
import glob
from PIL import Image

class CancerTilesDataset(Dataset):
    """Unlabelled dataset over the tile images found in a single folder.

    Args:
        folder: folder that is searched (non-recursively) for tiles.
        image_ext: file-name suffix a tile must have.
        preprocessing: optional callable invoked as ``preprocessing(image=img)``;
            applied after augmentation.
        augmentation: optional callable invoked as ``augmentation(image=img)``.
    """

    def __init__(self, folder: str, image_ext: str =  '.png', preprocessing = None, augmentation = None):
        pattern = os.path.join(folder, "*" + image_ext)
        self.imgs = glob.glob(pattern)
        self.augmentation = augmentation
        self.preprocessing = preprocessing

    def __len__(self) -> int:
        return len(self.imgs)

    def __getitem__(self, idx):
        # Keep at most the first three channels of the loaded tile.
        tile = np.array(Image.open(self.imgs[idx]))[..., :3]

        # filter background: pixels whose channels sum to zero are painted white
        is_black = np.sum(tile, axis=2) == 0
        tile[is_black, :] = 255
        # Arrays whose maximum is below 1.5 are rescaled to the 0..255 range.
        if np.max(tile) < 1.5:
            tile = np.clip(tile * 255, 0, 255).astype(np.uint8)

        # augmentation & preprocess - not sure whether this is done correctly here at the moment
        for stage in (self.augmentation, self.preprocessing):
            if stage:
                tile = stage(image=tile)["image"]

        return tile

In [10]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
best_model = "/kaggle/input/cancer-detection-w-randomly-selected-tiles/cancer_classification_model.pt"

# Restore the trained module from the checkpoint (init_weights=False is passed
# on as a keyword override), move it to the chosen device and switch it to
# evaluation mode. The last expression also displays the module structure.
model = CancerDetector.load_from_checkpoint(best_model, init_weights=False).to(device)
model.eval()

CancerDetector(
  (model): EfficientNet(
    (features): Sequential(
      (0): Conv2dNormActivation(
        (0): Conv2d(3, 48, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (1): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): SiLU(inplace=True)
      )
      (1): Sequential(
        (0): MBConv(
          (block): Sequential(
            (0): Conv2dNormActivation(
              (0): Conv2d(48, 48, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=48, bias=False)
              (1): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
              (2): SiLU(inplace=True)
            )
            (1): SqueezeExcitation(
              (avgpool): AdaptiveAvgPool2d(output_size=1)
              (fc1): Conv2d(48, 12, kernel_size=(1, 1), stride=(1, 1))
              (fc2): Conv2d(12, 48, kernel_size=(1, 1), stride=(1, 1))
              (activation): SiLU(inplace=True)
              (scal

In [11]:

# Map index to class
classes = ['HGSC', 'LGSC', 'EC', 'CC', 'MC']
idx2class = {idx: class_name for idx, class_name in enumerate(classes)}

test_df = pd.read_csv("/kaggle/input/UBC-OCEAN/test.csv")

# Temporarily placed here; it would be better to integrate this into the dataset class.
aug_transforms = AugmentationTransforms(512)
# The preprocessing pipeline does not depend on the image, so build it once.
tile_preprocessing = aug_transforms.get_preprocessing()

# Images whose mean top softmax score over all tiles is below this value are labelled 'Other'.
threshold = 0.7

submission = []
import shutil
import time
for _, row in test_df.iterrows():
    row = dict(row)

    # prepare data - cut and load tiles
    folder_tiles = extract_prune_tiles(
        os.path.join("/kaggle/input/UBC-OCEAN/", "test_images", f"{str(row['image_id'])}.png"),
        "./test_tiles/", size=2048, scale=0.25, max_samples = 10)

    dataset = CancerTilesDataset(folder_tiles, preprocessing = tile_preprocessing)

    if not len(dataset):
        print (f"seem no tiles were cut for `{folder_tiles}`")
        # Every submitted row needs a label; with nothing to predict on, fall
        # back to the same label used for low-confidence images.
        # NOTE(review): confirm 'Other' is the desired fallback here.
        row['label'] = 'Other'
        submission.append(row)
        shutil.rmtree(folder_tiles, ignore_errors=True)
        continue

    dataloader = DataLoader(dataset, batch_size=4, num_workers=4, shuffle=False)

    imagePredictions = []
    maxScores = []  # List to store the maximum scores of every tile

    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
        for imgs in dataloader:
            imgs = imgs.to(device)
            output = model(imgs)

            # Calculate the maximum softmax score for each prediction in the batch
            softmax_scores = torch.nn.functional.softmax(output, dim=1)
            max_score, preds = torch.max(softmax_scores, dim=1)
            maxScores.extend(max_score.cpu().numpy())

            imagePredictions.append(preds.cpu().numpy())

    # Check if the average max score of predictions is below the threshold
    avgMaxScore = np.mean(maxScores)
    if avgMaxScore < threshold:
        row['label'] = 'Other'  # Classify as 'Other'
    else:
        # If above threshold, use the most frequent prediction
        lb = np.argmax(np.bincount(np.concatenate(imagePredictions)))
        row['label'] = classes[lb]

    submission.append(row)

    # cleaning: remove this image's tiles without going through a shell
    shutil.rmtree(folder_tiles, ignore_errors=True)

processing: /kaggle/input/UBC-OCEAN/test_images/41.png


In [12]:
# Collect the per-image result rows into a frame and write only the two
# submission columns, without the index.
df_sub = pd.DataFrame(data=submission)
df_sub.loc[:, ["image_id", "label"]].to_csv("submission.csv", index=False)