In [58]:
import os
import random

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from medpy.io import load
from scipy.ndimage import zoom
from torchvision import transforms
from torch.utils.data import random_split

In [63]:
class config:
    """Notebook-wide settings: data locations, hyperparameters and RNG seeding.

    The class is used as a plain namespace (never instantiated). Its body runs
    once, when the cell is executed, so the seeding calls at the bottom take
    effect at that moment.
    """
    seed = 1717

    base_path = "./hubmap-organ-segmentation"
    #base_path = "../input/hubmap-organ-segmentation"  # UNCOMMENT FOR KAGGLE
    batch_size = 4

    # Fraction of the training set held out for validation.
    val_size = 0.25

    epoch = 30
    lr = 1e-3
    device = "cpu"

    # Multiplier applied to the in-plane voxel spacing by BaselineDataset.
    rescaling_factor = 3

    # Everything below is derived from base_path.
    train_metadata = os.path.join(base_path, "train.csv")
    test_metadata = os.path.join(base_path, "test.csv")
    train_images = os.path.join(base_path, "train_images")
    test_images = os.path.join(base_path, "test_images")

    # Seed every global RNG the notebook imports. NumPy is included so that
    # NumPy-based randomness is reproducible alongside `random` and torch.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

In [3]:
# https://www.kaggle.com/paulorzp/rle-functions-run-length-encode-decode
def mask2rle(img):
    '''
    Run-length encode a binary mask.

    img: numpy array, 1 - mask, 0 - background
    Returns the encoding as a space-separated string of "start length"
    pairs, with 1-based starts counted in column-major pixel order
    '''
    # Transposing before flattening walks the image column by column.
    column_major = img.T.flatten()
    # Zero sentinels on both ends, so runs touching a border still get
    # both a start and an end boundary.
    padded = np.concatenate([[0], column_major, [0]])
    # 1-based positions where the value flips: run starts alternate with run ends.
    boundaries = np.where(padded[:-1] != padded[1:])[0] + 1
    # Turn every run end into a run length.
    boundaries[1::2] -= boundaries[::2]
    return ' '.join(map(str, boundaries))
 
def rle2mask(mask_rle, shape=(1600,256)):
    '''
    Decode a run-length string into a binary mask.

    mask_rle: run-length as string formatted (start length)
    shape: (width,height) of the mask; the returned array is the transpose,
           i.e. it has shape (height, width)
    Returns numpy array, 1 - mask, 0 - background
    '''
    tokens = mask_rle.split()
    # Even-position tokens are 1-based run starts, odd-position tokens are run lengths.
    starts = np.asarray(tokens[::2], dtype=int) - 1
    ends = starts + np.asarray(tokens[1::2], dtype=int)
    flat = np.zeros(shape[0]*shape[1], dtype=np.uint8)
    for begin, end in zip(starts, ends):
        flat[begin:end] = 1
    # The flat buffer is filled in column-major pixel order, hence the
    # (width, height) reshape followed by a transpose.
    return flat.reshape(shape).T

In [52]:
class BaselineDataset(Dataset):
    """Map-style dataset yielding ``{"image": ..., "mask": ...}`` samples.

    Images are read from ``root_dir`` with ``load`` on every access; masks are
    decoded from the ``rle`` column of ``metadata`` once, at construction time.

    Args:
        root_dir: directory of image files named ``<id>.<ext>``, where ``<id>``
            is the integer found in the ``id`` column of ``metadata``.
        metadata: DataFrame with the columns ``id``, ``pixel_size`` and
            ``tissue_thickness``. When a row also carries a string ``rle``
            value, ``img_width`` and ``img_height`` are read to decode it.
        transform: optional callable applied to the image and, when one
            exists, to the mask.
        rescaling_factor: multiplier applied to the 0.171 in-plane spacing
            stored in ``new_voxel_spacing``.

    Raises:
        ValueError: if a file name in ``root_dir`` does not start with an integer id.
        KeyError: if ``metadata`` references an id with no file in ``root_dir``.
    """

    def __init__(
        self, root_dir, metadata, transform=None,
        rescaling_factor = config.rescaling_factor
    ):
        self.root_dir = root_dir
        self.transform = transform

        self.new_voxel_spacing = np.array(
            [
                3,
                0.171 * rescaling_factor,
                0.171 * rescaling_factor
            ]
        )

        # os.listdir returns entries in arbitrary order; sorting makes the
        # idx -> file mapping (and any seeded split built on it) deterministic.
        self.idx2name = {
            idx: os.path.join(self.root_dir, file_name)
            for idx, file_name in enumerate(sorted(os.listdir(self.root_dir)))
        }
        self.name2idx = {
            self._image_id(path): idx
            for idx, path in self.idx2name.items()
        }

        self.idx2meta = {}
        for _, row in metadata.iterrows():
            # Only a real string is decoded: a missing column yields None and
            # an empty CSV cell yields NaN, neither of which is a valid RLE.
            rle = row.get("rle")
            mask = None
            if isinstance(rle, str):
                mask = rle2mask(rle, (row["img_width"], row["img_height"]))
            self.idx2meta[self.name2idx[row["id"]]] = {
                "mask": mask,
                "pixel_size": row["pixel_size"],
                "tissue_thickness": row["tissue_thickness"],
            }

    @staticmethod
    def _image_id(path):
        """Return the integer id encoded in the file name of ``path``.

        Only the base name is inspected, so dots or slashes in the directory
        part (for example a ``../input/...`` root) cannot confuse the parse.
        """
        return int(os.path.basename(path).split(".")[0])

    def get_voxel_size(self, idx):
        """Return the voxel spacing reported by the header of image ``idx``."""
        # load() returns an (array, header) pair; only the header is needed here.
        image = load(self.idx2name[idx])
        return np.array(image[1].get_voxel_spacing())

    def __len__(self):
        """Return the number of files found in ``root_dir``."""
        return len(self.idx2name)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``{"image": ..., "mask": ...}``.

        ``mask`` is ``None`` when the metadata row had no RLE string.
        """
        image = load(self.idx2name[idx])
        mask = self.idx2meta[idx]["mask"]
        #TODO Use pixel_size and tissue_thickness here

        #TODO Scale image
        #image_spacing = np.array([1, *image[1].get_voxel_spacing()])
        #image_scale_factor = image_spacing / self.new_voxel_spacing
        #image = zoom(image[0], image_scale_factor, order=3, cval=0)
        #mask = zoom(mask, image_scale_factor, order=3, cval=0)

        ans = {"image": image[0], "mask": mask}
        if self.transform:
            ans["image"] = self.transform(ans["image"])
            # Unlabelled samples have no mask; the transform is not called on None.
            # NOTE(review): such samples still carry "mask": None - confirm the
            # collate function used downstream accepts that.
            if mask is not None:
                ans["mask"] = self.transform(mask)

        return ans

In [53]:
# Read the train and test metadata tables from the CSV paths declared in config.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (config.train_metadata, config.test_metadata)
)
# Preview the first rows of the training metadata.
df_train.head()

Unnamed: 0,id,organ,data_source,img_height,img_width,pixel_size,tissue_thickness,rle,age,sex
0,10044,prostate,HPA,3000,3000,0.4,4,1459676 77 1462675 82 1465674 87 1468673 92 14...,37.0,Male
1,10274,prostate,HPA,3000,3000,0.4,4,715707 2 718705 8 721703 11 724701 18 727692 3...,76.0,Male
2,10392,spleen,HPA,3000,3000,0.4,4,1228631 20 1231629 24 1234624 40 1237623 47 12...,82.0,Male
3,10488,lung,HPA,3000,3000,0.4,4,3446519 15 3449517 17 3452514 20 3455510 24 34...,78.0,Male
4,10610,spleen,HPA,3000,3000,0.4,4,478925 68 481909 87 484893 105 487863 154 4908...,21.0,Female


In [54]:
# Transform pipeline passed to the datasets; currently a single ToTensor step.
image_transforms = transforms.Compose([transforms.ToTensor()])

In [74]:
# Build the full labelled dataset, then carve a validation subset out of it.
full_train_dataset = BaselineDataset(
    config.train_images, df_train, transform=image_transforms
)
n_val = round(len(full_train_dataset) * config.val_size)
n_train = len(full_train_dataset) - n_val

# A dedicated generator with a fixed seed drives the split.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(
    full_train_dataset, [n_train, n_val], generator=split_generator
)

test_dataset = BaselineDataset(
    config.test_images, df_test, transform=image_transforms
)

In [75]:
# All loaders share the batch size from config; the training and validation
# loaders shuffle, the test loader keeps dataset order.
loader_options = {"batch_size": config.batch_size}

train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_dataloader = DataLoader(val_dataset, shuffle=True, **loader_options)
test_dataloader = DataLoader(test_dataset, shuffle=False, **loader_options)