# Classification Tasks

## BACH

In [1]:
import copy
from pathlib import Path

import h5py
import numpy as np
import torch
from numpy.random import default_rng
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torch.utils.data import WeightedRandomSampler
from torch.utils.data import random_split
from torch.utils.data.sampler import SubsetRandomSampler
from torchvision import transforms
from torchvision.datasets import ImageFolder, MNIST
from torchvision.datasets.utils import download_url, download_and_extract_archive

In [73]:
# Machine-specific location of the extracted BACH photos (one sub-folder per class is
# what ImageFolder expects). Adjust this path when running on another machine.
p = Path('/home/niklas/Internal_HDD/project_data/histopathology/BACH/ICIAR2018_BACH_Challenge/Photos')

In [3]:
mean, std = torch.tensor([0.7169, 0.6170, 0.8427]), torch.tensor([0.1661, 0.1885, 0.1182]) # calculated over dataset

In [138]:
class Bach:
    """Train/validation wrapper around the BACH (ICIAR 2018) photo folder.

    The same image folder is loaded twice (once per transform) and the split is
    realised with two disjoint ``SubsetRandomSampler`` objects.

    Args:
        root: Directory containing one sub-folder per class (``str`` or ``Path``).
            When ``download`` is true this is the download target and the path is
            extended to the extracted ``Photos`` folder afterwards.
        train_transform: Transform applied to samples drawn for training.
        valid_transform: Transform applied to samples drawn for validation.
        download: Download and extract the archive into ``root`` first.
        valid_percent: Fraction of samples held out for validation, in ``[0, 1]``.
        shuffle: Shuffle indices (fixed seed) before splitting.

    Raises:
        ValueError: If ``valid_percent`` is outside ``[0, 1]``.
    """

    def __init__(self, root, train_transform, valid_transform, download=False, valid_percent=0.2, shuffle=True):
        if not 0 <= valid_percent <= 1:
            raise ValueError(f'valid_percent must be in [0, 1], got {valid_percent}')

        # Accept plain strings as well: the `/` joins below require a Path.
        self.root = Path(root)
        self.train_transform = train_transform
        self.valid_transform = valid_transform
        self.valid_percent = valid_percent
        self.shuffle = shuffle

        if download:
            self.download_data()
            self.root = self.root/'ICIAR2018_BACH_Challenge/Photos' # extend root directory to point to images

        self.train_ds, self.valid_ds, self.train_sampler, self.valid_sampler = self.prepare_datasets()

    def download_data(self):
        """Download the BACH archive from Zenodo and extract it into ``self.root``."""
        url = 'https://zenodo.org/record/3632035/files/ICIAR2018_BACH_Challenge.zip'
        download_and_extract_archive(url, self.root)

    @staticmethod
    def _split_indices(num_samples, valid_percent, shuffle=True, seed=101):
        """Return ``(train_idx, valid_idx)`` for ``num_samples`` items.

        The first ``floor(valid_percent * num_samples)`` indices (after the optional
        seeded shuffle) become the validation set; the rest is used for training.
        """
        indices = list(range(num_samples))
        split = int(np.floor(valid_percent * num_samples))

        if shuffle:
            # Fixed seed keeps the split identical across runs.
            default_rng(seed=seed).shuffle(indices)

        return indices[split:], indices[:split]

    def prepare_datasets(self):
        """Build both dataset views and the samplers that implement the split.

        Returns:
            ``(train_ds, valid_ds, train_sampler, valid_sampler)``.
        """
        train_ds = ImageFolder(self.root, self.train_transform)
        valid_ds = ImageFolder(self.root, self.valid_transform)

        train_idx, valid_idx = self._split_indices(len(train_ds), self.valid_percent, self.shuffle)

        train_sampler = SubsetRandomSampler(train_idx)
        valid_sampler = SubsetRandomSampler(valid_idx)

        return train_ds, valid_ds, train_sampler, valid_sampler

    def get_dataloaders(self, batch_size, shuffle=True, pin_memory=True, num_workers=0):
        """Create the training and validation ``DataLoader`` objects.

        Args:
            batch_size: Samples per batch.
            shuffle: Accepted for signature compatibility but not used: ordering is
                determined by the subset samplers built in ``prepare_datasets``.
            pin_memory: Forwarded to ``DataLoader``.
            num_workers: Forwarded to ``DataLoader``.

        Returns:
            ``(train_dl, valid_dl)``.
        """
        train_dl = DataLoader(
            self.train_ds, batch_size=batch_size, sampler=self.train_sampler,
            num_workers=num_workers, pin_memory=pin_memory
        )

        valid_dl = DataLoader(
            self.valid_ds, batch_size=batch_size, sampler=self.valid_sampler,
            num_workers=num_workers, pin_memory=pin_memory
        )

        return train_dl, valid_dl

In [75]:
# Training pipeline: random rotation, random resized crop to 224 px and random
# horizontal flip as augmentation, then conversion to a tensor.
# NOTE(review): no transforms.Normalize step is applied here although per-channel
# mean/std are defined in an earlier cell - confirm whether that is intended.
train_transform = transforms.Compose([transforms.RandomRotation(30),
                                       transforms.RandomResizedCrop(224),
                                       transforms.RandomHorizontalFlip(),
                                       transforms.ToTensor()])

# Validation pipeline: deterministic resize + centre crop to 224 px, then tensor.
valid_transform = transforms.Compose([transforms.Resize(255),
                                      transforms.CenterCrop(224),
                                      transforms.ToTensor()])

In [76]:
bach = Bach(p, train_transform, valid_transform)

In [77]:
train_dl, valid_dl = bach.get_dataloaders(32)

In [79]:
x,y = next(iter(train_dl))

In [586]:
def get_mean_std(loader):
    """Compute per-channel mean and standard deviation over everything a loader yields.

    Uses ``var[X] = E[X**2] - E[X]**2`` with sums accumulated over *pixels*, so a
    smaller final batch (or batches with different spatial sizes) is weighted
    correctly instead of counting as much as a full batch.

    Args:
        loader: Sized iterable of ``(data, target)`` pairs. ``data`` is reduced over
            dims 0, 2 and 3, i.e. it is indexed as a 4-D batch with channels in dim 1.

    Returns:
        ``(mean, std)`` tensors with one entry per channel. Floating-point inputs
        get results in their own dtype; otherwise float64 is returned.

    Raises:
        ValueError: If the loader yields no data.
    """
    channels_sum, channels_sqrd_sum, num_pixels = 0, 0, 0
    out_dtype = None
    num_batches = len(loader)

    for i, (data, _) in enumerate(loader):
        print(f'{i+1}/{num_batches}', ' '*100, end='\r')
        if out_dtype is None:
            out_dtype = data.dtype

        # Accumulate in float64: summing millions of float32 pixels loses precision.
        data64 = data.double()
        channels_sum += data64.sum(dim=[0, 2, 3])
        channels_sqrd_sum += (data64 ** 2).sum(dim=[0, 2, 3])
        num_pixels += data.shape[0] * data.shape[2] * data.shape[3]

    if num_pixels == 0:
        raise ValueError('loader yielded no data; cannot compute mean/std')

    mean = channels_sum / num_pixels
    # Rounding can push the variance marginally below zero; clamp to avoid NaN.
    var = (channels_sqrd_sum / num_pixels - mean ** 2).clamp(min=0)
    std = var.sqrt()

    if out_dtype.is_floating_point:
        mean, std = mean.to(out_dtype), std.to(out_dtype)

    return mean, std


#mean, std = get_mean_std(dl)
#print(mean)
#print(std)

## PatchCamelyon

In [52]:
class PatchCamelyonDataset(Dataset):
    """Map-style dataset over one PatchCamelyon split stored as two HDF5 files.

    The files are opened lazily on first item access rather than in ``__init__``.
    Open HDF5 handles cannot be pickled, so keeping them out of the constructed
    object (and out of its pickled state) lets each ``DataLoader`` worker open its
    own handle. Call ``close()`` to release the handles explicitly.

    Args:
        root: Directory holding ``camelyonpatch_level_2_split_{mode}_{x,y}.h5``
            (``str`` or ``Path``).
        transform: Callable applied to each image array.
        mode: One of ``'train'``, ``'valid'`` or ``'test'``.
    """

    def __init__(self, root, transform, mode='train'):
        super().__init__()

        assert mode in ['train', 'valid', 'test'], f'unknown mode: {mode!r}'

        self.root = Path(root)  # accept plain strings too
        self.transform = transform
        self.mode = mode

        self._x_path = self.root/f'camelyonpatch_level_2_split_{mode}_x.h5'
        self._y_path = self.root/f'camelyonpatch_level_2_split_{mode}_y.h5'
        self._x_file = self._y_file = None
        self._X = self._y = None

        # Read the length once and close the file again straight away.
        with h5py.File(self._x_path, 'r') as f:
            self._length = len(f['x'])

    def _open(self):
        """Open both HDF5 files read-only and cache the 'x' / 'y' datasets."""
        self._x_file = h5py.File(self._x_path, 'r')
        self._y_file = h5py.File(self._y_path, 'r')
        self._X = self._x_file['x']
        self._y = self._y_file['y']

    @property
    def X(self):
        """The image dataset; opens the files on first access."""
        if self._X is None:
            self._open()
        return self._X

    @property
    def y(self):
        """The label dataset; opens the files on first access."""
        if self._y is None:
            self._open()
        return self._y

    def close(self):
        """Close any open HDF5 handles; they are reopened on the next access."""
        for handle in (self._x_file, self._y_file):
            if handle is not None:
                handle.close()
        self._x_file = self._y_file = None
        self._X = self._y = None

    def __getstate__(self):
        # Drop the open handles so the dataset can be pickled for worker processes.
        state = self.__dict__.copy()
        for key in ('_x_file', '_y_file', '_X', '_y'):
            state[key] = None
        return state

    def __getitem__(self, idx):
        x, y = self.X[idx], self.y[idx]
        # The label is stored as a single-element array; unwrap it to a Python scalar.
        x, y = self.transform(x), y.item()
        return x, y

    def __len__(self):
        return self._length

In [115]:
class PatchCamelyon:
    """Bundles the train / valid / test PatchCamelyon datasets and their loaders.

    Args:
        root: Directory holding (or receiving, when ``download`` is true) the files.
        train_transform: Transform used for the training split.
        valid_transform: Transform used for both the validation and test splits.
        download: Fetch all files from Zenodo before building the datasets.
    """

    def __init__(self, root, train_transform, valid_transform, download=False):
        self.root = root
        self.train_transform = train_transform
        self.valid_transform = valid_transform

        if download:
            self.download_data()

        datasets = self.prepare_datasets()
        self.train_ds, self.valid_ds, self.test_ds = datasets

    def download_data(self):
        """Download the metadata CSV and the gzipped x/y archives of every split."""
        base_url = 'https://zenodo.org/record/2546921/files/'
        for mode in ('train', 'valid', 'test'):
            meta_url = base_url + f'camelyonpatch_level_2_split_{mode}_meta.csv'
            download_url(meta_url, self.root)
            for xy in ('x', 'y'):
                archive_url = base_url + f'camelyonpatch_level_2_split_{mode}_{xy}.h5.gz'
                download_and_extract_archive(archive_url, self.root)

    def prepare_datasets(self):
        """Return ``(train_ds, valid_ds, test_ds)``; test reuses the validation transform."""
        transform_for = {
            'train': self.train_transform,
            'valid': self.valid_transform,
            'test': self.valid_transform,
        }
        return tuple(
            PatchCamelyonDataset(self.root, transform=tf, mode=split)
            for split, tf in transform_for.items()
        )

    def get_dataloaders(self, batch_size, shuffle=True, pin_memory=True, num_workers=0):
        """Return ``(train_dl, valid_dl, test_dl)``; only the training loader honours ``shuffle``."""
        common = dict(batch_size=batch_size, num_workers=num_workers, pin_memory=pin_memory)

        train_dl = DataLoader(self.train_ds, shuffle=shuffle, **common)
        valid_dl = DataLoader(self.valid_ds, **common)
        test_dl = DataLoader(self.test_ds, **common)

        return train_dl, valid_dl, test_dl

In [116]:
# Minimal pipeline: only convert to a tensor, no resizing or augmentation.
# The same transform is passed for both training and validation below.
tsfm = transforms.Compose([
    transforms.ToTensor()
])

In [117]:
root = Path('/home/niklas/Internal_HDD/project_data/histopathology/pcam/')

In [118]:
pcam = PatchCamelyon(root, train_transform=tsfm, valid_transform=tsfm, download=False)

In [129]:
train_dl, valid_dl, test_dl = pcam.get_dataloaders(128)

In [125]:
x,y = next(iter(test_dl))

In [124]:
x.shape

torch.Size([32, 3, 96, 96])

In [131]:
mean, std = get_mean_std(train_dl)
print(mean)
print(std)

tensor([0.7008, 0.5384, 0.6916])
tensor([0.2350, 0.2774, 0.2129])


## NCT-CRC-HE-100K

In [194]:
root = Path('/home/niklas/Internal_HDD/project_data/histopathology/NCT-CRC-HE-100K/')

In [195]:
class NctCrcHe100K:
    """Wrapper for the NCT-CRC-HE-100K training set and the CRC-VAL-HE-7K validation set.

    Args:
        root: Directory that contains (or will receive) the extracted dataset
            folders (``str`` or ``Path``).
        train_transform: Transform applied to training images.
        valid_transform: Transform applied to validation images.
        download: Download and extract all archives into ``root`` first.
        color_norm: Train on ``NCT-CRC-HE-100K`` when true, otherwise on
            ``NCT-CRC-HE-100K-NONORM``.
    """

    def __init__(self, root, train_transform, valid_transform, download=False, color_norm=True):

        # Accept plain strings as well: prepare_datasets joins sub-folders with `/`.
        self.root = Path(root)
        self.train_transform = train_transform
        self.valid_transform = valid_transform
        self.color_norm = color_norm

        if download:
            self.download_data()

        self.train_ds, self.valid_ds = self.prepare_datasets()


    def download_data(self):
        """Download and extract both training variants and the validation set.

        Both training archives are fetched regardless of ``color_norm``.
        """
        base_url = 'https://zenodo.org/record/1214456/files/'
        download_and_extract_archive(base_url + 'NCT-CRC-HE-100K.zip', self.root)
        download_and_extract_archive(base_url + 'NCT-CRC-HE-100K-NONORM.zip', self.root)
        download_and_extract_archive(base_url + 'CRC-VAL-HE-7K.zip', self.root)


    def prepare_datasets(self):
        """Return ``(train_ds, valid_ds)`` as ``ImageFolder`` datasets under ``root``."""
        train_dir = 'NCT-CRC-HE-100K' if self.color_norm else 'NCT-CRC-HE-100K-NONORM'

        train_ds = ImageFolder(self.root/train_dir, self.train_transform)
        valid_ds = ImageFolder(self.root/'CRC-VAL-HE-7K', self.valid_transform)

        return train_ds, valid_ds


    def get_dataloaders(self, batch_size, shuffle=True, pin_memory=True, num_workers=0):
        """Return ``(train_dl, valid_dl)``; ``shuffle`` only affects the training loader."""
        train_dl = DataLoader(
            self.train_ds, batch_size=batch_size, shuffle=shuffle,
            num_workers=num_workers, pin_memory=pin_memory
        )

        valid_dl = DataLoader(
            self.valid_ds, batch_size=batch_size,
            num_workers=num_workers, pin_memory=pin_memory
        )

        return train_dl, valid_dl

In [196]:
nct = NctCrcHe100K(root, tsfm, tsfm, download=False, color_norm=False)

In [191]:
train_dl, valid_dl = nct.get_dataloaders(128)

In [181]:
x,y = next(iter(train_dl))

In [182]:
x.shape

torch.Size([128, 3, 224, 224])

In [166]:
x,y = next(iter(valid_dl))

In [197]:
mean, std = get_mean_std(train_dl)
print(mean)
print(std)

tensor([0.7358, 0.5804, 0.7012])                                                                            
tensor([0.2262, 0.2860, 0.2300])


## BreaKHis

In [577]:
class BreakHis:
    """Train / valid / test wrapper for the BreaKHis image folders.

    Two labelling schemes are supported:

    * ``'tumor_class'`` - labels come from the top-level folders under ``root``.
    * ``'tumor_type'``  - labels come from the sub-folders of ``benign/SOB`` and
      ``malignant/SOB``; malignant labels are offset by the number of benign
      classes so that both groups share one label space.

    Each split gets a ``WeightedRandomSampler`` that draws with weights inversely
    proportional to class frequency. NOTE(review): this also applies to the
    validation and test loaders, so evaluation sees a resampled (with replacement)
    view of those splits - confirm this is intended.

    Args:
        root: Directory containing the ``benign`` / ``malignant`` folders
            (``str`` or ``Path``). With ``download`` it is the download target.
        train_transform: Transform for the training split.
        valid_transform: Transform for the validation and test splits.
        label: ``'tumor_class'`` or ``'tumor_type'``.
        download: Download and extract the archive first.
        valid_percent: Fraction of samples used for validation; the test split has
            the same size. Must satisfy ``0 < valid_percent < 0.5``.
        shuffle: Shuffle (and stratify by label) when splitting. Without shuffling
            the split is sequential and cannot be stratified.
        seed: Optional random state for the split; ``None`` keeps it unseeded.

    Raises:
        ValueError: If ``valid_percent`` is outside ``(0, 0.5)``.
    """

    def __init__(self, root, train_transform, valid_transform, label='tumor_class', download=False, valid_percent=0.2, shuffle=True, seed=None):

        assert label in ['tumor_class', 'tumor_type'], f'unknown label: {label!r}'
        if not 0 < valid_percent < 0.5:
            raise ValueError(f'valid_percent must be in (0, 0.5), got {valid_percent}')

        self.root = Path(root)  # accept plain strings too; sub-folders are joined with `/`
        self.label = label
        self.train_transform = train_transform
        self.valid_transform = valid_transform
        self.valid_percent = valid_percent
        self.shuffle = shuffle
        self.seed = seed

        if download:
            self.download_data()

        self.ds_train, self.ds_valid, self.ds_test, self.sampler_train, self.sampler_valid, self.sampler_test = self.prepare_datasets()

    def download_data(self):
        """Download and extract the archive, then point ``root`` at the image folders."""
        url = 'http://www.inf.ufpr.br/vri/databases/BreaKHis_v1.tar.gz'
        download_and_extract_archive(url, self.root)
        self.root = self.root/'BreaKHis_v1/histology_slides/breast' # extend root directory to point to images

    def prepare_datasets(self):
        """Build the three splits and one balanced sampler per split.

        Returns:
            ``(ds_train, ds_valid, ds_test, sampler_train, sampler_valid, sampler_test)``.
        """
        split_transforms = (self.train_transform, self.valid_transform, self.valid_transform)

        if self.label == 'tumor_type':

            # multiclass classification
            benign_types = self.root/'benign/SOB'
            malignant_types = self.root/'malignant/SOB'

            # instantiate copies of dataset (one per split, each with its own transform)
            ds_b_train, ds_b_valid, ds_b_test = [ImageFolder(benign_types, t) for t in split_transforms]
            ds_m_train, ds_m_valid, ds_m_test = [ImageFolder(malignant_types, t) for t in split_transforms]

            # Shift malignant labels past the benign ones (4 for the standard layout).
            offset = len(ds_b_train.classes)
            for ds_m in [ds_m_train, ds_m_valid, ds_m_test]:
                img_paths, labels = list(zip(*ds_m.samples))
                labels = [label + offset for label in labels]
                # Update the dataset being iterated, not always the training copy.
                ds_m.targets = labels
                ds_m.samples = list(zip(img_paths, labels))

            ds_b_train, ds_b_valid, ds_b_test = self._stratified_split(ds_b_train, ds_b_valid, ds_b_test)
            ds_m_train, ds_m_valid, ds_m_test = self._stratified_split(ds_m_train, ds_m_valid, ds_m_test)

            ds_train = torch.utils.data.ConcatDataset([ds_b_train, ds_m_train])
            ds_valid = torch.utils.data.ConcatDataset([ds_b_valid, ds_m_valid])
            ds_test  = torch.utils.data.ConcatDataset([ds_b_test,  ds_m_test])

            # ConcatDataset has no `targets`; attach them for the balanced sampler.
            ds_train.targets = list(ds_b_train.targets) + list(ds_m_train.targets)
            ds_valid.targets = list(ds_b_valid.targets) + list(ds_m_valid.targets)
            ds_test.targets  = list(ds_b_test.targets)  + list(ds_m_test.targets)

        else:

            # binary classification
            ds_train, ds_valid, ds_test = [ImageFolder(self.root, t) for t in split_transforms]
            ds_train, ds_valid, ds_test = self._stratified_split(ds_train, ds_valid, ds_test)


        sampler_train = self._get_balanced_sampler(ds_train)
        sampler_valid = self._get_balanced_sampler(ds_valid)
        sampler_test  = self._get_balanced_sampler(ds_test)

        return ds_train, ds_valid, ds_test, sampler_train, sampler_valid, sampler_test


    def _stratified_split(self, ds_train, ds_valid, ds_test, classes=None):
        """Partition the samples of ``ds_train`` across the three dataset copies.

        All three datasets are expected to start with the same samples. Their
        ``samples`` and ``targets`` are overwritten in place with disjoint subsets.

        Args:
            ds_train, ds_valid, ds_test: Dataset copies differing only in transform.
            classes: Unused; kept so existing positional calls keep working.

        Returns:
            The same three datasets after the update.
        """
        X, y = list(zip(*ds_train.samples))

        # Stratify on the real labels; stratification requires shuffling.
        split_kwargs = dict(shuffle=self.shuffle, random_state=self.seed)

        # valid + test together take 2 * valid_percent of the data ...
        X_train, X_hold, y_train, y_hold = train_test_split(
            X, y, test_size=2 * self.valid_percent,
            stratify=y if self.shuffle else None, **split_kwargs
        )

        # ... and are then split into two equally sized halves.
        X_valid, X_test, y_valid, y_test = train_test_split(
            X_hold, y_hold, test_size=0.5,
            stratify=y_hold if self.shuffle else None, **split_kwargs
        )

        # update dataset samples and targets
        for ds, paths, labels in (
            (ds_train, X_train, y_train),
            (ds_valid, X_valid, y_valid),
            (ds_test,  X_test,  y_test),
        ):
            ds.samples = list(zip(paths, labels))
            ds.targets = list(labels)

        return ds_train, ds_valid, ds_test


    def _get_balanced_sampler(self, ds):
        """Return a ``WeightedRandomSampler`` weighting samples by inverse class frequency.

        Weights are looked up by label *value*, so a subset in which some class id
        is missing (e.g. labels ``{0, 2}``) is handled correctly.

        Args:
            ds: Sized dataset exposing a ``targets`` sequence of integer labels.
        """
        targets = np.asarray(ds.targets)
        num_samples = len(ds)

        class_values, class_counts = np.unique(targets, return_counts=True)
        weight_of = {int(c): num_samples / int(n) for c, n in zip(class_values, class_counts)}

        weights = [weight_of[int(t)] for t in targets]
        sampler = WeightedRandomSampler(torch.tensor(weights, dtype=torch.double), num_samples)

        return sampler


    def get_dataloaders(self, batch_size, pin_memory=True, num_workers=0):
        """Return ``(train_dl, valid_dl, test_dl)``, each driven by its balanced sampler."""
        train_dl = DataLoader(
            self.ds_train, batch_size=batch_size, sampler=self.sampler_train,
            num_workers=num_workers, pin_memory=pin_memory
        )

        valid_dl = DataLoader(
            self.ds_valid, batch_size=batch_size, sampler=self.sampler_valid,
            num_workers=num_workers, pin_memory=pin_memory
        )

        test_dl = DataLoader(
            self.ds_test, batch_size=batch_size, sampler=self.sampler_test,
            num_workers=num_workers, pin_memory=pin_memory
        )

        return train_dl, valid_dl, test_dl

In [419]:
# One-off manual download of the BreaKHis archive into a machine-specific folder.
# The same URL is used by BreakHis.download_data, which is what BreakHis(..., download=True) calls.
root = Path('/home/niklas/Internal_HDD/project_data/histopathology/BreakHis/')
url = 'http://www.inf.ufpr.br/vri/databases/BreaKHis_v1.tar.gz'
download_and_extract_archive(url, root)

Using downloaded and verified file: /home/niklas/Internal_HDD/project_data/histopathology/BreakHis/BreaKHis_v1.tar.gz
Extracting /home/niklas/Internal_HDD/project_data/histopathology/BreakHis/BreaKHis_v1.tar.gz to /home/niklas/Internal_HDD/project_data/histopathology/BreakHis


In [578]:
p = Path('/home/niklas/Internal_HDD/project_data/histopathology/BreakHis/BreaKHis_v1/histology_slides/breast/')

In [583]:
# Resize every image to 512x512 and convert to a tensor; the same transform is
# used for all splits. `label` is left at its default ('tumor_class').
tsfm = transforms.Compose([transforms.Resize((512, 512)), transforms.ToTensor()])
breakhis = BreakHis(p, train_transform=tsfm, valid_transform=tsfm)

In [584]:
train_dl, valid_dl, test_dl = breakhis.get_dataloaders(batch_size=32)

In [588]:
ds = ImageFolder(p, tsfm)

In [590]:
dl = DataLoader(ds, batch_size=32)

In [591]:
x,y = next(iter(dl))

In [595]:
mean, std = get_mean_std(dl)

248/248                                                                                                     

In [596]:
mean

tensor([0.7871, 0.6265, 0.7644])

In [597]:
std

tensor([0.1279, 0.1786, 0.1127])