This notebook quickly explores the data in the PANDA dataset and familiarises the reader with the processing functionality.

In [None]:
import os 
import sys
import math
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from torch.utils.data import DataLoader
from monai.data.image_reader import WSIReader
from monai.data import Dataset

# Put the parent of the current working directory on the import path;
# presumably that is where the project-local modules imported next live.
current_dir = Path.cwd()
parent_dir = str(current_dir.parent)
sys.path.append(parent_dir)

from panda_dataset import PandaDataset, LoadPandaROId
from utils.viz_utils import plot_panda_data_sample, load_image_dict

In [None]:
# Root folder of the PANDA dataset and the train.csv file inside it.
panda_dir = "/tmp/datasets/PANDA"
path_train_csv = f"{panda_dir}/train.csv"

### Train.csv

In [None]:
# Read train.csv into a DataFrame.
df_train_csv = pd.read_csv(path_train_csv)

`train.csv` is the dictionary file used to select which slides to use for training.
It contains some metadata but no actual image data.

In [None]:
# Peek at the first rows to see which columns train.csv provides.
df_train_csv.head()

In [None]:
# (number of rows, number of columns) of train.csv.
df_train_csv.shape

About 10k images in the training set (~90% of the total)

In [None]:
# Histogram of the 'isup_grade' column.
df_train_csv['isup_grade'].hist()

scores are a bit imbalanced

In [None]:
# Histogram of the 'data_provider' column.
df_train_csv['data_provider'].hist()

data providers are balanced

In [None]:
# This is the MONAI Dataset: what does it add with respect to the standard dataset?
# dataset = Dataset(panda_dataset)

### Dataset objects and `LoadPandaROId` (called inside `load_image_dict`)

In [None]:
# Build a dataset and wrap it in a DataLoader. Going through the loader
# avoids copying the get_item operations that go from a tiff path to the
# actual tiff.
panda_dataset = PandaDataset(root_dir=panda_dir, n_slides=1)
loader = DataLoader(panda_dataset, batch_size=1)

# `dict_image` is deliberately left in scope after the loop: the plotting
# cells that follow read dict_image['image'].
for dict_image in loader:
    print(dict_image)
    print(load_image_dict(dict_image, level=1, margin=64))

### Plot single slide

In [None]:
# One grayscale panel per channel of the loaded slide; staining/normalization
# seems different in the different channels.
fig, axes = plt.subplots(nrows=1, ncols=3)
for channel, ax in enumerate(axes):
    channel_pixels = dict_image['image'][channel]
    ax.imshow(channel_pixels, cmap='gray', clim=(0, 255))
fig.tight_layout()
plt.show()

In [None]:
# Move the first axis (indexed per channel in the previous cell) to the end
# so the image can be shown as a single colour picture.
image_channels_last = dict_image['image'].transpose(1, 2, 0)
plt.imshow(image_channels_last)
plt.show()

### Plot samples at different resolution level

In [None]:
# Call the project helper with nsamples=12, ncols=4 at level=1 and margin=64.
plot_panda_data_sample(panda_dir, nsamples=12, ncols=4, level=1, margin=64)

* Each slide looks very different and has a different shape!
* Shape doesn't seem to be connected with the provider.
* Staining is clearly connected with the provider - images from radboud are lighter.

In [None]:
# Same call as for level 1, now with level=2.
plot_panda_data_sample(panda_dir, nsamples=12, ncols=4, level=2, margin=64)

This will take a while to run! Level 0 is the highest resolution.

In [None]:
# Same call with level=0; the surrounding note says this is the highest
# resolution and slow to run.
plot_panda_data_sample(panda_dir, nsamples=12, ncols=4, level=0, margin=64)

We should focus on a single image to see how the level of detail changes with the resolution.

### Pixel distribution (without clipping)

In [None]:
# Per-channel pixel histograms: one row per slide, one column per channel,
# with the y-axis left unclipped.
n_slides = 12          # rows in the figure; also passed to PandaDataset
ncols = 3              # one column per image channel
sample_fraction = 0.5  # share of each channel's pixels drawn for the histogram
rng = np.random.default_rng(0)  # fixed seed so a rerun samples the same pixels

# NOTE(review): the single-slide cell earlier passes `n_slides=` to
# PandaDataset while this cell passes `nrows=` - confirm which keyword the
# class actually accepts.
panda_dataset = PandaDataset(root_dir=panda_dir, nrows=n_slides)
loader = DataLoader(panda_dataset, batch_size=1)

fig, axes = plt.subplots(ncols=ncols, nrows=n_slides, figsize=(7, 18))
for i, dict_images in enumerate(loader):
    if i >= n_slides:
        # The axes grid only has n_slides rows; stop rather than index past it.
        break
    slide_id = dict_images['image_id']
    print(f">>> Slide {slide_id}")
    # Assumes load_image_dict stores the loaded pixels under
    # dict_images['image'], which is how the single-slide plotting cells use
    # it - TODO confirm.
    load_image_dict(dict_images, level=1, margin=64)
    for ch in range(ncols):
        # Read from the slide loaded in THIS iteration. The previous code read
        # the stale `dict_image` left over from an earlier cell, so every row
        # showed the same slide.
        channel_pixels = dict_images['image'][ch].flatten()
        n_sampled = int(len(channel_pixels) * sample_fraction)
        axes[i, ch].hist(rng.choice(channel_pixels, size=n_sampled, replace=False), bins=100)
fig.tight_layout()

distributions don't seem wildly different, good. This is a small sample

### Pixel distribution (with clipping)

In [None]:
# Per-channel pixel histograms: one row per slide, one column per channel,
# with the y-axis clipped so the tallest bins do not hide the rest.
n_slides = 12          # rows in the figure; also passed to PandaDataset
ncols = 3              # one column per image channel
sample_fraction = 0.5  # share of each channel's pixels drawn for the histogram
y_max = 50000          # upper y-limit applied to every histogram
rng = np.random.default_rng(0)  # fixed seed so a rerun samples the same pixels

# NOTE(review): the single-slide cell earlier passes `n_slides=` to
# PandaDataset while this cell passes `nrows=` - confirm which keyword the
# class actually accepts.
panda_dataset = PandaDataset(root_dir=panda_dir, nrows=n_slides)
loader = DataLoader(panda_dataset, batch_size=1)

fig, axes = plt.subplots(ncols=ncols, nrows=n_slides, figsize=(7, 18))
for i, dict_images in enumerate(loader):
    if i >= n_slides:
        # The axes grid only has n_slides rows; stop rather than index past it.
        break
    slide_id = dict_images['image_id']
    print(f">>> Slide {slide_id}")
    # Assumes load_image_dict stores the loaded pixels under
    # dict_images['image'], which is how the single-slide plotting cells use
    # it - TODO confirm.
    load_image_dict(dict_images, level=1, margin=64)
    for ch in range(ncols):
        # Read from the slide loaded in THIS iteration. The previous code read
        # the stale `dict_image` left over from an earlier cell, so every row
        # showed the same slide.
        channel_pixels = dict_images['image'][ch].flatten()
        n_sampled = int(len(channel_pixels) * sample_fraction)
        axes[i, ch].hist(rng.choice(channel_pixels, size=n_sampled, replace=False), bins=100)
        axes[i, ch].set_ylim([0, y_max])
fig.tight_layout()

Note: the background is not always exactly 0