In [1]:
##########################################################################################################
# Author: Mihaly Sulyok & Peter Karacsonyi                                                               #
# Last updated: 2023 Dec 4                                                                               #
# This workbook does wsi preprocessing with Pathml version 2.1.1                                         #
# Input: wsi image files, tested with OpenSlide .tif                                                     #
# Transformations: MedianBlur, BinaryThreshold, Tiling, MorphOpen, MorphClose, ForegroundDetection       #
# Read more on transformations: https://pathml.readthedocs.io/en/latest/examples/link_gallery.html       #
# Output: h5path files (https://pathml.readthedocs.io/en/latest/h5path.html)                             #
##########################################################################################################

In [10]:
import os
if os.name == 'nt':
    import openslideimport #on windows, openslide needs to be installed manually, check local openslideimport.py

# global imports
import time
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np

# pathml imports
from pathml import types
from pathml.core import SlideData, SlideDataset
from pathml.preprocessing import Pipeline, MedianBlur, MorphOpen, MorphClose, BinaryThreshold, ForegroundDetection


In [11]:
# set working directory
# raw string on purpose: backslashes in a Windows path must not be read as escape
# sequences ("\e" is an invalid escape and triggers a SyntaxWarning on recent Pythons)
base_dir = Path(r"G:\echino")
# base_dir = Path("/mnt/g/placenta/")
# subfolder of base_dir that holds the wsi images (read with the "*.tif" pattern)
wsi_subfolder = "wsi"
# subfolder of base_dir that holds the annotation masks; a mask is looked up as
# <image stem>.npy, i.e. with the same name as the image (echino23.tif -> echino23.npy)
annotations_subfolder = "wsiannotation-dumps"
# subfolder of base_dir that receives the output h5path files
h5path_subfolder = "h5"
# images whose lower-cased filename contains this substring are labelled "normal"
# and no annotation mask is loaded for them (e.g. normal23.tif)
normal_filename = "norm"

In [12]:
# relative directory generation (don't modify)
data_dir = base_dir / Path(wsi_subfolder)                       # input
annotations_subfolder = base_dir / Path(annotations_subfolder)  # input : need to pre-generate with annotation_batch_reader
h5_dir = base_dir / Path(h5path_subfolder)                      # output

# read wsi files (sorted so the slide order is the same on every run)
wsi_paths = sorted(data_dir.glob("*.tif"))

# build list of SlideData objects, label with output class, add annotation
wsi_list = []
# bound before the loop so the name also exists when no wsi file was found
annotation_mask = None
for wsi in wsi_paths:
    annotation_mask = None
    # label preparation based on filename; labels must be a plain dict
    # (a set literal wrapping a list, {[...]}, raises "unhashable type: 'list'")
    if normal_filename in wsi.name.lower():
        label = {"class": "normal"}
    else:
        label = {"class": "echino"}
        try:
            # SECURITY NOTE: allow_pickle=True can execute arbitrary code stored in the file;
            # only load masks that were generated locally by a trusted tool.
            annotation_mask = np.load(annotations_subfolder / f"{wsi.stem}.npy", allow_pickle=True)
        except Exception as e:
            # best effort: a slide without a readable mask is still added, just without mask
            print(f"Annotation mask could not be loaded for {wsi.stem}, error: {e}")

    # NOTE(review): name is the full posix path of the image. pathml presumably derives the
    # h5path output filename from the slide name, which would explain why the files end up
    # next to the images instead of in write_dir - confirm before switching to wsi.name.
    slide_kwargs = {
        "name": wsi.as_posix(),
        "backend": "openslide",
        "slide_type": types.HE,
        "labels": label,
    }
    # adding annotation as a mask only if present (better not add empty mask)
    has_mask = annotation_mask is not None
    if has_mask:
        slide_kwargs["masks"] = {"tumor": annotation_mask}

    wsi_list.append(SlideData(wsi.as_posix(), **slide_kwargs))
    if has_mask:
        print("annotation mask successfully appended")

# drop the loop-level reference to the last mask
del annotation_mask

# initialize a SlideDataset Object
slide_dataset = SlideDataset(wsi_list)

In [13]:
# Report how many slides were read, then plot each one in its own small figure
n_slides = len(slide_dataset)
print(f"Read {n_slides} wsi images: ")
for slide in slide_dataset:
    # one 2x2 inch figure per slide, titled with the slide name
    slide_fig, slide_ax = plt.subplots(figsize=(2, 2))
    slide.plot(ax=slide_ax)
    slide_ax.set_title(label=slide.name, fontsize=8)

Read 0 wsi images: 


In [6]:
# Preprocessing pipeline: the transforms are listed in the order they are passed to Pipeline.
# All mask-based transforms read/write the mask named "tissue".
preprocessing_steps = [
    MedianBlur(kernel_size=5),  # default 5
    BinaryThreshold(mask_name="tissue", use_otsu=True),
    MorphOpen(mask_name="tissue", kernel_size=5, n_iterations=1),
    MorphClose(mask_name="tissue", kernel_size=5, n_iterations=1),
    ForegroundDetection(
        mask_name="tissue",
        min_region_size=500,
        max_hole_size=1500,
        outer_contours_only=True,
    ),
]
pipeline = Pipeline(preprocessing_steps)

In [7]:
# run preprocessing pipeline
start_time = time.time()

# Create the output directory before it is handed over as write_dir.
# No explicit mode: a file-style mode such as 0o644 has no execute bit, which makes the
# directory non-traversable on POSIX systems (the mode is ignored on Windows).
h5_dir.mkdir(parents=True, exist_ok=True)

slide_dataset.run(
    pipeline,
    distributed=False, # distributed processing always crashes / memleak on wsl. on windows it throws OpenSlide DLL not found error
    # client = client, # distributed processing client
    tile_size=500, # https://www.sciencedirect.com/science/article/pii/S2352914822000053
    overwrite_existing_tiles=True,
    write_dir=str(h5_dir), # observed: the h5path files end up where the images are, so they are moved below
)

# counting extracted tiles:
total_tiles = sum(len(slide.tiles.keys) for slide in slide_dataset.slides)

print(f"Total number of tiles extracted: {total_tiles}")

# move h5path files from wsi dir to the output dir
# (nothing is moved if no h5path file is found in the wsi dir)
h5_paths = sorted(data_dir.glob("*.h5path"))
print(f"Writing {len(h5_paths)} h5path images to {str(h5_dir)}: ")
for h5file in h5_paths:
    target = h5_dir / h5file.name
    # Path.replace overwrites an existing target of the same name
    h5file.replace(target)
    print(target)

time_elapsed = time.time() - start_time
# round once and split with divmod so the seconds part can never be printed as "60s"
minutes, seconds = divmod(round(time_elapsed), 60)
print('Preprocessing completed in {:.0f}m {:.0f}s'.format(minutes, seconds))

Total number of tiles extracted: 388940
Writing 46 h5path images to G:\echinov3\h5: 
G:\echinov3\h5\echino13.tif.h5path
G:\echinov3\h5\echino14.tif.h5path
G:\echinov3\h5\echino15.tif.h5path
G:\echinov3\h5\echino16.tif.h5path
G:\echinov3\h5\echino17.tif.h5path
G:\echinov3\h5\echino19.tif.h5path
G:\echinov3\h5\echino21.tif.h5path
G:\echinov3\h5\echino23.tif.h5path
G:\echinov3\h5\echino24.tif.h5path
G:\echinov3\h5\echino25.tif.h5path
G:\echinov3\h5\echino26.tif.h5path
G:\echinov3\h5\echino27.tif.h5path
G:\echinov3\h5\echino28.tif.h5path
G:\echinov3\h5\echino29.tif.h5path
G:\echinov3\h5\echino3.tif.h5path
G:\echinov3\h5\echino30.tif.h5path
G:\echinov3\h5\echino31.tif.h5path
G:\echinov3\h5\echino32.tif.h5path
G:\echinov3\h5\echino35.tif.h5path
G:\echinov3\h5\echino39.tif.h5path
G:\echinov3\h5\echino40.tif.h5path
G:\echinov3\h5\echino41.tif.h5path
G:\echinov3\h5\echino42.tif.h5path
G:\echinov3\h5\echino50.tif.h5path
G:\echinov3\h5\echino51.tif.h5path
G:\echinov3\h5\echino61.tif.h5path
G:\ech

In [8]:
# TODO:
# 1) create a separate preprocessing notebook where one big tile gets transformed and plotted for verification of transformations
# 2) plot mask on image (example in snippets)

In [9]:
# NOTES:
# Labels can only be set when the SlideData object is created at the beginning; I did not find a way to modify them later (tried with setattr etc.)
# Try albumentations: it applies random transformations like 90 degree rotations etc 
# https://albumentations.ai/ & https://pathml.readthedocs.io/en/latest/examples/link_train_hovernet.html#Data-augmentation