<br>

<a id="imports"></a>

<h1 style="font-family: Verdana; font-size: 24px; font-style: normal; font-weight: bold; text-decoration: none; text-transform: none; letter-spacing: 3px; background-color: #ffffff; color: #E55CA0;" id="imports">0&nbsp;&nbsp;IMPORTS & INSTALLS&nbsp;&nbsp;&nbsp;&nbsp;<a href="#toc">&#10514;</a></h1>

<br>

**THE PYVIPS INSTALL CODE ONLY WORKS WHEN THE NOTEBOOK IS PINNED TO THE ORIGINAL ENVIRONMENT (2020)**
- Original: https://www.kaggle.com/hirune924/fast-image-region-loading-using-pyvips
  - Thanks to @hirune924 !!
- https://www.kaggle.com/code/yukkyo/using-pyvips-without-internet/comments
- The install takes about 5 minutes

**I'M NOT SURE WHY BUT THIS ONLY WORKS WHEN A GPU IS ATTACHED...**

In [None]:
# Install libvips and the pyvips Python bindings from local files under
# /kaggle/input/pyvips-local, then import everything the notebook uses.
print("\n... INSTALLING LOCAL VERSION OF PYVIPS! ...")
# Install the bundled libvips .deb packages. --force-depends tells dpkg to go
# ahead despite dependency problems; stdout and stderr are discarded, so a
# failure here is silent and only surfaces later at `import pyvips`.
!dpkg -i --force-depends /kaggle/input/pyvips-local/libvips-apt/libvips-apt/*.deb >/dev/null 2>&1
# Install the Python wheels: cffi and pycparser first, then pyvips itself.
# NOTE(review): the cffi wheel is tagged cp37, i.e. built for CPython 3.7 --
# presumably the reason the notebook must stay on the original environment.
!pip install /kaggle/input/pyvips-local/cffi-1.14.4-cp37-cp37m-manylinux1_x86_64.whl
!pip install /kaggle/input/pyvips-local/pycparser-2.20-py2.py3-none-any.whl
!pip install /kaggle/input/pyvips-local/pyvips-2.1.13-py2.py3-none-any.whl

# Raise PIL's MAX_IMAGE_PIXELS threshold to 5 billion pixels so that very
# large slides are not rejected by PIL's image-size safety check.
from PIL import Image; Image.MAX_IMAGE_PIXELS = 5_000_000_000;
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import cv2 # for resize
import pyvips  # depends on the libvips / wheel installs above
import os

print("... INSTALL COMPLETE! ...\n")

<br>

<a id="helper_functions"></a>

<h1 style="font-family: Verdana; font-size: 24px; font-style: normal; font-weight: bold; text-decoration: none; text-transform: none; letter-spacing: 3px; background-color: #ffffff; color: #E55CA0;" id="helper_functions">1&nbsp;&nbsp;HELPER FUNCTIONS&nbsp;&nbsp;&nbsp;&nbsp;<a href="#toc">&#10514;</a></h1>

In [None]:
def vips2numpy(vi):
    """Convert a pyvips image into a numpy array.

    Args:
        vi: Image object exposing ``write_to_memory()``, ``format``,
            ``height``, ``width`` and ``bands`` (a ``pyvips.Image``).

    Returns:
        np.ndarray of shape (height, width, bands) whose dtype is the numpy
        equivalent of the image's vips band format.

    Raises:
        KeyError: If ``vi.format`` is not one of the known vips formats.
    """
    # Lookup table: vips band-format name -> numpy dtype
    dtype_by_vips_format = dict(
        uchar=np.uint8,
        char=np.int8,
        ushort=np.uint16,
        short=np.int16,
        uint=np.uint32,
        int=np.int32,
        float=np.float32,
        double=np.float64,
        complex=np.complex64,
        dpcomplex=np.complex128,
    )

    # Pull the raw pixel bytes out of vips first, then work out how to view them
    pixel_buffer = vi.write_to_memory()
    pixel_dtype = dtype_by_vips_format[vi.format]
    pixel_shape = [vi.height, vi.width, vi.bands]

    # Wrap the buffer in an ndarray with the matching dtype and layout
    return np.ndarray(buffer=pixel_buffer, dtype=pixel_dtype, shape=pixel_shape)

def pyvips_open_downsampled_slide(img_path, downsample_by=8, as_numpy=True, resize_to=(512,512)):
    """
    Open a whole-slide image with pyvips and return a downscaled copy of it.

    The slide is first shrunk by ``downsample_by`` with pyvips and then, if
    ``resize_to`` is given, resized to that size (with cv2 for numpy output,
    with pyvips otherwise).

    Timing details for MAYO CLINIC STRIP AI dataset:
        SMALLEST IMAGE BY AREA (4417, 5314)
            * Function takes ~1 seconds to run
        MEDIAN IMAGE BY AREA   (17573, 38743)(~30X LARGER THAN SMALLEST IMAGE)
            * Function takes ~30 seconds to run
        LARGEST IMAGE BY AREA  (48282, 101406)(~208X LARGER THAN SMALLEST IMAGE)(~7X LARGER THAN MEDIAN IMAGE)
            * Function takes ~405 seconds to run

    Args:
        img_path (str): Path to .tif file to be downsampled
        downsample_by (int or float): How many times smaller the resultant
            image should be, i.e. image_size*(1/downsample_by) = new_size.
            -1 picks the largest factor that still leaves the downsampled
            image at least as large as ``resize_to`` in both dimensions
            (and therefore requires ``resize_to``).
        as_numpy (bool, optional): Whether to return the image as a numpy
            array (default) or leave it as a pyvips.Image for further
            manipulation
        resize_to (tuple of ints, optional): (width, height) to resize the
            downsampled image to. None skips the final resize.

    Returns:
        The downsampled image: a numpy array produced by ``vips2numpy`` when
        ``as_numpy`` is True, otherwise a pyvips.Image. Dtype and channel
        count follow the source image (uint8 with 3 channels for 8-bit RGB
        slides).

    Raises:
        ValueError: If ``downsample_by`` is -1 while ``resize_to`` is None,
            or if ``downsample_by`` is zero or negative (other than -1).
    """

    # Validate the arguments before touching the (potentially huge) file
    pick_factor_from_target = downsample_by == -1
    if pick_factor_from_target and resize_to is None:
        raise ValueError("downsample_by=-1 needs resize_to to derive the downsample factor")
    if not pick_factor_from_target and downsample_by <= 0:
        raise ValueError(f"downsample_by must be positive or -1, got {downsample_by!r}")

    # Open the image with pyvips
    tmp_img = pyvips.Image.new_from_file(img_path)

    # Rough upper bound: scale the ~405 s measured for the largest slide
    # (48282 x 101406) linearly by this slide's pixel count
    print("\n... APPROXIMATE TIME TO LOAD IMAGE IS AT MOST APPROXIMATELY: " \
          f"{int((405/(48282*101406))*(tmp_img.width*tmp_img.height))} SECONDS ...\n")

    # If -1 then we downsample by whatever leaves the image just above the
    # target size in both dimensions, so that it can be resized afterwards.
    # For a square target this equals min(width, height)/resize_to[0].
    if pick_factor_from_target:
        _epsilon = 1e-3  # keeps the result marginally larger than the target
        target_width, target_height = resize_to
        downsample_by = min(tmp_img.width/target_width,
                            tmp_img.height/target_height) - _epsilon

    # Shrink the image
    tmp_img = tmp_img.resize(1/downsample_by)

    if as_numpy:
        tmp_img = vips2numpy(tmp_img)
        if resize_to is not None:
            tmp_img = cv2.resize(tmp_img, resize_to)
    elif resize_to is not None:
        # cv2 cannot operate on a pyvips.Image, so do the final resize in vips
        # using independent horizontal/vertical scale factors
        target_width, target_height = resize_to
        tmp_img = tmp_img.resize(target_width/tmp_img.width,
                                 vscale=target_height/tmp_img.height)

    return tmp_img

<br>

<a id="demo"></a>

<h1 style="font-family: Verdana; font-size: 24px; font-style: normal; font-weight: bold; text-decoration: none; text-transform: none; letter-spacing: 3px; background-color: #ffffff; color: #E55CA0;" id="demo">2&nbsp;&nbsp;DEMO&nbsp;&nbsp;&nbsp;&nbsp;<a href="#toc">&#10514;</a></h1>

<br>

In [None]:
print("\n... BASIC SETUP STARTING ...\n\n")

# Where the competition data lives
DATA_DIR = "/kaggle/input/mayo-clinic-strip-ai"
TRAIN_DIR = os.path.join(DATA_DIR, "train")
TRAIN_CSV = os.path.join(DATA_DIR, "train.csv")

# Load the training metadata and attach the full .tif path of every slide
train_df = pd.read_csv(TRAIN_CSV)
train_df["image_path"] = train_df["image_id"].map(
    lambda image_id: os.path.join(TRAIN_DIR, image_id+".tif")
)

print("\n... TRAINING DATAFRAME... \n")
display(train_df)

# Example slides covering the smallest, median and largest images by area
SMALLEST_IMAGE_ID = "b43ebe_0"
SMALLEST_IMAGE_ROW = train_df.loc[train_df["image_id"] == SMALLEST_IMAGE_ID]
MEDIAN_IMAGE_ID = "719165_0"
MEDIAN_IMAGE_ROW = train_df.loc[train_df["image_id"] == MEDIAN_IMAGE_ID]
LARGEST_IMAGE_ID = "6baf51_0"
LARGEST_IMAGE_ROW = train_df.loc[train_df["image_id"] == LARGEST_IMAGE_ID]

# Show that slides of every size in the training set can be loaded
#   --> one panel each for the smallest, median and largest image
plt.figure(figsize=(20,15))

print("\n\n\n... DEMO FOR SMALLEST, MEDIAN, & LARGEST (WOULD NORMALLY CRASH PIL) IMAGES IN TRAINING DATA \n\n\n")

demo_panels = (
    ("SMALLEST IMAGE", SMALLEST_IMAGE_ROW),
    ("MEDIAN IMAGE", MEDIAN_IMAGE_ROW),
    ("LARGEST IMAGE", LARGEST_IMAGE_ROW),
)
for panel_number, (panel_title, panel_row) in enumerate(demo_panels, start=1):
    plt.subplot(1, 3, panel_number)
    slide_path = panel_row.image_path.values[0]
    thumbnail = pyvips_open_downsampled_slide(
        slide_path, downsample_by=-1, as_numpy=True, resize_to=(512,512)
    )
    plt.imshow(thumbnail)
    plt.title(panel_title, fontweight="bold")

plt.tight_layout()
plt.show()