# Preprocess Train Data

## Imports

In [None]:
import gc
import os
from glob import glob

import numpy as np
import pyvips
import skimage.io as io
from PIL import Image
from skimage.color import rgb2gray, rgba2rgb
from skimage.filters.thresholding import threshold_otsu
from skimage.util import img_as_ubyte
from tqdm.auto import trange


## Helper Function

In [None]:
def _drop_low_signal_lines(image, axis, max_unique=75):
    """Return `image` without the low-signal lines along `axis`.

    A line (a row for ``axis=0``, a column for ``axis=1``) is removed when it
    contains at most `max_unique` distinct values across all of its pixels
    and bands.
    """
    # Moving `axis` to the front lets a single loop serve rows and columns.
    lines = np.moveaxis(image, axis, 0)
    remove = [
        index for index, line in enumerate(lines)
        if np.unique(line).size <= max_unique
    ]
    return np.delete(image, remove, axis=axis)


def pre_process_data(path, save_path):
    """Thumbnail, mask, crop and save a single image.

    The image at `path` is loaded as a 1792-pixel pyvips thumbnail, pixels
    brighter than the Otsu threshold are zeroed, rows and then columns with
    at most 75 distinct values are dropped, and the result is written to
    `save_path`.

    Args:
        path: Location of the source image; passed to
            ``pyvips.Image.thumbnail``.
        save_path: Destination file; passed to ``skimage.io.imsave``.

    Raises:
        ValueError: If the decoded pixel buffer does not hold exactly
            ``height * width * bands`` bytes (i.e. samples are not 8-bit).
        Exception: Anything raised by pyvips or scikit-image propagates
            unchanged.
    """
    try:
        vips_image = pyvips.Image.thumbnail(path, 1792, crop="attention")

        # The dtype must be explicit: np.ndarray defaults to float64, which
        # rejects the byte buffer as too small. reshape() fails loudly if
        # the samples are not one byte each.
        # NOTE(review): assumes 8-bit samples and a 3-band RGB image (needed
        # by rgb2gray) - confirm for inputs other than the training slides.
        image = np.frombuffer(
            vips_image.write_to_memory(), dtype=np.uint8
        ).reshape(vips_image.height, vips_image.width, vips_image.bands)

        # Thresholding the image
        image_gray = rgb2gray(image)
        # Find threshold between background and foreground
        thresh = threshold_otsu(image_gray)
        # Keep pixels at or below the threshold; brighter ones become 0.
        binary = image_gray <= thresh
        # Source: https://stackoverflow.com/questions/72239660/how-can-one-apply-a-mask-on-a-numpy-array-which-leaves-the-original-values-uncha
        masked = image * binary[..., None]

        # Remove regions with no signal.
        # Source: https://www.kaggle.com/code/abhishek123maurya/image-cropping-without-altering-pixel-values
        # Rows first, then columns of the row-cropped result, so both crops
        # end up in the saved image.
        cropped = _drop_low_signal_lines(masked, axis=0)
        cropped = _drop_low_signal_lines(cropped, axis=1)
        if cropped.size == 0:
            # Every line was below the signal cutoff; save the masked image
            # rather than failing on an empty array.
            cropped = masked

        io.imsave(save_path, cropped, quality=100)
    finally:
        # Rebinding (unlike `del`) cannot fail when an earlier step raised
        # before a name was assigned, so the original exception survives.
        vips_image = image = image_gray = binary = masked = cropped = None
        gc.collect()


## Process Image

In [None]:
# Where the raw training slides live and where processed copies are written.
TRAIN_DIR = "../input/mayo-clinic-strip-ai/train"
OUTPUT_DIR = "train_data_cropped"

# Make directory to save processed images.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Get the paths to each training image (sorted so runs are reproducible).
train_images = sorted(glob(os.path.join(TRAIN_DIR, "*.tif")))

# Preprocess the training images and save each image.
for i in trange(len(train_images)):
    source_path = train_images[i]
    # Derive the output name from the file stem instead of string
    # replacement, which would rewrite every ".tif" occurrence in the path.
    stem = os.path.splitext(os.path.basename(source_path))[0]
    pre_process_data(source_path, os.path.join(OUTPUT_DIR, stem + ".jpeg"))


In [None]:
# Zip the images into a single archive.
!zip -r images.zip *.jpeg
