# Preprocess Train Data

## Imports

In [None]:
import gc
import os
import zipfile
from multiprocessing.dummy import Pool

import pandas as pd
import pyvips


## Global Variables

In [None]:
# These paths assume the notebook is running in a Kaggle environment.
INPUT_DIRECTORY = "../input/mayo-clinic-strip-ai"
# The training TIFFs live in the "train" folder of the competition input.
TRAIN_IMAGE_DIR = f"{INPUT_DIRECTORY}/train"


## Helper Functions

In [None]:
def preprocess_image(image_id, maxw, maxh):
    """Write a smart-cropped JPEG thumbnail of one training image.

    Reads ``<image_id>.tif`` from ``TRAIN_IMAGE_DIR``, asks pyvips for a
    thumbnail with target size ``maxw`` x ``maxh`` using the "attention"
    crop strategy, and saves the result as ``<image_id>.jpeg`` in the
    current working directory with JPEG quality 100.

    Other crop strategies:
    https://libvips.github.io/pyvips/enums.html#pyvips.enums.Interesting

    Args:
        image_id: File stem of the source TIFF (without extension).
        maxw: Target thumbnail width passed to pyvips.
        maxh: Target thumbnail height passed to pyvips.
    """
    out = pyvips.Image.thumbnail(
        os.path.join(TRAIN_IMAGE_DIR, image_id + ".tif"),
        maxw,
        height=maxh,
        crop="attention",
    )
    out.write_to_file(image_id + ".jpeg", Q=100)

    # Drop the image reference and run a collection pass right away, so this
    # image is not kept alive while the worker moves on to the next one.
    del out
    gc.collect()


def save_dataset(num_workers, iterable):
    """Run ``preprocess_image`` over every argument tuple using a thread pool.

    Args:
        num_workers: Number of worker threads in the pool.
        iterable: Iterable of argument tuples; each tuple is unpacked into
            one ``preprocess_image`` call.

    Raises:
        Any exception raised by a ``preprocess_image`` call is re-raised
        here after the pool has been shut down.
    """
    # Enable verbose logging.
    os.environ["VIPS_PROGRESS"] = "1"
    # Limit pyvips to two threads.
    os.environ["VIPS_CONCURRENCY"] = "2"

    # Source: https://github.com/libvips/pyvips/issues/291#issuecomment-994714555
    pool = Pool(num_workers)
    try:
        pool.starmap(preprocess_image, iterable)
    finally:
        # Always release the worker threads, even when a task raised.
        pool.close()
        pool.join()
    

## Process Images

In [None]:
train = pd.read_csv(f"{INPUT_DIRECTORY}/train.csv")

# Resize images, apply smart crop, and save to disk in current directory.
image_ids = train["image_id"]
max_width = 1024
max_height = 1024
# One (image_id, width, height) argument tuple per image. The list is called
# `tasks` so that the builtin `iter` stays usable in the rest of the notebook.
tasks = [(image_id, max_width, max_height) for image_id in image_ids]
save_dataset(num_workers=3, iterable=tasks)


In [None]:
# Zip every JPEG in the current directory into a single archive.
# The handle is called `archive` so that the builtin `zip` is not shadowed.
with zipfile.ZipFile("images.zip", "w") as archive:
    for filename in os.listdir():
        if filename.endswith(".jpeg"):
            archive.write(filename)

# Or use `zip -r images.zip *.jpeg`
# !zip -r images.zip *.jpeg
