# Mayo Clinic Create Parsed Image Dataset

Much of this code was adapted from: https://www.kaggle.com/code/simsonimus/hubmap-training

- The training images are split into groups of roughly equal total file size, so the dataset can be built one group (part) at a time

In [None]:
import numpy as np
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
from skimage import io
import glob, os, json, cv2, gc, shutil
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import backend as K
from tqdm.notebook import tqdm

# Edge length, in pixels, of the square tiles cut by slice_images().
IMG_SIZE = 1024
# Root folder of the input data; train.csv, test.csv and the .tif images are read from here.
INPUT_PATH = "../input/mayo-clinic-strip-ai"
# NOTE(review): not referenced anywhere in the visible code - presumably meant
# to gate plot_masked_image(); confirm before relying on it.
PRINT_PLOTS = False
# When False, the slice_*_images functions process only the first row (quick debug run).
FULL_RUN = True

def create_folder(folder):
    """Create directory ``folder`` (and any missing parents) if it is absent.

    Calling this for a directory that already exists is a no-op.
    ``exist_ok=True`` replaces the former "check, then create" sequence, so
    the call no longer fails when the directory appears between the check
    and the creation.

    Args:
        folder: Path of the directory to create.

    Raises:
        FileExistsError: If ``folder`` exists but is not a directory.
    """
    os.makedirs(folder, exist_ok=True)
        
# create_folder("./plots")
# Output folders that receive the sliced tiles.
for _out_dir in ("./train", "./test"):
    create_folder(_out_dir)

# Load the train / test metadata tables and show them for a quick look.
train_csv = os.path.join(INPUT_PATH, 'train.csv')
df_train = pd.read_csv(train_csv)
display(df_train)

test_csv = os.path.join(INPUT_PATH, 'test.csv')
df_test = pd.read_csv(test_csv)
display(df_test)

# TOTAL_GROUPS feeds the per-group size target; GROUP_TO_RUN selects the
# group whose images this run slices.
TOTAL_GROUPS = 10
GROUP_TO_RUN = 1


# Get Filesizes
- Split into equally sized groups based on these file sizes

In [None]:
# Record the on-disk size (bytes) of every training image in a 'file_size'
# column. The path is derived from INPUT_PATH (instead of repeating the
# literal folder) and the column is assigned in one go rather than row by row.
# Values are stored as floats so the column keeps the float dtype that the
# former row-wise .loc assignment produced.
train_image_dir = os.path.join(INPUT_PATH, 'train')
df_train['file_size'] = [
    float(os.path.getsize(os.path.join(train_image_dir, f'{image_id}.tif')))
    for image_id in tqdm(df_train['image_id'], total=len(df_train))
]

# Size budget per group used by the grouping step.
# NOTE(review): the divisor is TOTAL_GROUPS - 1, not TOTAL_GROUPS - presumably
# to compensate for groups being closed before they reach the budget; confirm.
target_group_size = df_train['file_size'].sum() / (TOTAL_GROUPS - 1)

In [None]:
# Histogram (20 bins) of the per-image file sizes.
size_column = df_train['file_size']
size_column.plot(kind='hist', bins=20)

In [None]:
# Greedily pack the images, in table order, into consecutive groups: an image
# that would push the running total past target_group_size starts the next
# group. The sizes come from the existing 'file_size' column, so the files are
# not stat-ed a second time and the column is not rewritten.
group = 1
group_size = 0
group_ids = []
for file_size in tqdm(df_train['file_size'], total=len(df_train)):
    if group_size + file_size > target_group_size:
        group += 1
        group_size = file_size
    else:
        group_size += file_size
    group_ids.append(group)

# Single column assignment instead of one .loc write per row.
df_train['group'] = group_ids
df_train['group'] = df_train['group'].astype('int')
df_train.to_csv('train_with_groups.csv', index=False)

In [None]:
# Horizontal bar chart of the summed file size of each group.
bytes_per_group = df_train.groupby('group')['file_size'].sum()
bytes_per_group.plot(kind='barh', figsize=(10, 5))

In [None]:
# Keep only the rows that belong to the group chosen via GROUP_TO_RUN.
group_filter = 'group == @GROUP_TO_RUN'
df_train = df_train.query(group_filter)

# Functions

In [None]:
def read_tiff(image_path):
    """Load the image at ``image_path`` with any leading colour axis moved last.

    Axes of length 1 are squeezed away. If the first remaining axis has
    length 3 it is treated as the colour axis and moved behind the next two
    axes; otherwise the array is returned as loaded.
    """
    pixels = np.squeeze(io.imread(image_path))  # drop axes of length 1
    if pixels.shape[0] != 3:
        return pixels
    # (3, a, b) -> (a, 3, b) -> (a, b, 3)
    return pixels.swapaxes(0, 1).swapaxes(1, 2)

def read_mask(image, encoded_mask):
    """Decode ``encoded_mask`` into an array aligned with ``image``.

    The run-length string is decoded on a grid with the image's first two
    axes swapped, swapped back afterwards, and given a trailing axis of
    length 1, so the result has shape
    ``(image.shape[0], image.shape[1], 1)``.
    """
    rows, cols = image.shape[0], image.shape[1]
    decoded = rle_decode(encoded_mask, (cols, rows))  # decoded with inverted axes
    aligned = decoded.swapaxes(0, 1)  # undo the inversion
    return np.expand_dims(aligned, -1)

def delete_directory_contents(dir):
    """Remove everything inside directory ``dir`` while keeping ``dir`` itself.

    Files and symbolic links are unlinked; real sub-directories are removed
    recursively (previously a sub-directory made ``os.remove`` raise).

    Args:
        dir: Path of the directory to empty.

    Raises:
        OSError: If ``dir`` cannot be listed or an entry cannot be removed.
    """
    # Snapshot the listing and close the scandir handle before deleting, so
    # nothing is removed while the directory is still being iterated.
    with os.scandir(dir) as entries:
        children = list(entries)

    for child in children:
        # follow_symlinks=False: a link to a directory is unlinked, not followed.
        if child.is_dir(follow_symlinks=False):
            shutil.rmtree(child.path)
        else:
            os.remove(child.path)
        
def plot_masked_image(image, mask, name):
    """Show ``image`` with ``mask`` overlaid and save it as ``./plots/{name}.png``.

    The mask is drawn semi-transparently (alpha 0.3, 'jet' colormap) on top
    of the image; the figure is saved at 1000 dpi and then displayed.

    Args:
        image: Array accepted by ``plt.imshow``.
        mask: Array accepted by ``plt.imshow``, drawn over ``image``.
        name: File stem used for the saved PNG.
    """
    # No other code in this file creates ./plots, so make sure it exists;
    # otherwise savefig fails on the missing folder.
    os.makedirs("./plots", exist_ok=True)

    plt.imshow(image, interpolation='none')
    plt.imshow(mask, cmap='jet', alpha=0.3, interpolation='none')

    plt.savefig(f"./plots/{name}.png", dpi = 1000)
    plt.show()
    
def slice_images(image_id, image, mask=None, folder=""):
    """Cut ``image`` into IMG_SIZE x IMG_SIZE tiles and write them to ``./{folder}``.

    Only complete tiles are produced; pixels beyond the last full multiple of
    IMG_SIZE along either axis are dropped. Tiles are saved as
    ``{image_id}-imgslice.{x}.{y}.jpg`` where ``x`` indexes the first image
    axis and ``y`` the second.

    Args:
        image_id: Identifier used in the output file names.
        image: Array indexed as ``image[first_axis, second_axis, ...]``.
        mask: Optional array aligned with ``image``. ``None`` or an empty
            sequence (callers pass ``[]``) means "no mask": every tile is
            written. With a mask, a tile is written only if its mask slice,
            multiplied by 255, contains the value 255, and the mask slice is
            saved next to it as ``{image_id}-maskslice.{x}.{y}.png``.
        folder: Output folder name, relative to the working directory.
    """
    print(f'Slicing Image {image_id} ...')

    # The default is None rather than a shared mutable list; an explicitly
    # passed empty list is still treated as "no mask".
    has_mask = mask is not None and len(mask) != 0

    possible_slices_x = image.shape[0] // IMG_SIZE
    possible_slices_y = image.shape[1] // IMG_SIZE

    # NOTE: background-only tiles are kept; a filter such as
    # `np.any(image_slice) and not (image_slice > 200).all()` was disabled here.
    # NOTE(review): cv2.imwrite treats 3-channel data as BGR - if read_tiff
    # yields RGB the saved JPEGs have red and blue swapped; confirm intended.
    for x in range(possible_slices_x):
        x_span = slice(x * IMG_SIZE, (x + 1) * IMG_SIZE)
        for y in range(possible_slices_y):
            y_span = slice(y * IMG_SIZE, (y + 1) * IMG_SIZE)
            image_slice = image[x_span, y_span]
            image_file = f"./{folder}/{image_id}-imgslice.{x}.{y}.jpg"

            if not has_mask:
                cv2.imwrite(image_file, image_slice)
                continue

            mask_slice = mask[x_span, y_span] * 255
            if 255 in mask_slice:
                cv2.imwrite(image_file, image_slice)
                cv2.imwrite(f"./{folder}/{image_id}-maskslice.{x}.{y}.png", mask_slice.astype(int))

# ref.: https://www.kaggle.com/stainsby/fast-tested-rle
def rle_decode(mask_rle, shape):
    """Decode a run-length string into a binary mask.

    mask_rle: whitespace-separated "start length" pairs; starts are 1-based
        positions in the flattened mask.
    shape: (height, width) of the array to return.
    Returns a uint8 numpy array of that shape: 1 - mask, 0 - background.
    """
    tokens = mask_rle.split()
    # Even positions hold the starts (shifted to 0-based), odd ones the lengths.
    starts = np.asarray(tokens[0::2], dtype=int) - 1
    lengths = np.asarray(tokens[1::2], dtype=int)

    flat = np.zeros(shape[0] * shape[1], dtype=np.uint8)
    for begin, run in zip(starts, lengths):
        flat[begin:begin + run] = 1
    return flat.reshape(shape)

## ref.: https://www.kaggle.com/bguberfain/memory-aware-rle-encoding
def rle_encode_less_memory(img):
    """Run-length encode ``img`` as a string of "start length" pairs.

    The transposed image is flattened and scanned for value changes; starts
    are 1-based positions in that flattened order. The first and last pixel
    of the flattened copy are forced to 0, so a run touching either end is
    shortened accordingly. ``img`` itself is not modified.
    """
    flat = img.T.flatten()

    # The change-detection trick below needs zero at both ends.
    flat[0] = flat[-1] = 0

    is_boundary = flat[1:] != flat[:-1]
    # +2: one step to reach the pixel after the change, one for 1-based positions.
    boundaries = np.where(is_boundary)[0] + 2
    # Turn every (start, end) pair into (start, length).
    boundaries[1::2] -= boundaries[::2]

    return ' '.join(map(str, boundaries))

# Slice Training Images

In [None]:
def slice_training_images(df_train):
    """Slice every training image listed in ``df_train`` into tiles under ``./train``.

    With ``FULL_RUN`` disabled only the first row is processed, for quick
    debug runs; otherwise all rows are. A full run used to apply
    ``iloc[1:]``, which silently left the first image unsliced.

    Args:
        df_train: DataFrame with an ``image_id`` column; each id names a
            ``train/{image_id}.tif`` file below ``INPUT_PATH``.
    """
    if not FULL_RUN:
        df_train = df_train.iloc[0:1, :]  # debug run: first image only

    for image_id in tqdm(df_train['image_id'], total=len(df_train)):
        image_path = os.path.join(INPUT_PATH, f"train/{image_id}.tif")
        image = read_tiff(image_path)

        slice_images(image_id, image, [], "train")

        # Drop the reference so this image can be freed before the next one loads.
        del image

In [None]:
# Slice the rows currently held in df_train (see slice_training_images) into ./train.
slice_training_images(df_train)

# Slice Test Images

In [None]:
def slice_test_images(df_test):
    """Slice every test image listed in ``df_test`` into tiles under ``./test``.

    With ``FULL_RUN`` disabled only the first row is processed, for quick
    debug runs; otherwise all rows are. A full run used to apply
    ``iloc[1:]``, which silently left the first image unsliced.

    Args:
        df_test: DataFrame with an ``image_id`` column; each id names a
            ``test/{image_id}.tif`` file below ``INPUT_PATH``.
    """
    if not FULL_RUN:
        df_test = df_test.iloc[0:1, :]  # debug run: first test image only

    for image_id in tqdm(df_test['image_id'], total=len(df_test)):
        image_path = os.path.join(INPUT_PATH, f"test/{image_id}.tif")
        image = read_tiff(image_path)

        slice_images(image_id, image, [], "test")

        # Drop the reference so this image can be freed before the next one loads.
        del image

In [None]:
# slice_test_images(df_test)