# Preprocessing input images and annotations

In [None]:
%matplotlib inline

In [None]:
import csv
import os
import pathlib
import random
import shutil
import warnings

import matplotlib.pyplot as plt
import numpy as np
import scipy.stats
import skimage.filters
import skimage.io
import skimage.morphology
import skimage.segmentation
from tqdm import tqdm

import utils.data_augmentation
import utils.data_split

# Parameters

In [None]:
# Load the shared configuration dictionary from the project's config module.
from config import config_vars
# Bare expression: the notebook echoes the dictionary as the cell output,
# which makes the active settings visible alongside the results.
config_vars

In [None]:
# Prepare split files
#
# When enabled in the configuration, partition the raw images into training,
# test and validation lists and write each list to its own path file.

if config_vars["create_split_files"]:
    # The two fractions are forwarded positionally to create_image_lists.
    # NOTE(review): presumably the training and validation fractions, with the
    # remainder going to the test set -- confirm against utils.data_split.
    list_training, list_test, list_validation = utils.data_split.create_image_lists(
        config_vars["dir_raw_images"],
        .5,
        .25
    )

    utils.data_split.write_path_files(config_vars["path_files_training"], list_training)
    utils.data_split.write_path_files(config_vars["path_files_test"], list_test)
    utils.data_split.write_path_files(config_vars["path_files_validation"], list_validation)

In [None]:
# Create output directories
# Every directory the later cells write into is created up front; directories
# that already exist are left untouched.
for dir_key in (
    "dir_training",
    "dir_validation",
    "dir_test",
    "dir_boundary_labels",
    "dir_images_normalized_8bit",
):
    os.makedirs(config_vars[dir_key], exist_ok=True)

## Read data partitions
- Load the file lists that assign each image to the training, validation or test set

In [None]:
# Training list: one file name per line. When a positive limit is configured,
# the list is shuffled in place and cut down to that many entries.
with open(config_vars["path_files_training"]) as training_handle:
    training_files = training_handle.read().splitlines()
    if config_vars["max_training_images"] > 0:
        random.shuffle(training_files)
        training_files = training_files[:config_vars["max_training_images"]]

# Validation list: read as-is.
with open(config_vars["path_files_validation"]) as validation_handle:
    validation_files = validation_handle.read().splitlines()

# Test list: read as-is.
with open(config_vars["path_files_test"]) as test_handle:
    test_files = test_handle.read().splitlines()

## Create Output Targets: Three Class Boundary

In [None]:
# Convert every raw annotation into a three-channel label image
# (channel 0: background, channel 1: object interior, channel 2: boundary)
# and save it under the same file name in the boundary-label directory.
filelist = sorted(os.listdir(config_vars["dir_raw_annotations"]))

# run over all raw annotations
for filename in tqdm(filelist):

    # GET ANNOTATION
    annot = skimage.io.imread(config_vars["dir_raw_annotations"] + filename)

    # multi-channel annotation: keep only the first channel
    if len(annot.shape) == 3:
        annot = annot[:,:,0]

    # relabel connected components so that the size filter below works per object
    annot = skimage.morphology.label(annot)

    # filter small objects, e.g. micronuclei
    annot = skimage.morphology.remove_small_objects(annot, min_size=config_vars["min_nucleus_size"])

    # find boundaries
    boundaries = skimage.segmentation.find_boundaries(annot)

    # thicken the boundary: one dilation pass per even step in [2, boundary_size)
    for _ in range(2, config_vars["boundary_size"], 2):
        boundaries = skimage.morphology.binary_dilation(boundaries)

    # BINARY LABEL

    # prepare buffer for binary label (float, one channel per class)
    label_binary = np.zeros((annot.shape + (3,)))

    # write binary label: every pixel is set in exactly one channel
    label_binary[(annot == 0) & (boundaries == 0), 0] = 1
    label_binary[(annot != 0) & (boundaries == 0), 1] = 1
    label_binary[boundaries == 1, 2] = 1

    # save it - convert explicitly to 8 bit (0 / 255) instead of handing a
    # float64 array to imsave and relying on an implicit conversion;
    # label_binary itself stays float so it can still be inspected afterwards
    skimage.io.imsave(
        config_vars["dir_boundary_labels"] + filename,
        skimage.img_as_ubyte(label_binary)
    )
    

# Show example image: whatever annotation the loop above processed last
print(annot.dtype, annot.shape)

# first the original (relabelled) annotation, then the boundary labels,
# each in its own large figure with a colorbar
for example in (annot, label_binary):
    plt.figure(figsize=(15,15))
    plt.imshow(example)
    plt.colorbar()
    plt.show()

## Image Preprocessing

In [None]:
# Either rescale every raw image to 8 bit and store it as PNG in the
# normalized-image directory, or (when the conversion is disabled) point the
# normalized-image directory at the raw images.
if config_vars["transform_images_to_PNG"]:

    filelist = sorted(os.listdir(config_vars["dir_raw_images"]))

    # intensities below the (100 - percentile)-th and above the percentile-th
    # percentile are clipped before rescaling
    percentile = 99.9

    # run over all raw images
    for filename in tqdm(filelist):

        # load the raw image
        orig_img = skimage.io.imread(config_vars["dir_raw_images"] + filename)

        # IMAGE

        # normalize to [0,1]
        high = np.percentile(orig_img, percentile)
        low = np.percentile(orig_img, 100-percentile)

        img = np.minimum(high, orig_img)
        img = np.maximum(low, img)

        if high > low:
            img = (img - low) / (high - low) # gives float64, thus cast to 8 bit later
        else:
            # constant image: avoid the 0/0 division and map everything to black
            img = np.zeros(orig_img.shape, dtype=np.float64)
        img = skimage.img_as_ubyte(img)

        # replace the extension with .png; splitext also handles extensions
        # that are not exactly three characters long (e.g. ".tiff")
        png_name = os.path.splitext(filename)[0] + '.png'
        skimage.io.imsave(config_vars["dir_images_normalized_8bit"] + png_name, img)

    # Show example image (the last one processed); nothing to show when the
    # raw image directory is empty
    if filelist:
        # the raw image as loaded from disk
        print("BEFORE")
        print(orig_img.dtype, orig_img.shape)
        plt.imshow(orig_img)
        plt.show()
        plt.hist(orig_img.flatten())
        plt.show()

        # the normalized 8-bit image that was written out
        print("AFTER")
        print(img.dtype, img.shape)
        plt.imshow(img)
        plt.show()
        plt.hist(img.flatten())
        plt.show()

else:
    config_vars["dir_images_normalized_8bit"] = config_vars["dir_raw_images"]

# Augment images (optional) 
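* Data augmentation using affine transformations
* `n_points` x `n_points` data points are equally distributed across the image
* `distort` is the distortion setting passed on to `utils.data_augmentation.deform`
* `n_augmentations` examples are kept per image: the original plus the augmented copies numbered 1 to `n_augmentations` - 1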


In [None]:
def generate_augmented_examples(filelist, n_augmentations, n_points, distort, dir_boundary_labels, dir_images_normalized_8bit):
    """Write deformed copies of image/label pairs and return the extended file list.

    For every file name in ``filelist`` that has a boundary label on disk, the
    image and its label are loaded and deformed together with
    ``utils.data_augmentation.deform``. Each deformed pair is saved next to
    the originals as ``<stem>_aug_<NNN><extension>``.

    Args:
        filelist: File names (relative to both directories) to augment.
        n_augmentations: Upper bound of the copy index. Copies are numbered
            1 .. n_augmentations - 1, so n_augmentations - 1 files are written
            per input image.
        n_points: Forwarded to ``deform`` as ``points``.
        distort: Forwarded to ``deform`` as ``distort``.
        dir_boundary_labels: Directory prefix of the label images; joined to
            the file name by plain string concatenation.
        dir_images_normalized_8bit: Directory prefix of the input images;
            joined to the file name by plain string concatenation.

    Returns:
        A new list: ``filelist`` followed by the names of all augmented files
        written. The input list is not modified.
    """
    updated_filelist = []

    # run over all raw images
    for filename in tqdm(filelist):

        # skip images whose boundary labels were never calculated
        if not pathlib.Path(dir_boundary_labels + filename).is_file():
            continue

        # load image
        x = skimage.io.imread(dir_images_normalized_8bit + filename)

        # load annotation
        y = skimage.io.imread(dir_boundary_labels + filename)

        # split once; the pieces are reused for every augmented file name
        stem, extension = os.path.splitext(filename)

        for n in range(1, n_augmentations):
            # augment image and annotation with the same deformation call
            x_augmented, y_augmented = utils.data_augmentation.deform(x, y, points = n_points, distort = distort)

            # filename for augmented images
            filename_augmented = stem + '_aug_{:03d}'.format(n) + extension

            # save the deformed pair, not the untouched originals
            # NOTE(review): assumes deform returns arrays imsave can write
            # directly (same dtype/range as its inputs) -- confirm.
            skimage.io.imsave(dir_images_normalized_8bit + filename_augmented, x_augmented)
            skimage.io.imsave(dir_boundary_labels + filename_augmented, y_augmented)
            updated_filelist.append(filename_augmented)

    return filelist + updated_filelist

# Optionally extend the training list with augmented copies of its images;
# all settings come straight from the configuration.
if config_vars["augment_images"]:
    training_files = generate_augmented_examples(
        filelist=training_files,
        n_augmentations=config_vars["n_augmentations"],
        n_points=config_vars["n_points"],
        distort=config_vars["distort"],
        dir_boundary_labels=config_vars["dir_boundary_labels"],
        dir_images_normalized_8bit=config_vars["dir_images_normalized_8bit"],
    )


## Split data
- Copy the images and their boundary labels into the training, validation and test directories

In [None]:
# Add a 0 suffix because Keras thinks everything is a classification task organized in directories.
# Only the validation and test folders get the extra "0/" level; training
# files are placed directly in x/ and y/.

# (file list, config key of the partition directory, sub-folder below x/ and y/)
partitions = [
    (training_files, "dir_training", ''),
    (validation_files, "dir_validation", '0/'),
    (test_files, "dir_test", '0/'),
]

# create the image (x) folders of all partitions, then the label (y) folders
for kind in ('x/', 'y/'):
    for _, dir_key, class_subdir in partitions:
        os.makedirs(config_vars[dir_key] + kind + class_subdir, exist_ok=True)

# copy each normalized image and its boundary label into its partition
for partition_files, dir_key, class_subdir in partitions:
    for filename in partition_files:
        shutil.copyfile(
            config_vars["dir_images_normalized_8bit"] + filename,
            config_vars[dir_key] + 'x/' + class_subdir + filename
        )
        shutil.copyfile(
            config_vars["dir_boundary_labels"] + filename,
            config_vars[dir_key] + 'y/' + class_subdir + filename
        )