# Prepare raw data for training

Purpose is to merge Zahra's original dataset (while removing some poorly annotated images) and the new dataset from April 19.

## Place data in `data/raw`

The raw image data from Zahra consists of identically named '.tif' files of the OCT scans and the labeled ground truth masks. This image data was organized in the following:
```
   └── raw
      ├── OCT_scans
      │  ├── images
      │  │  └── ....tif
      │  └── masks
      │     └── ....tif
      └── OCT_scans_new_20230419
         ├── images
         │  └── ....tif
         ├── masks
         │  └── ....tif
         ├── merged_masks # this is mask w/ {0:"bg", 1:"skull", 2:"skin"}
         │  └── ....tif
         ├── skin_mask
         │  └── ....tif
         └── skull_mask
            └── ....tif
```

## Copy data from `data/raw` to `data/interim`

In [13]:
# Prepare notebook
import os
import shutil
from distutils.dir_util import copy_tree
from matplotlib import pyplot as plt
import tifffile
import numpy as np
%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
# define raw data directories
orig_raw_dir = "../data/raw/OCT_scans"
update_raw_dir = "../data/raw/OCT_scans_new_20230419"

# define new data interim directory
interim_dir = "../data/interim/OCT_scans_original_and_20230419"

# Copy the original data to the interim directory
shutil.copytree(src=orig_raw_dir, dst=interim_dir, dirs_exist_ok=True)

# Copy the images and masks from the updated data to interim dir
# need to specify the mask directory because the {0:"bg", 1:"skull", 2:"skin"}
# masks are located in the "./merged_masks" directory
update_img_dir = f"{update_raw_dir}/images"
update_mask_dir = f"{update_raw_dir}/merged_masks"
interim_img_dir = f"{interim_dir}/images"
interim_mask_dir = f"{interim_dir}/masks"
copy_tree(src=update_img_dir, dst=interim_img_dir)
copy_tree(src=update_mask_dir, dst=interim_mask_dir)

['../data/interim/OCT_scans_original_and_20230419/masks\\unet (1).tif',
 '../data/interim/OCT_scans_original_and_20230419/masks\\unet (10).tif',
 '../data/interim/OCT_scans_original_and_20230419/masks\\unet (100).tif',
 '../data/interim/OCT_scans_original_and_20230419/masks\\unet (102).tif',
 '../data/interim/OCT_scans_original_and_20230419/masks\\unet (103).tif',
 '../data/interim/OCT_scans_original_and_20230419/masks\\unet (104).tif',
 '../data/interim/OCT_scans_original_and_20230419/masks\\unet (105).tif',
 '../data/interim/OCT_scans_original_and_20230419/masks\\unet (106).tif',
 '../data/interim/OCT_scans_original_and_20230419/masks\\unet (107).tif',
 '../data/interim/OCT_scans_original_and_20230419/masks\\unet (108).tif',
 '../data/interim/OCT_scans_original_and_20230419/masks\\unet (109).tif',
 '../data/interim/OCT_scans_original_and_20230419/masks\\unet (11).tif',
 '../data/interim/OCT_scans_original_and_20230419/masks\\unet (110).tif',
 '../data/interim/OCT_scans_original_and_2

## Convert all masks to categorically labeled

 - 0: background
 - 1: skull
 - 2: skin

In [16]:
# Interim images and masks directory
img_dir = f"{interim_dir}/images"
mask_dir = f"{interim_dir}/masks"

# Image names in the interim directory
orig_data_name = "train-P (1).tif"
update_data_name = "unet (1).tif"

# Show image statistics and image
img_path = f"{img_dir}/{orig_data_name}"
img = tifffile.imread(img_path)
print(f"IMAGE STATS ({orig_data_name}):\nshape:\t{img.shape}\ndtype:\t{img.dtype}\nmin:\t{np.amin(img)}\nmax:\t{np.amax(img)}")
img_path = f"{img_dir}/{update_data_name}"
img = tifffile.imread(img_path)
print(f"IMAGE STATS ({update_data_name}):\nshape:\t{img.shape}\ndtype:\t{img.dtype}\nmin:\t{np.amin(img)}\nmax:\t{np.amax(img)}")

# Show mask statistics and image
mask_path = f"{mask_dir}/{orig_data_name}"
mask = tifffile.imread(mask_path)
print()
print(f"MASK STATS ({orig_data_name}):\nshape:\t{mask.shape}\ndtype:\t{mask.dtype}\nmin:\t{np.amin(mask)}\nmax:\t{np.amax(mask)}\nunique:\t{np.unique(mask)}")
mask_path = f"{mask_dir}/{update_data_name}"
mask = tifffile.imread(mask_path)
print(f"MASK STATS ({update_data_name}):\nshape:\t{mask.shape}\ndtype:\t{mask.dtype}\nmin:\t{np.amin(mask)}\nmax:\t{np.amax(mask)}\nunique:\t{np.unique(mask)}")

IMAGE STATS (train-P (1).tif):
shape:	(512, 512)
dtype:	uint16
min:	0
max:	65533
IMAGE STATS (unet (1).tif):
shape:	(512, 512)
dtype:	uint16
min:	0
max:	65533

MASK STATS (train-P (1).tif):
shape:	(512, 512)
dtype:	uint8
min:	0
max:	255
unique:	[  0 255]
MASK STATS (unet (1).tif):
shape:	(512, 512)
dtype:	uint8
min:	0
max:	1
unique:	[0 1]


As shown above, the masks for the origianl dataset are not properly labeld because 255 = skull rather than 1 = skull.

Therefore, convert the original masks to be 1 = skull.

In [18]:
def to_(img:np.ndarray, dtype, scaling:float,  offset:float = 0):
    """
    Return image converted to dtype
    
    Pixel-wise intensity formual: I = I*scaling + offset.
    Casts to dtype before return

    Args:
        img (np.ndarray): Image to convert
        dtype: dtype class to convert to
        scaling (float): Scaling in pixel-wise transform formula
        offset (float): Offset in pixel-wise transform formula
    """
    flt = img.astype(float)
    flt = flt*scaling + offset
    return flt.astype(dtype)

In [19]:
for mask_name in os.listdir(mask_dir):
    # all original data are labeled as `train-P (xx).tif`
    if "train-P" in mask_name:
        mask_path = f"{mask_dir}/{mask_name}"
        mask = tifffile.imread(mask_path)
        if 255 in np.unique(mask):
            print(f"Converting {mask_name} mask")
            scaling = 1/255
            mask = to_(mask, np.uint8, scaling)
            tifffile.imwrite(mask_path, mask)

Converting train-P (1).tif mask
Converting train-P (10).tif mask
Converting train-P (11).tif mask
Converting train-P (12).tif mask
Converting train-P (13).tif mask
Converting train-P (14).tif mask
Converting train-P (15).tif mask
Converting train-P (16).tif mask
Converting train-P (17).tif mask
Converting train-P (18).tif mask
Converting train-P (19).tif mask
Converting train-P (2).tif mask
Converting train-P (20).tif mask
Converting train-P (21).tif mask
Converting train-P (22).tif mask
Converting train-P (23).tif mask
Converting train-P (24).tif mask
Converting train-P (25).tif mask
Converting train-P (26).tif mask
Converting train-P (27).tif mask
Converting train-P (28).tif mask
Converting train-P (29).tif mask
Converting train-P (3).tif mask
Converting train-P (30).tif mask
Converting train-P (31).tif mask
Converting train-P (32).tif mask
Converting train-P (33).tif mask
Converting train-P (34).tif mask
Converting train-P (35).tif mask
Converting train-P (36).tif mask
Converting tr

## Split dataset into training and testing/validation datset

In [21]:
# Ignore the poorly labeled images with skin in original dataste
ignore_imgs = [
    # "train-P (1).tif", # skin is not labeled in mask
    "train-P (2).tif",
    "train-P (3).tif",
    "train-P (7).tif",
    "train-P (8).tif",
    "train-P (10).tif",
    "train-P (17).tif",
    "train-P (20).tif",
    "train-P (21).tif",
    "train-P (23).tif",
    "train-P (26).tif",
    "train-P (28).tif",
    "train-P (29).tif",
    # "train-P (30).tif", # skin is not labeled in mask
    "train-P (31).tif",
    "train-P (32).tif",
    "train-P (34).tif",
    "train-P (35).tif",
    "train-P (36).tif",
    "train-P (37).tif",
    "train-P (38).tif",
    "train-P (39).tif",
    "train-P (40).tif",
    "train-P (41).tif",
    "train-P (43).tif",
    "train-P (44).tif",
    "train-P (45).tif",
    "train-P (47).tif",
    "train-P (48).tif",
    "train-P (49).tif",
    "train-P (50).tif",
    # "train-P (51).tif", # skin is not labeled in mask
    # "train-P (52).tif", # skin is not labeled in mask
    # "train-P (53).tif", # skin is not labeled in mask
    # "train-P (54).tif", # skin is not labeled in mask
    # "train-P (55).tif", # skin is not labeled in mask
    # "train-P (57).tif", # skin is not labeled in mask
    "train-P (58).tif",
    "train-P (59).tif",
    "train-P (63).tif",
    "train-P (76).tif",
    # "train-P (77).tif", # skin is not labeled in mask
]

In [26]:
def single_dir_split(data_dir:str, img_dir_name:str, mask_dir_name:str, split_vals=(70, 20, 10), seed=1000, ext='.tif'):
    """
    Splits all images in `data_dir` according to `split_vals`. Output to "data_dir/split".
    
    Args:
        data_dir (str): Root dir containing image and mask directories
        img_dir_name (str): Name of 'images' dir in `data_dir`
        mask_dir_name (str): Name of 'masks' dir in `data_dir`
        split_vals (list): Split percentages as (train, val, test)
        seed (int): random seed
        ext (str): image extension
    """
    # Validate directory paths and split percentages
    img_dir = f"{data_dir}/{img_dir_name}"
    mask_dir = f"{data_dir}/{mask_dir_name}"
    if not os.path.isdir(img_dir):
        raise FileNotFoundError(f"Image directory not found: {img_dir}")
    if not os.path.isdir(mask_dir):
        raise FileNotFoundError(f"Mask directory not found: {mask_dir}")
    if sum(split_vals) != 100:
        raise ValueError(f"Invalid split percentages: {split_vals}. Must add to 100")
    
    # Count number of images and masks
    imgs = os.listdir(img_dir)
    masks =os.listdir(mask_dir)
    if len(imgs) == 0:
        raise ValueError(f"No images in the image directory: {img_dir}")
    if len(masks) == 0:
        raise ValueError(f"No masks in the mask directory: {mask_dir}")
    
    # IGNORE IMAGES
    for img, mask in zip(imgs, masks):
        if img in ignore_imgs:
            imgs.remove(img)
            masks.remove(mask)

    
    # Count number of identically named images/masks
    img_mask_pairs = []
    for img in imgs:
        if img in masks:
            img_mask_pairs.append(img)
    n_unique = len(img_mask_pairs)
    if n_unique == 0:
        raise ValueError("Could not find identical filenames between image dir"
                         f"({img_dir}) and mask dir {mask_dir}.")
    print(f"Found {n_unique} image-mask pairs from {len(imgs)} images and {len(masks)} masks")
    
    # Generate random index for each file corresponding to train/val/test
    indices = np.arange(n_unique)
    np.random.seed(seed)
    np.random.shuffle(indices)
    train_len = int(round(split_vals[0]/100 * n_unique))
    val_len = int(round(split_vals[1]/100 * n_unique))
    test_len = int(round(split_vals[2]/100 * n_unique))
    train_inds = indices[:train_len]
    val_inds = indices[train_len:train_len+val_len] # val draws from front of randmized images
    test_inds = indices[train_len+val_len:] # test draws from back of randmized images

    # Create train/val/test directories
    train_dir = f"{data_dir}/split/train"
    val_dir = f"{data_dir}/split/val"
    test_dir = f"{data_dir}/split/test"
    train_exist = False
    val_exist = False
    test_exist = False 
    for dirname in [img_dir_name, mask_dir_name]:
        if not os.path.isdir(f"{train_dir}/{dirname}"):
            os.makedirs(f"{train_dir}/{dirname}")
        else:
            train_exist = True
        if not os.path.isdir(f"{val_dir}/{dirname}"):
            os.makedirs(f"{val_dir}/{dirname}")
        else:
            val_exist = True
        if not os.path.isdir(f"{test_dir}/{dirname}"):
            os.makedirs(f"{test_dir}/{dirname}")
        else:
            test_exist = True

    # Split data
    if not train_exist and not val_exist and not test_exist:
        print(f'copying data from {len(train_inds)} training instances')
        for train_index in train_inds:
            fname = img_mask_pairs[train_index]
            img_src = f"{img_dir}/{fname}"
            mask_src = f"{mask_dir}/{fname}"
            img_dst = f"{train_dir}/{img_dir_name}/{fname}"
            mask_dst = f"{train_dir}/{mask_dir_name}/{fname}"
            shutil.copy(img_src, img_dst)
            shutil.copy(mask_src, mask_dst)
        print(f'copying data from {len(val_inds)} validation instances')
        for val_index in val_inds:
            fname = img_mask_pairs[val_index]
            img_src = f"{img_dir}/{fname}"
            mask_src = f"{mask_dir}/{fname}"
            img_dst = f"{val_dir}/{img_dir_name}/{fname}"
            mask_dst = f"{val_dir}/{mask_dir_name}/{fname}"
            shutil.copy(img_src, img_dst)
            shutil.copy(mask_src, mask_dst)
        print(f'copying data from {len(test_inds)} testing instances')
        for test_index in test_inds:
            fname = img_mask_pairs[test_index]
            img_src = f"{img_dir}/{fname}"
            mask_src = f"{mask_dir}/{fname}"
            img_dst = f"{test_dir}/{img_dir_name}/{fname}"
            mask_dst = f"{test_dir}/{mask_dir_name}/{fname}"
            shutil.copy(img_src, img_dst)
            shutil.copy(mask_src, mask_dst)
    else:
        print("No data split because test/train/val already exsited.")

In [27]:
single_dir_split(data_dir=interim_dir, img_dir_name='images', mask_dir_name='masks', split_vals=(85, 15, 0), ext='.tif')

Found 183 image-mask pairs from 183 images and 183 masks
copying data from 156 training instances
copying data from 27 validation instances
copying data from 0 testing instances


## Finally move processed data to the `data/processed` directory

In [30]:
SIZE = 512
input_dir = f"{interim_dir}/split"
data_basename = os.path.basename(interim_dir)
output_dir = f"../data/processed/{data_basename}_{SIZE}x{SIZE}"
shutil.copytree(input_dir, output_dir)

'../data/processed/OCT_scans_original_and_20230419_512x512'