# Data Preparation (Manual)

Assumes you have prepared the data according to the manual method shown in this YouTube video (https://www.youtube.com/watch?v=M3ZWfamWrBM)

Prior Steps involved:
1. Create 'dicom_file' folder to store all dicom intermediate data
2. Create 'images' and 'labels' folders in 'dicom_file' to store all input(data) and output(labels)
3. For each patient, use 3D Slicer to convert their image and segmentation data into images and labels
4. Create 'dicom_groups' folder to store all subsampled intermediate data
5. Create 'images' and 'labels' folders in 'dicom_groups' to store all input(data) and output(labels)
6. Create 'nifti_files' folder to store nifti outputs
7. Create 'images' and 'labels' folders in 'nifti_files' to store all input(data) and output(labels)
8. Create 'task_data' folder to store final data location
9. Create 'TrainVolumes', 'TrainSegmentation', 'TestVolumes', 'TestSegmentation' folders in 'task_data' to store the separation of data for each use case

In [5]:
# Folders holding the DICOM and NIfTI intermediates.
# All paths are relative to the notebook's working directory.

# images: source DICOM folders -> grouped DICOM folders -> NIfTI output
in_images_dir = "../dicom_file/images"
out_images_dir = "../dicom_groups/images"
out_nifti_img_dir = "../nifti_files/images/"

# labels: the same three stages, for the segmentation data
in_labels_dir = "../dicom_file/labels"
out_labels_dir = "../dicom_groups/labels"
out_nifti_lbl_dir = "../nifti_files/labels/"

# Final destinations for the train/test split of the NIfTI files.

train_images_dir = "../task_data/TrainVolumes/"
train_labels_dir = "../task_data/TrainSegmentation/"
test_images_dir = "../task_data/TestVolumes/"
test_labels_dir = "../task_data/TestSegmentation/"

# Slice count handed to create_groups when splitting each patient
# (presumably the number of slices per group - confirm in preporcess.py).
num_slices = 64

# Fraction of the NIfTI files that goes to training; must be strictly
# between 0 and 1 (checked before the split), the rest becomes test data.
train_proportion = 0.8

In [2]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to local .py files take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
# import required packages

import os
from glob import glob
import shutil
import logging
import numpy as np

from preporcess import create_groups, dcm2nifti

### Step 1: Split DICOM files into similar sized data 
Before we actually split the DICOM data, first print the list of directories to be targeted for confirmation

In [None]:
# Show every entry that is about to be split: first everything under the
# image folder, then everything under the label folder, each in sorted order.
for source_dir in (in_images_dir, in_labels_dir):
    patient_dirs = glob(source_dir + "/*")
    patient_dirs.sort()
    for patient_dir in patient_dirs:
        print(patient_dir)

Runs the splitting tool provided by original author.
WARNING: original code moves data to save space

In [58]:
# (source, destination) pairs to split: images first, then labels
split_jobs = (
    (in_images_dir, out_images_dir),
    (in_labels_dir, out_labels_dir),
)
for source_dir, grouped_dir in split_jobs:
    create_groups(source_dir, grouped_dir, num_slices)

### Optional: Move split DICOM files back into their original folders

In [57]:
def restore_groups(original_dir, groups_dir):
    """Move split DICOM files back into their original patient folders.

    For every patient folder in ``original_dir``, each folder in
    ``groups_dir`` whose name starts with the patient's folder name is
    emptied into the patient folder and then deleted.

    Args:
        original_dir: folder containing one sub-folder per patient.
        groups_dir: folder containing the split sub-folders to undo.

    Raises:
        shutil.Error: if a file with the same name already exists in the
            patient folder (raised by ``shutil.move``).
    """
    patients = [p for p in glob(original_dir + "/*") if os.path.isdir(p)]
    # Sub-folders are matched by name prefix, so handle the longest patient
    # names first: "p_10" must claim "p_10_0" (and delete it) before the
    # shorter "p_1" gets a chance to match it and swallow the wrong slices.
    # NOTE(review): names that stay ambiguous even then (a patient called
    # "p_1_1" next to group 1 of "p_1") cannot be told apart by prefix alone.
    patients.sort(key=lambda p: len(os.path.basename(p)), reverse=True)

    for patient in patients:
        tail = os.path.basename(patient)
        for sub_patient in sorted(glob(groups_dir + "/" + tail + "*")):
            if not os.path.isdir(sub_patient):
                # stray file that happens to share the prefix; leave it alone
                continue
            for file in glob(sub_patient + "/*"):
                shutil.move(file, patient)
            # the group folder is now empty (apart from hidden files)
            shutil.rmtree(sub_patient)


# move back images
restore_groups(in_images_dir, out_images_dir)

# move back labels
restore_groups(in_labels_dir, out_labels_dir)

### Step 2: Convert data back into nifti file format

In [None]:
# (grouped DICOM folder, NIfTI output folder) pairs: images first, then labels
conversion_jobs = (
    (out_images_dir, out_nifti_img_dir),
    (out_labels_dir, out_nifti_lbl_dir),
)
for grouped_dir, nifti_dir in conversion_jobs:
    dcm2nifti(grouped_dir, nifti_dir)

### Step 3: Move files into training and testing folders

WARNING: Toolkit moves files to new folder (use following code to move them back)

In [19]:
def assert_data_labels_match(images, labels):
    """Check that image and label paths pair up one-to-one by file name.

    Both sequences are compared position by position, so they are expected
    to be in the same order (e.g. both sorted).

    Args:
        images: sequence of image file paths.
        labels: sequence of label file paths.

    Raises:
        AssertionError: if the two sequences differ in length, or if any
            pair at the same position has different base file names. The
            message names the counts or the offending pair.
    """
    assert len(images) == len(labels), (
        "found %d images but %d labels" % (len(images), len(labels))
    )
    for img_name, lbl_name in zip(images, labels):
        # only the file name has to agree; the folders differ by design
        assert os.path.basename(img_name) == os.path.basename(lbl_name), (
            "image/label name mismatch: %s vs %s" % (img_name, lbl_name)
        )


# Seed for the random train/test assignment. Keeping it fixed means the
# same set of files yields the same split on every run.
split_seed = 42

# load files to be moved; both lists are sorted so that position i in
# `images` and position i in `labels` refer to the same file name
images = sorted(glob(out_nifti_img_dir + "/*.nii.gz"))
labels = sorted(glob(out_nifti_lbl_dir + "/*.nii.gz"))

# run checks first, before any file is touched
assert 0.0 < train_proportion < 1.0, "train_proportion must be strictly between 0 and 1"
assert len(images) > 0, "no .nii.gz files found in " + out_nifti_img_dir
assert_data_labels_match(images, labels) # image and label name matches

# shutil.move needs the destination folders to exist already
for target_dir in (train_images_dir, train_labels_dir, test_images_dir, test_labels_dir):
    os.makedirs(target_dir, exist_ok=True)

# randomly pull N data for training depending on proportion
N = int(round(len(images) * train_proportion))
print('Num of training data:',N)
print('Num of test data:', len(images) - N)
train_ind = np.full((len(images)), False, dtype=bool)
rng = np.random.default_rng(split_seed)
train_ind[rng.choice(len(images), N, replace=False)] = True

# move each image together with its label so a pair never gets separated
for image, label, is_train in zip(images, labels, train_ind):
    if is_train:
        shutil.move(image, train_images_dir)
        shutil.move(label, train_labels_dir)
    else:
        shutil.move(image, test_images_dir)
        shutil.move(label, test_labels_dir)

Num of training data: 372
Num of test data: 93


### Optional: Move files back to Nifti folder to be redeployed

In [22]:
# Collect everything up front, before any file is moved.
train_images = glob(train_images_dir + "/*.nii.gz")
train_labels = glob(train_labels_dir + "/*.nii.gz")
test_images = glob(test_images_dir + "/*.nii.gz")
test_labels = glob(test_labels_dir + "/*.nii.gz")

# (files to move, folder they return to, description used in the report)
restore_plan = (
    (train_images, out_nifti_img_dir, 'train images'),
    (train_labels, out_nifti_lbl_dir, 'train labels'),
    (test_images, out_nifti_img_dir, 'test images'),
    (test_labels, out_nifti_lbl_dir, 'test labels'),
)

for moved_files, nifti_dir, description in restore_plan:
    for moved_file in moved_files:
        shutil.move(moved_file, nifti_dir)
    print('Moved',len(moved_files),description)

Moved 372 train images
Moved 372 train labels
Moved 93 test images
Moved 93 test labels
