# Functions

In [9]:
import h5py
import os
import matplotlib.pyplot as plt
import numpy as np
from tifffile import imwrite

# Function to ensure each directory gets at least one image
def ensure_initial_images(assigned, directory, index, im_new):
    if not assigned:
        imwrite(directory + '/' + f'cell_{index}.tif', im_new)
        return True, index + 1
    return assigned, index

# Process slides with modified directory assignment
def process_slides(slides, type_dir, initial_assigned):
    index = 1  # Start indexing for cell images
    assigned = {key: False for key in initial_assigned}

    for slide in slides:
        with h5py.File(original_dir + slide + '.hdf5', 'r') as f:
            dset = f['data']
            n_cells = dset.shape[0]
            current_samp_size = min(samp_size, n_cells)  # Adjust sample size based on available cells
            samp = np.random.choice(n_cells, current_samp_size, replace=False)

            for i in samp:
                im = dset[i][0, :, :]
                crop = im[(center_xy[0] - offset):(center_xy[0] + offset), (center_xy[1] - offset):(center_xy[1] + offset)]
                im_new = np.reshape(crop, (xydim, xydim, 1))

                # Ensure at least one image in each directory
                for key in assigned:
                    if not assigned[key]:
                        assigned[key], index = ensure_initial_images(assigned[key], key, index, im_new)

                # Random assignment after the initial images
                if np.random.uniform() <= split_prob:
                    if np.random.uniform() <= split_prob:
                        dir_path = type_dir['train']
                    else:
                        dir_path = type_dir['validation']
                else:
                    dir_path = type_dir['test']

                imwrite(dir_path + '/' + f'cell_{index}.tif', im_new)
                index += 1

    return index

# Directory definitions
current_directory = os.getcwd()
print(current_directory)

base_directory = "/home/jovyan/Teaching/BigDataDL/"
base_dir = base_directory + "LabData/HPV_slides/"
os.makedirs(base_dir, exist_ok=True)

train_dir = os.path.join(base_dir, 'train')
os.makedirs(train_dir, exist_ok=True)
validation_dir = os.path.join(base_dir, 'validation')
os.makedirs(validation_dir, exist_ok=True)
test_dir = os.path.join(base_dir, 'test')
os.makedirs(test_dir, exist_ok=True)

train_healthy_dir = os.path.join(train_dir, 'healthy')
os.makedirs(train_healthy_dir, exist_ok=True)
train_tumor_dir = os.path.join(train_dir, 'tumor')
os.makedirs(train_tumor_dir, exist_ok=True)

validation_healthy_dir = os.path.join(validation_dir, 'healthy')
os.makedirs(validation_healthy_dir, exist_ok=True)
validation_tumor_dir = os.path.join(validation_dir, 'tumor')
os.makedirs(validation_tumor_dir, exist_ok=True)

test_healthy_dir = os.path.join(test_dir, 'healthy')
os.makedirs(test_healthy_dir, exist_ok=True)
test_tumor_dir = os.path.join(test_dir, 'tumor')
os.makedirs(test_tumor_dir, exist_ok=True)

original_dir = base_directory + 'ZippedLabData/HPV_slides/'

samp_size = 333
split_prob = 0.8
slides_healthy = ['glass3', 'glass4', 'glass5', 'glass6', 'glass7', 'glass8']
slides_tumor = ['glass12', 'glass36', 'glass37', 'glass38']

im_x, im_y = 80, 80
center_xy = [int(im_x/2), int(im_y/2)]
offset = 24
xydim = offset * 2

# Process each slide type
type_dirs_healthy = {'train': train_healthy_dir, 'validation': validation_healthy_dir, 'test': test_healthy_dir}
type_dirs_tumor = {'train': train_tumor_dir, 'validation': validation_tumor_dir, 'test': test_tumor_dir}

initial_assigned_healthy = {k: False for k in type_dirs_healthy.values()}
initial_assigned_tumor = {k: False for k in type_dirs_tumor.values()}

last_index_healthy = process_slides(slides_healthy, type_dirs_healthy, initial_assigned_healthy)
process_slides(slides_tumor, type_dirs_tumor, initial_assigned_tumor)

/home/jovyan/Teaching/BigDataDL/Assignments


1229