# Break up into holdout and vaildation sets

In [2]:
import os
import random
import shutil
import re
import glob
import numpy as np
from sklearn.model_selection import KFold
from pathlib import Path
import numpy as np, pandas as pd, matplotlib.pyplot as plt
import pickle 

In [5]:
# Path to the folder containing the PNG images and labels
data_folder = "/group/pawsey0149/mmckay/singularity/raw_fullsize_images_masks_PNG/"

# Path to the new directories for holdout and training sets
holdout_set_folder = "/group/pawsey0149/mmckay/singularity/holdout_set_224s"
training_set_folder = "/group/pawsey0149/mmckay/singularity/training_set_224s"

# Create the directories if they don't exist
os.makedirs(os.path.join(holdout_set_folder, "images"), exist_ok=True)
os.makedirs(os.path.join(holdout_set_folder, "labels"), exist_ok=True)
os.makedirs(os.path.join(training_set_folder, "images"), exist_ok=True)
os.makedirs(os.path.join(training_set_folder, "labels"), exist_ok=True)

# Get a list of all the image files in the data folder
image_files = [f for f in os.listdir(data_folder) if re.match(r".*_image_\d+\.png", f)]

# Shuffle the list of image files
random.shuffle(image_files)

# Calculate the index where we will split the data
split_index = int(len(image_files) * 0.1)  # 10% for holdout set

# Copy image files to the holdout set "images" directory
for image_file in image_files[:split_index]:
    src_image_path = os.path.join(data_folder, image_file)
    dst_image_path = os.path.join(holdout_set_folder, "images", image_file)
    shutil.copy(src_image_path, dst_image_path)

# Copy image files to the training set "images" directory
for image_file in image_files[split_index:]:
    src_image_path = os.path.join(data_folder, image_file)
    dst_image_path = os.path.join(training_set_folder, "images", image_file)
    shutil.copy(src_image_path, dst_image_path)

# Get a list of label files corresponding to the image files
label_files = [re.sub(r"_image_(\d+)\.png", r"_mask_\1.png", f) for f in image_files]

# Copy label files to the holdout set "labels" directory
for label_file in label_files[:split_index]:
    src_label_path = os.path.join(data_folder, label_file)
    dst_label_path = os.path.join(holdout_set_folder, "labels", label_file)
    shutil.copy(src_label_path, dst_label_path)

# Copy label files to the training set "labels" directory
for label_file in label_files[split_index:]:
    src_label_path = os.path.join(data_folder, label_file)
    dst_label_path = os.path.join(training_set_folder, "labels", label_file)
    shutil.copy(src_label_path, dst_label_path)

print("Data split into holdout and training sets successfully.")

Data split into holdout and training sets successfully.


# Kfold function

In [6]:
# functions for kfold and datablocks
# def label_func(fn): 
#     return path/"labels"/f"{fn.stem.replace('image', 'mask')}{fn.suffix}"

def fname_folds(fold=0):
    def __inner(path, fold=0):
        fnames = get_image_files(path)
        from sklearn.model_selection import KFold
        kfold = KFold(n_splits=5, shuffle=True, random_state=33)
        kfold_ds = {}
        i=0
        for ti, vi in kfold.split(fnames):
            kfold_ds[i] = fnames[ti] + fnames[vi]
            i+=1
        return kfold_ds[fold]
    return __inner

def KfoldSplitter(valid_pct=0.2, seed=None): 
    def _inner(o):
        if seed is not None: torch.manual_seed(seed)
        rand_idx = L(list(torch.randperm(len(o)).numpy()))
        cut = int(valid_pct * len(o))
        return rand_idx[cut:], rand_idx[:cut]
    return _inner

In [7]:
fnames = glob.glob('/group/pawsey0149/mmckay/singularity/training_set_224s/images/*')
fnames = np.array(fnames)

kf = KFold(n_splits=5, shuffle=True, random_state=33)
kfold_ds = {}
i = 0

for ti, vi in kf.split(fnames):
    kfold_ds[i] = np.concatenate((fnames[ti], fnames[vi]))
    i += 1
kfold_ds

# Function to save the dictionary into 'dictionary.pkl'
def save_dict_as_pickle(filename, data):
    with open(filename, 'wb') as file:
        pickle.dump(data, file)

# Save the dictionary into 'folds_dictionary.pkl'
save_dict_as_pickle('/group/pawsey0149/mmckay/singularity/folds_dictionary_224s.pkl', kfold_ds)

with open('/group/pawsey0149/mmckay/singularity/folds_dictionary_224s.pkl', 'wb') as f:
    pickle.dump(kfold_ds, f)
        


In [8]:
import pickle 
with open('/group/pawsey0149/mmckay/singularity/folds_dictionary_224s.pkl', 'rb') as f:
    loaded_dict= pickle.load(f)
    
loaded_dict[0]

array(['/group/pawsey0149/mmckay/singularity/training_set_224s/images/YC_IMG_153_image_66.png',
       '/group/pawsey0149/mmckay/singularity/training_set_224s/images/B7t1_image_48.png',
       '/group/pawsey0149/mmckay/singularity/training_set_224s/images/YC_IMG_115_image_40.png',
       ...,
       '/group/pawsey0149/mmckay/singularity/training_set_224s/images/YC_IMG_126_image_94.png',
       '/group/pawsey0149/mmckay/singularity/training_set_224s/images/YC_IMG_132_image_18.png',
       '/group/pawsey0149/mmckay/singularity/training_set_224s/images/F5t1_image_22.png'],
      dtype='<U85')

In [9]:
loaded_dict[4]

array(['/group/pawsey0149/mmckay/singularity/training_set_224s/images/YC_IMG_158_image_30.png',
       '/group/pawsey0149/mmckay/singularity/training_set_224s/images/YC_IMG_159_image_63.png',
       '/group/pawsey0149/mmckay/singularity/training_set_224s/images/E5t2_image_6.png',
       ...,
       '/group/pawsey0149/mmckay/singularity/training_set_224s/images/YC_IMG_103_image_67.png',
       '/group/pawsey0149/mmckay/singularity/training_set_224s/images/B6t2_image_44.png',
       '/group/pawsey0149/mmckay/singularity/training_set_224s/images/D12t1_image_43.png'],
      dtype='<U85')

# need to modify the 3 python files, at top is instructions to run
#need to create an array and save it as a .pkl file using pickle
#make sure to change batch image size of vgg model to 500pixels
#check callbacks to make sure its not stopping prematurely
#run unet_mmckay.slm and look at to see instructions for each one of the trials files