In [1]:
import os
import shutil
import nibabel as nib
import cv2
import numpy as np
import pathlib
import re
import json
from collections import defaultdict
import random

def natural_sort_key(s, _nsre=re.compile('([0-9]+)')):
    return [int(text) if text.isdigit() else text.lower()
            for text in re.split(_nsre, s)]

# Input directories
path = "ArabidopsisDataset"
read_path = pathlib.Path(path).glob('*/*/*/*.png')
image_paths = sorted([str(path) for path in read_path], key=natural_sort_key)

# Output directories
outpath = "nnUNet_files/nnUNet_raw/Dataset789_ChronoRoot2/imagesTr"
labels_outpath = "nnUNet_files/nnUNet_raw/Dataset789_ChronoRoot2/labelsTr"

os.makedirs(outpath, exist_ok=True)
os.makedirs(labels_outpath, exist_ok=True)

# Group images by folder
folder_images = defaultdict(list)
for path in image_paths:
    # Get the folder path - using os.path for better compatibility
    folder_path = os.path.dirname(path)
    folder_images[folder_path].append(path)

# Create a mapping dictionary that includes the folder info
folder_image_map = defaultdict(list)

# Counter for image naming
counter = 1
image_id = ""

# Process all images grouped by folder
for folder, images in folder_images.items():
    folder_id = "Manual_Annotation_" + folder.replace("/", "_").replace(" ", "_")  # Create a safe folder ID
    
    print(f'Processing {folder}, image ID: {image_id}')
        
    # Process each image in the folder
    for path in images:
        # Create image ID
        image_id = f'image_{counter}'
        
        # Copy the image to the output directory
        shutil.copy(path, os.path.join(outpath, f'{image_id}_0000.png'))
        
        # Load the mask
        mask_path = path.replace('.png', '.nii.gz')
        mask = nib.load(mask_path).get_fdata().T
        
        if mask.shape[0] == 1:
            mask = mask[0]
        
        image = cv2.imread(path, 0)
        # Image shape and mask shape should be the same
        if image.shape[0] != mask.shape[0]:
            continue

        # Save the mask
        cv2.imwrite(os.path.join(labels_outpath, f'{image_id}.png'), mask.astype('uint8'))
        
        folder_image_map[folder_id].append(image_id)
        
        # Update the counter
        counter += 1

with open('nnUNet_files/nnUNet_raw/Dataset789_ChronoRoot2/folder_image_map.json', '+w') as f:
    json.dump(folder_image_map, f, indent=4)

Processing ArabidopsisDataset/Etiolation/MultipleVids/Primeras listo, image ID: 
Processing ArabidopsisDataset/Etiolation/MultipleVids/Varios, image ID: image_4
Processing ArabidopsisDataset/Etiolation/rpi101_2024-07-29_14-47/1 listo, image ID: image_12
Processing ArabidopsisDataset/Etiolation/rpi101_2024-07-29_14-47/2 listo, image ID: image_16
Processing ArabidopsisDataset/Etiolation/rpi101_2024-07-29_14-47/3 listo, image ID: image_20
Processing ArabidopsisDataset/Etiolation/rpi101_2024-11-06_15-18/1, image ID: image_24
Processing ArabidopsisDataset/Etiolation/rpi101_2024-11-06_15-18/2, image ID: image_29
Processing ArabidopsisDataset/Etiolation/rpi101_2024-11-06_15-18/3, image ID: image_34
Processing ArabidopsisDataset/Etiolation/rpi101_2024-11-06_15-18/4, image ID: image_39
Processing ArabidopsisDataset/Etiolation/rpi102_2024-06-11_14-10/1 listo, image ID: image_44
Processing ArabidopsisDataset/Etiolation/rpi102_2024-06-11_14-10/2 listo, image ID: image_48
Processing ArabidopsisData

In [3]:
import json
import numpy as np
import os

# Path to the folder_image_map.json file
folder_map_path = 'nnUNet_files/nnUNet_raw/Dataset789_ChronoRoot2/folder_image_map.json'

# Path to save the splits file
splits_output_path = 'nnUNet_files/nnUNet_preprocessed/Dataset789_ChronoRoot2/splits_final.json'

# Make directories if they don't exist
os.makedirs(os.path.dirname(splits_output_path), exist_ok=True)

# Load the folder-to-images mapping
with open(folder_map_path, 'r') as f:
    folder_image_map = json.load(f)

# Get all folder IDs
folder_ids = list(folder_image_map.keys())

# Set a seed for reproducibility
np.random.seed(42)

# Shuffle the folder IDs
np.random.shuffle(folder_ids)

# Split the folder IDs into 5 folds
folder_folds = np.array_split(folder_ids, 5)

# Create the 5-fold cross validation splits
splits = []

for i in range(5):
    # Folders for validation in this fold
    val_folders = folder_folds[i]
    
    # Initialize train and validation lists
    train_images = []
    val_images = []
    
    # Assign images to train or validation based on their folder
    for folder_id in folder_ids:
        if folder_id in val_folders:
            # This folder goes to validation for this fold
            val_images.extend(folder_image_map[folder_id])
        else:
            # This folder goes to training for this fold
            train_images.extend(folder_image_map[folder_id])
    
    # Create the split dictionary
    split = {
        'train': train_images,
        'val': val_images
    }
    
    # Add to the splits list
    splits.append(split)

# Adds a fake folder 5 with all images in the training set for running inference later

# Initialize train and validation lists
train_images = []
val_images = []

# Assign images to train or validation based on their folder
for folder_id in folder_ids:
    val_images.extend(folder_image_map[folder_id])
    train_images.extend(folder_image_map[folder_id])

# Create the split dictionary
split = {
    'train': train_images,
    'val': val_images
}

# Add to the splits list
splits.append(split)

# Print some statistics
for i, split in enumerate(splits):
    print(f"Fold {i}: {len(split['train'])} training images, {len(split['val'])} validation images")

# Save the splits
os.makedirs(os.path.dirname(splits_output_path), exist_ok=True)
with open(splits_output_path, 'w') as f:
    json.dump(splits, f, indent=4)

print(f"\nSplits saved to: {splits_output_path}")
print(f"Total folders: {len(folder_ids)}")
print(f"Folders per fold: {[len(fold) for fold in folder_folds]}")

Fold 0: 671 training images, 126 validation images
Fold 1: 656 training images, 141 validation images
Fold 2: 636 training images, 161 validation images
Fold 3: 639 training images, 158 validation images
Fold 4: 586 training images, 211 validation images
Fold 5: 797 training images, 797 validation images

Splits saved to: nnUNet_files/nnUNet_preprocessed/Dataset789_ChronoRoot2/splits_final.json
Total folders: 147
Folders per fold: [30, 30, 29, 29, 29]
