In [1]:
import os
import shutil
import nibabel as nib
import cv2
import numpy as np
import pathlib
import re
import json
from collections import defaultdict
import random

def natural_sort_key(s, _nsre=re.compile('([0-9]+)')):
    return [int(text) if text.isdigit() else text.lower()
            for text in re.split(_nsre, s)]

# Input directories
path = "ArabidopsisDataset"
read_path = pathlib.Path(path).glob('*/*/*/*.png')
image_paths = sorted([str(path) for path in read_path], key=natural_sort_key)

# Output directories
outpath = "nnUNet_files/nnUNet_raw/Dataset789_ChronoRoot2/imagesTr"
labels_outpath = "nnUNet_files/nnUNet_raw/Dataset789_ChronoRoot2/labelsTr"

os.makedirs(outpath, exist_ok=True)
os.makedirs(labels_outpath, exist_ok=True)

# Group images by folder
folder_images = defaultdict(list)
for path in image_paths:
    # Get the folder path - using os.path for better compatibility
    folder_path = os.path.dirname(path)
    folder_images[folder_path].append(path)

# Create a mapping dictionary that includes the folder info
folder_image_map = defaultdict(list)

# Counter for image naming
counter = 1
image_id = ""

# Process all images grouped by folder
for folder, images in folder_images.items():
    folder_id = "Manual_Annotation_" + folder.replace("/", "_").replace(" ", "_")  # Create a safe folder ID
    
    print(f'Processing {folder}, image ID: {image_id}')
        
    # Process each image in the folder
    for path in images:
        # Create image ID
        image_id = f'image_{counter}'
        
        # Copy the image to the output directory
        shutil.copy(path, os.path.join(outpath, f'{image_id}_0000.png'))
        
        # Load the mask
        mask_path = path.replace('.png', '.nii.gz')
        mask = nib.load(mask_path).get_fdata().T
        
        if mask.shape[0] == 1:
            mask = mask[0]
        
        image = cv2.imread(path, 0)
        # Image shape and mask shape should be the same
        if image.shape[0] != mask.shape[0]:
            continue

        # Save the mask
        cv2.imwrite(os.path.join(labels_outpath, f'{image_id}.png'), mask.astype('uint8'))
        
        folder_image_map[folder_id].append(image_id)
        
        # Update the counter
        counter += 1

with open('nnUNet_files/nnUNet_raw/Dataset789_ChronoRoot2/folder_image_map.json', '+w') as f:
    json.dump(folder_image_map, f, indent=4)

Processing ArabidopsisDataset/Etiolation/MultipleVids/Primeras listo, image ID: 
Processing ArabidopsisDataset/Etiolation/MultipleVids/Varios, image ID: image_4
Processing ArabidopsisDataset/Etiolation/rpi101_2024-07-29_14-47/1 listo, image ID: image_12
Processing ArabidopsisDataset/Etiolation/rpi101_2024-07-29_14-47/2 listo, image ID: image_16
Processing ArabidopsisDataset/Etiolation/rpi101_2024-07-29_14-47/3 listo, image ID: image_20
Processing ArabidopsisDataset/Etiolation/rpi101_2024-11-06_15-18/1, image ID: image_24
Processing ArabidopsisDataset/Etiolation/rpi101_2024-11-06_15-18/2, image ID: image_29
Processing ArabidopsisDataset/Etiolation/rpi101_2024-11-06_15-18/3, image ID: image_34
Processing ArabidopsisDataset/Etiolation/rpi101_2024-11-06_15-18/4, image ID: image_39
Processing ArabidopsisDataset/Etiolation/rpi102_2024-06-11_14-10/1 listo, image ID: image_44
Processing ArabidopsisDataset/Etiolation/rpi102_2024-06-11_14-10/2 listo, image ID: image_48
Processing ArabidopsisData

In [3]:
import json
import numpy as np
import os
import random

# Path to the folder_image_map.json file
folder_map_path = 'nnUNet_files/nnUNet_raw/Dataset789_ChronoRoot2/folder_image_map.json'

# Path to save the splits file
splits_output_path = 'nnUNet_files/nnUNet_preprocessed/Dataset789_ChronoRoot2/splits_final.json'

# Make directories if they don't exist
os.makedirs(os.path.dirname(splits_output_path), exist_ok=True)

# Load the folder-to-images mapping
with open(folder_map_path, 'r') as f:
    folder_image_map = json.load(f)

# Set a seed for reproducibility
random.seed(42)
np.random.seed(42)

# Group folders by major category
major_categories = {}
for folder_id in folder_image_map.keys():
    # Extract major category from folder name
    # Format: Manual_Annotation_ArabidopsisDataset_<CATEGORY>_...
    parts = folder_id.split('_')
    if len(parts) >= 4:
        category = parts[3]  # Etiolation, Germination, ManyRoots, or Roots
        if category not in major_categories:
            major_categories[category] = []
        major_categories[category].append(folder_id)

print("Major categories found:")
for cat, folders in major_categories.items():
    print(f"  {cat}: {len(folders)} folders")

# Initialize train, val, test lists
train_folders = []
val_folders = []
test_folders = []

# Process each major category
for category, folders in major_categories.items():
    print(f"\nProcessing category: {category}")
    
    # Separate MultipleVids folders (always go to train)
    multiplevids_folders = [f for f in folders if "MultipleVids" in f]
    other_folders = [f for f in folders if "MultipleVids" not in f]
    
    print(f"  MultipleVids folders: {len(multiplevids_folders)} (→ train)")
    print(f"  Other folders: {len(other_folders)}")
    
    # Add MultipleVids folders to train
    train_folders.extend(multiplevids_folders)
    
    # Shuffle other folders
    random.shuffle(other_folders)
    
    # Calculate split sizes (70-10-20)
    n_other = len(other_folders)
    n_train = int(0.7 * n_other)
    n_val = int(0.1 * n_other)
    # n_test is the remainder
    
    # Split the other folders
    train_other = other_folders[:n_train]
    val_other = other_folders[n_train:n_train + n_val]
    test_other = other_folders[n_train + n_val:]
    
    print(f"  Split: {len(train_other)} train, {len(val_other)} val, {len(test_other)} test")
    
    # Add to respective lists
    train_folders.extend(train_other)
    val_folders.extend(val_other)
    test_folders.extend(test_other)

# Convert folder lists to image lists
train_images = []
val_images = []
test_images = []

for folder in train_folders:
    train_images.extend(folder_image_map[folder])

for folder in val_folders:
    val_images.extend(folder_image_map[folder])

for folder in test_folders:
    test_images.extend(folder_image_map[folder])

# Create the split dictionary (single split with train/val/test)
splits = [{
    'train': train_images,
    'val': val_images,
    'test': test_images
}]

# Print statistics
print(f"\n{'='*60}")
print(f"FINAL SPLIT STATISTICS")
print(f"{'='*60}")
print(f"Total folders: {len(folder_image_map)}")
print(f"  Train folders: {len(train_folders)}")
print(f"  Val folders: {len(val_folders)}")
print(f"  Test folders: {len(test_folders)}")
print(f"\nTotal images: {len(train_images) + len(val_images) + len(test_images)}")
print(f"  Train images: {len(train_images)} ({100*len(train_images)/(len(train_images)+len(val_images)+len(test_images)):.1f}%)")
print(f"  Val images: {len(val_images)} ({100*len(val_images)/(len(train_images)+len(val_images)+len(test_images)):.1f}%)")
print(f"  Test images: {len(test_images)} ({100*len(test_images)/(len(train_images)+len(val_images)+len(test_images)):.1f}%)")

# Save the splits
with open(splits_output_path, 'w') as f:
    json.dump(splits, f, indent=4)

print(f"\nSplits saved to: {splits_output_path}")

# Also save folder assignments for transparency
folder_assignments = {
    'train_folders': train_folders,
    'val_folders': val_folders,
    'test_folders': test_folders
}

folder_assignments_path = splits_output_path.replace('splits_final.json', 'folder_assignments.json')
with open(folder_assignments_path, 'w') as f:
    json.dump(folder_assignments, f, indent=4)

print(f"Folder assignments saved to: {folder_assignments_path}")

Major categories found:
  Etiolation: 28 folders
  Germination: 35 folders
  ManyRoots: 16 folders
  Roots: 89 folders

Processing category: Etiolation
  MultipleVids folders: 2 (→ train)
  Other folders: 26
  Split: 18 train, 2 val, 6 test

Processing category: Germination
  MultipleVids folders: 2 (→ train)
  Other folders: 33
  Split: 23 train, 3 val, 7 test

Processing category: ManyRoots
  MultipleVids folders: 0 (→ train)
  Other folders: 16
  Split: 11 train, 1 val, 4 test

Processing category: Roots
  MultipleVids folders: 2 (→ train)
  Other folders: 87
  Split: 60 train, 8 val, 19 test

FINAL SPLIT STATISTICS
Total folders: 168
  Train folders: 118
  Val folders: 14
  Test folders: 36

Total images: 911
  Train images: 667 (73.2%)
  Val images: 68 (7.5%)
  Test images: 176 (19.3%)

Splits saved to: nnUNet_files/nnUNet_preprocessed/Dataset789_ChronoRoot2/splits_final.json
Folder assignments saved to: nnUNet_files/nnUNet_preprocessed/Dataset789_ChronoRoot2/folder_assignments.js

In [4]:
import json
import shutil
import os

# Paths
splits_path = 'nnUNet_files/nnUNet_preprocessed/Dataset789_ChronoRoot2/splits_final.json'
images_tr_path = 'nnUNet_files/nnUNet_raw/Dataset789_ChronoRoot2/imagesTr'
labels_tr_path = 'nnUNet_files/nnUNet_raw/Dataset789_ChronoRoot2/labelsTr'

images_ts_path = 'nnUNet_files/nnUNet_raw/Dataset789_ChronoRoot2/imagesTs'
labels_ts_path = 'nnUNet_files/nnUNet_raw/Dataset789_ChronoRoot2/labelsTs'

# Create output directories
os.makedirs(images_ts_path, exist_ok=True)
os.makedirs(labels_ts_path, exist_ok=True)

# Load the splits
with open(splits_path, 'r') as f:
    splits = json.load(f)

# Get test images (assuming first split contains train/val/test)
test_images = splits[0]['test']

print(f"Copying {len(test_images)} test images...")

# Copy each test image and its label
for image_id in test_images:
    # Copy image
    src_image = os.path.join(images_tr_path, f'{image_id}_0000.png')
    dst_image = os.path.join(images_ts_path, f'{image_id}_0000.png')
    
    if os.path.exists(src_image):
        shutil.copy(src_image, dst_image)
    else:
        print(f"Warning: Image not found: {src_image}")
    
    # Copy label
    src_label = os.path.join(labels_tr_path, f'{image_id}.png')
    dst_label = os.path.join(labels_ts_path, f'{image_id}.png')
    
    if os.path.exists(src_label):
        shutil.copy(src_label, dst_label)
    else:
        print(f"Warning: Label not found: {src_label}")

print(f"\nDone!")
print(f"Images copied to: {images_ts_path}")
print(f"Labels copied to: {labels_ts_path}")

Copying 176 test images...

Done!
Images copied to: nnUNet_files/nnUNet_raw/Dataset789_ChronoRoot2/imagesTs
Labels copied to: nnUNet_files/nnUNet_raw/Dataset789_ChronoRoot2/labelsTs
