In [19]:
import os
import shutil
import nibabel as nib
import cv2
import numpy as np
import pathlib
import re
import json
from collections import defaultdict
import random

def natural_sort_key(s, _nsre=re.compile('([0-9]+)')):
    return [int(text) if text.isdigit() else text.lower()
            for text in re.split(_nsre, s)]

# Input directories
path = "TomatoDataset"
read_path = pathlib.Path(path).glob('*/*/*.png')
image_paths = sorted([str(path) for path in read_path], key=natural_sort_key)

# Output directories
outpath = "nnUNet_files/nnUNet_raw/Dataset555_TomateRoot/imagesTr"
labels_outpath = "nnUNet_files/nnUNet_raw/Dataset555_TomateRoot/labelsTr"

os.makedirs(outpath, exist_ok=True)
os.makedirs(labels_outpath, exist_ok=True)

# Group images by folder
folder_images = defaultdict(list)
for path in image_paths:
    # Get the folder path - using os.path for better compatibility
    folder_path = os.path.dirname(path)
    folder_images[folder_path].append(path)

# Create a mapping dictionary that includes the folder info
folder_image_map = defaultdict(list)

# Counter for image naming
counter = 1
image_id = ""

# Process all images grouped by folder
for folder, images in folder_images.items():
    folder_id = "Manual_Annotation_" + folder.replace("/", "_").replace(" ", "_")  # Create a safe folder ID

    print(f'Processing {folder}, image ID: {image_id}')

    # Process each image in the folder
    for path in images:
        # Create image ID
        image_id = f'image_{counter}'
        
        # Copy the image to the output directory
        shutil.copy(path, os.path.join(outpath, f'{image_id}_0000.png'))
        
        # Load the mask
        mask_path = path.replace('.png', '.nii.gz')
        mask = nib.load(mask_path).get_fdata().T
        
        if mask.shape[0] == 1:
            mask = mask[0]
        
        image = cv2.imread(path, 0)
        # Image shape and mask shape should be the same
        if image.shape[0] != mask.shape[0]:
            continue

        # Save the mask
        cv2.imwrite(os.path.join(labels_outpath, f'{image_id}.png'), mask.astype('uint8'))
        
        folder_image_map[folder_id].append(image_id)
        
        # Update the counter
        counter += 1

with open('nnUNet_files/nnUNet_raw/Dataset555_TomateRoot/folder_image_map.json', '+w') as f:
    json.dump(folder_image_map, f, indent=4)

Processing TomatoDataset/Otros/1, image ID: 
Processing TomatoDataset/Otros/2, image ID: image_5
Processing TomatoDataset/Otros/3, image ID: image_10
Processing TomatoDataset/Otros/4, image ID: image_16
Processing TomatoDataset/Otros/5, image ID: image_19
Processing TomatoDataset/Otros/SinVideo (anotado), image ID: image_25
Processing TomatoDataset/rpi101_2024-06-06_14-40 (anotado)/1, image ID: image_37
Processing TomatoDataset/rpi101_2024-06-06_14-40 (anotado)/2, image ID: image_42
Processing TomatoDataset/rpi101_2024-06-06_14-40 (anotado)/3, image ID: image_46
Processing TomatoDataset/rpi101_2024-06-06_14-40 (anotado)/4, image ID: image_51
Processing TomatoDataset/rpi101_2024-07-04_12-31 (anotado)/1, image ID: image_54
Processing TomatoDataset/rpi101_2024-07-04_12-31 (anotado)/2, image ID: image_59
Processing TomatoDataset/rpi101_2024-07-04_12-31 (anotado)/3, image ID: image_63
Processing TomatoDataset/rpi101_2024-07-04_12-31 (anotado)/4, image ID: image_68
Processing TomatoDataset/r

In [20]:
import json
import numpy as np
import os

# Path to the folder_image_map.json file
folder_map_path = 'nnUNet_files/nnUNet_raw/Dataset555_TomateRoot/folder_image_map.json'

# Path to save the splits file
splits_output_path = 'nnUNet_files/nnUNet_preprocessed/Dataset555_TomateRoot/splits_final.json'

# Make directories if they don't exist
os.makedirs(os.path.dirname(splits_output_path), exist_ok=True)

# Load the folder-to-images mapping
with open(folder_map_path, 'r') as f:
    folder_image_map = json.load(f)

# Get all folder IDs
folder_ids = list(folder_image_map.keys())

# Set a seed for reproducibility
np.random.seed(42)

# Shuffle the folder IDs
np.random.shuffle(folder_ids)

# Split the folder IDs into 5 folds
folder_folds = np.array_split(folder_ids, 5)

# Create the 5-fold cross validation splits
splits = []

for i in range(5):
    # Folders for validation in this fold
    val_folders = folder_folds[i]
    
    # Initialize train and validation lists
    train_images = []
    val_images = []
    
    # Assign images to train or validation based on their folder
    for folder_id in folder_ids:
        if folder_id in val_folders:
            # This folder goes to validation for this fold
            val_images.extend(folder_image_map[folder_id])
        else:
            # This folder goes to training for this fold
            train_images.extend(folder_image_map[folder_id])
    
    # Create the split dictionary
    split = {
        'train': train_images,
        'val': val_images
    }
    
    # Add to the splits list
    splits.append(split)

# Adds a fake folder 5 with all images in the training set for running inference later

# Initialize train and validation lists
train_images = []
val_images = []

# Assign images to train or validation based on their folder
for folder_id in folder_ids:
    val_images.extend(folder_image_map[folder_id])
    train_images.extend(folder_image_map[folder_id])

# Create the split dictionary
split = {
    'train': train_images,
    'val': val_images
}

# Add to the splits list
splits.append(split)

# Print some statistics
for i, split in enumerate(splits):
    print(f"Fold {i}: {len(split['train'])} training images, {len(split['val'])} validation images")

# Save the splits
os.makedirs(os.path.dirname(splits_output_path), exist_ok=True)
with open(splits_output_path, 'w') as f:
    json.dump(splits, f, indent=4)

print(f"\nSplits saved to: {splits_output_path}")
print(f"Total folders: {len(folder_ids)}")
print(f"Folders per fold: {[len(fold) for fold in folder_folds]}")

Fold 0: 234 training images, 65 validation images
Fold 1: 245 training images, 54 validation images
Fold 2: 236 training images, 63 validation images
Fold 3: 243 training images, 56 validation images
Fold 4: 238 training images, 61 validation images
Fold 5: 299 training images, 299 validation images

Splits saved to: nnUNet_files/nnUNet_preprocessed/Dataset555_TomateRoot/splits_final.json
Total folders: 45
Folders per fold: [9, 9, 9, 9, 9]


# Tomato + Arabidopsis database

In [21]:
import os
import shutil
import cv2
import numpy as np

# Input directories
path2 = "nnUNet_files/nnUNet_raw/Dataset789_ChronoRoot2/imagesTr"
path2_labels = "nnUNet_files/nnUNet_raw/Dataset789_ChronoRoot2/labelsTr"

# Output directories
outpath = "nnUNet_files/nnUNet_raw/Dataset557_TArabidopsis/imagesTr"
labels_outpath = "nnUNet_files/nnUNet_raw/Dataset557_TArabidopsis/labelsTr"

os.makedirs(outpath, exist_ok=True)
os.makedirs(labels_outpath, exist_ok=True)

# Combine the image and label paths into lists
image_paths = [path2]

# Process the image directories
counter = 1
for path in image_paths:
    # Sort filenames before processing
    while os.path.exists(os.path.join(path, f'image_{counter}_0000.png')):
        filename = f'image_{counter}_0000.png'
        shutil.copy(os.path.join(path, filename), os.path.join(outpath, f'image_{counter}_0000.png'))

        label_path = os.path.join(path2_labels, filename.replace('_0000.png', '.png'))

        label = cv2.imread(label_path, 0)
        label[label == 6] = 5

        cv2.imwrite(os.path.join(labels_outpath, f'image_{counter}.png'), label.astype('uint8'))
        counter += 1

In [22]:
import os
import shutil

# Input directories
path2 = "nnUNet_files/nnUNet_raw/Dataset555_TomateRoot/imagesTr"
path2_labels = "nnUNet_files/nnUNet_raw/Dataset555_TomateRoot/labelsTr"

# Output directories
outpath = "nnUNet_files/nnUNet_raw/Dataset557_TArabidopsis/imagesTr"
labels_outpath = "nnUNet_files/nnUNet_raw/Dataset557_TArabidopsis/labelsTr"

# Combine the image and label paths into lists
image_paths = [path2]

old_counter = 1

mapping = "nnUNet_files/nnUNet_raw/Dataset555_TomateRoot/folder_image_map.json"
with open(mapping, 'r') as f:
    folder_image_map = json.load(f)

new_mapping = "nnUNet_files/nnUNet_raw/Dataset789_ChronoRoot2/folder_image_map.json"
with open(new_mapping, 'r') as f:
    new_folder_image_map = json.load(f)

# Process the image directories
for path in image_paths:
    # Sort filenames before processing
    while os.path.exists(os.path.join(path, f'image_{old_counter}_0000.png')):
        filename = f'image_{old_counter}_0000.png'
        shutil.copy(os.path.join(path, filename), os.path.join(outpath, f'image_{counter}_0000.png'))
        
        label_path = os.path.join(path2_labels, filename.replace('_0000.png', '.png'))
        
        shutil.copy(label_path, os.path.join(labels_outpath, f'image_{counter}.png'))
                
        # replace old_counter with in the mapping
        for folder_id, images in folder_image_map.items():
            if f"image_{old_counter}" in images:
                if not folder_id in new_folder_image_map:
                    new_folder_image_map[folder_id] = []
                new_folder_image_map[folder_id].append(f'image_{counter}')
        
        counter += 1
        old_counter += 1
                
# save it to tarabidopsis
final_mapping_path = "nnUNet_files/nnUNet_raw/Dataset557_TArabidopsis/folder_image_map.json"
with open(final_mapping_path, 'w') as f:
    json.dump(new_folder_image_map, f)

In [5]:
import json
import numpy as np
import os

# Path to the folder_image_map.json file
folder_map_path = 'nnUNet_files/nnUNet_raw/Dataset557_TArabidopsis/folder_image_map.json'

# Paths to save the splits files
arabidopsis_splits_path = 'nnUNet_files/nnUNet_preprocessed/Dataset789_ChronoRoot2/splits_final.json'
tomato_splits_path = 'nnUNet_files/nnUNet_preprocessed/Dataset555_TomateRoot/splits_final.json'
mixed_splits_path = 'nnUNet_files/nnUNet_preprocessed/Dataset557_TArabidopsis/splits_final.json'

# Load the folder-to-images mapping
with open(folder_map_path, 'r') as f:
    folder_image_map = json.load(f)

# Separate folder IDs by dataset
arabidopsis_folders = []
tomato_folders = []

for folder_id in folder_image_map.keys():
    if 'Arabidopsis' in folder_id or 'arabidopsis' in folder_id:
        arabidopsis_folders.append(folder_id)
    elif 'Tomato' in folder_id or 'tomato' in folder_id:
        tomato_folders.append(folder_id)
    else:
        # If folder doesn't contain either dataset name, you might want to handle this
        print(f"Warning: Folder '{folder_id}' doesn't contain 'Arabidopsis' or 'Tomato' in name")

print(f"Found {len(arabidopsis_folders)} Arabidopsis folders")
print(f"Found {len(tomato_folders)} Tomato folders")

# Set a seed for reproducibility
np.random.seed(42)

def create_splits(folder_ids, folder_image_map):
    """Create 5-fold cross-validation splits for given folders"""
    # Shuffle the folder IDs
    folder_ids_shuffled = folder_ids.copy()
    np.random.shuffle(folder_ids_shuffled)
    
    # Split the folder IDs into 5 folds
    folder_folds = np.array_split(folder_ids_shuffled, 5)
    
    # Create the 5-fold cross validation splits
    splits = []
    for i in range(5):
        # Folders for validation in this fold
        val_folders = folder_folds[i]
        
        # Initialize train and validation lists
        train_images = []
        val_images = []
        
        # Assign images to train or validation based on their folder
        for folder_id in folder_ids_shuffled:
            if folder_id in val_folders:
                # This folder goes to validation for this fold
                val_images.extend(folder_image_map[folder_id])
            else:
                # This folder goes to training for this fold
                train_images.extend(folder_image_map[folder_id])
        
        # Create the split dictionary
        split = {
            'train': train_images,
            'val': val_images
        }
        
        # Add to the splits list
        splits.append(split)
    
    # Add a fake fold 6 with all images in both training and validation sets for inference
    train_images = []
    val_images = []
    
    for folder_id in folder_ids_shuffled:
        val_images.extend(folder_image_map[folder_id])
        train_images.extend(folder_image_map[folder_id])
    
    split = {
        'train': train_images,
        'val': val_images
    }
    splits.append(split)
    
    return splits, folder_folds

def save_splits(splits, output_path, dataset_name, folder_folds):
    """Save splits to file and print statistics"""
    # Create directory if it doesn't exist
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    
    # Save the splits
    with open(output_path, 'w') as f:
        json.dump(splits, f, indent=4)
    
    print(f"\n{dataset_name} Dataset:")
    print(f"Splits saved to: {output_path}")
    for i, split in enumerate(splits):
        print(f"Fold {i}: {len(split['train'])} training images, {len(split['val'])} validation images")
    print(f"Folders per fold: {[len(fold) for fold in folder_folds]}")

import copy

def save_tomato_splits(splits, output_path, dataset_name, folder_folds):
    """Save splits to file and print statistics"""
    # Create directory if it doesn't exist
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    
    # make a copy of splits
    splits2 = copy.deepcopy(splits)
    
    # in each of the splits, image_NUMBER = image_NUMBER - 797
    for split in splits2:
        split['train'] = [img.replace(img, "image_" + str(int(img.split("_")[1]) - 797)) for img in split['train']]
        split['val'] = [img.replace(img, "image_" + str(int(img.split("_")[1]) - 797)) for img in split['val']]
        
    # Save the splits
    with open(output_path, 'w') as f:
        json.dump(splits2, f, indent=4)
    
    print(f"\n{dataset_name} Dataset:")
    print(f"Splits saved to: {output_path}")
    for i, split in enumerate(splits2):
        print(f"Fold {i}: {len(split['train'])} training images, {len(split['val'])} validation images")
    print(f"Folders per fold: {[len(fold) for fold in folder_folds]}")


# Create splits for Arabidopsis dataset
if arabidopsis_folders:
    arabidopsis_splits, arabidopsis_folds = create_splits(arabidopsis_folders, folder_image_map)
    save_splits(arabidopsis_splits, arabidopsis_splits_path, "Arabidopsis", arabidopsis_folds)

# Create splits for Tomato dataset
if tomato_folders:
    tomato_splits, tomato_folds = create_splits(tomato_folders, folder_image_map)
    save_tomato_splits(tomato_splits.copy(), tomato_splits_path, "Tomato", tomato_folds)

# Create mixed dataset splits
if arabidopsis_folders and tomato_folders:
    print(f"\nCreating Mixed Dataset Splits...")
    
    mixed_splits = []
    
    # Create 5 mixed folds by combining corresponding folds from both datasets
    for i in range(5):
        # Combine training images from both datasets for this fold
        mixed_train = arabidopsis_splits[i]['train'] + tomato_splits[i]['train']
        # Combine validation images from Tomato dataset only for this fold
        mixed_val = tomato_splits[i]['val']
        
        mixed_split = {
            'train': mixed_train,
            'val': mixed_val
        }
        mixed_splits.append(mixed_split)
    
    # Add mixed inference fold (fold 6)
    mixed_train_all = arabidopsis_splits[5]['train'] + tomato_splits[5]['train']
    mixed_val_all = tomato_splits[5]['val']
    
    mixed_split_all = {
        'train': mixed_train_all,
        'val': mixed_val_all
    }
    mixed_splits.append(mixed_split_all)
    
    # Save mixed splits
    os.makedirs(os.path.dirname(mixed_splits_path), exist_ok=True)
    with open(mixed_splits_path, 'w') as f:
        json.dump(mixed_splits, f, indent=4)
    
    print(f"\nMixed Dataset:")
    print(f"Splits saved to: {mixed_splits_path}")
    for i, split in enumerate(mixed_splits):
        print(f"Fold {i}: {len(split['train'])} training images, {len(split['val'])} validation images")

print(f"\nTotal Arabidopsis folders: {len(arabidopsis_folders)}")
print(f"Total Tomato folders: {len(tomato_folders)}")
print(f"All splits generation completed!")

Found 147 Arabidopsis folders
Found 45 Tomato folders

Arabidopsis Dataset:
Splits saved to: nnUNet_files/nnUNet_preprocessed/Dataset789_ChronoRoot2/splits_final.json
Fold 0: 671 training images, 126 validation images
Fold 1: 656 training images, 141 validation images
Fold 2: 636 training images, 161 validation images
Fold 3: 639 training images, 158 validation images
Fold 4: 586 training images, 211 validation images
Fold 5: 797 training images, 797 validation images
Folders per fold: [30, 30, 29, 29, 29]

Tomato Dataset:
Splits saved to: nnUNet_files/nnUNet_preprocessed/Dataset555_TomateRoot/splits_final.json
Fold 0: 247 training images, 52 validation images
Fold 1: 244 training images, 55 validation images
Fold 2: 238 training images, 61 validation images
Fold 3: 230 training images, 69 validation images
Fold 4: 237 training images, 62 validation images
Fold 5: 299 training images, 299 validation images
Folders per fold: [9, 9, 9, 9, 9]

Creating Mixed Dataset Splits...

Mixed Datas