In [1]:
import os
import shutil
import tqdm
from sklearn.model_selection import train_test_split

In [2]:
def split_dataset(images_path, masks_path, train_images_path, train_masks_path, val_images_path, val_masks_path, split_ratio=0.2, random_seed=42):
    """
    Splits image and mask datasets into train and validation sets.

    An image and a mask belong together when their file names are equal once
    the final extension is removed (``scan_01.jpg`` <-> ``scan_01.png``).
    Hidden entries (names starting with ``.``, such as ``.DS_Store`` or
    ``.ipynb_checkpoints``) and sub-directories are ignored. Files are copied
    with ``shutil.copy``; the source folders are left untouched.

    Args:
        images_path (str): Path to the folder containing images.
        masks_path (str): Path to the folder containing masks.
        train_images_path (str): Path to save the training images.
        train_masks_path (str): Path to save the training masks.
        val_images_path (str): Path to save the validation images.
        val_masks_path (str): Path to save the validation masks.
        split_ratio (float): Proportion of the dataset to include in the validation set (default 0.2).
        random_seed (int): Random seed for reproducibility (default 42).

    Raises:
        ValueError: If two files in one folder share a stem, if an image has no
            mask or a mask has no image, or if no pairs are found at all.

    Note:
        Destination folders are created if missing but never cleared. Splitting
        into folders that already hold files from an earlier run keeps the old
        copies, which can put the same sample in both train and val; a warning
        is emitted in that case.
    """
    import warnings  # local import so this cell does not depend on edits to the import cell

    def index_by_stem(folder):
        """Return {stem: file name} for the visible regular files in ``folder``."""
        by_stem = {}
        for name in os.listdir(folder):
            # Hidden entries and sub-folders are not data, and shutil.copy cannot copy a directory.
            if name.startswith('.') or not os.path.isfile(os.path.join(folder, name)):
                continue
            stem = os.path.splitext(name)[0]
            if stem in by_stem:
                raise ValueError(
                    f"Ambiguous files in {folder!r}: {by_stem[stem]!r} and {name!r} share the stem {stem!r}."
                )
            by_stem[stem] = name
        return by_stem

    def preview(names, limit=5):
        """Format up to ``limit`` names for an error message."""
        shown = ', '.join(sorted(names)[:limit])
        return shown + (', ...' if len(names) > limit else '')

    def copy_files(pairs, src_img_folder, src_mask_folder, dst_img_folder, dst_mask_folder):
        """Copy every (image, mask) pair into the destination folders."""
        for img_file, mask_file in tqdm.tqdm(pairs):
            shutil.copy(os.path.join(src_img_folder, img_file), os.path.join(dst_img_folder, img_file))
            shutil.copy(os.path.join(src_mask_folder, mask_file), os.path.join(dst_mask_folder, mask_file))

    # Pair by stem instead of by position, so one stray file cannot shift the
    # alignment and silently drop valid pairs.
    images = index_by_stem(images_path)
    masks = index_by_stem(masks_path)

    images_without_mask = [images[stem] for stem in images if stem not in masks]
    masks_without_image = [masks[stem] for stem in masks if stem not in images]
    if images_without_mask or masks_without_image:
        raise ValueError(
            "Images and masks do not match. "
            f"Images without a mask ({len(images_without_mask)}): {preview(images_without_mask)}. "
            f"Masks without an image ({len(masks_without_image)}): {preview(masks_without_image)}."
        )

    # Order by image file name so the split for a given seed is the same as
    # the one produced from a plain sorted directory listing.
    image_mask_pairs = [(images[stem], masks[stem]) for stem in sorted(images, key=images.get)]
    if not image_mask_pairs:
        raise ValueError(f"No image/mask pairs found in {images_path!r} and {masks_path!r}.")

    # Split the dataset
    train_pairs, val_pairs = train_test_split(image_mask_pairs, test_size=split_ratio, random_state=random_seed)

    # Create train and val directories only after the inputs have been validated
    for dst_folder in (train_images_path, train_masks_path, val_images_path, val_masks_path):
        os.makedirs(dst_folder, exist_ok=True)
        if os.listdir(dst_folder):
            warnings.warn(
                f"Destination folder {dst_folder!r} is not empty; files from an earlier split "
                "are kept and may overlap with the new train/val sets.",
                stacklevel=2,
            )

    copy_files(train_pairs, images_path, masks_path, train_images_path, train_masks_path)
    copy_files(val_pairs, images_path, masks_path, val_images_path, val_masks_path)

    print("Dataset split completed!")

In [14]:
# covid-lls: split every image/mask pair
split_dataset(
    './covid-lls/images',
    './covid-lls/masks',
    train_images_path='./covid-lls/train/images',
    val_images_path='./covid-lls/val/images',
    train_masks_path='./covid-lls/train/masks',
    val_masks_path='./covid-lls/val/masks',
    random_seed=42,
    split_ratio=0.2,  # hold out 20% for validation, 80% for training
)

100%|█████████████████████████████████████████████████████████████████████████████| 2183/2183 [00:04<00:00, 486.76it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 546/546 [00:01<00:00, 520.73it/s]

Dataset split completed!





In [15]:
# kvasir-seg: split every image/mask pair
split_dataset(
    './kvasir-seg/images',
    './kvasir-seg/masks',
    train_images_path='./kvasir-seg/train/images',
    val_images_path='./kvasir-seg/val/images',
    train_masks_path='./kvasir-seg/train/masks',
    val_masks_path='./kvasir-seg/val/masks',
    random_seed=42,
    split_ratio=0.2,  # hold out 20% for validation, 80% for training
)

100%|███████████████████████████████████████████████████████████████████████████████| 800/800 [00:05<00:00, 158.83it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 200/200 [00:01<00:00, 157.88it/s]

Dataset split completed!





In [6]:
# ph2: split every image/mask pair
split_dataset(
    './ph2/images',
    './ph2/masks',
    train_images_path='./ph2/train/images',
    val_images_path='./ph2/val/images',
    train_masks_path='./ph2/train/masks',
    val_masks_path='./ph2/val/masks',
    random_seed=42,
    split_ratio=0.2,  # hold out 20% for validation, 80% for training
)

100%|███████████████████████████████████████████████████████████████████████████████| 160/160 [00:00<00:00, 296.11it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 40/40 [00:00<00:00, 334.03it/s]

Dataset split completed!





In [4]:
# ham10k: split every image/mask pair
split_dataset(
    './ham10k/images',
    './ham10k/masks',
    train_images_path='./ham10k/train/images',
    val_images_path='./ham10k/val/images',
    train_masks_path='./ham10k/train/masks',
    val_masks_path='./ham10k/val/masks',
    random_seed=42,
    split_ratio=0.2,  # hold out 20% for validation, 80% for training
)

100%|██████████████████████████████████████████████████████████████████████████████| 8012/8012 [02:03<00:00, 64.70it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 2003/2003 [00:34<00:00, 58.42it/s]

Dataset split completed!





In [9]:
# ecssd: split every image/mask pair
split_dataset(
    './ecssd/images',
    './ecssd/masks',
    train_images_path='./ecssd/train/images',
    val_images_path='./ecssd/val/images',
    train_masks_path='./ecssd/train/masks',
    val_masks_path='./ecssd/val/masks',
    random_seed=42,
    split_ratio=0.2,  # hold out 20% for validation, 80% for training
)

100%|███████████████████████████████████████████████████████████████████████████████| 800/800 [00:03<00:00, 249.42it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 200/200 [00:00<00:00, 229.19it/s]

Dataset split completed!





In [2]:
import os
import shutil
import tqdm
import random
from sklearn.model_selection import train_test_split

def split_dataset(images_path, masks_path, train_images_path, train_masks_path, val_images_path, val_masks_path, split_ratio=0.2, random_seed=42, num_images=None):
    """
    Splits image and mask datasets into train and validation sets.

    An image and a mask belong together when their file names are equal once
    the final extension is removed (``scan_01.jpg`` <-> ``scan_01.png``).
    Hidden entries (names starting with ``.``, such as ``.DS_Store`` or
    ``.ipynb_checkpoints``) and sub-directories are ignored. Files are copied
    with ``shutil.copy``; the source folders are left untouched.

    Args:
        images_path (str): Path to the folder containing images.
        masks_path (str): Path to the folder containing masks.
        train_images_path (str): Path to save the training images.
        train_masks_path (str): Path to save the training masks.
        val_images_path (str): Path to save the validation images.
        val_masks_path (str): Path to save the validation masks.
        split_ratio (float): Proportion of the dataset to include in the validation set (default 0.2).
        random_seed (int): Random seed for reproducibility (default 42).
        num_images (int, optional): Number of image/mask pairs to select randomly before splitting.
            If None, all pairs are used. If larger than the dataset, all pairs are used.

    Raises:
        ValueError: If ``num_images`` is not positive, if two files in one folder
            share a stem, if an image has no mask or a mask has no image, or if
            no pairs are found at all.

    Note:
        Destination folders are created if missing but never cleared. Splitting
        into folders that already hold files from an earlier run (for example a
        run with a different ``num_images``) keeps the old copies, which can put
        the same sample in both train and val; a warning is emitted in that case.
    """
    import warnings  # local import so this cell does not depend on edits to the import lines

    def index_by_stem(folder):
        """Return {stem: file name} for the visible regular files in ``folder``."""
        by_stem = {}
        for name in os.listdir(folder):
            # Hidden entries and sub-folders are not data, and shutil.copy cannot copy a directory.
            if name.startswith('.') or not os.path.isfile(os.path.join(folder, name)):
                continue
            stem = os.path.splitext(name)[0]
            if stem in by_stem:
                raise ValueError(
                    f"Ambiguous files in {folder!r}: {by_stem[stem]!r} and {name!r} share the stem {stem!r}."
                )
            by_stem[stem] = name
        return by_stem

    def preview(names, limit=5):
        """Format up to ``limit`` names for an error message."""
        shown = ', '.join(sorted(names)[:limit])
        return shown + (', ...' if len(names) > limit else '')

    def copy_files(pairs, src_img_folder, src_mask_folder, dst_img_folder, dst_mask_folder):
        """Copy every (image, mask) pair into the destination folders."""
        for img_file, mask_file in tqdm.tqdm(pairs):
            shutil.copy(os.path.join(src_img_folder, img_file), os.path.join(dst_img_folder, img_file))
            shutil.copy(os.path.join(src_mask_folder, mask_file), os.path.join(dst_mask_folder, mask_file))

    # A negative count would silently slice from the end ([:-5] drops five pairs).
    if num_images is not None and num_images <= 0:
        raise ValueError(f"num_images must be a positive integer or None, got {num_images!r}.")

    # Pair by stem instead of by position, so one stray file cannot shift the
    # alignment and silently drop valid pairs.
    images = index_by_stem(images_path)
    masks = index_by_stem(masks_path)

    images_without_mask = [images[stem] for stem in images if stem not in masks]
    masks_without_image = [masks[stem] for stem in masks if stem not in images]
    if images_without_mask or masks_without_image:
        raise ValueError(
            "Images and masks do not match. "
            f"Images without a mask ({len(images_without_mask)}): {preview(images_without_mask)}. "
            f"Masks without an image ({len(masks_without_image)}): {preview(masks_without_image)}."
        )

    # Order by image file name so the selection and split for a given seed are
    # the same as those produced from a plain sorted directory listing.
    image_mask_pairs = [(images[stem], masks[stem]) for stem in sorted(images, key=images.get)]
    if not image_mask_pairs:
        raise ValueError(f"No image/mask pairs found in {images_path!r} and {masks_path!r}.")

    # Limit the number of images if num_images is specified
    if num_images is not None:
        # A private generator seeded with random_seed shuffles exactly like
        # random.seed(random_seed) + random.shuffle, without reseeding the
        # process-wide random module.
        random.Random(random_seed).shuffle(image_mask_pairs)
        image_mask_pairs = image_mask_pairs[:num_images]

    # Split the dataset
    train_pairs, val_pairs = train_test_split(image_mask_pairs, test_size=split_ratio, random_state=random_seed)

    # Create train and val directories only after the inputs have been validated
    for dst_folder in (train_images_path, train_masks_path, val_images_path, val_masks_path):
        os.makedirs(dst_folder, exist_ok=True)
        if os.listdir(dst_folder):
            warnings.warn(
                f"Destination folder {dst_folder!r} is not empty; files from an earlier split "
                "are kept and may overlap with the new train/val sets.",
                stacklevel=2,
            )

    copy_files(train_pairs, images_path, masks_path, train_images_path, train_masks_path)
    copy_files(val_pairs, images_path, masks_path, val_images_path, val_masks_path)

    print("Dataset split completed!")


In [8]:
# ham10k: randomly pick 3000 image/mask pairs, then split them
split_dataset(
    './ham10k/images',
    './ham10k/masks',
    train_images_path='./ham10k/train/images',
    val_images_path='./ham10k/val/images',
    train_masks_path='./ham10k/train/masks',
    val_masks_path='./ham10k/val/masks',
    num_images=3000,
    random_seed=42,
    split_ratio=0.2,  # hold out 20% for validation, 80% for training
)

100%|█████████████████████████████████████████████████████████████████████████████| 2400/2400 [00:07<00:00, 342.74it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 600/600 [00:20<00:00, 29.43it/s]

Dataset split completed!





In [3]:
# covid-lls: randomly pick 2000 image/mask pairs, then split them
split_dataset(
    './covid-lls/images',
    './covid-lls/masks',
    train_images_path='./covid-lls/train/images',
    val_images_path='./covid-lls/val/images',
    train_masks_path='./covid-lls/train/masks',
    val_masks_path='./covid-lls/val/masks',
    num_images=2000,
    random_seed=42,
    split_ratio=0.2,  # hold out 20% for validation, 80% for training
)

100%|█████████████████████████████████████████████████████████████████████████████| 1600/1600 [00:04<00:00, 391.96it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 400/400 [00:01<00:00, 354.30it/s]

Dataset split completed!





In [4]:
# duts: randomly pick 2000 image/mask pairs, then split them
split_dataset(
    './duts/images',
    './duts/masks',
    train_images_path='./duts/train/images',
    val_images_path='./duts/val/images',
    train_masks_path='./duts/train/masks',
    val_masks_path='./duts/val/masks',
    num_images=2000,
    random_seed=42,
    split_ratio=0.2,  # hold out 20% for validation, 80% for training
)

100%|█████████████████████████████████████████████████████████████████████████████| 1600/1600 [00:06<00:00, 260.86it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 400/400 [00:01<00:00, 260.57it/s]

Dataset split completed!





In [6]:
# p3m: randomly pick 2000 image/mask pairs, then split them
split_dataset(
    './p3m/images',
    './p3m/masks',
    train_images_path='./p3m/train/images',
    val_images_path='./p3m/val/images',
    train_masks_path='./p3m/train/masks',
    val_masks_path='./p3m/val/masks',
    num_images=2000,
    random_seed=42,
    split_ratio=0.2,  # hold out 20% for validation, 80% for training
)

100%|██████████████████████████████████████████████████████████████████████████████| 1600/1600 [00:38<00:00, 41.33it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 400/400 [00:10<00:00, 39.77it/s]

Dataset split completed!





In [4]:
# dut-omron: randomly pick 2000 image/mask pairs, then split them
split_dataset(
    './dut-omron/images',
    './dut-omron/masks',
    train_images_path='./dut-omron/train/images',
    val_images_path='./dut-omron/val/images',
    train_masks_path='./dut-omron/train/masks',
    val_masks_path='./dut-omron/val/masks',
    num_images=2000,
    random_seed=42,
    split_ratio=0.2,  # hold out 20% for validation, 80% for training
)

100%|█████████████████████████████████████████████████████████████████████████████| 1600/1600 [00:05<00:00, 279.69it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 400/400 [00:01<00:00, 262.91it/s]

Dataset split completed!



