**Loading Data (3000 images)**

In [1]:
import os
import random
import shutil

# Define paths (empty placeholders in this copy of the notebook)
dataset_dir = ''
annotation_dir = ''
copied_data_dir = ''
selected_data_dir = ''

# Sampling configuration
NUM_SELECTED = 3000    # maximum number of images to draw
SELECTION_SEED = 42    # fixed seed so a re-run draws the same subset

# Create directories if they don't exist
os.makedirs(copied_data_dir, exist_ok=True)
os.makedirs(selected_data_dir, exist_ok=True)

# Get list of images. Sorted because os.listdir returns entries in arbitrary
# order, and a seeded sample is only repeatable for a fixed input order.
images = sorted(f for f in os.listdir(dataset_dir) if f.endswith(".jpg"))

print("Total images available:", len(images))

# Copy all images and their corresponding XML annotations to the copied_data_dir
for img in images:
    # Copy the image file
    src_img = os.path.join(dataset_dir, img)
    dst_img = os.path.join(copied_data_dir, img)
    shutil.copy(src_img, dst_img)

    # Copy the corresponding XML annotation file, if there is one.
    # splitext swaps only the final extension; str.replace would also rewrite
    # any ".jpg" that appears earlier in the file name.
    annotation = os.path.splitext(img)[0] + ".xml"
    src_ann = os.path.join(annotation_dir, annotation)
    dst_ann = os.path.join(copied_data_dir, annotation)
    if os.path.exists(src_ann):
        shutil.copy(src_ann, dst_ann)

# Randomly select up to NUM_SELECTED images from the copied dataset.
# random.sample raises ValueError when asked for more items than exist, so the
# request is capped at the number of available images. A dedicated Random
# instance keeps the draw reproducible without touching the global RNG state.
sample_size = min(NUM_SELECTED, len(images))
selected_images = random.Random(SELECTION_SEED).sample(images, sample_size)

# Move selected images and their annotations to the selected_data_dir
for img in selected_images:
    # Move the image file
    src_img = os.path.join(copied_data_dir, img)
    dst_img = os.path.join(selected_data_dir, img)
    shutil.move(src_img, dst_img)

    # Move the corresponding XML annotation file (absent if none was copied)
    annotation = os.path.splitext(img)[0] + ".xml"
    src_ann = os.path.join(copied_data_dir, annotation)
    dst_ann = os.path.join(selected_data_dir, annotation)
    if os.path.exists(src_ann):
        shutil.move(src_ann, dst_ann)

print(f"Selected {len(selected_images)} images for the dataset.")


Total images available: 167800
Selected 3000 images for the dataset.


**Splitting Dataset**

In [3]:
# Define paths (empty placeholders in this copy of the notebook)
data_dir = ''
train_dir = ''
val_dir = ''
test_dir = ''

# Split configuration
SPLIT_SEED = 42         # fixed seed so a re-run reproduces the same split
TRAIN_FRACTION = 0.7
VAL_FRACTION = 0.15     # the remainder after train + val becomes the test set

# Create directories for train, validation, and test sets
os.makedirs(train_dir, exist_ok=True)
os.makedirs(val_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)

# Load images from the dataset directory. Sorted first because os.listdir
# order is arbitrary; shuffling a fixed order with a seeded RNG gives the same
# split on every run, so re-running cannot place one image into two different
# split directories.
images = sorted(f for f in os.listdir(data_dir) if f.endswith('.jpg'))
random.Random(SPLIT_SEED).shuffle(images)
total_images = len(images)
print("Total images:", total_images)

# Calculate split sizes (test takes whatever the two truncations leave over)
train_size = int(total_images * TRAIN_FRACTION)
val_size = int(total_images * VAL_FRACTION)
test_size = total_images - train_size - val_size

# Split the data
train_images = images[:train_size]
val_images = images[train_size:train_size + val_size]
test_images = images[train_size + val_size:]

# Sanity check: the three slices partition the image list
assert len(train_images) + len(val_images) + len(test_images) == total_images
assert len(test_images) == test_size

# Function to copy images and corresponding XML annotations
def copy_files(file_list, source_dir, target_dir):
    """Copy each listed image, plus its XML annotation when one exists.

    Args:
        file_list: Image file names (not full paths) found in `source_dir`.
        source_dir: Directory containing the images and their annotations.
        target_dir: Destination directory; created if it does not exist.

    Raises:
        FileNotFoundError: Propagated from `shutil.copy` if a listed image is
            missing from `source_dir`.
    """
    # An empty target_dir means "current directory" to os.path.join, and
    # os.makedirs('') would raise, so only create non-empty paths.
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    for img in file_list:
        # Copy image file
        shutil.copy(os.path.join(source_dir, img), os.path.join(target_dir, img))

        # Copy corresponding XML annotation file if it exists.
        # splitext swaps only the final extension; str.replace would rewrite
        # every ".jpg" in the name (e.g. "cam.jpg_01.jpg") and miss the file.
        annotation = os.path.splitext(img)[0] + '.xml'
        annotation_path = os.path.join(source_dir, annotation)
        if os.path.exists(annotation_path):
            shutil.copy(annotation_path, os.path.join(target_dir, annotation))

# Populate the train, validation and test directories from data_dir, in that order
for split_images, split_dir in (
    (train_images, train_dir),
    (val_images, val_dir),
    (test_images, test_dir),
):
    copy_files(split_images, data_dir, split_dir)

# Report how many images ended up in each split
print(f"Data split complete:\n- Training: {len(train_images)} images\n- Validation: {len(val_images)} images\n- Testing: {len(test_images)} images")


Total images: 3000
Data split complete:
- Training: 2100 images
- Validation: 450 images
- Testing: 450 images
