In [None]:
import os

'''
Script that writes a ___.txt file listing the paths of all images in a dataset split.

It is currently set up for the scenario where all images are in the train dataset, so it
goes through every file in the images/train folder and writes its path to train.txt.

'''

# Folder that is scanned, and the listing file that gets (over)written
folder_path = "test-dataset-cvat/images/train"
output_file = "test-dataset-cvat/train.txt"

# Keep regular files only (sub-directories are skipped); sorting gives a
# stable order regardless of how the OS enumerates the folder
files = sorted(
    entry
    for entry in os.listdir(folder_path)
    if os.path.isfile(os.path.join(folder_path, entry))
)

# One line per file, using the fixed "data/images/train/" prefix that the
# listing format expects (not the on-disk folder_path)
with open(output_file, 'w') as listing:
    listing.writelines(f"data/images/train/{name}\n" for name in files)

print(f"Written {len(files)} filenames to {output_file}")

Written 39 filenames to test-dataset-cvat/train.txt


In [None]:
import os
import shutil
import random
from pathlib import Path

# Configuration
# Folders holding the not-yet-split images and their label files, and the base
# folder under which the train/val/test sub-folders are created.
source_images_folder = "synthetic-dataset/images"
source_labels_folder = "synthetic-dataset/labels"
output_base_folder = "synthetic-dataset"

# Split ratios (should sum to 1.0)
# NOTE: split_dataset() sizes only the train and val sets from these ratios;
# the test set receives whatever files remain. In this cell test_ratio is
# otherwise only echoed in the printed messages.
train_ratio = 0.7
val_ratio = 0.15
test_ratio = 0.15

# Set random seed for reproducibility
# Seeds Python's global `random` generator, which split_dataset() uses for its
# shuffle.
random.seed(42)

def create_folder_structure(base_folder):
    """Ensure ``<base_folder>/<split>/images`` and ``.../labels`` exist.

    Directories are created for the 'train', 'val' and 'test' splits; folders
    that already exist are left as they are. Prints a confirmation line and
    returns nothing.
    """
    split_names = ('train', 'val', 'test')
    subfolder_names = ('images', 'labels')

    for split_name in split_names:
        for subfolder in subfolder_names:
            target_dir = os.path.join(base_folder, split_name, subfolder)
            os.makedirs(target_dir, exist_ok=True)

    print("✓ Created folder structure")

def get_image_files(folder_path):
    """Return the sorted names of the image files directly inside ``folder_path``.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A list of bare filenames (no directory part) whose extension, compared
        case-insensitively, is one of .jpg, .jpeg, .png, .bmp, .tiff or .tif.

    Raises:
        OSError: If ``folder_path`` cannot be listed (e.g. it does not exist).
    """
    image_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif')

    # os.listdir() returns entries in arbitrary, filesystem-dependent order.
    # Sorting makes the result deterministic, so a seeded shuffle of this list
    # yields the same split on every machine.
    return sorted(
        filename
        for filename in os.listdir(folder_path)
        if filename.lower().endswith(image_extensions)
        # Skip directories that merely happen to be named like an image
        and os.path.isfile(os.path.join(folder_path, filename))
    )

def split_dataset(files, train_ratio, val_ratio, test_ratio):
    """Randomly partition ``files`` into train, validation and test lists.

    The input is left untouched: a copy is shuffled with the global ``random``
    generator (seed it beforehand for a reproducible split).

    Args:
        files: Iterable of items (e.g. filenames) to split.
        train_ratio: Fraction of the items assigned to the train set.
        val_ratio: Fraction of the items assigned to the validation set.
        test_ratio: Accepted for interface symmetry but not used in the
            computation; the test set is whatever remains after the train and
            validation sets have been taken.

    Returns:
        A ``(train_files, val_files, test_files)`` tuple of lists. Set sizes
        are rounded down, so any rounding remainder ends up in the test set.
    """
    # Shuffle a private copy so the caller's list keeps its original order
    shuffled = list(files)
    random.shuffle(shuffled)

    # Calculate split indices
    total = len(shuffled)
    train_end = int(total * train_ratio)
    val_end = train_end + int(total * val_ratio)

    train_files = shuffled[:train_end]
    val_files = shuffled[train_end:val_end]
    test_files = shuffled[val_end:]

    return train_files, val_files, test_files

def copy_files(files, source_img_folder, source_lbl_folder, 
               dest_img_folder, dest_lbl_folder):
    """Move each image and its matching label file into the destination folders.

    Despite the historical name, files are *moved* (removed from the source
    folders), which is what the calling code reports in its messages.

    Args:
        files: Image filenames (without directory) to transfer.
        source_img_folder: Folder currently containing the images.
        source_lbl_folder: Folder currently containing the label files.
        dest_img_folder: Existing folder that receives the images.
        dest_lbl_folder: Existing folder that receives the label files.

    Returns:
        A ``(copied_count, missing_labels)`` tuple: the number of images whose
        label file was found and moved along with them, and the list of image
        filenames for which no label file exists. Images without a label are
        still moved.

    Raises:
        OSError: If an image listed in ``files`` is missing from
            ``source_img_folder`` or a file cannot be moved.
    """
    copied_count = 0
    missing_labels = []
    
    for filename in files:
        # Get base name without extension
        base_name = os.path.splitext(filename)[0]
        
        # Move image (shutil has no `move2`; `move` is the relocating call)
        src_img = os.path.join(source_img_folder, filename)
        dst_img = os.path.join(dest_img_folder, filename)
        shutil.move(src_img, dst_img)
        
        # Move label (assuming .txt extension with the same base name)
        label_filename = base_name + '.txt'
        src_lbl = os.path.join(source_lbl_folder, label_filename)
        dst_lbl = os.path.join(dest_lbl_folder, label_filename)
        
        if os.path.exists(src_lbl):
            shutil.move(src_lbl, dst_lbl)
            copied_count += 1
        else:
            missing_labels.append(filename)
    
    return copied_count, missing_labels

# Start processing
print("Starting dataset split...")
print(f"Train: {train_ratio*100}%, Val: {val_ratio*100}%, Test: {test_ratio*100}%")
print("-" * 60)

# Create folder structure
create_folder_structure(output_base_folder)

# Get all image files
all_files = get_image_files(source_images_folder)
print(f"✓ Found {len(all_files)} images")

# Split dataset
train_files, val_files, test_files = split_dataset(
    all_files, train_ratio, val_ratio, test_ratio
)

print(f"✓ Split into: Train={len(train_files)}, Val={len(val_files)}, Test={len(test_files)}")
print("-" * 60)

# Move every split with the same routine instead of three pasted stanzas.
# Each entry: (destination sub-folder, name used in messages, files to move)
split_plan = [
    ('train', 'train', train_files),
    ('val', 'validation', val_files),
    ('test', 'test', test_files),
]

split_results = {}
for split_folder, split_label, split_files in split_plan:
    print(f"Moving {split_label} files...")
    moved_count, missing_labels = copy_files(
        split_files, source_images_folder, source_labels_folder,
        os.path.join(output_base_folder, split_folder, 'images'),
        os.path.join(output_base_folder, split_folder, 'labels')
    )
    print(f"✓ Moved {moved_count} {split_label} image-label pairs")
    if missing_labels:
        print(f"⚠ Warning: {len(missing_labels)} {split_label} images missing labels")
    split_results[split_folder] = (moved_count, missing_labels)

# Keep the per-split names available for any later cell that reads them
train_copied, train_missing = split_results['train']
val_copied, val_missing = split_results['val']
test_copied, test_missing = split_results['test']

print("-" * 60)
print("✓ Dataset split complete!")
print("\nSummary:")
# Percentages shown are the configured ratios, not the realised proportions
print(f"  Train: {len(train_files)} images ({train_ratio*100}%)")
print(f"  Val:   {len(val_files)} images ({val_ratio*100}%)")
print(f"  Test:  {len(test_files)} images ({test_ratio*100}%)")
print(f"  Total: {len(all_files)} images")

Starting dataset split...
Train: 70.0%, Val: 15.0%, Test: 15.0%
------------------------------------------------------------
✓ Created folder structure
✓ Found 1000 images
✓ Split into: Train=700, Val=150, Test=150
------------------------------------------------------------
Copying train files...
✓ Copied 700 train image-label pairs
Copying validation files...
✓ Copied 150 validation image-label pairs
Copying test files...
✓ Copied 150 test image-label pairs
------------------------------------------------------------
✓ Dataset split complete!

Summary:
  Train: 700 images (70.0%)
  Val:   150 images (15.0%)
  Test:  150 images (15.0%)
  Total: 1000 images
