In [1]:
import os
import shutil
import random

# Adjustable paths
source_folder = "garbage_classification_raw"
train_folder = "garbage_classification_train_test/train_set"
test_folder = "garbage_classification_train_test/test_set"

# Adjustable split ratio (fraction of each category copied to the train set)
split_ratio = 0.8

# Fixed seed so that re-running this cell reproduces exactly the same split.
# Without it every run shuffles differently, and because files are copied into
# folders that may already exist, a second run would place some images in BOTH
# the train and the test set.
# NOTE: output left over from an earlier run with a different shuffle is not
# removed here; delete the train/test folders before re-splitting.
split_seed = 42
rng = random.Random(split_seed)

# Create train and test folders (no error if they already exist)
for folder in [train_folder, test_folder]:
    os.makedirs(folder, exist_ok=True)

# Loop through each category.
# os.listdir returns entries in arbitrary order, so sort to keep the sequence
# of draws from `rng` (and therefore the split) stable across machines and runs.
for category in sorted(os.listdir(source_folder)):
    category_path = os.path.join(source_folder, category)
    if not os.path.isdir(category_path):
        continue

    # Only regular files are copied: shutil.copy2 fails on a directory.
    images = sorted(
        name
        for name in os.listdir(category_path)
        if os.path.isfile(os.path.join(category_path, name))
    )
    rng.shuffle(images)

    # The first `split_ratio` share goes to train, the remainder to test
    split_point = int(len(images) * split_ratio)
    train_images = images[:split_point]
    test_images = images[split_point:]

    # Create category folders in train and test directories
    train_category_path = os.path.join(train_folder, category)
    test_category_path = os.path.join(test_folder, category)
    os.makedirs(train_category_path, exist_ok=True)
    os.makedirs(test_category_path, exist_ok=True)

    # Copy each file (with metadata) into its split's category folder
    for destination, selected in (
        (train_category_path, train_images),
        (test_category_path, test_images),
    ):
        for img in selected:
            shutil.copy2(
                os.path.join(category_path, img),
                os.path.join(destination, img),
            )

print("Dataset split complete. Train and test folders are ready.")

Dataset split complete. Train and test folders are ready.


In [3]:
import os

def verify_split(folder):
    """Print how many files each category sub-folder of ``folder`` contains.

    A header line is printed first. If ``folder`` does not exist, or holds no
    sub-directories, a failure message is printed and the function returns
    early. Otherwise one line per category is printed, followed by the total.

    Args:
        folder: Path of the directory whose immediate sub-directories are
            treated as categories.

    Returns:
        None. All results are reported through ``print``.
    """
    print(f"\nVerifying folder: {folder}")

    if not os.path.exists(folder):
        print(f"❌ Folder '{folder}' does not exist.")
        return

    # Only immediate sub-directories count as categories.
    categories = []
    for entry in os.listdir(folder):
        if os.path.isdir(os.path.join(folder, entry)):
            categories.append(entry)

    if len(categories) == 0:
        print(f"❌ No category folders found in '{folder}'.")
        return

    running_total = 0
    for category in categories:
        category_dir = os.path.join(folder, category)
        # Count regular files only; nested directories are ignored.
        file_count = sum(
            1
            for name in os.listdir(category_dir)
            if os.path.isfile(os.path.join(category_dir, name))
        )
        print(f"✅ Category '{category}': {file_count} images")
        running_total += file_count

    print(f"✅ Total images in {folder}: {running_total}")

# Report per-category counts for the train split first, then the test split.
for split_dir in (train_folder, test_folder):
    verify_split(split_dir)


Verifying folder: garbage_classification_train_test/train_set
✅ Category 'paper': 475 images
✅ Category 'metal': 328 images
✅ Category 'cardboard': 322 images
✅ Category 'trash': 109 images
✅ Category 'glass': 400 images
✅ Category 'plastic': 385 images
✅ Total images in garbage_classification_train_test/train_set: 2019

Verifying folder: garbage_classification_train_test/test_set
✅ Category 'paper': 119 images
✅ Category 'metal': 82 images
✅ Category 'cardboard': 81 images
✅ Category 'trash': 28 images
✅ Category 'glass': 101 images
✅ Category 'plastic': 97 images
✅ Total images in garbage_classification_train_test/test_set: 508


In [4]:
# Output locations of the two splits. These repeat the values assigned in the
# first cell, so this cell does not depend on that cell's variables still
# being defined in the kernel.
train_folder = "garbage_classification_train_test/train_set"
test_folder = "garbage_classification_train_test/test_set"

def get_all_relative_paths(folder):
    """Collect the ``category/filename`` path of every file under ``folder``.

    Each immediate sub-directory of ``folder`` is treated as a category. Only
    regular files directly inside a category are included; nested directories
    are skipped so that the size of the result agrees with the file counts
    reported by the other verification helpers in this notebook.

    Args:
        folder: Path of the split directory to scan.

    Returns:
        A set of paths relative to ``folder``, each of the form
        ``os.path.join(category, filename)``.
    """
    paths = set()
    for category in os.listdir(folder):
        category_path = os.path.join(folder, category)
        if not os.path.isdir(category_path):
            # Stray files next to the category folders are not part of the split.
            continue
        for img in os.listdir(category_path):
            # Skip sub-directories: they are not images.
            if os.path.isfile(os.path.join(category_path, img)):
                paths.add(os.path.join(category, img))
    return paths

train_paths = get_all_relative_paths(train_folder)
test_paths = get_all_relative_paths(test_folder)

print(f"✅ Total images in train set: {len(train_paths)}")
print(f"✅ Total images in test set: {len(test_paths)}")

# Check for duplicates: a relative path (category/filename) present in both
# sets means the same source image was copied into train AND test.
duplicates = train_paths.intersection(test_paths)

if duplicates:
    print(f"❌ Found {len(duplicates)} duplicate images in both train and test sets:")
    # Sets have no defined order; sort so the 10 examples shown are the same
    # on every run instead of an arbitrary sample.
    for dup in sorted(duplicates)[:10]:  # show first 10 only
        print(f"- {dup}")
else:
    print("✅ No duplicate images found between train and test sets.")

✅ Total images in train set: 2019
✅ Total images in test set: 508
✅ No duplicate images found between train and test sets.


In [5]:
def count_images(folder):
    """Return the number of regular files found across all category folders.

    Every immediate sub-directory of ``folder`` is treated as a category, and
    only regular files directly inside a category are counted.

    Args:
        folder: Path of the directory containing the category sub-folders.

    Returns:
        The total file count as an ``int``.
    """
    category_dirs = (os.path.join(folder, entry) for entry in os.listdir(folder))
    return sum(
        1
        for category_dir in category_dirs
        if os.path.isdir(category_dir)
        for name in os.listdir(category_dir)
        if os.path.isfile(os.path.join(category_dir, name))
    )

# Count the files in the raw data and in each split, in that order.
source_count, train_count, test_count = (
    count_images(path) for path in (source_folder, train_folder, test_folder)
)

for label, amount in (
    ("source", source_count),
    ("train", train_count),
    ("test", test_count),
):
    print(f"✅ Total images in {label}: {amount}")

# Every source file should have been copied to exactly one of the two splits.
combined_count = train_count + test_count
if source_count != combined_count:
    print("❌ Mismatch detected!")
    print(f"Total in train + test: {combined_count}")
    print(f"Difference: {source_count - combined_count}")
else:
    print("✅ The total number of images in train and test matches the original source data.")

✅ Total images in source: 2527
✅ Total images in train: 2019
✅ Total images in test: 508
✅ The total number of images in train and test matches the original source data.
