In [None]:
import os
import cv2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yaml
import random
import kagglehub
import zipfile

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive so that files kept on
# Drive can be opened through ordinary filesystem paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Location of the dataset archive on Drive and the folder it unpacks into.
ZIP_PATH = "/content/drive/MyDrive/YOLO-Waste-Detection-1.zip"
EXTRACT_DIR = "/content/YOLO-Waste-Detection-1"

# Skip the unpacking when the target folder is already present, so re-running
# this cell does not extract the archive again.
if not os.path.exists(EXTRACT_DIR):
    with zipfile.ZipFile(ZIP_PATH) as archive:
        # The archive is unpacked into the parent of EXTRACT_DIR, i.e. /content.
        archive.extractall(os.path.dirname(EXTRACT_DIR))

print("Extraction complete.")
print("Extracted contents:", os.listdir(EXTRACT_DIR))

Extraction complete.
Extracted contents: ['valid', 'data.yaml', 'README.dataset.txt', 'test', 'train', 'README.roboflow.txt']


In [None]:
# Dataset root and the split inspected by this cell.
YOLO_ROOT = "/content/YOLO-Waste-Detection-1"
SPLIT = "valid"

# Each split folder holds an images/ and a labels/ subfolder.
IMAGE_DIR, LABEL_DIR = (
    os.path.join(YOLO_ROOT, SPLIT, subfolder) for subfolder in ("images", "labels")
)

# Show the resolved paths first, then how many entries each directory contains.
for title, directory in (("Images", IMAGE_DIR), ("Labels", LABEL_DIR)):
    print(f"{title}:", directory)

for noun, directory in (("images", IMAGE_DIR), ("labels", LABEL_DIR)):
    print(f"Number of {noun}:", len(os.listdir(directory)))

Images: /content/YOLO-Waste-Detection-1/valid/images
Labels: /content/YOLO-Waste-Detection-1/valid/labels
Number of images: 2013
Number of labels: 2013


In [None]:
# Load the class names declared in the dataset's data.yaml.
# yaml.safe_load is used rather than yaml.load so only plain YAML data is parsed.
DATA_YAML_PATH = os.path.join(YOLO_ROOT, "data.yaml")

# Read as UTF-8 explicitly so class names do not depend on the platform default encoding.
with open(DATA_YAML_PATH, "r", encoding="utf-8") as f:
    yolo_data = yaml.safe_load(f)

YOLO_CLASS_NAMES = yolo_data["names"]
NUM_YOLO_CLASSES = len(YOLO_CLASS_NAMES)

print("Number of YOLO classes:", NUM_YOLO_CLASSES)

# data.yaml may also declare the class count as `nc`. If it disagrees with the number
# of names, label files can carry class ids that have no name, so surface the mismatch
# here. This is a warning only; the values loaded above are left untouched.
declared_num_classes = yolo_data.get("nc")
if declared_num_classes is not None and declared_num_classes != NUM_YOLO_CLASSES:
    print(f"WARNING: data.yaml declares nc={declared_num_classes} "
          f"but lists {NUM_YOLO_CLASSES} class names")

Number of YOLO classes: 43


In [None]:
# Count the entries in images/ and labels/ for every split, then print the grand total.
# Each directory is listed exactly once: the per-split counts feed the totals directly
# instead of being recomputed in a second loop.
splits = ['train', 'valid', 'test']

print("Number of images in each split:")
print("-" * 30)

total_images = 0
total_labels = 0

for split in splits:
    image_dir = os.path.join(YOLO_ROOT, split, "images")
    label_dir = os.path.join(YOLO_ROOT, split, "labels")

    # A missing directory contributes zero instead of raising.
    num_images = len(os.listdir(image_dir)) if os.path.exists(image_dir) else 0
    num_labels = len(os.listdir(label_dir)) if os.path.exists(label_dir) else 0

    total_images += num_images
    total_labels += num_labels

    print(f"{split.capitalize():10s} | Images: {num_images:4d} | Labels: {num_labels:4d}")

# Grand total across all splits
print("\n" + "=" * 30)
print(f"TOTAL      | Images: {total_images:4d} | Labels: {total_labels:4d}")

Number of images in each split:
------------------------------
Train      | Images: 19029 | Labels: 19029
Valid      | Images: 2013 | Labels: 2013
Test       | Images:  962 | Labels:  962

TOTAL      | Images: 22004 | Labels: 22004


In [None]:
from PIL import Image
import imghdr

def check_image_properties(folder_path, expected_size=(224, 224)):
    """
    Check resolution and file format for every image in a folder.

    Args:
        folder_path: Directory to inspect. Only files whose extension is one of
            .jpg/.jpeg/.png/.bmp/.tiff (case-insensitive) are considered.
        expected_size: (width, height) every image is expected to have.
            Defaults to (224, 224).

    Returns:
        Tuple (total_images, uniform_224x224, all_jpeg, resolutions, formats):
        - total_images: number of images opened without error.
        - uniform_224x224: True if no opened image deviates from expected_size.
        - all_jpeg: True if every considered file has a .jpg/.jpeg extension and
          every opened image was decoded as JPEG.
        - resolutions: set of (width, height) tuples encountered.
        - formats: set holding each file extension and each decoded format,
          both written as a lower-case dotted suffix (e.g. ".jpg", ".jpeg").
        If the folder does not exist: (0, False, False, set(), set()).
    """
    jpeg_extensions = ('.jpg', '.jpeg')
    image_extensions = jpeg_extensions + ('.png', '.bmp', '.tiff')

    if not os.path.exists(folder_path):
        print(f"Folder {folder_path} does not exist!")
        return 0, False, False, set(), set()

    total_images = 0
    uniform = True
    all_jpeg = True
    resolutions = set()
    formats = set()

    image_files = [f for f in os.listdir(folder_path)
                   if f.lower().endswith(image_extensions)]

    for i, filename in enumerate(image_files):
        if i % 500 == 0 and i > 0:
            print(f"  Processed {i}/{len(image_files)} images...")

        img_path = os.path.join(folder_path, filename)

        try:
            # What the file name claims
            ext = os.path.splitext(filename)[1].lower()
            formats.add(ext)
            if ext not in jpeg_extensions:
                all_jpeg = False

            # What the file content actually is: Pillow sets `format` from the decoder
            # that recognised the file, so the deprecated imghdr module is not needed
            # and the image is only opened once.
            with Image.open(img_path) as img:
                detected_format = img.format
                width, height = img.size

            if detected_format:
                formats.add(f".{detected_format.lower()}")
                # A non-JPEG payload behind a .jpg name must not pass the JPEG check.
                # NOTE(review): Pillow may report multi-picture JPEGs as "MPO"; those
                # would count as non-JPEG here - confirm whether that matters.
                if detected_format != "JPEG":
                    all_jpeg = False

            resolutions.add((width, height))
            if (width, height) != tuple(expected_size):
                uniform = False

        except Exception as e:
            # Unreadable files are reported and left out of total_images.
            print(f"Error processing {filename}: {e}")
            continue

        total_images += 1

    return total_images, uniform, all_jpeg, resolutions, formats

# Run check_image_properties on each split and keep what it reports, keyed by split.
print("Checking image properties for all splits...")
print("=" * 60)

results = {}
for split in ['train', 'valid', 'test']:
    print(f"\nChecking {split} split:")
    print("-" * 40)

    image_dir = os.path.join(YOLO_ROOT, split, "images")

    # A split without an images folder gets no entry in `results`.
    if not os.path.exists(image_dir):
        print(f"Image directory for {split} does not exist!")
        continue

    total, uniform, all_jpeg, resolutions, formats = check_image_properties(image_dir)

    results[split] = dict(
        total=total,
        uniform_224x224=uniform,
        all_jpeg=all_jpeg,
        resolutions=resolutions,
        formats=formats,
    )

    print(f"Total images: {total}")
    print(f"All images are 224x224? {uniform}")
    print(f"All images are JPEG? {all_jpeg}")

    # Details are only listed for a check that failed.
    if not uniform:
        print(f"Found {len(resolutions)} different resolution(s):")
        for width, height in sorted(resolutions):
            print(f"  - {width} x {height}")

    if not all_jpeg:
        print(f"Found {len(formats)} different format(s):")
        for fmt in sorted(formats):
            print(f"  - {fmt}")

# Judge the dataset's stated claim against the per-split findings stored in `results`.
print("\n" + "=" * 60)
print("SUMMARY: 'All images are resized to uniform resolution of 224x224 pixels and stored in JPEG format'")
print("=" * 60)

checked_splits = ['train', 'valid', 'test']

# A split that is absent from `results` counts as failing both checks.
uniform_flags = [results.get(name, {}).get('uniform_224x224', False) for name in checked_splits]
jpeg_flags = [results.get(name, {}).get('all_jpeg', False) for name in checked_splits]
all_uniform = all(uniform_flags)
all_jpeg_format = all(jpeg_flags)

verdict = {True: 'TRUE', False: 'FALSE'}
print("\nOverall evaluation:")
print(f"✓ All images are 224x224 pixels? {verdict[all_uniform]}")
print(f"✓ All images are JPEG format? {verdict[all_jpeg_format]}")
print(f"✓ Statement completely true? {verdict[all_uniform and all_jpeg_format]}")

# For each non-uniform split, list at most five resolutions and summarise the rest.
if not all_uniform:
    print("\nSample of different resolutions found:")
    for split in checked_splits:
        if split not in results or results[split]['uniform_224x224']:
            continue
        print(f"\n{split.upper()} split resolutions:")
        ordered_resolutions = sorted(results[split]['resolutions'])
        for res in ordered_resolutions[:5]:
            print(f"  {res[0]} x {res[1]}")
        if len(ordered_resolutions) > 5:
            print(f"  ... and {len(ordered_resolutions) - 5} more")

# For each split that failed the JPEG check, list every format that was recorded.
if not all_jpeg_format:
    print("\nNon-JPEG formats found:")
    for split in checked_splits:
        if split not in results or results[split]['all_jpeg']:
            continue
        print(f"\n{split.upper()} split formats:")
        for fmt in sorted(results[split]['formats']):
            print(f"  {fmt}")

# Quick spot check: print size and file extension for the first 10 images of each
# split, taken in sorted filename order. The selection is deterministic, not random,
# so the banner below says "sample" rather than "random sample".
print("\n" + "=" * 60)
print("QUICK SAMPLE CHECK (first 10 images from each split):")
print("=" * 60)

for split in ['train', 'valid', 'test']:
    image_dir = os.path.join(YOLO_ROOT, split, "images")
    if not os.path.exists(image_dir):
        continue

    image_files = sorted([f for f in os.listdir(image_dir)
                         if f.lower().endswith(('.jpg', '.jpeg', '.png', '.bmp', '.tiff'))])[:10]

    print(f"\n{split.upper()} split - First 10 images:")
    print("-" * 40)

    for filename in image_files:
        img_path = os.path.join(image_dir, filename)
        try:
            with Image.open(img_path) as img:
                width, height = img.size
                # "Format" here is the file extension, not the decoded image format.
                ext = os.path.splitext(filename)[1].lower()
                print(f"{filename:30s} | {width:4d}x{height:<4d} | Format: {ext}")
        except Exception as e:
            # Keep going: one unreadable file should not stop the listing.
            print(f"{filename:30s} | ERROR: {e}")

  import imghdr


Checking image properties for all splits...

Checking train split:
----------------------------------------
  Processed 500/19029 images...
  Processed 1000/19029 images...
  Processed 1500/19029 images...
  Processed 2000/19029 images...
  Processed 2500/19029 images...
  Processed 3000/19029 images...
  Processed 3500/19029 images...
  Processed 4000/19029 images...
  Processed 4500/19029 images...
  Processed 5000/19029 images...
  Processed 5500/19029 images...
  Processed 6000/19029 images...
  Processed 6500/19029 images...
  Processed 7000/19029 images...
  Processed 7500/19029 images...
  Processed 8000/19029 images...
  Processed 8500/19029 images...
  Processed 9000/19029 images...
  Processed 9500/19029 images...
  Processed 10000/19029 images...
  Processed 10500/19029 images...
  Processed 11000/19029 images...
  Processed 11500/19029 images...
  Processed 12000/19029 images...
  Processed 12500/19029 images...
  Processed 13000/19029 images...
  Processed 13500/19029 imag

In [None]:
# Check class distribution in labels
import pandas as pd
import numpy as np

def analyze_class_distribution(split='train', root=None):
    """
    Count labelled objects per class id in one split's label files.

    Every non-blank line of every .txt file in <root>/<split>/labels is treated as one
    object whose class id is the first whitespace-separated token.

    Args:
        split: Name of the split folder under the dataset root (default 'train').
        root: Dataset root directory. When None, the module-level YOLO_ROOT is used.

    Returns:
        dict mapping class id (int) to the number of label lines carrying that id.

    Raises:
        FileNotFoundError: if the labels directory does not exist.
        ValueError: if the first token of a non-blank line is not an integer.
    """
    if root is None:
        root = YOLO_ROOT

    label_dir = os.path.join(root, split, "labels")
    class_counts = {}

    for label_file in os.listdir(label_dir):
        if not label_file.endswith('.txt'):
            continue

        with open(os.path.join(label_dir, label_file), 'r') as f:
            for line in f:
                fields = line.split()
                # Blank or whitespace-only lines carry no annotation; skipping them
                # avoids an IndexError on fields[0].
                if not fields:
                    continue
                class_id = int(fields[0])
                class_counts[class_id] = class_counts.get(class_id, 0) + 1

    return class_counts

# Per split: total object count, the class ids present, and the five most frequent classes.
print("Class distribution analysis:")
for split in ['train', 'valid', 'test']:
    counts = analyze_class_distribution(split)
    total_objects = sum(counts.values())
    print(f"\n{split.upper()} split - Total objects: {total_objects}")
    print(f"Classes present: {sorted(counts)}")
    # Rank by object count, highest first, and keep the leading five
    sorted_counts = sorted(counts.items(), key=lambda item: item[1], reverse=True)[:5]
    print("Top 5 classes:")
    for class_id, count in sorted_counts:
        if class_id < len(YOLO_CLASS_NAMES):
            class_name = YOLO_CLASS_NAMES[class_id]
        else:
            # Ids beyond the loaded names are shown by number
            class_name = f"Unknown({class_id})"
        print(f"  {class_name}: {count}")

Class distribution analysis:

TRAIN split - Total objects: 30785
Classes present: [0, 1, 3, 4, 5, 7, 8, 9, 11, 12, 14, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 41, 42, 43]
Top 5 classes:
  Unknown(43): 12383
  Aluminum can: 3846
  Plastic bottle: 2700
  Plastic bag: 1878
  Ramen Cup: 1865

VALID split - Total objects: 2970
Classes present: [0, 1, 2, 3, 6, 7, 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24, 26, 28, 29, 30, 32, 33, 36, 37, 38, 39, 40, 41, 42, 43]
Top 5 classes:
  Unknown(43): 1133
  Aluminum can: 404
  Ramen Cup: 278
  Plastic bottle: 234
  Organic: 152

TEST split - Total objects: 1326
Classes present: [0, 1, 3, 7, 8, 9, 10, 12, 17, 18, 19, 23, 24, 25, 26, 28, 29, 30, 32, 33, 36, 37, 38, 39, 41, 43]
Top 5 classes:
  Unknown(43): 502
  Aluminum can: 192
  Plastic bottle: 123
  Plastic bag: 104
  Cardboard: 75


In [None]:
# Spot-check aspect ratios (width / height) on up to 100 images per split.
aspect_ratios = []

for split in ['train', 'valid', 'test']:
    image_dir = os.path.join(YOLO_ROOT, split, "images")
    if not os.path.exists(image_dir):
        continue

    # Filter by extension before slicing so non-image entries do not shrink the sample.
    image_names = [f for f in os.listdir(image_dir)
                   if f.lower().endswith(('.jpg', '.jpeg', '.png'))]

    for filename in image_names[:100]:  # Sample 100 from each
        img_path = os.path.join(image_dir, filename)
        try:
            with Image.open(img_path) as img:
                width, height = img.size
            aspect_ratios.append(width / height)
        except Exception as e:
            # Report unreadable files instead of hiding them; a bare `except` would
            # also swallow KeyboardInterrupt and make the cell impossible to stop.
            print(f"Error processing {filename}: {e}")

print(f"\nAspect ratio analysis (sample):")
if aspect_ratios:
    print(f"Min aspect ratio: {min(aspect_ratios):.2f}")
    print(f"Max aspect ratio: {max(aspect_ratios):.2f}")
    print(f"Mean aspect ratio: {np.mean(aspect_ratios):.2f}")
    print(f"Standard deviation: {np.std(aspect_ratios):.2f}")
else:
    # min()/max() raise on an empty sequence, so say why there are no statistics.
    print("No readable images were sampled; aspect ratio statistics are unavailable.")


Aspect ratio analysis (sample):
Min aspect ratio: 1.00
Max aspect ratio: 1.00
Mean aspect ratio: 1.00
Standard deviation: 0.00


In [None]:
# For every split, compare image and label base names (file name without extension)
# to find images that have no label file and label files that have no image.
print("\nLabel-image correspondence check:")
for split in ['train', 'valid', 'test']:
    image_dir = os.path.join(YOLO_ROOT, split, "images")
    label_dir = os.path.join(YOLO_ROOT, split, "labels")

    if not os.path.exists(image_dir) or not os.path.exists(label_dir):
        # Say so explicitly; otherwise a skipped split looks like a clean one.
        print(f"\n{split.upper()} split: images or labels directory missing, skipped")
        continue

    # Same extension list as the image property check, so an image that check accepts
    # is not reported here as a label without an image.
    image_files = {os.path.splitext(f)[0] for f in os.listdir(image_dir)
                   if f.lower().endswith(('.jpg', '.jpeg', '.png', '.bmp', '.tiff'))}
    # Extension match is case-insensitive, like the image filter.
    label_files = {os.path.splitext(f)[0] for f in os.listdir(label_dir)
                   if f.lower().endswith('.txt')}

    images_without_labels = image_files - label_files
    labels_without_images = label_files - image_files

    print(f"\n{split.upper()} split:")
    print(f"  Images without labels: {len(images_without_labels)}")
    print(f"  Labels without images: {len(labels_without_images)}")

    # Name a few offenders so a non-zero count can be acted on.
    for name in sorted(images_without_labels)[:5]:
        print(f"    image without label: {name}")
    for name in sorted(labels_without_images)[:5]:
        print(f"    label without image: {name}")


Label-image correspondence check:

TRAIN split:
  Images without labels: 0
  Labels without images: 0

VALID split:
  Images without labels: 0
  Labels without images: 0

TEST split:
  Images without labels: 0
  Labels without images: 0
