In [1]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import os
import cv2
import numpy as np
from PIL import Image, UnidentifiedImageError   
import os
import shutil
from pathlib import Path

In [2]:
#### CLEAN DATASET – REMOVE CORRUPTED IMAGES

In [3]:
DATA_PATH = "Data segrigation"           
CORRUPTED_FOLDER = "corrupted_images"

os.makedirs(CORRUPTED_FOLDER, exist_ok=True)

corrupted_count = 0
total_checked = 0

print("Scanning your dataset for corrupted or broken images...\n")
print("-" * 80)

for root, dirs, files in os.walk(DATA_PATH):
    for file in files:
        if file.lower().endswith(('.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp')):
            file_path = os.path.join(root, file)
            total_checked += 1
            
            try:
                
                with Image.open(file_path) as img:
                    img.verify() 
                
            except (Exception, UnidentifiedImageError, OSError, SyntaxError) as e:
                # All possible PIL errors are caught
                print(f"CORRUPTED → {file_path}")
                corrupted_count += 1
                
                # Move corrupted file to safe folder
                class_name = Path(root).name
                new_name = f"{class_name}_{file}"
                dest = os.path.join(CORRUPTED_FOLDER, new_name)
                try:
                    shutil.move(file_path, dest)
                    print(f"   → Moved to {dest}")
                except:
                    print(f"   → Failed to move (possibly already moved)")

print("-" * 80)
print(f"\nCLEANING COMPLETE!")
print(f"Total images scanned     : {total_checked}")
print(f"Corrupted & removed      : {corrupted_count}")
print(f"Healthy images remaining : {total_checked - corrupted_count}")
print(f"All corrupted files saved in: {CORRUPTED_FOLDER}")
print("\nYour dataset is now CLEAN and safe for augmentation & training!")

Scanning your dataset for corrupted or broken images...

--------------------------------------------------------------------------------
--------------------------------------------------------------------------------

CLEANING COMPLETE!
Total images scanned     : 3236
Corrupted & removed      : 0
Healthy images remaining : 3236
All corrupted files saved in: corrupted_images

Your dataset is now CLEAN and safe for augmentation & training!


In [4]:
# Define which classes need heavy augmentation
MINORITY_CLASSES = {
    'Bacteria Blight': 69,
    'Caterpillar Damage': 76,
    'Kanamadiri haniya': 134,
    'Red Spider mite Damage': 111
}

In [5]:
# Target: bring all minority classes to ~600–800 images each
TARGET_PER_CLASS = 500

# Create output directory
AUGMENTED_DIR = "Data segrigation_Augmented"
os.makedirs(AUGMENTED_DIR, exist_ok=True)

# Advanced augmentation generator
aug_datagen = ImageDataGenerator(
    rotation_range=40,
    width_shift_range=0.3,
    height_shift_range=0.3,
    shear_range=0.3,
    zoom_range=0.4,
    horizontal_flip=True,
    vertical_flip=True,
    brightness_range=[0.6, 1.4],
    channel_shift_range=30,
    fill_mode='nearest'
)

print("STARTING TARGETED AUGMENTATION FOR MINORITY CLASSES...")
print(f"Target: ~{TARGET_PER_CLASS} images per minority class\n")

STARTING TARGETED AUGMENTATION FOR MINORITY CLASSES...
Target: ~500 images per minority class



In [6]:
DATA_PATH= 'Data segrigation'

total_generated = 0

for class_name, original_count in MINORITY_CLASSES.items():
    class_dir = os.path.join(DATA_PATH, class_name)
    aug_class_dir = os.path.join(AUGMENTED_DIR, class_name)
    os.makedirs(aug_class_dir, exist_ok=True)
    
    # Copy original images first
    for img_file in os.listdir(class_dir):
        if img_file.lower().endswith(('.jpg', '.jpeg', '.png')):
            src = os.path.join(class_dir, img_file)
            dst = os.path.join(aug_class_dir, img_file)
            if not os.path.exists(dst):
                Image.open(src).save(dst)
    
    images_needed = TARGET_PER_CLASS - original_count
    if images_needed <= 0:
        print(f"{class_name}: Already has {original_count} ≥ {TARGET_PER_CLASS} → Skipping")
        continue
    
    print(f"{class_name}: {original_count} → Need {images_needed} more → Generating...")

    # Load all images from this class
    img_files = [f for f in os.listdir(class_dir) if f.lower().endswith(('.jpg', '.jpeg', '.png'))]
    generated = 0
    
    while generated < images_needed:
        np.random.shuffle(img_files)
        for img_file in img_files:
            if generated >= images_needed:
                break
                
            img_path = os.path.join(class_dir, img_file)
            img = cv2.imread(img_path)
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            img = img.reshape((1,) + img.shape)
            
            # Generate 5 augmented versions per original image
            i = 0
            for batch in aug_datagen.flow(img, batch_size=1):
                aug_img = batch[0].astype('uint8')
                
                # Save augmented image
                save_path = os.path.join(aug_class_dir, f"aug_{generated:05d}_{img_file}")
                Image.fromarray(aug_img).save(save_path)
                
                generated += 1
                total_generated += 1
                i += 1
                if i >= 5:  # 5 aug per original image per loop
                    break
                if generated >= images_needed:
                    break
            if generated >= images_needed:
                break
    
    final_count = len(os.listdir(aug_class_dir))
    print(f"   Done! {class_name}: {original_count} → {final_count} images")

print(f"\nAUGMENTATION COMPLETE!")
print(f"Total new augmented images generated: {total_generated}")
print(f"Saved in: {AUGMENTED_DIR}")

# Now update your DATA_PATH to use augmented data
print("\nRECOMMENDED: Change your DATA_PATH to use augmented dataset:")
print(f'DATA_PATH = "{AUGMENTED_DIR}"   # ← Use this for better balance!')

Kanamadiri haniya: 134 → Need 366 more → Generating...
   Done! Kanamadiri haniya: 134 → 406 images
Red Spider mite Damage: 111 → Need 389 more → Generating...
   Done! Red Spider mite Damage: 111 → 479 images

AUGMENTATION COMPLETE!
Total new augmented images generated: 755
Saved in: Data segrigation_Augmented

RECOMMENDED: Change your DATA_PATH to use augmented dataset:
DATA_PATH = "Data segrigation_Augmented"   # ← Use this for better balance!
