Here, we filter and downsample classes in our train, validation and test sets for our second model, which is trained on a mixed dataset (PlantVillage and PlantDoc). We keep all PlantDoc images, as there are only a few tens to a few hundred of them per class, and downsample the PlantVillage images.

For the test sets, we only filter out the classes that are not present in the model.

In [None]:
# Mount Google Drive at /content/drive; every dataset path used in this
# notebook lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

# Filtering and downsampling Mixed dataset



In [None]:
import os
import shutil
from tqdm import tqdm



# Input: folder holding the merged training images, one sub-folder per class
SOURCE_DIR = "/content/drive/MyDrive/plant_village_dataset/train"

# Output: folder that receives the filtered / downsampled training images
OUTPUT_DIR = "/content/drive/MyDrive/plant_village_dataset/train_downsampled"

# Class (folder) names that exist in PlantDoc; only these classes are kept.
# The names must match the class folder names exactly, including the
# trailing underscore in "Common_rust_" and the embedded spaces.
plantdoc_classes = [
    # Apple
    "Apple___Apple_scab",
    "Apple___Cedar_apple_rust",
    "Apple___healthy",
    # Blueberry
    "Blueberry___healthy",
    # Cherry
    "Cherry_(including_sour)___healthy",
    # Corn
    "Corn_(maize)___Cercospora_leaf_spot Gray_leaf_spot",
    "Corn_(maize)___Common_rust_",
    "Corn_(maize)___Northern_Leaf_Blight",
    # Grape
    "Grape___Black_rot",
    "Grape___healthy",
    # Peach
    "Peach___healthy",
    # Bell pepper
    "Pepper,_bell___Bacterial_spot",
    "Pepper,_bell___healthy",
    # Potato
    "Potato___Early_blight",
    "Potato___Late_blight",
    # Raspberry
    "Raspberry___healthy",
    # Soybean
    "Soybean___healthy",
    # Squash
    "Squash___Powdery_mildew",
    # Strawberry
    "Strawberry___healthy",
    # Tomato
    "Tomato___Bacterial_spot",
    "Tomato___Early_blight",
    "Tomato___Late_blight",
    "Tomato___healthy",
    "Tomato___Leaf_Mold",
    "Tomato___Septoria_leaf_spot",
    "Tomato___Spider_mites Two-spotted_spider_mite",
    "Tomato___Tomato_mosaic_virus",
    "Tomato___Tomato_Yellow_Leaf_Curl_Virus",
]

## Mixed train set




In [None]:


# Filter the merged training set to the PlantDoc classes and cap every class
# at LIMIT images: PlantDoc images ("doc_" prefix) are taken first, and the
# rest of the budget is filled with a reproducible random sample of the
# PlantVillage images.
import random

# Max images per class
LIMIT = 1000

# Fixed seed so that re-running this cell always picks the same PlantVillage
# subset (otherwise a re-run would add different files to OUTPUT_DIR and the
# class would silently grow past LIMIT).
SAMPLE_SEED = 42

# Set lookup instead of scanning the list once per class folder
allowed_classes = set(plantdoc_classes)

os.makedirs(OUTPUT_DIR, exist_ok=True)

# os.listdir() returns entries in arbitrary order; sort for a stable traversal
for class_name in tqdm(sorted(os.listdir(SOURCE_DIR)), desc="Processing classes"):

    class_path = os.path.join(SOURCE_DIR, class_name)

    # Skip items that are not folders
    if not os.path.isdir(class_path):
        continue

    # Skip classes not present in PlantDoc
    if class_name not in allowed_classes:
        continue

    # Gather images: regular files only, so a stray sub-folder (e.g.
    # ".ipynb_checkpoints") cannot make shutil.copy2 fail. Sorted because the
    # directory listing order is arbitrary.
    with os.scandir(class_path) as entries:
        all_images = sorted(entry.name for entry in entries if entry.is_file())

    # Separate by origin
    doc_images = [img for img in all_images if img.startswith("doc_")]
    pv_images = [img for img in all_images if not img.startswith("doc_")]  # PlantVillage

    # PlantDoc images come first.
    # NOTE(review): the notebook intro says all PlantDoc images are kept, but
    # the original code truncated them to LIMIT as well; that cap is preserved
    # here. Confirm which behaviour is intended if a class ever exceeds LIMIT.
    selected = doc_images[:LIMIT]

    # Fill what is left of the budget from PlantVillage
    remaining = LIMIT - len(selected)
    if remaining > 0:
        if len(pv_images) > remaining:
            # Seed per class so the chosen subset of one class does not depend
            # on which other classes are present in SOURCE_DIR.
            rng = random.Random(f"{SAMPLE_SEED}:{class_name}")
            pv_images = sorted(rng.sample(pv_images, remaining))
        # A class with fewer than LIMIT images is kept in full
        selected.extend(pv_images)

    # Create output folder
    out_class_path = os.path.join(OUTPUT_DIR, class_name)
    os.makedirs(out_class_path, exist_ok=True)

    # Copy selected images (copy2 also preserves file metadata)
    for img_name in selected:
        src = os.path.join(class_path, img_name)
        dst = os.path.join(out_class_path, img_name)
        shutil.copy2(src, dst)

print("\n DONE Downsampled dataset created at:", OUTPUT_DIR)


## Mixed Validation Set

In [None]:



# Filter the validation set to the PlantDoc classes and cap every class at
# LIMIT images: PlantDoc images ("doc_" prefix) are taken first, and the rest
# of the budget is filled with a reproducible random sample of the
# PlantVillage images.
import random

# Folder that contains the validation dataset
VAL_SOURCE_DIR = "/content/drive/MyDrive/plant_village_dataset/val"

# Output directory for downsampled validation set
VAL_OUTPUT_DIR = "/content/drive/MyDrive/plant_village_dataset/val_downsampled"

os.makedirs(VAL_OUTPUT_DIR, exist_ok=True)

# Max images per class for validation.
# NOTE: this rebinds the LIMIT used by the training cell; re-run that cell's
# own assignment before re-running the training loop.
LIMIT = 100

# Fixed seed so that re-running this cell always picks the same PlantVillage
# subset (otherwise a re-run would add different files to VAL_OUTPUT_DIR and
# the class would silently grow past LIMIT).
SAMPLE_SEED = 42

# Set lookup instead of scanning the list once per class folder
allowed_classes = set(plantdoc_classes)

# os.listdir() returns entries in arbitrary order; sort for a stable traversal
for class_name in tqdm(sorted(os.listdir(VAL_SOURCE_DIR)), desc="Processing classes"):

    class_path = os.path.join(VAL_SOURCE_DIR, class_name)

    # Skip items that are not folders
    if not os.path.isdir(class_path):
        continue

    # Skip classes not in PlantDoc
    if class_name not in allowed_classes:
        continue

    # Gather images: regular files only, so a stray sub-folder (e.g.
    # ".ipynb_checkpoints") cannot make shutil.copy2 fail. Sorted because the
    # directory listing order is arbitrary.
    with os.scandir(class_path) as entries:
        all_images = sorted(entry.name for entry in entries if entry.is_file())

    # Separate by origin
    doc_images = [img for img in all_images if img.startswith("doc_")]
    pv_images = [img for img in all_images if not img.startswith("doc_")]  # PlantVillage

    # PlantDoc images come first.
    # NOTE(review): the notebook intro says all PlantDoc images are kept, but
    # the original code truncated them to LIMIT as well; that cap is preserved
    # here. Confirm which behaviour is intended if a class ever exceeds LIMIT.
    selected = doc_images[:LIMIT]

    # Fill what is left of the budget from PlantVillage
    remaining = LIMIT - len(selected)
    if remaining > 0:
        if len(pv_images) > remaining:
            # Seed per class so the chosen subset of one class does not depend
            # on which other classes are present in VAL_SOURCE_DIR.
            rng = random.Random(f"{SAMPLE_SEED}:{class_name}")
            pv_images = sorted(rng.sample(pv_images, remaining))
        # A class with fewer than LIMIT images is kept in full
        selected.extend(pv_images)

    # Create output folder
    out_class_path = os.path.join(VAL_OUTPUT_DIR, class_name)
    os.makedirs(out_class_path, exist_ok=True)

    # Copy selected images (copy2 also preserves file metadata)
    for img_name in selected:
        src = os.path.join(class_path, img_name)
        dst = os.path.join(out_class_path, img_name)
        shutil.copy2(src, dst)

print("\n DONE! Downsampled dataset created at:", VAL_OUTPUT_DIR)

# Filtering testing sets

In [None]:
def downsample_validation_by_class(source_dir, dest_dir, allowed_classes):
    """Copy the class folders listed in *allowed_classes* to *dest_dir*.

    Despite its name, this function does not downsample anything: every file
    of every allowed class is copied, and class folders that are not allowed
    are left out. In this notebook it is used to filter the test sets.

    Args:
        source_dir: Folder containing one sub-folder per class.
        dest_dir: Folder that receives the filtered copy; created if missing.
            Existing files with the same name are overwritten.
        allowed_classes: Iterable of class (folder) names to keep.

    Returns:
        None. Progress and the output location are printed.
    """
    # Build the lookup once instead of scanning the sequence for every folder
    allowed = set(allowed_classes)

    os.makedirs(dest_dir, exist_ok=True)

    print("\n Downsampling validation set")
    print(f"Source: {source_dir}")
    print(f"Destination: {dest_dir}\n")

    for class_name in tqdm(os.listdir(source_dir), desc="Classes"):

        src_class_dir = os.path.join(source_dir, class_name)

        # Skip anything that is not a folder
        if not os.path.isdir(src_class_dir):
            continue

        # Skip classes not in the allowed list
        if class_name not in allowed:
            continue

        # Create destination folder
        dst_class_dir = os.path.join(dest_dir, class_name)
        os.makedirs(dst_class_dir, exist_ok=True)

        # Copy all images in that class. Only regular files are copied:
        # shutil.copy2 raises on directories, so a nested folder such as
        # ".ipynb_checkpoints" would otherwise abort the whole run.
        with os.scandir(src_class_dir) as entries:
            for entry in entries:
                if not entry.is_file():
                    continue
                shutil.copy2(entry.path, os.path.join(dst_class_dir, entry.name))

    print("\n Validation downsampled successfully!")
    print(f"Output stored in: {dest_dir}\n")


In [None]:
# PlantVillage test set: keep only the class folders listed in
# plantdoc_classes and copy them to test_downsampled.
downsample_validation_by_class(
    source_dir="/content/drive/MyDrive/plant_village_dataset/test",
    dest_dir="/content/drive/MyDrive/plant_village_dataset/test_downsampled",
    allowed_classes=plantdoc_classes
)

In [None]:
# Crop_Disease_Detection test set: keep only the class folders listed in
# plantdoc_classes and copy them to (test)Crop_Disease_Detection_downsampled.
downsample_validation_by_class(
    source_dir="/content/drive/MyDrive/plant_village_dataset/(test)Crop_Disease_Detection",
    dest_dir="/content/drive/MyDrive/plant_village_dataset/(test)Crop_Disease_Detection_downsampled",
    allowed_classes=plantdoc_classes
)

In [None]:
# Field-PlantVillage test set: keep only the class folders listed in
# plantdoc_classes and copy them to (test)Field-PlantVillage_downsampled.
downsample_validation_by_class(
    source_dir="/content/drive/MyDrive/plant_village_dataset/(test)Field-PlantVillage",
    dest_dir="/content/drive/MyDrive/plant_village_dataset/(test)Field-PlantVillage_downsampled",
    allowed_classes=plantdoc_classes
)