# Splitting the PlantVillage Dataset into Training, Validation, and Test Sets for Deep Learning

In [1]:
import os
import shutil
import random
from collections import defaultdict

In [None]:
# Counting images for each class:
def count_images_per_class(directory):
    """
    Counts the number of image files in each class directory within the specified directory.

    Only regular files are counted; nested sub-directories inside a class folder
    are ignored so they do not inflate the image count. Entries of `directory`
    that are not directories themselves are skipped.

    Args:
    - directory (str): Path to the main directory containing subdirectories for each class.

    Returns:
    - dict: A dictionary (a `defaultdict(int)`) where keys are class names and values are
      the number of files in each class. Keys are inserted in sorted order.

    Raises:
    - OSError: If `directory` (or a class folder) cannot be listed.
    """
    class_counts = defaultdict(int)
    # os.listdir returns entries in arbitrary order; sort for a stable report.
    for class_name in sorted(os.listdir(directory)):
        class_path = os.path.join(directory, class_name)
        if not os.path.isdir(class_path):
            continue
        class_counts[class_name] = sum(
            1
            for entry in os.listdir(class_path)
            if os.path.isfile(os.path.join(class_path, entry))
        )
    return class_counts

# Location of the raw, unsplit dataset (one sub-folder per class)
raw_dir = '../data/raw'

# Tally the entries found in every class folder
class_counts = count_images_per_class(raw_dir)

# Report one line per class
for class_name in class_counts:
    count = class_counts[class_name]
    print(f"{class_name}: {count} images")

Apple___Apple_scab: 630 images
Apple___Black_rot: 621 images
Apple___Cedar_apple_rust: 275 images
Apple___healthy: 1645 images
Blueberry___healthy: 1502 images
Cherry_(including_sour)___healthy: 854 images
Cherry_(including_sour)___Powdery_mildew: 1052 images
Corn_(maize)___Cercospora_leaf_spot Gray_leaf_spot: 513 images
Corn_(maize)___Common_rust_: 1192 images
Corn_(maize)___healthy: 1162 images
Corn_(maize)___Northern_Leaf_Blight: 985 images
Grape___Black_rot: 1180 images
Grape___Esca_(Black_Measles): 1383 images
Grape___healthy: 423 images
Grape___Leaf_blight_(Isariopsis_Leaf_Spot): 1076 images
Orange___Haunglongbing_(Citrus_greening): 5507 images
Peach___Bacterial_spot: 2297 images
Peach___healthy: 360 images
Pepper,_bell___Bacterial_spot: 997 images
Pepper,_bell___healthy: 1478 images
Potato___Early_blight: 1000 images
Potato___healthy: 152 images
Potato___Late_blight: 1000 images
Raspberry___healthy: 371 images
Soybean___healthy: 5090 images
Squash___Powdery_mildew: 1835 images
S

In [None]:
# Split dataset into training, validation, and test sets
def split_data(source_dir, train_dir, val_dir, test_dir, train_ratio=0.7, val_ratio=0.15, seed=None):
    """
    Splits images in the source directory into training, validation, and test sets.

    Each class folder in `source_dir` is shuffled and copied (not moved) into
    `<train_dir>/<class>`, `<val_dir>/<class>` and `<test_dir>/<class>`. The test
    set receives whatever remains after the training and validation shares.

    Args:
    - source_dir (str): Path to the source directory containing subdirectories for each class.
    - train_dir (str): Path to the directory where training images will be stored.
    - val_dir (str): Path to the directory where validation images will be stored.
    - test_dir (str): Path to the directory where test images will be stored.
    - train_ratio (float): Proportion of images allocated to the training set.
    - val_ratio (float): Proportion of images allocated to the validation set.
    - seed (int | None): If given, the shuffle uses a private generator seeded with this
      value so the same split is reproduced on every run. If None, the global `random`
      state is used.

    Raises:
    - ValueError: If a ratio is outside [0, 1] or the two ratios sum to more than 1.

    Note:
    Files already present in the destination folders are never removed. Re-running
    with a different shuffle can therefore leave the same image in more than one
    subset; clear the destination folders first or reuse the same `seed`.
    """
    if not (0 <= train_ratio <= 1 and 0 <= val_ratio <= 1):
        raise ValueError("train_ratio and val_ratio must each be between 0 and 1")
    # Small tolerance so that pairs such as 0.7 + 0.3 are not rejected by float rounding.
    if train_ratio + val_ratio > 1 + 1e-9:
        raise ValueError("train_ratio + val_ratio must not exceed 1")

    # A private generator keeps a seeded split from disturbing the global random state.
    rng = random if seed is None else random.Random(seed)

    # Create destination folders if they do not exist
    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(val_dir, exist_ok=True)
    os.makedirs(test_dir, exist_ok=True)

    # Sorted listings make the shuffle depend only on the seed, not on filesystem order.
    for class_name in sorted(os.listdir(source_dir)):
        class_path = os.path.join(source_dir, class_name)
        if not os.path.isdir(class_path):
            continue

        # Only regular files can be copied; shutil.copy fails on nested directories.
        images = sorted(
            entry
            for entry in os.listdir(class_path)
            if os.path.isfile(os.path.join(class_path, entry))
        )
        rng.shuffle(images)

        # Calculate counts for each set; the test set takes the remainder.
        train_count = int(len(images) * train_ratio)
        val_count = int(len(images) * val_ratio)
        val_end = train_count + val_count

        subsets = (
            (train_dir, images[:train_count]),
            (val_dir, images[train_count:val_end]),
            (test_dir, images[val_end:]),
        )

        # Class folders are created even when a subset is empty.
        for dest_root, subset in subsets:
            dest_class_dir = os.path.join(dest_root, class_name)
            os.makedirs(dest_class_dir, exist_ok=True)
            for image in subset:
                shutil.copy(
                    os.path.join(class_path, image),
                    os.path.join(dest_class_dir, image),
                )

# Destination folders for the three subsets
train_dir, val_dir, test_dir = '../data/train', '../data/val', '../data/test'

# Copy the raw images into the train / validation / test folders
split_data(
    source_dir=raw_dir,
    train_dir=train_dir,
    val_dir=val_dir,
    test_dir=test_dir,
)