## Libraries

In [None]:
import os
import shutil
from sklearn.model_selection import train_test_split
import numpy as np
import cv2 as cv
from matplotlib import pyplot as plt

## Create directories

In [None]:
# Source dataset root (walked as <category>/<subcategory>/<images> by later cells)
# and the root folder for the split copies.
dataset_path, output_path = "./Datasets", "./split_dataset"

# Kept for reference only: the train/val/test folders are created in the
# "Path Directories" cell, so this loop stays disabled.
# for split in ["train", "val", "test"]:
#     os.makedirs(os.path.join(output_path, split), exist_ok=True)

## Splitting

### Train-Test-Validation Split Ratio

In [None]:
# Target fractions of the images for the train / validation / test sets.
train_ratio = 0.7
val_ratio = 0.2
test_ratio = 0.1

# Fail fast on a typo: together the three fractions must cover the whole dataset.
# A tolerance is needed because 0.7 + 0.2 + 0.1 is not exactly 1.0 in floating point.
assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-9, "split ratios must sum to 1"

### Path Directories

In [None]:
# Root folder of the split output; every split gets its own subfolder beneath it.
split_dataset_path = "split_dataset"

train_path, val_path, test_path = (
    os.path.join(split_dataset_path, split_name)
    for split_name in ("train", "val", "test")
)

# Make sure the root and each split folder exist (a no-op when already present).
for _split_dir in (split_dataset_path, train_path, test_path, val_path):
    os.makedirs(_split_dir, exist_ok=True)

In [None]:
# Copy every image under dataset_path/<category>/<subcategory>/ into the matching
# <category>/<subcategory>/ folder of the train, val or test split.
IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg')
SPLIT_SEED = 42  # fixed seed so the split can be reproduced

# The second split only sees what is left after the test images were removed,
# so the validation fraction is rescaled to that remainder
# (0.2 / (0.7 + 0.2) with the ratios defined above).
val_share_of_remainder = val_ratio / (train_ratio + val_ratio)

split_roots = {"train": train_path, "val": val_path, "test": test_path}

for category in sorted(os.listdir(dataset_path)):
    category_path = os.path.join(dataset_path, category)

    if not os.path.isdir(category_path):
        continue

    # Create the category folder in every split, even if it ends up empty.
    for split_root in split_roots.values():
        os.makedirs(os.path.join(split_root, category), exist_ok=True)

    # Loop through each subcategory (e.g., ripe, unripe)
    for subcategory in sorted(os.listdir(category_path)):
        subcategory_path = os.path.join(category_path, subcategory)

        if not os.path.isdir(subcategory_path):
            continue

        split_dirs = {
            split_name: os.path.join(split_root, category, subcategory)
            for split_name, split_root in split_roots.items()
        }
        for split_dir in split_dirs.values():
            os.makedirs(split_dir, exist_ok=True)

        # Sorted so the seeded shuffle does not depend on os.listdir() ordering
        # (which is arbitrary); extensions are matched case-insensitively so
        # files such as "IMG_01.JPG" are not silently dropped.
        image_list = sorted(
            os.path.join(subcategory_path, img)
            for img in os.listdir(subcategory_path)
            if img.lower().endswith(IMAGE_EXTENSIONS)
        )

        if len(image_list) < 3:
            # train_test_split raises ValueError when one side of a split would
            # be empty, which happens here with fewer than three images.
            if image_list:
                print("Only {} image(s) in '{}': copied to train without splitting.".format(
                    len(image_list), subcategory_path))
            split_images = {"train": image_list, "val": [], "test": []}
        else:
            # Split the image list into train+validation and test sets
            train_val_images, test_images = train_test_split(
                image_list, test_size=test_ratio, random_state=SPLIT_SEED
            )
            # Split the train+validation set into train and validation sets
            train_images, val_images = train_test_split(
                train_val_images, test_size=val_share_of_remainder, random_state=SPLIT_SEED
            )
            split_images = {"train": train_images, "val": val_images, "test": test_images}

        # Copy images to their respective directories
        for split_name, images in split_images.items():
            for image in images:
                shutil.copy(image, os.path.join(split_dirs[split_name], os.path.basename(image)))

print("Dataset split complete. Files saved inside 'split_dataset' directory.")

## Pre-processing

### Resizing

Images resized (224x224)

In [None]:
# Edit the target size accordingly

TARGET_SIZE = (224, 224)  # (width, height), the order cv.resize expects

def resize_imgs(img, out_path, target_size=None):
    """Resize an image and write the result to disk.

    Args:
        img: Image array accepted by ``cv.resize``.
        out_path: Destination file path.
        target_size: Optional ``(width, height)``; ``TARGET_SIZE`` is used when omitted.

    Returns:
        The resized image array.

    Raises:
        OSError: If ``cv.imwrite`` reports that the file was not written.
    """
    size = TARGET_SIZE if target_size is None else target_size
    img_resized = cv.resize(img, size)

    # cv.imwrite reports a failed write (e.g. the destination folder does not
    # exist) by returning False rather than raising, so check it explicitly.
    if not cv.imwrite(out_path, img_resized):
        raise OSError("Could not write resized image to {!r}".format(out_path))

    return img_resized

#### Masking (cv2.grabCut), Hue & Saturation extraction, Normalization

Our images have complex, diverse backgrounds, so we use GrabCut, a powerful segmentation tool, to extract the foreground.

We convert BGR to HSV, extract Hue (ripeness) and Saturation/Value (for fruit differentiation).


In [None]:

def apply_grabcut(image_path, output_path):
    """Extract the foreground with GrabCut, save it resized, and return HSV features.

    Args:
        image_path: Path of the image to read.
        output_path: Path where the masked, resized image is written.

    Returns:
        Float array of shape ``(TARGET_SIZE[1], TARGET_SIZE[0], 3)`` holding the
        hue, saturation and value channels of the masked image, each divided by
        its 8-bit OpenCV scale (180, 255, 255).

    Raises:
        AssertionError: If the image could not be read.
    """
    # Load the image
    img = cv.imread(image_path)
    assert img is not None, "File could not be read, check with os.path.exists()"

    height, width = img.shape[:2]

    # Initial mask plus the background / foreground models GrabCut updates in place
    mask = np.zeros((height, width), np.uint8)
    bgdModel = np.zeros((1, 65), np.float64)
    fgdModel = np.zeros((1, 65), np.float64)

    # cv.grabCut takes the rectangle as (x, y, width, height), so leaving a
    # border of `margin` pixels on every side means shrinking each dimension
    # by 2 * margin (adjust the margin as needed).
    margin = 5
    rect = (margin, margin, width - 2 * margin, height - 2 * margin)

    # Apply GrabCut algorithm
    cv.grabCut(img, mask, rect, bgdModel, fgdModel, 5, cv.GC_INIT_WITH_RECT)

    # Binary mask: 0 for definite/probable background, 1 for foreground
    foreground = np.where((mask == cv.GC_BGD) | (mask == cv.GC_PR_BGD), 0, 1).astype('uint8')

    # Apply the mask to the image to extract the foreground (background becomes black)
    result = img * foreground[:, :, np.newaxis]

    # Save the masked image at the target size
    resize_imgs(result, output_path)

    # Convert the masked image, at the same target size, from BGR to HSV
    hsv = cv.cvtColor(cv.resize(result, TARGET_SIZE), cv.COLOR_BGR2HSV)

    # Normalize per channel: 8-bit OpenCV hue is stored as degrees / 2 (0-179),
    # saturation and value use 0-255. Broadcasting divides the last axis.
    hsv_normalized = hsv / np.array([180.0, 255.0, 255.0])

    return hsv_normalized

 

In [7]:
   
resized_dataset_path = "masked_resized_dataset"  # Output directory for processed images
image_extensions = ('.png', '.jpg', '.jpeg')

# Walk dataset_path/<category>/<subcategory>/ and write a masked, resized copy of
# every image to the same relative location under resized_dataset_path.
# Listings are sorted so the processing order is the same on every run.
for category in sorted(os.listdir(dataset_path)):
    category_path = os.path.join(dataset_path, category)

    if not os.path.isdir(category_path):
        continue

    for subcategory in sorted(os.listdir(category_path)):
        subcategory_path = os.path.join(category_path, subcategory)

        if not os.path.isdir(subcategory_path):
            continue

        # Create a new directory for masked and resized images
        new_subcategory_path = os.path.join(resized_dataset_path, category, subcategory)
        os.makedirs(new_subcategory_path, exist_ok=True)

        # Process each image in the subcategory folder
        for image_name in sorted(os.listdir(subcategory_path)):
            # Case-insensitive match so files such as "IMG_01.JPG" are not skipped.
            if not image_name.lower().endswith(image_extensions):
                continue

            image_path = os.path.join(subcategory_path, image_name)
            # Deliberately not called `output_path`: that name holds the
            # notebook-level split folder setting and must not be overwritten here.
            masked_image_path = os.path.join(new_subcategory_path, image_name)

            apply_grabcut(image_path, masked_image_path)

print("Masking and resizing complete. Images saved in 'masked_resized_dataset/'.")

KeyboardInterrupt: 