<a href="https://colab.research.google.com/github/melihhguvenn/Plant-Species-Classification/blob/main/code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Set the path of the dataset folder on Google Drive and the output folder for the processed (split) dataset, then define the train, validation, and test paths and create them if they do not exist.

In [None]:
import os
from pathlib import Path

# Folder on Google Drive that holds the source images.
DATASET_PATH = Path("/content/drive/MyDrive/dataset")
# Root folder on Google Drive that receives the train/val/test copies.
SPLIT_DATASET_PATH = Path("/content/drive/MyDrive/split_dataset")

TRAIN_PATH, VAL_PATH, TEST_PATH = (
    SPLIT_DATASET_PATH / split_name for split_name in ("train", "val", "test")
)

# Create each split folder (and any missing parents); existing folders are kept.
for split_dir in (TRAIN_PATH, VAL_PATH, TEST_PATH):
    split_dir.mkdir(parents=True, exist_ok=True)

Extract the image paths and their labels, then build a DataFrame for easier manipulation of the images.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import shutil
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Collect every file matching "*.jpg" exactly one folder below DATASET_PATH.
# Sorting fixes the row order, so any later seeded shuffle or split does not
# depend on the order in which the filesystem happens to list the files.
IMAGE_PATH_LIST = sorted(DATASET_PATH.glob("*/*.jpg"))
if not IMAGE_PATH_LIST:
    # Fail here with a clear message instead of much later on an empty frame.
    raise FileNotFoundError(
        f"No .jpg images found under {DATASET_PATH}; "
        "check that Google Drive is mounted and the dataset path is correct."
    )

images_path = list(IMAGE_PATH_LIST)
# The label is the full name of the folder containing the image. `.name` is
# used instead of `.stem` because `.stem` drops everything after the last dot,
# which would truncate (and possibly merge) folder names such as "Acer.rubrum".
labels = [img_path.parent.name for img_path in IMAGE_PATH_LIST]

# One row per image: its path (a pathlib.Path object) and its class label.
dataset_df = pd.DataFrame({'image_path': images_path, 'label': labels})

Split the dataset into 80% training, 10% validation, and 10% test sets, stratified by label.

In [None]:
# First cut: keep 80% for training and hold out 20%, stratified by label.
train_df, temp_df = train_test_split(
    dataset_df, test_size=0.2, random_state=42, stratify=dataset_df['label']
)
# Second cut: divide the held-out 20% evenly into validation and test sets.
val_df, test_df = train_test_split(
    temp_df, test_size=0.5, random_state=42, stratify=temp_df['label']
)

# Report the size of the full dataset and of each split, one per line.
print(
    f"Total Images: {len(dataset_df)}",
    f"Training Set: {len(train_df)} images",
    f"Validation Set: {len(val_df)} images",
    f"Test Set: {len(test_df)} images",
    sep="\n",
)

Create directory structure using labels

In [None]:
def create_dir_structure(base_path, labels):
    """Create one sub-directory of ``base_path`` for each entry in ``labels``.

    ``base_path`` must support the ``/`` operator with a label (as
    ``pathlib.Path`` does). Missing parent directories are created as needed,
    and directories that already exist are left untouched.
    """
    for class_name in labels:
        (base_path / class_name).mkdir(parents=True, exist_ok=True)

# Every split folder gets the same set of class sub-folders, one per distinct label.
unique_labels = dataset_df['label'].unique()
for split_root in (TRAIN_PATH, VAL_PATH, TEST_PATH):
    create_dir_structure(split_root, unique_labels)

In [None]:
Copy images to directories

In [None]:
def copy_images(df, dest_dir):
    """Copy every image listed in ``df`` into ``dest_dir/<label>/``.

    ``df`` needs an ``image_path`` column holding path objects (their ``.name``
    is used as the destination file name) and a ``label`` column naming the
    sub-folder of ``dest_dir`` to copy into. The label sub-folders are not
    created here.
    """
    for image_path, class_name in zip(df['image_path'], df['label']):
        target = dest_dir / class_name / image_path.name
        shutil.copy(str(image_path), str(target))

# Copy each split's images into its matching folder, in train/val/test order.
for split_df, split_root in (
    (train_df, TRAIN_PATH),
    (val_df, VAL_PATH),
    (test_df, TEST_PATH),
):
    copy_images(split_df, split_root)

Augment the training set to equalize the class counts, using resizing, shear, rotation, shifts, zoom, and horizontal flips. Augmented images are saved with the prefix `aug_`.

In [None]:
# Oversample every class up to the size of the largest training class by
# writing randomly transformed copies of its images next to the originals.
class_counts = train_df['label'].value_counts()
max_count = class_counts.max()
image_size = 224
aug_prefix = 'aug_'

# The augmentation settings are identical for every class, so the generator
# configuration is built once rather than once per class.
datagen = ImageDataGenerator(
    rotation_range=30,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)

for label in class_counts.index:
    class_dir = TRAIN_PATH / label
    class_files = sorted(
        f for f in os.listdir(class_dir) if os.path.isfile(class_dir / f)
    )
    # Use only the copied originals as sources: files written by an earlier
    # run of this cell carry the augmentation prefix and must not be
    # augmented a second time.
    source_images = [f for f in class_files if not f.startswith(aug_prefix)]

    # Top up from what is already on disk, so re-running the cell adds only
    # the images still missing instead of another full batch.
    num_to_generate = max_count - len(class_files)
    if num_to_generate <= 0:
        continue
    if not source_images:
        print(f"Skipping {label}: no source images found in {class_dir}.")
        continue

    print(f"Augmenting {label}: {len(class_files)} -> {max_count} images")
    df_aug = pd.DataFrame({'filename': source_images})
    generator = datagen.flow_from_dataframe(
        df_aug,
        directory=str(class_dir),
        x_col='filename',
        y_col=None,
        target_size=(image_size, image_size),
        class_mode=None,
        batch_size=1,
        save_to_dir=str(class_dir),
        save_prefix=aug_prefix,
        save_format='jpg',
        shuffle=True
    )
    # With batch_size=1 and save_to_dir set, each batch drawn from the
    # generator writes one augmented image into class_dir.
    num_generated = 0
    for _ in range(num_to_generate):
        next(generator)
        num_generated += 1
    print(f"Generated {num_generated} images for class '{label}'.")

print("Data augmentation and oversampling completed.")

Check whether any class imbalance remains after augmentation.

In [None]:
# Count the directory entries in each class folder of the training split.
balanced_class_counts = {}
for class_name in unique_labels:
    balanced_class_counts[class_name] = len(os.listdir(TRAIN_PATH / class_name))

print("Balanced Training Set Class Distribution:")
for class_name, n_entries in balanced_class_counts.items():
    print(f"{class_name}: {n_entries}")