<a href="https://colab.research.google.com/github/asma-10/Identification-des-elements-non-vegetaux-et-des-organes-des-plantes-en-utilisant-YOLOv8/blob/main/Yolo_kfold_and_hyperparams_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
import os

# Mount Google Drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install ultralytics
# Install and update Ultralytics and Ray Tune packages
#!pip install -U ultralytics "ray[tune]"




**Replacing class names in labeling files by integers**

In [None]:

def replace_class_with_integer(file_path):
    # Define a mapping of class names to integers
    class_mapping = {'leaf': 0, 'flower': 1, 'fruit': 2, 'seed': 3, 'stem': 4,'root':5}

    # Read the content of the file
    with open(file_path, 'r') as file:
        lines = file.readlines()

    # Process each line and replace the class name with the corresponding integer
    modified_lines = []
    for line in lines:
        parts = line.strip().split()
        if parts and parts[0] in class_mapping:
            parts[0] = str(class_mapping[parts[0]])
            modified_lines.append(' '.join(parts))

    # Write the modified content back to the file
    with open(file_path, 'w') as file:
        file.write('\n'.join(modified_lines))

def process_folder(folder_path):
    # Iterate through all files in the folder
    for file_name in os.listdir(folder_path):
        if file_name.endswith('.txt'):
            file_path = os.path.join(folder_path, file_name)
            replace_class_with_integer(file_path)

# Replace classes with integers in all text files in the specified folder
folder_path = '/content/drive/MyDrive/data/val/labels'
process_folder(folder_path)


**Training the YOLOv8 Model on data Using Kfold Cross Validation**

In [None]:
import datetime
import shutil
from pathlib import Path
from collections import Counter

import yaml
import numpy as np
import pandas as pd
from ultralytics import YOLO
from sklearn.model_selection import KFold

In [90]:
dataset_path = Path('/content/drive/MyDrive/data') # replace with 'path/to/dataset' for your custom data
labels = sorted(dataset_path.rglob("*labels/*.txt")) # all data in 'labels'

In [115]:
yaml_file = '/content/drive/MyDrive/data/dataset.yml'  # your data YAML with data directories and names dictionary
with open(yaml_file, 'r', encoding="utf8") as y:
    classes = yaml.safe_load(y)['names']
cls_idx = sorted(classes.keys())

In [116]:
indx = [l.stem for l in labels] # uses base filename as ID (no extension)
labels_df = pd.DataFrame([], columns=cls_idx, index=indx)

In [117]:

for label in labels:
    lbl_counter = Counter()

    with open(label, 'r') as lf:
        lines = lf.readlines()

    for l in lines:
        # Classes for YOLO label use an integer at the first position of each line
        lbl_counter[int(l.split(' ')[0])] += 1

    # Create a DataFrame from the Counter
    lbl_df = pd.DataFrame.from_dict(lbl_counter, orient='index', columns=[label.stem])

    # Transpose the DataFrame and assign it to labels_df
    labels_df = labels_df.append(lbl_df.T, sort=False)

# Fill NaN values with 0.0
labels_df = labels_df.fillna(0.0)

In [122]:
ksplit = 3
kf = KFold(n_splits=ksplit, shuffle=True)   # setting random_state for repeatable results

kfolds = list(kf.split(labels_df))

In [123]:
folds = [f'split_{n}' for n in range(1, ksplit + 1)]
folds_df = pd.DataFrame(index=indx, columns=folds)

for idx, (train, val) in enumerate(kfolds, start=1):
    folds_df[f'split_{idx}'].loc[labels_df.iloc[train].index] = 'train'
    folds_df[f'split_{idx}'].loc[labels_df.iloc[val].index] = 'val'

In [124]:
fold_lbl_distrb = pd.DataFrame(index=folds, columns=cls_idx)

for n, (train_indices, val_indices) in enumerate(kfolds, start=1):
    train_totals = labels_df.iloc[train_indices].sum()
    val_totals = labels_df.iloc[val_indices].sum()

    # To avoid division by zero, we add a small value (1E-7) to the denominator
    ratio = val_totals / (train_totals + 1E-7)
    fold_lbl_distrb.loc[f'split_{n}'] = ratio

In [125]:
supported_extensions = ['.jpg', '.jpeg', '.png']

# Initialize an empty list to store image file paths
images = []

# Loop through supported extensions and gather image files
for ext in supported_extensions:
    images.extend(sorted((dataset_path / 'images').rglob(f"*{ext}")))

# Create the necessary directories and dataset YAML files (unchanged)
save_path = Path(dataset_path / f'{datetime.date.today().isoformat()}_{ksplit}-Fold_Cross-val')
save_path.mkdir(parents=True, exist_ok=True)
ds_yamls = []

for split in folds_df.columns:
    # Create directories
    split_dir = save_path / split
    split_dir.mkdir(parents=True, exist_ok=True)
    (split_dir / 'train' / 'images').mkdir(parents=True, exist_ok=True)
    (split_dir / 'train' / 'labels').mkdir(parents=True, exist_ok=True)
    (split_dir / 'val' / 'images').mkdir(parents=True, exist_ok=True)
    (split_dir / 'val' / 'labels').mkdir(parents=True, exist_ok=True)

    # Create dataset YAML files
    dataset_yaml = split_dir / f'{split}_dataset.yaml'
    ds_yamls.append(dataset_yaml)

    with open(dataset_yaml, 'w') as ds_y:
        yaml.safe_dump({
            'path': split_dir.as_posix(),
            'train': 'train',
            'val': 'val',
            'names': classes
        }, ds_y)

In [126]:
import os
import shutil

data_path = '/content/drive/MyDrive/data'  # Replace with the actual path to your data folder

# Assuming images and labels are folders inside the data folder
images_path = os.path.join(data_path, 'images')
labels_path = os.path.join(data_path, 'labels')

# Iterate over the files in the 'images' folder
for image_filename in os.listdir(images_path):
    image_path = os.path.join(images_path, image_filename)

    # Extract the filename without extension (stem) and extension separately
    image_stem, image_ext = os.path.splitext(image_filename)

    # Assuming corresponding label filenames match the image filenames
    label_filename = image_stem + '.txt'  # Adjust the extension if needed
    label_path = os.path.join(labels_path, label_filename)

    # Assuming `folds_df` has columns 'split' and 'k_split' in each row
    try:
        split_value = folds_df.loc[image_stem, 'split']
        k_split_value = folds_df.loc[image_stem, 'ksplit']
    except KeyError:
        # Handle the case where the keys are not present in the DataFrame
        print(f"Keys 'split' and 'k_split' not found for {image_stem}. Skipping.")
        continue

    # Destination directory
    img_to_path = os.path.join(save_path, str(split_value), str(k_split_value), 'images')
    lbl_to_path = os.path.join(save_path, str(split_value), str(k_split_value), 'labels')

    # Copy image and label files to new directory (SamefileError if file already exists)
    shutil.copy(image_path, os.path.join(img_to_path, image_filename))
    shutil.copy(label_path, os.path.join(lbl_to_path, label_filename))


Keys 'split' and 'k_split' not found for 436951439. Skipping.
Keys 'split' and 'k_split' not found for 436951425. Skipping.
Keys 'split' and 'k_split' not found for 436951420. Skipping.
Keys 'split' and 'k_split' not found for 436951406. Skipping.
Keys 'split' and 'k_split' not found for 436951394. Skipping.
Keys 'split' and 'k_split' not found for 436951393. Skipping.
Keys 'split' and 'k_split' not found for 436951392. Skipping.
Keys 'split' and 'k_split' not found for 436951492. Skipping.
Keys 'split' and 'k_split' not found for 436951456. Skipping.
Keys 'split' and 'k_split' not found for 436951449. Skipping.
Keys 'split' and 'k_split' not found for 436951625. Skipping.
Keys 'split' and 'k_split' not found for 436951989. Skipping.
Keys 'split' and 'k_split' not found for 436951577. Skipping.
Keys 'split' and 'k_split' not found for 436951539. Skipping.
Keys 'split' and 'k_split' not found for 436951494. Skipping.
Keys 'split' and 'k_split' not found for 436952282. Skipping.
Keys 'sp

In [127]:
weights_path = '/content/yolov8n.pt'
model = YOLO(weights_path, task='detect')

In [129]:
results = {}

# Define your additional arguments here
batch = 16
project = 'kfold_demo'
epochs = 2

for k in range(ksplit):
    dataset_yaml = ds_yamls[k]
    model.train(data=dataset_yaml,epochs=epochs, batch=batch, project=project)  # include any train arguments
    results[k] = model.metrics  # save output metrics for further analysis

Ultralytics YOLOv8.0.236 🚀 Python-3.10.12 torch-2.1.0+cu121 CPU (Intel Xeon 2.20GHz)
[34m[1mengine/trainer: [0mtask=detect, mode=train, model=/content/yolov8n.pt, data=/content/drive/MyDrive/data/2024-01-07_3-Fold_Cross-val/split_1/split_1_dataset.yaml, epochs=2, time=None, patience=50, batch=16, imgsz=640, save=True, save_period=-1, cache=False, device=None, workers=0, project=kfold_demo, name=train53, exist_ok=False, pretrained=True, optimizer=auto, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=None, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, save_hybrid=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, vid_stride=1, stream_buffer=False, visualize=False, augment=False, agnostic_nms=False, classes=None, retina_masks=False, embed=None, show=False, save_frames=False, save_

[34m[1mtrain: [0mScanning /content/drive/MyDrive/data/2024-01-07_3-Fold_Cross-val/split_1/train/labels.cache... 237 images, 38 backgrounds, 0 corrupt: 100%|██████████| 275/275 [00:00<?, ?it/s]

[34m[1malbumentations: [0mBlur(p=0.01, blur_limit=(3, 7)), MedianBlur(p=0.01, blur_limit=(3, 7)), ToGray(p=0.01), CLAHE(p=0.01, clip_limit=(1, 4.0), tile_grid_size=(8, 8))



[34m[1mval: [0mScanning /content/drive/MyDrive/data/2024-01-07_3-Fold_Cross-val/split_1/val/labels.cache... 103 images, 38 backgrounds, 0 corrupt: 100%|██████████| 141/141 [00:00<?, ?it/s]


Plotting labels to kfold_demo/train53/labels.jpg... 
[34m[1moptimizer:[0m 'optimizer=auto' found, ignoring 'lr0=0.01' and 'momentum=0.937' and determining best 'optimizer', 'lr0' and 'momentum' automatically... 
[34m[1moptimizer:[0m AdamW(lr=0.001, momentum=0.9) with parameter groups 57 weight(decay=0.0), 64 weight(decay=0.0005), 63 bias(decay=0.0)
2 epochs...

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


        1/2         0G      1.735      1.976       1.28         88        640: 100%|██████████| 18/18 [05:11<00:00, 17.31s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 5/5 [00:43<00:00,  8.66s/it]

                   all        141       3109      0.461      0.153      0.105     0.0541






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


        2/2         0G      1.695      1.863      1.266         98        640: 100%|██████████| 18/18 [04:50<00:00, 16.12s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 5/5 [00:44<00:00,  8.89s/it]

                   all        141       3109      0.497      0.178      0.118     0.0583






2 epochs completed in 0.193 hours.
Optimizer stripped from kfold_demo/train53/weights/last.pt, 6.2MB
Optimizer stripped from kfold_demo/train53/weights/best.pt, 6.2MB

Validating kfold_demo/train53/weights/best.pt...
Ultralytics YOLOv8.0.236 🚀 Python-3.10.12 torch-2.1.0+cu121 CPU (Intel Xeon 2.20GHz)
Model summary (fused): 168 layers, 3006818 parameters, 0 gradients, 8.1 GFLOPs


                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 5/5 [00:40<00:00,  8.06s/it]


                   all        141       3109      0.496      0.178      0.118     0.0583
                  leaf        141       1594      0.356      0.642      0.414      0.237
                flower        141        588      0.196      0.167     0.0949     0.0348
                 fruit        141        241      0.166     0.0373     0.0489     0.0229
                  seed        141          2          1          0          0          0
                  stem        141        675      0.259      0.224       0.15     0.0553
                  root        141          9          1          0          0          0
Speed: 2.3ms preprocess, 196.2ms inference, 0.0ms loss, 21.4ms postprocess per image
Results saved to [1mkfold_demo/train53[0m
New https://pypi.org/project/ultralytics/8.0.237 available 😃 Update with 'pip install -U ultralytics'
Ultralytics YOLOv8.0.236 🚀 Python-3.10.12 torch-2.1.0+cu121 CPU (Intel Xeon 2.20GHz)
[34m[1mengine/trainer: [0mtask=detect, mode=train, model=/

[34m[1mtrain: [0mScanning /content/drive/MyDrive/data/2024-01-07_3-Fold_Cross-val/split_2/train/labels.cache... 247 images, 38 backgrounds, 0 corrupt: 100%|██████████| 285/285 [00:00<?, ?it/s]

[34m[1malbumentations: [0mBlur(p=0.01, blur_limit=(3, 7)), MedianBlur(p=0.01, blur_limit=(3, 7)), ToGray(p=0.01), CLAHE(p=0.01, clip_limit=(1, 4.0), tile_grid_size=(8, 8))



[34m[1mval: [0mScanning /content/drive/MyDrive/data/2024-01-07_3-Fold_Cross-val/split_2/val/labels.cache... 94 images, 37 backgrounds, 0 corrupt: 100%|██████████| 131/131 [00:00<?, ?it/s]


Plotting labels to kfold_demo/train532/labels.jpg... 
[34m[1moptimizer:[0m 'optimizer=auto' found, ignoring 'lr0=0.01' and 'momentum=0.937' and determining best 'optimizer', 'lr0' and 'momentum' automatically... 
[34m[1moptimizer:[0m AdamW(lr=0.001, momentum=0.9) with parameter groups 57 weight(decay=0.0), 64 weight(decay=0.0005), 63 bias(decay=0.0)
2 epochs...

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


        1/2         0G      1.673      1.818      1.253        448        640: 100%|██████████| 18/18 [05:25<00:00, 18.06s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 5/5 [00:40<00:00,  8.18s/it]

                   all        131       2645      0.513      0.195      0.131     0.0621






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


        2/2         0G      1.709      1.731      1.257        409        640: 100%|██████████| 18/18 [05:05<00:00, 16.97s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 5/5 [00:40<00:00,  8.03s/it]

                   all        131       2645      0.506       0.21      0.138     0.0689






2 epochs completed in 0.199 hours.
Optimizer stripped from kfold_demo/train532/weights/last.pt, 6.2MB
Optimizer stripped from kfold_demo/train532/weights/best.pt, 6.2MB

Validating kfold_demo/train532/weights/best.pt...
Ultralytics YOLOv8.0.236 🚀 Python-3.10.12 torch-2.1.0+cu121 CPU (Intel Xeon 2.20GHz)
Model summary (fused): 168 layers, 3006818 parameters, 0 gradients, 8.1 GFLOPs


                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 5/5 [00:41<00:00,  8.27s/it]


                   all        131       2645      0.507       0.21      0.138     0.0688
                  leaf        131       1330      0.397      0.664      0.447      0.267
                flower        131        584      0.226      0.111     0.0864     0.0303
                 fruit        131        167      0.152      0.138     0.0901     0.0393
                  seed        131          7          1          0          0          0
                  stem        131        544      0.266      0.347        0.2     0.0754
                  root        131         13          1          0    0.00228    0.00105
Speed: 3.8ms preprocess, 222.0ms inference, 0.0ms loss, 21.6ms postprocess per image
Results saved to [1mkfold_demo/train532[0m
New https://pypi.org/project/ultralytics/8.0.237 available 😃 Update with 'pip install -U ultralytics'
Ultralytics YOLOv8.0.236 🚀 Python-3.10.12 torch-2.1.0+cu121 CPU (Intel Xeon 2.20GHz)
[34m[1mengine/trainer: [0mtask=detect, mode=train, model=

[34m[1mtrain: [0mScanning /content/drive/MyDrive/data/2024-01-07_3-Fold_Cross-val/split_3/train/labels.cache... 233 images, 39 backgrounds, 0 corrupt: 100%|██████████| 272/272 [00:00<?, ?it/s]

[34m[1malbumentations: [0mBlur(p=0.01, blur_limit=(3, 7)), MedianBlur(p=0.01, blur_limit=(3, 7)), ToGray(p=0.01), CLAHE(p=0.01, clip_limit=(1, 4.0), tile_grid_size=(8, 8))



[34m[1mval: [0mScanning /content/drive/MyDrive/data/2024-01-07_3-Fold_Cross-val/split_3/val/labels.cache... 105 images, 39 backgrounds, 0 corrupt: 100%|██████████| 144/144 [00:00<?, ?it/s]


Plotting labels to kfold_demo/train5322/labels.jpg... 
[34m[1moptimizer:[0m 'optimizer=auto' found, ignoring 'lr0=0.01' and 'momentum=0.937' and determining best 'optimizer', 'lr0' and 'momentum' automatically... 
[34m[1moptimizer:[0m AdamW(lr=0.001, momentum=0.9) with parameter groups 57 weight(decay=0.0), 64 weight(decay=0.0005), 63 bias(decay=0.0)
2 epochs...

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


        1/2         0G      1.724      1.718      1.257        587        640: 100%|██████████| 17/17 [05:08<00:00, 18.13s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 5/5 [00:44<00:00,  8.81s/it]

                   all        144       2962        0.5       0.22      0.134     0.0656






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


        2/2         0G      1.636      1.743      1.254        527        640: 100%|██████████| 17/17 [04:50<00:00, 17.10s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 5/5 [00:44<00:00,  8.97s/it]

                   all        144       2962      0.502      0.239      0.143     0.0703






2 epochs completed in 0.193 hours.
Optimizer stripped from kfold_demo/train5322/weights/last.pt, 6.2MB
Optimizer stripped from kfold_demo/train5322/weights/best.pt, 6.2MB

Validating kfold_demo/train5322/weights/best.pt...
Ultralytics YOLOv8.0.236 🚀 Python-3.10.12 torch-2.1.0+cu121 CPU (Intel Xeon 2.20GHz)
Model summary (fused): 168 layers, 3006818 parameters, 0 gradients, 8.1 GFLOPs


                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 5/5 [00:45<00:00,  9.03s/it]


                   all        144       2962        0.5      0.241      0.143     0.0702
                  leaf        144       1525      0.383      0.693      0.462      0.264
                flower        144        512      0.121      0.283     0.0848     0.0313
                 fruit        144        261      0.277      0.115      0.117     0.0537
                  seed        144          1          1          0          0          0
                  stem        144        645      0.221      0.353      0.191     0.0713
                  root        144         18          1          0    0.00211   0.000844
Speed: 3.3ms preprocess, 221.9ms inference, 0.0ms loss, 24.9ms postprocess per image
Results saved to [1mkfold_demo/train5322[0m


In [None]:
from sklearn.model_selection import ParameterGrid
import os

# Define hyperparameters to tune
learning_rates = [0.0001, 0.001, 0.01]
batch_sizes = [32, 64, 128]
optimizers = ['adam', 'sgd']
epochs = [50, 100, 150]
network_architectures = ['yolov8', 'yolov8-tiny']
activation_functions = ['relu', 'leaky_relu']
loss_functions = ['binary_crossentropy', 'focal_loss']
data_augmentation = [True, False]

# Set up the search space
search_space = {'learning_rate': learning_rates,
                'batch_size': batch_sizes,
                'optimizer': optimizers,
                'epochs': epochs,
                'network_architecture': network_architectures,
                'activation_function': activation_functions,
                'loss_function': loss_functions,
                'data_augmentation': data_augmentation}

# Create parameter grid
parameter_grid = ParameterGrid(search_space)

# Train and evaluate the model for each combination of hyperparameters
for parameters in parameter_grid:
    # Set hyperparameters for the model
    learning_rate = parameters['learning_rate']
    batch_size = parameters['batch_size']
    optimizer = parameters['optimizer']
    num_epochs = parameters['epochs']
    architecture = parameters['network_architecture']
    activation = parameters['activation_function']
    loss = parameters['loss_function']
    augmentation = parameters['data_augmentation']

    # Train and evaluate the model
    train_yolo_v8(learning_rate, batch_size, optimizer, num_epochs, architecture, activation, loss, augmentation)
    mAP = evaluate_yolo_v8()

    # Save the results
    result = {'learning_rate': learning_rate,
              'batch_size': batch_size,
              'optimizer': optimizer,
              'num_epochs': num_epochs,
               'architecture': architecture,
              'activation':activation,
              'loss': loss,
              'augmentation':augmentation}