# Create datasets for training/validation/testing

In [None]:
# Root folder with images (empty placeholder here).
IMAGES_ROOT = ""
# Root folder with labels (empty placeholder here).
MASKS_ROOT = ""
# Folder to save the segmentation dataset.
SEG_DS_SAVE_PATH = "../data/yolo_train"

### Imports/utils

In [None]:
import sys

# Put ../src at the front of the module search path
# (presumably where the project modules imported in the next cell live - confirm).
sys.path.insert(0, "../src")

In [None]:
import pathlib
import os
import shutil
import random
import yaml

import cv2
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import matplotlib.pyplot as plt

from dataset import (
    build_dataset,
    save_dataset,
    load_dataset,
    DEFAULT_DS_PATH,
)

from saveload import read_image, read_masks
from masks import draw_joined_masks_on_image, mask_joined_to_masks_dict

### 1. Prepare the dataset file (build it and split into train/val/test if not done before)

In [None]:
# Reuse the dataset file when it already exists; otherwise build it and save it.
if pathlib.Path(DEFAULT_DS_PATH).exists():
    # Pass the path explicitly so the file that was checked is the file that is loaded.
    ds = load_dataset(DEFAULT_DS_PATH, images_root=IMAGES_ROOT, masks_root=MASKS_ROOT)
    print(f"Loaded dataset {DEFAULT_DS_PATH} with {len(ds)} items")
    # A bare expression inside an `if` body is not echoed by the notebook,
    # so the sample has to be displayed explicitly.
    display(ds.sample(5))
    # Fail early if any referenced image or mask file is missing.
    for _, r in ds.iterrows():
        assert pathlib.Path(
            r.image_path
        ).exists(), f"Path {r.image_path} does not exist"
        assert pathlib.Path(r.mask_path).exists(), f"Path {r.mask_path} does not exist"

else:
    print(f"Building dataset {DEFAULT_DS_PATH}")

    ds = build_dataset(IMAGES_ROOT, MASKS_ROOT)
    save_dataset(ds, DEFAULT_DS_PATH, IMAGES_ROOT, MASKS_ROOT)

    # Round-trip check: the saved file must load back equal to what was built.
    loaded_ds = load_dataset(DEFAULT_DS_PATH, IMAGES_ROOT, MASKS_ROOT)
    assert ds.equals(loaded_ds)

    display(ds)

##### Calculate statistics

In [None]:
# Image count per role, with its percentage of the whole dataset
for nn_role in ("train", "val", "test"):
    # Summing the boolean match column counts the rows with this role.
    count = int(ds["nn_role"].eq(nn_role).sum())
    share = count / len(ds) * 100
    print(f"Number of {nn_role} images: {count} ({share:.2f}%)")

In [None]:
# Number of sequences in each role; a sequence is one distinct (plant, rep) pair
for nn_role in ["train", "val", "test"]:
    ds_role = ds[ds["nn_role"] == nn_role]
    ngroups = ds_role.groupby(["plant", "rep"]).ngroups
    # A role without any rows has zero groups; report 0.00 instead of
    # raising ZeroDivisionError.
    ratio = len(ds_role) / ngroups if ngroups else 0.0

    print(
        f"Number of sequences in {nn_role}: {ngroups} ({ratio:.2f} avg. images per sequence)"
    )

### 2. Resave train/val data to local files for YOLO segmentation network training 

In [None]:
# Destination folder of the exported dataset.
out_path = pathlib.Path(SEG_DS_SAVE_PATH)
# Stop here rather than write into an already existing export.
assert not os.path.exists(os.fspath(out_path)), "Output path already exists"
# Class id for the leaf; it is the first token of every label line written below.
USE_ID = 0


class ContoursExtractor:
    """Find the largest external contour of a mask after a dilate/erode pass."""

    def __init__(self, erosion=5):
        """Keep ``erosion`` and build an elliptical structuring element from it.

        Args:
            erosion: Used as both width and height of the kernel passed to
                ``cv2.getStructuringElement``.
        """
        self.erosion = erosion
        kernel_size = (erosion, erosion)
        self.kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, kernel_size)

    def get_biggest_contour(self, mask):
        """Return the contour of ``mask`` that has the largest area.

        Args:
            mask: Mask array; it is cast to ``np.uint8`` before processing.

        Returns:
            The entry of the ``cv2.findContours`` result with the greatest
            ``cv2.contourArea``.

        Raises:
            ValueError: If no contour is found in the mask.
        """
        binary = mask.astype(np.uint8)

        # Dilate, then erode with the same kernel: joins the parts of a leaf
        # that a visible stem has split into several pieces.
        joined = cv2.erode(
            cv2.dilate(binary, self.kernel, iterations=1),
            self.kernel,
            iterations=1,
        )

        found, _hierarchy = cv2.findContours(
            joined, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
        )
        if not found:
            raise ValueError("No contours found")

        return max(found, key=cv2.contourArea)


## Load the dataset and keep only the rows used for training and validation
ds = load_dataset(DEFAULT_DS_PATH, IMAGES_ROOT, MASKS_ROOT)
ds = ds[ds["nn_role"].isin(["train", "val"])]
contours_extractor = ContoursExtractor()

for _, row in tqdm(ds.iterrows(), total=len(ds)):
    # Only the masks are needed: the image file is copied as-is below, so
    # decoding it first would be wasted I/O on every row.
    masks = read_masks(row)

    source_image = pathlib.Path(row["image_path"])
    # Images and labels share the same <role>/<plant>/<rep> sub-folder layout.
    sample_subdir = pathlib.Path(row["nn_role"]) / row["plant"] / row["rep"]

    image_output_path = out_path / "images" / sample_subdir / source_image.name
    image_output_path.parent.mkdir(parents=True, exist_ok=True)
    shutil.copy(source_image, image_output_path)

    # Label file: same base name as the image, with a .txt extension.
    label_output_path = (
        out_path / "labels" / sample_subdir / (source_image.stem + ".txt")
    )
    label_output_path.parent.mkdir(parents=True, exist_ok=True)

    with open(label_output_path, "w") as f:
        for m in masks.values():
            mask = m["segmentation"]
            imgheight, imgwidth = mask.shape[0], mask.shape[1]
            contour = contours_extractor.get_biggest_contour(mask)

            # One line per mask: class id followed by x y pairs, each divided
            # by the mask width / height.
            coords = " ".join(
                f"{x/imgwidth} {y/imgheight}" for (x, y) in contour[:, 0, :]
            )
            f.write(f"{USE_ID} {coords}\n")

##### Save the dataset config
using absolute paths

In [None]:
# Show the absolute location of the exported dataset. Built from
# SEG_DS_SAVE_PATH so it cannot drift from the folder actually used.
p = pathlib.Path(SEG_DS_SAVE_PATH)
p.absolute().resolve()

In [None]:
# Ultralytics YOLO dataset format
# Train/val/test sets as 1) dir: path/to/imgs, 2) file: path/to/imgs.txt, or 3) list: [path/to/imgs1, path/to/imgs2, ..]
dataset_paths = {
    "path": str(pathlib.Path(SEG_DS_SAVE_PATH).absolute().resolve()),  # dataset root dir
    "train": "images/train",  # train images (relative to 'path')
    "val": "images/val",  # val images (relative to 'path')
    # Classes
    "names": {0: "leaf"},
}

yaml_path = pathlib.Path("../data_meta/yolo_train_ds.yaml")
# Create the target folder if needed so open() does not fail when it is missing.
yaml_path.parent.mkdir(parents=True, exist_ok=True)
with open(yaml_path, "w", encoding="utf-8") as f:
    # The dict holds only plain str/int/dict values, so safe_dump is sufficient.
    yaml.safe_dump(dataset_paths, f)

### Sanity check
View a random saved image

In [None]:
# Seed Python's RNG; the next cell picks its image with random.choice.
random.seed(1)

In [None]:
# Exported training images. Sorted because glob order depends on the
# filesystem, and the seeded random.choice should pick the same file each run.
images = sorted(out_path.glob("images/train/*/*/*.png"))
if not images:
    raise FileNotFoundError(f"No exported .png images found under {out_path}")
img_file = random.choice(images)

# Mirror the image's location under "labels" rather than string-replacing
# "images" in the whole path, which would rewrite every occurrence of that word.
label_file = (
    out_path / "labels" / img_file.relative_to(out_path / "images")
).with_suffix(".txt")
img_path = str(img_file)
mask_path = str(label_file)

img = read_image({"image_path": img_path})
with open(mask_path, "r") as f:
    mask_lines = f.readlines()

masks = {}
for i, line in enumerate(mask_lines):
    # First token is the class id; the rest are x y pairs scaled to [0, 1].
    nums = [float(value) for value in line.split()[1:]]
    if not nums:
        # Blank line or a line without coordinates: nothing to draw.
        continue
    # Scale the pairs back to pixel coordinates of the loaded image.
    contour = np.array(
        [(x * img.shape[1], y * img.shape[0]) for (x, y) in zip(nums[::2], nums[1::2])]
    )
    contour = contour.astype(np.int32).reshape(-1, 1, 2)
    # Rasterise the polygon into a filled binary mask of the image's size.
    b_mask = np.zeros(img.shape[:2], np.uint8)
    _ = cv2.drawContours(b_mask, [contour], -1, (255, 255, 255), cv2.FILLED)
    masks[i] = {"segmentation": b_mask > 0}
plt.imshow(draw_joined_masks_on_image(img, masks, not_on_image=False))