This is the main notebook, where the data is loaded, investigated, and prepared, and where models are built, trained, and experimented with.

In [28]:
import os
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import cv2
import numpy as np
import torch

In [2]:
# Dataset locations, given as relative paths (resolved against the current working directory)
PATH_IMAGES = os.path.join('Kvasir-SEG', 'images')  # directory passed to load_images (.jpg files)
PATH_LABELS = os.path.join('Kvasir-SEG', 'bbox')    # directory passed to load_labels (.csv files)

## Data Analysing

In [19]:
def load_images(directory_path):
    """
    Collect the paths of all .jpg files located directly inside a directory.

    The extension check is case-insensitive (".JPG" matches as well) and
    sub-directories are not searched.

    Args:
        directory_path: directory to scan.

    Returns:
        Sorted list of paths, each built by joining `directory_path` with a
        matching entry name.
    """
    jpg_paths = [
        os.path.join(directory_path, entry)
        for entry in os.listdir(directory_path)
        if entry.lower().endswith('.jpg')
    ]
    jpg_paths.sort()
    return jpg_paths

# Collect the image paths once; the number of images found is shown as the cell output
image_paths = load_images(PATH_IMAGES)
len(image_paths)

1000

In [None]:
def load_labels(label_dir):
    """
    Load every CSV label file of a directory into a single DataFrame.

    Files are processed in sorted name order. Each file is read with
    `pd.read_csv` and receives an extra "id" column holding the file name
    without its extension, so every row can be traced back to its source
    file. The per-file frames are then concatenated with a fresh index.

    The extension check is case-insensitive (".CSV" matches as well), which
    mirrors how `load_images` matches ".jpg".

    Args:
        label_dir: directory containing the .csv label files.

    Returns:
        DataFrame with all rows of all CSV files plus the "id" column.

    Raises:
        ValueError: if the directory contains no .csv file.
    """
    csv_names = sorted(
        name for name in os.listdir(label_dir)
        if name.lower().endswith(".csv")
    )

    # pd.concat raises an opaque "No objects to concatenate" on an empty list;
    # fail with the same exception type but a message that names the directory
    if not csv_names:
        raise ValueError(f"No .csv label files found in: {label_dir}")

    all_labels = []
    for filename in csv_names:
        file_id = os.path.splitext(filename)[0]
        df = pd.read_csv(os.path.join(label_dir, filename))
        df["id"] = file_id
        all_labels.append(df)

    return pd.concat(all_labels, ignore_index=True)

# Build the combined label table (one row per CSV row, plus the "id" column) and show its shape
labels_df = load_labels(PATH_LABELS)
labels_df.shape

(1071, 6)

Checking whether some images have multiple bounding boxes:

In [11]:
# Number of label rows per image id, largest first; a count above 1 means several boxes for one image
labels_df['id'].value_counts().head()

id
cju3uhb79gcgr0871orbrbi3x    10
cju414lf2l1lt0801rl3hjllj     4
cju32a52lb9rc0799xi40qs00     3
cju0roawvklrq0799vmjorwfv     3
cju43c92lm5cj0755lorsorfg     3
Name: count, dtype: int64

Checking if some images have no bounding box at all:

In [12]:
# ids (file names without extension) of all images found on disk
image_ids = {os.path.splitext(os.path.basename(path))[0] for path in image_paths}
# ids that occur at least once in the label table
labeled_ids = set(labels_df['id'].unique())
# images for which the label table has no row at all
unlabeled_ids = image_ids - labeled_ids
len(unlabeled_ids)  # should be 0 if all images are labeled

0

## Dynamic Data Loader

In [26]:
class KvasirPolypYOLODataset(Dataset):
    """
    Dataset for Kvasir-SEG in YOLO format.

    __getitem__ returns a 3-tuple:
        - image: Tensor (3, H, W) float32 in [0, 1], RGB channel order
        - targets: Tensor (N, 5) with [class, x_center, y_center, w, h] normalized
                  (N = number of valid bboxes for that image, can be 0)
        - img_id: image file name without extension
    """

    # columns of labels_df holding the pixel-space box corners
    _BOX_COLUMNS = ("xmin", "ymin", "xmax", "ymax")

    def __init__(
        self,
        images_dir,
        labels_df,
        img_size=None,      # optional resizing to (img_size, img_size), 640 is common for YOLO
        default_class=0,
        transforms=None     # optional add transform
    ):
        """
        Args:
            images_dir: directory scanned (via load_images) for .jpg files.
            labels_df: DataFrame with columns "id", "xmin", "ymin", "xmax", "ymax";
                "id" is the image file name without extension.
            img_size: if given, every image is resized to (img_size, img_size).
            default_class: class value written into column 0 of every target row.
            transforms: stored on the instance but NOT applied yet
                (TODO: apply jointly to image and boxes in __getitem__).

        Raises:
            KeyError: if labels_df lacks one of the required columns.
        """
        super().__init__()

        self.images_dir = images_dir
        self.img_paths = load_images(images_dir)
        self.img_size = img_size
        self.default_class = default_class
        self.transforms = transforms

        # id -> DataFrame rows (kept for callers that inspect it) and
        # id -> (N, 4) float64 corner array (used by __getitem__, avoids iterrows per sample)
        self.id_to_boxes, self._id_to_xyxy = self._index_boxes(labels_df)

        # report (not enforce) how many images have labels; only ids that
        # actually correspond to an image on disk are counted as labeled
        image_ids = {os.path.splitext(os.path.basename(p))[0] for p in self.img_paths}
        labeled_ids = image_ids & set(self.id_to_boxes.keys())
        unlabeled_ids = image_ids - labeled_ids
        print(f"Total images: {len(image_ids)}")
        print(f"Images with at least one bbox: {len(labeled_ids)}")
        print(f"Images without bbox: {len(unlabeled_ids)}")

    @classmethod
    def _index_boxes(cls, labels_df):
        """Group labels_df by "id" into per-image DataFrames and (N, 4) float64 corner arrays."""
        required = ("id",) + cls._BOX_COLUMNS
        missing = [col for col in required if col not in labels_df.columns]
        if missing:
            # fail at construction time instead of inside a DataLoader worker
            raise KeyError(f"labels_df is missing required column(s): {missing}")

        frames = {}
        corners = {}
        for img_id, group in labels_df.groupby("id"):
            frames[img_id] = group.reset_index(drop=True)
            corners[img_id] = group[list(cls._BOX_COLUMNS)].to_numpy(dtype=np.float64)
        return frames, corners

    def _yolo_targets(self, img_id, orig_w, orig_h):
        """Return the (N, 5) float32 YOLO target tensor for one image id.

        Corners are normalized by the ORIGINAL image width/height. Resizing to
        (img_size, img_size) scales each axis independently, so these values
        are identical to scaling the corners first and normalizing by the new
        size afterwards.
        """
        corners = self._id_to_xyxy.get(img_id)
        if corners is None or len(corners) == 0:
            # in case of no bboxes put zero-tensor (0,5)
            return torch.zeros((0, 5), dtype=torch.float32)

        xmin, ymin, xmax, ymax = corners.T
        box_w = (xmax - xmin) / orig_w
        box_h = (ymax - ymin) / orig_h
        x_center = xmin / orig_w + box_w / 2.0
        y_center = ymin / orig_h + box_h / 2.0

        # keep only boxes with positive extent; NaN compares False and is dropped too
        keep = (box_w > 0) & (box_h > 0)

        class_col = np.full(box_w.shape, float(self.default_class))
        # [class, x_center, y_center, w, h]
        rows = np.stack([class_col, x_center, y_center, box_w, box_h], axis=1)[keep]
        return torch.as_tensor(rows, dtype=torch.float32)

    def __len__(self):
        """Number of images found in images_dir."""
        return len(self.img_paths)

    def __getitem__(self, idx):
        """Load image `idx` and return (image tensor, YOLO targets, image id).

        Raises:
            RuntimeError: if the image file cannot be read by cv2.
        """
        img_path = self.img_paths[idx]
        img_id = os.path.splitext(os.path.basename(img_path))[0]

        # read image with cv2 (BGR)
        img = cv2.imread(img_path)
        if img is None:
            raise RuntimeError(f"Cannot read image: {img_path}")

        # BGR -> RGB
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        # remember the original size: the boxes are given in original pixels
        orig_h, orig_w = img.shape[:2]

        # optional resizing (for YOLO often 640x640 or similar)
        if self.img_size is not None:
            img = cv2.resize(img, (self.img_size, self.img_size))

        # normalize to [0,1] and change to (C,H,W)
        img = img.astype(np.float32) / 255.0
        img = np.transpose(img, (2, 0, 1))  # (H,W,C) -> (C,H,W)
        img_tensor = torch.from_numpy(img)

        targets = self._yolo_targets(img_id, orig_w, orig_h)

        # TODO: self.transforms is not applied yet (would need to act on img and bbox together)

        return img_tensor, targets, img_id

In [27]:
def yolo_collate_fn(batch):
    """
    Collate (image, targets, img_id) samples into a single YOLO-style batch.

    Args:
        batch: list of (image, targets, img_id) tuples; images must share one
            shape so they can be stacked, targets are (N_i, 5) tensors.

    Returns:
        - images: stacked tensor of shape (B, 3, H, W)
        - all_targets: tensor of shape (M, 6), one row per box,
          [batch_index, class, x_center, y_center, width, height];
          shape (0, 6) float32 when no sample has a box
        - img_ids: list with the image identifier of every sample
    """
    images, img_ids, per_image_targets = [], [], []

    for sample_pos, sample in enumerate(batch):
        img, targets, img_id = sample
        images.append(img)
        img_ids.append(img_id)

        # samples without boxes contribute no target rows
        if targets.numel() == 0:
            continue

        # prepend the sample's position in the batch so each box row can be
        # traced back to its image after concatenation
        n_boxes = targets.size(0)
        index_col = torch.full((n_boxes, 1), sample_pos, dtype=targets.dtype)
        per_image_targets.append(torch.cat([index_col, targets], dim=1))

    images = torch.stack(images, dim=0)  # (B,3,H,W)

    if per_image_targets:
        all_targets = torch.cat(per_image_targets, dim=0)
    else:
        all_targets = torch.zeros((0, 6), dtype=torch.float32)

    return images, all_targets, img_ids

In [None]:
# Build the dataset over all images, using the label table loaded above
dataset = KvasirPolypYOLODataset(
    images_dir=PATH_IMAGES,
    labels_df=labels_df,
    img_size=640,   # every image is resized to 640x640
    default_class=0,
)

# Batches are assembled by yolo_collate_fn because the number of boxes differs per image
dataloader = DataLoader(
    dataset,
    batch_size=8,
    shuffle=True,
    num_workers=2,  # samples are loaded in two worker subprocesses
    collate_fn=yolo_collate_fn
)

# try iterating over a single batch as a smoke test
for images, targets, img_ids in dataloader:
    print("Batch images shape:", images.shape)  # (B,3,H,W)
    print("Targets shape:", targets.shape)      # (M,6) -> [batch_idx, class, x_c, y_c, w, h]
    print("Example img_ids:", img_ids[:3])
    break

Total images: 1000
Images with at least one bbox: 1000
Images without bbox: 0
