# Create the dataset structure

```
monuseg_2018/
├── images/
│   ├── train/
│   ├── val/
│   └── test/         # optional
├── labels/
│   ├── train/
│   ├── val/
│   └── test/         # optional
└── data.yaml
```


In [1]:
# Create the image/label folders for each split.
# `-p` creates missing parents and does not fail when a folder already exists,
# so this cell can be re-run safely (e.g. after Restart & Run All).
!mkdir -p images/train images/val images/test
!mkdir -p labels/train labels/val labels/test

# Create an empty data.yaml placeholder if it does not exist yet.
!touch data.yaml

!tree

[01;34m.[0m
├── [00mdata.yaml[0m
├── [01;34mimages[0m
│   ├── [01;34mtest[0m
│   ├── [01;34mtrain[0m
│   └── [01;34mval[0m
├── [01;34mlabels[0m
│   ├── [01;34mtest[0m
│   ├── [01;34mtrain[0m
│   └── [01;34mval[0m
└── [00mmonuseg_data_processing.ipynb[0m

8 directories, 2 files


In [18]:
# Folders that will hold the raw XML annotations, one per split.
# `mkdir -p` also creates the parent `annotations/` directory.
!mkdir -p annotations/train annotations/val annotations/test

!tree

[01;34m.[0m
├── [01;34mannotations[0m
│   ├── [01;34mtest[0m
│   ├── [01;34mtrain[0m
│   └── [01;34mval[0m
├── [00mdata.yaml[0m
├── [01;34mimages[0m
│   ├── [01;34mtest[0m
│   │   ├── [01;35mTCGA-2Z-A9J9-01A-01-TS1.tif[0m
│   │   ├── [01;35mTCGA-44-2665-01B-06-BS6.tif[0m
│   │   ├── [01;35mTCGA-69-7764-01A-01-TS1.tif[0m
│   │   ├── [01;35mTCGA-A6-6782-01A-01-BS1.tif[0m
│   │   ├── [01;35mTCGA-AC-A2FO-01A-01-TS1.tif[0m
│   │   ├── [01;35mTCGA-AO-A0J2-01A-01-BSA.tif[0m
│   │   ├── [01;35mTCGA-CU-A0YN-01A-02-BSB.tif[0m
│   │   ├── [01;35mTCGA-EJ-A46H-01A-03-TSC.tif[0m
│   │   ├── [01;35mTCGA-FG-A4MU-01B-01-TS1.tif[0m
│   │   ├── [01;35mTCGA-GL-6846-01A-01-BS1.tif[0m
│   │   ├── [01;35mTCGA-HC-7209-01A-01-TS1.tif[0m
│   │   ├── [01;35mTCGA-HT-8564-01Z-00-DX1.tif[0m
│   │   ├── [01;35mTCGA-IZ-8196-01A-01-BS1.tif[0m
│   │   └── [01;35mTCGA-ZF-A9R5-01A-01-TS1.tif[0m
│   ├── [01;34mtrain[0m
│   │   ├── [01;32mTCGA-18-5592-01Z-00-DX1.tif[0m
│   │ 

# Modify data.yaml

In [3]:
%%writefile data.yaml

path: ../datasets/monuseg_2018  # or '.' if running from dataset dir
train: images/train
val: images/val
test: images/test  # optional held-out split (images are copied there below)

nc: 1  # number of classes
names: ['nucleus']  # replace with your class name

Overwriting data.yaml


# Copy the train, val, and test images and annotations


In [19]:
import os
import glob
import shutil
# Root of the raw MoNuSeg 2018 download. Set the MONUSEG_ROOT environment
# variable to run this notebook on another machine; the default keeps the
# original location.
monuseg_root = os.environ.get(
    "MONUSEG_ROOT",
    "/home/mitun/Documents/thesis_oulu/datasets/MoNuseg 2018",
)

# Raw training data: images and XML annotations live in sibling folders.
train_image_dir = os.path.join(monuseg_root, "train", "Tissue Images")
train_xml_dir = os.path.join(monuseg_root, "train", "Annotations")

# Raw test data: images and XML annotations share a single folder.
test_image_dir = os.path.join(monuseg_root, "test")
test_xml_dir = os.path.join(monuseg_root, "test")

# Destination folders (relative to the notebook's working directory).
new_train_image_dir = "images/train"
new_val_image_dir = "images/val"
new_test_image_dir = "images/test"

new_train_xml_dir = "annotations/train"
new_val_xml_dir = "annotations/val"
new_test_xml_dir = "annotations/test"

# Patient IDs (file-name prefixes) whose images go to the training split.
# Every other image found in the raw training folder is used for validation.
patient_ids = [
    "TCGA-A7-A13E",
    "TCGA-A7-A13F",
    "TCGA-AR-A1AK",
    "TCGA-AR-A1AS",
    "TCGA-E2-A1B5",
    "TCGA-E2-A14V",
    "TCGA-B0-5711",
    "TCGA-HE-7128",
    "TCGA-HE-7129",
    "TCGA-HE-7130",
    "TCGA-B0-5710",
    "TCGA-B0-5698",
    "TCGA-18-5592",
    "TCGA-38-6178",
    "TCGA-49-4488",
    "TCGA-50-5931",
    "TCGA-21-5784",
    "TCGA-21-5786",
    "TCGA-G9-6336",
    "TCGA-G9-6348",
    "TCGA-G9-6356",
    "TCGA-G9-6363",
    "TCGA-CH-5767",
    "TCGA-G9-6362",
    "TCGA-DK-A2I6",
    "TCGA-G2-A2EK",
    "TCGA-AY-A8YK",
    "TCGA-NH-A8F7",
    "TCGA-KB-A93J",
    "TCGA-RD-A8N9",
]
# Copy train images (patient-wise split of the raw training folder).

# Make sure the destination folders exist: if one is missing, shutil.copy would
# silently create a regular *file* with that name instead of copying into it.
for out_dir in (new_train_image_dir, new_train_xml_dir,
                new_val_image_dir, new_val_xml_dir):
    os.makedirs(out_dir, exist_ok=True)

# Only .tif files are images; ignore any other entry in the raw folder.
train_images = {
    name for name in os.listdir(train_image_dir) if name.endswith(".tif")
}

print("Train Images:", len(train_images))

for patient_id in patient_ids:
    # Search for this patient's image(s) in train_image_dir. The directory part
    # is escaped so glob metacharacters in the path are taken literally.
    matches = sorted(
        glob.glob(os.path.join(glob.escape(train_image_dir), f"{patient_id}*.tif"))
    )
    if not matches:
        raise FileNotFoundError(
            f"No training image found for patient {patient_id!r} in {train_image_dir!r}"
        )

    # Send every image of the patient to train so that no image of a training
    # patient is left behind and ends up in the validation split.
    for image_path in matches:
        file_name = os.path.basename(image_path)

        # Whatever remains in the set afterwards becomes the validation split.
        train_images.discard(file_name)

        # Copy the image and its annotation (same stem, .xml extension).
        xml_name = os.path.splitext(file_name)[0] + ".xml"
        shutil.copy(image_path, new_train_image_dir)
        shutil.copy(os.path.join(train_xml_dir, xml_name), new_train_xml_dir)

print("Validation Images:", len(train_images))
# Copy the rest of the files to the val directory (sorted for a stable order).
for file in sorted(train_images):
    xml_name = os.path.splitext(file)[0] + ".xml"
    shutil.copy(os.path.join(train_image_dir, file), new_val_image_dir)
    shutil.copy(os.path.join(train_xml_dir, xml_name), new_val_xml_dir)


# Copy test images and their XML annotations.

# Make sure the destination folders exist: if one is missing, shutil.copy would
# silently create a regular *file* with that name instead of copying into it.
for out_dir in (new_test_image_dir, new_test_xml_dir):
    os.makedirs(out_dir, exist_ok=True)

# glob returns full paths (not bare file names); the directory part is escaped
# so glob metacharacters in the path are taken literally.
test_images = set(glob.glob(os.path.join(glob.escape(test_image_dir), "*.tif")))
print("Test Images:", len(test_images))
for image_path in sorted(test_images):
    shutil.copy(image_path, new_test_image_dir)
    # Build the annotation path from the bare file name so test_xml_dir is
    # actually honoured; joining it with an absolute image path would discard it.
    xml_name = os.path.splitext(os.path.basename(image_path))[0] + ".xml"
    shutil.copy(os.path.join(test_xml_dir, xml_name), new_test_xml_dir)


Train Images: 37
Validation Images: 7
Test Images: 14


# Build YOLO labels from the XML files

In [None]:
import os
import cv2
import numpy as np
from PIL import Image
import torch
import torchvision
from torchvision import tv_tensors
from torch.utils.data import Dataset
import torchvision.transforms.v2 as T
import xml.etree.ElementTree as ET
from torchvision.transforms.v2 import functional as F

class MoNuSegDataset(Dataset):
    """Dataset pairing tissue images with region annotations stored as XML.

    Each item is ``(image, target)``. ``target`` is a dict with one entry per
    annotated region: ``"boxes"`` (XYXY bounding boxes), ``"masks"`` (binary
    masks) and ``"labels"`` (all ones, i.e. a single foreground class).

    Args:
        image_dir (str): Directory containing the image files.
        xml_dir (str): Directory containing one XML annotation per image,
            named after the image with an ``.xml`` extension.
        transform (callable, optional): Called as ``transform(image, target)``
            and expected to return the transformed ``(image, target)`` pair.
        image_extensions (tuple of str): File extensions (with leading dot,
            matched case-insensitively) that identify image files.
    """

    def __init__(self, image_dir, xml_dir, transform=None,
                 image_extensions=(".tif", ".tiff")):
        self.image_dir = image_dir
        self.xml_dir = xml_dir
        self.transform = transform

        # Keep only regular image files: a plain os.listdir() would also return
        # stray entries such as Jupyter's ".ipynb_checkpoints" directory, which
        # would then fail to load in __getitem__.
        extensions = tuple(ext.lower() for ext in image_extensions)
        self.image_names = sorted(
            name
            for name in os.listdir(image_dir)
            if name.lower().endswith(extensions)
            and os.path.isfile(os.path.join(image_dir, name))
        )

    def __len__(self):
        """Return the number of images in the dataset."""
        return len(self.image_names)

    def __getitem__(self, idx):
        """Load image ``idx`` and build its detection/segmentation target.

        Returns:
            tuple: ``(image, target)`` where ``image`` is a ``tv_tensors.Image``
            and ``target`` is a dict with keys ``"boxes"``, ``"masks"`` and
            ``"labels"``. An image without usable annotations yields empty
            tensors of shape ``(0, 4)``, ``(0, H, W)`` and ``(0,)``.
        """
        img_name = self.image_names[idx]
        # Swap only the extension; a substring replace could also hit ".tif"
        # elsewhere in the name and mishandles ".tiff"/".TIF".
        xml_name = os.path.splitext(img_name)[0] + ".xml"

        img_path = os.path.join(self.image_dir, img_name)
        xml_path = os.path.join(self.xml_dir, xml_name)

        # Load the image; the context manager closes the underlying file as
        # soon as the RGB copy has been converted to a tensor.
        with Image.open(img_path) as img_file:
            image = tv_tensors.Image(img_file.convert("RGB"))
        canvas_size = tuple(F.get_size(image))  # (height, width)

        # Get contours from XML (empty list if the file is missing/unparsable)
        contours = self.get_contours_from_xml(xml_path)

        # Create one binary mask per contour, always shaped (N, H, W)
        if contours:
            masks = torch.from_numpy(
                np.stack([self.contour_to_mask(contour, canvas_size) for contour in contours])
            )
        else:
            masks = torch.zeros((0, *canvas_size), dtype=torch.uint8)

        # Drop regions that rasterise to an empty mask (e.g. a contour lying
        # entirely outside the image): masks_to_boxes cannot take the min/max
        # of an empty set of pixels and would raise.
        keep = (masks.flatten(1) > 0).any(dim=1)
        masks = masks[keep]

        # Generate bounding boxes from the masks
        boxes = torchvision.ops.masks_to_boxes(masks)

        # Single foreground class: every region gets label 1
        labels = torch.ones((len(boxes),), dtype=torch.int64)

        # Define the target dictionary
        target = {}
        target["boxes"] = tv_tensors.BoundingBoxes(boxes, format="XYXY", canvas_size=canvas_size)  # float
        target["masks"] = tv_tensors.Mask(masks)  # dtype=torch.uint8
        target["labels"] = labels  # dtype=torch.int64

        # Apply transformations if any
        if self.transform:
            image, target = self.transform(image, target)

        return image, target

    @staticmethod
    def get_contours_from_xml(xml_file):
        """
        Parses an XML annotation file and extracts region vertices.

        Regions without any vertex are skipped. Vertex coordinates are read as
        floats and truncated to ``int32``. A missing or malformed file is
        reported on stdout and treated as "no annotations".

        Args:
            xml_file (str): Path to the annotation XML file.

        Returns:
            list of numpy.ndarray: Each array is of shape (n_points, 2),
            containing (x, y) coordinates of one annotated region.
        """
        xy = []
        try:
            root = ET.parse(xml_file).getroot()

            for region in root.findall('.//Annotation/Regions/Region'):
                coords = [
                    (float(vertex.get('X')), float(vertex.get('Y')))
                    for vertex in region.findall('./Vertices/Vertex')
                ]
                if coords:
                    xy.append(np.array(coords, dtype=np.int32))

        except FileNotFoundError:
            print(f"Error: XML file not found: {xml_file}")
            return []
        except ET.ParseError:
            print(f"Error: Could not parse XML file: {xml_file}")
            return []

        return xy

    @staticmethod
    def contour_to_mask(contour, mask_size):
        """
        Converts a contour (polygon) to a binary mask.

        Args:
            contour (numpy.ndarray): List of (x, y) vertices defining the contour.
            mask_size (tuple): Size of the output mask (height, width).

        Returns:
            numpy.ndarray: ``uint8`` mask with 1 inside the filled contour and
            0 elsewhere.
        """
        mask = np.zeros(mask_size, dtype=np.uint8)
        cv2.fillPoly(mask, [np.array(contour, dtype=np.int32)], 1)
        return mask



# Instantiate one dataset per split (train, val, test), each reading from the
# image folder and the XML annotation folder of that split.
train_dataset, val_dataset, test_dataset = (
    MoNuSegDataset(image_dir=split_image_dir, xml_dir=split_xml_dir)
    for split_image_dir, split_xml_dir in (
        (new_train_image_dir, new_train_xml_dir),
        (new_val_image_dir, new_val_xml_dir),
        (new_test_image_dir, new_test_xml_dir),
    )
)

