# Model training - Semantic Segmentation

This script trains a semantic segmentation model on the COCO dataset.

It loads the pre-processed data from disk, builds the model network, and trains it.

- It uses masks that were built by assigning each pixel in the image to the class it belongs to (one mask per image).
- It uses sparse cross-entropy loss (because of the point above, and because each pixel belongs to exactly one class).

In [96]:
import os
import cv2
import torch
import json
import numpy as np
from torch.utils.data import Dataset
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
import PyQt5

In [97]:
%matplotlib qt

## Create Dataset and Dataloader

In [98]:
class COCOSegmentationDataset(Dataset):
    """Paired image / semantic-mask dataset, zero-padded to a fixed square size.

    Each item is an ``(image, mask)`` tuple:

    * ``image``: ``float32`` tensor of shape ``(3, target_size, target_size)``
      scaled to ``[0, 1]``. Channels stay in OpenCV's BGR order because the
      file is read with ``cv2.imread`` and never converted.
    * ``mask``: ``int64`` tensor of shape ``(target_size, target_size)`` with
      one value per pixel; padded pixels are ``0``.

    NOTE(review): masks are read from ``.jpg`` files. JPEG is lossy, so the
    stored class ids may be perturbed by compression -- confirm how the masks
    were written, and prefer a lossless format such as PNG.
    """

    def __init__(self, image_dir, mask_dir, transform=None, target_size=700):
        """
        Args:
            image_dir (str): Path to the directory containing images.
            mask_dir (str): Path to the directory containing the masks. A mask
                must have the same filename as the image it belongs to.
            transform (callable, optional): Optional transform applied to the
                image tensor of shape (C, H, W) after padding. The mask is
                left untouched, so only use transforms that keep the geometry.
            target_size (int): Side length in pixels of the padded square
                output. Defaults to 700.

        Raises:
            ValueError: If ``target_size`` is not a positive number.
        """
        if target_size <= 0:
            raise ValueError(f"target_size must be positive, got {target_size}")

        self.image_dir = image_dir
        self.mask_dir = mask_dir
        self.transform = transform
        self.target_size = target_size
        self.image_files = sorted(f for f in os.listdir(image_dir) if f.endswith(".jpg"))
        # Kept for inspection only: __getitem__ derives the mask path from the
        # image filename rather than from this list.
        self.mask_files = sorted(f for f in os.listdir(mask_dir) if f.endswith(".jpg"))

    def __len__(self):
        """
        Return the number of files in the image dataset (each image corresponds to one mask)
        """
        return len(self.image_files)

    @staticmethod
    def _pad_to_size(array, size):
        """Zero-pad the first two axes of ``array`` to ``size`` x ``size``, centred.

        When the required padding is odd, the extra row/column goes to the
        bottom/right. Any axes after the first two are left unpadded.

        Raises:
            ValueError: If ``array`` is larger than ``size`` along either of
                its first two axes.
        """
        height, width = array.shape[:2]
        if height > size or width > size:
            raise ValueError(
                f"Array of size {height}x{width} does not fit in the "
                f"{size}x{size} target; increase target_size or resize the data."
            )
        top = (size - height) // 2
        left = (size - width) // 2
        pad_widths = [(top, size - height - top), (left, size - width - left)]
        pad_widths += [(0, 0)] * (array.ndim - 2)
        return np.pad(array, pad_widths, mode="constant", constant_values=0)

    def __getitem__(self, idx):
        """Load, normalise and pad the image/mask pair at position ``idx``.

        Returns:
            tuple: ``(image_tensor, mask_tensor)`` as described on the class.

        Raises:
            FileNotFoundError: If the image or its mask cannot be read.
            ValueError: If the image or mask is larger than ``target_size``.
        """
        image_name = self.image_files[idx]
        image_path = os.path.join(self.image_dir, image_name)
        # Image and mask share the same filename.
        mask_path = os.path.join(self.mask_dir, image_name)

        # cv2.imread signals failure by returning None instead of raising.
        image_original = cv2.imread(image_path)
        if image_original is None:
            raise FileNotFoundError(f"Could not read image: {image_path}")
        mask_original = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)
        if mask_original is None:
            raise FileNotFoundError(f"Could not read mask: {mask_path}")

        # Scale pixel intensities to [0, 1] before padding.
        image_normalized = image_original.astype(np.float32) / 255.0
        image_padded = self._pad_to_size(image_normalized, self.target_size)

        # Pad (rather than interpolate) the mask so its values stay categorical.
        mask_padded = self._pad_to_size(mask_original, self.target_size)

        # (H, W, C) -> (C, H, W) for the image; integer tensor for the mask.
        image_tensor = torch.tensor(image_padded, dtype=torch.float32).permute(2, 0, 1)
        mask_tensor = torch.tensor(mask_padded, dtype=torch.long)

        if self.transform is not None:
            image_tensor = self.transform(image_tensor)

        return image_tensor, mask_tensor

In [99]:
# Pre-processed images and masks, split into validation and training sets.
image_val_dir = "/home/maver02/Development/Datasets/COCO/preprocess_coco_2_v1/val/images"
mask_val_dir = "/home/maver02/Development/Datasets/COCO/preprocess_coco_2_v1/val/masks"
image_train_dir = "/home/maver02/Development/Datasets/COCO/preprocess_coco_2_v1/train/images"
masks_train_dir = "/home/maver02/Development/Datasets/COCO/preprocess_coco_2_v1/train/masks"
# COCO annotation files (despite the `_dir` suffix these are JSON file paths).
instances_val_dir = "/home/maver02/Development/Datasets/COCO/annotations/instances_val2017.json"
# Previously pointed at the validation annotations; the training split has its own file.
instances_train_dir = "/home/maver02/Development/Datasets/COCO/annotations/instances_train2017.json"

In [100]:
# Build the evaluation dataset from the validation images and their masks.
test_data = COCOSegmentationDataset(
    image_dir=image_val_dir,
    mask_dir=mask_val_dir,
)

In [101]:
# Serve the dataset in shuffled batches of 64 samples.
test_dataloader = DataLoader(
    dataset=test_data,
    batch_size=64,
    shuffle=True,
)

In [102]:
# Load the COCO instance annotations for the validation split.
# The encoding is given explicitly so the result does not depend on the
# platform's default text encoding.
with open(instances_val_dir, 'r', encoding='utf-8') as file:
    val_instances_json = json.load(file)

In [104]:
# Inspect the top-level sections of the loaded annotation file.
val_instances_json.keys()

dict_keys(['info', 'licenses', 'images', 'annotations', 'categories'])

In [108]:
# Map each category id (the pixel value used in the masks) to its name.
# Id 0 is reserved for pixels that belong to no annotated category.
categories_dict = {
    0: 'unknown',
    **{entry['id']: entry['name'] for entry in val_instances_json['categories']},
}

## Visualise data in Dataloader

In [111]:
# Display image and label.
test_features, test_masks = next(iter(test_dataloader))
print(f"Feature batch shape: {test_features.size()}")
print(f"Labels batch shape: {test_masks.size()}")

# Convert (C, H, W) to (H, W, C). The dataset reads images with cv2.imread,
# which yields BGR channel order, so reverse the channels to get the RGB
# order that matplotlib expects.
image = np.ascontiguousarray(test_features[0].permute(1, 2, 0).numpy()[..., ::-1])
mask = test_masks[0].numpy()

fig, axes = plt.subplots(1, 2, figsize=(10, 5))

# Display image
ax1 = axes[0]
im1 = ax1.imshow(image)
ax1.set_title("Image")
ax1.axis("off")

# Display mask
ax2 = axes[1]
im2 = ax2.imshow(mask, cmap="gray")
ax2.set_title("Mask")
ax2.axis("off")


def _to_pixel(x, y):
    """Return the (column, row) of the pixel under data coordinates (x, y).

    imshow centres pixel i on coordinate i, so it spans [i - 0.5, i + 0.5);
    rounding to the nearest integer (not truncating) picks the right pixel.
    """
    return int(np.floor(x + 0.5)), int(np.floor(y + 0.5))


# Functions to show pixel values on hover
def format_coord_image(x, y):
    """Status-bar text for the image axes: pixel position and its RGB value."""
    x, y = _to_pixel(x, y)
    if 0 <= x < image.shape[1] and 0 <= y < image.shape[0]:
        pixel_value = image[y, x]
        return f"x={x}, y={y}, RGB={pixel_value}"
    return ""


def format_coord_mask(x, y):
    """Status-bar text for the mask axes: pixel position and its category name."""
    x, y = _to_pixel(x, y)
    if 0 <= x < mask.shape[1] and 0 <= y < mask.shape[0]:
        pixel_value = mask[y, x]
        # The mask stores category ids directly, so no rescaling is needed.
        category = categories_dict.get(int(pixel_value), "Unknown")
        return f"x={x}, y={y}, Category={category}"
    return ""


ax1.format_coord = format_coord_image
ax2.format_coord = format_coord_mask

plt.show()


Feature batch shape: torch.Size([64, 3, 700, 700])
Labels batch shape: torch.Size([64, 700, 700])


## Training