In [1]:
import torch
import torchvision
from torchvision.models.detection.mask_rcnn import MaskRCNN_ResNet50_FPN_Weights
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
import cv2
import numpy as np
from utils.general_utils import get_mask_from_rle

In [2]:
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

In [None]:
def get_model(num_classes=2):
    """
    Create a Mask R-CNN model pre-trained on COCO for instance segmentation, 
    customized for the desired number of classes.

    :param num_classes: Number of output classes (including background). 
                        Default is 2 (background + ship).
    :return: Mask R-CNN model with adjusted box and mask predictors.
    """
    model = torchvision.models.detection.maskrcnn_resnet50_fpn(
        weights=MaskRCNN_ResNet50_FPN_Weights.COCO_V1
    )

    in_features = model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor = \
        torchvision.models.detection.faster_rcnn.FastRCNNPredictor(in_features, num_classes)

    in_features_mask = model.roi_heads.mask_predictor.conv5_mask.in_channels
    hidden = 256
    model.roi_heads.mask_predictor = \
        torchvision.models.detection.mask_rcnn.MaskRCNNPredictor(
            in_features_mask, hidden, num_classes
        )

    return model

In [4]:
device = "cuda" if torch.cuda.is_available() else "cpu"

print(device)

model = get_model()
model.to(device)

optimizer = optim.SGD(
    model.parameters(),
    lr=0.005,
    momentum=0.9,
    weight_decay=0.0005
)

lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

writer = SummaryWriter(log_dir="runs/maskrcnn_train")


cpu


In [5]:
from utils.general_utils import get_image

In [None]:
from torch.utils.data import Dataset


class ShipDataset(Dataset):
    """
    PyTorch Dataset for instance segmentation of ships.
    Loads images lazily (one at a time) and decodes RLE masks on-demand.
    """

    def __init__(self, rle_dict: dict[str, list[str]], transforms=None):
        """
        Initialize the dataset.

        :param rle_dict: Dictionary mapping image filenames to lists of RLE strings.
                         Example: {"img1.jpg": ["rle1", "rle2", ...], ...}
        :param transforms: Optional torchvision transforms to apply to the images.
        """
        self.rle_dict = rle_dict
        self.transforms = transforms
        self.img_ids = list(rle_dict.keys())

    def __len__(self) -> int:
        """
        Return the number of images in the dataset.

        :return: Number of images
        """
        return len(self.img_ids)

    def __getitem__(self, idx: int):
        """
        Load one image and all its instance masks, returning the format expected by Mask R-CNN.

        :param idx: Index of the image to load
        :return: Tuple (image_tensor, target_dict) where
                - image_tensor: torch.Tensor of shape [C, H, W], dtype=torch.float32
                - target_dict: dict containing
                - "boxes": Tensor[N, 4] of bounding boxes [x1, y1, x2, y2]
                - "labels": Tensor[N] of class labels (1=ship)
                - "masks": Tensor[N, H, W] of binary masks
                - "image_id": Tensor[1] with image index
                - "area": Tensor[N] of bounding box areas
                - "iscrowd": Tensor[N] indicating crowd instances (all 0 here)
        """
        img_id = self.img_ids[idx]
        rle_list = self.rle_dict[img_id]

        img = get_image(img_id)
        img = img.astype(np.float32) / 255.0
        img_tensor = torch.from_numpy(img).permute(2, 0, 1)  # HWC â†’ CHW

        masks = []
        boxes = []

        for rle in rle_list:
            mask = get_mask_from_rle(rle).astype(np.uint8)

            if mask.sum() == 0:
                continue

            ys, xs = np.where(mask == 1)
            if len(xs) == 0:
                continue
            
            masks.append(mask)

            x1, y1 = xs.min(), ys.min()
            x2, y2 = xs.max(), ys.max()
            boxes.append([x1, y1, x2, y2])
         
        if len(boxes) == 0:
            return None

        masks = torch.as_tensor(np.stack(masks), dtype=torch.uint8)  # [N,H,W]
        boxes = torch.as_tensor(boxes, dtype=torch.float32)          # [N,4]
        labels = torch.ones((len(boxes),), dtype=torch.int64)        # all 1 = ship class

        target = {
            "boxes": boxes,
            "labels": labels,
            "masks": masks,
            "image_id": torch.tensor([idx]),
            "area": (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]),
            "iscrowd": torch.zeros((len(boxes),), dtype=torch.int64),
        }

        return img_tensor, target


In [7]:
from tqdm import tqdm
from torch.utils.data import DataLoader

In [None]:
def train_one_epoch(
    model: torch.nn.Module,
    optimizer: torch.optim.Optimizer,
    data_loader: DataLoader,
    device: str,
    epoch: int
) -> float:
    """
    Train a Mask R-CNN model for one epoch.

    :param model: Mask R-CNN model
    :param optimizer: Optimizer (SGD, Adam, etc.)
    :param data_loader: PyTorch DataLoader yielding (images, targets)
    :param device: Device string, 'cuda' or 'cpu'
    :param epoch: Current epoch number
    :return: Average loss over the epoch
    """
    model.train()
    total_loss: float = 0.0

    pbar = tqdm(data_loader, desc=f"Epoch {epoch}")

    for batch in pbar:
        
        if batch is None:
            continue

        images, targets = batch
        
        images: list[torch.Tensor] = [img.to(device) for img in images]

        targets: list[dict[str, torch.Tensor]] = [
            {key: val.to(device) for key, val in t.items()}
            for t in targets
        ]

        loss_dict: dict[str, torch.Tensor] = model(images, targets)

        losses: torch.Tensor = sum(loss_dict.values(), torch.tensor(0.0, device=device))
        
        total_loss += losses.item()

        optimizer.zero_grad()   
        losses.backward()
        optimizer.step()

  
        pbar.set_postfix(loss=float(losses.item()))

    return total_loss / len(data_loader)

In [9]:
import os 

In [None]:
def train(model, train_loader, val_loader, epochs):
    """
    Full training loop for Mask R-CNN.

    :param model: Mask R-CNN model
    :param train_loader: DataLoader for training data
    :param val_loader: Optional DataLoader for validation data
    :param epochs: Number of epochs to train
    """
    os.makedirs("checkpoints", exist_ok=True)
    for epoch in range(1, epochs + 1):
        train_loss = train_one_epoch(model, optimizer, train_loader, device, epoch)

        lr_scheduler.step()

        writer.add_scalar("Loss/train", train_loss, epoch)

        print(f"Epoch {epoch}/{epochs} - Loss: {train_loss:.4f}")

        torch.save(model.state_dict(), f"checkpoints/maskrcnn_epoch_{epoch}.pth")
    writer.close()


In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [12]:
from utils.general_utils import load_masks

In [13]:
df = load_masks()

rle_dict = (
    df.groupby("ImageId")["EncodedPixels"]
      .apply(list)
      .to_dict()
)

img_ids = list(rle_dict.keys())

test_img_ids, temp_img_ids = train_test_split(img_ids, test_size=0.4, random_state=42)
val_img_ids, train_img_ids = train_test_split(temp_img_ids, test_size=0.5, random_state=42)

train_rle_dict = {img_id: rle_dict[img_id] for img_id in train_img_ids}

shipDataset = ShipDataset(rle_dict=train_rle_dict, transforms=None)

In [14]:
from numpy.typing import NDArray

In [None]:

def collate_fn(batch : list[tuple[NDArray, dict[str, NDArray]]]):
    """
    Custom collate function for PyTorch DataLoader that filters out `None` samples
    and combines a batch of (image, target) tuples into tuples of images and targets.

    :param batch: List of (image, target) tuples, where
                  - image: NumPy array of shape [H, W, C]
                  - target: dictionary with keys like "boxes", "labels", "masks"
    :return: Tuple of:
             - images: tuple of NDArray images
             - targets: tuple of dictionaries
             Returns None if all batch elements are None.
    """
    batch = [b for b in batch if b is not None]
    if len(batch) == 0:
        return None
    return tuple(zip(*batch))

In [16]:
train(
    model=model,
    train_loader=DataLoader(shipDataset, batch_size=4, shuffle=True, collate_fn=collate_fn),
    val_loader=None,
    epochs=5
)

Epoch 1:   0%|          | 16/9628 [00:56<9:29:01,  3.55s/it, loss=0.735] 


KeyboardInterrupt: 

In [None]:
from typing import Dict, List
from utils.general_utils import compute_iou_matrix, compute_average_f_score

@torch.inference_mode()
def evaluate_model(model, rle_dict: Dict[str, List[str]], device="cuda"):
    """
    Evaluate Mask R-CNN segmentation performance using average F2 score.

    :param model: Mask R-CNN model
    :param rle_dict: Dictionary mapping image IDs to lists of ground truth RLE strings
                     Example: {"img1.jpg": ["rle1", "rle2", ...], ...}
    :param device: Device string ('cuda' or 'cpu')
    :return: Tuple containing:
             - mean F2 score across all images
             - list of F2 scores per image
    """
    model.eval()
    f2_scores = []

    for img_id, gt_rles in tqdm(rle_dict.items(), desc="Evaluating"):
        img = get_image(img_id)
        img_tensor = torch.from_numpy(img.astype(np.float32) / 255.).permute(2,0,1).to(device)

        pred = model([img_tensor])[0]  
        pred_masks = pred["masks"].squeeze(1).cpu().numpy() > 0.5  # [N,H,W]

        gt_masks = [get_mask_from_rle(rle).astype(np.uint8) for rle in gt_rles]

        iou_mat = compute_iou_matrix(gt_masks, pred_masks)

        f2 = compute_average_f_score(iou_mat)
        f2_scores.append(f2)

    return np.mean(f2_scores), f2_scores


In [None]:
_mean_f2, f2_scores = evaluate_model(
    model=model,
    rle_dict={im_id: rle_dict[im_id] for im_id in val_img_ids},
    device=device
)

print(f"Mean F2 Score on Validation Set: {_mean_f2:.4f}")