In [1]:
# Fetch the two ChallengeData2017 archives (images and instance annotations)
# from sceneparsing.csail.mit.edu into the current working directory.
!wget --quiet http://sceneparsing.csail.mit.edu/data/ChallengeData2017/images.tar
!wget --quiet http://sceneparsing.csail.mit.edu/data/ChallengeData2017/annotations_instance.tar
# Unpack both archives in place.
!tar -xf images.tar
!tar -xf annotations_instance.tar
# Remove the archives once extracted to free disk space.
!rm images.tar annotations_instance.tar

In [4]:
import os
import time
import random
import collections
import glob

import numpy as np
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tqdm import tqdm

import torch
import torchvision
import torchvision.transforms as T
from torchvision.transforms import ToPILImage
from torchvision.transforms import functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor

In [5]:
# ---------------------------------------------------------------------------
# Runtime
# ---------------------------------------------------------------------------
# Train on the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# NOTE(review): TEST is not read by any of the visible cells. The original
# comment said "Reduced the train dataset to 5000 rows" - presumably TEST=True
# is meant to enable such a reduced run; confirm before relying on it.
TEST = False

# ---------------------------------------------------------------------------
# Data
# ---------------------------------------------------------------------------
# NOTE(review): WIDTH / HEIGHT are not referenced by the visible cells.
WIDTH = 704
HEIGHT = 520

# Channel statistics used by the custom Normalize transform.
RESNET_MEAN = (0.485, 0.456, 0.406)
RESNET_STD = (0.229, 0.224, 0.225)

# When True, get_transform() appends Normalize (RESNET_MEAN / RESNET_STD).
# The printed model repr shows that the Mask R-CNN's own
# GeneralizedRCNNTransform already applies the same normalisation.
NORMALIZE = False

# ---------------------------------------------------------------------------
# Optimisation
# ---------------------------------------------------------------------------
# Mini-batch size.
BATCH_SIZE = 8

# Number of passes over the training data.
NUM_EPOCHS = 1

# SGD hyper-parameters; no alternatives have been tried yet.
LEARNING_RATE = 0.005
MOMENTUM = 0.9
WEIGHT_DECAY = 0.0005

# Step the StepLR scheduler once per epoch when True. Not tried yet.
USE_SCHEDULER = False

# ---------------------------------------------------------------------------
# Post-processing
# ---------------------------------------------------------------------------
# Confidence a pixel needs in order to be kept in a mask.
# Only 0.5 has been used so far.
MASK_THRESHOLD = 0.5

# NOTE(review): the two values below are not referenced by the visible cells;
# presumably they are inference-time settings (max detections per image and
# minimum detection score) - confirm against the cells that consume them.
BOX_DETECTIONS_PER_IMG = 539
MIN_SCORE = 0.59

In [7]:
# glob returns matches in arbitrary, filesystem-dependent order. Sort the
# listings so that everything derived from them (in particular the seeded
# train/validation split) is identical from run to run and machine to machine.
all_images = sorted(glob.glob('/kaggle/working/images/training/*.jpg'))
all_annots = sorted(glob.glob('/kaggle/working/annotations_instance/training/*.png'))

In [8]:
# Keep only the annotation files that contain at least one pixel whose red
# channel equals 4; files without such a pixel are dropped.
annots = [
    annot_path
    for annot_path in tqdm(all_annots)
    if (np.array(Image.open(annot_path).convert("RGB"))[..., 0] == 4).any()
]

100%|██████████| 20210/20210 [02:06<00:00, 159.90it/s]


In [9]:
# Reduce every kept annotation path to its bare file stem (no directory, no
# extension); the dataset rebuilds both the image and the annotation path
# from that stem.
_annots = [os.path.splitext(os.path.basename(annot))[0] for annot in annots]

# Seeded train/validation split (train_test_split is imported in the imports
# cell at the top of the notebook). No test_size is given, so scikit-learn's
# default split proportions apply.
trn_items, val_items = train_test_split(_annots, random_state=2)

In [10]:
# These are slight redefinitions of the torchvision transform classes.
# The difference is that they also handle the target (boxes and masks), not just the image.
# Copied from Abishek, with new ones added.
class Compose:
    """Chain several ``(image, target) -> (image, target)`` transforms.

    Args:
        transforms: iterable of callables, applied in order. Each one receives
            the image and target produced by the previous one.
    """

    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, image, target):
        """Run every transform in sequence and return the final pair."""
        pair = (image, target)
        for step in self.transforms:
            new_image, new_target = step(*pair)
            pair = (new_image, new_target)
        return pair

class VerticalFlip:
    """Randomly flip image, masks and boxes top-to-bottom.

    Args:
        prob: the flip is applied when ``random.random() < prob``.
    """

    def __init__(self, prob):
        self.prob = prob

    def __call__(self, image, target):
        """Return ``(image, target)``, flipped along the height axis or untouched.

        ``target["boxes"]`` is updated in place; ``target["masks"]`` is
        replaced by a flipped copy.
        """
        if not random.random() < self.prob:
            return image, target

        img_height, _ = image.shape[-2:]
        flipped_image = image.flip(-2)

        boxes = target["boxes"]
        # The old ymax becomes the new ymin and vice versa, mirrored around
        # the image height.
        boxes[:, [1, 3]] = img_height - boxes[:, [3, 1]]
        target["boxes"] = boxes
        target["masks"] = target["masks"].flip(-2)
        return flipped_image, target

class HorizontalFlip:
    """Randomly flip image, masks and boxes left-to-right.

    Args:
        prob: the flip is applied when ``random.random() < prob``.
    """

    def __init__(self, prob):
        self.prob = prob

    def __call__(self, image, target):
        """Return ``(image, target)``, flipped along the width axis or untouched.

        ``target["boxes"]`` is updated in place; ``target["masks"]`` is
        replaced by a flipped copy.
        """
        if not random.random() < self.prob:
            return image, target

        _, img_width = image.shape[-2:]
        flipped_image = image.flip(-1)

        boxes = target["boxes"]
        # The old xmax becomes the new xmin and vice versa, mirrored around
        # the image width.
        boxes[:, [0, 2]] = img_width - boxes[:, [2, 0]]
        target["boxes"] = boxes
        target["masks"] = target["masks"].flip(-1)
        return flipped_image, target

class Normalize:
    """Normalise the image with RESNET_MEAN / RESNET_STD; the target is untouched."""

    def __call__(self, image, target):
        """Return the normalised image together with the unchanged target."""
        return F.normalize(image, RESNET_MEAN, RESNET_STD), target

class ToTensor:
    """Convert the image with ``F.to_tensor``; the target is passed through as is."""

    def __call__(self, image, target):
        """Return the converted image together with the unchanged target."""
        return F.to_tensor(image), target


def get_transform(train):
    """Build the transform pipeline applied to every ``(image, target)`` pair.

    The pipeline always starts with ``ToTensor`` and additionally contains
    ``Normalize`` when the module-level ``NORMALIZE`` flag is set.

    Args:
        train: accepted for API symmetry but currently ignored - the same
            pipeline is returned for training and validation, and the flip
            augmentations defined in this notebook are never added.

    Returns:
        A ``Compose`` instance wrapping the selected transforms.
    """
    pipeline = [ToTensor()] + ([Normalize()] if NORMALIZE else [])
    return Compose(pipeline)

In [11]:
class MasksDataset(Dataset):
    """Instance-segmentation dataset built from image / annotation file stems.

    Every item is an ``(image, target)`` pair in the format torchvision's
    detection models take as training input: ``target`` holds ``boxes``,
    ``labels``, ``masks``, ``image_id``, ``area`` and ``iscrowd``.

    Args:
        items: sequence of file stems; the image is read from
            ``images/training/<stem>.jpg`` and the annotation from
            ``annotations_instance/training/<stem>.png`` (both relative to the
            current working directory).
        transforms: optional callable ``(image, target) -> (image, target)``.
        N: value reported by ``len()``; the caller must keep it
            ``<= len(items)``.
    """

    # Red-channel value selecting the pixels of the class of interest.
    TARGET_CLASS_ID = 4
    # A box side spanning this many pixels or fewer is padded to exactly this size.
    MIN_BOX_SIDE = 10

    def __init__(self, items, transforms, N):
        self.items = items
        self.transforms = transforms
        self.N = N

    def get_mask(self, path):
        """Return one binary mask per instance found in the annotation at ``path``.

        Instances are the distinct green-channel values found on pixels whose
        red channel equals ``TARGET_CLASS_ID``; each mask marks every pixel
        carrying that green value.

        Returns:
            ``uint8`` array of shape ``(num_instances, H, W)`` holding 0/1.
        """
        # Close the file handle deterministically instead of leaving it to the GC.
        with Image.open(path) as annotation:
            red, green, _ = np.array(annotation.convert("RGB")).transpose(2, 0, 1)
        instance_ids = np.unique(green[red == self.TARGET_CLASS_ID])
        # uint8 is what the masks are cast to anyway; allocating it directly
        # avoids an intermediate float64 array eight times the size.
        masks = np.zeros((len(instance_ids), *red.shape), dtype=np.uint8)
        for ix, instance_id in enumerate(instance_ids):
            masks[ix] = green == instance_id
        return masks

    @staticmethod
    def _mask_to_box(mask, min_side=MIN_BOX_SIDE):
        """Return ``[xmin, ymin, xmax, ymax]`` tightly enclosing a non-empty mask.

        A side spanning ``min_side`` pixels or fewer is padded to exactly
        ``min_side`` so that no degenerate box is produced. Each axis is
        handled on its own: the previous combined test
        ``((w <= 10) | h) <= 10`` mis-parsed because ``|`` binds tighter than
        ``<=``, and when it fired it also shrank the long side of thin
        objects down to 10 pixels.
        The padded corner is not clipped to the image bounds.
        """
        rows, cols = np.where(mask)
        xmin, xmax = int(cols.min()), int(cols.max())
        ymin, ymax = int(rows.min()), int(rows.max())
        if xmax - xmin <= min_side:
            xmax = xmin + min_side
        if ymax - ymin <= min_side:
            ymax = ymin + min_side
        return [xmin, ymin, xmax, ymax]

    def __getitem__(self, ix):
        """Load the ``ix``-th image and build its detection target."""
        _id = self.items[ix]
        img_path = f'images/training/{_id}.jpg'
        mask_path = f'annotations_instance/training/{_id}.png'

        masks = self.get_mask(mask_path)
        with Image.open(img_path) as image_file:
            img = np.array(image_file.convert("RGB"))

        num_objs = len(masks)
        boxes = [self._mask_to_box(mask) for mask in masks]
        # reshape keeps the (N, 4) layout even when there are no instances.
        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
        masks = torch.as_tensor(masks, dtype=torch.uint8)

        target = {
            "boxes": boxes,
            # Every instance gets label 1: there is a single foreground class.
            "labels": torch.ones((num_objs,), dtype=torch.int64),
            "masks": masks,
            "image_id": torch.tensor([ix]),
            "area": (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0]),
            "iscrowd": torch.zeros((num_objs,), dtype=torch.int64),
        }
        if self.transforms is not None:
            img, target = self.transforms(img, target)
        return img, target

    def __len__(self):
        """Return the dataset length supplied at construction time."""
        return self.N


In [13]:
def get_model_instance_segmentation(num_classes):
    """Return a pretrained Mask R-CNN (ResNet-50 FPN) with fresh prediction heads.

    Both the box predictor and the mask predictor of the pretrained model are
    replaced by newly initialised heads sized for ``num_classes`` outputs; the
    rest of the network keeps its pretrained weights.

    Args:
        num_classes: number of output classes of the new heads.

    Returns:
        The modified ``MaskRCNN`` model.
    """
    model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)
    roi_heads = model.roi_heads

    # New box head: same input width as the pretrained one, new class count.
    box_in_features = roi_heads.box_predictor.cls_score.in_features
    roi_heads.box_predictor = FastRCNNPredictor(box_in_features, num_classes)

    # New mask head: same input channels, 256 hidden channels, new class count.
    mask_in_channels = roi_heads.mask_predictor.conv5_mask.in_channels
    roi_heads.mask_predictor = MaskRCNNPredictor(mask_in_channels, 256, num_classes)
    return model

In [14]:
# Two output classes: the dataset labels every instance 1, and torchvision
# detection models reserve class 0 for the background.
model = get_model_instance_segmentation(num_classes=2)
model = model.to(DEVICE)
# Display the architecture as the cell output.
model

Downloading: "https://download.pytorch.org/models/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth" to /root/.cache/torch/hub/checkpoints/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth
100%|██████████| 170M/170M [00:01<00:00, 149MB/s]  


MaskRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(in

In [16]:
def collate_batch(batch):
    """Turn a list of ``(image, target)`` samples into ``(images, targets)`` tuples.

    Images and targets cannot be stacked by the default collate function
    because every sample has its own number of instances, so the batch is
    simply transposed instead.
    """
    return tuple(zip(*batch))


dataset = MasksDataset(trn_items, get_transform(train=True), N=len(trn_items))
dataset_test = MasksDataset(val_items, get_transform(train=False), N=len(val_items))

# Training and validation data loaders. The batch size comes from the
# BATCH_SIZE constant of the configuration cell rather than a second
# hard-coded copy of the value; only the training loader shuffles.
data_loader = DataLoader(
    dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0,
    collate_fn=collate_batch)

data_loader_test = DataLoader(
    dataset_test, batch_size=BATCH_SIZE, shuffle=False, num_workers=0,
    collate_fn=collate_batch)

In [17]:
# Optimise only the parameters that are not frozen.
params = list(filter(lambda p: p.requires_grad, model.parameters()))
optimizer = torch.optim.SGD(
    params,
    lr=LEARNING_RATE,
    momentum=MOMENTUM,
    weight_decay=WEIGHT_DECAY,
)

# Scheduler that multiplies the learning rate by 0.1 every 3 scheduler steps.
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

# Number of training batches per epoch (used for logging and averaging).
n_batches = len(data_loader)

In [18]:
# Validation averages have to be normalised by the number of *validation*
# batches; dividing by the training batch count under-reports the validation
# loss whenever the two loaders differ in length.
n_valid_batches = len(data_loader_test)

for epoch in range(1, NUM_EPOCHS + 1):
    print(f"Starting epoch {epoch} of {NUM_EPOCHS}")
    # The model stays in train mode for the whole epoch, validation included:
    # called with targets in train mode it returns the loss dict read below
    # (in eval mode torchvision detection models return predictions instead).
    model.train()
    time_start = time.time()

    train_loss_accum = 0.0
    train_loss_mask_accum = 0.0

    valid_loss_accum = 0.0
    valid_loss_mask_accum = 0.0

    # ---- Training pass ----------------------------------------------------
    for batch_idx, (images, targets) in enumerate(data_loader, 1):
        # Move the batch to the training device.
        images = [image.to(DEVICE) for image in images]
        targets = [{k: v.to(DEVICE) for k, v in t.items()} for t in targets]

        # Forward pass: the total loss is the sum of all loss components.
        train_loss_dict = model(images, targets)
        train_loss = sum(train_loss_dict.values())

        # Backprop
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

        # Logging
        train_loss_mask = train_loss_dict['loss_mask'].item()
        train_loss_accum += train_loss.item()
        train_loss_mask_accum += train_loss_mask

        if batch_idx % 50 == 0:
            print(f"    [Batch {batch_idx:3d} / {n_batches:3d}] Batch train loss: {train_loss.item():7.3f}. Mask-only loss: {train_loss_mask:7.3f}")

    # ---- Validation pass (no gradients, no optimiser step) ------------------
    with torch.no_grad():
        for batch_idx, (images, targets) in enumerate(data_loader_test, 1):
            images = [image.to(DEVICE) for image in images]
            targets = [{k: v.to(DEVICE) for k, v in t.items()} for t in targets]

            valid_loss_dict = model(images, targets)
            valid_loss = sum(valid_loss_dict.values())

            # Logging
            valid_loss_mask = valid_loss_dict['loss_mask'].item()
            valid_loss_accum += valid_loss.item()
            valid_loss_mask_accum += valid_loss_mask

    if USE_SCHEDULER:
        lr_scheduler.step()

    # Per-batch averages; max(..., 1) avoids a ZeroDivisionError on an empty loader.
    train_loss = train_loss_accum / max(n_batches, 1)
    train_loss_mask = train_loss_mask_accum / max(n_batches, 1)

    valid_loss = valid_loss_accum / max(n_valid_batches, 1)
    valid_loss_mask = valid_loss_mask_accum / max(n_valid_batches, 1)

    # Wall-clock time of the whole epoch (training + validation).
    elapsed = time.time() - time_start

    # Checkpoint the weights after every epoch.
    torch.save(model.state_dict(), f"pytorch_model-e{epoch}.bin")
    prefix = f"[Epoch {epoch:2d} / {NUM_EPOCHS:2d}]"
    print(f"{prefix} Train mask-only loss: {train_loss_mask:7.3f}")
    print(f"{prefix} Train loss: {train_loss:7.3f}. [{elapsed:.0f} secs]")
    print(f"{prefix} Valid mask-only loss: {valid_loss_mask:7.3f}")
    print(f"{prefix} Valid loss: {valid_loss:7.3f}. [{elapsed:.0f} secs]")