Writing a custom dataset for PennFudan

In [56]:
import os
import numpy as np
import torch
from PIL import Image

class PennFudanDataset(torch.utils.data.Dataset):
    def __init__(self, root, transforms):
        self.root = root
        self.transforms = transforms
        
        self.imgs = list(sorted(os.listdir(os.path.join(root, 'PNGImages'))))
        self.masks = list(sorted(os.listdir(os.path.join(root, 'PedMasks'))))
        
    def __getitem__(self, idx):
        img_path = os.path.join(self.root, 'PNGImages', self.imgs[idx])
        mask_path = os.path.join(self.root, 'PedMasks', self.masks[idx])
        img = Image.open(img_path).convert('RGB')
        
        mask = Image.open(mask_path)
        mask = np.array(mask)
        
        obj_ids = np.unique(mask)
        obj_ids = obj_ids[1:]
        
        masks = mask == obj_ids[:, None, None]
        
        num_objs = len(obj_ids)
        boxes = []
        for i in range(num_objs):
            pos = np.where(masks[i])
            
            xmin = np.min(pos[1])
            xmax = np.max(pos[1])
            ymin = np.min(pos[0])
            ymax = np.max(pos[0])
            
            boxes.append([xmin, ymin, xmax, ymax])
            
        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        
        labels = torch.ones((num_objs, ), dtype=torch.int64)
        masks = torch.as_tensor(masks, dtype=torch.uint8)
        
        image_id = torch.tensor([idx], dtype=torch.int64)
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        
        iscrowd = torch.zeros((num_objs, ), dtype=torch.int64)
        
        target = {}
        target['boxes'] = boxes
        target['labels'] = labels
        target['masks'] = masks
        target['image_id'] = image_id
        target['area'] = area
        target['iscrowd'] = iscrowd
        
        if self.transforms is not None:
            img, target = self.transforms(img, target)
            
        return img, target
    
    def __len__(self):
        return len(self.imgs)

Finetuning from a pretrained model

In [57]:
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretranied=True)

num_classes = 2
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

Modifying the model to add a different backbone

In [58]:
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator

backbone = torchvision.models.mobilenet_v2(pretrained=True).features
backbone.out_channels = 1280

anchor_generator = AnchorGenerator(sizes=((32, 64, 128, 256, 512), ), 
                                   aspect_ratios=((0.5, 1.0, 2.0), ))

roi_pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names=['0'], 
                                              output_size=7, 
                                              sampling_ratio=2)

model = FasterRCNN(backbone, 
                   num_classes=2, 
                   rpn_anchor_generator=anchor_generator, 
                   box_roi_pool=roi_pooler)



An Instance segmentation model for PennFudan Dataset

In [59]:
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor

def get_model_instance_segmentation(num_classes):
    model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)
    
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
    
    in_features_mask = model.roi_heads.mask_predictor.conv5_mask.in_channels
    hidden_layer = 256
    
    model.roi_heads.mask_predictr = MaskRCNNPredictor(in_features_mask, 
                                                      hidden_layer, 
                                                      num_classes)
    
    return model

Putting everything together

In [60]:
import transforms as T
import utils

def get_transform(train):
    transforms = []
    transforms.append(T.ToTensor())
    if train:
        transforms.append(T.RandomHorizontalFlip(0.5))
        
    return T.Compose(transforms)

Testing forward() method

In [61]:
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
dataset = PennFudanDataset('PennFudanPed', get_transform(train=True))
data_loader = torch.utils.data.DataLoader(dataset, batch_size=2, shuffle=True, num_workers=0, collate_fn=utils.collate_fn)

images, targets = next(iter(data_loader))
images = list(image for image in images)
targets = [{k: v.long() for k, v in t.items()} for t in targets]
output = model(images, targets)

model.eval()
x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
predictions = model(x)  



In [62]:
from engine import train_one_epoch, evaluate
import utils

def main():
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    
    num_classes = 2
    
    dataset = PennFudanDataset('PennFudanPed', get_transform(train=True))
    dataset_test = PennFudanDataset('PennFudanPed', get_transform(train=False))
    
    indices = torch.randperm(len(dataset)).tolist()
    dataset =torch.utils.data.Subset(dataset, indices[:-50])
    dataset_test = torch.utils.data.Subset(dataset, indices[-50:])
    
    data_loader = torch.utils.data.DataLoader(dataset, batch_size=2, shuffle=True, num_workers=0, collate_fn=utils.collate_fn)
    data_loader_test = torch.utils.data.DataLoader(dataset_test, batch_size=1, shuffle=False, num_workers=0, collate_fn=utils.collate_fn)
    
    model.to(device)
    
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)
    
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)
    
    num_epochs = 10
    
    for epoch in range(num_epochs):
        train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10)
        
        lr_scheduler.step()
        evaluate(model, data_loader_test, device=device)
        
    print('That\'s it!')

In [63]:
main()

Epoch: [0]  [ 0/60]  eta: 0:28:41  lr: 0.000090  loss: 0.2344 (0.2344)  loss_classifier: 0.1380 (0.1380)  loss_box_reg: 0.0725 (0.0725)  loss_objectness: 0.0183 (0.0183)  loss_rpn_box_reg: 0.0057 (0.0057)  time: 28.6973  data: 0.0547
Epoch: [0]  [10/60]  eta: 0:25:01  lr: 0.000936  loss: 0.2194 (0.2296)  loss_classifier: 0.1380 (0.1493)  loss_box_reg: 0.0615 (0.0534)  loss_objectness: 0.0196 (0.0217)  loss_rpn_box_reg: 0.0057 (0.0053)  time: 30.0328  data: 0.0716
Epoch: [0]  [20/60]  eta: 0:19:41  lr: 0.001783  loss: 0.1482 (0.1767)  loss_classifier: 0.0750 (0.1047)  loss_box_reg: 0.0502 (0.0528)  loss_objectness: 0.0060 (0.0145)  loss_rpn_box_reg: 0.0034 (0.0047)  time: 29.5814  data: 0.0742
Epoch: [0]  [30/60]  eta: 0:14:59  lr: 0.002629  loss: 0.1360 (0.1671)  loss_classifier: 0.0403 (0.0863)  loss_box_reg: 0.0597 (0.0651)  loss_objectness: 0.0019 (0.0107)  loss_rpn_box_reg: 0.0044 (0.0051)  time: 29.9585  data: 0.0798
Epoch: [0]  [40/60]  eta: 0:09:51  lr: 0.003476  loss: 0.1363 (0

TypeError: 'numpy.float64' object cannot be interpreted as an integer