In [1]:
from __future__ import print_function
from __future__ import division
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import torchvision
import matplotlib.pyplot as plt
import time
import os
from skimage import io, transform
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
import copy
import pprint
from PIL import Image
print("PyTorch Version: ",torch.__version__)
print("Torchvision Version: ",torchvision.__version__)

plt.ion()

PyTorch Version:  1.2.0+cpu
Torchvision Version:  0.4.0+cpu


# Load dataset

In [2]:
DATASET_DIR = "data/keypoints_data"
annotations = os.path.join(DATASET_DIR, "annotations/ann.csv")

In [3]:
def parse_record(raw_record):
    out_dict = {}
    raw_data = raw_record.split(";")
    
    out_dict["file_path"] = raw_data[0]
    tmp_keypoints = [data.split(",") for data in raw_data[1:9]]
    
    out_dict["keypoints"] = []
    for keypoint in tmp_keypoints:
        keypoint = [int(elem) for elem in keypoint]
        out_dict["keypoints"].append(keypoint)
        
    out_dict["position"] = [float(data) for data in raw_data[9:12]]
    
    out_dict["rotation"] = [float(data) for data in raw_data[12:]]
    
    return out_dict

In [35]:
class CubeDataset(Dataset):
    def __init__(self, dataset_dir, transform=None):
        self.records = []
        with open(os.path.join(dataset_dir, "annotations/ann.csv"), "r") as ann:
            line = ann.readline().rstrip()
            while line:
                record = parse_record(line)
                self.records.append(record)
                line = ann.readline().rstrip()
        self.transform = transform
                  
    def __len__(self):
        return len(self.records)
                  
    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        
        img_path = self.records[idx]["file_path"]
        img = Image.open(img_path).convert("RGB")
        mask_path = self.records[idx]["file_path"].replace("images", "annotations/masks")
        mask = Image.open(mask_path)
        mask = np.array(mask)
        # instances are encoded as different colors
        obj_ids = np.unique(mask)
        # first id is the background, so remove it
        obj_ids = obj_ids[1:]

        # split the color-encoded mask into a set
        # of binary masks
        masks = mask == obj_ids[:, None, None]
        
        # get bounding box coordinates for each mask
        num_objs = len(obj_ids)
        boxes = []
        for i in range(num_objs):
            pos = np.where(masks[i])
            xmin = np.min(pos[1])
            xmax = np.max(pos[1])
            ymin = np.min(pos[0])
            ymax = np.max(pos[0])
            boxes.append([xmin, ymin, xmax, ymax])

        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        labels = torch.ones((num_objs,), dtype=torch.int64)
        masks = torch.as_tensor(masks, dtype=torch.uint8)
        
        image_id = torch.tensor([idx])
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        # suppose all instances are not crowd
        iscrowd = torch.zeros((num_objs,), dtype=torch.int64)
        
        keypoints = np.array(self.records[idx]["keypoints"])
        position = self.records[idx]["position"]
        rotation = self.records[idx]["rotation"]
        
        target = {}
        target["boxes"] = boxes
        target["labels"] = labels
        target["masks"] = masks
        target["image_id"] = image_id
        target["area"] = area
        target["iscrowd"] = iscrowd
        target["keypoints"] = torch.tensor(keypoints[None, :, :])
        print(target["keypoints"])
        
        if self.transform:
            img, target = self.transform(img, target)
        
        return img, target

In [36]:
class ToTensor(object):
    def __call__(self, img, target):
        image, key_pts = img, target
        if(len(image.shape) == 2):
            image = image.reshape(image.shape[0], image.shape[1], 1)
        image = image.transpose((2, 0, 1))
        
        return {'image': torch.from_numpy(image),
                'keypoints': torch.from_numpy(key_pts)}

In [37]:
transformations = transforms.Compose([ToTensor()])
cube_dataset = CubeDataset(
    "data/keypoints_data",
    transformations
    )
print("Dataset with {} samples loaded".format(len(cube_dataset)))

Dataset with 967 samples loaded


# Transfer Learning

In [38]:
from torchvision.models.detection.rpn import AnchorGenerator
from torchvision.models.detection.keypoint_rcnn import KeypointRCNNPredictor

model = torchvision.models.detection.keypointrcnn_resnet50_fpn(
    pretrained=True)

model.roi_heads.keypoint_predictor = KeypointRCNNPredictor(512, 8)

In [39]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.train()
model.to(device)

KeypointRCNN(
  (transform): GeneralizedRCNNTransform()
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d()
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d()
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d()
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d()
          (relu): ReLU(inplace=True)
          (downsample): Sequential(
            (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): FrozenBatchNorm2d()
          )


In [40]:
batch_size = 2
train_loader = DataLoader(cube_dataset, 
                          batch_size=batch_size,
                          shuffle=True, 
                          num_workers=0)

In [41]:
# construct an optimizer
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.005,
                            momentum=0.9, weight_decay=0.0005)
# and a learning rate scheduler
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                               step_size=3,
                                               gamma=0.1)

def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        phase = "train"
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for inputs, labels in train_loader:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                        scheduler.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

In [42]:
criterion = nn.MSELoss()

In [43]:
model = train_model(model, criterion, optimizer, lr_scheduler,
                       num_epochs=25)

Epoch 0/24
----------
tensor([[[595, 269,   1],
         [581, 283,   1],
         [547, 204,   1],
         [562, 193,   1],
         [521, 299,   0],
         [504, 314,   1],
         [470, 236,   1],
         [488, 224,   1]]], dtype=torch.int32)


RuntimeError: Could not infer dtype of dict