In [2]:
#@title Connect to google drive
# Mount Google Drive at /content/drive; later cells read the dataset and
# write loss logs / checkpoints under /content/drive/MyDrive/.
from google.colab import drive 
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#@title Configure kaggle and download dome-mesh-ycb dataset (only once, takes ~34 min)
import os
# Must download kaggle environment key and place in folder of choice
# KAGGLE_CONFIG_DIR points at the Drive folder that is expected to hold that key.
os.environ['KAGGLE_CONFIG_DIR'] = "/content/drive/MyDrive/DeepLearning/Kaggle"
# Change into the same folder so the dataset is unpacked next to the key.
%cd /content/drive/MyDrive/DeepLearning/Kaggle/
# The download itself is left commented out (the title marks it as a one-time
# step); uncomment the line below to fetch and unzip the dataset.
#!kaggle datasets download -d noellelaw/dome-mesh-ycb --unzip

In [None]:
#@title Install needed repositories
# Install a pinned mmcv-full build from the OpenMMLab wheel index for
# CUDA 11.1 / torch 1.9.0 (per the index URL below).
# NOTE(review): the wheel must match the runtime's torch/CUDA versions — confirm.
!pip install mmcv-full==v1.3.9 -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html

In [None]:
#@title Clone ViTDope
# Clone the vit-dope repository into /content and install its requirements.
%cd /content/
! git clone https://github.com/noellelaw/vit-dope
# Stay inside the repository: the imports cell loads `models`, `scripts` and
# `core` by package name, which presumably resolve relative to this directory.
%cd /content/vit-dope
! pip install -r requirements.txt

In [None]:
#@title Install timm and einops
# timm is pinned to 0.4.9; einops is installed at whatever version pip resolves.
! pip install timm==0.4.9 einops

In [None]:
#@title Imports
import copy
import os
import os.path as osp
from os.path import exists
import matplotlib.pyplot as plt
from scipy.ndimage.filters import gaussian_filter
import time
import warnings
import numpy as np
import json
import datetime
import glob
import cv2
import colorsys
import math

import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
import torchvision.transforms as transforms
from torch.autograd import Variable
import torch.utils.data as data
import torchvision.models as models
from torch.distributions import MultivariateNormal as MVN
from torch.nn.utils import clip_grad_norm_
from torch.nn.functional import cross_entropy

from PIL import Image
from PIL import ImageDraw
from PIL import ImageEnhance

from math import acos
from math import sqrt
from math import pi  

import mmcv
from mmcv import Config, DictAction
from mmcv.utils import get_git_hash
from mmcv.runner import get_dist_info, init_dist, set_random_seed

from collections import OrderedDict
import tempfile
import random
from __future__ import print_function

from models.backbones import ViT
from scripts.ndds_dataloader import MultipleVertexJson
from core.evaluation.top_down_eval import (keypoint_pck_accuracy,
                            keypoints_from_heatmaps,
                            pose_pck_accuracy)
from models.heads import TopdownHeatmapSimpleHead

In [None]:
#@title Training hyperparameters
# Object name forwarded to the dataset as `objectofinterest`.
YCB_OBJECT = 'cracker_box'#@param{type:'string'} 
# Path to training data
DATA_PATH = '/content/drive/MyDrive/DeepLearning/Kaggle/cracker_box' #@param{type:'string'} 
# Path to testing data (empty string: no test loader is built and no test pass is run)
DATA_PATH_TEST = ''#@param{type:'string'} 
# Path to pretrained MAE vit-b
PRETRAINED = '/content/drive/MyDrive/DeepLearning/mae_pretrain_vit_base.pth'#@param{type:'string'} 
# Path to weights to resume training from (empty string: do not load a checkpoint)
FROM_NET = '/content/drive/MyDrive/DeepLearning/cracker_box_train/net_epoch_62.pth'#@param{type:'string'} 
# Path to output weight and loss data
OUT_FLDR = '/content/drive/MyDrive/DeepLearning/cracker_box_train'#@param{type:'string'} 
# What you want to name weight files
NAME_FILE = 'epoch'#@param{type:'string'} 
LEARNING_RATE = 5e-4#@param{type:'number'}
# Number of epochs the training loop runs; the loop always counts from 1,
# independent of which checkpoint FROM_NET points at.
EPOCHS = 60#@param{type:'integer'}
# Tunable parameter for BMSE loss function
BMSE_NOISE = 0.1#@param{type:'number'}
BATCH_SIZE = 64#@param{type:'integer'}
# Side length the input images are resized to; targets are resized to IMAGE_SIZE//4.
# NOTE(review): the network cell builds its backbone with a fixed img_size of
# (256, 256), so changing this value alone may not be sufficient — confirm.
IMAGE_SIZE = 256#@param{type:'integer'}
# When True, parameters whose name starts with 'backbone' are excluded from training.
FREEZE_BACKBONE = False#@param{type:'boolean'}
# Forwarded to the dataset as `data_size`, `save` and `normal` respectively.
DATASIZE = None
SAVE = False
NORMAL_IMGS = None
# Gradient clipping settings passed to clip_grad_norm_ (max_norm, norm_type).
MAX_NORM = 1.
NORM_TYPE = 2
# NOTE(review): NUM_BELIEFS / NUM_AFFINITIES are not referenced by the visible
# cells; the network is built with its own defaults (9 and 16).
NUM_BELIEFS = 9
NUM_AFFINITIES = 16
# Forwarded to the dataset as `noise`.
NOISE = 1e-5
# NOTE(review): BRIGHTNESS / CONTRAST are not referenced by the visible cells.
BRIGHTNESS = 1e-5
CONTRAST = 1e-5
# Progress is printed every LOG_INTERVAL batches.
LOG_INTERVAL = 3
# Forwarded to the dataset as `sigma`.
SIGMA = 4

In [None]:
#@title Empty cuda cache as needed
# GPU messin with my workflow 
# Manual memory cleanup, safe to re-run at any point: first run Python's
# garbage collector, then ask PyTorch to release its cached CUDA memory.
import gc

gc.collect()
torch.cuda.empty_cache()

In [None]:
#@title Get training and testing data loaders 
print ("Loading data...")

# Transform applied to the input images: resize to a square, convert to tensor.
transform = transforms.Compose([
    transforms.Resize((IMAGE_SIZE, IMAGE_SIZE)),
    transforms.ToTensor(),
])


def _build_dataset(root):
    """Create the MultipleVertexJson dataset rooted at `root` using the notebook settings."""
    return MultipleVertexJson(
        root=root,
        objectofinterest=YCB_OBJECT,
        keep_orientation=True,
        noise=NOISE,
        sigma=SIGMA,
        data_size=DATASIZE,
        save=SAVE,
        transform=transform,
        normal=NORMAL_IMGS,
        # Targets are resized to a quarter of the input resolution.
        target_transform=transforms.Compose([
            transforms.Resize(IMAGE_SIZE // 4),
        ]),
    )


def _build_loader(dataset):
    """Wrap `dataset` in a shuffled, single-worker DataLoader with pinned memory."""
    return torch.utils.data.DataLoader(
        dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=1,
        pin_memory=True,
    )


# Training loader (stays None when no training path is configured)
trainingdata = None
if DATA_PATH != "":
    train_dataset = _build_dataset(DATA_PATH)
    trainingdata = _build_loader(train_dataset)

# Testing loader (stays None when no testing path is configured)
testingdata = None
if DATA_PATH_TEST != "":
    testingdata = _build_loader(_build_dataset(DATA_PATH_TEST))

Loading data...


In [None]:
#@title Set up ViTDope Network
class ViTDopeNetwork(nn.Module):
    """ViT-B backbone followed by two deconvolution heads.

    One head predicts the belief maps and the other the affinity maps; both
    consume the same backbone output.

    Args:
        pretrained: Optional path (str) to backbone weights. Any non-string or
            empty value (including the default ``False``) falls back to the
            notebook-level ``PRETRAINED`` path, which matches the previous
            behaviour where this argument was ignored.
        numBeliefMap: Number of output channels of the belief head.
        numAffinity: Number of output channels of the affinity head.
    """

    def __init__(
            self,
            pretrained=False,
            numBeliefMap=9,
            numAffinity=16
            ):
        super(ViTDopeNetwork, self).__init__()
        # Set up backbone in accordance with ViT-B
        backbone = ViT(img_size=(256, 256),
                       patch_size=16,
                       embed_dim=768,
                       depth=12,
                       num_heads=12,
                       ratio=1,
                       use_checkpoint=False,
                       mlp_ratio=4,
                       qkv_bias=True,
                       drop_path_rate=0.3,
        )
        # Init ViT weights from a ViT MAE trained on ImageNet. An explicit
        # path passed by the caller wins; otherwise use the global setting.
        if isinstance(pretrained, str) and pretrained:
            weights_path = pretrained
        else:
            weights_path = PRETRAINED
        if not weights_path == '':
            backbone.init_weights(pretrained=weights_path)

        # Classical decoder heads: one for belief maps, one for affinity maps
        belief_head = self._make_head(numBeliefMap)
        affinity_head = self._make_head(numAffinity)

        # Each part stays wrapped in nn.Sequential so parameter names keep
        # their '<part>.0.' prefix; previously saved state dicts and the
        # name.startswith('backbone') freeze check depend on that layout.
        self.backbone = nn.Sequential(backbone)
        self.belief_head = nn.Sequential(belief_head)
        self.affinity_head = nn.Sequential(affinity_head)

    @staticmethod
    def _make_head(out_channels):
        """Build one heatmap head with `out_channels` outputs on 768-channel features."""
        return TopdownHeatmapSimpleHead(
            in_channels=768,
            num_deconv_layers=2,
            num_deconv_filters=(256, 256),
            num_deconv_kernels=(4, 4),
            extra=dict(final_conv_kernel=1, ),
            out_channels=out_channels,
            loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)
        )

    # Forward
    def forward(self, x):
        """Run the backbone once and feed its output to both heads.

        Returns:
            Tuple ``(belief_out, affinity_out)``.
        """
        backbone_out = self.backbone(x)
        belief_out = self.belief_head(backbone_out)
        affinity_out = self.affinity_head(backbone_out)
        return belief_out, affinity_out



In [None]:
#@title Set up files for testing & training progress
# Make sure the output folder exists; otherwise open() below fails and the
# checkpoints written during training have nowhere to go.
os.makedirs(OUT_FLDR, exist_ok=True)

# (Re)create both loss logs with just the CSV header. Opening with 'w'
# deliberately starts each log from scratch.
for log_name in ('/loss_train.csv', '/loss_test.csv'):
    with open(OUT_FLDR + log_name, 'w') as file:
        file.write('epoch,batchid,loss\n')

In [None]:
#@title Load model
# Build the network and move it to the GPU in one step
net = ViTDopeNetwork().to('cuda')

# Restore previously saved weights (for inference or to resume training)
if FROM_NET != '':
    print('Loading model from prior weights...')
    net.load_state_dict(torch.load(FROM_NET))

Loading model from prior weights...


In [None]:
#@title Set up optimizer and scheduler
# Optionally freeze the backbone: its parameters are excluded from training.
if FREEZE_BACKBONE:
    for name, param in net.named_parameters():
        if name.startswith('backbone'):
            param.requires_grad = False

# Collect the trainable parameters into a list. A lazy filter() object would
# be consumed by the optimizer constructor, leaving the gradient clipping in
# the training loop (which reuses `parameters`) with an empty iterator.
parameters = [p for p in net.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(parameters,
                              lr=LEARNING_RATE,
                              betas=(0.9, 0.999),
                              weight_decay=0.1)
# Linear warm-up from 1% of the learning rate over the first 10 scheduler steps
# (the training loop steps the scheduler once per epoch).
scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=0.01, total_iters=10)

In [None]:
#@title Print out model parameters
# Total number of elements across all parameters that will receive gradients
count = sum(p.numel() for p in net.parameters() if p.requires_grad)

print("Number of trainable parameters: ", count)

Number of trainable parameters:  94241049


In [None]:
#@title Get balanced MSE Loss
# adapted from: https://github.com/jiawei-ren/BalancedMSE
def get_bmse_loss(preds, targets, noise_var=None):
    """Balanced MSE loss summed over the output maps.

    For each of the N maps, every prediction in the batch is scored against
    every target in the batch with an isotropic Gaussian log-density
    (covariance ``noise_var * I``); cross-entropy then pushes each prediction
    towards its own target.

    Args:
        preds: Tensor unpacked as (B, N, H, W) - batch, maps, height, width.
        targets: Tensor indexed the same way as `preds`.
        noise_var: Variance of the Gaussian. Defaults to the notebook-level
            ``BMSE_NOISE`` when left as None.

    Returns:
        The loss summed over all N maps (a CPU tensor; the int 0 if N == 0).
    """
    if noise_var is None:
        noise_var = BMSE_NOISE
    # Batch size, num outputs, height, width
    B, N, H, W = preds.shape
    resize_to = H * W
    # Every sample's positive is its own target, so labels are 0..B-1.
    labels = torch.arange(B)
    # Log normalising constant of N(mu, noise_var * I) in `resize_to` dims.
    # With an isotropic covariance the log-density has a closed form, so no
    # (H*W)x(H*W) covariance matrix has to be built or factorised per map.
    log_norm = -0.5 * resize_to * math.log(2 * math.pi * noise_var)
    loss = 0
    for i in range(N):
        # Flatten each map to a vector; computation stays on the CPU as before.
        belief = preds[:, i, :, :].reshape((B, resize_to)).cpu()
        target = targets[:, i, :, :].reshape((B, resize_to)).cpu()
        # sq_dist[a, b] = ||target_b - belief_a||^2, size: [batch, batch]
        sq_dist = (target.unsqueeze(0) - belief.unsqueeze(1)).pow(2).sum(dim=-1)
        # logits[a, b] = log N(target_b; belief_a, noise_var * I)
        logits = log_norm - sq_dist / (2 * noise_var)
        # Apply contrastive-like loss, rescaled by 2 * noise variance
        loss += cross_entropy(logits, labels) * (2 * noise_var)
    return loss


In [3]:
#@title Run the network for one epoch 
def _run_network(epoch, loader, train=True):
    """Run the network over every batch of `loader` for one epoch.

    Uses the notebook-level `net`, `optimizer` and `get_bmse_loss`. The loss of
    every batch is appended to a CSV file in OUT_FLDR and progress is printed
    every LOG_INTERVAL batches.

    Args:
        epoch: Epoch number, used only for logging.
        loader: Iterable of batches; each batch is indexed with the keys
            'img', 'beliefs' and 'affinities'. Must also expose `.dataset`
            and support `len()`.
        train: If True the weights are updated; otherwise the network is put
            in eval mode and only the loss is computed and logged.
    """
    if train:
        net.train()
    else:
        net.eval()

    # Label for the progress line and file that receives the per-batch loss
    phase = 'Train' if train else 'Test'
    namefile = '/loss_train.csv' if train else '/loss_test.csv'

    # Iterate through batches
    for batch_idx, targets in enumerate(loader):
        # Get data and targets
        images = targets['img'].cuda()
        target_belief = targets['beliefs'].cuda()
        target_affinity = targets['affinities'].cuda()

        if train:
            optimizer.zero_grad()

        # Only track gradients when training; evaluation needs no autograd graph.
        with torch.set_grad_enabled(train):
            # Get predictions
            output_belief, output_affinities = net(images)
            # Balanced mse loss for belief maps plus the one for affinity maps
            loss = get_bmse_loss(output_belief, target_belief)
            loss += get_bmse_loss(output_affinities, target_affinity)

        # Update weights
        if train:
            loss.backward()
            # Gradient clipping over the model's own parameters (those without
            # a gradient are skipped by clip_grad_norm_), so this does not rely
            # on a notebook-level iterator that may already be consumed.
            nn.utils.clip_grad_norm_(net.parameters(), max_norm=MAX_NORM, norm_type=NORM_TYPE)
            optimizer.step()

        loss_value = loss.item()

        # Write to files
        with open(OUT_FLDR + namefile, 'a') as file:
            file.write('{}, {},{:.15f}\n'.format(epoch, batch_idx, loss_value))

        # Print results
        if batch_idx % LOG_INTERVAL == 0:
            print('{} Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.15f}'.format(
                phase, epoch, batch_idx * len(images), len(loader.dataset),
                100. * batch_idx / len(loader), loss_value))


In [None]:
#@title Run training over all epochs 

print ("Start:" , datetime.datetime.now().time())
for epoch in range(1, EPOCHS + 1):

    if trainingdata is not None:
        _run_network(epoch, trainingdata)

    if not DATA_PATH_TEST == "":
        _run_network(epoch, testingdata, train=False)
        if DATA_PATH == "":
            break # lets get out of this if we are only testing

    # Saving stays best-effort (a failed write must not abort a long run), but
    # the failure is reported instead of being silently swallowed, and
    # KeyboardInterrupt is no longer caught.
    checkpoint_path = '{}/net_{}_{}.pth'.format(OUT_FLDR, NAME_FILE, epoch)
    try:
        torch.save(net.state_dict(), checkpoint_path)
    except Exception as err:
        print('WARNING: could not save checkpoint {}: {}'.format(checkpoint_path, err))

    scheduler.step()

print ("End:" , datetime.datetime.now().time())