In [None]:
# Access to dataset through Drive.
# Mounts Google Drive inside the Colab VM and moves the working directory to the Drive root.
import os
from google.colab import drive
# force_remount=True is passed so the mount is redone even when this cell is re-run
drive.mount('/content/drive/', force_remount=True)
os.chdir('/content/drive/My Drive/')

Mounted at /content/drive/


In [None]:
%cd '/content/drive/MyDrive/STCN/STCN'

/content/drive/.shortcut-targets-by-id/1O_DxTckzGnTlprgYFmKup7Ct6cS1OJiW/STCN/STCN


In [None]:
import os
from os import path
import time
from argparse import ArgumentParser

import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
import numpy as np
from PIL import Image

from model.eval_network import STCN
from dataset.davis_test_dataset import DAVISTestDataset
from util.tensor_util import unpad
from inference_core import InferenceCore

from progressbar import progressbar

import tqdm

import glob
import json

import cv2
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from pycocotools.coco import COCO

import torch.optim as optim
from torch import nn, Tensor
from torch.utils.data import Dataset
import torchvision
import torchvision.transforms.functional as TF
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
import torchvision.models as models

import math

# Setting up the STCN model

In [None]:
# Authors' implementation of STCN on Davis2017
# https://github.com/hkchengrex/STCN/blob/main/eval_davis.py

# Arguments loading


def _str2bool(value):
    """Convert a command-line token into a real boolean.

    argparse's `type=bool` cannot be used for this: it calls bool() on the raw
    string, and bool('False') is True, so any non-empty value switches the option on.

    -- value : a bool (returned unchanged) or a string such as 'true'/'false',
               'yes'/'no', '1'/'0' (case-insensitive); an empty string means False
    Raises ValueError for anything else, which argparse reports as a usage error.
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    if text in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise ValueError('expected a boolean value, got {!r}'.format(value))


parser = ArgumentParser()
parser.add_argument('--model', default='/content/drive/MyDrive/STCN/STCN/saves/stcn.pth')
parser.add_argument('--davis_path', default='/content/drive/MyDrive/STCN/DAVIS/2017')
parser.add_argument('--output', default='/content/drive/MyDrive/STCN/experiment/Davis2017/Yujin/val_toast_fps') #saving_masks path
parser.add_argument('--split', help='val/testdev', default='val')
parser.add_argument('--top', type=int, default=20)
parser.add_argument('--amp', action='store_true')
parser.add_argument('--mem_every', default=5, type=int)
parser.add_argument('--include_last', help='include last frame as temporary memory?', action='store_true')
parser.add_argument('--visualisation', default=True, type=_str2bool) # Save the visualisation

# parse_known_args (rather than parse_args) tolerates the extra arguments that
# the notebook kernel puts in sys.argv; they end up in `unknown`.
args, unknown = parser.parse_known_args()

# Globals read by the rest of the notebook.
davis_path = args.davis_path
out_path = args.output
VIZ = args.visualisation

# Simple setup: make sure the folder that receives the predicted masks exists.
os.makedirs(out_path, exist_ok=True)

# The colour palette of one annotation PNG is reused when saving predicted masks.
# The context manager closes the image file once the palette has been read.
with Image.open(path.expanduser(davis_path + '/trainval/Annotations/480p/blackswan/00000.png')) as palette_image:
    palette = palette_image.getpalette()

# Loading Dataset for evaluation + Loading pretrained model

In [None]:
# Evaluation only: gradients are switched off globally from this cell onwards.
torch.autograd.set_grad_enabled(False)

# Setup Dataset: every supported split maps to its sub-folder and image-set file.
split_sources = {
    'val': ('/trainval', '2017/val.txt'),
    'testdev': ('/test-dev', '2017/test-dev.txt'),
}
if args.split not in split_sources:
    raise NotImplementedError
split_folder, split_imset = split_sources[args.split]
test_dataset = DAVISTestDataset(davis_path + split_folder, imset=split_imset)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, num_workers=1)


use_cuda = torch.cuda.is_available()
print('Using GPU' if use_cuda else 'Using CPU')


# Load our checkpoint
top_k = args.top
prop_model = STCN().cuda().eval()

# Performs input mapping such that stage 0 model can be loaded:
# a 4-channel first convolution of the value encoder is extended with one
# zero-filled input channel before the weights are handed to the model.
prop_saved = torch.load(args.model)
first_conv_key = 'value_encoder.conv1.weight'
if first_conv_key in prop_saved and prop_saved[first_conv_key].shape[1] == 4:
    first_conv = prop_saved[first_conv_key]
    pads = torch.zeros((64,1,7,7), device=first_conv.device)
    prop_saved[first_conv_key] = torch.cat([first_conv, pads], 1)
prop_model.load_state_dict(prop_saved)

Using GPU


Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth


  0%|          | 0.00/97.8M [00:00<?, ?B/s]

Downloading: "https://download.pytorch.org/models/resnet18-5c106cde.pth" to /root/.cache/torch/hub/checkpoints/resnet18-5c106cde.pth


  0%|          | 0.00/44.7M [00:00<?, ?B/s]

<All keys matched successfully>

# Utils functions

The `imshow` functions were mainly used to debug the process: they display an image from the tensor given as argument, either in colour (`imshow`) or in grayscale (`imshow_bw`, `imshow_bw_box`).

The `finding_bounding_box` function helps to filter the segmentation model output, so that the authors' results and ours can be compared correctly.

In [None]:
def imshow(img,title):
    """Display a colour image stored as a channel-first tensor (debug helper).

    -- img : tensor indexed (channel, row, column); it is transposed to
             (row, column, channel) before being given to matplotlib
    -- title : text shown above the figure
    """
    # .cpu() makes the helper usable with CUDA tensors: Tensor.numpy() only
    # works on tensors that live in host memory.
    npimg = img.detach().cpu().numpy()
    npimg = np.transpose(npimg,(1,2,0))

    plt.figure(figsize=(16,16))
    plt.imshow(npimg)
    plt.title(title)
    plt.show()

def imshow_bw(img,title):
    """Display a single-channel image in grayscale (debug helper).

    -- img : tensor passed to matplotlib without transposition, so it is
             expected to be indexed (row, column)
    -- title : text shown above the figure
    """
    # .cpu() makes the helper usable with CUDA tensors: Tensor.numpy() only
    # works on tensors that live in host memory.
    npimg = img.detach().cpu().numpy()

    plt.figure(figsize=(10,10))
    plt.imshow(npimg,cmap='gray')
    plt.title(title)
    plt.show()

def _first_run_extent(profile):
    """Locate the first contiguous run of positive entries in a 1-D tensor.

    Returns (start, end) as floats, or None when no entry is positive.
    `start` is the index of the first positive entry; `end` is the index of the
    first zero entry after `start`, or the last index if the scan reaches the
    end of `profile` first.
    """
    # One transfer per profile instead of one reduction + comparison per index.
    is_positive = (profile > 0).tolist()
    is_zero = (profile == 0).tolist()
    last = len(is_positive) - 1
    start = None
    for idx, positive in enumerate(is_positive):
        if start is None:
            if not positive:
                continue
            start = idx
        if is_zero[idx] or idx == last:
            return float(start), float(idx)
    return None


def finding_bounding_box(mask):
    """Compute one bounding box per object mask.

    input :
    -- mask : tensor of shape (D, L, W), one (rows x columns) mask per object
    output :
    -- float tensor of shape (D, 4) on the CPU, one [x0, y0, x1, y1] row per
       object. x/y are column/row indices. The box spans the FIRST contiguous
       run of non-empty rows and of non-empty columns (see _first_run_extent).
       An object whose mask has no positive pixel gets the box [0, 0, 0, 0]
       (previously this raised UnboundLocalError or silently reused the box
       of the preceding object).
    """
    D, L, W = mask.shape
    bounding_box = torch.zeros((D, 4), dtype=torch.float)
    for d in range(D):
        rows = _first_run_extent(mask[d].sum(dim=1))  # per-row pixel counts
        cols = _first_run_extent(mask[d].sum(dim=0))  # per-column pixel counts
        if rows is None or cols is None:
            continue  # empty mask: keep the all-zero box
        (y0, y1), (x0, x1) = rows, cols
        bounding_box[d] = torch.tensor([x0, y0, x1, y1])
    return bounding_box

def imshow_bw_box(img,box,title):
    """Display a grayscale image with a red rectangle drawn on top (debug helper).

    -- img : tensor passed to matplotlib without transposition, so it is
             expected to be indexed (row, column)
    -- box : sequence or tensor holding [x0, y0, x1, y1]
    -- title : text shown above the figure
    The box is also printed.
    """
    # .cpu() makes the helper usable with CUDA tensors: Tensor.numpy() only
    # works on tensors that live in host memory.
    npimg = img.detach().cpu().numpy()
    print(box)
    # Plain floats so matplotlib never receives (possibly CUDA) tensor scalars.
    x0, y0, x1, y1 = (float(box[i]) for i in range(4))
    plt.figure(figsize=(16,16))
    plt.imshow(npimg,cmap='gray')
    ax = plt.gca()
    rect = patches.Rectangle((x0,y0),x1-x0,y1-y0,linewidth=1, edgecolor='r', facecolor='none')
    ax.add_patch(rect)
    plt.title(title)
    plt.show()

# Function predicting_mask for MaskRCNN and PointRend

The two pretrained models have different output format, some small differences are in the two functions but the idea is the same.

* As input of the function we have :

1.   Output of segmentation model
2.   Bounding box of ground truth object for first frame
3.   Optional arguments (score acceptance rate, mask binarisation threshold)

We filter the output of the segmentation model as follows: for each ground-truth object, we compute the $L_2$ distance between its bounding box and every bounding box found by the segmentation model, and keep the mask of the closest prediction. The selected masks form the set of first-frame masks.

In [None]:
def _closest_prediction_indices(gt_boxes, pred_boxes, scores, acceptance_rate):
    """For every ground-truth box, pick the index of the nearest predicted box.

    input :
    -- gt_boxes : (D, 4) tensor of ground-truth boxes
    -- pred_boxes : (N, 4) tensor of predicted boxes
    -- scores : (N,) tensor of prediction confidences
    -- acceptance_rate : only predictions with score >= acceptance_rate compete
    output :
    -- (D,) long tensor of indices into the N predictions, or None when N == 0.
       "Nearest" is the smallest L2 distance between the two 4-vectors; the
       first prediction wins ties. If no prediction reaches the acceptance
       rate, index 0 is returned for every object.
    """
    if pred_boxes.shape[0] == 0:
        return None
    gt_boxes = gt_boxes.to(device=pred_boxes.device, dtype=pred_boxes.dtype)
    candidates = torch.nonzero(scores >= acceptance_rate).flatten()
    if candidates.numel() == 0:
        return torch.zeros(gt_boxes.shape[0], dtype=torch.long, device=pred_boxes.device)
    # (D, 1, 4) - (1, C, 4) -> (D, C, 4) -> (D, C) pairwise L2 distances
    distances = (gt_boxes[:, None, :] - pred_boxes[candidates][None, :, :]).norm(dim=2)
    return candidates[distances.argmin(dim=1)]


def predicting_mask_sync(prediction,bb_dataset,acceptance_rate=0.8,threshold_object = 0.5,display=False):
    '''
    input :
    -- prediction : output of Mask-R-CNN (list whose first item holds "scores", "masks", "boxes")
    -- bb_dataset : (D, 4) bounding boxes of the original GT objects, used to find the same objects in the Mask-R-CNN output
    -- acceptance_rate (default 0.8) : confidence level to allow an object to be found
    -- threshold_object (default 0.5) : threshold value for binary mask
    -- display (default False): currently unused, kept for backward compatibility
    output :
    -- binary float masks of shape (D, 1, H, W), one per GT object, on the same
       device as the predicted masks. All zeros when nothing was detected.
    '''
    detections = prediction[0]
    mask = detections["masks"]  # indexed (N, 1, H, W) below

    D = bb_dataset.shape[0]  # D = number of object in GT
    height, width = mask.shape[-2:]
    final_mask = torch.zeros((D, 1, height, width), dtype=torch.float, device=mask.device)

    best = _closest_prediction_indices(bb_dataset, detections["boxes"], detections["scores"], acceptance_rate)
    if best is not None:
        final_mask[:, 0] = (mask[best, 0] >= threshold_object).to(final_mask.dtype)
    return final_mask


def predicting_mask_sync_detectron2(prediction,bb_dataset,acceptance_rate=0.8,threshold_object = 0.5,display=False):
    '''
    input :
    -- prediction : output of Detectron2 (dict whose "instances" entry exposes pred_masks, scores, pred_boxes)
    -- bb_dataset : (D, 4) bounding boxes of the original GT objects, used to find the same objects in the Detectron2 output
    -- acceptance_rate (default 0.8) : confidence level to allow an object to be found
    -- threshold_object (default 0.5) : unused here, the predicted masks are used as they are
    -- display (default False): currently unused, kept for backward compatibility
    output :
    -- float masks of shape (D, 1, H, W), one per GT object, on the same device
       as the predicted masks. All zeros when nothing was detected.
    '''
    instances = prediction["instances"]
    mask = instances.pred_masks  # indexed (N, H, W) below
    # detectron2 wraps boxes in a Boxes object whose raw (N, 4) tensor is `.tensor`;
    # a plain tensor is accepted as well.
    boxes = getattr(instances.pred_boxes, "tensor", instances.pred_boxes)

    D = bb_dataset.shape[0]  # D = number of object in GT
    height, width = mask.shape[-2:]
    final_mask = torch.zeros((D, 1, height, width), dtype=torch.float, device=mask.device)

    best = _closest_prediction_indices(bb_dataset, boxes, instances.scores, acceptance_rate)
    if best is not None:
        final_mask[:, 0] = mask[best].to(final_mask.dtype)
    return final_mask

# Preprocessing functions

In [None]:
def preprocessing_maskrcnn(rgb,std,mean):
    """Turn the first frame of a video batch into a Mask R-CNN input.

    -- rgb : normalised frames; rgb[0,0] is taken as the first image of the video
             (assumes a (batch, time, channel, H, W) layout -- TODO confirm against the dataset)
    -- std, mean : normalisation constants to undo; they are moved to the
                   device of the frame, so it does not matter which
                   hyper-parameter cell defined them last
    Returns the un-normalised frame with a leading batch dimension, on the GPU
    when CUDA is available (otherwise it stays where it is).
    """
    first_frame = rgb[0,0] #to get first image of video
    first_frame = first_frame * std.to(first_frame.device) + mean.to(first_frame.device)
    first_frame = torch.unsqueeze(first_frame, dim=0)
    # An unconditional .cuda() would crash on a CPU-only runtime.
    if torch.cuda.is_available():
        first_frame = first_frame.cuda()
    return first_frame


def preprocessing_pointrend(rgb,std,mean):
    """Turn the first frame of a video batch into a PointRend (detectron2) input.

    -- rgb : normalised frames; rgb[0,0] is taken as the first image of the video
             (assumes a (batch, time, channel, H, W) layout -- TODO confirm against the dataset)
    -- std, mean : normalisation constants to undo; they are moved to the
                   device of the frame, so it does not matter which
                   hyper-parameter cell defined them last
    Returns a float32 numpy array indexed (row, column, channel), scaled by 255.
    NOTE(review): the channel order of `rgb` is passed through unchanged --
    confirm it matches the input format the detectron2 predictor expects.
    """
    first_frame = rgb[0,0] #to get first image of video
    first_frame = first_frame * std.to(first_frame.device) + mean.to(first_frame.device)
    first_frame = (first_frame * 255)
    first_frame = torch.permute(first_frame,(1,2,0))
    # .cpu() first: Tensor.numpy() only works on tensors in host memory.
    first_frame_int = first_frame.float().cpu().numpy()
    return first_frame_int

# Algorithm segmentation

In [None]:

def algo_maskrcnn(model,gt_mask,rgb,std,mean):
    """Produce the first-frame object masks with Mask R-CNN.

    -- model : Mask R-CNN model, called on the pre-processed first frame
    -- gt_mask : ground-truth annotation; gt_mask[:,0,0] is used only to get
                 the bounding boxes that select the matching predictions
    -- rgb, std, mean : forwarded to preprocessing_maskrcnn
    Returns the masks built by predicting_mask_sync, one per ground-truth object.
    """
    first_frame = preprocessing_maskrcnn(rgb,std,mean) #preprocessing first frame of video
    predict_mask = model(first_frame) #predicting objects segmentation
    # Uses the gt_mask argument (the function used to read the global `msk`, so it
    # only worked when called from a loop that happened to define that name).
    # The boxes are moved next to the predicted boxes they are compared with.
    bb_first_frame = finding_bounding_box(gt_mask[:,0,0]).to(predict_mask[0]["boxes"].device) #used to filter segmentation model result
    msk_p = predicting_mask_sync(predict_mask,bb_first_frame,acceptance_rate=0.01) #generate first frame masks
    return msk_p
    
def algo_pointrend(model,gt_mask,rgb,std,mean):
    """Produce the first-frame object masks with PointRend (detectron2).

    -- model : detectron2 predictor, called on the pre-processed first frame
    -- gt_mask : ground-truth annotation; gt_mask[:,0,0] is used only to get
                 the bounding boxes that select the matching predictions
    -- rgb, std, mean : forwarded to preprocessing_pointrend
    Returns the masks built by predicting_mask_sync_detectron2, one per ground-truth object.
    """
    first_frame = preprocessing_pointrend(rgb,std,mean) #preprocessing first frame of video
    predict_mask = model(first_frame) #predicting objects segmentation
    # Uses the gt_mask argument (the function used to read the global `msk`, so it
    # only worked when called from a loop that happened to define that name).
    # The boxes are moved next to the predictions they are compared with.
    bb_first_frame = finding_bounding_box(gt_mask[:,0,0]).to(predict_mask["instances"].scores.device) #used to filter segmentation model result
    msk_p = predicting_mask_sync_detectron2(predict_mask,bb_first_frame,acceptance_rate=0.01) #generate first frame masks
    return msk_p

# Evaluation on STCN model using pretrained Mask-R-CNN as 1st frame segmentation

## Calling pretrained models

In [None]:
# Pretrained Mask R-CNN (ResNet-50 FPN backbone) from torchvision, moved to
# `device` and switched to evaluation mode. Everything is chained into the
# assignment so the cell has no trailing expression: the very long model
# description is no longer dumped into the notebook output.
maskrcnn_resnet50_fpn = models.detection.maskrcnn_resnet50_fpn(pretrained=True).to(device).eval()

Downloading: "https://download.pytorch.org/models/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth" to /root/.cache/torch/hub/checkpoints/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth


  0%|          | 0.00/170M [00:00<?, ?B/s]

MaskRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(in

In [None]:
### Hyper parameters ###

# Option to generate video with mask (add more processing time).
# This overrides the value taken from the command-line arguments.
VIZ = False

# Normalisation constants used to undo the frame normalisation, one value per
# channel, reshaped to (3, 1, 1) and kept on the GPU.
mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1).cuda()
std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1).cuda()

# Folder that receives the visualisation frames and videos of this run.
output_path = '/content/drive/MyDrive/STCN/experiment/Davis2017/Yujin/val_22jan/'

In [None]:
### Start evaluation ###
# For every video of the test loader: get the first-frame masks from Mask R-CNN,
# let STCN propagate them through the video, save the predicted masks and,
# when VIZ is set, an overlay video.
total_process_time = 0
total_frames = 0

for data in test_loader:
    with torch.cuda.amp.autocast(enabled=args.amp):
        rgb = data['rgb'].cuda()
        msk = data['gt'][0].cuda() # original annotation
        info = data['info']
        name = info['name'][0]
        k = len(info['labels'][0])
        size = info['size_480p']

        #############################################################################
        ######################### SEGMENTATION ALGORITHM ############################
        #############################################################################
        # Runs before the timer starts: the reported time and FPS cover the STCN
        # propagation only, not the first-frame segmentation.
        msk_p = algo_maskrcnn(maskrcnn_resnet50_fpn,msk,rgb,std,mean)
        #############################################################################

        torch.cuda.synchronize()
        process_begin = time.time()

        processor = InferenceCore(prop_model, rgb, k, top_k=top_k, 
                        mem_every=args.mem_every, include_last=args.include_last)
        processor.interact(msk_p, 0, rgb.shape[1]) #msk_p changed here

        # Do unpad -> upsample to original size 
        out_masks = torch.zeros((processor.t, 1, *size), dtype=torch.uint8, device='cuda')
        for ti in range(processor.t):
            prob = unpad(processor.prob[:,ti], processor.pad)
            prob = F.interpolate(prob, size, mode='bilinear', align_corners=False)
            out_masks[ti] = torch.argmax(prob, dim=0)
        
        out_masks = (out_masks.detach().cpu().numpy()[:,0]).astype(np.uint8)

        torch.cuda.synchronize()
        total_process_time += time.time() - process_begin
        total_frames += out_masks.shape[0]

        # Save the results
        # NOTE(review): masks go to `out_path` (args.output), not to the per-run
        # `output_path`; any other run using the same args.output overwrites them.
        this_out_path = path.join(out_path, name)
        os.makedirs(this_out_path, exist_ok=True)
        for f in range(out_masks.shape[0]):
            img_E = Image.fromarray(out_masks[f])
            img_E.putpalette(palette)
            img_E.save(os.path.join(this_out_path, '{:05d}.png'.format(f)))


        # Adapted from the github of STM
        # https://github.com/seoungwugoh/STM/blob/master/eval_DAVIS.py
        if VIZ:
            import subprocess
            from helpers import overlay_davis
            # visualize results
            viz_path = os.path.join(output_path, name)
            os.makedirs(viz_path, exist_ok=True)

            for f in range(out_masks.shape[0]):
                im = rgb[0,f]
                im = im * std + mean
                pF = (im.permute(1,2,0).cpu().numpy() * 255.).astype(np.uint8)
                pE = out_masks[f]
                canvas = overlay_davis(pF, pE, palette)
                canvas = Image.fromarray(canvas)
                canvas.save(os.path.join(viz_path, 'f{}.jpg'.format(f)))

            vid_path = os.path.join(output_path, '{}.mp4'.format(name))
            frame_path = os.path.join(output_path, name, 'f%d.jpg')
            # Argument list instead of a shell string: paths are never re-parsed by
            # a shell. The encoding options are placed BEFORE the output file,
            # because ffmpeg applies an option to the next file named on the
            # command line; written after the output they are trailing options.
            ffmpeg_cmd = ['ffmpeg', '-y', '-nostats', '-loglevel', '0',
                          '-framerate', '10', '-i', frame_path,
                          '-vcodec', 'libx264', '-crf', '10', '-pix_fmt', 'yuv420p',
                          vid_path]
            try:
                # Best effort, as before: a failing ffmpeg does not stop the evaluation.
                subprocess.run(ffmpeg_cmd, check=False)
            except FileNotFoundError:
                print('ffmpeg not found, skipping the video of', name)


        del rgb
        del msk_p
        del processor

print('Total processing time: ', total_process_time)
print('Total processed frames: ', total_frames)
# Guard against an empty loader, where no time has been accumulated.
if total_process_time > 0:
    print('FPS: ', total_frames / total_process_time)

Total processing time:  117.11059904098511
Total processed frames:  1999
FPS:  17.06933459797615


# Implementing Detectron2


In [None]:
%cd /content/

/content


In [None]:
# install dependencies (pyyaml is pinned to 5.1)
!pip install pyyaml==5.1
# clone the v0.6 branch of the repo in order to access pre-defined configs in PointRend project
!git clone --branch v0.6 https://github.com/facebookresearch/detectron2.git detectron2_repo
# install detectron2 from source, in editable mode, from the clone made above
!pip install -e detectron2_repo

Collecting pyyaml==5.1
  Downloading PyYAML-5.1.tar.gz (274 kB)
[?25l[K     |█▏                              | 10 kB 37.3 MB/s eta 0:00:01[K     |██▍                             | 20 kB 39.1 MB/s eta 0:00:01[K     |███▋                            | 30 kB 44.6 MB/s eta 0:00:01[K     |████▉                           | 40 kB 28.9 MB/s eta 0:00:01[K     |██████                          | 51 kB 17.9 MB/s eta 0:00:01[K     |███████▏                        | 61 kB 18.7 MB/s eta 0:00:01[K     |████████▍                       | 71 kB 14.4 MB/s eta 0:00:01[K     |█████████▋                      | 81 kB 15.9 MB/s eta 0:00:01[K     |██████████▊                     | 92 kB 16.7 MB/s eta 0:00:01[K     |████████████                    | 102 kB 14.7 MB/s eta 0:00:01[K     |█████████████▏                  | 112 kB 14.7 MB/s eta 0:00:01[K     |██████████████▍                 | 122 kB 14.7 MB/s eta 0:00:01[K     |███████████████▌                | 133 kB 14.7 MB/s eta 0:00:01[

In [None]:
%cd /content/detectron2_repo/

/content/detectron2_repo


In [None]:
# You may need to restart your runtime prior to this, to let your installation take effect
# Some basic setup:
# Setup detectron2 logger
import detectron2
from detectron2.utils.logger import setup_logger
setup_logger()

# import some common libraries
import numpy as np
import cv2
import torch
from google.colab.patches import cv2_imshow

# import some common detectron2 utilities
from detectron2 import model_zoo
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2.utils.visualizer import Visualizer, ColorMode
from detectron2.data import MetadataCatalog
coco_metadata = MetadataCatalog.get("coco_2017_val")

# import PointRend project
from detectron2.projects import point_rend

# Evaluation on STCN model using pretrained PointRend as 1st frame segmentation

## Calling pretrained model

In [None]:
# Build the PointRend predictor used for the first-frame segmentation.
cfg = get_cfg()
# The PointRend-specific config entries are registered before the YAML file is merged.
point_rend.add_pointrend_config(cfg)
# Relative path: the working directory must be the root of the cloned detectron2 repository.
cfg.merge_from_file("./projects/PointRend/configs/InstanceSegmentation/pointrend_rcnn_R_50_FPN_3x_coco.yaml") 
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.01  # set threshold for this model (same value as the acceptance_rate passed in algo_pointrend)
cfg.MODEL.WEIGHTS = "detectron2://PointRend/InstanceSegmentation/pointrend_rcnn_R_50_FPN_3x_coco/164955410/model_final_edd263.pkl"

predictor = DefaultPredictor(cfg)

model_final_edd263.pkl: 241MB [00:11, 20.1MB/s]                           




In [None]:
### Hyper parameters ###

# Option to generate video with mask (add more processing time).
VIZ = False

# Normalisation constants used to undo the frame normalisation, one value per
# channel, reshaped to (3, 1, 1). They stay on the CPU in this section.
mean = torch.tensor([0.485, 0.456, 0.406]).reshape(3, 1, 1)
std = torch.tensor([0.229, 0.224, 0.225]).reshape(3, 1, 1)

# Folder that receives the visualisation frames and videos of this run.
output_path = '/content/drive/MyDrive/STCN/experiment/Davis2017/Yujin/val_pointrend_22jan/'

In [None]:
# For every video of the test loader: get the first-frame masks from PointRend,
# let STCN propagate them through the video, save the predicted masks and,
# when VIZ is set, an overlay video.
total_process_time = 0
total_frames = 0

for data in test_loader:
    with torch.cuda.amp.autocast(enabled=args.amp):
        rgb = data['rgb']
        msk = data['gt'][0].cuda() # original annotation
        info = data['info']
        name = info['name'][0]
        k = len(info['labels'][0])
        size = info['size_480p']

        #############################################################################
        ######################### SEGMENTATION ALGORITHM ############################
        #############################################################################
        # Runs before the timer starts: the reported time and FPS cover the STCN
        # propagation only, not the first-frame segmentation.
        msk_p = algo_pointrend(predictor,msk,rgb,std,mean)
        #############################################################################

        torch.cuda.synchronize()
        process_begin = time.time()

        processor = InferenceCore(prop_model, rgb, k, top_k=top_k, 
                        mem_every=args.mem_every, include_last=args.include_last)
        processor.interact(msk_p, 0, rgb.shape[1]) #msk_p changed here

        # Do unpad -> upsample to original size 
        out_masks = torch.zeros((processor.t, 1, *size), dtype=torch.uint8, device='cuda')
        for ti in range(processor.t):
            prob = unpad(processor.prob[:,ti], processor.pad)
            prob = F.interpolate(prob, size, mode='bilinear', align_corners=False)
            out_masks[ti] = torch.argmax(prob, dim=0)
        
        out_masks = (out_masks.detach().cpu().numpy()[:,0]).astype(np.uint8)

        torch.cuda.synchronize()
        total_process_time += time.time() - process_begin
        total_frames += out_masks.shape[0]

        # Save the results
        # NOTE(review): masks go to `out_path` (args.output), not to the per-run
        # `output_path`; any other run using the same args.output overwrites them.
        this_out_path = path.join(out_path, name)
        os.makedirs(this_out_path, exist_ok=True)
        for f in range(out_masks.shape[0]):
            img_E = Image.fromarray(out_masks[f])
            img_E.putpalette(palette)
            img_E.save(os.path.join(this_out_path, '{:05d}.png'.format(f)))


        # Adapted from the github of STM
        # https://github.com/seoungwugoh/STM/blob/master/eval_DAVIS.py
        if VIZ:
            import subprocess
            from helpers import overlay_davis
            # visualize results
            viz_path = os.path.join(output_path, name)
            os.makedirs(viz_path, exist_ok=True)

            for f in range(out_masks.shape[0]):
                im = rgb[0,f]
                im = im * std + mean
                pF = (im.permute(1,2,0).cpu().numpy() * 255.).astype(np.uint8)
                pE = out_masks[f]
                canvas = overlay_davis(pF, pE, palette)
                canvas = Image.fromarray(canvas)
                canvas.save(os.path.join(viz_path, 'f{}.jpg'.format(f)))

            vid_path = os.path.join(output_path, '{}.mp4'.format(name))
            frame_path = os.path.join(output_path, name, 'f%d.jpg')
            # Argument list instead of a shell string: paths are never re-parsed by
            # a shell. The encoding options are placed BEFORE the output file,
            # because ffmpeg applies an option to the next file named on the
            # command line; written after the output they are trailing options.
            ffmpeg_cmd = ['ffmpeg', '-y', '-nostats', '-loglevel', '0',
                          '-framerate', '10', '-i', frame_path,
                          '-vcodec', 'libx264', '-crf', '10', '-pix_fmt', 'yuv420p',
                          vid_path]
            try:
                # Best effort, as before: a failing ffmpeg does not stop the evaluation.
                subprocess.run(ffmpeg_cmd, check=False)
            except FileNotFoundError:
                print('ffmpeg not found, skipping the video of', name)


        del rgb
        del msk_p
        del processor

print('Total processing time: ', total_process_time)
print('Total processed frames: ', total_frames)
# Guard against an empty loader, where no time has been accumulated.
if total_process_time > 0:
    print('FPS: ', total_frames / total_process_time)


  max_size = (max_size + (stride - 1)) // stride * stride
  point_coords[:, :, 1] = h_step / 2.0 + (point_indices // W).to(torch.float) * h_step


Total processing time:  134.72681999206543
Total processed frames:  1999
FPS:  14.837431775779526
