### **Applying LOST to segment the first video frame**

To implement LOST (https://arxiv.org/pdf/2109.14279.pdf), we adapted the authors' code, which is available on GitHub: https://github.com/valeoai/LOST.

First, we apply LOST to the first frame of every video in the DAVIS 2017 dataset.

Then, to improve the masks, we apply the CRF post-processing step from the paper https://arxiv.org/pdf/1210.5644.pdf, adapting the code from the following GitHub repository: https://github.com/lucasb-eyer/pydensecrf.

In [None]:
# Clone the LOST repository into the current working directory
# (git refuses, without side effects, if the 'LOST' directory already exists).
!git clone https://github.com/valeoai/LOST

fatal: destination path 'LOST' already exists and is not an empty directory.


In [None]:
# Clone the DINO repository into the current working directory
# (git refuses, without side effects, if the 'dino' directory already exists).
!git clone https://github.com/facebookresearch/dino.git

fatal: destination path 'dino' already exists and is not an empty directory.


In [None]:
# Install pydensecrf straight from its GitHub repository; it provides the
# `pydensecrf.densecrf` module used for the CRF post-processing step.
!pip install git+https://github.com/lucasb-eyer/pydensecrf.git

Collecting git+https://github.com/lucasb-eyer/pydensecrf.git
  Cloning https://github.com/lucasb-eyer/pydensecrf.git to /tmp/pip-req-build-idr_3azz
  Running command git clone -q https://github.com/lucasb-eyer/pydensecrf.git /tmp/pip-req-build-idr_3azz
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
    Preparing wheel metadata ... done


In [None]:
# Access to dataset through Drive
import os
from google.colab import drive
# Mount Google Drive (forcing a fresh mount) and make the Drive root the
# working directory, so that relative paths resolve inside the Drive.
drive_mount_point = '/content/drive/'
drive_working_dir = '/content/drive/My Drive/'
drive.mount(drive_mount_point, force_remount=True)
os.chdir(drive_working_dir)

Mounted at /content/drive/


In [None]:
import sys
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import torch

In [None]:
# Entries placed at the front of sys.path, listed in slot order.
# Slots 0-1 are the cloned code repositories (so their modules are importable);
# slots 2-5 are data / output directories that other cells read back by index
# (sys.path[2] ... sys.path[5]), so the order here must not change.
_front_path_entries = (
    '/content/LOST',
    '/content/dino',
    '/content/drive/MyDrive/STCN/DAVIS/2017/trainval/JPEGImages/480p',
    '/content/drive/MyDrive/STCN/experiment/LOST',
    '/content/drive/MyDrive/STCN/DAVIS/2017/trainval/Annotations/480p',
    '/content/drive/MyDrive/STCN/STCN-LOST-CRF/Annotations',
)
for _slot, _entry in enumerate(_front_path_entries):
    sys.path.insert(_slot, _entry)

In [None]:
from object_discovery import patch_scoring
from skimage.transform import resize
import scipy
import itertools
import glob
import pydensecrf.densecrf as dcrf

In [None]:
# Model
from networks import get_model

patch_size = 16    # Patch side length handed to the model and used for padding
k_patches = 100    # Number of patches with the lowest degree considered

# Run on the GPU when one is visible to torch, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = get_model(
    "vit_base",
    patch_size=patch_size,
    resnet_dilate=0,
    device=device,
)

Since no pretrained weights have been provided, we load the reference pretrained DINO weights.
Pretrained weights found at dino_vitbase16_pretrain/dino_vitbase16_pretrain.pth and loaded with msg: <All keys matched successfully>


In [None]:
# Image transformation
from torchvision import transforms as pth_transforms

# Per-channel (R, G, B) mean and standard deviation used for normalisation
# (presumably the usual ImageNet statistics -- confirm against the model card)
_normalize_mean = (0.485, 0.456, 0.406)
_normalize_std = (0.229, 0.224, 0.225)

# Convert the image to a tensor, then normalise each channel
_transform_steps = [
    pth_transforms.ToTensor(),
    pth_transforms.Normalize(_normalize_mean, _normalize_std),
]
transform = pth_transforms.Compose(_transform_steps)

In [None]:
# Slightly modified LOST
# Instead of returning the bounding box predictions, it returns the mask
def modified_lost(feats, dims, scales, init_image_size, k_patches=100):
    """
    Select a seed patch and expand it into a per-patch similarity map.

    Inputs
        feats: the pixel/patch features of an image
        dims: dimension of the map from which the features are used
              (kept for signature compatibility; not used here)
        scales: from image to map scale
              (kept for signature compatibility; not used here)
        init_image_size: size of the image
              (kept for signature compatibility; not used here)
        k_patches: number of k patches retrieved that are compared to the seed at seed expansion
    Outputs
        M: for every patch, the sum of its similarities to the patches kept
           by the seed expansion (callers threshold it to obtain a mask)
        seed: selected patch corresponding to an object
    """
    # Pairwise similarity between all patch features, with size-1 dims dropped
    similarity = feats @ feats.transpose(1, 2)
    similarity = similarity.squeeze()

    # Patches ranked by the inverse degree centrality measure; the raw scores
    # returned alongside the ranking are not needed here
    ranked_patches, _ = patch_scoring(similarity)

    # The best-ranked patch is the initial seed
    seed = ranked_patches[0]

    # Seed expansion: among the first k_patches candidates, keep those that
    # are positively correlated with the seed
    candidates = ranked_patches[:k_patches]
    positively_similar = similarity[seed, candidates] > 0.0
    expanded_set = candidates[positively_similar]

    # Accumulate, per patch, the similarity to every patch of the expanded set
    M = similarity[expanded_set, :].sum(dim=0)

    return M, seed

In [None]:
# Video folder names
# Only the immediate sub-directories of the frames root (sys.path[2]) are
# needed, so take the first tuple produced by os.walk. The default passed to
# next() keeps `video_names` defined (as an empty list) when the root does not
# exist or cannot be listed, instead of leaving it unbound or stale.
_frames_root = sys.path[2]
_, video_names, _ = next(os.walk(_frames_root, topdown=True), (_frames_root, [], []))

# Sort so that the processing order does not depend on the filesystem listing
video_names = sorted(video_names)

print('Video folder names: ', video_names)

Video folder names:  ['bear', 'bike-packing', 'blackswan', 'bmx-bumps', 'bmx-trees', 'boat', 'boxing-fisheye', 'breakdance', 'breakdance-flare', 'bus', 'camel', 'car-roundabout', 'car-shadow', 'car-turn', 'cat-girl', 'classic-car', 'color-run', 'cows', 'crossing', 'dance-jump', 'dance-twirl', 'dancing', 'disc-jockey', 'dog', 'dog-agility', 'dog-gooses', 'dogs-jump', 'dogs-scale', 'drift-chicane', 'drift-straight', 'drift-turn', 'drone', 'elephant', 'flamingo', 'goat', 'gold-fish', 'hike', 'hockey', 'horsejump-high', 'horsejump-low', 'india', 'judo', 'kid-football', 'kite-surf', 'kite-walk', 'koala', 'lab-coat', 'lady-running', 'libby', 'lindy-hop', 'loading', 'longboard', 'lucia', 'mallard-fly', 'mallard-water', 'mbike-trick', 'miami-surf', 'motocross-bumps', 'motocross-jump', 'motorbike', 'night-race', 'paragliding', 'paragliding-launch', 'parkour', 'pigs', 'planes-water', 'rallye', 'rhino', 'rollerblade', 'schoolgirls', 'scooter-black', 'scooter-board', 'scooter-gray', 'sheep', 'shoo

In [None]:
# `import scipy` alone does not guarantee that the ndimage submodule is loaded
import scipy.ndimage

# Directories stored in sys.path by an earlier cell, given readable names
frames_root = sys.path[2]              # DAVIS JPEG frames, one folder per video
lost_output_root = sys.path[3]         # Where the LOST experiment results are written
gt_annotations_root = sys.path[4]      # DAVIS ground-truth annotations
stcn_annotations_root = sys.path[5]    # First-frame masks handed to STCN later

which_features = "k"    # possible choices : "q", "k", "v"


def _pad_to_patch_multiple(img, patch_size):
    """Zero-pad a [C, H, W] tensor (bottom/right) so H and W are multiples of patch_size."""
    channels, height, width = img.shape
    padded_height = int(np.ceil(height / patch_size) * patch_size)
    padded_width = int(np.ceil(width / patch_size) * patch_size)
    padded = torch.zeros((channels, padded_height, padded_width))
    padded[:, :height, :width] = img
    return padded


def _extract_patch_features(model, img, which_features="k"):
    """
    Return the q, k or v features of the last attention layer.

    Inputs
        model: network exposing `get_last_selfattention` and a
               `blocks[-1].attn.qkv` sub-module
        img: padded image tensor [3, H, W], already on the model's device
        which_features: "q", "k" or "v"
    Outputs
        feats: tensor [batch, tokens - 1, features]; the first token is
               dropped (presumably the class token -- confirm against the model)
    """
    feature_index = {"q": 0, "k": 1, "v": 2}
    if which_features not in feature_index:
        raise ValueError(
            "which_features must be 'q', 'k' or 'v', got %r" % (which_features,)
        )

    # Store the outputs of qkv layer from the last attention layer
    feat_out = {}

    def hook_fn_forward_qkv(module, inputs, output):
        feat_out["qkv"] = output

    qkv_layer = model._modules["blocks"][-1]._modules["attn"]._modules["qkv"]
    hook_handle = qkv_layer.register_forward_hook(hook_fn_forward_qkv)
    try:
        with torch.no_grad():
            # Forward pass in the model
            attentions = model.get_last_selfattention(img[None, :, :, :])

            # Dimensions
            nb_im = attentions.shape[0]      # Batch size
            nh = attentions.shape[1]         # Number of heads
            nb_tokens = attentions.shape[2]  # Number of tokens

            # Split the qkv output into (q/k/v, batch, head, token, head_dim)
            qkv = (
                feat_out["qkv"]
                .reshape(nb_im, nb_tokens, 3, nh, -1)
                .permute(2, 0, 3, 1, 4)
            )

            # Modality selection, then merge the heads back together
            selected = qkv[feature_index[which_features]]
            feats = selected.transpose(1, 2).reshape(nb_im, nb_tokens, -1)
            return feats[:, 1:, :]
    finally:
        # Always detach the hook, otherwise one more hook would stay attached
        # to the model for every processed image
        hook_handle.remove()


def _upsample_patch_mask(mask, patch_size, height, width):
    """
    Expand a per-patch boolean mask to pixel resolution.

    Each patch is repeated patch_size times along both axes (which gives the
    mask of the padded image) and the padding is then cropped away, so the
    result has shape (height, width) and stays boolean.
    """
    rows_expanded = np.repeat(mask, patch_size, axis=0)
    full_resolution = np.repeat(rows_expanded, patch_size, axis=1)
    return full_resolution[:height, :width]


def _red_mask_image(mask):
    """Return an RGB PIL image that is red where `mask` is True and black elsewhere."""
    canvas = np.zeros(mask.shape + (3,), dtype=np.uint8)
    canvas[mask, 0] = 255
    return Image.fromarray(canvas, 'RGB')


def _crf_refine(image, mask):
    """
    Refine a boolean object mask with a dense CRF.

    Inputs
        image: uint8 array (image_height, image_width, 3)
        mask: boolean array (image_height, image_width), True on the object
    Outputs
        boolean array (image_height, image_width), True where the most
        probable label after inference is the object
    """
    image_height, image_width = mask.shape

    # Unary potential
    # U[0, :, :] : channel of the background
    # U[1, :, :] : channel of the object
    U = np.ones((2, image_height, image_width), dtype=np.float32)
    U[0, :, :] = 10
    U[1, mask] = 100
    U = U / U.sum(0, keepdims=True)    # Probability of the classes at different positions

    d = dcrf.DenseCRF2D(image_width, image_height, 2)  # Width, height, nlabels

    U = -np.log(U)           # Minus log probability
    U = U.reshape((2, -1))   # Needs to be flat
    d.setUnaryEnergy(U)

    # This adds the color-independent term, features are the locations only
    d.addPairwiseGaussian(sxy=(3, 3), compat=3, kernel=dcrf.DIAG_KERNEL,
                          normalization=dcrf.NORMALIZE_SYMMETRIC)

    # This adds the color-dependent term, i.e. features are (x,y,r,g,b)
    d.addPairwiseBilateral(sxy=(40, 40), srgb=(13, 13, 13),
                           rgbim=np.ascontiguousarray(image),
                           compat=10,
                           kernel=dcrf.DIAG_KERNEL,
                           normalization=dcrf.NORMALIZE_SYMMETRIC)

    Q = d.inference(10)

    # Find out the most probable class for each pixel
    MAP = np.argmax(Q, axis=0)
    return MAP.reshape((image_height, image_width)) == 1


for video_name in video_names:
    image_path = os.path.join(frames_root, video_name, '00000.jpg')  # First frames in the folders
    with Image.open(image_path) as first_frame:
        image = np.array(first_frame.convert('RGB'))    # array (image_height, image_width, 3)
    image_height, image_width = image.shape[:2]

    output_dir = os.path.join(lost_output_root, video_name)
    os.makedirs(output_dir, exist_ok=True)

    # Apply LOST
    img = transform(image)
    init_image_size = img.shape    # tensor [3, image_height, image_width]

    # Padding the image with zeros to fit multiple of patch-size, then move it
    # to the device the model lives on
    img = _pad_to_patch_multiple(img, patch_size).to(device)

    # Size for transformers
    # NOTE: w_featmap counts patches along the height axis (img.shape[-2]) and
    # h_featmap along the width axis (img.shape[-1]); names kept as they were.
    w_featmap = img.shape[-2] // patch_size
    h_featmap = img.shape[-1] // patch_size

    feats = _extract_patch_features(model, img, which_features)

    # Scaling factor
    scales = [patch_size, patch_size]

    # Apply modified LOST
    M, seed = modified_lost(
        feats,
        [w_featmap, h_featmap],
        scales,
        init_image_size,
        k_patches=k_patches,
    )

    # Mask extraction

    # Reshape M
    correl = M.reshape(w_featmap, h_featmap).float()    # tensor [w_featmap, h_featmap]

    # Compute connected components
    labeled_array, num_features = scipy.ndimage.label(correl.cpu().numpy() > 0.0)

    # Find connected component corresponding to the initial seed
    cc = labeled_array[np.unravel_index(seed.cpu().numpy(), (w_featmap, h_featmap))]

    # Compute mask containing the seed
    mask = (labeled_array == cc)            # array (w_featmap, h_featmap)

    # Bring the patch-level mask to pixel resolution. The patch grid describes
    # the padded image, so upsample by the patch size and crop the padding
    # rather than stretching the grid onto the unpadded image.
    resized_mask = _upsample_patch_mask(mask, patch_size, image_height, image_width)

    # Colour the mask red
    red_mask = _red_mask_image(resized_mask)
    red_mask.save(os.path.join(output_dir, 'red_mask.png'))  # Save mask obtained by LOST

    # Apply CRF processing step
    reshaped_MAP = _crf_refine(image, resized_mask)

    # Colour the new mask red
    new_mask = _red_mask_image(reshaped_MAP)
    new_mask.save(os.path.join(output_dir, 'new_mask.png'))  # Save mask obtained by LOST+CRF

    stcn_dir = os.path.join(stcn_annotations_root, video_name)
    os.makedirs(stcn_dir, exist_ok=True)
    new_mask.save(os.path.join(stcn_dir, '00000.png'))  # Save mask for applying STCN later

    # Save original mask
    original_mask_path = os.path.join(gt_annotations_root, video_name, '00000.png')
    with Image.open(original_mask_path) as original_mask:
        original_mask.save(os.path.join(output_dir, 'original_mask.png'))

    # Save original image
    image = Image.fromarray(image, 'RGB')
    image.save(os.path.join(output_dir, 'image.jpg'))


  "See the documentation of nn.Upsample for details.".format(mode)
  "The default behavior for interpolate/upsample with float scale_factor changed "
