### **Applying LOST to predict a bounding-box for the first video frames**

To implement LOST (https://arxiv.org/pdf/2109.14279.pdf), we adapted the code released with the paper, which is available on the following GitHub page: https://github.com/valeoai/LOST.

First, we apply LOST to the first frame of each of 13 videos from the Something-Something dataset and predict a bounding box for each of them. The 13 videos are given at: https://github.com/joaanna/something_else.



In [None]:
!git clone https://github.com/valeoai/LOST

Cloning into 'LOST'...
remote: Enumerating objects: 83, done.[K
remote: Counting objects: 100% (83/83), done.[K
remote: Compressing objects: 100% (66/66), done.[K
remote: Total 83 (delta 31), reused 57 (delta 14), pack-reused 0[K
Unpacking objects: 100% (83/83), done.


In [None]:
!git clone https://github.com/facebookresearch/dino.git

Cloning into 'dino'...
remote: Enumerating objects: 168, done.[K
remote: Counting objects: 100% (104/104), done.[K
remote: Compressing objects: 100% (27/27), done.[K
remote: Total 168 (delta 88), reused 77 (delta 77), pack-reused 64[K
Receiving objects: 100% (168/168), 24.45 MiB | 33.20 MiB/s, done.
Resolving deltas: 100% (104/104), done.


In [None]:
# Access to dataset through Drive
import os
from google.colab import drive
# Mount Google Drive under /content/drive/ (force_remount=True asks Colab to
# remount even if a mount is already present).
drive.mount('/content/drive/', force_remount=True)
# Work from the Drive root so that relative paths resolve inside Drive.
# NOTE(review): the data paths inserted into sys.path further down use
# 'MyDrive' (no space) while this uses 'My Drive' -- confirm both spellings
# resolve to the same folder.
os.chdir('/content/drive/My Drive/')

Mounted at /content/drive/


In [None]:
import sys
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import torch

In [None]:
# Make the cloned LOST and DINO repositories importable.
sys.path.insert(0, '/content/LOST')
sys.path.insert(1, '/content/dino')
# The next two entries are data directories, not Python packages: later cells
# read sys.path[2] as the root of the input frames and sys.path[3] as the root
# of the output masks, so the insertion indices must not change.
sys.path.insert(2, '/content/drive/MyDrive/STCN/LOST-bounding-box/JPEGImages')
sys.path.insert(3, '/content/drive/MyDrive/STCN/LOST-bounding-box/Annotations')

In [None]:
from object_discovery import patch_scoring, detect_box
from skimage.transform import resize
import scipy

In [None]:
# Model
k_patches = 100    # Number of patches with the lowest degree considered
patch_size = 16    # Patch side in pixels; also used below to pad frames and to size the feature map
from networks import get_model
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Build the "vit_base" backbone on `device`. No weights are passed here; per the
# cell output, get_model then loads the reference pretrained DINO weights.
model = get_model("vit_base", patch_size = patch_size, resnet_dilate=0, device=device)

Since no pretrained weights have been provided, we load the reference pretrained DINO weights.
Pretrained weights found at dino_vitbase16_pretrain/dino_vitbase16_pretrain.pth and loaded with msg: <All keys matched successfully>


In [None]:
# Image transformation
from torchvision import transforms as pth_transforms
# Preprocessing applied to every frame before it is fed to the model:
# conversion to a tensor followed by per-channel normalization with the given
# means and standard deviations (presumably the ImageNet statistics the
# pretrained backbone expects -- TODO confirm).
transform = pth_transforms.Compose(
    [
        pth_transforms.ToTensor(),
        pth_transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    ]
)

In [None]:
# LOST: bounding-box predictions
def lost(feats, dims, scales, init_image_size, k_patches=100):
    """
    Localize one object in an image with the LOST seed-expansion procedure.

    Inputs
        feats: patch features of one image; axes 1 and 2 are transposed here,
            so the tensor is at least 3-D (assumed [1, n_patches, feat_dim]
            -- TODO confirm against the caller)
        dims: dimensions of the feature map, forwarded to detect_box
        scales: feature-map-to-image scale factors, forwarded to detect_box
        init_image_size: size of the image before padding; its first entry is
            dropped before it is forwarded to detect_box
        k_patches: number of top-ranked patches compared to the seed during
            seed expansion
    Outputs
        pred: box prediction returned by detect_box, as a numpy array
        A: patch-to-patch similarity matrix (pairwise dot products of feats)
        scores: per-patch scores returned by patch_scoring
        seed: first patch of the ranking, used as the initial seed
    """
    # Pairwise dot-product similarity between all patch features
    affinity = torch.matmul(feats, feats.transpose(1, 2)).squeeze()

    # Rank the patches; the first one of the ranking is the seed
    ranking, scores = patch_scoring(affinity)
    seed = ranking[0]

    # Seed expansion: among the first k_patches of the ranking, keep those
    # with a strictly positive similarity to the seed
    candidates = ranking[:k_patches]
    seed_similarity = affinity[seed, candidates]
    kept = candidates[seed_similarity > 0.0]

    # Accumulate the similarity rows of the kept patches into a single map
    box_map = affinity[kept, :].sum(dim=0)

    # Box extraction (the second value returned by detect_box is not used)
    box, _ = detect_box(
        box_map,
        seed,
        dims,
        scales=scales,
        initial_im_size=init_image_size[1:],
    )

    return np.asarray(box), affinity, scores, seed

In [None]:
# Video folder names: the immediate sub-directories of the frame root
# (sys.path[2]). Only the first os.walk entry (the root itself) is needed.
# The default passed to next() keeps video_names defined, as an empty list,
# when the walk yields nothing (e.g. the root directory does not exist),
# instead of failing later with a NameError.
video_names = next(os.walk(sys.path[2], topdown=True), (None, [], None))[1]

print('Video folder names: ', video_names)

Video folder names:  ['44862', '57082', '2', '151201', '4', '80962', '3201', '862', '77005', '13201', '2003', '6981', '22983']


In [None]:
# Run LOST on the first frame ('0001.jpg') of every video and save the
# predicted box as a filled red rectangle in a mask image stored under the
# annotation root (sys.path[3]), one sub-folder per video.

# Output of the qkv layer of the last attention block; overwritten by the
# hook on every forward pass.
feat_out = {}


def hook_fn_forward_qkv(module, inputs, output):
    """Forward hook: keep the latest output of the hooked qkv layer."""
    feat_out["qkv"] = output


# Register the hook once, outside the loop. Registering it per video would
# leave one more hook attached to the model after every processed frame.
qkv_hook = (
    model._modules["blocks"][-1]
    ._modules["attn"]
    ._modules["qkv"]
    .register_forward_hook(hook_fn_forward_qkv)
)

which_features = "k"    # possible choices : "q", "k", "v"

# Scaling factor from feature-map cells to image pixels
scales = [patch_size, patch_size]

try:
    for video_name in video_names:
        image_path = os.path.join(sys.path[2], video_name, '0001.jpg')  # First frames in the folders
        # Close the file promptly and force three channels, so that the
        # 3-channel normalization and the RGB mask below always apply.
        with Image.open(image_path) as frame:
            image = np.array(frame.convert('RGB'))
        image_size = image.shape                    # array (image_height, image_width, 3)

        # Apply LOST
        img = transform(image)
        init_image_size = img.shape    # tensor [3, image_height, image_width]
        im_name = '0001'

        # Padding the image with zeros to fit multiple of patch-size
        size_im = (
            img.shape[0],
            int(np.ceil(img.shape[1] / patch_size) * patch_size),
            int(np.ceil(img.shape[2] / patch_size) * patch_size),
        )
        paded = torch.zeros(size_im)
        paded[:, : img.shape[1], : img.shape[2]] = img
        # The model was created on `device`; the input has to live there too,
        # otherwise the forward pass fails whenever a GPU is in use.
        img = paded.to(device)

        # Size for transformers.
        # NOTE(review): w_featmap is derived from the height axis and
        # h_featmap from the width axis; the names are kept as they are
        # because this order is what gets passed on as `dims`.
        w_featmap = img.shape[-2] // patch_size
        h_featmap = img.shape[-1] // patch_size

        with torch.no_grad():
            # Forward pass in the model (this also triggers the qkv hook)
            attentions = model.get_last_selfattention(img[None, :, :, :])

            # Batch size, number of heads, number of tokens
            nb_im, nh, nb_tokens = attentions.shape[:3]

            # Extract the qkv features of the last attention layer.
            # The last dimension is inferred with -1 (the former
            # `-1 // nh` evaluates to -1 as well).
            qkv = (
                feat_out["qkv"]
                .reshape(nb_im, nb_tokens, 3, nh, -1)
                .permute(2, 0, 3, 1, 4)
            )
            # Merge the heads back: one [nb_im, nb_tokens, features] tensor
            # per modality
            q, k, v = (
                t.transpose(1, 2).reshape(nb_im, nb_tokens, -1)
                for t in (qkv[0], qkv[1], qkv[2])
            )

            # Modality selection; an unknown choice raises KeyError instead
            # of silently reusing features from a previous iteration.
            # The first token is skipped so that only patch tokens remain
            # (assumed to be the [CLS] token -- TODO confirm).
            feats = {"q": q, "k": k, "v": v}[which_features][:, 1:, :]

        # Bounding-box prediction
        pred, A, scores, seed = lost(
            feats,
            [w_featmap, h_featmap],
            scales,
            init_image_size,
            k_patches=k_patches,
        )

        # Bounding-box coordinates
        xmin, ymin, xmax, ymax = pred

        # Create a red bounding box on a black image of the frame's size
        mask = np.zeros_like(image)                  # array (image_height, image_width, 3)
        mask[ymin:ymax, xmin:xmax, 0] = 255
        mask = Image.fromarray(mask, 'RGB')

        # Save mask
        out_dir = os.path.join(sys.path[3], video_name)
        os.makedirs(out_dir, exist_ok=True)
        mask.save(os.path.join(out_dir, '0001.png'))
finally:
    # Detach the hook so that re-running this cell does not stack hooks
    qkv_hook.remove()


  "See the documentation of nn.Upsample for details.".format(mode)
  "The default behavior for interpolate/upsample with float scale_factor changed "
