In [1]:
from functools import partial
from pathlib import Path
from typing import Optional, Tuple

import cv2
import fire
import numpy as np
import torch
import torch.nn.functional as F
from accelerate import Accelerator
from PIL import Image
from scipy.sparse.linalg import eigsh
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.decomposition import PCA
from torchvision.utils import draw_bounding_boxes
from tqdm import tqdm

import extract_utils as utils

In [2]:
# Record which PyTorch build produced the outputs in this notebook
print(torch.__version__)

1.8.1


In [3]:
# Contains inputs to the extract_features method
class args:
    """Plain namespace holding the configuration used by the cells below.

    All filesystem locations are derived from `data_root`, and `output_dir`
    is derived from `model_name`, so switching the model automatically
    switches the feature cache directory (the extraction loop skips files
    that already exist, so sharing one directory between models would
    silently reuse stale features).
    """
    # Root of the VOC2012 data; edit this single line to relocate the dataset
    data_root = '/data/home/mukhotij/internship_2022/deep_spectral/data/VOC2012'
    # Backbone identifier passed to utils.get_model
    model_name = 'dino_vits16'
    # Text file with one image filename per line
    images_list = f'{data_root}/lists/images.txt'
    # Directory that contains the images themselves
    images_root = f'{data_root}/images'
    # Directory receiving one .pth feature file per image
    output_dir = f'{data_root}/features/{model_name}'
    # Images per batch handed to the DataLoader
    batch_size = 1
    # Index of the transformer block whose qkv projection is hooked (-1 = last)
    which_block = -1

In [4]:
# Create the output directory that the per-image feature files are later saved into.
# NOTE(review): presumably a no-op when the directory already exists — confirm in extract_utils.

utils.make_output_dir(args.output_dir)

In [5]:
# Load the backbone named by `args.model_name` (lower-cased) together with its
# preprocessing transform, patch size and number of attention heads.
# NOTE(review): anything `utils.get_model` does beyond returning these four values
# (e.g. switching the model to eval mode) is not visible here — confirm in extract_utils.

model_name = args.model_name.lower()
model, val_transform, patch_size, num_heads = utils.get_model(model_name)

Using cache found in /data/home/mukhotij/.cache/torch/hub/facebookresearch_dino_main


In [1]:
# Display the module tree of the loaded model.
# NOTE: requires the "Load model" cell to have run first; the NameError recorded as this
# cell's output comes from executing it on a fresh kernel (execution count 1).
model

NameError: name 'model' is not defined

In [6]:
# Register a forward hook that captures the qkv projection output of the selected
# transformer block on every forward pass (the same trick could be useful for CLIP).

if any(family in model_name for family in ('dino', 'mocov3')):
    # Filled in by the hook; read back after each forward pass
    feat_out = {}

    def hook_fn_forward_qkv(module, input, output):
        """Forward hook: stash the hooked layer's output under the "qkv" key."""
        feat_out["qkv"] = output

    # Attribute access resolves to the same registered submodules as `_modules[...]`
    model.blocks[args.which_block].attn.qkv.register_forward_hook(hook_fn_forward_qkv)
else:
    raise ValueError(model_name)

In [7]:
# Build the dataset from the image list file and wrap it in a DataLoader

filenames = Path(args.images_list).read_text().splitlines()  # one filename per line
dataset = utils.ImagesDataset(
    filenames=filenames,
    images_root=args.images_root,
    transform=val_transform,
)
dataloader = torch.utils.data.DataLoader(
    dataset,
    batch_size=args.batch_size,
    num_workers=8,
)

# Sanity check: report how many images and batches will be processed
print(f'Dataset size: {len(dataset)=}')
print(f'Dataloader size: {len(dataloader)=}')

Dataset size: len(dataset)=17125
Dataloader size: len(dataloader)=17125


In [8]:
# Preparing model: let Accelerate pick the execution device and move the model onto it.
# NOTE(review): the model is only moved with `.to(...)` and is not passed through
# `accelerator.prepare(...)`; whether `fp16=True` has any effect on the forward
# pass in this setup should be confirmed.

accelerator = Accelerator(fp16=True, cpu=False)
model = model.to(accelerator.device)

In [None]:
# Extract the key ("k") features of the hooked attention block for every image and
# store them in one .pth file per image.
# Note: the images are never resized/interpolated here — they are only cropped so that
# height and width are exact multiples of the patch size. A vision transformer can
# process any number of tokens, so the same approach may carry over to CLIP.

# `files[0]` / `indices[0]` and the one-file-per-iteration layout below assume exactly
# one image per batch; fail loudly instead of silently mislabelling larger batches.
if args.batch_size != 1:
    raise ValueError(f'Feature extraction expects batch_size=1, got {args.batch_size}')

# Loop-invariant check: only model families with the qkv hook registered are supported
if not ('dino' in model_name or 'mocov3' in model_name):
    raise ValueError(model_name)

pbar = tqdm(dataloader, desc='Processing')
for images, files, indices in pbar:
    output_dict = {}

    # One output file per image, named after the image file's stem
    # (named `image_id` rather than `id` so the builtin is not shadowed)
    image_id = Path(files[0]).stem
    output_file = Path(args.output_dir) / f'{image_id}.pth'

    # Skip images whose features were already extracted
    if output_file.is_file():
        pbar.write(f'Skipping existing file {str(output_file)}')
        continue

    # Patch-grid geometry (pay close attention to this part)
    P = patch_size  # 16 in the case of ViT-S/B-16
    B, C, H, W = images.shape  # batch size, channels, height, width (uncropped)
    H_patch, W_patch = H // P, W // P  # number of whole patches along height / width
    H_pad, W_pad = H_patch * P, W_patch * P  # largest patch-aligned height / width
    T = H_patch * W_patch + 1  # tokens through the network: patches + one [CLS] token

    # Possible ablation: bilinear interpolation instead of cropping
    # images = F.interpolate(images, size=(H_pad, W_pad), mode='bilinear')  # resize image
    # Simpler approach: slice off the bottom/right remainder so the image tiles exactly
    images = images[:, :, :H_pad, :W_pad]
    images = images.to(accelerator.device)

    # Forward pass. Its return value is not needed: the registered hook stores the qkv
    # projection in `feat_out`. Gradients are never used, so autograd is disabled to
    # avoid building (and holding in memory) a computation graph for every image.
    with torch.no_grad():
        model.get_intermediate_layers(images)

    # feat_out["qkv"]: (B, T, 3 * num_heads * d_head)
    #   reshape -> (B, T, 3, num_heads, d_head)   (-1 lets torch infer d_head)
    #   permute -> (3, B, num_heads, T, d_head)
    output_qkv = feat_out["qkv"].reshape(B, T, 3, num_heads, -1).permute(2, 0, 3, 1, 4)

    # For each of q/k/v: (B, num_heads, T, d_head) -> transpose -> (B, T, num_heads, d_head)
    # -> reshape -> (B, T, num_heads * d_head) -> drop the first ([CLS]) token -> (B, T-1, ...)
    # output_dict['q'] = output_qkv[0].transpose(1, 2).reshape(B, T, -1)[:, 1:, :]
    # Only the keys of the hooked attention layer are stored.
    # TODO: document why the keys in particular are used.
    output_dict['k'] = output_qkv[1].transpose(1, 2).reshape(B, T, -1)[:, 1:, :]
    # output_dict['v'] = output_qkv[2].transpose(1, 2).reshape(B, T, -1)[:, 1:, :]

    # Storing meta data
    output_dict['indices'] = indices[0]
    output_dict['file'] = files[0]
    output_dict['id'] = image_id
    output_dict['model_name'] = model_name
    output_dict['patch_size'] = patch_size
    output_dict['shape'] = (B, C, H, W)
    # Move every tensor to the CPU (detached) before serialising
    output_dict = {k: (v.detach().cpu() if torch.is_tensor(v) else v) for k, v in output_dict.items()}

    # Saving the output_dict in the file
    accelerator.save(output_dict, str(output_file))
    accelerator.wait_for_everyone()

Processing:  45%|████▌     | 7751/17125 [10:08<12:40, 12.32it/s] 

In [10]:
# Inspect the dataset index of the last batch seen by the extraction loop
# (loop variables stay in the notebook namespace after the loop ends)
indices[0]

tensor(17124)

## Random Tests

In [16]:
from torchvision import transforms 

def get_transform(name: str):
    """Build the image preprocessing transform for a given model name.

    Args:
        name: Model identifier; it must contain one of the supported family
            substrings ('dino', 'mocov3', 'convnext').

    Returns:
        A torchvision `Compose` that converts an image to a tensor and then
        normalizes it channel-wise.

    Raises:
        NotImplementedError: If `name` matches none of the supported families.
    """
    supported_families = ('dino', 'mocov3', 'convnext')
    if not any(family in name for family in supported_families):
        # Say which name was rejected instead of raising a bare exception
        raise NotImplementedError(
            f'No transform defined for model {name!r}; supported families: {supported_families}'
        )
    # Per-channel mean / std (the standard ImageNet statistics)
    normalize = transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    return transforms.Compose([transforms.ToTensor(), normalize])

In [18]:
# Build the preprocessing transform for the currently selected model.
# (`name` was never defined in this notebook; `model_name` from the "Load model" cell is used.)
val_transform = get_transform(model_name)

In [22]:
# Read the patch size and the number of attention heads directly off the loaded model.
# This rebinds the `patch_size` / `num_heads` names returned earlier by `utils.get_model`.
patch_size = model.patch_embed.patch_size
num_heads = model.blocks[0].attn.num_heads

In [11]:
# Display the module tree of the loaded model
model

VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 384, kernel_size=(16, 16), stride=(16, 16))
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (blocks): ModuleList(
    (0): Block(
      (norm1): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=384, out_features=1152, bias=True)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=384, out_features=384, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (drop_path): Identity()
      (norm2): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=384, out_features=1536, bias=True)
        (act): GELU()
        (fc2): Linear(in_features=1536, out_features=384, bias=True)
        (drop): Dropout(p=0.0, inplace=False)
      )
    )
    (1): Block(
      (norm1): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (