# PASCAL VOC 

In [1]:
from torchvision import datasets
import torch
import numpy as np
from torch.utils.data import ConcatDataset
import os

In [2]:
# VOC2012 trainval detection split, read from the local copy (no download).
trainval_dataset_2012 = datasets.VOCDetection(
    root="/home/ubuntu/mmdetection_od/mmdetection/data",
    year="2012",
    image_set="trainval",
    download=False,
)

In [3]:
# Sanity check: number of samples in the VOC2012 trainval split.
len(trainval_dataset_2012)

11540

In [4]:
# VOC2007 trainval detection split, read from the local copy (no download).
trainval_dataset_2007 = datasets.VOCDetection(
    root="/home/ubuntu/mmdetection_od/mmdetection/data",
    year="2007",
    image_set="trainval",
    download=False,
)

In [5]:
# Sanity check: number of samples in the VOC2007 trainval split.
len(trainval_dataset_2007)

5011

In [6]:
# Merge both splits into one indexable dataset. The list order fixes the index
# layout: VOC2007 samples come first, followed by the VOC2012 samples. The
# features and filenames collected later follow this same order.
train_dataset = ConcatDataset([trainval_dataset_2007, trainval_dataset_2012])

In [7]:
# Sanity check: combined size should be the sum of the two splits above.
len(train_dataset)

16551

# DINOv2 

Preprocessing uses the ImageNet normalization constants (named as in timm):
* IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
* IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)
* These values are taken from the original GitHub repository.


In [8]:
# Load the `dinov2_vitl14` entry point of the facebookresearch/dinov2 repo via
# torch.hub. The repo code is reused from the local hub cache when present
# (see the "Using cache found in ..." message below).
dinov2_vitl14 = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitl14')

Using cache found in /home/ubuntu/.cache/torch/hub/facebookresearch_dinov2_main
xFormers not available
xFormers not available


In [9]:
# Generic alias used by the feature-extraction code below.
model = dinov2_vitl14

In [10]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [11]:
# Move the weights to the selected device and switch to evaluation mode: the
# model is only used for inference here, so train-only behaviour (dropout,
# stochastic depth, ...) must be disabled. `.eval()` returns the module itself,
# so the cell still displays the architecture.
model.to(device).eval()

DinoVisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
    (norm): Identity()
  )
  (blocks): ModuleList(
    (0-23): 24 x NestedTensorBlock(
      (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
      (attn): MemEffAttention(
        (qkv): Linear(in_features=1024, out_features=3072, bias=True)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=1024, out_features=1024, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): LayerScale()
      (drop_path1): Identity()
      (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=1024, out_features=4096, bias=True)
        (act): GELU(approximate='none')
        (fc2): Linear(in_features=4096, out_features=1024, bias=True)
        (drop): Dropout(p=0.0, inplace=False)
      )
      (ls2): LayerScale()
      (drop_path2): Identity()
    )


In [12]:
from PIL import Image
import torchvision.transforms as transforms

@torch.no_grad()
def extract_features(img):
    """Compute the model embedding for a single PIL image.

    The image is converted to RGB, resized and center-cropped to 224x224,
    normalized with the ImageNet mean/std and passed through the global
    `model` on the global `device` as a batch of one.

    Gradient tracking is disabled inside the function (via the decorator), so
    no autograd graph is built even if the caller forgets `torch.no_grad()`.

    Args:
        img: PIL image in any mode.

    Returns:
        The model output for this image with the batch dimension removed,
        still located on `device`.
    """
    # Convert images to RGB. This is important
    # as the model was trained on RGB images.
    image = img.convert("RGB")

    pipeline = transforms.Compose([
        transforms.Resize(224),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
    ])
    x = pipeline(image).to(device)

    # The model expects a batch dimension; add it, then take the single result.
    features = model(x.unsqueeze(0))

    return torch.squeeze(features[0])

# Train embeddings

In [13]:
from tqdm import tqdm

# Per-image embeddings and the matching annotation filenames, kept in the
# same order as `train_dataset`.
image_features = []
image_file_names = []

# Inference only: a single no-grad context around the whole pass.
with torch.no_grad():
    for sample_idx in tqdm(range(len(train_dataset))):
        # Each sample is an (image, annotation) pair.
        pil_image, voc_target = train_dataset[sample_idx]

        # Keep only a CPU copy of the embedding.
        embedding = extract_features(pil_image).to('cpu')
        image_features.append(embedding)
        image_file_names.append(voc_target['annotation']['filename'])

100%|██████████| 16551/16551 [20:18<00:00, 13.59it/s]


image_features

In [14]:
# Inspect the shape of a single per-image feature vector.
image_features[0].shape

torch.Size([1024])

In [15]:
# Sanity check: one feature vector per dataset sample is expected.
len(image_features)

16551

In [17]:
# Convert every stored feature tensor to a NumPy array so the collection can
# be written with np.save.
image_features_cpu = []
for feature_tensor in image_features:
    image_features_cpu.append(feature_tensor.cpu().numpy())

# Save features


In [None]:
# Save features in the correct results folder:
#   <scan>/results/<dataset>/pretext/features_seed1_dinov2.npy
dataset = 'pascalvoc'
results_folder = os.path.join('/home/ubuntu/master_thesis/covering_lens/TypiClust/scan', 'results')
results_folder_dataset = os.path.join(results_folder, dataset)
results_folder_dataset_pretext = os.path.join(results_folder_dataset, 'pretext')

# os.makedirs creates every missing parent directory, and exist_ok=True makes
# re-running the cell a no-op, so one call replaces the three separate
# "check, then create" steps (and avoids their check/create race).
os.makedirs(results_folder_dataset_pretext, exist_ok=True)

# np.save appends the ".npy" extension to the given path.
np.save(os.path.join(results_folder_dataset_pretext, 'features_seed1_dinov2'), image_features_cpu)

In [20]:
# Reload the saved features to verify the round trip. Read from the folder the
# save cell wrote to (results/<dataset>/pretext) rather than a hard-coded path
# for another dataset, so the check inspects the file that was just produced.
check_emb = np.load(os.path.join(results_folder_dataset_pretext, 'features_seed1_dinov2.npy'))

In [22]:
# Shape of the reloaded feature array.
check_emb.shape

(16551, 1024)

In [24]:
# Sanity check: number of collected filenames (expected to match the number of feature rows).
len(image_file_names)

16551

In [25]:
# Store the annotation filenames, one per line, next to the saved features;
# the line order matches the order of the feature rows.
filenames_path = os.path.join(results_folder_dataset_pretext, 'filenames.txt')
with open(filenames_path, 'w') as out_file:
    out_file.writelines(file_name + '\n' for file_name in image_file_names)

: 