# MS COCO: DINOv2 image embeddings for train2017

In [1]:
from torchvision import datasets
import torch
import numpy as np
import os

In [2]:
# Root of the local COCO copy. Both paths below are derived from it, so
# relocating the dataset only requires changing this one line.
coco_root = "/home/ubuntu/mmdetection_od/mmdetection/data/coco"
path2data = os.path.join(coco_root, "images", "train2017")
path2json = os.path.join(coco_root, "annotations", "instances_train2017.json")

In [3]:
# Build the COCO detection dataset from the image folder and the annotation file.
train_dataset = datasets.CocoDetection(
    root=path2data,
    annFile=path2json,
)

loading annotations into memory...
Done (t=19.11s)
creating index...
index created!


In [4]:
# Display the dataset summary (number of datapoints and root location).
train_dataset

Dataset CocoDetection
    Number of datapoints: 118287
    Root location: ../CLIP/data/coco/images/train2017

In [5]:
# Filter the dataset to only include images with annotations.
# The in-memory COCO annotation index is queried directly: indexing
# train_dataset[idx] would open and decode every image file just to look at
# the length of its annotation list.
annotated_images_indices = [
    idx
    for idx, image_id in enumerate(train_dataset.ids)
    # getAnnIds returns the ids of all annotations attached to this image
    if len(train_dataset.coco.getAnnIds(imgIds=image_id)) > 0
]

In [6]:
# Number of images that have at least one annotation.
len(annotated_images_indices)

117266

In [7]:
# Write the kept dataset indices to a text file, one index per line.
with open('annotated_train_images_indices_2017.txt', 'w') as f:
    f.writelines(f"{idx}\n" for idx in annotated_images_indices)

In [8]:
# View over train_dataset restricted to the images that have annotations.
filtered_train_dataset = torch.utils.data.Subset(
    dataset=train_dataset,
    indices=annotated_images_indices,
)

In [9]:
# Size of the filtered subset; it should equal the number of kept indices.
len(filtered_train_dataset)

117266

In [10]:
# Load the 'dinov2_vitl14' entry point from the facebookresearch/dinov2 hub repo
# (the hub cache is reused when present, as the output below reports).
dinov2_vitl14 = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitl14')

Using cache found in /home/ubuntu/.cache/torch/hub/facebookresearch_dinov2_main
xFormers not available
xFormers not available


In [11]:
# Generic alias used by the feature-extraction code below.
model = dinov2_vitl14

In [12]:
# Prefer the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [13]:
# Move the model to the selected device and switch it to evaluation mode, as
# it is only used for inference. The printed architecture shows Dropout(p=0.0),
# so this is a safeguard rather than a change in results. Both calls return the
# module, so the cell still displays its repr.
model.to(device).eval()

DinoVisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
    (norm): Identity()
  )
  (blocks): ModuleList(
    (0-23): 24 x NestedTensorBlock(
      (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
      (attn): MemEffAttention(
        (qkv): Linear(in_features=1024, out_features=3072, bias=True)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=1024, out_features=1024, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): LayerScale()
      (drop_path1): Identity()
      (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=1024, out_features=4096, bias=True)
        (act): GELU(approximate='none')
        (fc2): Linear(in_features=4096, out_features=1024, bias=True)
        (drop): Dropout(p=0.0, inplace=False)
      )
      (ls2): LayerScale()
      (drop_path2): Identity()
    )


In [14]:
from PIL import Image
import torchvision.transforms as transforms

# Shared preprocessing transform; created on first use and then reused so it is
# not rebuilt for every image passed to extract_features.
_preprocess_pipeline = None


def _get_preprocess_pipeline():
    """Return the shared preprocessing transform, building it on first call."""
    global _preprocess_pipeline
    if _preprocess_pipeline is None:
        _preprocess_pipeline = transforms.Compose([
            transforms.Resize(224),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
        ])
    return _preprocess_pipeline


def extract_features(img):
    """Compute the model's feature vector for a single image.

    Args:
        img: Image object providing ``convert("RGB")`` (a PIL image in this
            notebook).

    Returns:
        The first (and only) row of the model output for the one-image batch,
        with singleton dimensions squeezed out. The tensor is left on
        ``device``; callers move it elsewhere if needed.
    """
    # Convert images to RGB. This is important
    # as the model was trained on RGB images.
    image = img.convert("RGB")

    x = _get_preprocess_pipeline()(image).to(device)

    # Inference only: skip autograd bookkeeping even when the caller did not
    # wrap the call in torch.no_grad() itself.
    with torch.no_grad():
        features = model(x.unsqueeze(0))  # unsqueeze adds the batch dimension

    return torch.squeeze(features[0])

# Compute embeddings for the train split

In [15]:
from tqdm import tqdm

image_features = []
image_file_names = []

# One pass over the annotated subset, storing one embedding and one file name
# per image. Gradients are never needed, so the whole loop runs under no_grad.
with torch.no_grad():
    for idx in tqdm(range(len(filtered_train_dataset))):
        img, target = filtered_train_dataset[idx]

        # move each embedding to the CPU before storing it
        image_features.append(extract_features(img).to('cpu'))

        # the first annotation carries the image id used to look up the file name
        image_id = target[0]['image_id']
        image_info = train_dataset.coco.loadImgs(image_id)[0]
        image_file_names.append(image_info['file_name'])


100%|██████████| 117266/117266 [2:28:19<00:00, 13.18it/s] 


image_features

In [16]:
# Shape of a single image embedding.
image_features[0].shape

torch.Size([1024])

In [17]:
# Number of embeddings collected (one per image in the filtered subset).
len(image_features)

117266

In [19]:
# Convert every embedding tensor into a NumPy array so the list can be saved with np.save.
image_features_cpu = []
for feature_tensor in image_features:
    image_features_cpu.append(feature_tensor.cpu().numpy())

# Save features

In [None]:
# Save features in the correct results folder: <scan>/results/<dataset>/pretext
dataset = 'mscoco'
results_folder = os.path.join('/home/ubuntu/master_thesis/covering_lens/TypiClust/scan', 'results')
results_folder_dataset = os.path.join(results_folder, dataset)
results_folder_dataset_pretext = os.path.join(results_folder_dataset, 'pretext')

# makedirs creates every missing intermediate directory in one call, and
# exist_ok=True removes the separate check-then-create step for each level.
os.makedirs(results_folder_dataset_pretext, exist_ok=True)

# np.save appends the '.npy' extension to the file name.
np.save(os.path.join(results_folder_dataset_pretext, 'features_seed1_dinov2'), image_features_cpu)

In [22]:

# Reload the saved array from the folder it was written to. The path is built
# from results_folder_dataset_pretext rather than repeated as a literal, so the
# save and load locations cannot drift apart.
check_emb = np.load(os.path.join(results_folder_dataset_pretext, 'features_seed1_dinov2.npy'))

In [24]:
# Shape of the reloaded array: (number of images, embedding size).
check_emb.shape

(117266, 1024)

In [26]:
# Number of recorded file names; expected to match the number of embeddings.
len(image_file_names)

117266

In [27]:
# Store the image file names next to the features, one name per line, in the
# same order in which the embeddings were collected.
with open(os.path.join(results_folder_dataset_pretext, 'filenames.txt'), 'w') as f:
    f.writelines(file_name + '\n' for file_name in image_file_names)

: 