<a href="https://colab.research.google.com/github/mobarakol/tutorial_notebooks/blob/main/DINOv2_Encoder_Downstream_Tasks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

DINOv2 for Classification:

In [None]:
from transformers import AutoImageProcessor, Dinov2Model
import torch
import torch.nn as nn
from PIL import Image
import requests

class DINO_Classification(nn.Module):
    def __init__(self, number_classes=1000):
        super(DINO_Classification, self).__init__()
        self.dinov2 = Dinov2Model.from_pretrained("facebook/dinov2-base")
        self.classifier = nn.Linear(self.dinov2.config.hidden_size * 2, number_classes)

    def forward(self, input):
        outputs = self.dinov2(input)
        sequence_output = outputs[0]  # batch_size, sequence_length, hidden_size
        cls_token = sequence_output[:, 0]
        patch_tokens = sequence_output[:, 1:]
        linear_input = torch.cat([cls_token, patch_tokens.mean(dim=1)], dim=1)
        logits = self.classifier(linear_input)

        return logits

url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
image = Image.open(requests.get(url, stream=True).raw)

image_processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")
model = DINO_Classification(number_classes=20)

inputs = image_processor(image, return_tensors="pt")

with torch.no_grad():
    logits = model(inputs['pixel_values'])

print('logits:', logits.shape)

logits: torch.Size([1, 20])


DINOv2 for Regression:

In [1]:
from transformers import AutoImageProcessor, Dinov2Model
import torch
import torch.nn as nn
from PIL import Image
import requests

class DINO_Regression(nn.Module):
    def __init__(self, ):
        super(DINO_Regression, self).__init__()
        self.dinov2 = Dinov2Model.from_pretrained("facebook/dinov2-base")
        self.regressor = nn.Sequential(
            nn.Linear(self.dinov2.config.hidden_size * 2, 1),
            nn.Sigmoid()
        )

    def forward(self, input):
        outputs = self.dinov2(input)
        sequence_output = outputs[0]  # batch_size, sequence_length, hidden_size
        cls_token = sequence_output[:, 0]
        patch_tokens = sequence_output[:, 1:]
        linear_input = torch.cat([cls_token, patch_tokens.mean(dim=1)], dim=1)
        logits = self.regressor(linear_input)

        return logits

url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
image = Image.open(requests.get(url, stream=True).raw)

image_processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")
model = DINO_Regression()

inputs = image_processor(image, return_tensors="pt")

with torch.no_grad():
    logits = model(inputs['pixel_values'])

print('logits:', logits)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/436 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

logits: tensor([[0.2030]])


Feature Extraction:

In [2]:
from transformers import AutoImageProcessor, Dinov2Model
import torch
import torch.nn as nn
from PIL import Image
import requests

class DINO_Regression(nn.Module):
    def __init__(self, ):
        super(DINO_Regression, self).__init__()
        self.dinov2 = Dinov2Model.from_pretrained("facebook/dinov2-base")
        self.regressor = nn.Sequential(
            nn.Linear(self.dinov2.config.hidden_size * 2, 1),
            nn.Sigmoid()
        )

    def forward(self, input):
        outputs = self.dinov2(input)
        sequence_output = outputs[0]  # batch_size, sequence_length, hidden_size
        # cls_token = sequence_output[:, 0]
        # patch_tokens = sequence_output[:, 1:]
        # linear_input = torch.cat([cls_token, patch_tokens.mean(dim=1)], dim=1)
        # logits = self.regressor(linear_input)

        return sequence_output

url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
image = Image.open(requests.get(url, stream=True).raw)

image_processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")
model = DINO_Regression()

inputs = image_processor(image, return_tensors="pt")

with torch.no_grad():
    features = model(inputs['pixel_values'])

print('logits:', features.shape)

logits: torch.Size([1, 257, 768])


In [4]:
dinov2 = Dinov2Model.from_pretrained("facebook/dinov2-base")

In [5]:
dinov2

Dinov2Model(
  (embeddings): Dinov2Embeddings(
    (patch_embeddings): Dinov2PatchEmbeddings(
      (projection): Conv2d(3, 768, kernel_size=(14, 14), stride=(14, 14))
    )
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (encoder): Dinov2Encoder(
    (layer): ModuleList(
      (0-11): 12 x Dinov2Layer(
        (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (attention): Dinov2Attention(
          (attention): Dinov2SelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (output): Dinov2SelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
        )
        (layer_scale1): Dinov2LayerScale()
        (drop_path): Ide

In [16]:
dinov2.layernorm

LayerNorm((768,), eps=1e-06, elementwise_affine=True)

In [14]:
dinov2.encoder.layer[0]

Dinov2Layer(
  (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
  (attention): Dinov2Attention(
    (attention): Dinov2SelfAttention(
      (query): Linear(in_features=768, out_features=768, bias=True)
      (key): Linear(in_features=768, out_features=768, bias=True)
      (value): Linear(in_features=768, out_features=768, bias=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (output): Dinov2SelfOutput(
      (dense): Linear(in_features=768, out_features=768, bias=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
  )
  (layer_scale1): Dinov2LayerScale()
  (drop_path): Identity()
  (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
  (mlp): Dinov2MLP(
    (fc1): Linear(in_features=768, out_features=3072, bias=True)
    (activation): GELUActivation()
    (fc2): Linear(in_features=3072, out_features=768, bias=True)
  )
  (layer_scale2): Dinov2LayerScale()
)

In [21]:
class DINO_Custom(nn.Module):
    def __init__(self, ):
        super(DINO_Custom, self).__init__()
        dinov2 = Dinov2Model.from_pretrained("facebook/dinov2-base")
        self.embeddings = dinov2.embeddings
        self.encoder = dinov2.encoder
        self.layernorm = dinov2.layernorm


    def forward(self, input):
        # outputs = self.dinov2(input)
        emb = self.embeddings(input)
        outputs = self.encoder.layer[0](emb)
        for t_layer in self.encoder.layer[0]:

        print(len(outputs), outputs[0].shape)
        sequence_output = outputs[0]
        outputs = self.layernorm(sequence_output)
        sequence_output = outputs[0]  # batch_size, sequence_length, hidden_size

        return sequence_output

url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
image = Image.open(requests.get(url, stream=True).raw)

image_processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")
model = DINO_Custom()

inputs = image_processor(image, return_tensors="pt")

with torch.no_grad():
    features = model(inputs['pixel_values'])

print('logits:', features.shape)

1 torch.Size([1, 257, 768])
logits: torch.Size([257, 768])
