<a href="https://colab.research.google.com/github/mobarakol/tutorial_notebooks/blob/main/DINOv2_Encoder_Downstream_Tasks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
from transformers import Dinov2Model
import numpy as np

from transformers import AutoImageProcessor, Dinov2Model
import torch
import torch.nn as nn
from PIL import Image
import requests

class DINOEncoder(nn.Module):
    """PyTorch module for a DINOv2 encoder, similar to ResnetEncoder."""
    def __init__(self, pretrained=True):
        super(DINOEncoder, self).__init__()

        dinov2 = Dinov2Model.from_pretrained("facebook/dinov2-base" if pretrained else None)
        self.embeddings = dinov2.embeddings
        self.encoder = dinov2.encoder
        self.layernorm = dinov2.layernorm
        self.num_ch_enc = np.array([256,256,256,256,256])


    def forward(self, input_image):

        self.features = []

        x = self.embeddings(input_image)

        for i, layer in enumerate(self.encoder.layer):
            x = layer(x)[0]
            if i in [7, 8, 9, 10, 11]:
                # self.features.append(self.layernorm(x[:, 1:]))
                self.features.append(x[:, 1:].view(-1, 256, 24, 32))
                print(x[:, 1:].view(-1, 256, 24, 32).shape)


        return self.features


In [2]:
def transformation_from_parameters(axisangle, translation, invert=False):
    """Convert the network's (axisangle, translation) output into a 4x4 matrix
    """
    R = rot_from_axisangle(axisangle)
    t = translation.clone()

    if invert:
        R = R.transpose(1, 2)
        t *= -1

    T = get_translation_matrix(t)

    if invert:
        M = torch.matmul(R, T)
    else:
        M = torch.matmul(T, R)

    return M

In [3]:
def rot_from_axisangle(vec):
    """Convert an axisangle rotation into a 4x4 transformation matrix
    (adapted from https://github.com/Wallacoloo/printipi)
    Input 'vec' has to be Bx1x3
    """
    angle = torch.norm(vec, 2, 2, True)
    axis = vec / (angle + 1e-7)

    ca = torch.cos(angle)
    sa = torch.sin(angle)
    C = 1 - ca

    x = axis[..., 0].unsqueeze(1)
    y = axis[..., 1].unsqueeze(1)
    z = axis[..., 2].unsqueeze(1)

    xs = x * sa
    ys = y * sa
    zs = z * sa
    xC = x * C
    yC = y * C
    zC = z * C
    xyC = x * yC
    yzC = y * zC
    zxC = z * xC

    rot = torch.zeros((vec.shape[0], 4, 4)).to(device=vec.device)

    rot[:, 0, 0] = torch.squeeze(x * xC + ca)
    rot[:, 0, 1] = torch.squeeze(xyC - zs)
    rot[:, 0, 2] = torch.squeeze(zxC + ys)
    rot[:, 1, 0] = torch.squeeze(xyC + zs)
    rot[:, 1, 1] = torch.squeeze(y * yC + ca)
    rot[:, 1, 2] = torch.squeeze(yzC - xs)
    rot[:, 2, 0] = torch.squeeze(zxC - ys)
    rot[:, 2, 1] = torch.squeeze(yzC + xs)
    rot[:, 2, 2] = torch.squeeze(z * zC + ca)
    rot[:, 3, 3] = 1

    return rot
def get_translation_matrix(translation_vector):
    """Convert a translation vector into a 4x4 transformation matrix
    """
    T = torch.zeros(translation_vector.shape[0], 4, 4).to(device=translation_vector.device)

    t = translation_vector.contiguous().view(-1, 3, 1)

    T[:, 0, 0] = 1
    T[:, 1, 1] = 1
    T[:, 2, 2] = 1
    T[:, 3, 3] = 1
    T[:, :3, 3, None] = t

    return T

In [4]:
from __future__ import absolute_import, division, print_function

import torch
import torch.nn as nn
from collections import OrderedDict


class PoseDecoder(nn.Module):
    def __init__(self, num_ch_enc, num_input_features, num_frames_to_predict_for=None, stride=1):
        super(PoseDecoder, self).__init__()

        self.num_ch_enc = num_ch_enc
        self.num_input_features = num_input_features

        if num_frames_to_predict_for is None:
            num_frames_to_predict_for = num_input_features - 1
        self.num_frames_to_predict_for = num_frames_to_predict_for

        self.convs = OrderedDict()
        self.convs[("squeeze")] = nn.Conv2d(self.num_ch_enc[-1], 256, 1)
        self.convs[("pose", 0)] = nn.Conv2d(num_input_features * 256, 256, 3, stride, 1)
        self.convs[("pose", 1)] = nn.Conv2d(256, 256, 3, stride, 1)
        self.convs[("pose", 2)] = nn.Conv2d(256, 6 * num_frames_to_predict_for, 1)

        self.relu = nn.ReLU()

        self.net = nn.ModuleList(list(self.convs.values()))

    def forward(self, input_features):
        last_features = [f[-1] for f in input_features]

        cat_features = [self.relu(self.convs["squeeze"](f)) for f in last_features]
        cat_features = torch.cat(cat_features, 1)

        out = cat_features
        for i in range(3):
            out = self.convs[("pose", i)](out)
            if i != 2:
                out = self.relu(out)

        out = out.mean(3).mean(2)

        out = 0.001*out.view(-1, self.num_frames_to_predict_for, 1, 6)

        axisangle = out[..., :3]
        translation = out[..., 3:]

        return axisangle, translation



In [5]:
url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
image = Image.open(requests.get(url, stream=True).raw)

image_processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")
model_enc = DINOEncoder()
model_dec = PoseDecoder(model_enc.num_ch_enc,num_input_features=1,num_frames_to_predict_for=2)

inputs = image_processor([image,image], return_tensors="pt")
print(inputs['pixel_values'].shape)
with torch.no_grad():
    features = model_enc(inputs['pixel_values'])
    axisangle, translation = model_dec([features])
    print("axisangle",axisangle.shape)
    print("translation", translation.shape)
    print("final",transformation_from_parameters(axisangle[:, 0], translation[:, 0]).cpu().numpy())

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/436 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

torch.Size([2, 3, 224, 224])
torch.Size([2, 256, 24, 32])
torch.Size([2, 256, 24, 32])
torch.Size([2, 256, 24, 32])
torch.Size([2, 256, 24, 32])
torch.Size([2, 256, 24, 32])
axisangle torch.Size([2, 2, 1, 3])
translation torch.Size([2, 2, 1, 3])
final [[[ 1.0000000e+00  3.6350393e-05  4.5999255e-05 -1.2250722e-04]
  [-3.6350393e-05  1.0000000e+00  1.5550830e-04  6.0535578e-05]
  [-4.5999255e-05 -1.5550830e-04  1.0000000e+00  2.5301844e-05]
  [ 0.0000000e+00  0.0000000e+00  0.0000000e+00  1.0000000e+00]]

 [[ 1.0000000e+00  3.6350393e-05  4.5999255e-05 -1.2250722e-04]
  [-3.6350393e-05  1.0000000e+00  1.5550830e-04  6.0535578e-05]
  [-4.5999255e-05 -1.5550830e-04  1.0000000e+00  2.5301844e-05]
  [ 0.0000000e+00  0.0000000e+00  0.0000000e+00  1.0000000e+00]]]


Feature Extraction of two frames:

In [23]:
import torch
import torch.nn as nn
from transformers import Dinov2Model
import numpy as np

from transformers import AutoImageProcessor, Dinov2Model
import torch
import torch.nn as nn
from PIL import Image
import requests

class DINOEncoder(nn.Module):
    """PyTorch module for a DINOv2 encoder, similar to ResnetEncoder."""
    def __init__(self, pretrained=True):
        super(DINOEncoder, self).__init__()

        dinov2 = Dinov2Model.from_pretrained("facebook/dinov2-base")
        embeddings_weight = dinov2.embeddings.patch_embeddings.projection.weight
        embeddings_bias = dinov2.embeddings.patch_embeddings.projection.bias
        self.cls_token = dinov2.embeddings.cls_token
        self.projection = nn.Conv2d(6,768,kernel_size=(14, 14), stride=(14, 14))
        self.projection.weight = nn.Parameter(embeddings_weight.repeat(1,2,1,1))
        self.projection.bias = nn.Parameter(embeddings_bias)
        self.dropout = nn.Dropout(p=0.0, inplace=False)
        self.encoder = dinov2.encoder
        self.layernorm = dinov2.layernorm
        self.num_ch_enc = np.array([256,256,256,256,256])


    def forward(self, input_image):

        self.features = []
        batch_size = input_image.shape[0]

        x = self.projection(input_image).flatten(2).transpose(1, 2)
        x = self.dropout(x)
        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)


        for i, layer in enumerate(self.encoder.layer):
            x = layer(x)[0]
            if i in [7, 8, 9, 10, 11]:
                self.features.append(x[:, 1:].view(-1, 256, 24, 32))
                print(x[:, 1:].view(-1, 256, 24, 32).shape)


        return self.features

image_processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")
url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
image = Image.open(requests.get(url, stream=True).raw)
img1 = image_processor(image, return_tensors="pt")
img2 = image_processor(image, return_tensors="pt")
img_cat = torch.cat((img1['pixel_values'], img2['pixel_values']),dim=1)
print('input:', img_cat.shape)
model = DINOEncoder()
feat = model(img_cat)
print(feat[0].shape)

input: torch.Size([1, 6, 224, 224])
3 768 (14, 14) (14, 14)
torch.Size([1, 256, 24, 32])
torch.Size([1, 256, 24, 32])
torch.Size([1, 256, 24, 32])
torch.Size([1, 256, 24, 32])
torch.Size([1, 256, 24, 32])
torch.Size([1, 256, 24, 32])


196

In [44]:
out_emb torch.Size([1, 257, 768])
torch.Size([1, 256, 24, 32])
torch.Size([1, 256, 24, 32])
torch.Size([1, 256, 24, 32])
torch.Size([1, 256, 24, 32])
torch.Size([1, 256, 24, 32])

torch.Size([480, 640, 3])


TypeError: Cannot handle this data type: (1, 1, 6), |u1

In [28]:
image.shape

AttributeError: 'JpegImageFile' object has no attribute 'shape'