In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import models
from timm.models import create_model

import cv2

In [2]:
# Build the ViT-S backbone with a 710-way output for 128-frame clips (16 * 8).
# Other constructor options (tubelet_size, drop_rate, drop_path_rate,
# attn_drop_rate, head_drop_rate, drop_block_rate, use_mean_pooling,
# init_scale, with_cp) are left at their defaults.
backbone = create_model(
    "vit_small_patch16_224",
    img_size=224,
    pretrained=False,
    num_classes=710,
    all_frames=16 * 8,
)

# Deserialize onto the CPU so loading does not depend on the device the
# checkpoint was saved from; the weights reach the GPU with `model.to` below.
# NOTE(review): torch.load unpickles the file -- only load trusted checkpoints.
load_dict = torch.load(
    "/data/ephemeral/home/VideoMAEv2/pths/vit_s_k710_dl_from_giant.pth",
    map_location="cpu",
)
backbone.load_state_dict(load_dict["module"])

# Stack a single-logit layer and a sigmoid on top of the 710 backbone outputs.
# NOTE: this nn.Linear is freshly (randomly) initialised and is not part of the
# checkpoint, so the score it yields is only a shape/plumbing check.
model = nn.Sequential(backbone, nn.Linear(710, 1), nn.Sigmoid())
model.to("cuda")
model.eval()

Sequential(
  (0): VisionTransformer(
    (patch_embed): PatchEmbed(
      (proj): Conv3d(3, 384, kernel_size=(2, 16, 16), stride=(2, 16, 16))
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (blocks): ModuleList(
      (0-11): 12 x Block(
        (norm1): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=384, out_features=1152, bias=False)
          (attn_drop): Dropout(p=0.0, inplace=False)
          (proj): Linear(in_features=384, out_features=384, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
        )
        (drop_path): Identity()
        (norm2): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
        (mlp): Mlp(
          (fc1): Linear(in_features=384, out_features=1536, bias=True)
          (act): GELU(approximate='none')
          (fc2): Linear(in_features=1536, out_features=384, bias=True)
          (drop): Dropout(p=0.0, inplace=False)
        )
      )
    )
    (norm): Ident

In [3]:
# Random stand-in for one 3-channel, 128-frame (16 * 8), 224x224 clip.
# It is sampled on the CPU, reported, then moved to the GPU.
test_t = torch.randn((1, 3, 16 * 8, 224, 224))
print(f"==>> test_t.shape: {test_t.shape}")
test_t = test_t.cuda()

==>> test_t.shape: torch.Size([1, 3, 128, 224, 224])


In [4]:
# Forward pass only: autograd is disabled, so no graph is recorded.
with torch.no_grad():
    output = model(test_t)

# Report the shape of the model's output for the dummy clip.
print(f"==>> output.shape: {output.shape}")

==>> output.shape: torch.Size([1, 1])


In [6]:
# Drop every size-1 dimension so the single score becomes a 0-d tensor.
output = output.squeeze()
print(
    f"==>> output: {output}",
    f"==>> output.shape: {output.shape}",
    sep="\n",
)

==>> output: 0.46524369716644287
==>> output.shape: torch.Size([])


In [7]:
# `.item()` converts the 0-d tensor into a plain Python number; show its type.
print(f"==>> type(output.item()): {type(output.item())}")

==>> type(output.item()): <class 'float'>


In [2]:
# Instantiate `vit_small_patch16_224` through timm's factory for 64-frame
# clips (16 * 4) with a 710-way output. Other constructor options
# (tubelet_size, drop_rate, drop_path_rate, attn_drop_rate, head_drop_rate,
# drop_block_rate, use_mean_pooling, init_scale, with_cp) keep their defaults.
model_kwargs = dict(
    img_size=224,
    pretrained=False,
    num_classes=710,
    all_frames=16 * 4,
)
model = create_model("vit_small_patch16_224", **model_kwargs)

In [3]:
# Print the module tree to inspect the instantiated architecture.
print(f"==>> model: {model}")

==>> model: VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv3d(3, 384, kernel_size=(2, 16, 16), stride=(2, 16, 16))
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (blocks): ModuleList(
    (0-11): 12 x Block(
      (norm1): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=384, out_features=1152, bias=False)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=384, out_features=384, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (drop_path): Identity()
      (norm2): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=384, out_features=1536, bias=True)
        (act): GELU(approximate='none')
        (fc2): Linear(in_features=1536, out_features=384, bias=True)
        (drop): Dropout(p=0.0, inplace=False)
      )
    )
  )
  (norm): Identity()
  (fc_norm): LayerNorm((384,), eps=1e-06, element

In [4]:
# Deserialize the checkpoint onto the CPU so loading does not depend on the
# device it was saved from, then list its top-level keys.
# NOTE(review): torch.load unpickles the file -- only load trusted checkpoints.
load_dict = torch.load(
    "/data/ephemeral/home/VideoMAEv2/pths/vit_s_k710_dl_from_giant.pth",
    map_location="cpu",
)
print(f"==>> load_dict.keys(): {load_dict.keys()}")

==>> load_dict.keys(): dict_keys(['module'])


In [5]:
# Copy the checkpoint's "module" state dict into the model. load_state_dict is
# strict by default, so a missing or unexpected key would raise here; the
# returned key report is displayed as the cell output.
model.load_state_dict(load_dict["module"])

<All keys matched successfully>

In [6]:
# Move the model to the GPU; `to` returns the module, which is echoed as the
# cell output.
model.to("cuda")

VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv3d(3, 384, kernel_size=(2, 16, 16), stride=(2, 16, 16))
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (blocks): ModuleList(
    (0-11): 12 x Block(
      (norm1): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=384, out_features=1152, bias=False)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=384, out_features=384, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (drop_path): Identity()
      (norm2): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=384, out_features=1536, bias=True)
        (act): GELU(approximate='none')
        (fc2): Linear(in_features=1536, out_features=384, bias=True)
        (drop): Dropout(p=0.0, inplace=False)
      )
    )
  )
  (norm): Identity()
  (fc_norm): LayerNorm((384,), eps=1e-06, elementwise_affine=

In [7]:
# Random stand-in for one 3-channel, 64-frame (16 * 4), 224x224 clip.
# It is sampled on the CPU, reported, then moved to the GPU.
test_t = torch.randn((1, 3, 16 * 4, 224, 224))
print(f"==>> test_t.shape: {test_t.shape}")
test_t = test_t.cuda()

==>> test_t.shape: torch.Size([1, 3, 64, 224, 224])


In [8]:
# Switch the module to evaluation mode before running inference; `eval`
# returns the module, which is echoed as the cell output.
model.eval()

VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv3d(3, 384, kernel_size=(2, 16, 16), stride=(2, 16, 16))
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (blocks): ModuleList(
    (0-11): 12 x Block(
      (norm1): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=384, out_features=1152, bias=False)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=384, out_features=384, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (drop_path): Identity()
      (norm2): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=384, out_features=1536, bias=True)
        (act): GELU(approximate='none')
        (fc2): Linear(in_features=1536, out_features=384, bias=True)
        (drop): Dropout(p=0.0, inplace=False)
      )
    )
  )
  (norm): Identity()
  (fc_norm): LayerNorm((384,), eps=1e-06, elementwise_affine=

In [9]:
# Forward pass only: autograd is disabled, so no graph is recorded.
with torch.no_grad():
    output = model(test_t)

# Report the shape of the model's output for the dummy clip.
print(f"==>> output.shape: {output.shape}")

==>> output.shape: torch.Size([1, 710])


In [10]:
# Second independent random 64-frame (16 * 4) clip, sampled on the CPU and
# then moved to the GPU.
test_tt = torch.randn((1, 3, 16 * 4, 224, 224))
print(f"==>> test_tt.shape: {test_tt.shape}")
test_tt = test_tt.cuda()

==>> test_tt.shape: torch.Size([1, 3, 64, 224, 224])


In [11]:
# Forward pass on the second dummy clip with autograd disabled.
with torch.no_grad():
    output2 = model(test_tt)

print(f"==>> output2.shape: {output2.shape}")

==>> output2.shape: torch.Size([1, 710])


In [12]:
# Third independent random 64-frame (16 * 4) clip, sampled on the CPU and
# then moved to the GPU.
test_ttt = torch.randn((1, 3, 16 * 4, 224, 224))
print(f"==>> test_ttt.shape: {test_ttt.shape}")
test_ttt = test_ttt.cuda()

==>> test_ttt.shape: torch.Size([1, 3, 64, 224, 224])


In [13]:
# Forward pass on the third dummy clip with autograd disabled.
with torch.no_grad():
    output3 = model(test_ttt)

print(f"==>> output3.shape: {output3.shape}")

==>> output3.shape: torch.Size([1, 710])


In [2]:
# Instantiate the model directly from the local `models` module using the
# constructor's default arguments, then print its module tree.
model = models.vit_small_patch16_224()
print(f"==>> model: {model}")

==>> model: VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv3d(3, 384, kernel_size=(2, 16, 16), stride=(2, 16, 16))
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (blocks): ModuleList(
    (0-11): 12 x Block(
      (norm1): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=384, out_features=1152, bias=False)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=384, out_features=384, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (drop_path): Identity()
      (norm2): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=384, out_features=1536, bias=True)
        (act): GELU(approximate='none')
        (fc2): Linear(in_features=1536, out_features=384, bias=True)
        (drop): Dropout(p=0.0, inplace=False)
      )
    )
  )
  (norm): Identity()
  (fc_norm): LayerNorm((384,), eps=1e-06, element

In [3]:
# Swap in a 384 -> 710 linear head, then print the updated module tree.
# The new layer starts randomly initialised; a later cell loads the
# checkpoint into the model.
model.head = nn.Linear(in_features=384, out_features=710)
print(f"==>> model: {model}")

==>> model: VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv3d(3, 384, kernel_size=(2, 16, 16), stride=(2, 16, 16))
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (blocks): ModuleList(
    (0-11): 12 x Block(
      (norm1): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=384, out_features=1152, bias=False)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=384, out_features=384, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (drop_path): Identity()
      (norm2): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=384, out_features=1536, bias=True)
        (act): GELU(approximate='none')
        (fc2): Linear(in_features=1536, out_features=384, bias=True)
        (drop): Dropout(p=0.0, inplace=False)
      )
    )
  )
  (norm): Identity()
  (fc_norm): LayerNorm((384,), eps=1e-06, element

In [4]:
# Deserialize the checkpoint onto the CPU so loading does not depend on the
# device it was saved from, then list its top-level keys.
# NOTE(review): torch.load unpickles the file -- only load trusted checkpoints.
load_dict = torch.load(
    "/data/ephemeral/home/VideoMAEv2/pths/vit_s_k710_dl_from_giant.pth",
    map_location="cpu",
)
print(f"==>> load_dict.keys(): {load_dict.keys()}")

==>> load_dict.keys(): dict_keys(['module'])


In [5]:
# Copy the checkpoint's "module" state dict into the model. load_state_dict is
# strict by default, so a missing or unexpected key would raise here; the
# returned key report is displayed as the cell output.
model.load_state_dict(load_dict["module"])

<All keys matched successfully>

In [6]:
# Move the model to the GPU; `to` returns the module, which is echoed as the
# cell output.
model.to("cuda")


VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv3d(3, 384, kernel_size=(2, 16, 16), stride=(2, 16, 16))
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (blocks): ModuleList(
    (0-11): 12 x Block(
      (norm1): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=384, out_features=1152, bias=False)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=384, out_features=384, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (drop_path): Identity()
      (norm2): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=384, out_features=1536, bias=True)
        (act): GELU(approximate='none')
        (fc2): Linear(in_features=1536, out_features=384, bias=True)
        (drop): Dropout(p=0.0, inplace=False)
      )
    )
  )
  (norm): Identity()
  (fc_norm): LayerNorm((384,), eps=1e-06, elementwise_affine=

In [7]:
# Random stand-in for a batch of four 3-channel, 16-frame, 224x224 clips
# (kept on the CPU in this cell).
test_t = torch.randn((4, 3, 16, 224, 224))
print(f"==>> test_t.shape: {test_t.shape}")

==>> test_t.shape: torch.Size([4, 3, 16, 224, 224])


In [8]:
# Transfer the dummy batch to the GPU so it can be fed to the model there.
test_t = test_t.cuda()

In [9]:
# Switch the module to evaluation mode before running inference; `eval`
# returns the module, which is echoed as the cell output.
model.eval()

VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv3d(3, 384, kernel_size=(2, 16, 16), stride=(2, 16, 16))
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (blocks): ModuleList(
    (0-11): 12 x Block(
      (norm1): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=384, out_features=1152, bias=False)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=384, out_features=384, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (drop_path): Identity()
      (norm2): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=384, out_features=1536, bias=True)
        (act): GELU(approximate='none')
        (fc2): Linear(in_features=1536, out_features=384, bias=True)
        (drop): Dropout(p=0.0, inplace=False)
      )
    )
  )
  (norm): Identity()
  (fc_norm): LayerNorm((384,), eps=1e-06, elementwise_affine=

In [10]:
# Forward-only shape check: run under no_grad so autograd does not record a
# graph (and retain activations on the GPU) for a result that is never
# back-propagated.
with torch.no_grad():
    output = model(test_t)
print(f"==>> output.shape: {output.shape}")

==>> output.shape: torch.Size([4, 710])
