In [17]:
import copy
import json
import os

import numpy as np
import torch
from tqdm import tqdm

In [18]:
# Restrict this process to physical GPU 2. This has to be set before CUDA is
# initialised; the selected card is then addressed as device index 0.
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

In [19]:
# Use the first visible GPU when CUDA is available; otherwise fall back to the
# CPU so that every later `.to(device)` call still works on a GPU-less machine.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    # Return cached allocator blocks left over from earlier runs in this kernel.
    torch.cuda.empty_cache()

In [20]:
# Load the pretrained "i3d_r50" network from the PyTorchVideo hub entry point,
# move it to the selected device and switch it to inference mode.
# `.to()` and `.eval()` both return the module itself, so they can be chained.
model_name = "i3d_r50"
model = (
    torch.hub.load("facebookresearch/pytorchvideo", model=model_name, pretrained=True)
    .to(device)
    .eval()
)

Using cache found in /home/ichuviliaeva/.cache/torch/hub/facebookresearch_pytorchvideo_master


In [41]:
import torch.nn as nn
from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo,
)

from pytorchvideo.data.encoded_video import EncodedVideo
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    ShortSideScale,
    UniformTemporalSubsample,
    UniformCropVideo
)

In [22]:
# --- Clip / preprocessing parameters -------------------------------------
# Spatial: the short side is scaled to `side_size`, then a centre crop of
# `crop_size` is taken.
side_size = 256
crop_size = 256
# Per-channel normalisation statistics applied after scaling to [0, 1].
mean = [0.45, 0.45, 0.45]
std = [0.225, 0.225, 0.225]
# Temporal: number of frames sampled per clip and the source frame rate.
num_frames = 21
sampling_rate = 1
frames_per_second = 30
alpha = 4  # not referenced anywhere in this cell


def _scale_pixels(frames):
    """Divide raw pixel values by 255 (assumes 0-255 input — TODO confirm)."""
    return frames / 255.0


# Steps are applied in list order to the tensor stored under the "video" key.
_video_steps = [
    UniformTemporalSubsample(num_frames),
    Lambda(_scale_pixels),
    NormalizeVideo(mean, std),
    ShortSideScale(size=side_size),
    CenterCropVideo(crop_size),
]

transform = ApplyTransformToKey(key="video", transform=Compose(_video_steps))

In [23]:
# Open the sample video and define the time window of the first clip.
video_path = "/DATA/ichuviliaeva/videos/50salads_vid/rgb/rgb-01-1.avi"
video = EncodedVideo.from_path(video_path)

# Clip length in seconds: sampled frames (times the sampling stride) over fps.
clip_duration = num_frames * sampling_rate / frames_per_second
start_sec = 0
end_sec = start_sec + clip_duration

In [24]:
# Total video length in seconds. Read it through the public `duration`
# property rather than the private `_duration` attribute.
print(video.duration)

11687/30


In [25]:
# End of the first clip in seconds (21 frames at 30 fps gives 0.7 s).
print(end_sec)

0.7


In [26]:
# Decode the [start_sec, end_sec] window and run it through the preprocessing
# pipeline in one step; `transform` rewrites the entry under the "video" key.
video_data = transform(video.get_clip(start_sec=start_sec, end_sec=end_sec))

In [27]:
# Pull the preprocessed clip out of the transform's result dict.
inputs = video_data["video"]
# Length of the first dimension; the recorded output is 3, which presumably is
# the channel axis — TODO confirm.
print(len(inputs))

3


In [28]:
# Iterating a tensor walks its first dimension: every slice is moved to the
# target device and given a new leading axis of size 1.
inputs = [chunk.to(device).unsqueeze(0) for chunk in inputs]
print(len(inputs))

3


In [29]:
# Re-join the per-slice tensors along dim 0, then prepend the batch axis.
inputs = torch.cat(inputs, dim=0)[None, ...]

In [30]:
# Sanity-check the batch layout; the recorded output is [1, 3, 21, 256, 256].
print(inputs.shape)

torch.Size([1, 3, 21, 256, 256])


In [31]:
# Pure inference: disable autograd so no computation graph (and the
# activations it would keep alive) is built for this forward pass.
with torch.no_grad():
    res = model(inputs)
print(res.shape)

torch.Size([1, 400])


Это выход классификатора для 400 классов Kinetics, а не признаки — чтобы получить фичи, нужно обрезать модель (убрать классификационную голову).

Фичи от авторов ASFormer

In [32]:
# Precomputed .npy feature file for video rgb-01-1 (attributed to the ASFormer
# authors in the notebook notes); used as the reference to compare against.
asf_features_path = "/DATA/ichuviliaeva/videos/data/50salads/features/rgb-01-1.npy"

In [33]:
# Load the reference feature matrix and show its (truncated) contents followed
# by its shape, one per line.
asf_features = np.load(asf_features_path)
print(asf_features, asf_features.shape, sep="\n")

[[5.7630528e-02 5.7510488e-02 5.8088370e-02 ... 1.3417178e-01
  1.4612103e-01 1.4192495e-01]
 [3.6829415e-01 3.7212294e-01 3.7371206e-01 ... 2.9188153e-01
  2.9564372e-01 2.9331002e-01]
 [1.7417416e-01 1.7289409e-01 1.7225347e-01 ... 1.2861495e-01
  1.2145682e-01 1.0264049e-01]
 ...
 [1.6405284e-03 7.7179502e-05 6.3469826e-04 ... 2.9542649e-01
  3.5986567e-01 2.1430099e-01]
 [3.9347415e-03 5.0248839e-03 8.0781560e-03 ... 4.0529925e-01
  3.3248940e-01 2.1924818e-01]
 [2.1453551e-03 1.2880345e-03 1.6674149e-03 ... 5.0064940e-02
  1.4004254e-01 6.4414769e-02]]
(2048, 11679)


Проверка на последнюю активацию: если фичи сняты после активации (вероятно, ReLU), отрицательных значений в них быть не должно.

In [35]:
# Number of strictly negative entries in the reference features.
(asf_features < 0).sum()

0

Да, все фичи неотрицательные (отрицательных значений нет) — значит, активация была применена.

In [37]:
# Work on an independent copy. A plain `model_corrected = model` only creates
# a second name for the same module, so replacing `model_corrected.blocks[6]`
# would also rewrite `model` and break any later re-run of cells that expect
# the original head (e.g. reading `model.blocks[6].pool`).
model_corrected = copy.deepcopy(model)

In [38]:
# Dump the full module tree to locate the block that has to be replaced.
print(model_corrected)

Net(
  (blocks): ModuleList(
    (0): ResNetBasicStem(
      (conv): Conv3d(3, 64, kernel_size=(5, 7, 7), stride=(1, 2, 2), padding=[2, 3, 3], bias=False)
      (norm): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (activation): ReLU()
      (pool): MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=[0, 1, 1], dilation=1, ceil_mode=False)
    )
    (1): ResStage(
      (res_blocks): ModuleList(
        (0): ResBlock(
          (branch1_conv): Conv3d(64, 256, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
          (branch1_norm): BatchNorm3d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (branch2): BottleneckBlock(
            (conv_a): Conv3d(64, 64, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=[1, 0, 0], bias=False)
            (norm_a): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (act_a): ReLU()
            (conv_b): Conv3d(64, 64, kernel_size=(1,

In [21]:
# Debug aid, left disabled: uncomment to print the module stored at blocks[6].
# print(model_corrected.blocks[6])

In [39]:
# Keep handles to the last block (index 6) and to each of its sub-modules so
# that alternative heads can be assembled from them later.
model_block = model.blocks[6]
model_pool = model_block.pool
model_dropout = model_block.dropout
model_proj = model_block.proj
model_output_pool = model_block.output_pool

In [122]:
# Variant 1: replace the whole last block with only its `output_pool`
# sub-module, dropping pool, dropout and the projection layer.
# This mutates the module in place.
model_corrected.blocks[6] = model_output_pool

In [123]:
# Print the module tree again to verify what now sits at blocks[6].
print(model_corrected)

Net(
  (blocks): ModuleList(
    (0): ResNetBasicStem(
      (conv): Conv3d(3, 64, kernel_size=(5, 7, 7), stride=(1, 2, 2), padding=[2, 3, 3], bias=False)
      (norm): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (activation): ReLU()
      (pool): MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=[0, 1, 1], dilation=1, ceil_mode=False)
    )
    (1): ResStage(
      (res_blocks): ModuleList(
        (0): ResBlock(
          (branch1_conv): Conv3d(64, 256, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
          (branch1_norm): BatchNorm3d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (branch2): BottleneckBlock(
            (conv_a): Conv3d(64, 64, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=[1, 0, 0], bias=False)
            (norm_a): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (act_a): ReLU()
            (conv_b): Conv3d(64, 64, kernel_size=(1,

In [124]:
# Make sure the modified network lives on the target device and is in
# inference mode; both calls return the module itself, so they chain.
model_corrected = model_corrected.to(device).eval()

In [125]:
# Feature extraction is inference only: run it without autograd so the result
# carries no grad_fn and no computation graph is kept in memory.
with torch.no_grad():
    features_corrected = model_corrected(inputs)

In [126]:
# Recorded output is [1, 2048, 1, 1, 1]: one 2048-value vector for the clip.
print(features_corrected.shape)

torch.Size([1, 2048, 1, 1, 1])


In [127]:
# Drop the singleton axes, then show the first 20 values and the flat shape,
# one per line.
print(
    features_corrected.squeeze()[:20],
    features_corrected.squeeze().shape,
    sep="\n",
)

tensor([3.1444e-05, 9.4848e-02, 2.1775e-02, 4.7639e-02, 4.8868e-03, 4.4512e-02,
        1.0908e-01, 3.6642e-01, 8.8811e-02, 4.2758e-03, 1.4307e-01, 1.0361e-01,
        4.2775e-02, 3.4515e-02, 7.3974e-02, 2.3381e-02, 4.8130e-03, 2.5617e-02,
        1.7150e-01, 1.0875e-01], device='cuda:0', grad_fn=<SliceBackward>)
torch.Size([2048])


In [128]:
# Variant 2: the original head without its projection layer, i.e.
# pool -> dropout -> output_pool applied in sequence.
model_corrected.blocks[6] = nn.Sequential(model_pool, model_dropout, model_output_pool)

In [129]:
# Print the module tree to confirm the new sequential head at blocks[6].
print(model_corrected)

Net(
  (blocks): ModuleList(
    (0): ResNetBasicStem(
      (conv): Conv3d(3, 64, kernel_size=(5, 7, 7), stride=(1, 2, 2), padding=[2, 3, 3], bias=False)
      (norm): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (activation): ReLU()
      (pool): MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=[0, 1, 1], dilation=1, ceil_mode=False)
    )
    (1): ResStage(
      (res_blocks): ModuleList(
        (0): ResBlock(
          (branch1_conv): Conv3d(64, 256, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
          (branch1_norm): BatchNorm3d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (branch2): BottleneckBlock(
            (conv_a): Conv3d(64, 64, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=[1, 0, 0], bias=False)
            (norm_a): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (act_a): ReLU()
            (conv_b): Conv3d(64, 64, kernel_size=(1,

In [130]:
# Re-apply device placement and inference mode after swapping the head;
# `.to()` and `.eval()` return the module, so a single chained call suffices.
model_corrected = model_corrected.to(device).eval()

In [131]:
# Inference only: skip autograd bookkeeping so the output has no grad_fn and
# intermediate activations are freed immediately.
with torch.no_grad():
    features_corrected = model_corrected(inputs)

In [132]:
# Shape check for the pool -> dropout -> output_pool head; the recorded output
# is [1, 2048, 1, 1, 1].
print(features_corrected.shape)

torch.Size([1, 2048, 1, 1, 1])


In [133]:
# First 20 feature values and the flattened shape for this head variant,
# printed one per line.
print(
    features_corrected.squeeze()[:20],
    features_corrected.squeeze().shape,
    sep="\n",
)

tensor([7.3339e-06, 3.6088e-02, 7.8960e-03, 1.6036e-02, 2.1285e-03, 2.2434e-02,
        4.6213e-02, 2.7959e-01, 3.2626e-02, 1.5766e-03, 1.1716e-01, 4.1090e-02,
        1.6393e-02, 1.1028e-02, 3.2041e-02, 7.6254e-03, 2.0127e-03, 9.9489e-03,
        6.8912e-02, 3.3493e-02], device='cuda:0', grad_fn=<SliceBackward>)
torch.Size([2048])


In [134]:
# Variant 3: same as the previous variant but without dropout, i.e.
# pool -> output_pool.
model_corrected.blocks[6] = nn.Sequential(model_pool, model_output_pool)

In [135]:
# Print the module tree to confirm the pool -> output_pool head at blocks[6].
print(model_corrected)

Net(
  (blocks): ModuleList(
    (0): ResNetBasicStem(
      (conv): Conv3d(3, 64, kernel_size=(5, 7, 7), stride=(1, 2, 2), padding=[2, 3, 3], bias=False)
      (norm): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (activation): ReLU()
      (pool): MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=[0, 1, 1], dilation=1, ceil_mode=False)
    )
    (1): ResStage(
      (res_blocks): ModuleList(
        (0): ResBlock(
          (branch1_conv): Conv3d(64, 256, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
          (branch1_norm): BatchNorm3d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (branch2): BottleneckBlock(
            (conv_a): Conv3d(64, 64, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=[1, 0, 0], bias=False)
            (norm_a): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (act_a): ReLU()
            (conv_b): Conv3d(64, 64, kernel_size=(1,

In [136]:
# Device placement plus inference mode for the third head variant, chained
# because both methods hand back the module.
model_corrected = model_corrected.to(device).eval()

In [137]:
# Inference only: wrap the forward pass in no_grad so no graph is recorded
# and the returned features are plain tensors without grad_fn.
with torch.no_grad():
    features_corrected = model_corrected(inputs)

In [138]:
# Shape check for the pool -> output_pool head; the recorded output is
# [1, 2048, 1, 1, 1].
print(features_corrected.shape)

torch.Size([1, 2048, 1, 1, 1])


In [139]:
# First 20 values and flattened shape, one per line, for comparison with the
# other head variants.
print(
    features_corrected.squeeze()[:20],
    features_corrected.squeeze().shape,
    sep="\n",
)

tensor([7.3339e-06, 3.6088e-02, 7.8960e-03, 1.6036e-02, 2.1285e-03, 2.2434e-02,
        4.6213e-02, 2.7959e-01, 3.2626e-02, 1.5766e-03, 1.1716e-01, 4.1090e-02,
        1.6393e-02, 1.1028e-02, 3.2041e-02, 7.6254e-03, 2.0127e-03, 9.9489e-03,
        6.8912e-02, 3.3493e-02], device='cuda:0', grad_fn=<SliceBackward>)
torch.Size([2048])


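Странно, что результаты не одинаковы: вариант только с `output_pool` даёт другие значения, чем варианты с предварительным `pool`. Варианты с dropout и без него совпадают, потому что в режиме `eval()` dropout ничего не меняет. Вероятная причина расхождения — `pool` с фиксированным ядром усредняет перекрывающиеся окна, и последующее усреднение взвешивает центральные позиции сильнее, чем одно глобальное усреднение (нужно проверить). Пока будет модель, где только AdaptiveAvgPool3d(output_size=1). Важно: после последней выполненной ячейки в `model_corrected.blocks[6]` остаётся вариант `pool` + `output_pool`, поэтому перед извлечением фич нужно заново выполнить `model_corrected.blocks[6] = model_output_pool`.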