In [1]:
from tqdm import tqdm
import torch
import json
import numpy as np
import os
import torch.nn as nn

In [2]:
os.environ['CUDA_VISIBLE_DEVICES'] = '2'

In [3]:
device = torch.device('cuda:0')
torch.cuda.empty_cache()

In [4]:
model_name = "i3d_r50"
model = torch.hub.load("facebookresearch/pytorchvideo", model=model_name, pretrained=True)

Using cache found in /home/ichuviliaeva/.cache/torch/hub/facebookresearch_pytorchvideo_master


In [5]:
model_last_block = model.blocks[6]
model_pool = model.blocks[6].pool
model_dropout = model.blocks[6].dropout
model_proj = model.blocks[6].proj
model_output_pool = model.blocks[6].output_pool

In [6]:
model.blocks[6] = model_output_pool

In [7]:
print(model)

Net(
  (blocks): ModuleList(
    (0): ResNetBasicStem(
      (conv): Conv3d(3, 64, kernel_size=(5, 7, 7), stride=(1, 2, 2), padding=[2, 3, 3], bias=False)
      (norm): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (activation): ReLU()
      (pool): MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=[0, 1, 1], dilation=1, ceil_mode=False)
    )
    (1): ResStage(
      (res_blocks): ModuleList(
        (0): ResBlock(
          (branch1_conv): Conv3d(64, 256, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
          (branch1_norm): BatchNorm3d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (branch2): BottleneckBlock(
            (conv_a): Conv3d(64, 64, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=[1, 0, 0], bias=False)
            (norm_a): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (act_a): ReLU()
            (conv_b): Conv3d(64, 64, kernel_size=(1,

In [8]:
model = model.to(device)
model = model.eval()

In [9]:
from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo,
)

from pytorchvideo.data.encoded_video import EncodedVideo
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    ShortSideScale,
    UniformTemporalSubsample,
    UniformCropVideo
)

In [10]:
side_size = 256
mean = [0.45, 0.45, 0.45]
std = [0.225, 0.225, 0.225]
crop_size = 256
num_frames = 21
sampling_rate = 1
frames_per_second = 30
alpha = 4

transform = Compose([
            Lambda(lambda x: x/255.0),
            NormalizeVideo(mean, std),
            ShortSideScale(
                size=side_size
            ),
            CenterCropVideo(crop_size)
        ])

Теперь делаем трансформации после того как достанем часть с видео, иначе ядро умрёт

In [12]:
video_path = "/DATA/ichuviliaeva/videos/50salads_vid/rgb/rgb-01-1.avi"
video = EncodedVideo.from_path(video_path)

In [13]:
one_frame_duration = 1 * sampling_rate / frames_per_second
print(one_frame_duration)
start_sec = 0
end_sec = (video._duration.numerator - 1) * one_frame_duration
video_buff = video.get_clip(start_sec=start_sec, end_sec=end_sec)

0.03333333333333333


In [14]:
%cd /DATA/ichuviliaeva/videos/i3d_experemental

/DATA/ichuviliaeva/videos/i3d_experemental


In [15]:
inputs_buff = video_buff["video"]
print(inputs_buff.shape)

torch.Size([3, 11684, 480, 640])


In [16]:
for start_frame in tqdm(range(video._duration.numerator - num_frames)):
    inputs = inputs_buff[:, start_frame:(start_frame + num_frames), :, :]
    inputs = transform(inputs)
    inputs = [i.to(device)[None, ...] for i in inputs]
    inputs = torch.cat(inputs).unsqueeze(0)
    features = model(inputs)
    output_buff.append(features.detach().cpu().numpy())
    
    
res = np.concatenate(output_buff, axis = 2)
print(res.squeeze().shape)
np.save(output_features_file, res.squeeze())

100%|███████████████████████████████████████████████████████████████████| 11666/11666 [19:14<00:00, 10.11it/s]


(2048, 11666)


In [17]:
res_get = np.load(output_features_file)
print(res_get.shape)

(2048, 11666)


In [None]:
output_features_file = "features-01-1.npy"
window_side = 10
output = []

for start_frame in tqdm(range(video._duration.numerator)):
    from_frame = max(0, start_frame - window_side)
    to_frame = min(video._duration.numerator - 1, start_frame + window_side + 1)
    before_padding_sz, after_padding_sz = 0, 0
    if from_frame == 0:
        before_padding_sz = window_side - start_frame
    if to_frame == (video._duration.numerator - 1):
        after_padding_sz = window_side + 2 + start_frame - video._duration.numerator
    
    buff_shape = inputs_buff.shape
    before_padding = torch.zeros([buff_shape[0], before_padding_sz, buff_shape[2], buff_shape[3]])
    after_padding = torch.zeros([buff_shape[0], after_padding_sz, buff_shape[2], buff_shape[3]])
    inputs = inputs_buff[:, from_frame:to_frame, :, :]
    inputs = torch.cat([before_padding, inputs, after_padding], 1)
    inputs = transform(inputs)
    inputs = [i.to(device)[None, ...] for i in inputs]
    inputs = torch.cat(inputs).unsqueeze(0)
    features = model(inputs)
    output.append(features.detach().cpu().numpy())
    
res = np.concatenate(output, axis = 2)
print(res.squeeze().shape)
np.save(output_features_file, res.squeeze())

 99%|████████████████████████████████████████████████████████████████████ | 11518/11687 [16:41<00:14, 12.03it/s]

In [None]:
res_get = np.load(output_features_file)
print(res_get.shape)