In [1]:
import os
import ast
import torch
import itertools
import pandas as pd
from einops import rearrange
from torchvision.models.video import r3d_18
from pytorchvideo.data import LabeledVideoDataset, UniformClipSampler
from pytorchvideo.transforms import create_video_transform



**Data Handling**

In [None]:
train = pd.read_csv('annotations/train.csv')
val = pd.read_csv('annotations/val.csv')
test = pd.read_csv('annotations/test.csv')

In [None]:
# Initialise train dataset
prefix = '/home/dl18206/Desktop/phd/data/panaf/acp/videos/all'
videos = [(f"{prefix}/{x['video']}.mp4", {'video_label': ast.literal_eval(x['label'])}) for x in train[['video', 'label']].to_dict(orient='records')]

In [None]:
class LimitDataset(torch.utils.data.Dataset):
    """
    To ensure a constant number of samples are retrieved from the dataset we use this
    LimitDataset wrapper. This is necessary because several of the underlying videos
    may be corrupted while fetching or decoding, however, we always want the same
    number of steps per epoch.
    """

    def __init__(self, dataset):
        super().__init__()
        self.dataset = dataset
        self.dataset_iter = itertools.chain.from_iterable(
            itertools.repeat(iter(dataset), 2)
        )

    def __getitem__(self, index):
        return next(self.dataset_iter)

    def __len__(self):
        return self.dataset.num_videos

In [None]:
clip_sampler = UniformClipSampler(clip_duration=15)
transform = create_video_transform(
    mode='train',
    video_key='video',
    num_samples=16,
)

In [None]:
dataset = LabeledVideoDataset(
    labeled_video_paths=videos,
    clip_sampler=clip_sampler,
    transform=transform,
    decode_audio=False,
)
dataset = LimitDataset(dataset)

**Flash**

In [13]:
import os
import pandas as pd
from flash.video import VideoClassifier, VideoClassificationData
from flash.video.classification.input_transform import VideoClassificationInputTransform

In [14]:
def resolver(root, file_id):
    return os.path.join(root, f"{file_id}.mp4")

In [15]:
train = pd.read_csv('annotations/flash/train.csv')
targets = list(train.columns[1:].values)

In [16]:
t = VideoClassificationInputTransform()

  rank_zero_deprecation(


In [17]:
data_root = '/home/dl18206/Desktop/phd/data/panaf/acp/videos/all'
datamodule = VideoClassificationData.from_csv(
    "video",
    targets,
    train_file="annotations/flash/train.csv",
    train_videos_root=data_root,
    train_resolver=resolver,
    transform=t,
    batch_size=2,
)

In [18]:
print(datamodule.num_classes, datamodule.labels)

18 ['camera_reaction', 'tool_use', 'object_carrying', 'bipedal', 'feeding', 'carrying', 'vocalisation', 'climbing', 'aggression', 'travel', 'sex', 'piloerection', 'social_interaction', 'grooming', 'display', 'cross_species_interaction', 'resting', 'no_behaviour']


In [19]:
x = torch.rand(8, 3, 16, 244, 244)

In [20]:
model = r3d_18()

In [21]:
with torch.no_grad():
    out = model(x)

In [22]:
model

VideoResNet(
  (stem): BasicStem(
    (0): Conv3d(3, 64, kernel_size=(3, 7, 7), stride=(1, 2, 2), padding=(1, 3, 3), bias=False)
    (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
  )
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Sequential(
        (0): Conv3DSimple(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
        (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU(inplace=True)
      )
      (conv2): Sequential(
        (0): Conv3DSimple(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
        (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (relu): ReLU(inplace=True)
    )
    (1): BasicBlock(
      (conv1): Sequential(
        (0): Conv3DSimple(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
        (1):

In [23]:
from torch import nn
model = r3d_18()
model.fc = nn.Linear(in_features=512, out_features=18, bias=True)

In [24]:
with torch.no_grad():
    out = model(x)

In [6]:
out.shape

torch.Size([8, 18])

In [26]:
from torch import tensor

In [27]:
x = [tensor([0, 0]), tensor([0, 1]), tensor([0, 1]), tensor([0, 0]), tensor([0, 1]), tensor([0, 0]), tensor([1, 0]), tensor([1, 0]), tensor([0, 0]), tensor([1, 1]), tensor([0, 0]), tensor([0, 0]), tensor([0, 0]), tensor([0, 0]), tensor([0, 0]), tensor([0, 0]), tensor([0, 0]), tensor([0, 0])]

In [31]:
tensor(x)

TypeError: only integer tensors of a single element can be converted to an index