input
```
video_file_0, duration_0
video_file_1, duration_1
video_file_2, duration_2
...

```

construct dataset
```
video_file_0, num_clips, i-th
video_file_0, num_clips, i+1-th
...

```

Clips (chunks) are determined from each video's duration and the sampling configuration (fps, sampling_rate, num_frames, etc.):

 num_frames_video = duration * fps / (sampling_rate + 1)
 
 num_clips = num_frames_video / num_frames
 

In [1]:
cd ../../

/mnt/sda/otani/Experiments/moment_retrieval


In [2]:
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from skvideo.io import vwrite

In [94]:
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.

import os
import random
from io import BytesIO
import torch
import torch.utils.data

import slowfast.datasets.decoder as decoder
import slowfast.datasets.transform as transform
import slowfast.datasets.video_container as container
import slowfast.utils.logging as logging
import slowfast.datasets.kinetics as kinetics
from tqdm import tqdm
import math

logger = logging.get_logger(__name__)
        
class Charade(kinetics.Kinetics):
    """
    Clip-level video loader. Every video listed in `<mode>.csv` is expanded
    into a list of (video path, number of clips, clip index) entries, and each
    dataset item is one such entry. An item is decoded without random
    augmentation: frames are color-normalized, then resized and padded by
    `short_side_scale_padding`, and finally packed with the inherited
    `pack_pathway_output`.
    """

    def __init__(self, cfg, mode, num_retries=10, san_check=False):
        """
        Args:
            cfg (CfgNode): configs.
            mode (string): name of the split; the video list is read from
                `<cfg.DATA.PATH_TO_DATA_DIR>/<mode>.csv`.
            num_retries (int): maximum number of attempts to open a video in
                `__getitem__` before giving up.
            san_check (bool): if True, stop indexing videos as soon as more
                than 100 clips have been collected (quick sanity check).
        """
        # NOTE: the parent constructor is not invoked; this class sets up its
        # own state and builds its own index.
        self.cfg = cfg
        self.mode = mode
        self._num_retries = num_retries
        self._construct_loader(san_check)

    def _construct_loader(self, san_check):
        """
        Build `self._load_indices`, a flat list with one
        `(video_path, num_clips, clip_index)` tuple per clip.
        Args:
            san_check (bool): if True, stop after the first video that brings
                the index above 100 entries.
        """
        path_to_file = os.path.join(
            self.cfg.DATA.PATH_TO_DATA_DIR, "{}.csv".format(self.mode)
        )
        assert os.path.exists(path_to_file), "{} dir not found".format(
            path_to_file
        )

        # Each line is expected to look like "<video_id>, <duration>".
        videos = []
        durations = []
        with open(path_to_file) as f:
            for line in f:
                line = line.rstrip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                vid_id, dur = line.split(', ')
                videos.append(vid_id)
                durations.append(float(dur))

        self._load_indices = []

        # Length of one clip in seconds.
        # NOTE(review): hard-coded (presumably stride 2 x 32 frames at 30 fps)
        # rather than derived from cfg.DATA.SAMPLING_RATE / NUM_FRAMES --
        # confirm this matches the configuration in use.
        clip_duration = 2 * 32 / 30

        # The duration column is parsed but not used: the clip count is
        # derived from the container's own frame count and frame rate.
        for video, _dur in tqdm(zip(videos, durations), total=len(videos)):
            video_path = f"{self.cfg.DATA.PATH_PREFIX}/{video}.mp4"
            video_container = container.get_video_container(
                    video_path
            )
            try:
                stream = video_container.streams.video[0]
                fps = float(stream.average_rate)
                total_frames = stream.frames
            finally:
                # Only metadata is needed here; release the file handle
                # right away instead of relying on garbage collection.
                video_container.close()
            num_clips = int(total_frames / fps / clip_duration + 1)

            # NOTE(review): clip indices run from 0 to num_clips inclusive,
            # i.e. num_clips + 1 entries per video -- confirm this is the
            # intended coverage for decoder.decode.
            for ci in range(num_clips + 1):
                self._load_indices.append((video_path, num_clips, ci))

            if san_check:
                if len(self._load_indices) > 100:
                    break

        logger.info(
            "Constructing kinetics dataloader (size: {}) from {}".format(
                len(self._load_indices), path_to_file
            )
        )

    def __getitem__(self, index):
        """
        Decode and preprocess the clip stored at `index`. Opening the video is
        attempted up to `num_retries` times. No replacement video is ever
        substituted, because each item must correspond to its own
        (video, clip index) entry.
        Args:
            index (int): position in the clip index built by
                `_construct_loader`.
        Returns:
            frames: output of `pack_pathway_output` applied to the
                preprocessed clip (`channel` x `num frames` x `height` x
                `width` before packing).
            video (string): path of the video the clip was taken from.
            num_clips (int): clip count that was passed to the decoder.
            ci (int): index of this clip within the video.
        Raises:
            RuntimeError: if the video cannot be opened after `num_retries`
                attempts, or if decoding yields no frames.
        """
        video, num_clips, ci = self._load_indices[index]
        min_scale, max_scale, crop_size = [self.cfg.DATA.TEST_CROP_SIZE] * 3

        open_error = None
        for _ in range(self._num_retries):
            video_container = None
            try:
                video_container = container.get_video_container(
                    video,
                    multi_thread_decode=True,
                )
            except Exception as e:
                logger.info(
                    "Failed to load video from {} with error {}".format(
                        video, e
                    )
                )
                open_error = e

            if video_container is None:
                # Opening failed; try again until the retry budget is spent.
                continue

            # Decode the requested clip, then release the container whether
            # or not decoding succeeded.
            try:
                frames = decoder.decode(
                    video_container,
                    self.cfg.DATA.SAMPLING_RATE,
                    self.cfg.DATA.NUM_FRAMES,
                    ci,
                    num_clips,
                    video_meta=None,
                    target_fps=30,
                )
            finally:
                video_container.close()

            # A clip that cannot be decoded is an error: silently replacing
            # it would break the item <-> clip correspondence.
            if frames is None:
                raise RuntimeError('output frames are empty')

            # Perform color normalization.
            frames = frames.float()
            frames = frames / 255.0
            frames = frames - torch.tensor(self.cfg.DATA.MEAN)
            frames = frames / torch.tensor(self.cfg.DATA.STD)

            # T H W C -> C T H W.
            frames = frames.permute(3, 0, 1, 2)
            # Deterministic resize + padding (no random augmentation).
            frames = self.spatial_sampling(
                frames,
                spatial_idx=1,
                min_scale=min_scale,
                max_scale=max_scale,
                crop_size=crop_size,
            )

            frames = self.pack_pathway_output(frames)
            return frames, video, num_clips, ci

        if self._num_retries > 0:
            raise RuntimeError(
                f'could not construct video_container: {video}'
            ) from open_error
        raise RuntimeError(
            "Failed to fetch video after {} retries.".format(
                self._num_retries
            )
        )

    def __len__(self):
        """
        Returns:
            (int): the number of clips (not videos) in the dataset.
        """
        return len(self._load_indices)

    def spatial_sampling(
        self,
        frames,
        spatial_idx=-1,
        min_scale=256,
        max_scale=320,
        crop_size=224,
    ):
        """
        Perform spatial sampling on the given video frames. If spatial_idx is
        -1, perform random scale, random crop, and random flip on the given
        frames. If spatial_idx is 0, 1, or 2, the frames are resized and
        padded with `short_side_scale_padding` instead of being cropped; the
        three values are treated identically.
        Args:
            frames (tensor): frames sampled from the video. As called from
                `__getitem__` the layout is `channel` x `num frames` x
                `height` x `width`.
            spatial_idx (int): -1 for random spatial sampling; 0, 1, or 2 for
                the deterministic resize-and-pad path.
            min_scale (int): the minimal size of scaling.
            max_scale (int): the maximal size of scaling.
            crop_size (int): the target size of height and width.
        Returns:
            frames (tensor): spatially sampled frames.
        """
        assert spatial_idx in [-1, 0, 1, 2]
        if spatial_idx == -1:
            frames = transform.random_short_side_scale_jitter(
                frames, min_scale, max_scale
            )
            frames = transform.random_crop(frames, crop_size)
            frames = transform.horizontal_flip(0.5, frames)
        else:
            # The testing is deterministic and no jitter should be performed.
            # min_scale, max_scale, and crop_size are expected to be the same.
            assert len({min_scale, max_scale, crop_size}) == 1
            frames = short_side_scale_padding(frames, crop_size)
        return frames
    
def short_side_scale_padding(images, size):
    """
    Resize `images` so that the longer spatial side equals `size` while
    keeping the aspect ratio, then zero-pad the shorter side (split as evenly
    as possible between both ends) so that the result is `size` x `size`.
    Args:
        images (tensor): 4-D floating point tensor whose last two dimensions
            are height and width.
        size (int): target height and width.
    Returns:
        (tensor): tensor with the same leading two dimensions as `images` and
            spatial size `size` x `size`. The input is returned unchanged if
            it already has that spatial size.
    """
    height = images.shape[2]
    width = images.shape[3]
    # Only an input that is already size x size needs no work. Returning
    # early when just the short side matches would leave the long side
    # unscaled and the output non-square.
    if height == size and width == size:
        return images
    new_width = size
    new_height = size
    if width < height:
        # Portrait: height becomes `size`, width shrinks and gets padded.
        # Clamp to 1 so extreme aspect ratios do not produce an empty side.
        new_width = max(1, int(math.floor((float(width) / height) * size)))
        pad_w = (size - new_width) // 2
        pad = (pad_w, size - pad_w - new_width, 0, 0)
    else:
        # Landscape or square: width becomes `size`, height gets padded.
        new_height = max(1, int(math.floor((float(height) / width) * size)))
        pad_h = (size - new_height) // 2
        pad = (0, 0, pad_h, size - pad_h - new_height)

    images = torch.nn.functional.interpolate(
        images,
        size=(new_height, new_width),
        mode="bilinear",
        align_corners=False,
    )
    # `pad` is (left, right, top, bottom) for the last two dimensions.
    return torch.nn.functional.pad(
        images,
        pad=pad,
    )

In [75]:
from slowfast.config.defaults import get_cfg

cfg = get_cfg()
cfg.merge_from_file('data/external/slowfast_cfg/slowfast_8x8_r50.yml')
dataset = Charade(cfg, mode='test', san_check=True)

0it [00:00, ?it/s]


In [76]:
loader = torch.utils.data.DataLoader(
    dataset,
    batch_size=3,
    shuffle=False,
    sampler=None,
    num_workers=3,
    pin_memory=True,
    drop_last=False,
)

In [77]:
for batch in loader:
    break

In [79]:
len(batch)

4

In [85]:
batch[1]

['data/raw/video/YSKX3.mp4',
 'data/raw/video/YSKX3.mp4',
 'data/raw/video/YSKX3.mp4']

In [62]:
# Dump 16 consecutive clips (dataset items 9..24) into one video file for
# visual inspection.
vid_arr = []
for i in range(9, 9+16):
    # dataset[i] is (frames, video, num_clips, ci); element 1 is the video
    # path string, so the frames must be taken from element 0.
    # NOTE(review): frames is assumed to be the per-pathway list returned by
    # pack_pathway_output, with index 1 the second pathway -- confirm.
    frames = dataset[i][0][1]
    # C T H W -> T H W C, the layout expected when writing frames.
    vid_arr.append(frames.numpy().transpose(1,2,3,0))

vid_arr = np.concatenate(vid_arr, axis=0)
vwrite('test.mp4', vid_arr, outputdict={'-pix_fmt': 'yuv420p'})

256 144 (0, 0, 56, 56)
256 144 (0, 0, 56, 56)
256 144 (0, 0, 56, 56)
256 144 (0, 0, 56, 56)
256 144 (0, 0, 56, 56)
256 144 (0, 0, 56, 56)
256 144 (0, 0, 56, 56)
256 144 (0, 0, 56, 56)
256 144 (0, 0, 56, 56)
256 144 (0, 0, 56, 56)
256 144 (0, 0, 56, 56)
256 144 (0, 0, 56, 56)
256 144 (0, 0, 56, 56)
256 144 (0, 0, 56, 56)
256 144 (0, 0, 56, 56)
256 144 (0, 0, 56, 56)


In [9]:
from IPython.display import Video

Video(f'http://localhost:8888/files/data/raw/video/T5ECU.mp4')

In [39]:
Video(f'http://localhost:8888/files/test.mp4')

In [86]:
from slowfast.models.video_model_builder import SlowFastModel

In [95]:
cfg = get_cfg()
cfg.merge_from_file('data/external/slowfast_cfg/slowfast_8x8_r50.yml')
dataset = Charade(cfg, mode='test', san_check=True)

  0%|          | 0/1863 [00:00<?, ?it/s]


In [96]:
from slowfast.models.video_model_builder import SlowFastModel
from slowfast.config.defaults import get_cfg

class SlowFastExtractor(SlowFastModel):
    """
    Variant of `SlowFastModel` whose forward pass stops after the head's
    per-pathway average pooling and returns the concatenated pooled
    activations.
    """

    def forward(self, x):
        """
        Args:
            x (list): per-pathway input tensors, indexed by pathway.
        Returns:
            (tensor): pooled activations of pathway 0 and pathway 1
                concatenated along dimension 1, with that dimension moved to
                the last position.
        """
        # Stem and first residual stage, each followed by its fusion step.
        for stage_name in ("s1", "s1_fuse", "s2", "s2_fuse"):
            x = getattr(self, stage_name)(x)

        # Per-pathway pooling between stage 2 and stage 3.
        for idx in range(self.num_pathways):
            pathway_pool = getattr(self, "pathway{}_pool".format(idx))
            x[idx] = pathway_pool(x[idx])

        # Remaining stages; there is no fusion step after s5.
        for stage_name in ("s3", "s3_fuse", "s4", "s4_fuse", "s5"):
            x = getattr(self, stage_name)(x)

        # Reuse the head's average pooling layers, skipping the rest of it.
        head = self.head
        pooled0 = head.pathway0_avgpool(x[0])
        pooled1 = head.pathway1_avgpool(x[1])
        merged = torch.cat([pooled0, pooled1], 1)
        return merged.permute(0, 2, 3, 4, 1)

In [97]:
model = SlowFastExtractor(cfg)

In [98]:
loader = torch.utils.data.DataLoader(
    dataset,
    batch_size=3,
    shuffle=False,
    sampler=None,
    num_workers=3,
    pin_memory=True,
    drop_last=False,
)

In [105]:
# Feature extraction is inference only: switch layers such as batch norm to
# evaluation behaviour and skip building the autograd graph.
model.eval()
with torch.no_grad():
    for batch in loader:
        # Each batch is (frames, video, num_clips, ci); only frames are used.
        inputs, _, _, _ = batch
        y = model(inputs)
        break

In [107]:
y.shape

torch.Size([3, 1, 2, 2, 2304])

In [102]:
y = y.mean([1,2,3])
y.shape

torch.Size([3, 2304])

In [109]:
len(dataset)

106