# SLOWFAST
https://pytorch.org/hub/facebookresearch_pytorchvideo_slowfast/ (accessed 22-03-2023)

By FAIR PyTorchVideo

SlowFast networks pretrained on the Kinetics 400 dataset

| arch     | depth | frame length x sample rate | top 1 | top 5 | Flops (G) | Params (M) |
|----------|-------|----------------------------|-------|-------|-----------|------------|
| SlowFast | R50   | 8x8                        | 76.94 | 92.69 | 65.71     | 34.57      |
| SlowFast | R101  | 8x8                        | 77.90 | 93.27 | 127.20    | 62.83      |

Best performing of the available models in PyTorch Hub

**References:**

[1] Christoph Feichtenhofer et al., “SlowFast Networks for Video Recognition”, https://arxiv.org/pdf/1812.03982.pdf

In [None]:
import torch
import numpy as np
import os
import sys
import glob
# Make modules located in the parent directory importable.
sys.path.append('..')

# Order CUDA devices by PCI bus ID and expose only device index 3 to this process.
os.environ.update(
    {
        "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
        "CUDA_VISIBLE_DEVICES": "3",
    }
)

In [None]:
# Load the pretrained `slowfast_r50` entry point from the PyTorchVideo hub repository.
model = torch.hub.load(
    'facebookresearch/pytorchvideo',
    'slowfast_r50',
    pretrained=True,
)

In [None]:
from typing import Dict
import json
import urllib
from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo,
)
from pytorchvideo.data.encoded_video import EncodedVideo # type: ignore
from pytorchvideo.transforms import ( # type: ignore
    ApplyTransformToKey,
    ShortSideScale,
    UniformTemporalSubsample
) 

#### Setup

Set the model to eval mode and move it to the desired device.

In [None]:
# Prefer the GPU, but fall back to the CPU so the notebook also runs on
# machines where no CUDA device is available.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Switch to evaluation mode and move the model to the selected device.
model = model.eval()
model = model.to(device)

Download the id to label mapping for the Kinetics 400 dataset on which the torch hub models were trained. This will be used to get the category label names from the predicted class ids.

In [None]:
import urllib.request

# Remote location of the Kinetics class-name mapping and the local file it is saved to.
json_url = "https://dl.fbaipublicfiles.com/pyslowfast/dataset/class_names/kinetics_classnames.json"
json_filename = "kinetics_classnames.json"

# Download with the Python 3 API. `urllib.URLopener` only exists in Python 2,
# and a bare `except:` would hide genuine failures (network errors, Ctrl-C),
# so the call is made directly and any error is allowed to propagate.
urllib.request.urlretrieve(json_url, json_filename)

In [None]:
# Load the JSON mapping stored at `json_filename`.
with open(json_filename, "r") as classnames_file:
    kinetics_classnames = json.load(classnames_file)

# Invert the mapping so that each value becomes the key of its label;
# double quotes are stripped from the label text.
kinetics_id_to_classname = {
    class_id: str(label).replace('"', "")
    for label, class_id in kinetics_classnames.items()
}

#### Define input transform

In [None]:
# Spatial preprocessing: `side_size` is passed to ShortSideScale and
# `crop_size` to CenterCropVideo.
side_size = 256
crop_size = 256

# Per-channel statistics passed to NormalizeVideo.
mean = [0.45] * 3
std = [0.225] * 3

# Temporal sampling: `num_frames` frames are subsampled from each clip, and
# the clip length in seconds is derived from `num_frames`, `sampling_rate`
# and `frames_per_second`.
num_frames = 32
sampling_rate = 2
frames_per_second = 30

# Ratio used by PackPathway to subsample the slow pathway from the fast one.
slowfast_alpha = 4

class PackPathway(torch.nn.Module):
    """
    Transform that converts a video tensor into the list of SlowFast inputs.

    The fast pathway receives the frames unchanged. The slow pathway receives
    a subsampled tensor with ``T // alpha`` evenly spaced entries along
    dimension 1, where ``T`` is the size of that dimension.

    Args:
        alpha: Subsampling ratio between the fast and the slow pathway. When
            ``None`` (the default), the module-level ``slowfast_alpha`` is
            looked up each time the transform is called.
    """
    def __init__(self, alpha=None):
        super().__init__()
        self.alpha = alpha

    def forward(self, frames: torch.Tensor):
        """Return ``[slow_pathway, fast_pathway]`` built from ``frames``."""
        alpha = slowfast_alpha if self.alpha is None else self.alpha
        total_frames = frames.shape[1]

        # Evenly spaced indices from the first to the last frame. They are
        # computed on the CPU and then moved to the device of `frames`, so
        # index_select also works when the frames live on a GPU.
        slow_indices = torch.linspace(
            0, total_frames - 1, total_frames // alpha
        ).long().to(frames.device)

        # Perform temporal sampling from the fast pathway.
        slow_pathway = torch.index_select(frames, 1, slow_indices)
        fast_pathway = frames
        return [slow_pathway, fast_pathway]

# Preprocessing steps, applied in order: temporal subsampling, division by
# 255, per-channel normalization, short-side scaling, center crop, and
# finally packing into the list of pathway tensors.
_video_pipeline = Compose(
    [
        UniformTemporalSubsample(num_frames),
        Lambda(lambda clip: clip / 255.0),
        NormalizeVideo(mean, std),
        ShortSideScale(size=side_size),
        CenterCropVideo(crop_size),
        PackPathway(),
    ]
)

# Apply the pipeline to the "video" key of each clip.
transform = ApplyTransformToKey(key="video", transform=_video_pipeline)

# Length in seconds of one input clip: the number of frames spanned
# (num_frames * sampling_rate) divided by the frame rate.
clip_duration = num_frames * sampling_rate / frames_per_second
clip_duration

#### Run Inference


In [None]:
# Example video used by the single-video prediction test.
video_path = "/homes/lm004/commercials/annotated_commercials/_3GbX2_IaI8_trimmed.mp4"

In [None]:
def preprocess_video(video_path, start_sec=4, max_clips=4):
    """Split a video into consecutive, model-ready clips.

    Args:
        video_path: Path of the video file to load.
        start_sec: Offset in seconds of the first clip. The default skips the
            opening seconds to avoid possible black frames.
        max_clips: Upper bound on the number of clips taken from the video.

    Returns:
        A list with one entry per clip. Each entry is the list of tensors
        produced by ``transform`` for that clip, moved to ``device`` and given
        a leading dimension of size 1. The list is empty when no full clip
        fits between ``start_sec`` and the end of the video.
    """
    # Initialize an EncodedVideo helper class and load the video
    video = EncodedVideo.from_path(video_path)

    # Number of whole clips that fit after the initial offset. Floor division
    # of floats yields a float (negative for very short videos), which
    # range() rejects, so convert to int and clamp at zero.
    usable_sec = float(video.duration) - start_sec
    num_clips = min(max_clips, max(0, int(usable_sec // clip_duration)))

    inputs_list = []

    for _ in range(num_clips):
        end_sec = start_sec + clip_duration

        # Load the desired clip
        video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)

        # Apply a transform to normalize the video input
        video_data = transform(video_data)

        # Move the inputs to the desired device and add the leading dimension
        inputs = [
            pathway.to(device)[None, ...] for pathway in video_data["video"]
        ]
        inputs_list.append(inputs)

        # The next clip starts where this one ended
        start_sec = end_sec

    return inputs_list

##### REGISTER HOOK

In [None]:
def get_features(name):
    """Build a forward hook that records a module's output under ``name``.

    The returned hook stores the detached output in the global ``features``
    dict, which must exist at the time the hooked module runs.
    """
    def store_output(module, inputs, output):
        features[name] = output.detach() # type: ignore

    return store_output

In [None]:
# Hook sub-modules "1" and "0" of the pool in block "5" so that their outputs
# are stored as 'layer5_fast' and 'layer5_slow' respectively.
final_block_pool = model.blocks.get_submodule("5").pool
final_block_pool.get_submodule("1").register_forward_hook(get_features('layer5_fast'))
final_block_pool.get_submodule("0").register_forward_hook(get_features('layer5_slow'))

#### TEST Predictions

In [None]:
features_slow_list = []
features_fast_list = []

# Softmax over dimension 1 of the predictions; built once instead of once per clip.
post_act = torch.nn.Softmax(dim=1)

# Inference only: disabling gradient tracking avoids keeping autograd state
# for every clip, which saves memory.
with torch.no_grad():
    for inputs in preprocess_video(video_path):
        # placeholder for features, filled by the forward hooks
        features = {}

        # Pass the input clip through the model
        preds = model(inputs)

        # Get the features from the last layer
        # NB: compute the mean of the features across axes 2, 3 and 4
        features_fast_list.append(features['layer5_fast'].cpu().numpy().mean(axis=(2, 3, 4)))
        features_slow_list.append(features['layer5_slow'].cpu().numpy().mean(axis=(2, 3, 4)))

        # Get the predicted classes
        preds = post_act(preds)
        pred_classes = preds.topk(k=5).indices[0]

        # Map the predicted classes to the label names
        pred_class_names = [kinetics_id_to_classname[int(i)] for i in pred_classes]
        print("Top 5 predicted labels: %s" % ", ".join(pred_class_names))

In [None]:
# Shape of the per-clip fast-pathway features stacked along the first axis
# (np.concatenate joins along axis 0 by default).
np.concatenate(features_fast_list).shape

## Save the embeddings to disk

In [None]:
# Make sure the output directory exists before anything is written to it.
os.makedirs("embeddings", exist_ok=True)

# Sorted so that videos are processed in a reproducible order.
for video_path in sorted(glob.glob("/homes/lm004/commercials/annotated_commercials/*.mp4")):

    features_slow_list = []
    features_fast_list = []

    # Inference only: no gradient tracking is needed, which saves memory.
    with torch.no_grad():
        for clip in preprocess_video(video_path):
            # placeholder for features, filled by the forward hooks
            features = {}

            # Pass the input clip through the model; only the hooked
            # features are used, so the predictions are discarded.
            model(clip)

            # Get the features from the last layer
            # NB: compute the mean of the features across axes 2, 3 and 4
            features_fast_list.append(features['layer5_fast'].cpu().numpy().mean(axis=(2, 3, 4)))
            features_slow_list.append(features['layer5_slow'].cpu().numpy().mean(axis=(2, 3, 4)))

    # np.concatenate raises on an empty list, so skip videos that produced no clip.
    if not features_fast_list:
        print(f"Skipping {video_path}: no clips could be extracted")
        continue

    features_fast = np.concatenate(features_fast_list, axis=0)
    features_slow = np.concatenate(features_slow_list, axis=0)

    # File name without the '_trimmed.mp4' suffix identifies the stimulus.
    stimulus_id = os.path.basename(video_path).replace('_trimmed.mp4', '')

    # Passing a path lets NumPy open and close the file itself, so no file
    # handle is left open.
    np.save(f"embeddings/{stimulus_id}_fast.npy", features_fast)
    np.save(f"embeddings/{stimulus_id}_slow.npy", features_slow)