# R(2+1)D Model Fine-tuning on HMDB51  

In this notebook, we show how to fine-tune the pretrained R(2+1)D model. We use the [HMDB51](http://serre-lab.clps.brown.edu/resource/hmdb-a-large-human-motion-database/) human action dataset for this example.

In [None]:
# Enable IPython's autoreload extension in mode 2 so that edited modules are reloaded
# before each cell runs (handy while iterating on the project's `utils_cv` package).
%load_ext autoreload
%autoreload 2

In [None]:
import sys
sys.path.append("../../")
import time
import os
import numpy as np
from sklearn.metrics import accuracy_score
import torch
import torch.cuda as cuda
import torch.nn as nn
import torchvision

from utils_cv.action_recognition.data import show_batch, VideoDataset
from utils_cv.action_recognition.model import R2Plus1D 
from utils_cv.action_recognition import system_info

# Call the project helper imported from utils_cv.action_recognition
# (presumably reports the runtime environment — its output is not visible here).
system_info()

### Prerequisites
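* Download and extract the HMDB51 videos under the `./data/hmdb51/videos` directory, and place the split files (`hmdb51_vid_train_split_1.txt` and `hmdb51_vid_val_split_1.txt`) under `./data/hmdb51`.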

In [None]:
# All HMDB51 assets used by this notebook are expected below ./data/hmdb51.
DATA_ROOT = os.path.join("data", "hmdb51")

# Video folder, followed by the train and validation file lists of the split known as "split1".
VIDEO_DIR, TRAIN_SPLIT, TEST_SPLIT = (
    os.path.join(DATA_ROOT, entry)
    for entry in (
        "videos",
        "hmdb51_vid_train_split_1.txt",
        "hmdb51_vid_val_split_1.txt",
    )
)

## Model and Training Configurations

In [None]:
# Clip length fed to the network: choose the 8-frame or the 32-frame model.
MODEL_INPUT_SIZE = 32
# Batch size for the selected model (use 16 for the 8-frame model).
BATCH_SIZE = 8

# Model configuration
r2plus1d_hmdb51_cfgs = {
    # --- HMDB51 dataset spec ---
    "num_classes": 51,
    "video_dir": VIDEO_DIR,
    "train_split": TRAIN_SPLIT,
    "valid_split": TEST_SPLIT,
    # --- Pre-trained model spec ("Closer look" and "Large-scale" papers) ---
    "base_model": "ig65m",
    "sample_length": MODEL_INPUT_SIZE,
    "sample_step": 1,  # Frame sampling step
    "im_scale": 128,  # After scaling, the frames will be cropped to (112 x 112)
    "mean": (0.43216, 0.394666, 0.37645),
    "std": (0.22803, 0.22145, 0.216989),
    # --- Augmentation ---
    "random_shift": True,
    "temporal_jitter_step": 2,  # Temporal jitter step in frames (only for training set)
    "flip_ratio": 0.5,
    "random_crop": True,
    "video_ext": "avi",
}

# Training configuration
train_cfgs = {
    "mixed_prec": False,
    "batch_size": BATCH_SIZE,
    "grad_steps": 2,
    "lr": 0.001,  # 0.001 ("Closer look" paper, HMDB51)
    "momentum": 0.95,
    # Disabled option kept for reference:
    # "warmup_pct": 0.3,  # First 30% of the steps will be used for warming-up
    "lr_decay_factor": 0.001,
    "weight_decay": 0.0001,
    "epochs": 1,  # single epoch here; 48 is noted as the alternative value
    "model_name": "hmdb51",
    "model_dir": os.path.join("checkpoints", "ig65m_kinetics"),
}

## Load Model and Data

Load R(2+1)D 34-layer model pre-trained on IG65M. There are two versions of the model: 8-frame model and 32-frame model based on the input clip length.

In [None]:
# Build the learner from the model configuration dictionary defined earlier in the notebook.
learn = R2Plus1D(r2plus1d_hmdb51_cfgs)

Model structure

In [None]:
# Bare expression: the notebook renders the repr of the underlying model as the cell output.
learn.model

Three examples of training (transformed) clips. 

In [None]:
# Display three sample clips using the default data selection
# (the training set, per the accompanying notebook text).
learn.show_batch(num_samples=3)

In [None]:
# Same visualization, this time for three clips taken from the 'valid' data.
learn.show_batch(which_data='valid', num_samples=3)

## Fine-tuning

In [None]:
# Run fine-tuning with the training configuration (learning rate, epochs, checkpoint location, ...).
learn.fit(train_cfgs)

Each reported accuracy is the average of the batch-wise accuracies.

## Model Test

Reported accuracy from "Closer look" paper: 74.5% (clip accuracy of 66.1% on split1 based on VMZ repo)

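1. Uniformly sample 10 clips from each test video: [10 x 3 x T x 112 x 112], where T is the clip length (8 or 32 frames, i.e. `MODEL_INPUT_SIZE`).
2. Run inference on the 10 clips of a video as one batch and compute the clip-level accuracy from the per-clip predictions.
3. Compute the video-level prediction by aggregating the outputs of the 10 clips of each video, and derive the video-level accuracy from it.
4. Report both accuracies averaged over all test clips and all test videos, respectively.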

In [None]:
# ### Load Saved Weights if needed
#learn.load(train_cfgs['model_name'] + "_032", train_cfgs['model_dir'])

In [None]:
# Start from the CPU defaults and upgrade to CUDA when a GPU is visible.
device, num_gpus = torch.device("cpu"), 0
if cuda.is_available():
    device, num_gpus = torch.device("cuda"), cuda.device_count()
    # Let cuDNN search for the optimal set of algorithms.
    # Only appropriate when the input sizes are fixed.
    torch.backends.cudnn.benchmark = True

In [None]:
# Uniformly sample 10 clips per video ("Closer look" paper).
num_segments = 10

# Evaluation dataset built on the validation split with every random augmentation switched off
# (no random shift, temporal jitter, flipping or random cropping).
test_ds = VideoDataset(
    split_file=r2plus1d_hmdb51_cfgs['valid_split'],
    video_dir=r2plus1d_hmdb51_cfgs['video_dir'],
    num_segments=num_segments,
    sample_length=r2plus1d_hmdb51_cfgs['sample_length'],
    # Take the frame sampling step from the model configuration instead of repeating the literal,
    # so evaluation clips keep the configured frame spacing if the configuration is edited.
    sample_step=r2plus1d_hmdb51_cfgs['sample_step'],
    input_size=112,  # matches the (112 x 112) crop mentioned in the model configuration
    im_scale=r2plus1d_hmdb51_cfgs['im_scale'],
    resize_keep_ratio=True,
    mean=r2plus1d_hmdb51_cfgs['mean'],
    std=r2plus1d_hmdb51_cfgs['std'],
    random_shift=False,
    temporal_jitter=False,
    flip_ratio=0.0,
    random_crop=False,
    random_crop_scales=None,
    video_ext=r2plus1d_hmdb51_cfgs['video_ext'],
)

In [None]:
# Print the number of test samples and the shape of a single clip tensor.
# test_ds[0] is a (clips, label) pair as unpacked in the evaluation loop; [0][0] picks its first clip.
print("{} samples of {}".format(len(test_ds), test_ds[0][0][0].shape))

In [None]:
# Visualize the clips sampled from the first test sample.
# The normalization statistics are passed along, presumably so the helper can undo the
# normalization before plotting — TODO confirm against utils_cv's show_batch.
show_batch(
    test_ds[0][0],
    r2plus1d_hmdb51_cfgs['sample_length'],
    r2plus1d_hmdb51_cfgs['mean'],
    r2plus1d_hmdb51_cfgs['std']
)

In [None]:
# Evaluate the fine-tuned network on the test set, one video (a batch of clips) at a time,
# collecting clip-level and video-level predictions plus the forward-pass time per video.
model = learn.model
model.to(device)
if num_gpus > 1:
    # Spread the clips of each video over all visible GPUs.
    model = nn.DataParallel(model)

model.eval()
infer_times = []
video_preds = []
video_trues = []
clip_preds = []
clip_trues = []

# CUDA kernels are launched asynchronously, so the device has to be synchronized around
# the forward pass; otherwise the timer mostly measures the kernel launch overhead.
time_on_gpu = device.type == "cuda"

report_every = 100
with torch.no_grad():
    for i, (inputs, label) in enumerate(test_ds, start=1):
        if i % report_every == 0:
            print("{} samples have processed".format(i))

        inputs = inputs.to(device, non_blocking=True)

        if time_on_gpu:
            # Make sure the (non-blocking) host-to-device copy is not billed to the model.
            cuda.synchronize()
        start_time = time.perf_counter()
        outputs = model(inputs)
        if time_on_gpu:
            # Wait for the forward pass to actually finish before stopping the clock.
            cuda.synchronize()
        infer_time = time.perf_counter() - start_time

        outputs = outputs.cpu().numpy()

        infer_times.append(infer_time)
        # Video-level prediction: class with the largest output summed over the clips.
        video_preds.append(outputs.sum(axis=0).argmax())
        video_trues.append(label)
        # Clip-level predictions: one per row of the output. Repeat the label once per
        # predicted clip so both lists always stay aligned.
        clip_preds.extend(outputs.argmax(axis=1))
        clip_trues.extend([label] * len(outputs))

print("Done! {} samples have processed".format(len(test_ds)))

print(
    "Avg. inference time per video ({} clips) =".format(num_segments),
    np.array(infer_times).mean() * 1000,
    "ms",
)
print("Video prediction accuracy =", accuracy_score(video_trues, video_preds))
print("Clip prediction accuracy =", accuracy_score(clip_trues, clip_preds))