In [3]:
import copy
# Check nvcc version
!nvcc -V
# Check GCC version
!gcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2021 NVIDIA Corporation
Built on Mon_Sep_13_20:11:50_Pacific_Daylight_Time_2021
Cuda compilation tools, release 11.5, V11.5.50
Build cuda_11.5.r11.5/compiler.30411180_0
gcc (MinGW.org GCC-6.3.0-1) 6.3.0
Copyright (C) 2016 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.



In [4]:
# Check Pytorch installation
import torch, torchvision
print(torch.__version__, torch.cuda.is_available())

# Check MMAction2 installation
import mmaction
print(mmaction.__version__)

# Check MMCV installation
from mmcv.ops import get_compiling_cuda_version, get_compiler_version
print(get_compiling_cuda_version())
print(get_compiler_version())

1.13.1+cu116 True


AttributeError: module 'mmaction' has no attribute '__version__'

In [None]:
from mmaction.apis import init_recognizer

# Choose to use a config and initialize the recognizer
config = '../mmaction2/configs/recognition/tsn/tsn_r50_video_inference_1x1x3_100e_kinetics400_rgb.py'
# Setup a checkpoint file to load
checkpoint = 'checkpoints/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth'
# Initialize the recognizer
model = init_recognizer(config, checkpoint, device='cuda:0')

In [None]:
model

In [None]:
model.cls_head.fc_cls

In [None]:
import os
import os.path as osp
import re
import warnings
from operator import itemgetter

import mmcv
import numpy as np
import torch
from mmcv.parallel import collate, scatter
from mmcv.runner import load_checkpoint

from mmaction.core import OutputHook
from mmaction.datasets.pipelines import Compose
from mmaction.models import build_recognizer

In [None]:
def inference_recognizer(model, video, outputs=None, as_tensor=True, **kwargs):
    """Inference a video with the recognizer.

    Args:
        model (nn.Module): The loaded recognizer.
        video (str | dict | ndarray): The video file path / url or the
            rawframes directory path / results dictionary (the input of
            pipeline) / a 4D array T x H x W x 3 (The input video).
        outputs (list(str) | tuple(str) | str | None) : Names of layers whose
            outputs need to be returned, default: None.
        as_tensor (bool): Same as that in ``OutputHook``. Default: True.

    Returns:
        dict[tuple(str, float)]: Top-5 recognition result dict.
        dict[torch.tensor | np.ndarray]:
            Output feature maps from layers specified in `outputs`.
    """
    if 'use_frames' in kwargs:
        warnings.warn('The argument `use_frames` is deprecated PR #1191. '
                      'Now you can use models trained with frames or videos '
                      'arbitrarily. ')
    if 'label_path' in kwargs:
        warnings.warn('The argument `use_frames` is deprecated PR #1191. '
                      'Now the label file is not needed in '
                      'inference_recognizer. ')

    input_flag = None
    if isinstance(video, dict):
        input_flag = 'dict'
    elif isinstance(video, np.ndarray):
        assert len(video.shape) == 4, 'The shape should be T x H x W x C'
        input_flag = 'array'
    elif isinstance(video, str) and video.startswith('http'):
        input_flag = 'video'
    elif isinstance(video, str) and osp.exists(video):
        if osp.isfile(video):
            if video.endswith('.npy'):
                input_flag = 'audio'
            else:
                input_flag = 'video'
        if osp.isdir(video):
            input_flag = 'rawframes'
    else:
        raise RuntimeError('The type of argument video is not supported: '
                           f'{type(video)}')

    if isinstance(outputs, str):
        outputs = (outputs, )
    assert outputs is None or isinstance(outputs, (tuple, list))

    cfg = model.cfg
    device = next(model.parameters()).device  # model device
    # build the data pipeline
    test_pipeline = cfg.data.test.pipeline
    # Alter data pipelines & prepare inputs
    if input_flag == 'dict':
        data = video
    if input_flag == 'array':
        modality_map = {2: 'Flow', 3: 'RGB'}
        modality = modality_map.get(video.shape[-1])
        data = dict(
            total_frames=video.shape[0],
            label=-1,
            start_index=0,
            array=video,
            modality=modality)
        for i in range(len(test_pipeline)):
            if 'Decode' in test_pipeline[i]['type']:
                test_pipeline[i] = dict(type='ArrayDecode')
        test_pipeline = [x for x in test_pipeline if 'Init' not in x['type']]
    if input_flag == 'video':
        data = dict(filename=video, label=-1, start_index=0, modality='RGB')
        if 'Init' not in test_pipeline[0]['type']:
            test_pipeline = [dict(type='OpenCVInit')] + test_pipeline
        else:
            test_pipeline[0] = dict(type='OpenCVInit')
        for i in range(len(test_pipeline)):
            if 'Decode' in test_pipeline[i]['type']:
                test_pipeline[i] = dict(type='OpenCVDecode')
    if input_flag == 'rawframes':
        filename_tmpl = cfg.data.test.get('filename_tmpl', 'img_{:05}.jpg')
        modality = cfg.data.test.get('modality', 'RGB')
        start_index = cfg.data.test.get('start_index', 1)

        # count the number of frames that match the format of `filename_tmpl`
        # RGB pattern example: img_{:05}.jpg -> ^img_\d+.jpg$
        # Flow patteren example: {}_{:05d}.jpg -> ^x_\d+.jpg$
        pattern = f'^{filename_tmpl}$'
        if modality == 'Flow':
            pattern = pattern.replace('{}', 'x')
        pattern = pattern.replace(
            pattern[pattern.find('{'):pattern.find('}') + 1], '\\d+')
        total_frames = len(
            list(
                filter(lambda x: re.match(pattern, x) is not None,
                       os.listdir(video))))
        data = dict(
            frame_dir=video,
            total_frames=total_frames,
            label=-1,
            start_index=start_index,
            filename_tmpl=filename_tmpl,
            modality=modality)
        if 'Init' in test_pipeline[0]['type']:
            test_pipeline = test_pipeline[1:]
        for i in range(len(test_pipeline)):
            if 'Decode' in test_pipeline[i]['type']:
                test_pipeline[i] = dict(type='RawFrameDecode')
    if input_flag == 'audio':
        data = dict(
            audio_path=video,
            total_frames=len(np.load(video)),
            start_index=cfg.data.test.get('start_index', 1),
            label=-1)

    test_pipeline = Compose(test_pipeline)
    data = test_pipeline(data)
    data = collate([data], samples_per_gpu=1)

    if next(model.parameters()).is_cuda:
        # scatter to specified GPU
        data = scatter(data, [device])[0]

    # forward the model
    # with OutputHook(model, outputs=outputs, as_tensor=as_tensor) as h:
    # model.cls_head.register_forward_hook(get_activation('dropout'))
    # with torch.no_grad():
    #     result = model(return_loss=False, **data)
    with torch.no_grad():
        my_output = None
        
        def my_hook(module_, input_, output_):
            nonlocal my_output
            my_output = output_

        #a_hook = model.backbone.layer3.register_forward_hook(my_hook)
        a_hook = model.cls_head.fc_cls.register_forward_hook(my_hook)
        model(return_loss=False, **data)
        a_hook.remove()
        # return my_output
    #     returned_features = h.layer_outputs if outputs else None

    # num_classes = scores.shape[-1]
    # score_tuples = tuple(zip(range(num_classes), scores))
    # score_sorted = sorted(score_tuples, key=itemgetter(1), reverse=True)

    # top5_label = score_sorted[:5]
    # if outputs:
    #     return top5_label, returned_features
    return my_output.squeeze()

In [13]:
# Use the recognizer to do inference
video = '../data/nipun_video_dataset/PAMAP2_K10_V1/ascending stairs/as9.mp4'
#label = '../mmaction2/tools/data/kinetics/label_map_k400.txt'
results = inference_recognizer(model, video)

# print(results)
# labels = open(label).readlines()
# labels = [x.strip() for x in labels]
# results = [(labels[k[0]], k[1]) for k in results]

In [14]:
type(results), results.shape

(torch.Tensor, torch.Size([400]))

---

In [15]:
import os 
import glob
from tqdm import tqdm

In [16]:
dir_path = '../data/nipun_video_dataset/PAMAP2_K10_V1/'

In [17]:
video_ft_dict = {}
for p in tqdm(glob.glob(dir_path+'*/*.mp4',recursive=True)):
  action = p.split(os.path.sep)[-2]
  ft_vector = inference_recognizer(model, p).cpu().numpy()
  try:
    video_ft_dict[action].append(ft_vector)
  except KeyError:
    video_ft_dict[action] = [ft_vector]
    
"""
for p in glob.glob(dir_path+'*/*.avi',recursive=True):
  action = p.split(os.path.sep)[-2]
  ft_vector = inference_recognizer(model, p).cpu().numpy()
  try:
    video_ft_dict[action].append(ft_vector)
  except:
    video_ft_dict[action] = [ft_vector]
"""


100%|██████████| 180/180 [13:53<00:00,  4.63s/it]


"\nfor p in glob.glob(dir_path+'*/*.avi',recursive=True):\n  action = p.split(os.path.sep)[-2]\n  ft_vector = inference_recognizer(model, p).cpu().numpy()\n  try:\n    video_ft_dict[action].append(ft_vector)\n  except:\n    video_ft_dict[action] = [ft_vector]\n"

In [18]:
video_ft_dict.keys()

dict_keys(['ascending stairs', 'car driving', 'computer work', 'cycling', 'descending stairs', 'folding laundry', 'house cleaning', 'ironing', 'lying', 'Nordic walking', 'playing soccer', 'rope jumping', 'running', 'sitting', 'standing', 'vacuum cleaning', 'walking', 'watching TV'])

In [19]:
feat_dict = video_ft_dict

In [20]:
def save_file(feat_d:dict,save_loc:str):
    __class = []
    __features = []
    for k,v in feat_d.items():
        for feature in v:
            __class.append(k)
            __features.append(feature)

    __class = np.asarray(__class)
    __features = np.asarray(__features)

    np.savez(save_loc,activity=__class,features=__features)

In [21]:
save_file(feat_dict,"../data/I3D/video_feat/UTD_MHD/feat_dict_400.npz")