[Feature] HVU Training #235

Merged
merged 38 commits into master from hvu_training on Oct 21, 2020
Changes from 35 commits

Commits (38)
7a3d259
add hvu video dataset
Oct 1, 2020
5bb55ae
add aux_info interface
Oct 2, 2020
b9c9039
add HVU Loss
Oct 3, 2020
abf64c5
update arg name
Oct 3, 2020
24d3f9a
fix bug
Oct 3, 2020
e00b5c6
fix bug
Oct 3, 2020
c28a001
put HVU label loading in loading.py
Oct 4, 2020
b1127d8
update HVU Dataset, so that it accept both video and rawframe annotat…
Oct 4, 2020
4a72546
fix bug
Oct 4, 2020
815e5f1
update
Oct 4, 2020
828e1c1
fix bug
Oct 5, 2020
65c19b3
fix bug
Oct 6, 2020
31c7df5
fix bug
Oct 6, 2020
48c7ac4
fix bug
Oct 6, 2020
bff20c0
fix bug in HVU Loss
Oct 7, 2020
ee87824
resolve comments
Oct 8, 2020
e4d8948
resolve comments
Oct 10, 2020
82c3115
add option: `reduction = "sum"`
Oct 10, 2020
f7a3a20
bug fix
Oct 10, 2020
cae8439
deal with multi class label in a lazy style
Oct 10, 2020
d36ba78
bug fix
Oct 10, 2020
65dbc19
Merge branch 'master' into hvu_training
kennymckormick Oct 10, 2020
0003de8
fix evaluation bug
Oct 10, 2020
41d44b7
fix testing
Oct 10, 2020
bab5921
add testing for HVULoss and LoadHVULabel
Oct 11, 2020
61d1775
fix testing bug
Oct 11, 2020
4e64dd0
bug fix
Oct 11, 2020
b1bbffb
add testing for HVU Dataset
Oct 13, 2020
aecceeb
add mean_average_precision, rename the old one as mmit_mean_average_p…
Oct 13, 2020
037c0c6
fix hvu dataset testing
Oct 13, 2020
a72a2c7
fix testing bug
Oct 13, 2020
c5f00a8
resolve comments
Oct 16, 2020
05575c1
update changelog
Oct 16, 2020
eb09070
Merge branch 'master' of https://github.com/open-mmlab/mmaction2
Oct 19, 2020
332bad7
Merge branch 'master' into hvu_training
Oct 19, 2020
0d67e97
update changelog
Oct 19, 2020
86ac35d
Merge branch 'master' into hvu_training
kennymckormick Oct 21, 2020
6c6eda2
fix linting
Oct 21, 2020
2 changes: 1 addition & 1 deletion configs/recognition/tsn/tsn_r101_1x1x5_50e_mmit_rgb.py
@@ -117,7 +117,7 @@
lr_config = dict(policy='step', step=[20, 40])
total_epochs = 50
checkpoint_config = dict(interval=5)
evaluation = dict(interval=5, metrics=['mean_average_precision'])
evaluation = dict(interval=5, metrics=['mmit_mean_average_precision'])
# yapf:disable
log_config = dict(
interval=20,
1 change: 1 addition & 0 deletions docs/changelog.md
@@ -4,6 +4,7 @@

**Improvements**
- Set default values of 'average_clips' in each config file so that there is no need to set it explicitly during testing in most cases ([#232](https://github.com/open-mmlab/mmaction2/pull/232))
- Extend HVU datatools to generate individual file list for each tag category ([#258](https://github.com/open-mmlab/mmaction2/pull/258))
- Support data preparation for Kinetics-600 and Kinetics-700 ([#254](https://github.com/open-mmlab/mmaction2/pull/254))
- Add `cfg-options` in arguments to override some settings in the used config for convenience ([#212](https://github.com/open-mmlab/mmaction2/pull/212))

2 changes: 1 addition & 1 deletion docs/getting_started.md
@@ -57,7 +57,7 @@ python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [-

Optional arguments:
- `RESULT_FILE`: Filename of the output results. If not specified, the results will not be saved to a file.
- `EVAL_METRICS`: Items to be evaluated on the results. Allowed values depend on the dataset, e.g., `top_k_accuracy`, `mean_class_accuracy` are available for all datasets in recognition, `mean_average_precision` for Multi-Moments in Time, `AR@AN` for ActivityNet, etc.
- `EVAL_METRICS`: Items to be evaluated on the results. Allowed values depend on the dataset, e.g., `top_k_accuracy` and `mean_class_accuracy` are available for all recognition datasets, `mmit_mean_average_precision` for Multi-Moments in Time, `mean_average_precision` for Multi-Moments in Time and HVU single category, `AR@AN` for ActivityNet, etc.
- `--gpu-collect`: If specified, recognition results will be collected using gpu communication. Otherwise, it will save the results on different gpus to `TMPDIR` and collect them by the rank 0 worker.
- `TMPDIR`: Temporary directory used for collecting results from multiple workers, available when `--gpu-collect` is not specified.
- `OPTIONS`: Custom options used for evaluation. Allowed values depend on the arguments of the `evaluate` function in dataset.
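For context, this metric split shows up in config files the same way it does in the `tsn_r101_1x1x5_50e_mmit_rgb.py` change earlier in this PR; a minimal sketch (interval values are illustrative, not prescribed by the PR):

```python
# Multi-Moments in Time: per-sample (MMIT-style) mAP.
evaluation = dict(interval=5, metrics=['mmit_mean_average_precision'])

# HVU (evaluated per tag category) and other per-class multi-label setups.
evaluation = dict(interval=5, metrics=['mean_average_precision'])
```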
5 changes: 3 additions & 2 deletions mmaction/core/evaluation/__init__.py
@@ -2,7 +2,8 @@
average_recall_at_avg_proposals, confusion_matrix,
get_weighted_score, interpolated_precision_recall,
mean_average_precision, mean_class_accuracy,
pairwise_temporal_iou, softmax, top_k_accuracy)
mmit_mean_average_precision, pairwise_temporal_iou,
softmax, top_k_accuracy)
from .eval_detection import ActivityNetDetection
from .eval_hooks import DistEvalHook, EvalHook

@@ -11,5 +12,5 @@
'confusion_matrix', 'mean_average_precision', 'get_weighted_score',
'average_recall_at_avg_proposals', 'pairwise_temporal_iou',
'average_precision_at_temporal_iou', 'ActivityNetDetection', 'softmax',
'interpolated_precision_recall'
'interpolated_precision_recall', 'mmit_mean_average_precision'
]
33 changes: 31 additions & 2 deletions mmaction/core/evaluation/accuracy.py
@@ -104,17 +104,46 @@ def top_k_accuracy(scores, labels, topk=(1, )):
return res


def mmit_mean_average_precision(scores, labels):
"""Mean average precision for multi-label recognition. Used for reporting
MMIT style mAP on Multi-Moments in Times. The difference is that this
method calculates average-precision for each sample and averages them among
samples.

Args:
scores (list[np.ndarray]): Prediction scores of different classes for
each sample.
labels (list[np.ndarray]): Ground truth many-hot vector for each
sample.

Returns:
np.float: The MMIT style mean average precision.
"""
results = []
for i in range(len(scores)):
precision, recall, _ = binary_precision_recall_curve(
scores[i], labels[i])
ap = -np.sum(np.diff(recall) * np.array(precision)[:-1])
results.append(ap)
return np.mean(results)


def mean_average_precision(scores, labels):
"""Mean average precision for multi-label recognition.

Args:
scores (list[np.ndarray]): Prediction scores for each class.
labels (list[np.ndarray]): Ground truth many-hot vector.
scores (list[np.ndarray]): Prediction scores of different classes for
each sample.
labels (list[np.ndarray]): Ground truth many-hot vector for each
sample.

Returns:
np.float: The mean average precision.
"""
results = []
scores = np.stack(scores).T
labels = np.stack(labels).T

for i in range(len(scores)):
precision, recall, _ = binary_precision_recall_curve(
scores[i], labels[i])
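To make the difference concrete, here is a toy sketch (invented numbers) that feeds the same per-sample scores and many-hot labels to both functions: `mmit_mean_average_precision` averages the AP of each sample, while `mean_average_precision` transposes to per-class columns and averages the AP of each class:

```python
import numpy as np

from mmaction.core.evaluation import (mean_average_precision,
                                      mmit_mean_average_precision)

# Toy data: 3 samples x 4 classes; values are illustrative only.
scores = [np.array([0.9, 0.1, 0.4, 0.2]),
          np.array([0.3, 0.8, 0.7, 0.1]),
          np.array([0.2, 0.3, 0.6, 0.9])]
labels = [np.array([1, 0, 0, 0]),  # many-hot ground truth per sample
          np.array([0, 1, 1, 0]),
          np.array([0, 0, 1, 1])]

# Per-sample AP averaged over samples (MMIT style, Multi-Moments in Time).
print(mmit_mean_average_precision(scores, labels))
# Per-class AP averaged over classes (used for HVU per-category evaluation).
print(mean_average_precision(scores, labels))
```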
16 changes: 9 additions & 7 deletions mmaction/core/evaluation/eval_hooks.py
@@ -25,9 +25,10 @@ class EvalHook(Hook):
checkpoint during evaluation when ``save_best`` is set to True.
Options are the evaluation metrics to the test dataset. e.g.,
``top1_acc``, ``top5_acc``, ``mean_class_accuracy``,
``mean_average_precision`` for action recognition dataset
(RawframeDataset and VideoDataset). ``AR@AN``, ``auc`` for action
localization dataset (ActivityNetDataset). Default: `top1_acc`.
``mean_average_precision``, ``mmit_mean_average_precision``
for action recognition dataset (RawframeDataset and VideoDataset).
``AR@AN``, ``auc`` for action localization dataset
(ActivityNetDataset). Default: `top1_acc`.
rule (str | None): Comparison rule for best score. If set to None,
it will infer a reasonable rule. Default: 'None'.
eval_kwargs (dict, optional): Arguments for evaluation.
@@ -144,10 +145,11 @@ class DistEvalHook(EvalHook):
key_indicator (str | None): Key indicator to measure the best
checkpoint during evaluation when ``save_best`` is set to True.
Options are the evaluation metrics to the test dataset. e.g.,
``top1_acc``, ``top5_acc``, ``mean_class_accuracy``,
``mean_average_precision`` for action recognition dataset
(RawframeDataset and VideoDataset). ``AR@AN``, ``auc`` for action
localization dataset (ActivityNetDataset). Default: `top1_acc`.
``top1_acc``, ``top5_acc``, ``mean_class_accuracy``,
``mean_average_precision``, ``mmit_mean_average_precision``
for action recognition dataset (RawframeDataset and VideoDataset).
``AR@AN``, ``auc`` for action localization dataset
(ActivityNetDataset). Default: `top1_acc`.
rule (str | None): Comparison rule for best score. If set to None,
it will infer a reasonable rule. Default: 'None'.
eval_kwargs (dict, optional): Arguments for evaluation.
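A sketch of how these options might be set through a config's `evaluation` dict; `save_best` and `key_indicator` are taken from the docstring above, while the mapping from config keys to hook arguments is assumed here:

```python
# Hypothetical evaluation settings; values other than the documented
# `save_best` / `key_indicator` options are illustrative.
evaluation = dict(
    interval=5,
    metrics=['mean_average_precision'],
    save_best=True,
    key_indicator='mean_average_precision')
```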
4 changes: 3 additions & 1 deletion mmaction/datasets/__init__.py
@@ -2,11 +2,13 @@
from .base import BaseDataset
from .builder import build_dataloader, build_dataset
from .dataset_wrappers import RepeatDataset
from .hvu_dataset import HVUDataset
from .rawframe_dataset import RawframeDataset
from .ssn_dataset import SSNDataset
from .video_dataset import VideoDataset

__all__ = [
'VideoDataset', 'build_dataloader', 'build_dataset', 'RepeatDataset',
'RawframeDataset', 'BaseDataset', 'ActivityNetDataset', 'SSNDataset'
'RawframeDataset', 'BaseDataset', 'ActivityNetDataset', 'SSNDataset',
'HVUDataset'
]
23 changes: 18 additions & 5 deletions mmaction/datasets/base.py
@@ -74,15 +74,12 @@ def load_json_annotations(self):
num_videos = len(video_infos)
path_key = 'frame_dir' if 'frame_dir' in video_infos[0] else 'filename'
for i in range(num_videos):
path_value = video_infos[i][path_key]
if self.data_prefix is not None:
path_value = video_infos[i][path_key]
path_value = osp.join(self.data_prefix, path_value)
video_infos[i][path_key] = path_value
video_infos[i][path_key] = path_value
if self.multi_class:
assert self.num_classes is not None
onehot = torch.zeros(self.num_classes)
onehot[video_infos[i]['label']] = 1.
video_infos[i]['label'] = onehot
else:
assert len(video_infos[i]['label']) == 1
video_infos[i]['label'] = video_infos[i]['label'][0]
@@ -111,13 +108,29 @@ def prepare_train_frames(self, idx):
results = copy.deepcopy(self.video_infos[idx])
results['modality'] = self.modality
results['start_index'] = self.start_index

# prepare tensor in getitem
# If HVU, type(results['label']) is dict
if self.multi_class and type(results['label']) is list:
onehot = torch.zeros(self.num_classes)
onehot[results['label']] = 1.
results['label'] = onehot

return self.pipeline(results)

def prepare_test_frames(self, idx):
"""Prepare the frames for testing given the index."""
results = copy.deepcopy(self.video_infos[idx])
results['modality'] = self.modality
results['start_index'] = self.start_index

# prepare tensor in getitem
# If HVU, type(results['label']) is dict
if self.multi_class and type(results['label']) is list:
onehot = torch.zeros(self.num_classes)
onehot[results['label']] = 1.
results['label'] = onehot

return self.pipeline(results)

def __len__(self):
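The hunks above move one-hot conversion out of annotation loading and into item preparation: a plain list of label indices becomes a one-hot tensor on the fly, while an HVU-style dict label passes through unchanged for later pipeline transforms. A standalone sketch of that conversion (values illustrative):

```python
import torch

# Sketch of the lazy conversion done in prepare_train_frames / prepare_test_frames
# when `multi_class=True` and the label is a plain list of class indices.
num_classes = 10          # illustrative
label = [2, 5, 7]         # multi-label annotation for one sample

onehot = torch.zeros(num_classes)
onehot[label] = 1.        # ones at positions 2, 5 and 7; dict labels skip this
```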
179 changes: 179 additions & 0 deletions mmaction/datasets/hvu_dataset.py
@@ -0,0 +1,179 @@
import os.path as osp

import mmcv
import numpy as np
from mmcv.utils import print_log

from ..core import mean_average_precision
from .base import BaseDataset
from .registry import DATASETS


@DATASETS.register_module()
class HVUDataset(BaseDataset):
"""HVU dataset, which supports the recognition tags of multiple categories.
Accept both video annotation files or rawframe annotation files.

The dataset loads videos or raw frames and applies specified transforms to
return a dict containing the frame tensors and other information.

The ann_file is a json file with multiple dictionaries, and each dictionary
indicates a sample video with the filename and tags, the tags are organized
as different categories. Example of a video dictionary:

.. code-block:: txt

{
'filename': 'gD_G1b0wV5I_001015_001035.mp4',
'label': {
'concept': [250, 131, 42, 51, 57, 155, 122],
'object': [1570, 508],
'event': [16],
'action': [180],
'scene': [206]
}
}

Example of a rawframe dictionary:

.. code-block:: txt

{
'frame_dir': 'gD_G1b0wV5I_001015_001035',
'total_frames': 61,
'label': {
'concept': [250, 131, 42, 51, 57, 155, 122],
'object': [1570, 508],
'event': [16],
'action': [180],
'scene': [206]
}
}


Args:
ann_file (str): Path to the annotation file, should be a json file.
pipeline (list[dict | callable]): A sequence of data transforms.
tag_categories (list[str]): List of category names of tags.
tag_category_nums (list[int]): List of number of tags in each category.
filename_tmpl (str | None): Template for each filename. If it is None,
a video dataset (rather than a rawframe dataset) is used. Default: None.
**kwargs: Keyword arguments for ``BaseDataset``.
"""

def __init__(self,
ann_file,
pipeline,
tag_categories,
tag_category_nums,
filename_tmpl=None,
**kwargs):
assert len(tag_categories) == len(tag_category_nums)
self.tag_categories = tag_categories
self.tag_category_nums = tag_category_nums
self.filename_tmpl = filename_tmpl
self.num_categories = len(self.tag_categories)
self.num_tags = sum(self.tag_category_nums)
self.category2num = dict(zip(tag_categories, tag_category_nums))
self.start_idx = [0]
for i in range(self.num_categories - 1):
self.start_idx.append(self.start_idx[-1] +
self.tag_category_nums[i])
self.category2startidx = dict(zip(tag_categories, self.start_idx))
self.start_index = kwargs.pop('start_index', 0)
self.dataset_type = None
super().__init__(
ann_file, pipeline, start_index=self.start_index, **kwargs)

def load_annotations(self):
"""Load annotation file to get video information."""
assert self.ann_file.endswith('.json')
return self.load_json_annotations()

def load_json_annotations(self):
video_infos = mmcv.load(self.ann_file)
num_videos = len(video_infos)

video_info0 = video_infos[0]
assert ('filename' in video_info0) != ('frame_dir' in video_info0)
path_key = 'filename' if 'filename' in video_info0 else 'frame_dir'
self.dataset_type = 'video' if path_key == 'filename' else 'rawframe'
if self.dataset_type == 'rawframe':
assert self.filename_tmpl is not None

for i in range(num_videos):
path_value = video_infos[i][path_key]
if self.data_prefix is not None:
path_value = osp.join(self.data_prefix, path_value)
video_infos[i][path_key] = path_value

# We will convert label to torch tensors in the pipeline
video_infos[i]['categories'] = self.tag_categories
video_infos[i]['category_nums'] = self.tag_category_nums
if self.dataset_type == 'rawframe':
video_infos[i]['filename_tmpl'] = self.filename_tmpl
video_infos[i]['start_index'] = self.start_index
video_infos[i]['modality'] = self.modality

return video_infos

@staticmethod
def label2array(num, label):
arr = np.zeros(num, dtype=np.float32)
arr[label] = 1.
return arr

def evaluate(self, results, metrics='mean_average_precision', logger=None):
"""Evaluation in HVU Video Dataset. We only support evaluating mAP for
each tag categories. Since some tag categories are missing for some
videos, we can not evaluate mAP for all tags.

Args:
results (list): Output results.
metrics (str | sequence[str]): Metrics to be performed.
Defaults: 'mean_average_precision'.
logger (logging.Logger | None): Logger for recording.
Default: None.

Return:
dict: Evaluation results dict.
"""
if not isinstance(results, list):
raise TypeError(f'results must be a list, but got {type(results)}')
assert len(results) == len(self), (
f'The length of results is not equal to the dataset len: '
f'{len(results)} != {len(self)}')

metrics = metrics if isinstance(metrics, (list, tuple)) else [metrics]

# There should be only one metric in the metrics list:
# 'mean_average_precision'
assert len(metrics) == 1
metric = metrics[0]
assert metric == 'mean_average_precision'

gt_labels = [ann['label'] for ann in self.video_infos]

eval_results = {}
for i, category in enumerate(self.tag_categories):

start_idx = self.category2startidx[category]
num = self.category2num[category]
preds = [
result[start_idx:start_idx + num]
for video_idx, result in enumerate(results)
if category in gt_labels[video_idx]
]
gts = [
gt_label[category] for gt_label in gt_labels
if category in gt_label
]

gts = [self.label2array(num, item) for item in gts]

mAP = mean_average_precision(preds, gts)
eval_results[f'{category}_mAP'] = mAP
log_msg = f'\n{category}_mAP\t{mAP:.4f}'
print_log(log_msg, logger=logger)

return eval_results
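To show how `HVUDataset` lays out tags from several categories in one concatenated score vector, and how `evaluate` slices per-category predictions and builds many-hot targets with `label2array`, here is a toy walk-through (category names and sizes invented):

```python
import numpy as np

# Two made-up categories with 3 and 2 tags: 5 tags in total.
tag_categories = ['action', 'scene']
tag_category_nums = [3, 2]

# Start indices as computed in __init__: action -> 0, scene -> 3.
category2startidx = {'action': 0, 'scene': 3}
category2num = dict(zip(tag_categories, tag_category_nums))

# One model output per video covers all 5 tags.
result = np.array([0.9, 0.2, 0.1, 0.7, 0.3])
gt_label = {'action': [0], 'scene': [0]}   # per-category tag indices

for category in tag_categories:
    start, num = category2startidx[category], category2num[category]
    pred = result[start:start + num]       # scores for this category only
    gt = np.zeros(num, dtype=np.float32)   # equivalent of label2array
    gt[gt_label[category]] = 1.
    print(category, pred, gt)
```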
4 changes: 2 additions & 2 deletions mmaction/datasets/pipelines/__init__.py
@@ -6,7 +6,7 @@
from .formating import (Collect, FormatShape, ImageToTensor, ToDataContainer,
ToTensor, Transpose)
from .loading import (DecordDecode, DecordInit, DenseSampleFrames,
FrameSelector, GenerateLocalizationLabels,
FrameSelector, GenerateLocalizationLabels, LoadHVULabel,
LoadLocalizationFeature, LoadProposals, OpenCVDecode,
OpenCVInit, PyAVDecode, PyAVInit, RawFrameDecode,
SampleFrames, SampleProposalFrames,
@@ -21,5 +21,5 @@
'GenerateLocalizationLabels', 'LoadLocalizationFeature', 'LoadProposals',
'DecordInit', 'OpenCVInit', 'PyAVInit', 'SampleProposalFrames',
'UntrimmedSampleFrames', 'RawFrameDecode', 'DecordInit', 'OpenCVInit',
'PyAVInit', 'SampleProposalFrames', 'ColorJitter'
'PyAVInit', 'SampleProposalFrames', 'ColorJitter', 'LoadHVULabel'
]
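Finally, a hypothetical sketch of where a transform like `LoadHVULabel` might sit in an HVU training pipeline; its placement, arguments, and the keys it emits are assumptions, since its implementation lives in `loading.py`, which is not part of this excerpt:

```python
# Hypothetical HVU training pipeline fragment; transform order and collected
# keys are assumptions for illustration only.
train_pipeline = [
    dict(type='DecordInit'),
    dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
    dict(type='DecordDecode'),
    dict(type='Resize', scale=(224, 224), keep_ratio=False),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='LoadHVULabel'),  # convert the per-category label dict to tensors
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs', 'label']),
]
```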