[Feature] Support Imgaug for augmentations in the data pipeline. (#492)

* imgaug first commit. * update changelog * add unittest & fix a few bugs * add imgaug in optional.txt * add docs & add iaa.Augmenter as input & add unittest * improve codecov * fix * fix __repr__ * fix changelog * fix docs/typo/class name, etc. * add modality assert for imgaug * remove iaa.Rotate sample * 1. fix multi-gpu bug 2. add tsn/i3d demo config 3. add assert for in&out dtype 4. update docs
open-mmlab · Jan 20, 2021 · 3a3e10a · 3a3e10a
1 parent 910d2fb
commit 3a3e10a
Show file tree

Hide file tree

Showing 7 changed files with 572 additions and 3 deletions.
diff --git a/configs/recognition/i3d/i3d_r50_video_imgaug_32x2x1_100e_kinetics400_rgb.py b/configs/recognition/i3d/i3d_r50_video_imgaug_32x2x1_100e_kinetics400_rgb.py
@@ -0,0 +1,132 @@
+# model settings
+model = dict(
+    type='Recognizer3D',
+    backbone=dict(
+        type='ResNet3d',
+        pretrained2d=True,
+        pretrained='torchvision://resnet50',
+        depth=50,
+        conv_cfg=dict(type='Conv3d'),
+        norm_eval=False,
+        inflate=((1, 1, 1), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0), (0, 1, 0)),
+        zero_init_residual=False),
+    cls_head=dict(
+        type='I3DHead',
+        num_classes=400,
+        in_channels=2048,
+        spatial_type='avg',
+        dropout_ratio=0.5,
+        init_std=0.01))
+# model training and testing settings
+train_cfg = None
+test_cfg = dict(average_clips='prob')
+# dataset settings
+dataset_type = 'VideoDataset'
+data_root = 'data/kinetics400/videos_train'
+data_root_val = 'data/kinetics400/videos_val'
+ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt'
+ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt'
+ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
+train_pipeline = [
+    dict(type='DecordInit'),
+    dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1),
+    dict(type='DecordDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(
+        type='MultiScaleCrop',
+        input_size=224,
+        scales=(1, 0.8),
+        random_crop=False,
+        max_wh_scale_gap=0),
+    dict(type='Resize', scale=(224, 224), keep_ratio=False),
+    dict(
+        type='Imgaug',
+        transforms=[
+            dict(type='Fliplr', p=0.5),
+            dict(type='Rotate', rotate=(-20, 20)),
+            dict(type='Dropout', p=(0, 0.05))
+        ]),
+    # dict(type='Imgaug', transforms='default'),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCTHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs', 'label'])
+]
+val_pipeline = [
+    dict(type='DecordInit'),
+    dict(
+        type='SampleFrames',
+        clip_len=32,
+        frame_interval=2,
+        num_clips=1,
+        test_mode=True),
+    dict(type='DecordDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='CenterCrop', crop_size=224),
+    dict(type='Flip', flip_ratio=0),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCTHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs'])
+]
+test_pipeline = [
+    dict(type='DecordInit'),
+    dict(
+        type='SampleFrames',
+        clip_len=32,
+        frame_interval=2,
+        num_clips=10,
+        test_mode=True),
+    dict(type='DecordDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='ThreeCrop', crop_size=256),
+    dict(type='Flip', flip_ratio=0),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCTHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs'])
+]
+data = dict(
+    videos_per_gpu=8,
+    workers_per_gpu=4,
+    train=dict(
+        type=dataset_type,
+        ann_file=ann_file_train,
+        data_prefix=data_root,
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        ann_file=ann_file_val,
+        data_prefix=data_root_val,
+        pipeline=val_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=ann_file_val,
+        data_prefix=data_root_val,
+        pipeline=test_pipeline))
+# optimizer
+optimizer = dict(
+    type='SGD', lr=0.01, momentum=0.9,
+    weight_decay=0.0001)  # this lr is used for 8 gpus
+optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
+# learning policy
+lr_config = dict(policy='step', step=[40, 80])
+total_epochs = 100
+checkpoint_config = dict(interval=5)
+evaluation = dict(
+    interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'])
+log_config = dict(
+    interval=20,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        # dict(type='TensorboardLoggerHook'),
+    ])
+# runtime settings
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+work_dir = './work_dirs/i3d_r50_video_3d_32x2x1_100e_kinetics400_rgb/'
+load_from = None
+resume_from = None
+workflow = [('train', 1)]
diff --git a/configs/recognition/tsn/tsn_r50_video_imgaug_1x1x8_100e_kinetics400_rgb.py b/configs/recognition/tsn/tsn_r50_video_imgaug_1x1x8_100e_kinetics400_rgb.py
@@ -0,0 +1,128 @@
+# model settings
+model = dict(
+    type='Recognizer2D',
+    backbone=dict(
+        type='ResNet',
+        pretrained='torchvision://resnet50',
+        depth=50,
+        norm_eval=False),
+    cls_head=dict(
+        type='TSNHead',
+        num_classes=400,
+        in_channels=2048,
+        spatial_type='avg',
+        consensus=dict(type='AvgConsensus', dim=1),
+        dropout_ratio=0.4,
+        init_std=0.01))
+# model training and testing settings
+train_cfg = None
+test_cfg = dict(average_clips=None)
+# dataset settings
+dataset_type = 'VideoDataset'
+data_root = 'data/kinetics400/videos_train'
+data_root_val = 'data/kinetics400/videos_val'
+ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt'
+ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt'
+ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
+train_pipeline = [
+    dict(type='DecordInit'),
+    dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
+    dict(type='DecordDecode'),
+    dict(
+        type='MultiScaleCrop',
+        input_size=224,
+        scales=(1, 0.875, 0.75, 0.66),
+        random_crop=False,
+        max_wh_scale_gap=1),
+    dict(type='Resize', scale=(224, 224), keep_ratio=False),
+    dict(type='Flip', flip_ratio=0.5),
+    dict(type='Imgaug', transforms='default'),
+    # dict(
+    #     type='Imgaug',
+    #     transforms=[
+    #         dict(type='Rotate', rotate=(-20, 20)),
+    #         dict(type='Dropout', p=(0, 0.05))
+    #     ]),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs', 'label'])
+]
+val_pipeline = [
+    dict(type='DecordInit'),
+    dict(
+        type='SampleFrames',
+        clip_len=1,
+        frame_interval=1,
+        num_clips=8,
+        test_mode=True),
+    dict(type='DecordDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='CenterCrop', crop_size=224),
+    dict(type='Flip', flip_ratio=0),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs'])
+]
+test_pipeline = [
+    dict(type='DecordInit'),
+    dict(
+        type='SampleFrames',
+        clip_len=1,
+        frame_interval=1,
+        num_clips=25,
+        test_mode=True),
+    dict(type='DecordDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='ThreeCrop', crop_size=256),
+    dict(type='Flip', flip_ratio=0),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs'])
+]
+data = dict(
+    videos_per_gpu=32,
+    workers_per_gpu=4,
+    train=dict(
+        type=dataset_type,
+        ann_file=ann_file_train,
+        data_prefix=data_root,
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        ann_file=ann_file_val,
+        data_prefix=data_root_val,
+        pipeline=val_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=ann_file_test,
+        data_prefix=data_root_val,
+        pipeline=test_pipeline))
+# optimizer
+optimizer = dict(
+    type='SGD', lr=0.01, momentum=0.9,
+    weight_decay=0.0001)  # this lr is used for 8 gpus
+optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
+# learning policy
+lr_config = dict(policy='step', step=[40, 80])
+total_epochs = 100
+checkpoint_config = dict(interval=1)
+evaluation = dict(
+    interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'])
+log_config = dict(
+    interval=20,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        # dict(type='TensorboardLoggerHook'),
+    ])
+# runtime settings
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+work_dir = './work_dirs/tsn_r50_video_1x1x8_100e_kinetics400_rgb/'
+load_from = None
+resume_from = None
+workflow = [('train', 1)]
diff --git a/docs/changelog.md b/docs/changelog.md
@@ -6,6 +6,8 @@
 
 **New Features**
 
+- Support [imgaug](https://imgaug.readthedocs.io/en/latest/index.html) for augmentations in the data pipeline ([#492](https://github.com/open-mmlab/mmaction2/pull/492))
+
 **Improvements**
 
 - Support setting `max_testing_views` for extremely large models to save GPU memory used ([#511](https://github.com/open-mmlab/mmaction2/pull/511))

diff --git a/mmaction/datasets/pipelines/__init__.py b/mmaction/datasets/pipelines/__init__.py
@@ -1,6 +1,6 @@
 from .augmentations import (AudioAmplify, CenterCrop, ColorJitter,
                             EntityBoxCrop, EntityBoxFlip, EntityBoxRescale,
-                            Flip, Fuse, MelSpectrogram, MultiGroupCrop,
+                            Flip, Fuse, Imgaug, MelSpectrogram, MultiGroupCrop,
                             MultiScaleCrop, Normalize, RandomCrop,
                             RandomRescale, RandomResizedCrop, RandomScale,
                             Resize, TenCrop, ThreeCrop)
@@ -31,5 +31,5 @@
     'FormatAudioShape', 'LoadAudioFeature', 'AudioFeatureSelector',
     'AudioDecodeInit', 'EntityBoxFlip', 'EntityBoxCrop', 'EntityBoxRescale',
     'RandomScale', 'ImageDecode', 'BuildPseudoClip', 'RandomRescale',
-    'PyAVDecodeMotionVector', 'Rename'
+    'PyAVDecodeMotionVector', 'Rename', 'Imgaug'
 ]