open-mmlab · hellock · Jan 25, 2021 · Dec 18, 2020 · Dec 25, 2020 · Dec 28, 2020
diff --git a/configs/_base_/default_runtime.py b/configs/_base_/default_runtime.py
@@ -0,0 +1,13 @@
+checkpoint_config = dict(interval=1)
+log_config = dict(
+    interval=20,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        # dict(type='TensorboardLoggerHook'),
+    ])
+# runtime settings
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+load_from = None
+resume_from = None
+workflow = [('train', 1)]
diff --git a/configs/_base_/models/audioonly_r50.py b/configs/_base_/models/audioonly_r50.py
@@ -0,0 +1,18 @@
+# model settings
+model = dict(
+    type='AudioRecognizer',
+    backbone=dict(
+        type='ResNetAudio',
+        depth=50,
+        pretrained=None,
+        in_channels=1,
+        norm_eval=False),
+    cls_head=dict(
+        type='AudioTSNHead',
+        num_classes=400,
+        in_channels=1024,
+        dropout_ratio=0.5,
+        init_std=0.01))
+# model training and testing settings
+train_cfg = None
+test_cfg = dict(average_clips='prob')
diff --git a/configs/_base_/models/bmn_400x100.py b/configs/_base_/models/bmn_400x100.py
@@ -0,0 +1,15 @@
+# model settings
+model = dict(
+    type='BMN',
+    temporal_dim=100,
+    boundary_ratio=0.5,
+    num_samples=32,
+    num_samples_per_bin=3,
+    feat_dim=400,
+    soft_nms_alpha=0.4,
+    soft_nms_low_threshold=0.5,
+    soft_nms_high_threshold=0.9,
+    post_process_top_k=100)
+# model training and testing settings
+train_cfg = None
+test_cfg = dict(average_clips='score')
diff --git a/configs/_base_/models/bsm_pem.py b/configs/_base_/models/bsm_pem.py
@@ -0,0 +1,16 @@
+# model settings
+model = dict(
+    type='PEM',
+    pem_feat_dim=32,
+    pem_hidden_dim=256,
+    pem_u_ratio_m=1,
+    pem_u_ratio_l=2,
+    pem_high_temporal_iou_threshold=0.6,
+    pem_low_temporal_iou_threshold=2.2,
+    soft_nms_alpha=0.75,
+    soft_nms_low_threshold=0.65,
+    soft_nms_high_threshold=0.9,
+    post_process_top_k=100)
+# model training and testing settings
+train_cfg = None
+test_cfg = dict(average_clips='score')
diff --git a/configs/_base_/models/bsn_tem.py b/configs/_base_/models/bsn_tem.py
@@ -0,0 +1,11 @@
+# model settings
+model = dict(
+    type='TEM',
+    temporal_dim=100,
+    boundary_ratio=0.1,
+    tem_feat_dim=400,
+    tem_hidden_dim=512,
+    tem_match_threshold=0.5)
+# model training and testing settings
+train_cfg = None
+test_cfg = dict(average_clips='score')
diff --git a/configs/_base_/models/c3d_sports1m_pretrained.py b/configs/_base_/models/c3d_sports1m_pretrained.py
@@ -0,0 +1,24 @@
+# model settings
+model = dict(
+    type='Recognizer3D',
+    backbone=dict(
+        type='C3D',
+        pretrained=  # noqa: E251
+        'https://download.openmmlab.com/mmaction/recognition/c3d/c3d_sports1m_pretrain_20201016-dcc47ddc.pth',  # noqa: E501
+        style='pytorch',
+        conv_cfg=dict(type='Conv3d'),
+        norm_cfg=None,
+        act_cfg=dict(type='ReLU'),
+        dropout_ratio=0.5,
+        init_std=0.005),
+    cls_head=dict(
+        type='I3DHead',
+        num_classes=101,
+        in_channels=4096,
+        spatial_type=None,
+        dropout_ratio=0.5,
+        init_std=0.01))
+
+# model training and testing settings
+train_cfg = None
+test_cfg = dict(average_clips='score')
diff --git a/configs/_base_/models/csn_ig65m_pretrained.py b/configs/_base_/models/csn_ig65m_pretrained.py
@@ -0,0 +1,23 @@
+# model settings
+model = dict(
+    type='Recognizer3D',
+    backbone=dict(
+        type='ResNet3dCSN',
+        pretrained2d=False,
+        pretrained=  # noqa: E251
+        'https://download.openmmlab.com/mmaction/recognition/csn/ircsn_from_scratch_r152_ig65m_20200807-771c4135.pth',  # noqa: E501
+        depth=152,
+        with_pool2=False,
+        bottleneck_mode='ir',
+        norm_eval=False,
+        zero_init_residual=False),
+    cls_head=dict(
+        type='I3DHead',
+        num_classes=400,
+        in_channels=2048,
+        spatial_type='avg',
+        dropout_ratio=0.5,
+        init_std=0.01))
+# model training and testing settings
+train_cfg = None
+test_cfg = dict(average_clips='prob')
diff --git a/configs/_base_/models/i3d_r50.py b/configs/_base_/models/i3d_r50.py
@@ -0,0 +1,22 @@
+# model settings
+model = dict(
+    type='Recognizer3D',
+    backbone=dict(
+        type='ResNet3d',
+        pretrained2d=True,
+        pretrained='torchvision://resnet50',
+        depth=50,
+        conv_cfg=dict(type='Conv3d'),
+        norm_eval=False,
+        inflate=((1, 1, 1), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0), (0, 1, 0)),
+        zero_init_residual=False),
+    cls_head=dict(
+        type='I3DHead',
+        num_classes=400,
+        in_channels=2048,
+        spatial_type='avg',
+        dropout_ratio=0.5,
+        init_std=0.01))
+# model training and testing settings
+train_cfg = None
+test_cfg = dict(average_clips='prob')
diff --git a/configs/_base_/models/r2plus1d_r34.py b/configs/_base_/models/r2plus1d_r34.py
@@ -0,0 +1,28 @@
+# model settings
+model = dict(
+    type='Recognizer3D',
+    backbone=dict(
+        type='ResNet2Plus1d',
+        depth=34,
+        pretrained=None,
+        pretrained2d=False,
+        norm_eval=False,
+        conv_cfg=dict(type='Conv2plus1d'),
+        norm_cfg=dict(type='SyncBN', requires_grad=True, eps=1e-3),
+        conv1_kernel=(3, 7, 7),
+        conv1_stride_t=1,
+        pool1_stride_t=1,
+        inflate=(1, 1, 1, 1),
+        spatial_strides=(1, 2, 2, 2),
+        temporal_strides=(1, 2, 2, 2),
+        zero_init_residual=False),
+    cls_head=dict(
+        type='I3DHead',
+        num_classes=400,
+        in_channels=512,
+        spatial_type='avg',
+        dropout_ratio=0.5,
+        init_std=0.01))
+# model training and testing settings
+train_cfg = None
+test_cfg = dict(average_clips='prob')
diff --git a/configs/_base_/models/slowfast_r50.py b/configs/_base_/models/slowfast_r50.py
@@ -0,0 +1,38 @@
+# model settings
+model = dict(
+    type='Recognizer3D',
+    backbone=dict(
+        type='ResNet3dSlowFast',
+        pretrained=None,
+        resample_rate=8,  # tau
+        speed_ratio=8,  # alpha
+        channel_ratio=8,  # beta_inv
+        slow_pathway=dict(
+            type='resnet3d',
+            depth=50,
+            pretrained=None,
+            lateral=True,
+            conv1_kernel=(1, 7, 7),
+            dilations=(1, 1, 1, 1),
+            conv1_stride_t=1,
+            pool1_stride_t=1,
+            inflate=(0, 0, 1, 1),
+            norm_eval=False),
+        fast_pathway=dict(
+            type='resnet3d',
+            depth=50,
+            pretrained=None,
+            lateral=False,
+            base_channels=8,
+            conv1_kernel=(5, 7, 7),
+            conv1_stride_t=1,
+            pool1_stride_t=1,
+            norm_eval=False)),
+    cls_head=dict(
+        type='SlowFastHead',
+        in_channels=2304,  # 2048+256
+        num_classes=400,
+        spatial_type='avg',
+        dropout_ratio=0.5))
+train_cfg = None
+test_cfg = dict(average_clips='prob')
diff --git a/configs/_base_/models/slowonly_r50.py b/configs/_base_/models/slowonly_r50.py
@@ -0,0 +1,21 @@
+# model settings
+model = dict(
+    type='Recognizer3D',
+    backbone=dict(
+        type='ResNet3dSlowOnly',
+        depth=50,
+        pretrained='torchvision://resnet50',
+        lateral=False,
+        conv1_kernel=(1, 7, 7),
+        conv1_stride_t=1,
+        pool1_stride_t=1,
+        inflate=(0, 0, 1, 1),
+        norm_eval=False),
+    cls_head=dict(
+        type='I3DHead',
+        in_channels=2048,
+        num_classes=400,
+        spatial_type='avg',
+        dropout_ratio=0.5))
+train_cfg = None
+test_cfg = dict(average_clips='prob')
diff --git a/configs/_base_/models/tin_r50.py b/configs/_base_/models/tin_r50.py
@@ -0,0 +1,21 @@
+# model settings
+model = dict(
+    type='Recognizer2D',
+    backbone=dict(
+        type='ResNetTIN',
+        pretrained='torchvision://resnet50',
+        depth=50,
+        norm_eval=False,
+        shift_div=4),
+    cls_head=dict(
+        type='TSMHead',
+        num_classes=400,
+        in_channels=2048,
+        spatial_type='avg',
+        consensus=dict(type='AvgConsensus', dim=1),
+        dropout_ratio=0.5,
+        init_std=0.001,
+        is_shift=False))
+# model training and testing settings
+train_cfg = None
+test_cfg = dict(average_clips=None)
diff --git a/configs/_base_/models/tpn_slowonly_r50.py b/configs/_base_/models/tpn_slowonly_r50.py
@@ -0,0 +1,39 @@
+# model settings
+model = dict(
+    type='Recognizer3D',
+    backbone=dict(
+        type='ResNet3dSlowOnly',
+        depth=50,
+        pretrained='torchvision://resnet50',
+        lateral=False,
+        out_indices=(2, 3),
+        conv1_kernel=(1, 7, 7),
+        conv1_stride_t=1,
+        pool1_stride_t=1,
+        inflate=(0, 0, 1, 1),
+        norm_eval=False),
+    neck=dict(
+        type='TPN',
+        in_channels=(1024, 2048),
+        out_channels=1024,
+        spatial_modulation_cfg=dict(
+            in_channels=(1024, 2048), out_channels=2048),
+        temporal_modulation_cfg=dict(downsample_scales=(8, 8)),
+        upsample_cfg=dict(scale_factor=(1, 1, 1)),
+        downsample_cfg=dict(downsample_scale=(1, 1, 1)),
+        level_fusion_cfg=dict(
+            in_channels=(1024, 1024),
+            mid_channels=(1024, 1024),
+            out_channels=2048,
+            downsample_scales=((1, 1, 1), (1, 1, 1))),
+        aux_head_cfg=dict(out_channels=400, loss_weight=0.5)),
+    cls_head=dict(
+        type='TPNHead',
+        num_classes=400,
+        in_channels=2048,
+        spatial_type='avg',
+        consensus=dict(type='AvgConsensus', dim=1),
+        dropout_ratio=0.5,
+        init_std=0.01))
+train_cfg = None
+test_cfg = dict(average_clips='prob')
diff --git a/configs/_base_/models/tpn_tsm_r50.py b/configs/_base_/models/tpn_tsm_r50.py
@@ -0,0 +1,35 @@
+# model settings
+model = dict(
+    type='Recognizer2D',
+    backbone=dict(
+        type='ResNetTSM',
+        pretrained='torchvision://resnet50',
+        depth=50,
+        out_indices=(2, 3),
+        norm_eval=False,
+        shift_div=8),
+    neck=dict(
+        type='TPN',
+        in_channels=(1024, 2048),
+        out_channels=1024,
+        spatial_modulation_cfg=dict(
+            in_channels=(1024, 2048), out_channels=2048),
+        temporal_modulation_cfg=dict(downsample_scales=(8, 8)),
+        upsample_cfg=dict(scale_factor=(1, 1, 1)),
+        downsample_cfg=dict(downsample_scale=(1, 1, 1)),
+        level_fusion_cfg=dict(
+            in_channels=(1024, 1024),
+            mid_channels=(1024, 1024),
+            out_channels=2048,
+            downsample_scales=((1, 1, 1), (1, 1, 1))),
+        aux_head_cfg=dict(out_channels=174, loss_weight=0.5)),
+    cls_head=dict(
+        type='TPNHead',
+        num_classes=174,
+        in_channels=2048,
+        spatial_type='avg',
+        consensus=dict(type='AvgConsensus', dim=1),
+        dropout_ratio=0.5,
+        init_std=0.01))
+train_cfg = None
+test_cfg = dict(average_clips=None)
diff --git a/configs/_base_/models/tsm_r50.py b/configs/_base_/models/tsm_r50.py
@@ -0,0 +1,21 @@
+# model settings
+model = dict(
+    type='Recognizer2D',
+    backbone=dict(
+        type='ResNetTSM',
+        pretrained='torchvision://resnet50',
+        depth=50,
+        norm_eval=False,
+        shift_div=8),
+    cls_head=dict(
+        type='TSMHead',
+        num_classes=400,
+        in_channels=2048,
+        spatial_type='avg',
+        consensus=dict(type='AvgConsensus', dim=1),
+        dropout_ratio=0.5,
+        init_std=0.001,
+        is_shift=True))
+# model training and testing settings
+train_cfg = None
+test_cfg = dict(average_clips='prob')
diff --git a/configs/_base_/models/tsn_r50.py b/configs/_base_/models/tsn_r50.py
@@ -0,0 +1,19 @@
+# model settings
+model = dict(
+    type='Recognizer2D',
+    backbone=dict(
+        type='ResNet',
+        pretrained='torchvision://resnet50',
+        depth=50,
+        norm_eval=False),
+    cls_head=dict(
+        type='TSNHead',
+        num_classes=400,
+        in_channels=2048,
+        spatial_type='avg',
+        consensus=dict(type='AvgConsensus', dim=1),
+        dropout_ratio=0.4,
+        init_std=0.01))
+# model training and testing settings
+train_cfg = None
+test_cfg = dict(average_clips=None)
diff --git a/configs/_base_/models/tsn_r50_audio.py b/configs/_base_/models/tsn_r50_audio.py
@@ -0,0 +1,13 @@
+# model settings
+model = dict(
+    type='AudioRecognizer',
+    backbone=dict(type='ResNet', depth=50, in_channels=1, norm_eval=False),
+    cls_head=dict(
+        type='AudioTSNHead',
+        num_classes=400,
+        in_channels=2048,
+        dropout_ratio=0.5,
+        init_std=0.01))
+# model training and testing settings
+train_cfg = None
+test_cfg = dict(average_clips='prob')