[Fix] MotionBERT training and flip-test #2548

Merged · 6 commits · Jul 20, 2023
configs/body_3d_keypoint/pose_lift/h36m/motionbert_h36m.md (12 changes: 7 additions & 5 deletions)
@@ -40,14 +40,16 @@ Testing results on Human3.6M dataset with ground truth 2D detections

| Arch | MPJPE | average MPJPE | P-MPJPE | ckpt |
| :-------------------------------------------------------------------------------------- | :---: | :-----------: | :-----: | :--------------------------------------------------------------------------------------: |
-| [MotionBERT\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py) | 35.3 | 35.3 | 27.7 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_h36m-f554954f_20230531.pth) |
-| [MotionBERT-finetuned\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py) | 27.5 | 27.4 | 21.6 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_ft_h36m-d80af323_20230531.pth) |
+| [MotionBERT\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py) | 34.5 | 34.6 | 27.1 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_h36m-f554954f_20230531.pth) |
+| [MotionBERT-finetuned\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py) | 26.9 | 26.8 | 21.0 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_ft_h36m-d80af323_20230531.pth) |

-Testing results on Human3.6M dataset from the [official repo](https://github.com/Walter0807/MotionBERT) with ground truth 2D detections
+Testing results on Human3.6M dataset converted from the [official repo](https://github.com/Walter0807/MotionBERT)<sup>1</sup> with ground truth 2D detections

| Arch | MPJPE | average MPJPE | P-MPJPE | ckpt |
| :-------------------------------------------------------------------------------------- | :---: | :-----------: | :-----: | :--------------------------------------------------------------------------------------: |
-| [MotionBERT\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py) | 40.5 | 39.9 | 34.1 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_h36m-f554954f_20230531.pth) |
-| [MotionBERT-finetuned\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m.py) | 38.2 | 37.7 | 32.6 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_ft_h36m-d80af323_20230531.pth) |
+| [MotionBERT\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m-original.py) | 39.8 | 39.2 | 33.4 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_h36m-f554954f_20230531.pth) |
+| [MotionBERT-finetuned\*](/configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m-original.py) | 37.7 | 37.2 | 32.2 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_ft_h36m-d80af323_20230531.pth) |

+<sup>1</sup> To test with the dataset from the official repo, please download the [test annotation file](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/h36m_test_original.npz), [train annotation file](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/h36m_train_original.npz) and [factors](https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/h36m_factors.npy), and place them under `$MMPOSE/data/h36m/annotation_body3d/fps50`.

*Models with * are converted from the [official repo](https://github.com/Walter0807/MotionBERT). The config files of these models are only for validation. We do not guarantee their training accuracy, and welcome you to contribute reproduction results.*
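For reference, the three files from footnote 1 can be fetched with a few lines of plain Python. A minimal sketch, assuming `$MMPOSE` is the current working directory (the URLs are the ones given above):

# Minimal sketch: download the converted H36M annotation files from
# footnote 1 into the standard MMPose data layout.
import urllib.request
from pathlib import Path

dest = Path('data/h36m/annotation_body3d/fps50')
dest.mkdir(parents=True, exist_ok=True)
base = ('https://download.openmmlab.com/mmpose/v1/'
        'body_3d_keypoint/pose_lift/h36m/')
for name in ('h36m_test_original.npz', 'h36m_train_original.npz',
             'h36m_factors.npy'):
    urllib.request.urlretrieve(base + name, str(dest / name))
    print(f'downloaded {name} -> {dest / name}')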
configs/body_3d_keypoint/pose_lift/h36m/motionbert_h36m.yml (10 changes: 5 additions & 5 deletions)
@@ -15,11 +15,11 @@ Models:
  Results:
  - Dataset: Human3.6M
    Metrics:
-      MPJPE: 35.3
-      P-MPJPE: 27.7
+      MPJPE: 34.5
+      P-MPJPE: 27.1
    Task: Body 3D Keypoint
    Weights: https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_h36m-f554954f_20230531.pth
-- Config: configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert_8xb32-120e_h36m.py
+- Config: configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-ft_8xb32-120e_h36m.py
  In Collection: MotionBERT
  Metadata:
    Architecture: *id001
@@ -28,7 +28,7 @@ Models:
  Results:
  - Dataset: Human3.6M
    Metrics:
-      MPJPE: 27.5
-      P-MPJPE: 21.6
+      MPJPE: 26.9
+      P-MPJPE: 21.0
    Task: Body 3D Keypoint
    Weights: https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/pose_lift/h36m/motionbert_ft_h36m-d80af323_20230531.pth
New file (137 lines; likely configs/body_3d_keypoint/pose_lift/h36m/pose-lift_motionbert-243frm_8xb32-120e_h36m-original.py, the config referenced in the second table above; the filename is not shown in this view):
@@ -0,0 +1,137 @@
_base_ = ['../../../_base_/default_runtime.py']

vis_backends = [
    dict(type='LocalVisBackend'),
]
visualizer = dict(
    type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer')

# runtime
train_cfg = dict(max_epochs=120, val_interval=10)

# optimizer
optim_wrapper = dict(
    optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.01))

# learning policy
param_scheduler = [
    dict(type='ExponentialLR', gamma=0.99, end=120, by_epoch=True)
]

auto_scale_lr = dict(base_batch_size=512)

# hooks
default_hooks = dict(
    checkpoint=dict(
        type='CheckpointHook',
        save_best='MPJPE',
        rule='less',
        max_keep_ckpts=1),
    logger=dict(type='LoggerHook', interval=20),
)

# codec settings
train_codec = dict(
    type='MotionBERTLabel', num_keypoints=17, concat_vis=True, mode='train')
val_codec = dict(
    type='MotionBERTLabel', num_keypoints=17, concat_vis=True, rootrel=True)

# model settings
model = dict(
    type='PoseLifter',
    backbone=dict(
        type='DSTFormer',
        in_channels=3,
        feat_size=512,
        depth=5,
        num_heads=8,
        mlp_ratio=2,
        seq_len=243,
        att_fuse=True,
    ),
    head=dict(
        type='MotionRegressionHead',
        in_channels=512,
        out_channels=3,
        embedding_size=512,
        loss=dict(type='MPJPEVelocityJointLoss'),
        decoder=val_codec,
    ),
    test_cfg=dict(flip_test=True))

# base dataset settings
dataset_type = 'Human36mDataset'
data_root = 'data/h36m/'

# pipelines
train_pipeline = [
    dict(type='GenerateTarget', encoder=train_codec),
    dict(
        type='RandomFlipAroundRoot',
        keypoints_flip_cfg=dict(center_mode='static', center_x=0.),
        target_flip_cfg=dict(center_mode='static', center_x=0.),
        flip_label=True),
    dict(
        type='PackPoseInputs',
        meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices',
                   'factor', 'camera_param'))
]
val_pipeline = [
    dict(type='GenerateTarget', encoder=val_codec),
    dict(
        type='PackPoseInputs',
        meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices',
                   'factor', 'camera_param'))
]

# data loaders
train_dataloader = dict(
    batch_size=32,
    prefetch_factor=4,
    pin_memory=True,
    num_workers=2,
    persistent_workers=True,
    sampler=dict(type='DefaultSampler', shuffle=True),
    dataset=dict(
        type=dataset_type,
        ann_file='annotation_body3d/fps50/h36m_train_original.npz',
        seq_len=1,
        multiple_target=243,
        multiple_target_step=81,
        camera_param_file='annotation_body3d/cameras.pkl',
        data_root=data_root,
        data_prefix=dict(img='images/'),
        pipeline=train_pipeline,
    ))

val_dataloader = dict(
    batch_size=32,
    prefetch_factor=4,
    pin_memory=True,
    num_workers=2,
    persistent_workers=True,
    sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
    dataset=dict(
        type=dataset_type,
        ann_file='annotation_body3d/fps50/h36m_test_original.npz',
        factor_file='annotation_body3d/fps50/h36m_factors.npy',
        seq_len=1,
        seq_step=1,
        multiple_target=243,
        camera_param_file='annotation_body3d/cameras.pkl',
        data_root=data_root,
        data_prefix=dict(img='images/'),
        pipeline=val_pipeline,
        test_mode=True,
    ))
test_dataloader = val_dataloader

# evaluators
skip_list = [
    'S9_Greet', 'S9_SittingDown', 'S9_Wait_1', 'S9_Greeting', 'S9_Waiting_1'
]
val_evaluator = [
    dict(type='MPJPE', mode='mpjpe', skip_list=skip_list),
    dict(type='MPJPE', mode='p-mpjpe', skip_list=skip_list)
]
test_evaluator = val_evaluator
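One detail of the dataloaders above: `multiple_target=243` makes each training sample a 243-frame clip, and `multiple_target_step=81` strides the clip start positions by 81 frames, so consecutive clips overlap. A hypothetical sketch of that windowing idea (illustration only, not MMPose's actual sampling code):

def make_clips(num_frames: int, clip_len: int = 243, step: int = 81):
    """Return (start, end) frame ranges for overlapping training clips."""
    return [(s, s + clip_len)
            for s in range(0, num_frames - clip_len + 1, step)]

# A 500-frame sequence yields four overlapping 243-frame clips:
# [(0, 243), (81, 324), (162, 405), (243, 486)]
print(make_clips(500))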
(An existing MotionBERT config is modified below; the filename is not shown in this view.)
@@ -32,11 +32,7 @@

# codec settings
train_codec = dict(
-    type='MotionBERTLabel',
-    num_keypoints=17,
-    concat_vis=True,
-    rootrel=True,
-    factor_label=False)
+    type='MotionBERTLabel', num_keypoints=17, concat_vis=True, mode='train')
val_codec = dict(
    type='MotionBERTLabel', num_keypoints=17, concat_vis=True, rootrel=True)

@@ -61,20 +57,20 @@
        loss=dict(type='MPJPEVelocityJointLoss'),
        decoder=val_codec,
    ),
-)
+    test_cfg=dict(flip_test=True))

# base dataset settings
dataset_type = 'Human36mDataset'
data_root = 'data/h36m/'

# pipelines
train_pipeline = [
+    dict(type='GenerateTarget', encoder=train_codec),
    dict(
        type='RandomFlipAroundRoot',
-        keypoints_flip_cfg={},
-        target_flip_cfg={},
-        flip_image=True),
-    dict(type='GenerateTarget', encoder=train_codec),
+        keypoints_flip_cfg=dict(center_mode='static', center_x=0.),
+        target_flip_cfg=dict(center_mode='static', center_x=0.),
+        flip_label=True),
    dict(
        type='PackPoseInputs',
        meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices',
(remaining lines unchanged)
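The substance of the fix is visible in this hunk: `GenerateTarget` now runs before the flip, and `RandomFlipAroundRoot` mirrors the keypoints (and, with `flip_label=True`, the lifted 3D targets) about a static axis at x = 0 instead of flipping the input image. Conceptually the transform reduces to the sketch below; the function name and array layout are assumptions for illustration, not the MMPose API:

import numpy as np

def flip_around_static_center(keypoints: np.ndarray,
                              flip_indices: list,
                              center_x: float = 0.) -> np.ndarray:
    """Sketch of a horizontal flip about a fixed axis.

    keypoints: (..., K, C) array whose first channel is the x coordinate.
    flip_indices: for each joint, the index of its left/right counterpart
        (carried through the pipeline via the 'flip_indices' meta key).
    """
    flipped = keypoints.copy()
    # Mirror x about the static center (center_mode='static', center_x=0.).
    flipped[..., 0] = 2 * center_x - flipped[..., 0]
    # Swap left/right joints so the labels stay anatomically consistent.
    return flipped[..., flip_indices, :]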
New file (142 lines; likely the finetuned counterpart of the previous new config, since it initializes from a pretrained MotionBERT checkpoint; the filename is not shown in this view):
@@ -0,0 +1,142 @@
_base_ = ['../../../_base_/default_runtime.py']

vis_backends = [
    dict(type='LocalVisBackend'),
]
visualizer = dict(
    type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer')

# runtime
train_cfg = dict(max_epochs=60, val_interval=10)

# optimizer
optim_wrapper = dict(
    optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.01))

# learning policy
param_scheduler = [
    dict(type='ExponentialLR', gamma=0.99, end=60, by_epoch=True)
]

auto_scale_lr = dict(base_batch_size=512)

# hooks
default_hooks = dict(
    checkpoint=dict(
        type='CheckpointHook',
        save_best='MPJPE',
        rule='less',
        max_keep_ckpts=1),
    logger=dict(type='LoggerHook', interval=20),
)

# codec settings
train_codec = dict(
    type='MotionBERTLabel', num_keypoints=17, concat_vis=True, mode='train')
val_codec = dict(
    type='MotionBERTLabel', num_keypoints=17, concat_vis=True, rootrel=True)

# model settings
model = dict(
    type='PoseLifter',
    backbone=dict(
        type='DSTFormer',
        in_channels=3,
        feat_size=512,
        depth=5,
        num_heads=8,
        mlp_ratio=2,
        seq_len=243,
        att_fuse=True,
    ),
    head=dict(
        type='MotionRegressionHead',
        in_channels=512,
        out_channels=3,
        embedding_size=512,
        loss=dict(type='MPJPEVelocityJointLoss'),
        decoder=val_codec,
    ),
    test_cfg=dict(flip_test=True),
    init_cfg=dict(
        type='Pretrained',
        checkpoint='https://download.openmmlab.com/mmpose/v1/body_3d_keypoint/'
        'pose_lift/h36m/motionbert_pretrain_h36m-29ffebf5_20230719.pth'),
)

# base dataset settings
dataset_type = 'Human36mDataset'
data_root = 'data/h36m/'

# pipelines
train_pipeline = [
    dict(type='GenerateTarget', encoder=train_codec),
    dict(
        type='RandomFlipAroundRoot',
        keypoints_flip_cfg=dict(center_mode='static', center_x=0.),
        target_flip_cfg=dict(center_mode='static', center_x=0.),
        flip_label=True),
    dict(
        type='PackPoseInputs',
        meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices',
                   'factor', 'camera_param'))
]
val_pipeline = [
    dict(type='GenerateTarget', encoder=val_codec),
    dict(
        type='PackPoseInputs',
        meta_keys=('id', 'category_id', 'target_img_path', 'flip_indices',
                   'factor', 'camera_param'))
]

# data loaders
train_dataloader = dict(
    batch_size=32,
    prefetch_factor=4,
    pin_memory=True,
    num_workers=2,
    persistent_workers=True,
    sampler=dict(type='DefaultSampler', shuffle=True),
    dataset=dict(
        type=dataset_type,
        ann_file='annotation_body3d/fps50/h36m_train_original.npz',
        seq_len=1,
        multiple_target=243,
        multiple_target_step=81,
        camera_param_file='annotation_body3d/cameras.pkl',
        data_root=data_root,
        data_prefix=dict(img='images/'),
        pipeline=train_pipeline,
    ))

val_dataloader = dict(
    batch_size=32,
    prefetch_factor=4,
    pin_memory=True,
    num_workers=2,
    persistent_workers=True,
    sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
    dataset=dict(
        type=dataset_type,
        ann_file='annotation_body3d/fps50/h36m_test_original.npz',
        factor_file='annotation_body3d/fps50/h36m_factors.npy',
        seq_len=1,
        seq_step=1,
        multiple_target=243,
        camera_param_file='annotation_body3d/cameras.pkl',
        data_root=data_root,
        data_prefix=dict(img='images/'),
        pipeline=val_pipeline,
        test_mode=True,
    ))
test_dataloader = val_dataloader

# evaluators
skip_list = [
    'S9_Greet', 'S9_SittingDown', 'S9_Wait_1', 'S9_Greeting', 'S9_Waiting_1'
]
val_evaluator = [
    dict(type='MPJPE', mode='mpjpe', skip_list=skip_list),
    dict(type='MPJPE', mode='p-mpjpe', skip_list=skip_list)
]
test_evaluator = val_evaluator
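Both new configs also enable `test_cfg=dict(flip_test=True)`, the flip-test behavior this PR fixes. At inference this amounts to averaging predictions over the original and a mirrored input. A minimal sketch of the idea, with a stand-in `model` callable rather than the actual PoseLifter interface:

import numpy as np

def flip_test_predict(model, inputs: np.ndarray, flip_indices: list):
    """Sketch of test-time flip augmentation (illustration only)."""
    def flip(x):
        out = x.copy()
        out[..., 0] = -out[..., 0]        # mirror x about the static axis
        return out[..., flip_indices, :]  # swap left/right joints
    pred = model(inputs)                  # prediction on original input
    pred_flip = model(flip(inputs))       # prediction on mirrored input
    # Undo the mirroring on the second prediction, then average the two.
    return 0.5 * (pred + flip(pred_flip))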