diff --git a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb.py b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb.py index a6e37c330a..1dfd9f976c 100644 --- a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb.py +++ b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb.py @@ -1,11 +1,28 @@ -_base_ = ['../../_base_/default_runtime.py'] +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) # model settings num_frames = 8 model = dict( - type='Recognizer3D', + type=Recognizer3D, backbone=dict( - type='UniFormerV2', + type=UniFormerV2, input_resolution=224, patch_size=16, width=768, @@ -31,13 +48,13 @@ 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics400-rgb_20221219-203d6aac.pth', # noqa: E501 prefix='backbone.')), cls_head=dict( - type='TimeSformerHead', + type=TimeSformerHead, dropout_ratio=0.5, num_classes=339, in_channels=768, average_clips='prob'), data_preprocessor=dict( - type='ActionDataPreprocessor', + type=ActionDataPreprocessor, mean=[114.75, 114.75, 114.75], std=[57.375, 57.375, 57.375], format_shape='NCTHW')) @@ -52,53 +69,46 @@ file_client_args = dict(io_backend='disk') train_pipeline = [ - dict(type='DecordInit', **file_client_args), - dict(type='UniformSample', clip_len=num_frames, num_clips=1), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 256)), + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 256)), dict( - type='PytorchVideoWrapper', - op='RandAugment', - magnitude=7, - num_layers=4), - dict(type='RandomResizedCrop'), - dict(type='Resize', scale=(224, 224), keep_ratio=False), - dict(type='Flip', flip_ratio=0.5), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + type=PytorchVideoWrapper, op='RandAugment', magnitude=7, num_layers=4), + dict(type=RandomResizedCrop), + dict(type=Resize, scale=(224, 224), keep_ratio=False), + dict(type=Flip, flip_ratio=0.5), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] val_pipeline = [ - dict(type='DecordInit', **file_client_args), - dict( - type='UniformSample', clip_len=num_frames, num_clips=1, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 224)), - dict(type='CenterCrop', crop_size=224), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1, test_mode=True), + dict(type=DecordDecode), 
+ dict(type=Resize, scale=(-1, 224)), + dict(type=CenterCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] test_pipeline = [ - dict(type='DecordInit', **file_client_args), - dict( - type='UniformSample', clip_len=num_frames, num_clips=4, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 224)), - dict(type='ThreeCrop', crop_size=224), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] train_dataloader = dict( batch_size=8, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=True), + sampler=dict(type=DefaultSampler, shuffle=True), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_train, data_prefix=dict(video=data_root), pipeline=train_pipeline)) @@ -106,9 +116,9 @@ batch_size=8, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_val, data_prefix=dict(video=data_root_val), pipeline=val_pipeline, @@ -117,38 +127,38 @@ batch_size=8, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_test, data_prefix=dict(video=data_root_val), pipeline=test_pipeline, test_mode=True)) -val_evaluator = dict(type='AccMetric') -test_evaluator = dict(type='AccMetric') +val_evaluator = dict(type=AccMetric) +test_evaluator = dict(type=AccMetric) train_cfg = dict( - type='EpochBasedTrainLoop', max_epochs=24, val_begin=1, val_interval=1) -val_cfg = dict(type='ValLoop') -test_cfg = dict(type='TestLoop') + type=EpochBasedTrainLoop, max_epochs=24, val_begin=1, val_interval=1) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) base_lr = 2e-5 optim_wrapper = dict( optimizer=dict( - type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + type=AdamW, lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), clip_grad=dict(max_norm=20, norm_type=2)) param_scheduler = [ dict( - type='LinearLR', + type=LinearLR, start_factor=1 / 20, by_epoch=True, begin=0, end=5, convert_to_iter_based=True), dict( - type='CosineAnnealingLR', + type=CosineAnnealingLR, eta_min_ratio=1 / 20, by_epoch=True, begin=5, @@ -156,8 +166,10 @@ convert_to_iter_based=True) ] -default_hooks = dict( - checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) +default_hooks.update( + dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), + logger=dict(interval=100))) # Default setting for scaling LR automatically # - `enable` means enable scaling LR automatically diff --git a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb.py b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb.py index 4e47cabb84..5b57aacfc6 100644 --- a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb.py +++ 
b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb.py @@ -1,11 +1,28 @@ -_base_ = ['../../_base_/default_runtime.py'] +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + UniFormerHead, UniFormerV2) # model settings num_frames = 8 model = dict( - type='Recognizer3D', + type=Recognizer3D, backbone=dict( - type='UniFormerV2', + type=UniFormerV2, input_resolution=224, patch_size=16, width=768, @@ -31,7 +48,7 @@ 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501 prefix='backbone.')), cls_head=dict( - type='UniFormerHead', + type=UniFormerHead, dropout_ratio=0.5, num_classes=400, in_channels=768, @@ -44,7 +61,7 @@ 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501 prefix='cls_head.')), data_preprocessor=dict( - type='ActionDataPreprocessor', + type=ActionDataPreprocessor, mean=[114.75, 114.75, 114.75], std=[57.375, 57.375, 57.375], format_shape='NCTHW')) @@ -59,53 +76,46 @@ file_client_args = dict(io_backend='disk') train_pipeline = [ - dict(type='DecordInit', **file_client_args), - dict(type='UniformSample', clip_len=num_frames, num_clips=1), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 256)), + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 256)), dict( - type='PytorchVideoWrapper', - op='RandAugment', - magnitude=7, - num_layers=4), - dict(type='RandomResizedCrop'), - dict(type='Resize', scale=(224, 224), keep_ratio=False), - dict(type='Flip', flip_ratio=0.5), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + type=PytorchVideoWrapper, op='RandAugment', magnitude=7, num_layers=4), + dict(type=RandomResizedCrop), + dict(type=Resize, scale=(224, 224), keep_ratio=False), + dict(type=Flip, flip_ratio=0.5), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] val_pipeline = [ - dict(type='DecordInit', **file_client_args), - dict( - type='UniformSample', clip_len=num_frames, num_clips=1, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 224)), - dict(type='CenterCrop', crop_size=224), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=CenterCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] test_pipeline = [ - dict(type='DecordInit', **file_client_args), - dict( - type='UniformSample', 
clip_len=num_frames, num_clips=4, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 224)), - dict(type='ThreeCrop', crop_size=224), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] train_dataloader = dict( batch_size=8, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=True), + sampler=dict(type=DefaultSampler, shuffle=True), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_train, data_prefix=dict(video=data_root), pipeline=train_pipeline)) @@ -113,9 +123,9 @@ batch_size=8, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_val, data_prefix=dict(video=data_root_val), pipeline=val_pipeline, @@ -124,38 +134,38 @@ batch_size=8, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_test, data_prefix=dict(video=data_root_val), pipeline=test_pipeline, test_mode=True)) -val_evaluator = dict(type='AccMetric') -test_evaluator = dict(type='AccMetric') +val_evaluator = dict(type=AccMetric) +test_evaluator = dict(type=AccMetric) train_cfg = dict( - type='EpochBasedTrainLoop', max_epochs=5, val_begin=1, val_interval=1) -val_cfg = dict(type='ValLoop') -test_cfg = dict(type='TestLoop') + type=EpochBasedTrainLoop, max_epochs=5, val_begin=1, val_interval=1) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) base_lr = 2e-6 optim_wrapper = dict( optimizer=dict( - type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + type=AdamW, lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), clip_grad=dict(max_norm=20, norm_type=2)) param_scheduler = [ dict( - type='LinearLR', + type=LinearLR, start_factor=0.5, by_epoch=True, begin=0, end=1, convert_to_iter_based=True), dict( - type='CosineAnnealingLR', + type=CosineAnnealingLR, T_max=4, eta_min_ratio=0.5, by_epoch=True, @@ -164,8 +174,10 @@ convert_to_iter_based=True) ] -default_hooks = dict( - checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) +default_hooks.update( + dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), + logger=dict(interval=100))) # Default setting for scaling LR automatically # - `enable` means enable scaling LR automatically diff --git a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb.py b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb.py index a9f6f61413..4616065b4c 100644 --- a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb.py +++ b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb.py @@ -1,11 +1,28 @@ -_base_ = ['../../_base_/default_runtime.py'] +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + UniFormerHead, UniFormerV2) # model settings num_frames = 8 model = dict( - type='Recognizer3D', + type=Recognizer3D, backbone=dict( - type='UniFormerV2', + type=UniFormerV2, input_resolution=224, patch_size=16, width=768, @@ -31,7 +48,7 @@ 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501 prefix='backbone.')), cls_head=dict( - type='UniFormerHead', + type=UniFormerHead, dropout_ratio=0.5, num_classes=600, in_channels=768, @@ -44,7 +61,7 @@ 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501 prefix='cls_head.')), data_preprocessor=dict( - type='ActionDataPreprocessor', + type=ActionDataPreprocessor, mean=[114.75, 114.75, 114.75], std=[57.375, 57.375, 57.375], format_shape='NCTHW')) @@ -59,53 +76,46 @@ file_client_args = dict(io_backend='disk') train_pipeline = [ - dict(type='DecordInit', **file_client_args), - dict(type='UniformSample', clip_len=num_frames, num_clips=1), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 256)), + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 256)), dict( - type='PytorchVideoWrapper', - op='RandAugment', - magnitude=7, - num_layers=4), - dict(type='RandomResizedCrop'), - dict(type='Resize', scale=(224, 224), keep_ratio=False), - dict(type='Flip', flip_ratio=0.5), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + type=PytorchVideoWrapper, op='RandAugment', magnitude=7, num_layers=4), + dict(type=RandomResizedCrop), + dict(type=Resize, scale=(224, 224), keep_ratio=False), + dict(type=Flip, flip_ratio=0.5), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] val_pipeline = [ - dict(type='DecordInit', **file_client_args), - dict( - type='UniformSample', clip_len=num_frames, num_clips=1, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 224)), - dict(type='CenterCrop', crop_size=224), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=CenterCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] test_pipeline = [ - dict(type='DecordInit', **file_client_args), - dict( - type='UniformSample', clip_len=num_frames, num_clips=4, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 224)), - dict(type='ThreeCrop', crop_size=224), - dict(type='FormatShape', input_format='NCTHW'), - 
dict(type='PackActionInputs') + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] train_dataloader = dict( batch_size=8, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=True), + sampler=dict(type=DefaultSampler, shuffle=True), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_train, data_prefix=dict(video=data_root), pipeline=train_pipeline)) @@ -113,9 +123,9 @@ batch_size=8, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_val, data_prefix=dict(video=data_root_val), pipeline=val_pipeline, @@ -124,38 +134,38 @@ batch_size=8, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_test, data_prefix=dict(video=data_root_val), pipeline=test_pipeline, test_mode=True)) -val_evaluator = dict(type='AccMetric') -test_evaluator = dict(type='AccMetric') +val_evaluator = dict(type=AccMetric) +test_evaluator = dict(type=AccMetric) train_cfg = dict( - type='EpochBasedTrainLoop', max_epochs=5, val_begin=1, val_interval=1) -val_cfg = dict(type='ValLoop') -test_cfg = dict(type='TestLoop') + type=EpochBasedTrainLoop, max_epochs=5, val_begin=1, val_interval=1) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) base_lr = 2e-6 optim_wrapper = dict( optimizer=dict( - type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + type=AdamW, lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), clip_grad=dict(max_norm=20, norm_type=2)) param_scheduler = [ dict( - type='LinearLR', + type=LinearLR, start_factor=0.5, by_epoch=True, begin=0, end=1, convert_to_iter_based=True), dict( - type='CosineAnnealingLR', + type=CosineAnnealingLR, T_max=4, eta_min_ratio=0.5, by_epoch=True, @@ -164,8 +174,10 @@ convert_to_iter_based=True) ] -default_hooks = dict( - checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) +default_hooks.update( + dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), + logger=dict(interval=100))) # Default setting for scaling LR automatically # - `enable` means enable scaling LR automatically diff --git a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.py b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.py index 5c59ad46f4..32e1bc72e9 100644 --- a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.py +++ b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.py @@ -1,11 +1,28 @@ -_base_ = ['../../_base_/default_runtime.py'] +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + UniFormerHead, UniFormerV2) # model settings num_frames = 8 model = dict( - type='Recognizer3D', + type=Recognizer3D, backbone=dict( - type='UniFormerV2', + type=UniFormerV2, input_resolution=224, patch_size=16, width=768, @@ -31,7 +48,7 @@ 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501 prefix='backbone.')), cls_head=dict( - type='UniFormerHead', + type=UniFormerHead, dropout_ratio=0.5, num_classes=700, in_channels=768, @@ -44,7 +61,7 @@ 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501 prefix='cls_head.')), data_preprocessor=dict( - type='ActionDataPreprocessor', + type=ActionDataPreprocessor, mean=[114.75, 114.75, 114.75], std=[57.375, 57.375, 57.375], format_shape='NCTHW')) @@ -59,53 +76,46 @@ file_client_args = dict(io_backend='disk') train_pipeline = [ - dict(type='DecordInit', **file_client_args), - dict(type='UniformSample', clip_len=num_frames, num_clips=1), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 256)), + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 256)), dict( - type='PytorchVideoWrapper', - op='RandAugment', - magnitude=7, - num_layers=4), - dict(type='RandomResizedCrop'), - dict(type='Resize', scale=(224, 224), keep_ratio=False), - dict(type='Flip', flip_ratio=0.5), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + type=PytorchVideoWrapper, op='RandAugment', magnitude=7, num_layers=4), + dict(type=RandomResizedCrop), + dict(type=Resize, scale=(224, 224), keep_ratio=False), + dict(type=Flip, flip_ratio=0.5), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] val_pipeline = [ - dict(type='DecordInit', **file_client_args), - dict( - type='UniformSample', clip_len=num_frames, num_clips=1, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 224)), - dict(type='CenterCrop', crop_size=224), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=CenterCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] test_pipeline = [ - dict(type='DecordInit', **file_client_args), - dict( - type='UniformSample', clip_len=num_frames, num_clips=4, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 224)), - dict(type='ThreeCrop', crop_size=224), - dict(type='FormatShape', input_format='NCTHW'), - 
dict(type='PackActionInputs') + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] train_dataloader = dict( batch_size=8, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=True), + sampler=dict(type=DefaultSampler, shuffle=True), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_train, data_prefix=dict(video=data_root), pipeline=train_pipeline)) @@ -113,9 +123,9 @@ batch_size=8, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_val, data_prefix=dict(video=data_root_val), pipeline=val_pipeline, @@ -124,38 +134,38 @@ batch_size=8, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_test, data_prefix=dict(video=data_root_val), pipeline=test_pipeline, test_mode=True)) -val_evaluator = dict(type='AccMetric') -test_evaluator = dict(type='AccMetric') +val_evaluator = dict(type=AccMetric) +test_evaluator = dict(type=AccMetric) train_cfg = dict( - type='EpochBasedTrainLoop', max_epochs=5, val_begin=1, val_interval=1) -val_cfg = dict(type='ValLoop') -test_cfg = dict(type='TestLoop') + type=EpochBasedTrainLoop, max_epochs=5, val_begin=1, val_interval=1) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) base_lr = 2e-6 optim_wrapper = dict( optimizer=dict( - type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + type=AdamW, lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), clip_grad=dict(max_norm=20, norm_type=2)) param_scheduler = [ dict( - type='LinearLR', + type=LinearLR, start_factor=0.5, by_epoch=True, begin=0, end=1, convert_to_iter_based=True), dict( - type='CosineAnnealingLR', + type=CosineAnnealingLR, T_max=4, eta_min_ratio=0.5, by_epoch=True, @@ -164,8 +174,10 @@ convert_to_iter_based=True) ] -default_hooks = dict( - checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) +default_hooks.update( + dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), + logger=dict(interval=100))) # Default setting for scaling LR automatically # - `enable` means enable scaling LR automatically diff --git a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb.py b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb.py index 7d055c4fb4..84e6f6729f 100644 --- a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb.py +++ b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb.py @@ -1,11 +1,28 @@ -_base_ = ['../../_base_/default_runtime.py'] +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) # model settings num_frames = 8 model = dict( - type='Recognizer3D', + type=Recognizer3D, backbone=dict( - type='UniFormerV2', + type=UniFormerV2, input_resolution=224, patch_size=16, width=768, @@ -25,13 +42,13 @@ drop_path_rate=0., mlp_dropout=[0.5, 0.5, 0.5, 0.5]), cls_head=dict( - type='TimeSformerHead', + type=TimeSformerHead, dropout_ratio=0.5, num_classes=710, in_channels=768, average_clips='prob'), data_preprocessor=dict( - type='ActionDataPreprocessor', + type=ActionDataPreprocessor, mean=[114.75, 114.75, 114.75], std=[57.375, 57.375, 57.375], format_shape='NCTHW')) diff --git a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb.py b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb.py index 6e9c4f3908..6db31b373e 100644 --- a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb.py +++ b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb.py @@ -1,11 +1,28 @@ -_base_ = ['../../_base_/default_runtime.py'] +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + UniFormerHead, UniFormerV2) # model settings num_frames = 8 model = dict( - type='Recognizer3D', + type=Recognizer3D, backbone=dict( - type='UniFormerV2', + type=UniFormerV2, input_resolution=224, patch_size=16, width=768, @@ -27,13 +44,13 @@ clip_pretrained=True, pretrained='ViT-B/16'), cls_head=dict( - type='UniFormerHead', + type=UniFormerHead, dropout_ratio=0.5, num_classes=400, in_channels=768, average_clips='prob'), data_preprocessor=dict( - type='ActionDataPreprocessor', + type=ActionDataPreprocessor, mean=[114.75, 114.75, 114.75], std=[57.375, 57.375, 57.375], format_shape='NCTHW')) @@ -48,53 +65,46 @@ file_client_args = dict(io_backend='disk') train_pipeline = [ - dict(type='DecordInit', **file_client_args), - dict(type='UniformSample', clip_len=num_frames, num_clips=1), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 256)), + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 256)), dict( - type='PytorchVideoWrapper', - op='RandAugment', - magnitude=7, - num_layers=4), - dict(type='RandomResizedCrop'), - dict(type='Resize', scale=(224, 224), keep_ratio=False), - dict(type='Flip', flip_ratio=0.5), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + type=PytorchVideoWrapper, op='RandAugment', magnitude=7, num_layers=4), + dict(type=RandomResizedCrop), + dict(type=Resize, scale=(224, 224), keep_ratio=False), + dict(type=Flip, flip_ratio=0.5), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] val_pipeline = [ - dict(type='DecordInit', **file_client_args), - dict( - type='UniformSample', clip_len=num_frames, num_clips=1, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 224)), - dict(type='CenterCrop', crop_size=224), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=CenterCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] test_pipeline = [ - dict(type='DecordInit', **file_client_args), - dict( - type='UniformSample', clip_len=num_frames, num_clips=4, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 224)), - dict(type='ThreeCrop', crop_size=224), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] train_dataloader = dict( 
batch_size=8, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=True), + sampler=dict(type=DefaultSampler, shuffle=True), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_train, data_prefix=dict(video=data_root), pipeline=train_pipeline)) @@ -102,9 +112,9 @@ batch_size=8, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_val, data_prefix=dict(video=data_root_val), pipeline=val_pipeline, @@ -113,38 +123,38 @@ batch_size=8, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_test, data_prefix=dict(video=data_root_val), pipeline=test_pipeline, test_mode=True)) -val_evaluator = dict(type='AccMetric') -test_evaluator = dict(type='AccMetric') +val_evaluator = dict(type=AccMetric) +test_evaluator = dict(type=AccMetric) train_cfg = dict( - type='EpochBasedTrainLoop', max_epochs=55, val_begin=1, val_interval=1) -val_cfg = dict(type='ValLoop') -test_cfg = dict(type='TestLoop') + type=EpochBasedTrainLoop, max_epochs=55, val_begin=1, val_interval=1) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) base_lr = 1e-5 optim_wrapper = dict( optimizer=dict( - type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + type=AdamW, lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), clip_grad=dict(max_norm=20, norm_type=2)) param_scheduler = [ dict( - type='LinearLR', + type=LinearLR, start_factor=0.1, by_epoch=True, begin=0, end=5, convert_to_iter_based=True), dict( - type='CosineAnnealingLR', + type=CosineAnnealingLR, T_max=50, eta_min_ratio=0.1, by_epoch=True, @@ -153,8 +163,10 @@ convert_to_iter_based=True) ] -default_hooks = dict( - checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) +default_hooks.update( + dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), + logger=dict(interval=100))) # Default setting for scaling LR automatically # - `enable` means enable scaling LR automatically diff --git a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb.py b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb.py index 4a5b41d8c7..6b8cb00c13 100644 --- a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb.py +++ b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb.py @@ -1,11 +1,28 @@ -_base_ = ['../../_base_/default_runtime.py'] +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + UniFormerHead, UniFormerV2) # model settings num_frames = 8 model = dict( - type='Recognizer3D', + type=Recognizer3D, backbone=dict( - type='UniFormerV2', + type=UniFormerV2, input_resolution=224, patch_size=16, width=768, @@ -27,13 +44,13 @@ clip_pretrained=True, pretrained='ViT-B/16'), cls_head=dict( - type='UniFormerHead', + type=UniFormerHead, dropout_ratio=0.5, num_classes=700, in_channels=768, average_clips='prob'), data_preprocessor=dict( - type='ActionDataPreprocessor', + type=ActionDataPreprocessor, mean=[114.75, 114.75, 114.75], std=[57.375, 57.375, 57.375], format_shape='NCTHW')) @@ -48,53 +65,46 @@ file_client_args = dict(io_backend='disk') train_pipeline = [ - dict(type='DecordInit', **file_client_args), - dict(type='UniformSample', clip_len=num_frames, num_clips=1), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 256)), + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 256)), dict( - type='PytorchVideoWrapper', - op='RandAugment', - magnitude=7, - num_layers=4), - dict(type='RandomResizedCrop'), - dict(type='Resize', scale=(224, 224), keep_ratio=False), - dict(type='Flip', flip_ratio=0.5), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + type=PytorchVideoWrapper, op='RandAugment', magnitude=7, num_layers=4), + dict(type=RandomResizedCrop), + dict(type=Resize, scale=(224, 224), keep_ratio=False), + dict(type=Flip, flip_ratio=0.5), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] val_pipeline = [ - dict(type='DecordInit', **file_client_args), - dict( - type='UniformSample', clip_len=num_frames, num_clips=1, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 224)), - dict(type='CenterCrop', crop_size=224), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=CenterCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] test_pipeline = [ - dict(type='DecordInit', **file_client_args), - dict( - type='UniformSample', clip_len=num_frames, num_clips=4, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 224)), - dict(type='ThreeCrop', crop_size=224), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] train_dataloader = dict( 
batch_size=8, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=True), + sampler=dict(type=DefaultSampler, shuffle=True), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_train, data_prefix=dict(video=data_root), pipeline=train_pipeline)) @@ -102,9 +112,9 @@ batch_size=8, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_val, data_prefix=dict(video=data_root_val), pipeline=val_pipeline, @@ -113,38 +123,38 @@ batch_size=8, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_test, data_prefix=dict(video=data_root_val), pipeline=test_pipeline, test_mode=True)) -val_evaluator = dict(type='AccMetric') -test_evaluator = dict(type='AccMetric') +val_evaluator = dict(type=AccMetric) +test_evaluator = dict(type=AccMetric) train_cfg = dict( - type='EpochBasedTrainLoop', max_epochs=55, val_begin=1, val_interval=1) -val_cfg = dict(type='ValLoop') -test_cfg = dict(type='TestLoop') + type=EpochBasedTrainLoop, max_epochs=55, val_begin=1, val_interval=1) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) base_lr = 1e-5 optim_wrapper = dict( optimizer=dict( - type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + type=AdamW, lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), clip_grad=dict(max_norm=20, norm_type=2)) param_scheduler = [ dict( - type='LinearLR', + type=LinearLR, start_factor=0.1, by_epoch=True, begin=0, end=5, convert_to_iter_based=True), dict( - type='CosineAnnealingLR', + type=CosineAnnealingLR, T_max=50, eta_min_ratio=0.1, by_epoch=True, @@ -153,8 +163,10 @@ convert_to_iter_based=True) ] -default_hooks = dict( - checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) +default_hooks.update( + dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), + logger=dict(interval=100))) # Default setting for scaling LR automatically # - `enable` means enable scaling LR automatically diff --git a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_u8_kinetics710-rgb.py b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_u8_kinetics710-rgb.py index 72dada4766..72527a79be 100644 --- a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_u8_kinetics710-rgb.py +++ b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_u8_kinetics710-rgb.py @@ -1,11 +1,28 @@ -_base_ = ['../../_base_/default_runtime.py'] +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import ConcatDataset, DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, ConcatDataset, DecordDecode, + DecordInit, Flip, FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) # model settings num_frames = 8 model = dict( - type='Recognizer3D', + type=Recognizer3D, backbone=dict( - type='UniFormerV2', + type=UniFormerV2, input_resolution=224, patch_size=16, width=768, @@ -27,57 +44,50 @@ clip_pretrained=True, pretrained='ViT-B/16'), cls_head=dict( - type='TimeSformerHead', + type=TimeSformerHead, dropout_ratio=0.5, num_classes=710, in_channels=768, average_clips='prob'), data_preprocessor=dict( - type='ActionDataPreprocessor', + type=ActionDataPreprocessor, mean=[114.75, 114.75, 114.75], std=[57.375, 57.375, 57.375], format_shape='NCTHW')) file_client_args = dict(io_backend='disk') train_pipeline = [ - dict(type='DecordInit', **file_client_args), - dict(type='UniformSample', clip_len=num_frames, num_clips=1), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 256)), + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 256)), dict( - type='PytorchVideoWrapper', - op='RandAugment', - magnitude=7, - num_layers=4), - dict(type='RandomResizedCrop'), - dict(type='Resize', scale=(224, 224), keep_ratio=False), - dict(type='Flip', flip_ratio=0.5), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + type=PytorchVideoWrapper, op='RandAugment', magnitude=7, num_layers=4), + dict(type=RandomResizedCrop), + dict(type=Resize, scale=(224, 224), keep_ratio=False), + dict(type=Flip, flip_ratio=0.5), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] val_pipeline = [ - dict(type='DecordInit', **file_client_args), - dict( - type='UniformSample', clip_len=num_frames, num_clips=1, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 224)), - dict(type='CenterCrop', crop_size=224), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=CenterCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] test_pipeline = [ - dict(type='DecordInit', **file_client_args), - dict( - type='UniformSample', clip_len=num_frames, num_clips=4, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 224)), - dict(type='ThreeCrop', crop_size=224), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + 
dict(type=PackActionInputs) ] # dataset settings @@ -97,35 +107,35 @@ k700_ann_file_val = 'data/kinetics710/k700_val_list_videos.txt' k400_trainset = dict( - type='VideoDataset', + type=VideoDataset, ann_file=k400_ann_file_train, data_prefix=dict(video=k400_data_root), pipeline=train_pipeline) k600_trainset = dict( - type='VideoDataset', + type=VideoDataset, ann_file=k600_ann_file_train, data_prefix=dict(video=k600_data_root), pipeline=train_pipeline) k700_trainset = dict( - type='VideoDataset', + type=VideoDataset, ann_file=k700_ann_file_train, data_prefix=dict(video=k700_data_root), pipeline=train_pipeline) k400_valset = dict( - type='VideoDataset', + type=VideoDataset, ann_file=k400_ann_file_val, data_prefix=dict(video=k400_data_root_val), pipeline=val_pipeline, test_mode=True) k600_valset = dict( - type='VideoDataset', + type=VideoDataset, ann_file=k600_ann_file_val, data_prefix=dict(video=k600_data_root_val), pipeline=val_pipeline, test_mode=True) k700_valset = dict( - type='VideoDataset', + type=VideoDataset, ann_file=k700_ann_file_val, data_prefix=dict(video=k700_data_root_val), pipeline=val_pipeline, @@ -139,12 +149,11 @@ k700_testset['pipeline'] = test_pipeline k710_trainset = dict( - type='ConcatDataset', - datasets=[k400_trainset, k600_trainset, k700_trainset]) + type=ConcatDataset, datasets=[k400_trainset, k600_trainset, k700_trainset]) k710_valset = dict( - type='ConcatDataset', datasets=[k400_valset, k600_valset, k700_valset]) + type=ConcatDataset, datasets=[k400_valset, k600_valset, k700_valset]) k710_testset = dict( - type='ConcatDataset', + type=ConcatDataset, datasets=[k400_testset, k600_testset, k700_testset], ) @@ -152,45 +161,45 @@ batch_size=8, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=True), + sampler=dict(type=DefaultSampler, shuffle=True), dataset=k710_trainset) val_dataloader = dict( batch_size=8, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=k710_valset) test_dataloader = dict( batch_size=8, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=k710_testset) -val_evaluator = dict(type='AccMetric') -test_evaluator = dict(type='AccMetric') +val_evaluator = dict(type=AccMetric) +test_evaluator = dict(type=AccMetric) train_cfg = dict( - type='EpochBasedTrainLoop', max_epochs=55, val_begin=1, val_interval=1) -val_cfg = dict(type='ValLoop') -test_cfg = dict(type='TestLoop') + type=EpochBasedTrainLoop, max_epochs=55, val_begin=1, val_interval=1) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) base_lr = 1e-5 optim_wrapper = dict( optimizer=dict( - type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + type=AdamW, lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), clip_grad=dict(max_norm=20, norm_type=2)) param_scheduler = [ dict( - type='LinearLR', + type=LinearLR, start_factor=0.5, by_epoch=True, begin=0, end=5, convert_to_iter_based=True), dict( - type='CosineAnnealingLR', + type=CosineAnnealingLR, T_max=50, eta_min_ratio=0.5, by_epoch=True, @@ -199,8 +208,10 @@ convert_to_iter_based=True) ] -default_hooks = dict( - checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) +default_hooks.update( + dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), + logger=dict(interval=100))) # Default setting for scaling LR 
automatically # - `enable` means enable scaling LR automatically diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics400-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics400-rgb.py index 5f21a078f8..8c53a18dee 100644 --- a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics400-rgb.py +++ b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics400-rgb.py @@ -1,11 +1,28 @@ -_base_ = ['../../_base_/default_runtime.py'] +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) # model settings num_frames = 16 model = dict( - type='Recognizer3D', + type=Recognizer3D, backbone=dict( - type='UniFormerV2', + type=UniFormerV2, input_resolution=224, patch_size=14, width=1024, @@ -25,13 +42,13 @@ drop_path_rate=0., mlp_dropout=[0.5, 0.5, 0.5, 0.5]), cls_head=dict( - type='TimeSformerHead', + type=TimeSformerHead, dropout_ratio=0.5, num_classes=400, in_channels=1024, average_clips='prob'), data_preprocessor=dict( - type='ActionDataPreprocessor', + type=ActionDataPreprocessor, mean=[114.75, 114.75, 114.75], std=[57.375, 57.375, 57.375], format_shape='NCTHW')) @@ -42,29 +59,27 @@ ann_file_test = 'data/k400/val.csv' test_pipeline = [ - dict(type='DecordInit'), - dict( - type='UniformSample', clip_len=num_frames, num_clips=4, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 224)), - dict(type='ThreeCrop', crop_size=224), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] test_dataloader = dict( batch_size=16, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_test, data_prefix=dict(video=data_root_val), pipeline=test_pipeline, test_mode=True, delimiter=',')) -test_evaluator = dict(type='AccMetric') -test_cfg = dict(type='TestLoop') +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop) diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics600-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics600-rgb.py index 284c313e3d..84d1b295ef 100644 --- a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics600-rgb.py +++ 
b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics600-rgb.py @@ -1,11 +1,28 @@ -_base_ = ['../../_base_/default_runtime.py'] +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) # model settings num_frames = 16 model = dict( - type='Recognizer3D', + type=Recognizer3D, backbone=dict( - type='UniFormerV2', + type=UniFormerV2, input_resolution=224, patch_size=14, width=1024, @@ -25,13 +42,13 @@ drop_path_rate=0., mlp_dropout=[0.5, 0.5, 0.5, 0.5]), cls_head=dict( - type='TimeSformerHead', + type=TimeSformerHead, dropout_ratio=0.5, num_classes=600, in_channels=1024, average_clips='prob'), data_preprocessor=dict( - type='ActionDataPreprocessor', + type=ActionDataPreprocessor, mean=[114.75, 114.75, 114.75], std=[57.375, 57.375, 57.375], format_shape='NCTHW')) @@ -42,29 +59,27 @@ ann_file_test = 'data/k600/val.csv' test_pipeline = [ - dict(type='DecordInit'), - dict( - type='UniformSample', clip_len=num_frames, num_clips=4, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 224)), - dict(type='ThreeCrop', crop_size=224), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] test_dataloader = dict( batch_size=16, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_test, data_prefix=dict(video=data_root_val), pipeline=test_pipeline, test_mode=True, delimiter=',')) -test_evaluator = dict(type='AccMetric') -test_cfg = dict(type='TestLoop') +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop) diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics700-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics700-rgb.py index f137564572..b94bb75abf 100644 --- a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics700-rgb.py +++ b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics700-rgb.py @@ -1,11 +1,28 @@ -_base_ = ['../../_base_/default_runtime.py'] +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) # model settings num_frames = 16 model = dict( - type='Recognizer3D', + type=Recognizer3D, backbone=dict( - type='UniFormerV2', + type=UniFormerV2, input_resolution=224, patch_size=14, width=1024, @@ -25,13 +42,13 @@ drop_path_rate=0., mlp_dropout=[0.5, 0.5, 0.5, 0.5]), cls_head=dict( - type='TimeSformerHead', + type=TimeSformerHead, dropout_ratio=0.5, num_classes=700, in_channels=1024, average_clips='prob'), data_preprocessor=dict( - type='ActionDataPreprocessor', + type=ActionDataPreprocessor, mean=[114.75, 114.75, 114.75], std=[57.375, 57.375, 57.375], format_shape='NCTHW')) @@ -42,29 +59,27 @@ ann_file_test = 'data/k700/val.csv' test_pipeline = [ - dict(type='DecordInit'), - dict( - type='UniformSample', clip_len=num_frames, num_clips=4, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 224)), - dict(type='ThreeCrop', crop_size=224), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] test_dataloader = dict( batch_size=16, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_test, data_prefix=dict(video=data_root_val), pipeline=test_pipeline, test_mode=True, delimiter=',')) -test_evaluator = dict(type='AccMetric') -test_cfg = dict(type='TestLoop') +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop) diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb.py index 94b92cf99e..f1b8def59a 100644 --- a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb.py +++ b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb.py @@ -1,11 +1,28 @@ -_base_ = ['../../_base_/default_runtime.py'] +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) # model settings num_frames = 32 model = dict( - type='Recognizer3D', + type=Recognizer3D, backbone=dict( - type='UniFormerV2', + type=UniFormerV2, input_resolution=224, patch_size=14, width=1024, @@ -25,13 +42,13 @@ drop_path_rate=0., mlp_dropout=[0.5, 0.5, 0.5, 0.5]), cls_head=dict( - type='TimeSformerHead', + type=TimeSformerHead, dropout_ratio=0.5, num_classes=400, in_channels=1024, average_clips='prob'), data_preprocessor=dict( - type='ActionDataPreprocessor', + type=ActionDataPreprocessor, mean=[114.75, 114.75, 114.75], std=[57.375, 57.375, 57.375], format_shape='NCTHW')) @@ -42,29 +59,27 @@ ann_file_test = 'data/k400/val.csv' test_pipeline = [ - dict(type='DecordInit'), - dict( - type='UniformSample', clip_len=num_frames, num_clips=4, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 224)), - dict(type='ThreeCrop', crop_size=224), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] test_dataloader = dict( batch_size=16, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_test, data_prefix=dict(video=data_root_val), pipeline=test_pipeline, test_mode=True, delimiter=',')) -test_evaluator = dict(type='AccMetric') -test_cfg = dict(type='TestLoop') +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop) diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics600-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics600-rgb.py index 7a7ba254df..c6e16ef759 100644 --- a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics600-rgb.py +++ b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics600-rgb.py @@ -1,11 +1,28 @@ -_base_ = ['../../_base_/default_runtime.py'] +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) # model settings num_frames = 32 model = dict( - type='Recognizer3D', + type=Recognizer3D, backbone=dict( - type='UniFormerV2', + type=UniFormerV2, input_resolution=224, patch_size=14, width=1024, @@ -25,13 +42,13 @@ drop_path_rate=0., mlp_dropout=[0.5, 0.5, 0.5, 0.5]), cls_head=dict( - type='TimeSformerHead', + type=TimeSformerHead, dropout_ratio=0.5, num_classes=600, in_channels=1024, average_clips='prob'), data_preprocessor=dict( - type='ActionDataPreprocessor', + type=ActionDataPreprocessor, mean=[114.75, 114.75, 114.75], std=[57.375, 57.375, 57.375], format_shape='NCTHW')) @@ -42,29 +59,27 @@ ann_file_test = 'data/k600/val.csv' test_pipeline = [ - dict(type='DecordInit'), - dict( - type='UniformSample', clip_len=num_frames, num_clips=4, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 224)), - dict(type='ThreeCrop', crop_size=224), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] test_dataloader = dict( batch_size=16, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_test, data_prefix=dict(video=data_root_val), pipeline=test_pipeline, test_mode=True, delimiter=',')) -test_evaluator = dict(type='AccMetric') -test_cfg = dict(type='TestLoop') +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop) diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics700-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics700-rgb.py index abf8ff5f06..e715fca14f 100644 --- a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics700-rgb.py +++ b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics700-rgb.py @@ -1,11 +1,28 @@ -_base_ = ['../../_base_/default_runtime.py'] +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) # model settings num_frames = 32 model = dict( - type='Recognizer3D', + type=Recognizer3D, backbone=dict( - type='UniFormerV2', + type=UniFormerV2, input_resolution=224, patch_size=14, width=1024, @@ -25,13 +42,13 @@ drop_path_rate=0., mlp_dropout=[0.5, 0.5, 0.5, 0.5]), cls_head=dict( - type='TimeSformerHead', + type=TimeSformerHead, dropout_ratio=0.5, num_classes=700, in_channels=1024, average_clips='prob'), data_preprocessor=dict( - type='ActionDataPreprocessor', + type=ActionDataPreprocessor, mean=[114.75, 114.75, 114.75], std=[57.375, 57.375, 57.375], format_shape='NCTHW')) @@ -42,29 +59,27 @@ ann_file_test = 'data/k700/val.csv' test_pipeline = [ - dict(type='DecordInit'), - dict( - type='UniformSample', clip_len=num_frames, num_clips=4, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 224)), - dict(type='ThreeCrop', crop_size=224), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] test_dataloader = dict( batch_size=16, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_test, data_prefix=dict(video=data_root_val), pipeline=test_pipeline, test_mode=True, delimiter=',')) -test_evaluator = dict(type='AccMetric') -test_cfg = dict(type='TestLoop') +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop) diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py index 751a1cc7a8..6391e01825 100644 --- a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py +++ b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py @@ -1,11 +1,28 @@ -_base_ = ['../../_base_/default_runtime.py'] +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) # model settings num_frames = 8 model = dict( - type='Recognizer3D', + type=Recognizer3D, backbone=dict( - type='UniFormerV2', + type=UniFormerV2, input_resolution=224, patch_size=14, width=1024, @@ -25,13 +42,13 @@ drop_path_rate=0., mlp_dropout=[0.5, 0.5, 0.5, 0.5]), cls_head=dict( - type='TimeSformerHead', + type=TimeSformerHead, dropout_ratio=0.5, num_classes=400, in_channels=1024, average_clips='prob'), data_preprocessor=dict( - type='ActionDataPreprocessor', + type=ActionDataPreprocessor, mean=[114.75, 114.75, 114.75], std=[57.375, 57.375, 57.375], format_shape='NCTHW')) @@ -42,22 +59,20 @@ ann_file_test = 'data/k400/val.csv' test_pipeline = [ - dict(type='DecordInit'), - dict( - type='UniformSample', clip_len=num_frames, num_clips=4, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 224)), - dict(type='ThreeCrop', crop_size=224), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] test_dataloader = dict( batch_size=32, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict( type=dataset_type, ann_file=ann_file_test, @@ -66,5 +81,5 @@ test_mode=True, delimiter=',')) -test_evaluator = dict(type='AccMetric') -test_cfg = dict(type='TestLoop') +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop) diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb.py index ea6eea9a9a..dec1a65b6b 100644 --- a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb.py +++ b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb.py @@ -1,11 +1,28 @@ -_base_ = ['../../_base_/default_runtime.py'] +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) # model settings num_frames = 8 model = dict( - type='Recognizer3D', + type=Recognizer3D, backbone=dict( - type='UniFormerV2', + type=UniFormerV2, input_resolution=224, patch_size=14, width=1024, @@ -25,13 +42,13 @@ drop_path_rate=0., mlp_dropout=[0.5, 0.5, 0.5, 0.5]), cls_head=dict( - type='TimeSformerHead', + type=TimeSformerHead, dropout_ratio=0.5, num_classes=600, in_channels=1024, average_clips='prob'), data_preprocessor=dict( - type='ActionDataPreprocessor', + type=ActionDataPreprocessor, mean=[114.75, 114.75, 114.75], std=[57.375, 57.375, 57.375], format_shape='NCTHW')) @@ -42,22 +59,20 @@ ann_file_test = 'data/k600/val.csv' test_pipeline = [ - dict(type='DecordInit'), - dict( - type='UniformSample', clip_len=num_frames, num_clips=4, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 224)), - dict(type='ThreeCrop', crop_size=224), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] test_dataloader = dict( batch_size=32, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict( type=dataset_type, ann_file=ann_file_test, @@ -66,5 +81,5 @@ test_mode=True, delimiter=',')) -test_evaluator = dict(type='AccMetric') -test_cfg = dict(type='TestLoop') +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop) diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py index b68593afa3..8bc6cb4407 100644 --- a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py +++ b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py @@ -1,11 +1,28 @@ -_base_ = ['../../_base_/default_runtime.py'] +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) # model settings num_frames = 8 model = dict( - type='Recognizer3D', + type=Recognizer3D, backbone=dict( - type='UniFormerV2', + type=UniFormerV2, input_resolution=224, patch_size=14, width=1024, @@ -25,13 +42,13 @@ drop_path_rate=0., mlp_dropout=[0.5, 0.5, 0.5, 0.5]), cls_head=dict( - type='TimeSformerHead', + type=TimeSformerHead, dropout_ratio=0.5, num_classes=700, in_channels=1024, average_clips='prob'), data_preprocessor=dict( - type='ActionDataPreprocessor', + type=ActionDataPreprocessor, mean=[114.75, 114.75, 114.75], std=[57.375, 57.375, 57.375], format_shape='NCTHW')) @@ -42,29 +59,27 @@ ann_file_test = 'data/k700/val.csv' test_pipeline = [ - dict(type='DecordInit'), - dict( - type='UniformSample', clip_len=num_frames, num_clips=4, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 224)), - dict(type='ThreeCrop', crop_size=224), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] test_dataloader = dict( batch_size=32, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_test, data_prefix=dict(video=data_root_val), pipeline=test_pipeline, test_mode=True, delimiter=',')) -test_evaluator = dict(type='AccMetric') -test_cfg = dict(type='TestLoop') +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop) diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-pre_u8_kinetics710-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-pre_u8_kinetics710-rgb.py index 46a60758d8..c85b802da4 100644 --- a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-pre_u8_kinetics710-rgb.py +++ b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-pre_u8_kinetics710-rgb.py @@ -1,11 +1,28 @@ -_base_ = ['../../_base_/default_runtime.py'] +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) # model settings num_frames = 8 model = dict( - type='Recognizer3D', + type=Recognizer3D, backbone=dict( - type='UniFormerV2', + type=UniFormerV2, input_resolution=224, patch_size=14, width=1024, @@ -25,13 +42,13 @@ drop_path_rate=0., mlp_dropout=[0.5, 0.5, 0.5, 0.5]), cls_head=dict( - type='TimeSformerHead', + type=TimeSformerHead, dropout_ratio=0.5, num_classes=710, in_channels=1024, average_clips='prob'), data_preprocessor=dict( - type='ActionDataPreprocessor', + type=ActionDataPreprocessor, mean=[114.75, 114.75, 114.75], std=[57.375, 57.375, 57.375], format_shape='NCTHW')) diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics400-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics400-rgb.py index 5385c2aa07..373fe9f3bf 100644 --- a/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics400-rgb.py +++ b/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics400-rgb.py @@ -1,11 +1,28 @@ -_base_ = ['../../_base_/default_runtime.py'] +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) # model settings num_frames = 32 model = dict( - type='Recognizer3D', + type=Recognizer3D, backbone=dict( - type='UniFormerV2', + type=UniFormerV2, input_resolution=336, patch_size=14, width=1024, @@ -25,13 +42,13 @@ drop_path_rate=0., mlp_dropout=[0.5, 0.5, 0.5, 0.5]), cls_head=dict( - type='TimeSformerHead', + type=TimeSformerHead, dropout_ratio=0.5, num_classes=400, in_channels=1024, average_clips='prob'), data_preprocessor=dict( - type='ActionDataPreprocessor', + type=ActionDataPreprocessor, mean=[114.75, 114.75, 114.75], std=[57.375, 57.375, 57.375], format_shape='NCTHW')) @@ -42,29 +59,27 @@ ann_file_test = 'data/k400/val.csv' test_pipeline = [ - dict(type='DecordInit'), - dict( - type='UniformSample', clip_len=num_frames, num_clips=2, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 336)), - dict(type='ThreeCrop', crop_size=336), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=2, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 336)), + dict(type=ThreeCrop, crop_size=336), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] test_dataloader = dict( batch_size=4, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_test, data_prefix=dict(video=data_root_val), pipeline=test_pipeline, test_mode=True, delimiter=',')) -test_evaluator = dict(type='AccMetric') -test_cfg = dict(type='TestLoop') +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop) diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics600-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics600-rgb.py index 3e495771bc..3f1964d2c7 100644 --- a/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics600-rgb.py +++ b/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics600-rgb.py @@ -1,11 +1,28 @@ -_base_ = ['../../_base_/default_runtime.py'] +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) # model settings num_frames = 32 model = dict( - type='Recognizer3D', + type=Recognizer3D, backbone=dict( - type='UniFormerV2', + type=UniFormerV2, input_resolution=336, patch_size=14, width=1024, @@ -25,13 +42,13 @@ drop_path_rate=0., mlp_dropout=[0.5, 0.5, 0.5, 0.5]), cls_head=dict( - type='TimeSformerHead', + type=TimeSformerHead, dropout_ratio=0.5, num_classes=600, in_channels=1024, average_clips='prob'), data_preprocessor=dict( - type='ActionDataPreprocessor', + type=ActionDataPreprocessor, mean=[114.75, 114.75, 114.75], std=[57.375, 57.375, 57.375], format_shape='NCTHW')) @@ -42,29 +59,27 @@ ann_file_test = 'data/k600/val.csv' test_pipeline = [ - dict(type='DecordInit'), - dict( - type='UniformSample', clip_len=num_frames, num_clips=2, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 336)), - dict(type='ThreeCrop', crop_size=336), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=2, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 336)), + dict(type=ThreeCrop, crop_size=336), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] test_dataloader = dict( batch_size=4, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_test, data_prefix=dict(video=data_root_val), pipeline=test_pipeline, test_mode=True, delimiter=',')) -test_evaluator = dict(type='AccMetric') -test_cfg = dict(type='TestLoop') +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop) diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics700-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics700-rgb.py index 9a09934ca0..0ef24778f9 100644 --- a/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics700-rgb.py +++ b/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics700-rgb.py @@ -1,11 +1,28 @@ -_base_ = ['../../_base_/default_runtime.py'] +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) # model settings num_frames = 32 model = dict( - type='Recognizer3D', + type=Recognizer3D, backbone=dict( - type='UniFormerV2', + type=UniFormerV2, input_resolution=336, patch_size=14, width=1024, @@ -25,13 +42,13 @@ drop_path_rate=0., mlp_dropout=[0.5, 0.5, 0.5, 0.5]), cls_head=dict( - type='TimeSformerHead', + type=TimeSformerHead, dropout_ratio=0.5, num_classes=700, in_channels=1024, average_clips='prob'), data_preprocessor=dict( - type='ActionDataPreprocessor', + type=ActionDataPreprocessor, mean=[114.75, 114.75, 114.75], std=[57.375, 57.375, 57.375], format_shape='NCTHW')) @@ -42,29 +59,27 @@ ann_file_test = 'data/k700/val.csv' test_pipeline = [ - dict(type='DecordInit'), - dict( - type='UniformSample', clip_len=num_frames, num_clips=2, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 336)), - dict(type='ThreeCrop', crop_size=336), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=2, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 336)), + dict(type=ThreeCrop, crop_size=336), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] test_dataloader = dict( batch_size=4, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_test, data_prefix=dict(video=data_root_val), pipeline=test_pipeline, test_mode=True, delimiter=',')) -test_evaluator = dict(type='AccMetric') -test_cfg = dict(type='TestLoop') +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop) diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb.py index e47b8a7148..798a215bd1 100644 --- a/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb.py +++ b/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb.py @@ -1,11 +1,28 @@ -_base_ = ['../../_base_/default_runtime.py'] +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) # model settings num_frames = 32 model = dict( - type='Recognizer3D', + type=Recognizer3D, backbone=dict( - type='UniFormerV2', + type=UniFormerV2, input_resolution=336, patch_size=14, width=1024, @@ -25,13 +42,13 @@ drop_path_rate=0., mlp_dropout=[0.5, 0.5, 0.5, 0.5]), cls_head=dict( - type='TimeSformerHead', + type=TimeSformerHead, dropout_ratio=0.5, num_classes=710, in_channels=1024, average_clips='prob'), data_preprocessor=dict( - type='ActionDataPreprocessor', + type=ActionDataPreprocessor, mean=[114.75, 114.75, 114.75], std=[57.375, 57.375, 57.375], format_shape='NCTHW')) diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py index 19af3d1eac..2687bec030 100644 --- a/configs/recognition/uniformerv2/uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py +++ b/configs/recognition/uniformerv2/uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py @@ -1,11 +1,28 @@ -_base_ = ['../../_base_/default_runtime.py'] +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) # model settings num_frames = 8 model = dict( - type='Recognizer3D', + type=Recognizer3D, backbone=dict( - type='UniFormerV2', + type=UniFormerV2, input_resolution=224, patch_size=14, width=1024, @@ -25,13 +42,13 @@ drop_path_rate=0., mlp_dropout=[0.5, 0.5, 0.5, 0.5]), cls_head=dict( - type='TimeSformerHead', + type=TimeSformerHead, dropout_ratio=0.5, num_classes=339, in_channels=1024, average_clips='prob'), data_preprocessor=dict( - type='ActionDataPreprocessor', + type=ActionDataPreprocessor, mean=[114.75, 114.75, 114.75], std=[57.375, 57.375, 57.375], format_shape='NCTHW')) @@ -42,29 +59,27 @@ ann_file_test = 'data/mit_v1/val.csv' test_pipeline = [ - dict(type='DecordInit'), - dict( - type='UniformSample', clip_len=num_frames, num_clips=4, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 224)), - dict(type='ThreeCrop', crop_size=224), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] test_dataloader = dict( batch_size=32, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_test, data_prefix=dict(video=data_root_val), pipeline=test_pipeline, test_mode=True, delimiter=' ')) -test_evaluator = dict(type='AccMetric') -test_cfg = dict(type='TestLoop') +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop) diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p16-res336_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p16-res336_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py index 4bd6537603..bddc27e89a 100644 --- a/configs/recognition/uniformerv2/uniformerv2-large-p16-res336_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py +++ b/configs/recognition/uniformerv2/uniformerv2-large-p16-res336_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py @@ -1,11 +1,28 @@ -_base_ = ['../../_base_/default_runtime.py'] +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) # model settings num_frames = 8 model = dict( - type='Recognizer3D', + type=Recognizer3D, backbone=dict( - type='UniFormerV2', + type=UniFormerV2, input_resolution=336, patch_size=14, width=1024, @@ -25,13 +42,13 @@ drop_path_rate=0., mlp_dropout=[0.5, 0.5, 0.5, 0.5]), cls_head=dict( - type='TimeSformerHead', + type=TimeSformerHead, dropout_ratio=0.5, num_classes=339, in_channels=1024, average_clips='prob'), data_preprocessor=dict( - type='ActionDataPreprocessor', + type=ActionDataPreprocessor, mean=[114.75, 114.75, 114.75], std=[57.375, 57.375, 57.375], format_shape='NCTHW')) @@ -42,29 +59,27 @@ ann_file_test = 'data/mit_v1/val.csv' test_pipeline = [ - dict(type='DecordInit'), - dict( - type='UniformSample', clip_len=num_frames, num_clips=4, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 336)), - dict(type='ThreeCrop', crop_size=336), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 336)), + dict(type=ThreeCrop, crop_size=336), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] test_dataloader = dict( batch_size=8, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_test, data_prefix=dict(video=data_root_val), pipeline=test_pipeline, test_mode=True, delimiter=' ')) -test_evaluator = dict(type='AccMetric') -test_cfg = dict(type='TestLoop') +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop)
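
All of the diffs above apply the same migration: the `_base_` string inheritance and registry-string `type=` fields are replaced by `read_base()` plus direct imports, so each config becomes an ordinary Python module whose `type=` values are the classes themselves. For reference, a minimal sketch of how one of these converted configs can be loaded and evaluated, assuming a recent mmengine that auto-detects the pure-Python config style; the config path, work_dir, and checkpoint below are illustrative placeholders, not values taken from this patch.

# Minimal sketch: load a converted pure-Python config and run its test loop.
# Paths are examples only; adjust them to your checkout and checkpoint.
from mmengine.config import Config
from mmengine.runner import Runner

cfg = Config.fromfile(
    'configs/recognition/uniformerv2/'
    'uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb.py')
cfg.work_dir = 'work_dirs/uniformerv2_l14_u32_k400'          # output directory (placeholder)
cfg.load_from = 'checkpoints/uniformerv2_l14_u32_k400.pth'   # checkpoint path (placeholder)

runner = Runner.from_cfg(cfg)  # builds the test dataloader, evaluator and TestLoop from the config
runner.test()

The stock tools/test.py entry point should accept these configs unchanged, since it follows the same Config.fromfile / Runner.from_cfg path.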