diff --git a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb.py b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb.py index a6e37c330a..1dfd9f976c 100644 --- a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb.py +++ b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb.py @@ -1,11 +1,28 @@ -_base_ = ['../../_base_/default_runtime.py'] +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) # model settings num_frames = 8 model = dict( - type='Recognizer3D', + type=Recognizer3D, backbone=dict( - type='UniFormerV2', + type=UniFormerV2, input_resolution=224, patch_size=16, width=768, @@ -31,13 +48,13 @@ 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics400-rgb_20221219-203d6aac.pth', # noqa: E501 prefix='backbone.')), cls_head=dict( - type='TimeSformerHead', + type=TimeSformerHead, dropout_ratio=0.5, num_classes=339, in_channels=768, average_clips='prob'), data_preprocessor=dict( - type='ActionDataPreprocessor', + type=ActionDataPreprocessor, mean=[114.75, 114.75, 114.75], std=[57.375, 57.375, 57.375], format_shape='NCTHW')) @@ -52,53 +69,46 @@ file_client_args = dict(io_backend='disk') train_pipeline = [ - dict(type='DecordInit', **file_client_args), - dict(type='UniformSample', clip_len=num_frames, num_clips=1), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 256)), + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 256)), dict( - type='PytorchVideoWrapper', - op='RandAugment', - magnitude=7, - num_layers=4), - dict(type='RandomResizedCrop'), - dict(type='Resize', scale=(224, 224), keep_ratio=False), - dict(type='Flip', flip_ratio=0.5), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + type=PytorchVideoWrapper, op='RandAugment', magnitude=7, num_layers=4), + dict(type=RandomResizedCrop), + dict(type=Resize, scale=(224, 224), keep_ratio=False), + dict(type=Flip, flip_ratio=0.5), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] val_pipeline = [ - dict(type='DecordInit', **file_client_args), - dict( - type='UniformSample', clip_len=num_frames, num_clips=1, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 224)), - dict(type='CenterCrop', crop_size=224), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1, test_mode=True), + dict(type=DecordDecode), 
+ dict(type=Resize, scale=(-1, 224)), + dict(type=CenterCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] test_pipeline = [ - dict(type='DecordInit', **file_client_args), - dict( - type='UniformSample', clip_len=num_frames, num_clips=4, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 224)), - dict(type='ThreeCrop', crop_size=224), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] train_dataloader = dict( batch_size=8, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=True), + sampler=dict(type=DefaultSampler, shuffle=True), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_train, data_prefix=dict(video=data_root), pipeline=train_pipeline)) @@ -106,9 +116,9 @@ batch_size=8, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_val, data_prefix=dict(video=data_root_val), pipeline=val_pipeline, @@ -117,38 +127,38 @@ batch_size=8, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_test, data_prefix=dict(video=data_root_val), pipeline=test_pipeline, test_mode=True)) -val_evaluator = dict(type='AccMetric') -test_evaluator = dict(type='AccMetric') +val_evaluator = dict(type=AccMetric) +test_evaluator = dict(type=AccMetric) train_cfg = dict( - type='EpochBasedTrainLoop', max_epochs=24, val_begin=1, val_interval=1) -val_cfg = dict(type='ValLoop') -test_cfg = dict(type='TestLoop') + type=EpochBasedTrainLoop, max_epochs=24, val_begin=1, val_interval=1) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) base_lr = 2e-5 optim_wrapper = dict( optimizer=dict( - type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + type=AdamW, lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), clip_grad=dict(max_norm=20, norm_type=2)) param_scheduler = [ dict( - type='LinearLR', + type=LinearLR, start_factor=1 / 20, by_epoch=True, begin=0, end=5, convert_to_iter_based=True), dict( - type='CosineAnnealingLR', + type=CosineAnnealingLR, eta_min_ratio=1 / 20, by_epoch=True, begin=5, @@ -156,8 +166,10 @@ convert_to_iter_based=True) ] -default_hooks = dict( - checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) +default_hooks.update( + dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), + logger=dict(interval=100))) # Default setting for scaling LR automatically # - `enable` means enable scaling LR automatically diff --git a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb.py b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb.py index 4e47cabb84..5b57aacfc6 100644 --- a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb.py +++ 
b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb.py @@ -1,11 +1,28 @@ -_base_ = ['../../_base_/default_runtime.py'] +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + UniFormerHead, UniFormerV2) # model settings num_frames = 8 model = dict( - type='Recognizer3D', + type=Recognizer3D, backbone=dict( - type='UniFormerV2', + type=UniFormerV2, input_resolution=224, patch_size=16, width=768, @@ -31,7 +48,7 @@ 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501 prefix='backbone.')), cls_head=dict( - type='UniFormerHead', + type=UniFormerHead, dropout_ratio=0.5, num_classes=400, in_channels=768, @@ -44,7 +61,7 @@ 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501 prefix='cls_head.')), data_preprocessor=dict( - type='ActionDataPreprocessor', + type=ActionDataPreprocessor, mean=[114.75, 114.75, 114.75], std=[57.375, 57.375, 57.375], format_shape='NCTHW')) @@ -59,53 +76,46 @@ file_client_args = dict(io_backend='disk') train_pipeline = [ - dict(type='DecordInit', **file_client_args), - dict(type='UniformSample', clip_len=num_frames, num_clips=1), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 256)), + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 256)), dict( - type='PytorchVideoWrapper', - op='RandAugment', - magnitude=7, - num_layers=4), - dict(type='RandomResizedCrop'), - dict(type='Resize', scale=(224, 224), keep_ratio=False), - dict(type='Flip', flip_ratio=0.5), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + type=PytorchVideoWrapper, op='RandAugment', magnitude=7, num_layers=4), + dict(type=RandomResizedCrop), + dict(type=Resize, scale=(224, 224), keep_ratio=False), + dict(type=Flip, flip_ratio=0.5), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] val_pipeline = [ - dict(type='DecordInit', **file_client_args), - dict( - type='UniformSample', clip_len=num_frames, num_clips=1, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 224)), - dict(type='CenterCrop', crop_size=224), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=CenterCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] test_pipeline = [ - dict(type='DecordInit', **file_client_args), - dict( - type='UniformSample', 
clip_len=num_frames, num_clips=4, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 224)), - dict(type='ThreeCrop', crop_size=224), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] train_dataloader = dict( batch_size=8, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=True), + sampler=dict(type=DefaultSampler, shuffle=True), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_train, data_prefix=dict(video=data_root), pipeline=train_pipeline)) @@ -113,9 +123,9 @@ batch_size=8, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_val, data_prefix=dict(video=data_root_val), pipeline=val_pipeline, @@ -124,38 +134,38 @@ batch_size=8, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_test, data_prefix=dict(video=data_root_val), pipeline=test_pipeline, test_mode=True)) -val_evaluator = dict(type='AccMetric') -test_evaluator = dict(type='AccMetric') +val_evaluator = dict(type=AccMetric) +test_evaluator = dict(type=AccMetric) train_cfg = dict( - type='EpochBasedTrainLoop', max_epochs=5, val_begin=1, val_interval=1) -val_cfg = dict(type='ValLoop') -test_cfg = dict(type='TestLoop') + type=EpochBasedTrainLoop, max_epochs=5, val_begin=1, val_interval=1) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) base_lr = 2e-6 optim_wrapper = dict( optimizer=dict( - type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + type=AdamW, lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), clip_grad=dict(max_norm=20, norm_type=2)) param_scheduler = [ dict( - type='LinearLR', + type=LinearLR, start_factor=0.5, by_epoch=True, begin=0, end=1, convert_to_iter_based=True), dict( - type='CosineAnnealingLR', + type=CosineAnnealingLR, T_max=4, eta_min_ratio=0.5, by_epoch=True, @@ -164,8 +174,10 @@ convert_to_iter_based=True) ] -default_hooks = dict( - checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) +default_hooks.update( + dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), + logger=dict(interval=100))) # Default setting for scaling LR automatically # - `enable` means enable scaling LR automatically diff --git a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb.py b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb.py index a9f6f61413..4616065b4c 100644 --- a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb.py +++ b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb.py @@ -1,11 +1,28 @@ -_base_ = ['../../_base_/default_runtime.py'] +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + UniFormerHead, UniFormerV2) # model settings num_frames = 8 model = dict( - type='Recognizer3D', + type=Recognizer3D, backbone=dict( - type='UniFormerV2', + type=UniFormerV2, input_resolution=224, patch_size=16, width=768, @@ -31,7 +48,7 @@ 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501 prefix='backbone.')), cls_head=dict( - type='UniFormerHead', + type=UniFormerHead, dropout_ratio=0.5, num_classes=600, in_channels=768, @@ -44,7 +61,7 @@ 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501 prefix='cls_head.')), data_preprocessor=dict( - type='ActionDataPreprocessor', + type=ActionDataPreprocessor, mean=[114.75, 114.75, 114.75], std=[57.375, 57.375, 57.375], format_shape='NCTHW')) @@ -59,53 +76,46 @@ file_client_args = dict(io_backend='disk') train_pipeline = [ - dict(type='DecordInit', **file_client_args), - dict(type='UniformSample', clip_len=num_frames, num_clips=1), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 256)), + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 256)), dict( - type='PytorchVideoWrapper', - op='RandAugment', - magnitude=7, - num_layers=4), - dict(type='RandomResizedCrop'), - dict(type='Resize', scale=(224, 224), keep_ratio=False), - dict(type='Flip', flip_ratio=0.5), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + type=PytorchVideoWrapper, op='RandAugment', magnitude=7, num_layers=4), + dict(type=RandomResizedCrop), + dict(type=Resize, scale=(224, 224), keep_ratio=False), + dict(type=Flip, flip_ratio=0.5), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] val_pipeline = [ - dict(type='DecordInit', **file_client_args), - dict( - type='UniformSample', clip_len=num_frames, num_clips=1, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 224)), - dict(type='CenterCrop', crop_size=224), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=CenterCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] test_pipeline = [ - dict(type='DecordInit', **file_client_args), - dict( - type='UniformSample', clip_len=num_frames, num_clips=4, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 224)), - dict(type='ThreeCrop', crop_size=224), - dict(type='FormatShape', input_format='NCTHW'), - 
dict(type='PackActionInputs') + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] train_dataloader = dict( batch_size=8, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=True), + sampler=dict(type=DefaultSampler, shuffle=True), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_train, data_prefix=dict(video=data_root), pipeline=train_pipeline)) @@ -113,9 +123,9 @@ batch_size=8, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_val, data_prefix=dict(video=data_root_val), pipeline=val_pipeline, @@ -124,38 +134,38 @@ batch_size=8, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_test, data_prefix=dict(video=data_root_val), pipeline=test_pipeline, test_mode=True)) -val_evaluator = dict(type='AccMetric') -test_evaluator = dict(type='AccMetric') +val_evaluator = dict(type=AccMetric) +test_evaluator = dict(type=AccMetric) train_cfg = dict( - type='EpochBasedTrainLoop', max_epochs=5, val_begin=1, val_interval=1) -val_cfg = dict(type='ValLoop') -test_cfg = dict(type='TestLoop') + type=EpochBasedTrainLoop, max_epochs=5, val_begin=1, val_interval=1) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) base_lr = 2e-6 optim_wrapper = dict( optimizer=dict( - type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + type=AdamW, lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), clip_grad=dict(max_norm=20, norm_type=2)) param_scheduler = [ dict( - type='LinearLR', + type=LinearLR, start_factor=0.5, by_epoch=True, begin=0, end=1, convert_to_iter_based=True), dict( - type='CosineAnnealingLR', + type=CosineAnnealingLR, T_max=4, eta_min_ratio=0.5, by_epoch=True, @@ -164,8 +174,10 @@ convert_to_iter_based=True) ] -default_hooks = dict( - checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) +default_hooks.update( + dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), + logger=dict(interval=100))) # Default setting for scaling LR automatically # - `enable` means enable scaling LR automatically diff --git a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.py b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.py index 5c59ad46f4..32e1bc72e9 100644 --- a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.py +++ b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.py @@ -1,11 +1,28 @@ -_base_ = ['../../_base_/default_runtime.py'] +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + UniFormerHead, UniFormerV2) # model settings num_frames = 8 model = dict( - type='Recognizer3D', + type=Recognizer3D, backbone=dict( - type='UniFormerV2', + type=UniFormerV2, input_resolution=224, patch_size=16, width=768, @@ -31,7 +48,7 @@ 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501 prefix='backbone.')), cls_head=dict( - type='UniFormerHead', + type=UniFormerHead, dropout_ratio=0.5, num_classes=700, in_channels=768, @@ -44,7 +61,7 @@ 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501 prefix='cls_head.')), data_preprocessor=dict( - type='ActionDataPreprocessor', + type=ActionDataPreprocessor, mean=[114.75, 114.75, 114.75], std=[57.375, 57.375, 57.375], format_shape='NCTHW')) @@ -59,53 +76,46 @@ file_client_args = dict(io_backend='disk') train_pipeline = [ - dict(type='DecordInit', **file_client_args), - dict(type='UniformSample', clip_len=num_frames, num_clips=1), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 256)), + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 256)), dict( - type='PytorchVideoWrapper', - op='RandAugment', - magnitude=7, - num_layers=4), - dict(type='RandomResizedCrop'), - dict(type='Resize', scale=(224, 224), keep_ratio=False), - dict(type='Flip', flip_ratio=0.5), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + type=PytorchVideoWrapper, op='RandAugment', magnitude=7, num_layers=4), + dict(type=RandomResizedCrop), + dict(type=Resize, scale=(224, 224), keep_ratio=False), + dict(type=Flip, flip_ratio=0.5), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] val_pipeline = [ - dict(type='DecordInit', **file_client_args), - dict( - type='UniformSample', clip_len=num_frames, num_clips=1, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 224)), - dict(type='CenterCrop', crop_size=224), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=CenterCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] test_pipeline = [ - dict(type='DecordInit', **file_client_args), - dict( - type='UniformSample', clip_len=num_frames, num_clips=4, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 224)), - dict(type='ThreeCrop', crop_size=224), - dict(type='FormatShape', input_format='NCTHW'), - 
dict(type='PackActionInputs') + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] train_dataloader = dict( batch_size=8, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=True), + sampler=dict(type=DefaultSampler, shuffle=True), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_train, data_prefix=dict(video=data_root), pipeline=train_pipeline)) @@ -113,9 +123,9 @@ batch_size=8, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_val, data_prefix=dict(video=data_root_val), pipeline=val_pipeline, @@ -124,38 +134,38 @@ batch_size=8, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_test, data_prefix=dict(video=data_root_val), pipeline=test_pipeline, test_mode=True)) -val_evaluator = dict(type='AccMetric') -test_evaluator = dict(type='AccMetric') +val_evaluator = dict(type=AccMetric) +test_evaluator = dict(type=AccMetric) train_cfg = dict( - type='EpochBasedTrainLoop', max_epochs=5, val_begin=1, val_interval=1) -val_cfg = dict(type='ValLoop') -test_cfg = dict(type='TestLoop') + type=EpochBasedTrainLoop, max_epochs=5, val_begin=1, val_interval=1) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) base_lr = 2e-6 optim_wrapper = dict( optimizer=dict( - type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + type=AdamW, lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), clip_grad=dict(max_norm=20, norm_type=2)) param_scheduler = [ dict( - type='LinearLR', + type=LinearLR, start_factor=0.5, by_epoch=True, begin=0, end=1, convert_to_iter_based=True), dict( - type='CosineAnnealingLR', + type=CosineAnnealingLR, T_max=4, eta_min_ratio=0.5, by_epoch=True, @@ -164,8 +174,10 @@ convert_to_iter_based=True) ] -default_hooks = dict( - checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) +default_hooks.update( + dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), + logger=dict(interval=100))) # Default setting for scaling LR automatically # - `enable` means enable scaling LR automatically diff --git a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb.py b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb.py index 7d055c4fb4..84e6f6729f 100644 --- a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb.py +++ b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb.py @@ -1,11 +1,28 @@ -_base_ = ['../../_base_/default_runtime.py'] +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) # model settings num_frames = 8 model = dict( - type='Recognizer3D', + type=Recognizer3D, backbone=dict( - type='UniFormerV2', + type=UniFormerV2, input_resolution=224, patch_size=16, width=768, @@ -25,13 +42,13 @@ drop_path_rate=0., mlp_dropout=[0.5, 0.5, 0.5, 0.5]), cls_head=dict( - type='TimeSformerHead', + type=TimeSformerHead, dropout_ratio=0.5, num_classes=710, in_channels=768, average_clips='prob'), data_preprocessor=dict( - type='ActionDataPreprocessor', + type=ActionDataPreprocessor, mean=[114.75, 114.75, 114.75], std=[57.375, 57.375, 57.375], format_shape='NCTHW')) diff --git a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb.py b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb.py index 6e9c4f3908..6db31b373e 100644 --- a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb.py +++ b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb.py @@ -1,11 +1,28 @@ -_base_ = ['../../_base_/default_runtime.py'] +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + UniFormerHead, UniFormerV2) # model settings num_frames = 8 model = dict( - type='Recognizer3D', + type=Recognizer3D, backbone=dict( - type='UniFormerV2', + type=UniFormerV2, input_resolution=224, patch_size=16, width=768, @@ -27,13 +44,13 @@ clip_pretrained=True, pretrained='ViT-B/16'), cls_head=dict( - type='UniFormerHead', + type=UniFormerHead, dropout_ratio=0.5, num_classes=400, in_channels=768, average_clips='prob'), data_preprocessor=dict( - type='ActionDataPreprocessor', + type=ActionDataPreprocessor, mean=[114.75, 114.75, 114.75], std=[57.375, 57.375, 57.375], format_shape='NCTHW')) @@ -48,53 +65,46 @@ file_client_args = dict(io_backend='disk') train_pipeline = [ - dict(type='DecordInit', **file_client_args), - dict(type='UniformSample', clip_len=num_frames, num_clips=1), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 256)), + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 256)), dict( - type='PytorchVideoWrapper', - op='RandAugment', - magnitude=7, - num_layers=4), - dict(type='RandomResizedCrop'), - dict(type='Resize', scale=(224, 224), keep_ratio=False), - dict(type='Flip', flip_ratio=0.5), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + type=PytorchVideoWrapper, op='RandAugment', magnitude=7, num_layers=4), + dict(type=RandomResizedCrop), + dict(type=Resize, scale=(224, 224), keep_ratio=False), + dict(type=Flip, flip_ratio=0.5), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] val_pipeline = [ - dict(type='DecordInit', **file_client_args), - dict( - type='UniformSample', clip_len=num_frames, num_clips=1, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 224)), - dict(type='CenterCrop', crop_size=224), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=CenterCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] test_pipeline = [ - dict(type='DecordInit', **file_client_args), - dict( - type='UniformSample', clip_len=num_frames, num_clips=4, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 224)), - dict(type='ThreeCrop', crop_size=224), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] train_dataloader = dict( 
batch_size=8, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=True), + sampler=dict(type=DefaultSampler, shuffle=True), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_train, data_prefix=dict(video=data_root), pipeline=train_pipeline)) @@ -102,9 +112,9 @@ batch_size=8, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_val, data_prefix=dict(video=data_root_val), pipeline=val_pipeline, @@ -113,38 +123,38 @@ batch_size=8, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_test, data_prefix=dict(video=data_root_val), pipeline=test_pipeline, test_mode=True)) -val_evaluator = dict(type='AccMetric') -test_evaluator = dict(type='AccMetric') +val_evaluator = dict(type=AccMetric) +test_evaluator = dict(type=AccMetric) train_cfg = dict( - type='EpochBasedTrainLoop', max_epochs=55, val_begin=1, val_interval=1) -val_cfg = dict(type='ValLoop') -test_cfg = dict(type='TestLoop') + type=EpochBasedTrainLoop, max_epochs=55, val_begin=1, val_interval=1) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) base_lr = 1e-5 optim_wrapper = dict( optimizer=dict( - type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + type=AdamW, lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), clip_grad=dict(max_norm=20, norm_type=2)) param_scheduler = [ dict( - type='LinearLR', + type=LinearLR, start_factor=0.1, by_epoch=True, begin=0, end=5, convert_to_iter_based=True), dict( - type='CosineAnnealingLR', + type=CosineAnnealingLR, T_max=50, eta_min_ratio=0.1, by_epoch=True, @@ -153,8 +163,10 @@ convert_to_iter_based=True) ] -default_hooks = dict( - checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) +default_hooks.update( + dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), + logger=dict(interval=100))) # Default setting for scaling LR automatically # - `enable` means enable scaling LR automatically diff --git a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb.py b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb.py index 4a5b41d8c7..6b8cb00c13 100644 --- a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb.py +++ b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb.py @@ -1,11 +1,28 @@ -_base_ = ['../../_base_/default_runtime.py'] +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + UniFormerHead, UniFormerV2) # model settings num_frames = 8 model = dict( - type='Recognizer3D', + type=Recognizer3D, backbone=dict( - type='UniFormerV2', + type=UniFormerV2, input_resolution=224, patch_size=16, width=768, @@ -27,13 +44,13 @@ clip_pretrained=True, pretrained='ViT-B/16'), cls_head=dict( - type='UniFormerHead', + type=UniFormerHead, dropout_ratio=0.5, num_classes=700, in_channels=768, average_clips='prob'), data_preprocessor=dict( - type='ActionDataPreprocessor', + type=ActionDataPreprocessor, mean=[114.75, 114.75, 114.75], std=[57.375, 57.375, 57.375], format_shape='NCTHW')) @@ -48,53 +65,46 @@ file_client_args = dict(io_backend='disk') train_pipeline = [ - dict(type='DecordInit', **file_client_args), - dict(type='UniformSample', clip_len=num_frames, num_clips=1), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 256)), + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 256)), dict( - type='PytorchVideoWrapper', - op='RandAugment', - magnitude=7, - num_layers=4), - dict(type='RandomResizedCrop'), - dict(type='Resize', scale=(224, 224), keep_ratio=False), - dict(type='Flip', flip_ratio=0.5), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + type=PytorchVideoWrapper, op='RandAugment', magnitude=7, num_layers=4), + dict(type=RandomResizedCrop), + dict(type=Resize, scale=(224, 224), keep_ratio=False), + dict(type=Flip, flip_ratio=0.5), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] val_pipeline = [ - dict(type='DecordInit', **file_client_args), - dict( - type='UniformSample', clip_len=num_frames, num_clips=1, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 224)), - dict(type='CenterCrop', crop_size=224), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=CenterCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] test_pipeline = [ - dict(type='DecordInit', **file_client_args), - dict( - type='UniformSample', clip_len=num_frames, num_clips=4, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 224)), - dict(type='ThreeCrop', crop_size=224), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] train_dataloader = dict( 
batch_size=8, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=True), + sampler=dict(type=DefaultSampler, shuffle=True), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_train, data_prefix=dict(video=data_root), pipeline=train_pipeline)) @@ -102,9 +112,9 @@ batch_size=8, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_val, data_prefix=dict(video=data_root_val), pipeline=val_pipeline, @@ -113,38 +123,38 @@ batch_size=8, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_test, data_prefix=dict(video=data_root_val), pipeline=test_pipeline, test_mode=True)) -val_evaluator = dict(type='AccMetric') -test_evaluator = dict(type='AccMetric') +val_evaluator = dict(type=AccMetric) +test_evaluator = dict(type=AccMetric) train_cfg = dict( - type='EpochBasedTrainLoop', max_epochs=55, val_begin=1, val_interval=1) -val_cfg = dict(type='ValLoop') -test_cfg = dict(type='TestLoop') + type=EpochBasedTrainLoop, max_epochs=55, val_begin=1, val_interval=1) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) base_lr = 1e-5 optim_wrapper = dict( optimizer=dict( - type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + type=AdamW, lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), clip_grad=dict(max_norm=20, norm_type=2)) param_scheduler = [ dict( - type='LinearLR', + type=LinearLR, start_factor=0.1, by_epoch=True, begin=0, end=5, convert_to_iter_based=True), dict( - type='CosineAnnealingLR', + type=CosineAnnealingLR, T_max=50, eta_min_ratio=0.1, by_epoch=True, @@ -153,8 +163,10 @@ convert_to_iter_based=True) ] -default_hooks = dict( - checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) +default_hooks.update( + dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), + logger=dict(interval=100))) # Default setting for scaling LR automatically # - `enable` means enable scaling LR automatically diff --git a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_u8_kinetics710-rgb.py b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_u8_kinetics710-rgb.py index 72dada4766..72527a79be 100644 --- a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_u8_kinetics710-rgb.py +++ b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_u8_kinetics710-rgb.py @@ -1,11 +1,28 @@ -_base_ = ['../../_base_/default_runtime.py'] +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import ConcatDataset, DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, ConcatDataset, DecordDecode, + DecordInit, Flip, FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) # model settings num_frames = 8 model = dict( - type='Recognizer3D', + type=Recognizer3D, backbone=dict( - type='UniFormerV2', + type=UniFormerV2, input_resolution=224, patch_size=16, width=768, @@ -27,57 +44,50 @@ clip_pretrained=True, pretrained='ViT-B/16'), cls_head=dict( - type='TimeSformerHead', + type=TimeSformerHead, dropout_ratio=0.5, num_classes=710, in_channels=768, average_clips='prob'), data_preprocessor=dict( - type='ActionDataPreprocessor', + type=ActionDataPreprocessor, mean=[114.75, 114.75, 114.75], std=[57.375, 57.375, 57.375], format_shape='NCTHW')) file_client_args = dict(io_backend='disk') train_pipeline = [ - dict(type='DecordInit', **file_client_args), - dict(type='UniformSample', clip_len=num_frames, num_clips=1), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 256)), + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 256)), dict( - type='PytorchVideoWrapper', - op='RandAugment', - magnitude=7, - num_layers=4), - dict(type='RandomResizedCrop'), - dict(type='Resize', scale=(224, 224), keep_ratio=False), - dict(type='Flip', flip_ratio=0.5), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + type=PytorchVideoWrapper, op='RandAugment', magnitude=7, num_layers=4), + dict(type=RandomResizedCrop), + dict(type=Resize, scale=(224, 224), keep_ratio=False), + dict(type=Flip, flip_ratio=0.5), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] val_pipeline = [ - dict(type='DecordInit', **file_client_args), - dict( - type='UniformSample', clip_len=num_frames, num_clips=1, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 224)), - dict(type='CenterCrop', crop_size=224), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=CenterCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] test_pipeline = [ - dict(type='DecordInit', **file_client_args), - dict( - type='UniformSample', clip_len=num_frames, num_clips=4, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 224)), - dict(type='ThreeCrop', crop_size=224), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + 
dict(type=PackActionInputs) ] # dataset settings @@ -97,35 +107,35 @@ k700_ann_file_val = 'data/kinetics710/k700_val_list_videos.txt' k400_trainset = dict( - type='VideoDataset', + type=VideoDataset, ann_file=k400_ann_file_train, data_prefix=dict(video=k400_data_root), pipeline=train_pipeline) k600_trainset = dict( - type='VideoDataset', + type=VideoDataset, ann_file=k600_ann_file_train, data_prefix=dict(video=k600_data_root), pipeline=train_pipeline) k700_trainset = dict( - type='VideoDataset', + type=VideoDataset, ann_file=k700_ann_file_train, data_prefix=dict(video=k700_data_root), pipeline=train_pipeline) k400_valset = dict( - type='VideoDataset', + type=VideoDataset, ann_file=k400_ann_file_val, data_prefix=dict(video=k400_data_root_val), pipeline=val_pipeline, test_mode=True) k600_valset = dict( - type='VideoDataset', + type=VideoDataset, ann_file=k600_ann_file_val, data_prefix=dict(video=k600_data_root_val), pipeline=val_pipeline, test_mode=True) k700_valset = dict( - type='VideoDataset', + type=VideoDataset, ann_file=k700_ann_file_val, data_prefix=dict(video=k700_data_root_val), pipeline=val_pipeline, @@ -139,12 +149,11 @@ k700_testset['pipeline'] = test_pipeline k710_trainset = dict( - type='ConcatDataset', - datasets=[k400_trainset, k600_trainset, k700_trainset]) + type=ConcatDataset, datasets=[k400_trainset, k600_trainset, k700_trainset]) k710_valset = dict( - type='ConcatDataset', datasets=[k400_valset, k600_valset, k700_valset]) + type=ConcatDataset, datasets=[k400_valset, k600_valset, k700_valset]) k710_testset = dict( - type='ConcatDataset', + type=ConcatDataset, datasets=[k400_testset, k600_testset, k700_testset], ) @@ -152,45 +161,45 @@ batch_size=8, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=True), + sampler=dict(type=DefaultSampler, shuffle=True), dataset=k710_trainset) val_dataloader = dict( batch_size=8, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=k710_valset) test_dataloader = dict( batch_size=8, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=k710_testset) -val_evaluator = dict(type='AccMetric') -test_evaluator = dict(type='AccMetric') +val_evaluator = dict(type=AccMetric) +test_evaluator = dict(type=AccMetric) train_cfg = dict( - type='EpochBasedTrainLoop', max_epochs=55, val_begin=1, val_interval=1) -val_cfg = dict(type='ValLoop') -test_cfg = dict(type='TestLoop') + type=EpochBasedTrainLoop, max_epochs=55, val_begin=1, val_interval=1) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) base_lr = 1e-5 optim_wrapper = dict( optimizer=dict( - type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + type=AdamW, lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), clip_grad=dict(max_norm=20, norm_type=2)) param_scheduler = [ dict( - type='LinearLR', + type=LinearLR, start_factor=0.5, by_epoch=True, begin=0, end=5, convert_to_iter_based=True), dict( - type='CosineAnnealingLR', + type=CosineAnnealingLR, T_max=50, eta_min_ratio=0.5, by_epoch=True, @@ -199,8 +208,10 @@ convert_to_iter_based=True) ] -default_hooks = dict( - checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) +default_hooks.update( + dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), + logger=dict(interval=100))) # Default setting for scaling LR 
automatically # - `enable` means enable scaling LR automatically diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics400-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics400-rgb.py index 5f21a078f8..8c53a18dee 100644 --- a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics400-rgb.py +++ b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics400-rgb.py @@ -1,11 +1,28 @@ -_base_ = ['../../_base_/default_runtime.py'] +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) # model settings num_frames = 16 model = dict( - type='Recognizer3D', + type=Recognizer3D, backbone=dict( - type='UniFormerV2', + type=UniFormerV2, input_resolution=224, patch_size=14, width=1024, @@ -25,13 +42,13 @@ drop_path_rate=0., mlp_dropout=[0.5, 0.5, 0.5, 0.5]), cls_head=dict( - type='TimeSformerHead', + type=TimeSformerHead, dropout_ratio=0.5, num_classes=400, in_channels=1024, average_clips='prob'), data_preprocessor=dict( - type='ActionDataPreprocessor', + type=ActionDataPreprocessor, mean=[114.75, 114.75, 114.75], std=[57.375, 57.375, 57.375], format_shape='NCTHW')) @@ -42,29 +59,27 @@ ann_file_test = 'data/k400/val.csv' test_pipeline = [ - dict(type='DecordInit'), - dict( - type='UniformSample', clip_len=num_frames, num_clips=4, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 224)), - dict(type='ThreeCrop', crop_size=224), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] test_dataloader = dict( batch_size=16, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_test, data_prefix=dict(video=data_root_val), pipeline=test_pipeline, test_mode=True, delimiter=',')) -test_evaluator = dict(type='AccMetric') -test_cfg = dict(type='TestLoop') +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop) diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics600-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics600-rgb.py index 284c313e3d..84d1b295ef 100644 --- a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics600-rgb.py +++ 
b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics600-rgb.py @@ -1,11 +1,28 @@ -_base_ = ['../../_base_/default_runtime.py'] +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) # model settings num_frames = 16 model = dict( - type='Recognizer3D', + type=Recognizer3D, backbone=dict( - type='UniFormerV2', + type=UniFormerV2, input_resolution=224, patch_size=14, width=1024, @@ -25,13 +42,13 @@ drop_path_rate=0., mlp_dropout=[0.5, 0.5, 0.5, 0.5]), cls_head=dict( - type='TimeSformerHead', + type=TimeSformerHead, dropout_ratio=0.5, num_classes=600, in_channels=1024, average_clips='prob'), data_preprocessor=dict( - type='ActionDataPreprocessor', + type=ActionDataPreprocessor, mean=[114.75, 114.75, 114.75], std=[57.375, 57.375, 57.375], format_shape='NCTHW')) @@ -42,29 +59,27 @@ ann_file_test = 'data/k600/val.csv' test_pipeline = [ - dict(type='DecordInit'), - dict( - type='UniformSample', clip_len=num_frames, num_clips=4, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 224)), - dict(type='ThreeCrop', crop_size=224), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] test_dataloader = dict( batch_size=16, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_test, data_prefix=dict(video=data_root_val), pipeline=test_pipeline, test_mode=True, delimiter=',')) -test_evaluator = dict(type='AccMetric') -test_cfg = dict(type='TestLoop') +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop) diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics700-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics700-rgb.py index f137564572..b94bb75abf 100644 --- a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics700-rgb.py +++ b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics700-rgb.py @@ -1,11 +1,28 @@ -_base_ = ['../../_base_/default_runtime.py'] +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) # model settings num_frames = 16 model = dict( - type='Recognizer3D', + type=Recognizer3D, backbone=dict( - type='UniFormerV2', + type=UniFormerV2, input_resolution=224, patch_size=14, width=1024, @@ -25,13 +42,13 @@ drop_path_rate=0., mlp_dropout=[0.5, 0.5, 0.5, 0.5]), cls_head=dict( - type='TimeSformerHead', + type=TimeSformerHead, dropout_ratio=0.5, num_classes=700, in_channels=1024, average_clips='prob'), data_preprocessor=dict( - type='ActionDataPreprocessor', + type=ActionDataPreprocessor, mean=[114.75, 114.75, 114.75], std=[57.375, 57.375, 57.375], format_shape='NCTHW')) @@ -42,29 +59,27 @@ ann_file_test = 'data/k700/val.csv' test_pipeline = [ - dict(type='DecordInit'), - dict( - type='UniformSample', clip_len=num_frames, num_clips=4, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 224)), - dict(type='ThreeCrop', crop_size=224), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] test_dataloader = dict( batch_size=16, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_test, data_prefix=dict(video=data_root_val), pipeline=test_pipeline, test_mode=True, delimiter=',')) -test_evaluator = dict(type='AccMetric') -test_cfg = dict(type='TestLoop') +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop) diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb.py index 94b92cf99e..f1b8def59a 100644 --- a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb.py +++ b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb.py @@ -1,11 +1,28 @@ -_base_ = ['../../_base_/default_runtime.py'] +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) # model settings num_frames = 32 model = dict( - type='Recognizer3D', + type=Recognizer3D, backbone=dict( - type='UniFormerV2', + type=UniFormerV2, input_resolution=224, patch_size=14, width=1024, @@ -25,13 +42,13 @@ drop_path_rate=0., mlp_dropout=[0.5, 0.5, 0.5, 0.5]), cls_head=dict( - type='TimeSformerHead', + type=TimeSformerHead, dropout_ratio=0.5, num_classes=400, in_channels=1024, average_clips='prob'), data_preprocessor=dict( - type='ActionDataPreprocessor', + type=ActionDataPreprocessor, mean=[114.75, 114.75, 114.75], std=[57.375, 57.375, 57.375], format_shape='NCTHW')) @@ -42,29 +59,27 @@ ann_file_test = 'data/k400/val.csv' test_pipeline = [ - dict(type='DecordInit'), - dict( - type='UniformSample', clip_len=num_frames, num_clips=4, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 224)), - dict(type='ThreeCrop', crop_size=224), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] test_dataloader = dict( batch_size=16, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_test, data_prefix=dict(video=data_root_val), pipeline=test_pipeline, test_mode=True, delimiter=',')) -test_evaluator = dict(type='AccMetric') -test_cfg = dict(type='TestLoop') +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop) diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics600-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics600-rgb.py index 7a7ba254df..c6e16ef759 100644 --- a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics600-rgb.py +++ b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics600-rgb.py @@ -1,11 +1,28 @@ -_base_ = ['../../_base_/default_runtime.py'] +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) # model settings num_frames = 32 model = dict( - type='Recognizer3D', + type=Recognizer3D, backbone=dict( - type='UniFormerV2', + type=UniFormerV2, input_resolution=224, patch_size=14, width=1024, @@ -25,13 +42,13 @@ drop_path_rate=0., mlp_dropout=[0.5, 0.5, 0.5, 0.5]), cls_head=dict( - type='TimeSformerHead', + type=TimeSformerHead, dropout_ratio=0.5, num_classes=600, in_channels=1024, average_clips='prob'), data_preprocessor=dict( - type='ActionDataPreprocessor', + type=ActionDataPreprocessor, mean=[114.75, 114.75, 114.75], std=[57.375, 57.375, 57.375], format_shape='NCTHW')) @@ -42,29 +59,27 @@ ann_file_test = 'data/k600/val.csv' test_pipeline = [ - dict(type='DecordInit'), - dict( - type='UniformSample', clip_len=num_frames, num_clips=4, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 224)), - dict(type='ThreeCrop', crop_size=224), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] test_dataloader = dict( batch_size=16, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_test, data_prefix=dict(video=data_root_val), pipeline=test_pipeline, test_mode=True, delimiter=',')) -test_evaluator = dict(type='AccMetric') -test_cfg = dict(type='TestLoop') +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop) diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics700-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics700-rgb.py index abf8ff5f06..e715fca14f 100644 --- a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics700-rgb.py +++ b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics700-rgb.py @@ -1,11 +1,28 @@ -_base_ = ['../../_base_/default_runtime.py'] +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) # model settings num_frames = 32 model = dict( - type='Recognizer3D', + type=Recognizer3D, backbone=dict( - type='UniFormerV2', + type=UniFormerV2, input_resolution=224, patch_size=14, width=1024, @@ -25,13 +42,13 @@ drop_path_rate=0., mlp_dropout=[0.5, 0.5, 0.5, 0.5]), cls_head=dict( - type='TimeSformerHead', + type=TimeSformerHead, dropout_ratio=0.5, num_classes=700, in_channels=1024, average_clips='prob'), data_preprocessor=dict( - type='ActionDataPreprocessor', + type=ActionDataPreprocessor, mean=[114.75, 114.75, 114.75], std=[57.375, 57.375, 57.375], format_shape='NCTHW')) @@ -42,29 +59,27 @@ ann_file_test = 'data/k700/val.csv' test_pipeline = [ - dict(type='DecordInit'), - dict( - type='UniformSample', clip_len=num_frames, num_clips=4, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 224)), - dict(type='ThreeCrop', crop_size=224), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] test_dataloader = dict( batch_size=16, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_test, data_prefix=dict(video=data_root_val), pipeline=test_pipeline, test_mode=True, delimiter=',')) -test_evaluator = dict(type='AccMetric') -test_cfg = dict(type='TestLoop') +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop) diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py index 751a1cc7a8..6391e01825 100644 --- a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py +++ b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py @@ -1,11 +1,28 @@ -_base_ = ['../../_base_/default_runtime.py'] +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) # model settings num_frames = 8 model = dict( - type='Recognizer3D', + type=Recognizer3D, backbone=dict( - type='UniFormerV2', + type=UniFormerV2, input_resolution=224, patch_size=14, width=1024, @@ -25,13 +42,13 @@ drop_path_rate=0., mlp_dropout=[0.5, 0.5, 0.5, 0.5]), cls_head=dict( - type='TimeSformerHead', + type=TimeSformerHead, dropout_ratio=0.5, num_classes=400, in_channels=1024, average_clips='prob'), data_preprocessor=dict( - type='ActionDataPreprocessor', + type=ActionDataPreprocessor, mean=[114.75, 114.75, 114.75], std=[57.375, 57.375, 57.375], format_shape='NCTHW')) @@ -42,22 +59,20 @@ ann_file_test = 'data/k400/val.csv' test_pipeline = [ - dict(type='DecordInit'), - dict( - type='UniformSample', clip_len=num_frames, num_clips=4, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 224)), - dict(type='ThreeCrop', crop_size=224), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] test_dataloader = dict( batch_size=32, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict( type=dataset_type, ann_file=ann_file_test, @@ -66,5 +81,5 @@ test_mode=True, delimiter=',')) -test_evaluator = dict(type='AccMetric') -test_cfg = dict(type='TestLoop') +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop) diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb.py index ea6eea9a9a..dec1a65b6b 100644 --- a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb.py +++ b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb.py @@ -1,11 +1,28 @@ -_base_ = ['../../_base_/default_runtime.py'] +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) # model settings num_frames = 8 model = dict( - type='Recognizer3D', + type=Recognizer3D, backbone=dict( - type='UniFormerV2', + type=UniFormerV2, input_resolution=224, patch_size=14, width=1024, @@ -25,13 +42,13 @@ drop_path_rate=0., mlp_dropout=[0.5, 0.5, 0.5, 0.5]), cls_head=dict( - type='TimeSformerHead', + type=TimeSformerHead, dropout_ratio=0.5, num_classes=600, in_channels=1024, average_clips='prob'), data_preprocessor=dict( - type='ActionDataPreprocessor', + type=ActionDataPreprocessor, mean=[114.75, 114.75, 114.75], std=[57.375, 57.375, 57.375], format_shape='NCTHW')) @@ -42,22 +59,20 @@ ann_file_test = 'data/k600/val.csv' test_pipeline = [ - dict(type='DecordInit'), - dict( - type='UniformSample', clip_len=num_frames, num_clips=4, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 224)), - dict(type='ThreeCrop', crop_size=224), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] test_dataloader = dict( batch_size=32, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict( type=dataset_type, ann_file=ann_file_test, @@ -66,5 +81,5 @@ test_mode=True, delimiter=',')) -test_evaluator = dict(type='AccMetric') -test_cfg = dict(type='TestLoop') +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop) diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py index b68593afa3..8bc6cb4407 100644 --- a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py +++ b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py @@ -1,11 +1,28 @@ -_base_ = ['../../_base_/default_runtime.py'] +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) # model settings num_frames = 8 model = dict( - type='Recognizer3D', + type=Recognizer3D, backbone=dict( - type='UniFormerV2', + type=UniFormerV2, input_resolution=224, patch_size=14, width=1024, @@ -25,13 +42,13 @@ drop_path_rate=0., mlp_dropout=[0.5, 0.5, 0.5, 0.5]), cls_head=dict( - type='TimeSformerHead', + type=TimeSformerHead, dropout_ratio=0.5, num_classes=700, in_channels=1024, average_clips='prob'), data_preprocessor=dict( - type='ActionDataPreprocessor', + type=ActionDataPreprocessor, mean=[114.75, 114.75, 114.75], std=[57.375, 57.375, 57.375], format_shape='NCTHW')) @@ -42,29 +59,27 @@ ann_file_test = 'data/k700/val.csv' test_pipeline = [ - dict(type='DecordInit'), - dict( - type='UniformSample', clip_len=num_frames, num_clips=4, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 224)), - dict(type='ThreeCrop', crop_size=224), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] test_dataloader = dict( batch_size=32, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_test, data_prefix=dict(video=data_root_val), pipeline=test_pipeline, test_mode=True, delimiter=',')) -test_evaluator = dict(type='AccMetric') -test_cfg = dict(type='TestLoop') +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop) diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-pre_u8_kinetics710-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-pre_u8_kinetics710-rgb.py index 46a60758d8..c85b802da4 100644 --- a/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-pre_u8_kinetics710-rgb.py +++ b/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-pre_u8_kinetics710-rgb.py @@ -1,11 +1,28 @@ -_base_ = ['../../_base_/default_runtime.py'] +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) # model settings num_frames = 8 model = dict( - type='Recognizer3D', + type=Recognizer3D, backbone=dict( - type='UniFormerV2', + type=UniFormerV2, input_resolution=224, patch_size=14, width=1024, @@ -25,13 +42,13 @@ drop_path_rate=0., mlp_dropout=[0.5, 0.5, 0.5, 0.5]), cls_head=dict( - type='TimeSformerHead', + type=TimeSformerHead, dropout_ratio=0.5, num_classes=710, in_channels=1024, average_clips='prob'), data_preprocessor=dict( - type='ActionDataPreprocessor', + type=ActionDataPreprocessor, mean=[114.75, 114.75, 114.75], std=[57.375, 57.375, 57.375], format_shape='NCTHW')) diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics400-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics400-rgb.py index 5385c2aa07..373fe9f3bf 100644 --- a/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics400-rgb.py +++ b/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics400-rgb.py @@ -1,11 +1,28 @@ -_base_ = ['../../_base_/default_runtime.py'] +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) # model settings num_frames = 32 model = dict( - type='Recognizer3D', + type=Recognizer3D, backbone=dict( - type='UniFormerV2', + type=UniFormerV2, input_resolution=336, patch_size=14, width=1024, @@ -25,13 +42,13 @@ drop_path_rate=0., mlp_dropout=[0.5, 0.5, 0.5, 0.5]), cls_head=dict( - type='TimeSformerHead', + type=TimeSformerHead, dropout_ratio=0.5, num_classes=400, in_channels=1024, average_clips='prob'), data_preprocessor=dict( - type='ActionDataPreprocessor', + type=ActionDataPreprocessor, mean=[114.75, 114.75, 114.75], std=[57.375, 57.375, 57.375], format_shape='NCTHW')) @@ -42,29 +59,27 @@ ann_file_test = 'data/k400/val.csv' test_pipeline = [ - dict(type='DecordInit'), - dict( - type='UniformSample', clip_len=num_frames, num_clips=2, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 336)), - dict(type='ThreeCrop', crop_size=336), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=2, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 336)), + dict(type=ThreeCrop, crop_size=336), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] test_dataloader = dict( batch_size=4, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_test, data_prefix=dict(video=data_root_val), pipeline=test_pipeline, test_mode=True, delimiter=',')) -test_evaluator = dict(type='AccMetric') -test_cfg = dict(type='TestLoop') +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop) diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics600-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics600-rgb.py index 3e495771bc..3f1964d2c7 100644 --- a/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics600-rgb.py +++ b/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics600-rgb.py @@ -1,11 +1,28 @@ -_base_ = ['../../_base_/default_runtime.py'] +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) # model settings num_frames = 32 model = dict( - type='Recognizer3D', + type=Recognizer3D, backbone=dict( - type='UniFormerV2', + type=UniFormerV2, input_resolution=336, patch_size=14, width=1024, @@ -25,13 +42,13 @@ drop_path_rate=0., mlp_dropout=[0.5, 0.5, 0.5, 0.5]), cls_head=dict( - type='TimeSformerHead', + type=TimeSformerHead, dropout_ratio=0.5, num_classes=600, in_channels=1024, average_clips='prob'), data_preprocessor=dict( - type='ActionDataPreprocessor', + type=ActionDataPreprocessor, mean=[114.75, 114.75, 114.75], std=[57.375, 57.375, 57.375], format_shape='NCTHW')) @@ -42,29 +59,27 @@ ann_file_test = 'data/k600/val.csv' test_pipeline = [ - dict(type='DecordInit'), - dict( - type='UniformSample', clip_len=num_frames, num_clips=2, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 336)), - dict(type='ThreeCrop', crop_size=336), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=2, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 336)), + dict(type=ThreeCrop, crop_size=336), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] test_dataloader = dict( batch_size=4, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_test, data_prefix=dict(video=data_root_val), pipeline=test_pipeline, test_mode=True, delimiter=',')) -test_evaluator = dict(type='AccMetric') -test_cfg = dict(type='TestLoop') +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop) diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics700-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics700-rgb.py index 9a09934ca0..0ef24778f9 100644 --- a/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics700-rgb.py +++ b/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics700-rgb.py @@ -1,11 +1,28 @@ -_base_ = ['../../_base_/default_runtime.py'] +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) # model settings num_frames = 32 model = dict( - type='Recognizer3D', + type=Recognizer3D, backbone=dict( - type='UniFormerV2', + type=UniFormerV2, input_resolution=336, patch_size=14, width=1024, @@ -25,13 +42,13 @@ drop_path_rate=0., mlp_dropout=[0.5, 0.5, 0.5, 0.5]), cls_head=dict( - type='TimeSformerHead', + type=TimeSformerHead, dropout_ratio=0.5, num_classes=700, in_channels=1024, average_clips='prob'), data_preprocessor=dict( - type='ActionDataPreprocessor', + type=ActionDataPreprocessor, mean=[114.75, 114.75, 114.75], std=[57.375, 57.375, 57.375], format_shape='NCTHW')) @@ -42,29 +59,27 @@ ann_file_test = 'data/k700/val.csv' test_pipeline = [ - dict(type='DecordInit'), - dict( - type='UniformSample', clip_len=num_frames, num_clips=2, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 336)), - dict(type='ThreeCrop', crop_size=336), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=2, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 336)), + dict(type=ThreeCrop, crop_size=336), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] test_dataloader = dict( batch_size=4, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_test, data_prefix=dict(video=data_root_val), pipeline=test_pipeline, test_mode=True, delimiter=',')) -test_evaluator = dict(type='AccMetric') -test_cfg = dict(type='TestLoop') +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop) diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb.py index e47b8a7148..798a215bd1 100644 --- a/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb.py +++ b/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb.py @@ -1,11 +1,28 @@ -_base_ = ['../../_base_/default_runtime.py'] +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) # model settings num_frames = 32 model = dict( - type='Recognizer3D', + type=Recognizer3D, backbone=dict( - type='UniFormerV2', + type=UniFormerV2, input_resolution=336, patch_size=14, width=1024, @@ -25,13 +42,13 @@ drop_path_rate=0., mlp_dropout=[0.5, 0.5, 0.5, 0.5]), cls_head=dict( - type='TimeSformerHead', + type=TimeSformerHead, dropout_ratio=0.5, num_classes=710, in_channels=1024, average_clips='prob'), data_preprocessor=dict( - type='ActionDataPreprocessor', + type=ActionDataPreprocessor, mean=[114.75, 114.75, 114.75], std=[57.375, 57.375, 57.375], format_shape='NCTHW')) diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py index 19af3d1eac..2687bec030 100644 --- a/configs/recognition/uniformerv2/uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py +++ b/configs/recognition/uniformerv2/uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py @@ -1,11 +1,28 @@ -_base_ = ['../../_base_/default_runtime.py'] +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) # model settings num_frames = 8 model = dict( - type='Recognizer3D', + type=Recognizer3D, backbone=dict( - type='UniFormerV2', + type=UniFormerV2, input_resolution=224, patch_size=14, width=1024, @@ -25,13 +42,13 @@ drop_path_rate=0., mlp_dropout=[0.5, 0.5, 0.5, 0.5]), cls_head=dict( - type='TimeSformerHead', + type=TimeSformerHead, dropout_ratio=0.5, num_classes=339, in_channels=1024, average_clips='prob'), data_preprocessor=dict( - type='ActionDataPreprocessor', + type=ActionDataPreprocessor, mean=[114.75, 114.75, 114.75], std=[57.375, 57.375, 57.375], format_shape='NCTHW')) @@ -42,29 +59,27 @@ ann_file_test = 'data/mit_v1/val.csv' test_pipeline = [ - dict(type='DecordInit'), - dict( - type='UniformSample', clip_len=num_frames, num_clips=4, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 224)), - dict(type='ThreeCrop', crop_size=224), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] test_dataloader = dict( batch_size=32, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_test, data_prefix=dict(video=data_root_val), pipeline=test_pipeline, test_mode=True, delimiter=' ')) -test_evaluator = dict(type='AccMetric') -test_cfg = dict(type='TestLoop') +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop) diff --git a/configs/recognition/uniformerv2/uniformerv2-large-p16-res336_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py b/configs/recognition/uniformerv2/uniformerv2-large-p16-res336_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py index 4bd6537603..bddc27e89a 100644 --- a/configs/recognition/uniformerv2/uniformerv2-large-p16-res336_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py +++ b/configs/recognition/uniformerv2/uniformerv2-large-p16-res336_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py @@ -1,11 +1,28 @@ -_base_ = ['../../_base_/default_runtime.py'] +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) # model settings num_frames = 8 model = dict( - type='Recognizer3D', + type=Recognizer3D, backbone=dict( - type='UniFormerV2', + type=UniFormerV2, input_resolution=336, patch_size=14, width=1024, @@ -25,13 +42,13 @@ drop_path_rate=0., mlp_dropout=[0.5, 0.5, 0.5, 0.5]), cls_head=dict( - type='TimeSformerHead', + type=TimeSformerHead, dropout_ratio=0.5, num_classes=339, in_channels=1024, average_clips='prob'), data_preprocessor=dict( - type='ActionDataPreprocessor', + type=ActionDataPreprocessor, mean=[114.75, 114.75, 114.75], std=[57.375, 57.375, 57.375], format_shape='NCTHW')) @@ -42,29 +59,27 @@ ann_file_test = 'data/mit_v1/val.csv' test_pipeline = [ - dict(type='DecordInit'), - dict( - type='UniformSample', clip_len=num_frames, num_clips=4, - test_mode=True), - dict(type='DecordDecode'), - dict(type='Resize', scale=(-1, 336)), - dict(type='ThreeCrop', crop_size=336), - dict(type='FormatShape', input_format='NCTHW'), - dict(type='PackActionInputs') + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 336)), + dict(type=ThreeCrop, crop_size=336), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) ] test_dataloader = dict( batch_size=8, num_workers=8, persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), + sampler=dict(type=DefaultSampler, shuffle=False), dataset=dict( - type=dataset_type, + type=VideoDataset, ann_file=ann_file_test, data_prefix=dict(video=data_root_val), pipeline=test_pipeline, test_mode=True, delimiter=' ')) -test_evaluator = dict(type='AccMetric') -test_cfg = dict(type='TestLoop') +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop)
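
All of the diffs above apply the same migration: the `_base_` string inheritance and registry-string `type=` fields are replaced by `read_base()` plus direct imports, so each config becomes an ordinary Python module whose `type=` values are the classes themselves. For reference, a minimal sketch of how one of these converted configs can be loaded and evaluated, assuming a recent mmengine that auto-detects the pure-Python config style; the config path, work_dir, and checkpoint below are illustrative placeholders, not values taken from this patch.

# Minimal sketch: load a converted pure-Python config and run its test loop.
# Paths are examples only; adjust them to your checkout and checkpoint.
from mmengine.config import Config
from mmengine.runner import Runner

cfg = Config.fromfile(
    'configs/recognition/uniformerv2/'
    'uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb.py')
cfg.work_dir = 'work_dirs/uniformerv2_l14_u32_k400'          # output directory (placeholder)
cfg.load_from = 'checkpoints/uniformerv2_l14_u32_k400.pth'   # checkpoint path (placeholder)

runner = Runner.from_cfg(cfg)  # builds the test dataloader, evaluator and TestLoop from the config
runner.test()

The stock tools/test.py entry point should accept these configs unchanged, since it follows the same Config.fromfile / Runner.from_cfg path.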