diff --git a/configs/benchmarks/classification/imagenet/swin-base_ft-8xb256-coslr-100e_in1k-224.py b/configs/benchmarks/classification/imagenet/swin-base_ft-8xb256-coslr-100e_in1k-224.py index e67b4b33b..3e0740109 100644 --- a/configs/benchmarks/classification/imagenet/swin-base_ft-8xb256-coslr-100e_in1k-224.py +++ b/configs/benchmarks/classification/imagenet/swin-base_ft-8xb256-coslr-100e_in1k-224.py @@ -6,8 +6,9 @@ img_size=224, stage_cfgs=dict(block_cfgs=dict(window_size=7)))) # train pipeline +file_client_args = dict(backend='disk') train_pipeline = [ - dict(type='LoadImageFromFile'), + dict(type='LoadImageFromFile', file_client_args=file_client_args), dict( type='RandomResizedCrop', scale=224, @@ -35,7 +36,7 @@ # test pipeline test_pipeline = [ - dict(type='LoadImageFromFile'), + dict(type='LoadImageFromFile', file_client_args=file_client_args), dict( type='mmcls.ResizeEdge', scale=256, @@ -46,5 +47,11 @@ dict(type='PackClsInputs') ] -train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) -val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +train_dataloader = dict( + dataset=dict(pipeline=train_pipeline), + collate_fn=dict(type='default_collate'), + pin_memory=True) +val_dataloader = dict( + dataset=dict(pipeline=test_pipeline), + collate_fn=dict(type='default_collate'), + pin_memory=True) diff --git a/configs/benchmarks/classification/imagenet/swin-large_ft-8xb256-coslr-ws14-100e_in1k-224.py b/configs/benchmarks/classification/imagenet/swin-large_ft-8xb256-coslr-ws14-100e_in1k-224.py new file mode 100644 index 000000000..45de9c606 --- /dev/null +++ b/configs/benchmarks/classification/imagenet/swin-large_ft-8xb256-coslr-ws14-100e_in1k-224.py @@ -0,0 +1,33 @@ +_base_ = 'swin-base_ft-8xb256-coslr-100e_in1k-224.py' + +# model settings +model = dict( + backbone=dict( + arch='L', + img_size=224, + drop_path_rate=0.2, + stage_cfgs=dict(block_cfgs=dict(window_size=14)), + pad_small_map=True), + head=dict(in_channels=1536)) + +# schedule settings 
+optim_wrapper = dict(optimizer=dict(layer_decay_rate=0.7)) + +# learning rate scheduler +param_scheduler = [ + dict( + type='LinearLR', + start_factor=2.5e-7 / 1.25e-3, + by_epoch=True, + begin=0, + end=20, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=100, + eta_min=1e-6, + by_epoch=True, + begin=20, + end=100, + convert_to_iter_based=True) +] diff --git a/configs/selfsup/_base_/datasets/imagenet_simmim.py b/configs/selfsup/_base_/datasets/imagenet_simmim.py index e3baac40e..ba44acd1b 100644 --- a/configs/selfsup/_base_/datasets/imagenet_simmim.py +++ b/configs/selfsup/_base_/datasets/imagenet_simmim.py @@ -3,7 +3,6 @@ dataset_type = 'mmcls.ImageNet' data_root = 'data/imagenet/' file_client_args = dict(backend='disk') - train_pipeline = [ dict(type='LoadImageFromFile', file_client_args=file_client_args), dict( diff --git a/configs/selfsup/simmim/README.md b/configs/selfsup/simmim/README.md index f76017052..b13169163 100644 --- a/configs/selfsup/simmim/README.md +++ b/configs/selfsup/simmim/README.md @@ -16,10 +16,76 @@ This paper presents SimMIM, a simple framework for masked image modeling. We sim Here, we report the results of the model, and more results will be coming soon. 
-| Backbone | Pre-train epoch | Pre-train resolution | Fine-tuning resolution | Fine-tuning Top-1 | Pre-train Config | Fine-tuning Config | Download | -| :-------: | :-------------: | :------------------: | :--------------------: | :---------------: | :----------------------------------------------------------------------------------------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| Swin-Base | 100 | 192 | 192 | 82.9 | [config](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simmim/simmim_swin-base_16xb128-coslr-100e_in1k-192.py) | [config](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/swin-base_ft-8xb256-coslr-100e_in1k.py) | [model](https://download.openmmlab.com/mmselfsup/simmim/simmim_swin-base_16xb128-coslr-100e_in1k-192_20220316-1d090125.pth) \| [log](https://download.openmmlab.com/mmselfsup/simmim/simmim_swin-base_16xb128-coslr-100e_in1k-192_20220316-1d090125.log.json) | -| Swin-Base | 100 | 192 | 224 | 83.5 | [config](https://github.com/open-mmlab/mmselfsup/blob/master/configs/selfsup/simmim/simmim_swin-base_16xb128-coslr-100e_in1k-192.py) | [config](https://github.com/open-mmlab/mmselfsup/blob/master/configs/benchmarks/classification/imagenet/swin-base_ft-8xb256-coslr-100e_in1k-224.py) | [model](https://download.openmmlab.com/mmselfsup/simmim/simmim_swin-base_16xb128-coslr-100e_in1k-192_20220316-1d090125.pth) \| [log](https://download.openmmlab.com/mmselfsup/simmim/simmim_swin-base_16xb128-coslr-100e_in1k-192_20220316-1d090125.log.json) | + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
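<table>
<thead>
<tr><th rowspan="2">Algorithm</th><th rowspan="2">Backbone</th><th rowspan="2">Epoch</th><th rowspan="2">Fine-tuning Size</th><th rowspan="2">Batch Size</th><th colspan="2">Results (Top-1 %)</th><th colspan="3">Links</th></tr>
<tr><th>Linear Eval</th><th>Fine-tuning</th><th>Pretrain</th><th>Linear Eval</th><th>Fine-tuning</th></tr>
</thead>
<tbody>
<tr><td>SimMIM</td><td>Swin-base</td><td>100</td><td>192</td><td>2048</td><td>/</td><td>82.7</td><td>config | model | log</td><td>/</td><td>config | model | log</td></tr>
<tr><td>SimMIM</td><td>Swin-base</td><td>100</td><td>224</td><td>2048</td><td>/</td><td>83.5</td><td>config | model | log</td><td>/</td><td>config | model | log</td></tr>
<tr><td>SimMIM</td><td>Swin-base</td><td>800</td><td>224</td><td>2048</td><td>/</td><td>83.8</td><td>config | model | log</td><td>/</td><td>config | model | log</td></tr>
<tr><td>SimMIM</td><td>Swin-large</td><td>800</td><td>224</td><td>2048</td><td>/</td><td>84.8</td><td>config | model | log</td><td>/</td><td>config | model | log</td></tr>
</tbody>
</table>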
## Citation diff --git a/configs/selfsup/simmim/simmim_swin-base_16xb128-amp-coslr-800e_in1k-192.py b/configs/selfsup/simmim/simmim_swin-base_16xb128-amp-coslr-800e_in1k-192.py new file mode 100644 index 000000000..b774d4423 --- /dev/null +++ b/configs/selfsup/simmim/simmim_swin-base_16xb128-amp-coslr-800e_in1k-192.py @@ -0,0 +1,27 @@ +_base_ = 'simmim_swin-base_16xb128-amp-coslr-100e_in1k-192.py' + +# optimizer wrapper +optimizer = dict( + type='AdamW', lr=1e-4 * 2048 / 512, betas=(0.9, 0.999), eps=1e-8) +optim_wrapper = dict(optimizer=optimizer) + +# learning rate scheduler +param_scheduler = [ + dict( + type='LinearLR', + start_factor=5e-7 / 1e-4, + by_epoch=True, + begin=0, + end=10, + convert_to_iter_based=True), + dict( + type='MultiStepLR', + milestones=[700], + by_epoch=True, + begin=10, + end=800, + convert_to_iter_based=True) +] + +# schedule +train_cfg = dict(max_epochs=800) diff --git a/configs/selfsup/simmim/simmim_swin-large_16xb128-amp-coslr-800e_in1k-192.py b/configs/selfsup/simmim/simmim_swin-large_16xb128-amp-coslr-800e_in1k-192.py new file mode 100644 index 000000000..4f59cc36b --- /dev/null +++ b/configs/selfsup/simmim/simmim_swin-large_16xb128-amp-coslr-800e_in1k-192.py @@ -0,0 +1,13 @@ +_base_ = 'simmim_swin-base_16xb128-amp-coslr-800e_in1k-192.py' + +# model settings +model = dict( + backbone=dict( + arch='L', + stage_cfgs=dict(block_cfgs=dict(window_size=12)), + pad_small_map=True), + neck=dict(type='SimMIMNeck', in_channels=192 * 2**3, encoder_stride=32), + head=dict( + type='SimMIMHead', + patch_size=4, + loss=dict(type='SimMIMReconstructionLoss', encoder_in_channels=3))) diff --git a/docs/en/model_zoo.md b/docs/en/model_zoo.md index dd7f31309..9a579006d 100644 --- a/docs/en/model_zoo.md +++ b/docs/en/model_zoo.md @@ -332,8 +332,8 @@ ImageNet has multiple versions, but the most commonly used one is ILSVRC 2012. 
T config | model | log - SimMIM - Swin-base + SimMIM + Swin-base-FT192 100 2048 / @@ -342,5 +342,35 @@ ImageNet has multiple versions, but the most commonly used one is ILSVRC 2012. T / config | model | log + + Swin-base-FT224 + 100 + 2048 + / + 83.5 + config | model | log + / + config | model | log + + + Swin-base-FT224 + 800 + 2048 + / + 83.8 + config | model | log + / + config | model | log + + + Swin-large-FT224 + 800 + 2048 + / + 84.8 + config | model | log + / + config | model | log + + diff --git a/mmselfsup/models/backbones/simmim_swin.py b/mmselfsup/models/backbones/simmim_swin.py index 56c6fe5b7..1b0d4f3f9 100644 --- a/mmselfsup/models/backbones/simmim_swin.py +++ b/mmselfsup/models/backbones/simmim_swin.py @@ -42,6 +42,11 @@ class SimMIMSwinTransformer(SwinTransformer): stage. Defaults to empty dict. patch_cfg (dict): Extra config dict for patch embedding. Defaults to empty dict. + pad_small_map (bool): If True, pad the small feature map to the window + size, which is commonly used in detection and segmentation. If False, + avoid shifting window and shrink the window size to the size of + feature map, which is commonly used in classification. + Defaults to False. init_cfg (dict, optional): The Config for initialization. Defaults to None. """ @@ -60,6 +65,7 @@ def __init__(self, norm_cfg: dict = dict(type='LN'), stage_cfgs: Union[Sequence, dict] = dict(), patch_cfg: dict = dict(), + pad_small_map: bool = False, init_cfg: Optional[dict] = None) -> None: super().__init__( arch=arch, @@ -75,6 +81,7 @@ def __init__(self, norm_cfg=norm_cfg, stage_cfgs=stage_cfgs, patch_cfg=patch_cfg, + pad_small_map=pad_small_map, init_cfg=init_cfg) self.mask_token = nn.Parameter(torch.zeros(1, 1, self.embed_dims))