-
Notifications
You must be signed in to change notification settings - Fork 425
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* [Feature] Add MoCo v3 (#194) * [Feature] add position embedding function * [Fature] modify nonlinear neck for vit backbone * [Feature] add mocov3 head * [Feature] modify cls_head for vit backbone * [Feature] add ViT backbone * [Feature] add mocov3 algorithm * [Docs] revise BYOL hook docstring * [Feature] add mocov3 vit small config files * [Feature] add mocov3 vit small linear eval config files * [Fix] solve conflict * [Fix] add mmcls * [Fix] fix docstring format * [Fix] fix isort * [Fix] add mmcls to runtime requirements * [Feature] remove duplicated codes * [Feature] add mocov3 related unit test * [Feature] revise position embedding function * [Feature] add UT codes * [Docs] add README.md * [Docs] add model links and results to model zoo * [Docs] fix model links * [Docs] add metafile * [Docs] modify install.md and add mmcls requirements * [Docs] modify description * [Fix] using specific arch name `mocov3-small` rather than general arch name `small` * [Fix] add mmcls * [Fix] fix arch name * [Feature] change name to `MoCoV3` * [Fix] fix unit test bug * [Feature] change `BYOLHook` name to `MomentumUpdateHook` * [Feature] change name to MoCoV3 * [Docs] modify description Co-authored-by: fangyixiao18 <fangyx18@hotmail.com> Co-authored-by: Yixiao Fang <36138628+fangyixiao18@users.noreply.github.com> * [Docs] update model zoo results (#195) * Bump version to v0.6.0 (#198) * [Docs] update model zoo results * Bump version to v0.6.0 Co-authored-by: fangyixiao18 <fangyx18@hotmail.com> Co-authored-by: Yixiao Fang <36138628+fangyixiao18@users.noreply.github.com>
- Loading branch information
1 parent
6a89f45
commit fc69e38
Showing
38 changed files
with
979 additions
and
118 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
15 changes: 15 additions & 0 deletions
15
configs/benchmarks/classification/_base_/models/vit-small-p16.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
# model settings | ||
model = dict( | ||
type='Classification', | ||
backbone=dict( | ||
type='VisionTransformer', | ||
arch='mocov3-small', # embed_dim = 384 | ||
img_size=224, | ||
patch_size=16, | ||
stop_grad_conv1=True), | ||
head=dict( | ||
type='ClsHead', | ||
in_channels=384, | ||
num_classes=1000, | ||
vit_backbone=True, | ||
)) |
23 changes: 23 additions & 0 deletions
23
configs/benchmarks/classification/imagenet/vit-small-p16_8xb128-coslr-90e_in1k.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
_base_ = [ | ||
'../_base_/models/vit-small-p16.py', | ||
'../_base_/datasets/imagenet.py', | ||
'../_base_/schedules/sgd_coslr-100e.py', | ||
'../_base_/default_runtime.py', | ||
] | ||
# MoCo v3 linear probing setting | ||
|
||
model = dict(backbone=dict(frozen_stages=12, norm_eval=True)) | ||
|
||
# dataset summary | ||
data = dict(imgs_per_gpu=128) # total 128*8=1024, 8 GPU linear cls | ||
|
||
# optimizer | ||
optimizer = dict(type='SGD', lr=12, momentum=0.9, weight_decay=0.) | ||
|
||
# runtime settings | ||
runner = dict(type='EpochBasedRunner', max_epochs=90) | ||
|
||
# `max_keep_ckpts` controls the maximum number of checkpoint files kept in | ||
# your work_dirs: if it is 3, when CheckpointHook (in mmcv) saves the 4th | ||
# checkpoint, it removes the oldest one so that only 3 checkpoints remain | ||
checkpoint_config = dict(interval=10, max_keep_ckpts=3) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
# dataset settings | ||
data_source = 'ImageNet' | ||
dataset_type = 'MultiViewDataset' | ||
img_norm_cfg = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) | ||
train_pipeline1 = [ | ||
dict(type='RandomResizedCrop', size=224, scale=(0.2, 1.)), | ||
dict( | ||
type='RandomAppliedTrans', | ||
transforms=[ | ||
dict( | ||
type='ColorJitter', | ||
brightness=0.4, | ||
contrast=0.4, | ||
saturation=0.2, | ||
hue=0.1) | ||
], | ||
p=0.8), | ||
dict(type='RandomGrayscale', p=0.2), | ||
dict(type='GaussianBlur', sigma_min=0.1, sigma_max=2.0, p=1.), | ||
dict(type='Solarization', p=0.), | ||
dict(type='RandomHorizontalFlip'), | ||
] | ||
train_pipeline2 = [ | ||
dict(type='RandomResizedCrop', size=224, scale=(0.2, 1.)), | ||
dict( | ||
type='RandomAppliedTrans', | ||
transforms=[ | ||
dict( | ||
type='ColorJitter', | ||
brightness=0.4, | ||
contrast=0.4, | ||
saturation=0.2, | ||
hue=0.1) | ||
], | ||
p=0.8), | ||
dict(type='RandomGrayscale', p=0.2), | ||
dict(type='GaussianBlur', sigma_min=0.1, sigma_max=2.0, p=0.1), | ||
dict(type='Solarization', p=0.2), | ||
dict(type='RandomHorizontalFlip'), | ||
] | ||
|
||
# prefetch | ||
prefetch = False | ||
if not prefetch: | ||
train_pipeline1.extend( | ||
[dict(type='ToTensor'), | ||
dict(type='Normalize', **img_norm_cfg)]) | ||
train_pipeline2.extend( | ||
[dict(type='ToTensor'), | ||
dict(type='Normalize', **img_norm_cfg)]) | ||
|
||
# dataset summary | ||
data = dict( | ||
imgs_per_gpu=256, # 256*16(gpu)=4096 | ||
workers_per_gpu=4, | ||
train=dict( | ||
type=dataset_type, | ||
data_source=dict( | ||
type=data_source, | ||
data_prefix='data/imagenet/train', | ||
ann_file='data/imagenet/meta/train.txt', | ||
), | ||
num_views=[1, 1], | ||
pipelines=[train_pipeline1, train_pipeline2], | ||
prefetch=prefetch, | ||
)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
# model settings | ||
model = dict( | ||
type='MoCoV3', | ||
base_momentum=0.99, | ||
backbone=dict( | ||
type='VisionTransformer', | ||
arch='mocov3-small', # embed_dim = 384 | ||
img_size=224, | ||
patch_size=16, | ||
stop_grad_conv1=True), | ||
neck=dict( | ||
type='NonLinearNeck', | ||
in_channels=384, | ||
hid_channels=4096, | ||
out_channels=256, | ||
num_layers=3, | ||
with_bias=False, | ||
with_last_bn=True, | ||
with_last_bn_affine=False, | ||
with_last_bias=False, | ||
with_avg_pool=False, | ||
vit_backbone=True), | ||
head=dict( | ||
type='MoCoV3Head', | ||
predictor=dict( | ||
type='NonLinearNeck', | ||
in_channels=256, | ||
hid_channels=4096, | ||
out_channels=256, | ||
num_layers=2, | ||
with_bias=False, | ||
with_last_bn=True, | ||
with_last_bn_affine=False, | ||
with_last_bias=False, | ||
with_avg_pool=False), | ||
temperature=0.2)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
# optimizer | ||
optimizer = dict(type='AdamW', lr=6e-4, weight_decay=0.1) | ||
optimizer_config = dict() # grad_clip, coalesce, bucket_size_mb | ||
|
||
# learning policy | ||
lr_config = dict( | ||
policy='CosineAnnealing', | ||
by_epoch=False, | ||
min_lr=0., | ||
warmup='linear', | ||
warmup_iters=40, | ||
warmup_ratio=1e-4, # cannot be 0 | ||
warmup_by_epoch=True) | ||
|
||
# runtime settings | ||
runner = dict(type='EpochBasedRunner', max_epochs=300) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
# MoCo v3 | ||
|
||
> [An Empirical Study of Training Self-Supervised Vision Transformers](https://arxiv.org/abs/2104.02057) | ||
<!-- [ALGORITHM] --> | ||
|
||
## Abstract | ||
|
||
This paper does not describe a novel method. Instead, it studies a straightforward, incremental, yet must-know baseline given the recent progress in computer vision: self-supervised learning for Vision Transformers (ViT). While the training recipes for standard convolutional networks have been highly mature and robust, the recipes for ViT are yet to be built, especially in the self-supervised scenarios where training becomes more challenging. In this work, we go back to basics and investigate the effects of several fundamental components for training self-supervised ViT. We observe that instability is a major issue that degrades accuracy, and it can be hidden by apparently good results. We reveal that these results are indeed partial failure, and they can be improved when training is made more stable. We benchmark ViT results in MoCo v3 and several other self-supervised frameworks, with ablations in various aspects. We discuss the currently positive evidence as well as challenges and open questions. We hope that this work will provide useful data points and experience for future research. | ||
|
||
<div align="center"> | ||
<img src="https://user-images.githubusercontent.com/36138628/151305362-e6e8ea35-b3b8-45f6-8819-634e67083218.png" width="500" /> | ||
</div> | ||
|
||
## Results and Models | ||
|
||
**Back to [model_zoo.md](../../../docs/en/model_zoo.md) to download models.** | ||
|
||
On this page, we provide as many benchmarks as possible to evaluate our pre-trained models. Unless otherwise mentioned, all models were trained on the ImageNet-1k dataset. | ||
|
||
### Classification | ||
|
||
The classification benchmarks include 4 downstream task datasets: **VOC**, **ImageNet**, **iNaturalist2018** and **Places205**. If not specified, the results are Top-1 accuracy (%). | ||
|
||
#### ImageNet Linear Evaluation | ||
|
||
The **Linear Evaluation** result is obtained by training a linear head on top of the pre-trained backbone. Please refer to [vit-small-p16_8xb128-coslr-90e_in1k](../../benchmarks/classification/imagenet/vit-small-p16_8xb128-coslr-90e_in1k.py) for details of the config. | ||
|
||
| Self-Supervised Config | Linear Evaluation | | ||
| ----------------------------------------------------------------------------------------------------------------- | ----------------- | | ||
| [mocov3_vit-small-p16_32xb128-fp16-coslr-300e_in1k-224](mocov3_vit-small-p16_32xb128-fp16-coslr-300e_in1k-224.py) | 73.07 | | ||
|
||
## Citation | ||
|
||
```bibtex | ||
@Article{chen2021mocov3, | ||
author = {Xinlei Chen* and Saining Xie* and Kaiming He}, | ||
title = {An Empirical Study of Training Self-Supervised Vision Transformers}, | ||
journal = {arXiv preprint arXiv:2104.02057}, | ||
year = {2021}, | ||
} | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
Collections: | ||
- Name: MoCoV3 | ||
Metadata: | ||
Training Data: ImageNet-1k | ||
Training Techniques: | ||
- AdamW | ||
Training Resources: 32x V100 GPUs | ||
Architecture: | ||
- ViT | ||
- MoCo | ||
Paper: | ||
URL: https://arxiv.org/abs/2104.02057 | ||
Title: "An Empirical Study of Training Self-Supervised Vision Transformers" | ||
README: configs/selfsup/mocov3/README.md | ||
|
||
Models: | ||
- Name: mocov3_vit-small-p16_32xb128-fp16-coslr-300e_in1k-224 | ||
In Collection: MoCoV3 | ||
Metadata: | ||
Epochs: 300 | ||
Batch Size: 4096 | ||
Results: | ||
- Task: Self-Supervised Image Classification | ||
Dataset: ImageNet-1k | ||
Metrics: | ||
Top 1 Accuracy: 73.07 | ||
Config: configs/selfsup/mocov3/mocov3_vit-small-p16_32xb128-fp16-coslr-300e_in1k-224.py | ||
Weights: https://download.openmmlab.com/mmselfsup/moco/mocov3_vit-small-p16_32xb128-fp16-coslr-300e_in1k-224_20220127-e9332db2.pth |
75 changes: 75 additions & 0 deletions
75
configs/selfsup/mocov3/mocov3_vit-small-p16_32xb128-fp16-coslr-300e_in1k-224.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
_base_ = [ | ||
'../_base_/models/mocov3_vit-small-p16.py', | ||
'../_base_/datasets/imagenet_mocov3.py', | ||
'../_base_/schedules/adamw_coslr-300e_in1k.py', | ||
'../_base_/default_runtime.py', | ||
] | ||
|
||
# dataset settings | ||
img_norm_cfg = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) | ||
# the difference between the ResNet50 and ViT pipelines is the `scale` of | ||
# `RandomResizedCrop`: the ViT pipeline uses `scale=(0.08, 1.)` | ||
train_pipeline1 = [ | ||
dict(type='RandomResizedCrop', size=224, scale=(0.08, 1.)), | ||
dict( | ||
type='RandomAppliedTrans', | ||
transforms=[ | ||
dict( | ||
type='ColorJitter', | ||
brightness=0.4, | ||
contrast=0.4, | ||
saturation=0.2, | ||
hue=0.1) | ||
], | ||
p=0.8), | ||
dict(type='RandomGrayscale', p=0.2), | ||
dict(type='GaussianBlur', sigma_min=0.1, sigma_max=2.0, p=1.), | ||
dict(type='Solarization', p=0.), | ||
dict(type='RandomHorizontalFlip'), | ||
] | ||
train_pipeline2 = [ | ||
dict(type='RandomResizedCrop', size=224, scale=(0.08, 1.)), | ||
dict( | ||
type='RandomAppliedTrans', | ||
transforms=[ | ||
dict( | ||
type='ColorJitter', | ||
brightness=0.4, | ||
contrast=0.4, | ||
saturation=0.2, | ||
hue=0.1) | ||
], | ||
p=0.8), | ||
dict(type='RandomGrayscale', p=0.2), | ||
dict(type='GaussianBlur', sigma_min=0.1, sigma_max=2.0, p=0.1), | ||
dict(type='Solarization', p=0.2), | ||
dict(type='RandomHorizontalFlip'), | ||
] | ||
|
||
# prefetch | ||
prefetch = False | ||
if not prefetch: | ||
train_pipeline1.extend( | ||
[dict(type='ToTensor'), | ||
dict(type='Normalize', **img_norm_cfg)]) | ||
train_pipeline2.extend( | ||
[dict(type='ToTensor'), | ||
dict(type='Normalize', **img_norm_cfg)]) | ||
|
||
# dataset summary | ||
data = dict( | ||
imgs_per_gpu=128, train=dict(pipelines=[train_pipeline1, train_pipeline2])) | ||
|
||
# MoCo v3 uses the same momentum update method as BYOL | ||
custom_hooks = [dict(type='MomentumUpdateHook')] | ||
|
||
# optimizer | ||
optimizer = dict(type='AdamW', lr=2.4e-3, weight_decay=0.1) | ||
|
||
# fp16 | ||
fp16 = dict(loss_scale='dynamic') | ||
|
||
# `max_keep_ckpts` controls the maximum number of checkpoint files kept in | ||
# your work_dirs: if it is 3, when CheckpointHook (in mmcv) saves the 4th | ||
# checkpoint, it removes the oldest one so that only 3 checkpoints remain | ||
checkpoint_config = dict(interval=10, max_keep_ckpts=3) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.