Skip to content

Commit

Permalink
[Feature] Support DeiT3. (#1065)
Browse files Browse the repository at this point in the history
* deit3

deit3

lint

* add tools and test

* deit3

* deit3

* fix preprocess

* lint

* Update config names and checkpoint paths

* Update convert tools to use mmengine, and fix docstring.

Co-authored-by: mzr1996 <mzr1996@163.com>
  • Loading branch information
okotaku and mzr1996 committed Oct 10, 2022
1 parent 043574c commit a49c307
Show file tree
Hide file tree
Showing 30 changed files with 1,546 additions and 1 deletion.
3 changes: 3 additions & 0 deletions .gitignore
Expand Up @@ -133,3 +133,6 @@ venv.bak/
*.pvti-journal
/cache_engine
/report

# slurm
*.out
1 change: 1 addition & 0 deletions README.md
Expand Up @@ -127,6 +127,7 @@ Results and models are available in the [model zoo](https://mmclassification.rea
- [x] [Res2Net](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/res2net)
- [x] [MLP-Mixer](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/mlp_mixer)
- [x] [DeiT](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/deit)
- [x] [DeiT-3](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/deit3)
- [x] [Conformer](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/conformer)
- [x] [T2T-ViT](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/t2t_vit)
- [x] [Twins](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/twins)
Expand Down
1 change: 1 addition & 0 deletions README_zh-CN.md
Expand Up @@ -126,6 +126,7 @@ mim install -e .
- [x] [Res2Net](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/res2net)
- [x] [MLP-Mixer](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/mlp_mixer)
- [x] [DeiT](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/deit)
- [x] [DeiT-3](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/deit3)
- [x] [Conformer](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/conformer)
- [x] [T2T-ViT](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/t2t_vit)
- [x] [Twins](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/twins)
Expand Down
83 changes: 83 additions & 0 deletions configs/_base_/datasets/imagenet_bs64_deit3_224.py
@@ -0,0 +1,83 @@
# Dataset settings: ImageNet, 224x224 inputs, 64 samples per batch.
dataset_type = 'ImageNet'
data_preprocessor = dict(
    # Normalization statistics, listed in RGB channel order.
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    # Convert images from BGR to RGB.
    to_rgb=True,
)

# The same statistics with the channel order reversed (BGR). They feed the
# RandAugment padding value and the RandomErasing fill settings below.
bgr_mean = list(reversed(data_preprocessor['mean']))
bgr_std = list(reversed(data_preprocessor['std']))

# Training pipeline: bicubic random-resized crop, horizontal flip,
# RandAugment and random erasing, then packing.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='RandomResizedCrop',
        scale=224,
        backend='pillow',
        interpolation='bicubic',
    ),
    dict(type='RandomFlip', prob=0.5, direction='horizontal'),
    dict(
        type='RandAugment',
        policies='timm_increasing',
        num_policies=2,
        total_level=10,
        magnitude_level=9,
        magnitude_std=0.5,
        hparams=dict(
            # Padding value is the BGR mean rounded to whole numbers.
            pad_val=[round(channel) for channel in bgr_mean],
            interpolation='bicubic',
        ),
    ),
    dict(
        type='RandomErasing',
        erase_prob=0.25,
        mode='rand',
        min_area_ratio=0.02,
        max_area_ratio=1 / 3,
        fill_color=bgr_mean,
        fill_std=bgr_std,
    ),
    dict(type='PackClsInputs'),
]

# Evaluation pipeline: resize the short edge to 224, then center-crop 224.
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='ResizeEdge',
        scale=224,
        edge='short',
        backend='pillow',
        interpolation='bicubic',
    ),
    dict(type='CenterCrop', crop_size=224),
    dict(type='PackClsInputs'),
]

train_dataloader = dict(
    batch_size=64,
    num_workers=5,
    dataset=dict(
        type=dataset_type,
        data_root='data/imagenet',
        ann_file='meta/train.txt',
        data_prefix='train',
        pipeline=train_pipeline,
    ),
    sampler=dict(type='DefaultSampler', shuffle=True),
    persistent_workers=True,
)

val_dataloader = dict(
    batch_size=64,
    num_workers=5,
    dataset=dict(
        type=dataset_type,
        data_root='data/imagenet',
        ann_file='meta/val.txt',
        data_prefix='val',
        pipeline=test_pipeline,
    ),
    sampler=dict(type='DefaultSampler', shuffle=False),
    persistent_workers=True,
)
val_evaluator = dict(type='Accuracy', topk=(1, 5))

# Testing reuses the validation loader and evaluator. Configure a dedicated
# test dataset manually if a standard test run is required.
test_dataloader = val_dataloader
test_evaluator = val_evaluator
63 changes: 63 additions & 0 deletions configs/_base_/datasets/imagenet_bs64_deit3_384.py
@@ -0,0 +1,63 @@
# Dataset settings: ImageNet, 384x384 inputs, 64 samples per batch.
dataset_type = 'ImageNet'
data_preprocessor = dict(
    # Normalization statistics, listed in RGB channel order.
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    # Convert images from BGR to RGB.
    to_rgb=True,
)

# Training pipeline: bicubic random-resized crop and horizontal flip only.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='RandomResizedCrop',
        scale=384,
        backend='pillow',
        interpolation='bicubic',
    ),
    dict(type='RandomFlip', prob=0.5, direction='horizontal'),
    dict(type='PackClsInputs'),
]

# Evaluation pipeline: resize the short edge to 384, then center-crop 384.
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='ResizeEdge',
        scale=384,
        edge='short',
        backend='pillow',
        interpolation='bicubic',
    ),
    dict(type='CenterCrop', crop_size=384),
    dict(type='PackClsInputs'),
]

train_dataloader = dict(
    batch_size=64,
    num_workers=5,
    dataset=dict(
        type=dataset_type,
        data_root='data/imagenet',
        ann_file='meta/train.txt',
        data_prefix='train',
        pipeline=train_pipeline,
    ),
    sampler=dict(type='DefaultSampler', shuffle=True),
    persistent_workers=True,
)

val_dataloader = dict(
    batch_size=64,
    num_workers=5,
    dataset=dict(
        type=dataset_type,
        data_root='data/imagenet',
        ann_file='meta/val.txt',
        data_prefix='val',
        pipeline=test_pipeline,
    ),
    sampler=dict(type='DefaultSampler', shuffle=False),
    persistent_workers=True,
)
val_evaluator = dict(type='Accuracy', topk=(1, 5))

# Testing reuses the validation loader and evaluator. Configure a dedicated
# test dataset manually if a standard test run is required.
test_dataloader = val_dataloader
test_evaluator = val_evaluator
24 changes: 24 additions & 0 deletions configs/_base_/models/deit3/deit3-base-p16-224.py
@@ -0,0 +1,24 @@
# DeiT3 image classifier: arch 'b', patch size 16, input size 224.
model = dict(
    type='ImageClassifier',
    backbone=dict(
        type='DeiT3',
        arch='b',
        img_size=224,
        patch_size=16,
        drop_path_rate=0.2,
    ),
    neck=None,
    head=dict(
        type='VisionTransformerClsHead',
        num_classes=1000,
        in_channels=768,
        # Label-smoothed classification loss.
        loss=dict(
            type='LabelSmoothLoss',
            label_smooth_val=0.1,
            mode='original',
        ),
    ),
    # Initialization settings for Linear and LayerNorm layers.
    init_cfg=[
        dict(type='TruncNormal', layer='Linear', std=0.02),
        dict(type='Constant', layer='LayerNorm', val=1.0, bias=0.0),
    ],
    # Mixup and CutMix augments configured for training.
    train_cfg=dict(
        augments=[
            dict(type='Mixup', alpha=0.8, num_classes=1000),
            dict(type='CutMix', alpha=1.0, num_classes=1000),
        ],
    ),
)
24 changes: 24 additions & 0 deletions configs/_base_/models/deit3/deit3-base-p16-384.py
@@ -0,0 +1,24 @@
# DeiT3 image classifier: arch 'b', patch size 16, input size 384.
model = dict(
    type='ImageClassifier',
    backbone=dict(
        type='DeiT3',
        arch='b',
        img_size=384,
        patch_size=16,
        drop_path_rate=0.15,
    ),
    neck=None,
    head=dict(
        type='VisionTransformerClsHead',
        num_classes=1000,
        in_channels=768,
        # Label-smoothed classification loss.
        loss=dict(
            type='LabelSmoothLoss',
            label_smooth_val=0.1,
            mode='original',
        ),
    ),
    # Initialization settings for Linear and LayerNorm layers.
    init_cfg=[
        dict(type='TruncNormal', layer='Linear', std=0.02),
        dict(type='Constant', layer='LayerNorm', val=1.0, bias=0.0),
    ],
    # Mixup and CutMix augments configured for training.
    train_cfg=dict(
        augments=[
            dict(type='Mixup', alpha=0.8, num_classes=1000),
            dict(type='CutMix', alpha=1.0, num_classes=1000),
        ],
    ),
)
24 changes: 24 additions & 0 deletions configs/_base_/models/deit3/deit3-huge-p14-224.py
@@ -0,0 +1,24 @@
# DeiT3 image classifier: arch 'h', patch size 14, input size 224.
model = dict(
    type='ImageClassifier',
    backbone=dict(
        type='DeiT3',
        arch='h',
        img_size=224,
        patch_size=14,
        drop_path_rate=0.55,
    ),
    neck=None,
    head=dict(
        type='VisionTransformerClsHead',
        num_classes=1000,
        in_channels=1280,
        # Label-smoothed classification loss.
        loss=dict(
            type='LabelSmoothLoss',
            label_smooth_val=0.1,
            mode='original',
        ),
    ),
    # Initialization settings for Linear and LayerNorm layers.
    init_cfg=[
        dict(type='TruncNormal', layer='Linear', std=0.02),
        dict(type='Constant', layer='LayerNorm', val=1.0, bias=0.0),
    ],
    # Mixup and CutMix augments configured for training.
    train_cfg=dict(
        augments=[
            dict(type='Mixup', alpha=0.8, num_classes=1000),
            dict(type='CutMix', alpha=1.0, num_classes=1000),
        ],
    ),
)
24 changes: 24 additions & 0 deletions configs/_base_/models/deit3/deit3-large-p16-224.py
@@ -0,0 +1,24 @@
# DeiT3 image classifier: arch 'l', patch size 16, input size 224.
model = dict(
    type='ImageClassifier',
    backbone=dict(
        type='DeiT3',
        arch='l',
        img_size=224,
        patch_size=16,
        drop_path_rate=0.45,
    ),
    neck=None,
    head=dict(
        type='VisionTransformerClsHead',
        num_classes=1000,
        in_channels=1024,
        # Label-smoothed classification loss.
        loss=dict(
            type='LabelSmoothLoss',
            label_smooth_val=0.1,
            mode='original',
        ),
    ),
    # Initialization settings for Linear and LayerNorm layers.
    init_cfg=[
        dict(type='TruncNormal', layer='Linear', std=0.02),
        dict(type='Constant', layer='LayerNorm', val=1.0, bias=0.0),
    ],
    # Mixup and CutMix augments configured for training.
    train_cfg=dict(
        augments=[
            dict(type='Mixup', alpha=0.8, num_classes=1000),
            dict(type='CutMix', alpha=1.0, num_classes=1000),
        ],
    ),
)
24 changes: 24 additions & 0 deletions configs/_base_/models/deit3/deit3-large-p16-384.py
@@ -0,0 +1,24 @@
# DeiT3 image classifier: arch 'l', patch size 16, input size 384.
model = dict(
    type='ImageClassifier',
    backbone=dict(
        type='DeiT3',
        arch='l',
        img_size=384,
        patch_size=16,
        drop_path_rate=0.4,
    ),
    neck=None,
    head=dict(
        type='VisionTransformerClsHead',
        num_classes=1000,
        in_channels=1024,
        # Label-smoothed classification loss.
        loss=dict(
            type='LabelSmoothLoss',
            label_smooth_val=0.1,
            mode='original',
        ),
    ),
    # Initialization settings for Linear and LayerNorm layers.
    init_cfg=[
        dict(type='TruncNormal', layer='Linear', std=0.02),
        dict(type='Constant', layer='LayerNorm', val=1.0, bias=0.0),
    ],
    # Mixup and CutMix augments configured for training.
    train_cfg=dict(
        augments=[
            dict(type='Mixup', alpha=0.8, num_classes=1000),
            dict(type='CutMix', alpha=1.0, num_classes=1000),
        ],
    ),
)
24 changes: 24 additions & 0 deletions configs/_base_/models/deit3/deit3-medium-p16-224.py
@@ -0,0 +1,24 @@
# DeiT3 image classifier: arch 'm', patch size 16, input size 224.
model = dict(
    type='ImageClassifier',
    backbone=dict(
        type='DeiT3',
        arch='m',
        img_size=224,
        patch_size=16,
        drop_path_rate=0.2,
    ),
    neck=None,
    head=dict(
        type='VisionTransformerClsHead',
        num_classes=1000,
        in_channels=512,
        # Label-smoothed classification loss.
        loss=dict(
            type='LabelSmoothLoss',
            label_smooth_val=0.1,
            mode='original',
        ),
    ),
    # Initialization settings for Linear and LayerNorm layers.
    init_cfg=[
        dict(type='TruncNormal', layer='Linear', std=0.02),
        dict(type='Constant', layer='LayerNorm', val=1.0, bias=0.0),
    ],
    # Mixup and CutMix augments configured for training.
    train_cfg=dict(
        augments=[
            dict(type='Mixup', alpha=0.8, num_classes=1000),
            dict(type='CutMix', alpha=1.0, num_classes=1000),
        ],
    ),
)
24 changes: 24 additions & 0 deletions configs/_base_/models/deit3/deit3-small-p16-224.py
@@ -0,0 +1,24 @@
# DeiT3 image classifier: arch 's', patch size 16, input size 224.
model = dict(
    type='ImageClassifier',
    backbone=dict(
        type='DeiT3',
        arch='s',
        img_size=224,
        patch_size=16,
        drop_path_rate=0.05,
    ),
    neck=None,
    head=dict(
        type='VisionTransformerClsHead',
        num_classes=1000,
        in_channels=384,
        # Label-smoothed classification loss.
        loss=dict(
            type='LabelSmoothLoss',
            label_smooth_val=0.1,
            mode='original',
        ),
    ),
    # Initialization settings for Linear and LayerNorm layers.
    init_cfg=[
        dict(type='TruncNormal', layer='Linear', std=0.02),
        dict(type='Constant', layer='LayerNorm', val=1.0, bias=0.0),
    ],
    # Mixup and CutMix augments configured for training.
    train_cfg=dict(
        augments=[
            dict(type='Mixup', alpha=0.8, num_classes=1000),
            dict(type='CutMix', alpha=1.0, num_classes=1000),
        ],
    ),
)
24 changes: 24 additions & 0 deletions configs/_base_/models/deit3/deit3-small-p16-384.py
@@ -0,0 +1,24 @@
# DeiT3 image classifier: arch 's', patch size 16, input size 384.
model = dict(
    type='ImageClassifier',
    backbone=dict(
        type='DeiT3',
        arch='s',
        img_size=384,
        patch_size=16,
        drop_path_rate=0.0,
    ),
    neck=None,
    head=dict(
        type='VisionTransformerClsHead',
        num_classes=1000,
        in_channels=384,
        # Label-smoothed classification loss.
        loss=dict(
            type='LabelSmoothLoss',
            label_smooth_val=0.1,
            mode='original',
        ),
    ),
    # Initialization settings for Linear and LayerNorm layers.
    init_cfg=[
        dict(type='TruncNormal', layer='Linear', std=0.02),
        dict(type='Constant', layer='LayerNorm', val=1.0, bias=0.0),
    ],
    # Mixup and CutMix augments configured for training.
    train_cfg=dict(
        augments=[
            dict(type='Mixup', alpha=0.8, num_classes=1000),
            dict(type='CutMix', alpha=1.0, num_classes=1000),
        ],
    ),
)

0 comments on commit a49c307

Please sign in to comment.