Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Feature] Support DeiT3 #1065

Merged
merged 8 commits into from Oct 10, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Expand Up @@ -133,3 +133,6 @@ venv.bak/
*.pvti-journal
/cache_engine
/report

# slurm
*.out
1 change: 1 addition & 0 deletions README.md
Expand Up @@ -122,6 +122,7 @@ Results and models are available in the [model zoo](https://mmclassification.rea
- [x] [Res2Net](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/res2net)
- [x] [MLP-Mixer](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/mlp_mixer)
- [x] [DeiT](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/deit)
- [x] [DeiT-3](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/deit3)
- [x] [Conformer](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/conformer)
- [x] [T2T-ViT](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/t2t_vit)
- [x] [Twins](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/twins)
Expand Down
1 change: 1 addition & 0 deletions README_zh-CN.md
Expand Up @@ -121,6 +121,7 @@ mim install -e .
- [x] [Res2Net](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/res2net)
- [x] [MLP-Mixer](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/mlp_mixer)
- [x] [DeiT](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/deit)
- [x] [DeiT-3](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/deit3)
- [x] [Conformer](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/conformer)
- [x] [T2T-ViT](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/t2t_vit)
- [x] [Twins](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/twins)
Expand Down
83 changes: 83 additions & 0 deletions configs/_base_/datasets/imagenet_bs64_deit3_224.py
@@ -0,0 +1,83 @@
# Dataset settings: ImageNet at 224 resolution, 64 samples per batch.
dataset_type = 'ImageNet'

# Normalization statistics, listed in RGB channel order.
data_preprocessor = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    # Convert images from BGR to RGB.
    'to_rgb': True,
}

# Channel-reversed copies of the statistics above; the augmentation steps
# in the training pipeline reference these lists.
bgr_mean = list(reversed(data_preprocessor['mean']))
bgr_std = list(reversed(data_preprocessor['std']))

# Training pipeline: load -> random resized crop -> random horizontal flip
# -> RandAugment -> random erasing -> pack.
train_pipeline = [
    {'type': 'LoadImageFromFile'},
    {
        'type': 'RandomResizedCrop',
        'scale': 224,
        'backend': 'pillow',
        'interpolation': 'bicubic',
    },
    {'type': 'RandomFlip', 'prob': 0.5, 'direction': 'horizontal'},
    {
        'type': 'RandAugment',
        'policies': 'timm_increasing',
        'num_policies': 2,
        'total_level': 10,
        'magnitude_level': 9,
        'magnitude_std': 0.5,
        'hparams': {
            # Padding value: the reversed mean rounded to integers.
            'pad_val': list(map(round, bgr_mean)),
            'interpolation': 'bicubic',
        },
    },
    {
        'type': 'RandomErasing',
        'erase_prob': 0.25,
        'mode': 'rand',
        'min_area_ratio': 0.02,
        'max_area_ratio': 1 / 3,
        'fill_color': bgr_mean,
        'fill_std': bgr_std,
    },
    {'type': 'PackClsInputs'},
]

# Evaluation pipeline: load -> resize short edge to 224 -> center crop
# -> pack.
test_pipeline = [
    {'type': 'LoadImageFromFile'},
    {
        'type': 'ResizeEdge',
        'scale': 224,
        'edge': 'short',
        'backend': 'pillow',
        'interpolation': 'bicubic',
    },
    {'type': 'CenterCrop', 'crop_size': 224},
    {'type': 'PackClsInputs'},
]

# Training loader: shuffled sampling over the training split.
train_dataloader = {
    'batch_size': 64,
    'num_workers': 5,
    'dataset': {
        'type': dataset_type,
        'data_root': 'data/imagenet',
        'ann_file': 'meta/train.txt',
        'data_prefix': 'train',
        'pipeline': train_pipeline,
    },
    'sampler': {'type': 'DefaultSampler', 'shuffle': True},
    'persistent_workers': True,
}

# Validation loader: unshuffled sampling over the validation split.
val_dataloader = {
    'batch_size': 64,
    'num_workers': 5,
    'dataset': {
        'type': dataset_type,
        'data_root': 'data/imagenet',
        'ann_file': 'meta/val.txt',
        'data_prefix': 'val',
        'pipeline': test_pipeline,
    },
    'sampler': {'type': 'DefaultSampler', 'shuffle': False},
    'persistent_workers': True,
}
val_evaluator = {'type': 'Accuracy', 'topk': (1, 5)}

# Testing reuses the validation loader and evaluator. Configure a dedicated
# test dataset manually if a standard test run is required.
test_dataloader = val_dataloader
test_evaluator = val_evaluator
63 changes: 63 additions & 0 deletions configs/_base_/datasets/imagenet_bs64_deit3_384.py
@@ -0,0 +1,63 @@
# Dataset settings: ImageNet at 384 resolution, 64 samples per batch.
dataset_type = 'ImageNet'

# Normalization statistics, listed in RGB channel order.
data_preprocessor = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    # Convert images from BGR to RGB.
    'to_rgb': True,
}

# Training pipeline: load -> random resized crop -> random horizontal flip
# -> pack.
train_pipeline = [
    {'type': 'LoadImageFromFile'},
    {
        'type': 'RandomResizedCrop',
        'scale': 384,
        'backend': 'pillow',
        'interpolation': 'bicubic',
    },
    {'type': 'RandomFlip', 'prob': 0.5, 'direction': 'horizontal'},
    {'type': 'PackClsInputs'},
]

# Evaluation pipeline: load -> resize short edge to 384 -> center crop
# -> pack.
test_pipeline = [
    {'type': 'LoadImageFromFile'},
    {
        'type': 'ResizeEdge',
        'scale': 384,
        'edge': 'short',
        'backend': 'pillow',
        'interpolation': 'bicubic',
    },
    {'type': 'CenterCrop', 'crop_size': 384},
    {'type': 'PackClsInputs'},
]

# Training loader: shuffled sampling over the training split.
train_dataloader = {
    'batch_size': 64,
    'num_workers': 5,
    'dataset': {
        'type': dataset_type,
        'data_root': 'data/imagenet',
        'ann_file': 'meta/train.txt',
        'data_prefix': 'train',
        'pipeline': train_pipeline,
    },
    'sampler': {'type': 'DefaultSampler', 'shuffle': True},
    'persistent_workers': True,
}

# Validation loader: unshuffled sampling over the validation split.
val_dataloader = {
    'batch_size': 64,
    'num_workers': 5,
    'dataset': {
        'type': dataset_type,
        'data_root': 'data/imagenet',
        'ann_file': 'meta/val.txt',
        'data_prefix': 'val',
        'pipeline': test_pipeline,
    },
    'sampler': {'type': 'DefaultSampler', 'shuffle': False},
    'persistent_workers': True,
}
val_evaluator = {'type': 'Accuracy', 'topk': (1, 5)}

# Testing reuses the validation loader and evaluator. Configure a dedicated
# test dataset manually if a standard test run is required.
test_dataloader = val_dataloader
test_evaluator = val_evaluator
24 changes: 24 additions & 0 deletions configs/_base_/models/deit3/deit3-base-p16-224.py
@@ -0,0 +1,24 @@
# Image classifier with a DeiT3 backbone: arch 'b', image size 224,
# patch size 16.
model = {
    'type': 'ImageClassifier',
    'backbone': {
        'type': 'DeiT3',
        'arch': 'b',
        'img_size': 224,
        'patch_size': 16,
        'drop_path_rate': 0.2,
    },
    'neck': None,
    # Classification head over 1000 classes with a label-smoothing loss.
    'head': {
        'type': 'VisionTransformerClsHead',
        'num_classes': 1000,
        'in_channels': 768,
        'loss': {
            'type': 'LabelSmoothLoss',
            'label_smooth_val': 0.1,
            'mode': 'original',
        },
    },
    # Initialization rules for Linear and LayerNorm layers.
    'init_cfg': [
        {'type': 'TruncNormal', 'layer': 'Linear', 'std': 0.02},
        {'type': 'Constant', 'layer': 'LayerNorm', 'val': 1.0, 'bias': 0.0},
    ],
    # Batch augmentations configured for training.
    'train_cfg': {
        'augments': [
            {'type': 'Mixup', 'alpha': 0.8, 'num_classes': 1000},
            {'type': 'CutMix', 'alpha': 1.0, 'num_classes': 1000},
        ],
    },
}
24 changes: 24 additions & 0 deletions configs/_base_/models/deit3/deit3-base-p16-384.py
@@ -0,0 +1,24 @@
# Image classifier with a DeiT3 backbone: arch 'b', image size 384,
# patch size 16.
model = {
    'type': 'ImageClassifier',
    'backbone': {
        'type': 'DeiT3',
        'arch': 'b',
        'img_size': 384,
        'patch_size': 16,
        'drop_path_rate': 0.15,
    },
    'neck': None,
    # Classification head over 1000 classes with a label-smoothing loss.
    'head': {
        'type': 'VisionTransformerClsHead',
        'num_classes': 1000,
        'in_channels': 768,
        'loss': {
            'type': 'LabelSmoothLoss',
            'label_smooth_val': 0.1,
            'mode': 'original',
        },
    },
    # Initialization rules for Linear and LayerNorm layers.
    'init_cfg': [
        {'type': 'TruncNormal', 'layer': 'Linear', 'std': 0.02},
        {'type': 'Constant', 'layer': 'LayerNorm', 'val': 1.0, 'bias': 0.0},
    ],
    # Batch augmentations configured for training.
    'train_cfg': {
        'augments': [
            {'type': 'Mixup', 'alpha': 0.8, 'num_classes': 1000},
            {'type': 'CutMix', 'alpha': 1.0, 'num_classes': 1000},
        ],
    },
}
24 changes: 24 additions & 0 deletions configs/_base_/models/deit3/deit3-huge-p14-224.py
@@ -0,0 +1,24 @@
# Image classifier with a DeiT3 backbone: arch 'h', image size 224,
# patch size 14.
model = {
    'type': 'ImageClassifier',
    'backbone': {
        'type': 'DeiT3',
        'arch': 'h',
        'img_size': 224,
        'patch_size': 14,
        'drop_path_rate': 0.55,
    },
    'neck': None,
    # Classification head over 1000 classes with a label-smoothing loss.
    'head': {
        'type': 'VisionTransformerClsHead',
        'num_classes': 1000,
        'in_channels': 1280,
        'loss': {
            'type': 'LabelSmoothLoss',
            'label_smooth_val': 0.1,
            'mode': 'original',
        },
    },
    # Initialization rules for Linear and LayerNorm layers.
    'init_cfg': [
        {'type': 'TruncNormal', 'layer': 'Linear', 'std': 0.02},
        {'type': 'Constant', 'layer': 'LayerNorm', 'val': 1.0, 'bias': 0.0},
    ],
    # Batch augmentations configured for training.
    'train_cfg': {
        'augments': [
            {'type': 'Mixup', 'alpha': 0.8, 'num_classes': 1000},
            {'type': 'CutMix', 'alpha': 1.0, 'num_classes': 1000},
        ],
    },
}
24 changes: 24 additions & 0 deletions configs/_base_/models/deit3/deit3-large-p16-224.py
@@ -0,0 +1,24 @@
# Image classifier with a DeiT3 backbone: arch 'l', image size 224,
# patch size 16.
model = {
    'type': 'ImageClassifier',
    'backbone': {
        'type': 'DeiT3',
        'arch': 'l',
        'img_size': 224,
        'patch_size': 16,
        'drop_path_rate': 0.45,
    },
    'neck': None,
    # Classification head over 1000 classes with a label-smoothing loss.
    'head': {
        'type': 'VisionTransformerClsHead',
        'num_classes': 1000,
        'in_channels': 1024,
        'loss': {
            'type': 'LabelSmoothLoss',
            'label_smooth_val': 0.1,
            'mode': 'original',
        },
    },
    # Initialization rules for Linear and LayerNorm layers.
    'init_cfg': [
        {'type': 'TruncNormal', 'layer': 'Linear', 'std': 0.02},
        {'type': 'Constant', 'layer': 'LayerNorm', 'val': 1.0, 'bias': 0.0},
    ],
    # Batch augmentations configured for training.
    'train_cfg': {
        'augments': [
            {'type': 'Mixup', 'alpha': 0.8, 'num_classes': 1000},
            {'type': 'CutMix', 'alpha': 1.0, 'num_classes': 1000},
        ],
    },
}
24 changes: 24 additions & 0 deletions configs/_base_/models/deit3/deit3-large-p16-384.py
@@ -0,0 +1,24 @@
# Image classifier with a DeiT3 backbone: arch 'l', image size 384,
# patch size 16.
model = {
    'type': 'ImageClassifier',
    'backbone': {
        'type': 'DeiT3',
        'arch': 'l',
        'img_size': 384,
        'patch_size': 16,
        'drop_path_rate': 0.4,
    },
    'neck': None,
    # Classification head over 1000 classes with a label-smoothing loss.
    'head': {
        'type': 'VisionTransformerClsHead',
        'num_classes': 1000,
        'in_channels': 1024,
        'loss': {
            'type': 'LabelSmoothLoss',
            'label_smooth_val': 0.1,
            'mode': 'original',
        },
    },
    # Initialization rules for Linear and LayerNorm layers.
    'init_cfg': [
        {'type': 'TruncNormal', 'layer': 'Linear', 'std': 0.02},
        {'type': 'Constant', 'layer': 'LayerNorm', 'val': 1.0, 'bias': 0.0},
    ],
    # Batch augmentations configured for training.
    'train_cfg': {
        'augments': [
            {'type': 'Mixup', 'alpha': 0.8, 'num_classes': 1000},
            {'type': 'CutMix', 'alpha': 1.0, 'num_classes': 1000},
        ],
    },
}
24 changes: 24 additions & 0 deletions configs/_base_/models/deit3/deit3-medium-p16-224.py
@@ -0,0 +1,24 @@
# Image classifier with a DeiT3 backbone: arch 'm', image size 224,
# patch size 16.
model = {
    'type': 'ImageClassifier',
    'backbone': {
        'type': 'DeiT3',
        'arch': 'm',
        'img_size': 224,
        'patch_size': 16,
        'drop_path_rate': 0.2,
    },
    'neck': None,
    # Classification head over 1000 classes with a label-smoothing loss.
    'head': {
        'type': 'VisionTransformerClsHead',
        'num_classes': 1000,
        'in_channels': 512,
        'loss': {
            'type': 'LabelSmoothLoss',
            'label_smooth_val': 0.1,
            'mode': 'original',
        },
    },
    # Initialization rules for Linear and LayerNorm layers.
    'init_cfg': [
        {'type': 'TruncNormal', 'layer': 'Linear', 'std': 0.02},
        {'type': 'Constant', 'layer': 'LayerNorm', 'val': 1.0, 'bias': 0.0},
    ],
    # Batch augmentations configured for training.
    'train_cfg': {
        'augments': [
            {'type': 'Mixup', 'alpha': 0.8, 'num_classes': 1000},
            {'type': 'CutMix', 'alpha': 1.0, 'num_classes': 1000},
        ],
    },
}
24 changes: 24 additions & 0 deletions configs/_base_/models/deit3/deit3-small-p16-224.py
@@ -0,0 +1,24 @@
# Image classifier with a DeiT3 backbone: arch 's', image size 224,
# patch size 16.
model = {
    'type': 'ImageClassifier',
    'backbone': {
        'type': 'DeiT3',
        'arch': 's',
        'img_size': 224,
        'patch_size': 16,
        'drop_path_rate': 0.05,
    },
    'neck': None,
    # Classification head over 1000 classes with a label-smoothing loss.
    'head': {
        'type': 'VisionTransformerClsHead',
        'num_classes': 1000,
        'in_channels': 384,
        'loss': {
            'type': 'LabelSmoothLoss',
            'label_smooth_val': 0.1,
            'mode': 'original',
        },
    },
    # Initialization rules for Linear and LayerNorm layers.
    'init_cfg': [
        {'type': 'TruncNormal', 'layer': 'Linear', 'std': 0.02},
        {'type': 'Constant', 'layer': 'LayerNorm', 'val': 1.0, 'bias': 0.0},
    ],
    # Batch augmentations configured for training.
    'train_cfg': {
        'augments': [
            {'type': 'Mixup', 'alpha': 0.8, 'num_classes': 1000},
            {'type': 'CutMix', 'alpha': 1.0, 'num_classes': 1000},
        ],
    },
}
24 changes: 24 additions & 0 deletions configs/_base_/models/deit3/deit3-small-p16-384.py
@@ -0,0 +1,24 @@
# Image classifier with a DeiT3 backbone: arch 's', image size 384,
# patch size 16.
model = {
    'type': 'ImageClassifier',
    'backbone': {
        'type': 'DeiT3',
        'arch': 's',
        'img_size': 384,
        'patch_size': 16,
        'drop_path_rate': 0.0,
    },
    'neck': None,
    # Classification head over 1000 classes with a label-smoothing loss.
    'head': {
        'type': 'VisionTransformerClsHead',
        'num_classes': 1000,
        'in_channels': 384,
        'loss': {
            'type': 'LabelSmoothLoss',
            'label_smooth_val': 0.1,
            'mode': 'original',
        },
    },
    # Initialization rules for Linear and LayerNorm layers.
    'init_cfg': [
        {'type': 'TruncNormal', 'layer': 'Linear', 'std': 0.02},
        {'type': 'Constant', 'layer': 'LayerNorm', 'val': 1.0, 'bias': 0.0},
    ],
    # Batch augmentations configured for training.
    'train_cfg': {
        'augments': [
            {'type': 'Mixup', 'alpha': 0.8, 'num_classes': 1000},
            {'type': 'CutMix', 'alpha': 1.0, 'num_classes': 1000},
        ],
    },
}