[Feature] Support DeiT3. (#1065)

* deit3 deit3 lint * add tools and test * deit3 * deit3 * fix preprocess * lint * Update config names and checkpoint paths * Update convert tools to use mmengine, and fix docstring. Co-authored-by: mzr1996 <mzr1996@163.com>
open-mmlab · Oct 10, 2022 · a49c307 · a49c307
1 parent 043574c
commit a49c307
Show file tree

Hide file tree

Showing 30 changed files with 1,546 additions and 1 deletion.
diff --git a/.gitignore b/.gitignore
@@ -133,3 +133,6 @@ venv.bak/
 *.pvti-journal
 /cache_engine
 /report
+
+# slurm
+*.out
diff --git a/README.md b/README.md
@@ -127,6 +127,7 @@ Results and models are available in the [model zoo](https://mmclassification.rea
 - [x] [Res2Net](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/res2net)
 - [x] [MLP-Mixer](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/mlp_mixer)
 - [x] [DeiT](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/deit)
+- [x] [DeiT-3](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/deit3)
 - [x] [Conformer](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/conformer)
 - [x] [T2T-ViT](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/t2t_vit)
 - [x] [Twins](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/twins)

diff --git a/README_zh-CN.md b/README_zh-CN.md
@@ -126,6 +126,7 @@ mim install -e .
 - [x] [Res2Net](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/res2net)
 - [x] [MLP-Mixer](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/mlp_mixer)
 - [x] [DeiT](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/deit)
+- [x] [DeiT-3](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/deit3)
 - [x] [Conformer](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/conformer)
 - [x] [T2T-ViT](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/t2t_vit)
 - [x] [Twins](https://github.com/open-mmlab/mmclassification/tree/1.x/configs/twins)

diff --git a/configs/_base_/datasets/imagenet_bs64_deit3_224.py b/configs/_base_/datasets/imagenet_bs64_deit3_224.py
@@ -0,0 +1,83 @@
+# dataset settings
+dataset_type = 'ImageNet'
+data_preprocessor = dict(
+    # RGB format normalization parameters
+    mean=[123.675, 116.28, 103.53],
+    std=[58.395, 57.12, 57.375],
+    # convert image from BGR to RGB
+    to_rgb=True,
+)
+
+bgr_mean = data_preprocessor['mean'][::-1]
+bgr_std = data_preprocessor['std'][::-1]
+
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='RandomResizedCrop',
+        scale=224,
+        backend='pillow',
+        interpolation='bicubic'),
+    dict(type='RandomFlip', prob=0.5, direction='horizontal'),
+    dict(
+        type='RandAugment',
+        policies='timm_increasing',
+        num_policies=2,
+        total_level=10,
+        magnitude_level=9,
+        magnitude_std=0.5,
+        hparams=dict(
+            pad_val=[round(x) for x in bgr_mean], interpolation='bicubic')),
+    dict(
+        type='RandomErasing',
+        erase_prob=0.25,
+        mode='rand',
+        min_area_ratio=0.02,
+        max_area_ratio=1 / 3,
+        fill_color=bgr_mean,
+        fill_std=bgr_std),
+    dict(type='PackClsInputs'),
+]
+
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='ResizeEdge',
+        scale=224,
+        edge='short',
+        backend='pillow',
+        interpolation='bicubic'),
+    dict(type='CenterCrop', crop_size=224),
+    dict(type='PackClsInputs'),
+]
+
+train_dataloader = dict(
+    batch_size=64,
+    num_workers=5,
+    dataset=dict(
+        type=dataset_type,
+        data_root='data/imagenet',
+        ann_file='meta/train.txt',
+        data_prefix='train',
+        pipeline=train_pipeline),
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    persistent_workers=True,
+)
+
+val_dataloader = dict(
+    batch_size=64,
+    num_workers=5,
+    dataset=dict(
+        type=dataset_type,
+        data_root='data/imagenet',
+        ann_file='meta/val.txt',
+        data_prefix='val',
+        pipeline=test_pipeline),
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    persistent_workers=True,
+)
+val_evaluator = dict(type='Accuracy', topk=(1, 5))
+
+# If you want standard test, please manually configure the test dataset
+test_dataloader = val_dataloader
+test_evaluator = val_evaluator
diff --git a/configs/_base_/datasets/imagenet_bs64_deit3_384.py b/configs/_base_/datasets/imagenet_bs64_deit3_384.py
@@ -0,0 +1,63 @@
+# dataset settings
+dataset_type = 'ImageNet'
+data_preprocessor = dict(
+    # RGB format normalization parameters
+    mean=[123.675, 116.28, 103.53],
+    std=[58.395, 57.12, 57.375],
+    # convert image from BGR to RGB
+    to_rgb=True,
+)
+
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='RandomResizedCrop',
+        scale=384,
+        backend='pillow',
+        interpolation='bicubic'),
+    dict(type='RandomFlip', prob=0.5, direction='horizontal'),
+    dict(type='PackClsInputs'),
+]
+
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='ResizeEdge',
+        scale=384,
+        edge='short',
+        backend='pillow',
+        interpolation='bicubic'),
+    dict(type='CenterCrop', crop_size=384),
+    dict(type='PackClsInputs'),
+]
+
+train_dataloader = dict(
+    batch_size=64,
+    num_workers=5,
+    dataset=dict(
+        type=dataset_type,
+        data_root='data/imagenet',
+        ann_file='meta/train.txt',
+        data_prefix='train',
+        pipeline=train_pipeline),
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    persistent_workers=True,
+)
+
+val_dataloader = dict(
+    batch_size=64,
+    num_workers=5,
+    dataset=dict(
+        type=dataset_type,
+        data_root='data/imagenet',
+        ann_file='meta/val.txt',
+        data_prefix='val',
+        pipeline=test_pipeline),
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    persistent_workers=True,
+)
+val_evaluator = dict(type='Accuracy', topk=(1, 5))
+
+# If you want standard test, please manually configure the test dataset
+test_dataloader = val_dataloader
+test_evaluator = val_evaluator
diff --git a/configs/_base_/models/deit3/deit3-base-p16-224.py b/configs/_base_/models/deit3/deit3-base-p16-224.py
@@ -0,0 +1,24 @@
+model = dict(
+    type='ImageClassifier',
+    backbone=dict(
+        type='DeiT3',
+        arch='b',
+        img_size=224,
+        patch_size=16,
+        drop_path_rate=0.2),
+    neck=None,
+    head=dict(
+        type='VisionTransformerClsHead',
+        num_classes=1000,
+        in_channels=768,
+        loss=dict(
+            type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
+    ),
+    init_cfg=[
+        dict(type='TruncNormal', layer='Linear', std=.02),
+        dict(type='Constant', layer='LayerNorm', val=1., bias=0.),
+    ],
+    train_cfg=dict(augments=[
+        dict(type='Mixup', alpha=0.8, num_classes=1000),
+        dict(type='CutMix', alpha=1.0, num_classes=1000)
+    ]))
diff --git a/configs/_base_/models/deit3/deit3-base-p16-384.py b/configs/_base_/models/deit3/deit3-base-p16-384.py
@@ -0,0 +1,24 @@
+model = dict(
+    type='ImageClassifier',
+    backbone=dict(
+        type='DeiT3',
+        arch='b',
+        img_size=384,
+        patch_size=16,
+        drop_path_rate=0.15),
+    neck=None,
+    head=dict(
+        type='VisionTransformerClsHead',
+        num_classes=1000,
+        in_channels=768,
+        loss=dict(
+            type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
+    ),
+    init_cfg=[
+        dict(type='TruncNormal', layer='Linear', std=.02),
+        dict(type='Constant', layer='LayerNorm', val=1., bias=0.),
+    ],
+    train_cfg=dict(augments=[
+        dict(type='Mixup', alpha=0.8, num_classes=1000),
+        dict(type='CutMix', alpha=1.0, num_classes=1000)
+    ]))
diff --git a/configs/_base_/models/deit3/deit3-huge-p14-224.py b/configs/_base_/models/deit3/deit3-huge-p14-224.py
@@ -0,0 +1,24 @@
+model = dict(
+    type='ImageClassifier',
+    backbone=dict(
+        type='DeiT3',
+        arch='h',
+        img_size=224,
+        patch_size=14,
+        drop_path_rate=0.55),
+    neck=None,
+    head=dict(
+        type='VisionTransformerClsHead',
+        num_classes=1000,
+        in_channels=1280,
+        loss=dict(
+            type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
+    ),
+    init_cfg=[
+        dict(type='TruncNormal', layer='Linear', std=.02),
+        dict(type='Constant', layer='LayerNorm', val=1., bias=0.),
+    ],
+    train_cfg=dict(augments=[
+        dict(type='Mixup', alpha=0.8, num_classes=1000),
+        dict(type='CutMix', alpha=1.0, num_classes=1000)
+    ]))
diff --git a/configs/_base_/models/deit3/deit3-large-p16-224.py b/configs/_base_/models/deit3/deit3-large-p16-224.py
@@ -0,0 +1,24 @@
+model = dict(
+    type='ImageClassifier',
+    backbone=dict(
+        type='DeiT3',
+        arch='l',
+        img_size=224,
+        patch_size=16,
+        drop_path_rate=0.45),
+    neck=None,
+    head=dict(
+        type='VisionTransformerClsHead',
+        num_classes=1000,
+        in_channels=1024,
+        loss=dict(
+            type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
+    ),
+    init_cfg=[
+        dict(type='TruncNormal', layer='Linear', std=.02),
+        dict(type='Constant', layer='LayerNorm', val=1., bias=0.),
+    ],
+    train_cfg=dict(augments=[
+        dict(type='Mixup', alpha=0.8, num_classes=1000),
+        dict(type='CutMix', alpha=1.0, num_classes=1000)
+    ]))
diff --git a/configs/_base_/models/deit3/deit3-large-p16-384.py b/configs/_base_/models/deit3/deit3-large-p16-384.py
@@ -0,0 +1,24 @@
+model = dict(
+    type='ImageClassifier',
+    backbone=dict(
+        type='DeiT3',
+        arch='l',
+        img_size=384,
+        patch_size=16,
+        drop_path_rate=0.4),
+    neck=None,
+    head=dict(
+        type='VisionTransformerClsHead',
+        num_classes=1000,
+        in_channels=1024,
+        loss=dict(
+            type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
+    ),
+    init_cfg=[
+        dict(type='TruncNormal', layer='Linear', std=.02),
+        dict(type='Constant', layer='LayerNorm', val=1., bias=0.),
+    ],
+    train_cfg=dict(augments=[
+        dict(type='Mixup', alpha=0.8, num_classes=1000),
+        dict(type='CutMix', alpha=1.0, num_classes=1000)
+    ]))
diff --git a/configs/_base_/models/deit3/deit3-medium-p16-224.py b/configs/_base_/models/deit3/deit3-medium-p16-224.py
@@ -0,0 +1,24 @@
+model = dict(
+    type='ImageClassifier',
+    backbone=dict(
+        type='DeiT3',
+        arch='m',
+        img_size=224,
+        patch_size=16,
+        drop_path_rate=0.2),
+    neck=None,
+    head=dict(
+        type='VisionTransformerClsHead',
+        num_classes=1000,
+        in_channels=512,
+        loss=dict(
+            type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
+    ),
+    init_cfg=[
+        dict(type='TruncNormal', layer='Linear', std=.02),
+        dict(type='Constant', layer='LayerNorm', val=1., bias=0.),
+    ],
+    train_cfg=dict(augments=[
+        dict(type='Mixup', alpha=0.8, num_classes=1000),
+        dict(type='CutMix', alpha=1.0, num_classes=1000)
+    ]))
diff --git a/configs/_base_/models/deit3/deit3-small-p16-224.py b/configs/_base_/models/deit3/deit3-small-p16-224.py
@@ -0,0 +1,24 @@
+model = dict(
+    type='ImageClassifier',
+    backbone=dict(
+        type='DeiT3',
+        arch='s',
+        img_size=224,
+        patch_size=16,
+        drop_path_rate=0.05),
+    neck=None,
+    head=dict(
+        type='VisionTransformerClsHead',
+        num_classes=1000,
+        in_channels=384,
+        loss=dict(
+            type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
+    ),
+    init_cfg=[
+        dict(type='TruncNormal', layer='Linear', std=.02),
+        dict(type='Constant', layer='LayerNorm', val=1., bias=0.),
+    ],
+    train_cfg=dict(augments=[
+        dict(type='Mixup', alpha=0.8, num_classes=1000),
+        dict(type='CutMix', alpha=1.0, num_classes=1000)
+    ]))
diff --git a/configs/_base_/models/deit3/deit3-small-p16-384.py b/configs/_base_/models/deit3/deit3-small-p16-384.py
@@ -0,0 +1,24 @@
+model = dict(
+    type='ImageClassifier',
+    backbone=dict(
+        type='DeiT3',
+        arch='s',
+        img_size=384,
+        patch_size=16,
+        drop_path_rate=0.0),
+    neck=None,
+    head=dict(
+        type='VisionTransformerClsHead',
+        num_classes=1000,
+        in_channels=384,
+        loss=dict(
+            type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'),
+    ),
+    init_cfg=[
+        dict(type='TruncNormal', layer='Linear', std=.02),
+        dict(type='Constant', layer='LayerNorm', val=1., bias=0.),
+    ],
+    train_cfg=dict(augments=[
+        dict(type='Mixup', alpha=0.8, num_classes=1000),
+        dict(type='CutMix', alpha=1.0, num_classes=1000)
+    ]))