Update README and refine of MM-GDINO (#11298)

open-mmlab · Dec 26, 2023 · e5f9f35 · e5f9f35
1 parent 63a4bb8
commit e5f9f35
Show file tree

Hide file tree

Showing 51 changed files with 2,956 additions and 341 deletions.
diff --git a/configs/glip/README.md b/configs/glip/README.md
@@ -166,7 +166,8 @@ Learning visual representations from natural language supervision has recently s
 
 ### Results on Flickr30k
 
-| Model         | Official | Pre-Train Data | Val R@1 | Val R@5 | Val R@10 | Test R@1 | Test R@5 | Test R@10 |
-| ------------- | -------- | -------------- | ------- | ------- | -------- | -------- | -------- | --------- |
-| **GLIP-T(C)** | ✔        | O365, GoldG    | 84.8    | 94.9    | 96.3     | 85.5     | 95.4     | 96.6      |
-| **GLIP-T(C)** |          | O365, GoldG    | 84.9    | 94.9    | 96.3     | 85.6     | 95.4     | 96.7      |
+| Model         | Official | Pre-Train Data      | Val R@1 | Val R@5 | Val R@10 | Test R@1 | Test R@5 | Test R@10 |
+| ------------- | -------- | ------------------- | ------- | ------- | -------- | -------- | -------- | --------- |
+| **GLIP-T(C)** | ✔        | O365, GoldG         | 84.8    | 94.9    | 96.3     | 85.5     | 95.4     | 96.6      |
+| **GLIP-T(C)** |          | O365, GoldG         | 84.9    | 94.9    | 96.3     | 85.6     | 95.4     | 96.7      |
+| **GLIP-T**    |          | O365,GoldG,CC3M,SBU | 85.3    | 95.5    | 96.9     | 86.0     | 95.9     | 97.2      |
diff --git a/.../glip/flickr30k/glip_atss_swin-t_c_fpn_dyhead_pretrain_obj365-goldg_zeroshot_flickr30k.py b/.../glip/flickr30k/glip_atss_swin-t_c_fpn_dyhead_pretrain_obj365-goldg_zeroshot_flickr30k.py
@@ -2,10 +2,10 @@
 
 lang_model_name = 'bert-base-uncased'
 
-model = dict(bbox_head=dict(early_fuse=True), )
+model = dict(bbox_head=dict(early_fuse=True))
 
 dataset_type = 'Flickr30kDataset'
-data_root = 'data/flickr30k/'
+data_root = 'data/flickr30k_entities/'
 
 test_pipeline = [
     dict(
@@ -27,15 +27,15 @@
 dataset_Flickr30k_val = dict(
     type=dataset_type,
     data_root=data_root,
-    ann_file='mdetr_annotations/final_flickr_separateGT_val.json',
+    ann_file='final_flickr_separateGT_val.json',
     data_prefix=dict(img='flickr30k_images/'),
     pipeline=test_pipeline,
 )
 
 dataset_Flickr30k_test = dict(
     type=dataset_type,
     data_root=data_root,
-    ann_file='mdetr_annotations/final_flickr_separateGT_test.json',
+    ann_file='final_flickr_separateGT_test.json',
     data_prefix=dict(img='flickr30k_images/'),
     pipeline=test_pipeline,
 )

diff --git a/configs/grounding_dino/flickr30k/grounding_dino_swin-t-pretrain_zeroshot_flickr30k.py b/configs/grounding_dino/flickr30k/grounding_dino_swin-t-pretrain_zeroshot_flickr30k.py
@@ -1,7 +1,7 @@
 _base_ = '../grounding_dino_swin-t_pretrain_obj365_goldg_cap4m.py'
 
 dataset_type = 'Flickr30kDataset'
-data_root = 'data/flickr30k/'
+data_root = 'data/flickr30k_entities/'
 
 test_pipeline = [
     dict(
@@ -23,15 +23,15 @@
 dataset_Flickr30k_val = dict(
     type=dataset_type,
     data_root=data_root,
-    ann_file='mdetr_annotations/final_flickr_separateGT_val.json',
+    ann_file='final_flickr_separateGT_val.json',
     data_prefix=dict(img='flickr30k_images/'),
     pipeline=test_pipeline,
 )
 
 dataset_Flickr30k_test = dict(
     type=dataset_type,
     data_root=data_root,
-    ann_file='mdetr_annotations/final_flickr_separateGT_test.json',
+    ann_file='final_flickr_separateGT_test.json',
     data_prefix=dict(img='flickr30k_images/'),
     pipeline=test_pipeline,
 )

diff --git a/configs/mm_grounding_dino/README.md b/configs/mm_grounding_dino/README.md
diff --git a/configs/mm_grounding_dino/brain_tumor/grounding_dino_swin-t_finetune_8xb4_50e_brain_tumor.py b/configs/mm_grounding_dino/brain_tumor/grounding_dino_swin-t_finetune_8xb4_50e_brain_tumor.py
@@ -3,6 +3,8 @@
 # https://universe.roboflow.com/roboflow-100/brain-tumor-m2pbp/dataset/2
 data_root = 'data/brain_tumor_v2/'
 class_name = ('label0', 'label1', 'label2')
+label_name = '_annotations.coco.json'
+
 palette = [(220, 20, 60), (255, 0, 0), (0, 0, 142)]
 
 metainfo = dict(classes=class_name, palette=palette)
@@ -64,20 +66,20 @@
             pipeline=train_pipeline,
             return_classes=True,
             data_prefix=dict(img='train/'),
-            ann_file='train/_annotations.coco.json')))
+            ann_file='train/' + label_name)))
 
 val_dataloader = dict(
     dataset=dict(
         metainfo=metainfo,
         data_root=data_root,
         return_classes=True,
-        ann_file='valid/_annotations.coco.json',
+        ann_file='valid/' + label_name,
         data_prefix=dict(img='valid/')))
 test_dataloader = val_dataloader
 
 val_evaluator = dict(
     type='CocoMetric',
-    ann_file=data_root + 'valid/_annotations.coco.json',
+    ann_file=data_root + 'valid/' + label_name,
     metric='bbox',
     format_only=False)
 test_evaluator = val_evaluator
@@ -107,4 +109,4 @@
 
 default_hooks = dict(checkpoint=dict(max_keep_ckpts=1, save_best='auto'))
 
-load_from = ''
+load_from = 'https://download.openmmlab.com/mmdetection/v3.0/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det_20231204_095047-b448804b.pth'  # noqa
diff --git a/configs/mm_grounding_dino/cityscapes/grounding_dino_swin-t_finetune_8xb4_50e_cityscapes.py b/configs/mm_grounding_dino/cityscapes/grounding_dino_swin-t_finetune_8xb4_50e_cityscapes.py
@@ -107,4 +107,4 @@
 train_cfg = dict(max_epochs=max_epochs, val_interval=1)
 default_hooks = dict(checkpoint=dict(max_keep_ckpts=1, save_best='auto'))
 
-load_from = ''
+load_from = 'https://download.openmmlab.com/mmdetection/v3.0/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det_20231204_095047-b448804b.pth'  # noqa
diff --git a/configs/mm_grounding_dino/coco/grounding_dino_swin-t_finetune_16xb4_1x_coco.py b/configs/mm_grounding_dino/coco/grounding_dino_swin-t_finetune_16xb4_1x_coco.py
@@ -64,7 +64,7 @@
         custom_keys={
             'absolute_pos_embed': dict(decay_mult=0.),
             'backbone': dict(lr_mult=0.1),
-            # 'language_model': dict(lr_mult=0),
+            'language_model': dict(lr_mult=0.1),
         }))
 
 # learning policy
@@ -75,11 +75,11 @@
         begin=0,
         end=max_epochs,
         by_epoch=True,
-        milestones=[11],
+        milestones=[8, 11],
         gamma=0.1)
 ]
 train_cfg = dict(max_epochs=max_epochs, val_interval=1)
 
 default_hooks = dict(checkpoint=dict(max_keep_ckpts=1, save_best='auto'))
 
-load_from = ''
+load_from = 'https://download.openmmlab.com/mmdetection/v3.0/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det_20231204_095047-b448804b.pth'  # noqa
diff --git a/configs/mm_grounding_dino/coco/grounding_dino_swin-t_finetune_16xb4_1x_coco_48_17.py b/configs/mm_grounding_dino/coco/grounding_dino_swin-t_finetune_16xb4_1x_coco_48_17.py
@@ -8,21 +8,20 @@
                 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
                 'pizza', 'donut', 'chair', 'bed', 'toilet', 'tv', 'laptop',
                 'mouse', 'remote', 'microwave', 'oven', 'toaster',
-                'refrigerator', 'book', 'clock', 'vase', 'toothbrush')
+                'refrigerator', 'book', 'clock', 'vase', 'toothbrush')  # 48
 novel_classes = ('airplane', 'bus', 'cat', 'dog', 'cow', 'elephant',
                  'umbrella', 'tie', 'snowboard', 'skateboard', 'cup', 'knife',
-                 'cake', 'couch', 'keyboard', 'sink', 'scissors')
-all_classes = ('person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
-               'train', 'truck', 'boat', 'bench', 'bird', 'cat', 'dog',
-               'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe',
-               'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
-               'skis', 'snowboard', 'kite', 'skateboard', 'surfboard',
-               'bottle', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana',
-               'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'pizza',
-               'donut', 'cake', 'chair', 'couch', 'bed', 'toilet', 'tv',
-               'laptop', 'mouse', 'remote', 'keyboard', 'microwave', 'oven',
-               'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase',
-               'scissors', 'toothbrush')
+                 'cake', 'couch', 'keyboard', 'sink', 'scissors')  # 17
+all_classes = (
+    'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
+    'truck', 'boat', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
+    'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
+    'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'kite', 'skateboard',
+    'surfboard', 'bottle', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana',
+    'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'pizza', 'donut',
+    'cake', 'chair', 'couch', 'bed', 'toilet', 'tv', 'laptop', 'mouse',
+    'remote', 'keyboard', 'microwave', 'oven', 'toaster', 'sink',
+    'refrigerator', 'book', 'clock', 'vase', 'scissors', 'toothbrush')  # 65
 
 train_metainfo = dict(classes=base_classes)
 test_metainfo = dict(
@@ -95,7 +94,7 @@
         type='CocoDataset',
         metainfo=train_metainfo,
         data_root=data_root,
-        ann_file='zero-shot/instances_train2017_seen_2.json',
+        ann_file='annotations/instances_train2017_seen_2.json',
         data_prefix=dict(img='train2017/'),
         return_classes=True,
         filter_cfg=dict(filter_empty_gt=False, min_size=32),
@@ -111,7 +110,7 @@
         type='CocoDataset',
         metainfo=test_metainfo,
         data_root=data_root,
-        ann_file='zero-shot/instances_val2017_all_2.json',
+        ann_file='annotations/instances_val2017_all_2.json',
         data_prefix=dict(img='val2017/'),
         test_mode=True,
         pipeline=test_pipeline,
@@ -121,7 +120,7 @@
 
 val_evaluator = dict(
     type='OVCocoMetric',
-    ann_file=data_root + 'zero-shot/instances_val2017_all_2.json',
+    ann_file=data_root + 'annotations/instances_val2017_all_2.json',
     metric='bbox',
     format_only=False)
 test_evaluator = val_evaluator
@@ -155,4 +154,4 @@
     checkpoint=dict(
         max_keep_ckpts=1, save_best='coco/novel_ap50', rule='greater'))
 
-load_from = 'epoch_30.pth'
+load_from = 'https://download.openmmlab.com/mmdetection/v3.0/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det_20231204_095047-b448804b.pth'  # noqa
diff --git a/configs/mm_grounding_dino/coco/grounding_dino_swin-t_finetune_16xb4_1x_sft_coco.py b/configs/mm_grounding_dino/coco/grounding_dino_swin-t_finetune_16xb4_1x_sft_coco.py
@@ -0,0 +1,93 @@
+_base_ = '../grounding_dino_swin-t_pretrain_obj365.py'
+
+data_root = 'data/coco/'
+
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(
+        type='RandomChoice',
+        transforms=[
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ],
+            [
+                dict(
+                    type='RandomChoiceResize',
+                    # The ratio of all images in the train dataset is < 7,
+                    # following the original implementation
+                    scales=[(400, 4200), (500, 4200), (600, 4200)],
+                    keep_ratio=True),
+                dict(
+                    type='RandomCrop',
+                    crop_type='absolute_range',
+                    crop_size=(384, 600),
+                    allow_negative_crop=True),
+                dict(
+                    type='RandomChoiceResize',
+                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                            (736, 1333), (768, 1333), (800, 1333)],
+                    keep_ratio=True)
+            ]
+        ]),
+    dict(
+        type='RandomSamplingNegPos',
+        tokenizer_name=_base_.lang_model_name,
+        num_sample_negative=20,  # ======= important =====
+        label_map_file='data/coco/annotations/coco2017_label_map.json',
+        max_tokens=256),
+    dict(
+        type='PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'flip', 'flip_direction', 'text',
+                   'custom_entities', 'tokens_positive', 'dataset_mode'))
+]
+
+train_dataloader = dict(
+    dataset=dict(
+        _delete_=True,
+        type='ODVGDataset',
+        need_text=False,
+        data_root=data_root,
+        ann_file='annotations/instances_train2017_od.json',
+        label_map_file='annotations/coco2017_label_map.json',
+        data_prefix=dict(img='train2017/'),
+        return_classes=True,
+        filter_cfg=dict(filter_empty_gt=False, min_size=32),
+        pipeline=train_pipeline))
+
+optim_wrapper = dict(
+    _delete_=True,
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=0.00005, weight_decay=0.0001),
+    clip_grad=dict(max_norm=0.1, norm_type=2),
+    paramwise_cfg=dict(
+        custom_keys={
+            'absolute_pos_embed': dict(decay_mult=0.),
+            'backbone': dict(lr_mult=0.1),
+            'language_model': dict(lr_mult=0.0),
+        }))
+
+# learning policy
+max_epochs = 12
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[8, 11],
+        gamma=0.1)
+]
+train_cfg = dict(max_epochs=max_epochs, val_interval=1)
+
+default_hooks = dict(checkpoint=dict(max_keep_ckpts=1, save_best='auto'))
+
+load_from = 'https://download.openmmlab.com/mmdetection/v3.0/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det_20231204_095047-b448804b.pth'  # noqa