[Enhance] Format non-distributed training and inference and support CPU training. (#42)

* [Docs] update batch size

* Fix bug in non-distributed multi-gpu training/testing

* support cpu training

* update cpu training and testing
linyq17 committed Mar 10, 2022
1 parent 501a6db commit a8b528d
Showing 9 changed files with 142 additions and 25 deletions.
19 changes: 19 additions & 0 deletions docs/en/get_started.md
@@ -1,6 +1,7 @@
## Test a model

- single GPU
- CPU
- single node multiple GPU
- multiple node

@@ -10,6 +11,10 @@ You can use the following commands to infer a dataset.
# single-gpu
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]

# CPU: disable GPUs and run single-gpu testing script
export CUDA_VISIBLE_DEVICES=-1
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]

# multi-gpu
./tools/dist_test.sh ${CONFIG_FILE} ${CHECKPOINT_FILE} ${GPU_NUM} [optional arguments]

@@ -46,6 +51,20 @@ python tools/train.py ${CONFIG_FILE} [optional arguments]

If you want to specify the working directory in the command, you can add an argument `--work_dir ${YOUR_WORK_DIR}`.

### Train on CPU

Training on the CPU follows the same procedure as single-GPU training; the only extra step is disabling the GPUs before launching the training process.

```shell
export CUDA_VISIBLE_DEVICES=-1
python tools/train.py ${CONFIG_FILE} [optional arguments]
```

**Note**:

We do not recommend training on CPU because it is very slow; this feature is supported mainly so that users can conveniently debug on machines without a GPU.


### Train with multiple GPUs

```shell
19 changes: 19 additions & 0 deletions docs/zh_cn/get_started.md
@@ -1,6 +1,7 @@
## Test a model

- single GPU
- CPU
- single node multiple GPU
- multiple node

@@ -10,6 +11,10 @@ You can use the following commands to infer a dataset.
# single-gpu
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]

# CPU: disable GPUs and run single-gpu testing script
export CUDA_VISIBLE_DEVICES=-1
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]

# multi-gpu
./tools/dist_test.sh ${CONFIG_FILE} ${CHECKPOINT_FILE} ${GPU_NUM} [optional arguments]

@@ -46,6 +51,20 @@ python tools/train.py ${CONFIG_FILE} [optional arguments]

If you want to specify the working directory in the command, you can add an argument `--work_dir ${YOUR_WORK_DIR}`.

### Train on CPU

Training on the CPU follows the same procedure as single-GPU training; the only extra step is disabling the GPUs before launching the training process.

```shell
export CUDA_VISIBLE_DEVICES=-1
python tools/train.py ${CONFIG_FILE} [optional arguments]
```

**Note**:

We do not recommend training on CPU because it is very slow; this feature is supported mainly so that users can conveniently debug on machines without a GPU.


### Train with multiple GPUs

```shell
2 changes: 1 addition & 1 deletion mmfewshot/__init__.py
@@ -32,7 +32,7 @@ def digit_version(version_str):
f'Please install mmcv>={mmcv_minimum_version}, <={mmcv_maximum_version}.'

mmdet_minimum_version = '2.16.0'
mmdet_maximum_version = '2.21.0'
mmdet_maximum_version = '2.23.0'
mmdet_version = digit_version(mmdet.__version__)


14 changes: 8 additions & 6 deletions mmfewshot/classification/apis/train.py
@@ -1,4 +1,5 @@
# Copyright (c) OpenMMLab. All rights reserved.
import warnings
from typing import Dict, Union

import torch
@@ -22,7 +23,7 @@ def train_model(model: Union[MMDataParallel, MMDistributedDataParallel],
distributed: bool = False,
validate: bool = False,
timestamp: str = None,
device: str = 'cuda',
device: str = None,
meta: Dict = None) -> None:
logger = get_root_logger(log_level=cfg.log_level)

@@ -54,13 +55,14 @@ def train_model(model: Union[MMDataParallel, MMDistributedDataParallel],
broadcast_buffers=False,
find_unused_parameters=find_unused_parameters)
else:
if device == 'cuda':
model = MMDataParallel(
model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)
elif device == 'cpu':
if device == 'cpu':
warnings.warn(
'The argument `device` is deprecated. To use cpu to train, '
'please refers to https://mmclassification.readthedocs.io/en'
'/latest/getting_started.html#train-a-model')
model = model.cpu()
else:
raise ValueError(F'unsupported device name {device}.')
model = MMDataParallel(model, device_ids=cfg.gpu_ids)

# build runner
optimizer = build_optimizer(model, cfg.optimizer)
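
This hunk drops the explicit `model.cuda(cfg.gpu_ids[0])` call and leaves device placement to `MMDataParallel`. A minimal sketch of the resulting non-distributed path, assuming MMCV >= 1.4.4 (as the detection counterpart of this change notes); the wrapper function name below is illustrative, not part of the commit:

```python
# Sketch of the non-distributed wrapping after this change; assumes
# MMCV >= 1.4.4, since older MMDataParallel cannot run on CPU.
from mmcv.parallel import MMDataParallel


def wrap_non_distributed(model, gpu_ids):
    # With CUDA_VISIBLE_DEVICES=-1 torch reports no GPUs, DataParallel then
    # ignores device_ids and the module stays on CPU; with a GPU available,
    # the single id in gpu_ids is used and the module is moved onto it.
    return MMDataParallel(model, device_ids=gpu_ids)
```

CPU training is therefore selected purely through `CUDA_VISIBLE_DEVICES=-1`; the deprecated `device` argument is kept only to emit the warning above.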
4 changes: 2 additions & 2 deletions mmfewshot/detection/apis/train.py
@@ -55,8 +55,8 @@ def train_detector(model: nn.Module,
broadcast_buffers=False,
find_unused_parameters=find_unused_parameters)
else:
model = MMDataParallel(
model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)
# Please use MMCV >= 1.4.4 for CPU training!
model = MMDataParallel(model, device_ids=cfg.gpu_ids)

# build runner
optimizer = build_optimizer(model, cfg.optimizer)
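
The inline comment above pins the MMCV requirement for CPU runs. A hypothetical guard, not part of this commit, that fails fast with a clearer message could look like this:

```python
# Hypothetical helper, not part of this commit: refuse CPU training when the
# installed MMCV is too old for MMDataParallel to run without a GPU.
import torch
import mmcv
from mmcv.utils import digit_version


def check_cpu_training_support():
    if not torch.cuda.is_available() and \
            digit_version(mmcv.__version__) < digit_version('1.4.4'):
        raise RuntimeError(
            'CPU training/testing requires MMCV >= 1.4.4, '
            f'but MMCV {mmcv.__version__} is installed.')
```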
36 changes: 31 additions & 5 deletions tools/classification/test.py
@@ -3,6 +3,7 @@
import os
import os.path as osp
import time
import warnings

import mmcv
import torch
@@ -64,17 +65,35 @@ def parse_args():
help='whether to set deterministic options for CUDNN backend.')
parser.add_argument('--local_rank', type=int, default=0)
parser.add_argument(
'--device',
choices=['cpu', 'cuda'],
default='cuda',
help='device used for testing')
'--device', default=None, help='device used for testing. (Deprecated)')
parser.add_argument(
'--gpu-ids',
type=int,
nargs='+',
help='(Deprecated, please use --gpu-id) ids of gpus to use '
'(only applicable to non-distributed testing)')
parser.add_argument(
'--gpu-id',
type=int,
default=0,
help='id of gpu to use '
'(only applicable to non-distributed testing)')
parser.add_argument(
'--show_task_results',
action='store_true',
help='whether to record eval result of each task.')
args = parser.parse_args()
if 'LOCAL_RANK' not in os.environ:
os.environ['LOCAL_RANK'] = str(args.local_rank)

if args.device:
warnings.warn(
'--device is deprecated. To use cpu to test, please '
'refers to https://mmclassification.readthedocs.io/en/latest/'
'getting_started.html#inference-with-pretrained-models')

assert args.metrics or args.out, \
'Please specify at least one of output path and evaluation metrics.'
return args


@@ -96,7 +115,14 @@ def main():
# use config filename as default work_dir if cfg.work_dir is None
cfg.work_dir = osp.join('./work_dirs',
osp.splitext(osp.basename(args.config))[0])

if args.gpu_ids is not None:
cfg.gpu_ids = args.gpu_ids[0:1]
warnings.warn('`--gpu-ids` is deprecated, please use `--gpu-id`. '
'Because we only support single GPU mode in '
'non-distributed testing. Use the first GPU '
'in `gpu_ids` now.')
else:
cfg.gpu_ids = [args.gpu_id]
# init distributed env first, since logger depends on the dist info.
if args.launcher == 'none':
distributed = False
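
The net effect of the new testing arguments is that `cfg.gpu_ids` always ends up as a single-element list: `--gpu-ids` is still accepted but only its first entry is honoured, otherwise `--gpu-id` (default 0) is used. An illustrative sketch of that precedence with simplified names, not the script's own code:

```python
import warnings


def resolve_gpu_ids(gpu_ids=None, gpu_id=0):
    """Return the single-element list assigned to cfg.gpu_ids."""
    if gpu_ids is not None:
        warnings.warn('`--gpu-ids` is deprecated, please use `--gpu-id`; '
                      'only the first id is used in non-distributed testing.')
        return gpu_ids[:1]
    return [gpu_id]


# e.g. resolve_gpu_ids() -> [0]; resolve_gpu_ids(gpu_ids=[2, 3]) -> [2]
```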
24 changes: 20 additions & 4 deletions tools/classification/train.py
@@ -4,6 +4,7 @@
import os
import os.path as osp
import time
import warnings

import cv2
import mmcv
@@ -45,7 +46,13 @@ def parse_args():
'--gpu-ids',
type=int,
nargs='+',
help='ids of gpus to use '
help='(Deprecated, please use --gpu-id) ids of gpus to use '
'(only applicable to non-distributed training)')
group_gpus.add_argument(
'--gpu-id',
type=int,
default=0,
help='id of gpu to use '
'(only applicable to non-distributed training)')
parser.add_argument('--seed', type=int, default=None, help='random seed')
parser.add_argument(
@@ -87,10 +94,19 @@ def main():
osp.splitext(osp.basename(args.config))[0])
if args.resume_from is not None:
cfg.resume_from = args.resume_from
if args.gpus is not None:
cfg.gpu_ids = range(1)
warnings.warn('`--gpus` is deprecated because we only support '
'single GPU mode in non-distributed training. '
'Use `gpus=1` now.')
if args.gpu_ids is not None:
cfg.gpu_ids = args.gpu_ids
else:
cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus)
cfg.gpu_ids = args.gpu_ids[0:1]
warnings.warn('`--gpu-ids` is deprecated, please use `--gpu-id`. '
'Because we only support single GPU mode in '
'non-distributed training. Use the first GPU '
'in `gpu_ids` now.')
if args.gpus is None and args.gpu_ids is None:
cfg.gpu_ids = [args.gpu_id]

# init distributed env first, since logger depends on the dist info.
if args.launcher == 'none':
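
The training script adds one more layer because the even older `--gpus` flag is kept for compatibility: both `--gpus` and `--gpu-ids` warn and collapse to a single GPU, and `--gpu-id` applies only when neither is given. A condensed, illustrative sketch of that precedence, not a drop-in replacement for the script:

```python
import warnings


def resolve_train_gpu_ids(gpus=None, gpu_ids=None, gpu_id=0):
    # --gpus: deprecated, only single-GPU non-distributed training is supported
    if gpus is not None:
        warnings.warn('`--gpus` is deprecated; using gpus=1.')
        resolved = range(1)
    # --gpu-ids: deprecated, keep only the first id (overrides --gpus if both given)
    if gpu_ids is not None:
        warnings.warn('`--gpu-ids` is deprecated, please use `--gpu-id`; '
                      'using the first id only.')
        resolved = gpu_ids[:1]
    # --gpu-id: the non-deprecated way to pick the single GPU
    if gpus is None and gpu_ids is None:
        resolved = [gpu_id]
    return resolved
```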
24 changes: 22 additions & 2 deletions tools/detection/test.py
@@ -27,6 +27,18 @@ def parse_args():
nargs='+',
help='evaluation metrics, which depends on the dataset, e.g., "bbox",'
' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC')
parser.add_argument(
'--gpu-ids',
type=int,
nargs='+',
help='(Deprecated, please use --gpu-id) ids of gpus to use '
'(only applicable to non-distributed testing)')
parser.add_argument(
'--gpu-id',
type=int,
default=0,
help='id of gpu to use '
'(only applicable to non-distributed testing)')
parser.add_argument('--show', action='store_true', help='show results')
parser.add_argument(
'--show-dir', help='directory where painted images will be saved')
@@ -116,7 +128,14 @@ def main():
# currently only support single images testing
samples_per_gpu = cfg.data.test.pop('samples_per_gpu', 1)
assert samples_per_gpu == 1, 'currently only support single images testing'

if args.gpu_ids is not None:
cfg.gpu_ids = args.gpu_ids[0:1]
warnings.warn('`--gpu-ids` is deprecated, please use `--gpu-id`. '
'Because we only support single GPU mode in '
'non-distributed testing. Use the first GPU '
'in `gpu_ids` now.')
else:
cfg.gpu_ids = [args.gpu_id]
# init distributed env first, since logger depends on the dist info.
if args.launcher == 'none':
distributed = False
@@ -176,7 +195,8 @@ def main():
shuffle=False)

if not distributed:
model = MMDataParallel(model, device_ids=[0])
# Please use MMCV >= 1.4.4 for CPU testing!
model = MMDataParallel(model, device_ids=cfg.gpu_ids)
show_kwargs = dict(show_score_thr=args.show_score_thr)
if cfg.data.get('model_init', None) is not None:
from mmfewshot.detection.apis import (single_gpu_model_init,
25 changes: 20 additions & 5 deletions tools/detection/train.py
@@ -48,8 +48,14 @@ def parse_args():
'--gpu-ids',
type=int,
nargs='+',
help='ids of gpus to use '
help='(Deprecated, please use --gpu-id) ids of gpus to use '
'(only applicable to non-distributed training)')
parser.add_argument(
'--gpu-id',
type=int,
default=0,
help='id of gpu to use '
'(only applicable to non-distributed training)')
parser.add_argument('--seed', type=int, default=None, help='random seed')
parser.add_argument(
'--deterministic',
@@ -119,15 +125,24 @@ def main():
osp.splitext(osp.basename(args.config))[0])
if args.resume_from is not None:
cfg.resume_from = args.resume_from
if args.gpus is not None:
cfg.gpu_ids = range(1)
warnings.warn('`--gpus` is deprecated because we only support '
'single GPU mode in non-distributed training. '
'Use `gpus=1` now.')
if args.gpu_ids is not None:
cfg.gpu_ids = args.gpu_ids
else:
cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus)
cfg.gpu_ids = args.gpu_ids[0:1]
warnings.warn('`--gpu-ids` is deprecated, please use `--gpu-id`. '
'Because we only support single GPU mode in '
'non-distributed training. Use the first GPU '
'in `gpu_ids` now.')
if args.gpus is None and args.gpu_ids is None:
cfg.gpu_ids = [args.gpu_id]

# init distributed env first, since logger depends on the dist info.
if args.launcher == 'none':
distributed = False
rank, world_size = get_dist_info()
rank = 0
else:
distributed = True
init_dist(args.launcher, **cfg.dist_params)
