[Enhance] Format non-distributed training and inference and support CPU training. (#42)

* [Docs] update batch size

* Fix bug in non-distributed multi-gpu training/testing

* support cpu training

* update cpu training and testing
linyq17 committed Mar 10, 2022
1 parent 501a6db commit a8b528d
Showing 9 changed files with 142 additions and 25 deletions.
19 changes: 19 additions & 0 deletions docs/en/get_started.md
@@ -1,6 +1,7 @@
## Test a model

- single GPU
- CPU
- single node multiple GPU
- multiple node

@@ -10,6 +11,10 @@ You can use the following commands to infer a dataset.
# single-gpu
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]

# CPU: disable GPUs and run single-gpu testing script
export CUDA_VISIBLE_DEVICES=-1
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]

# multi-gpu
./tools/dist_test.sh ${CONFIG_FILE} ${CHECKPOINT_FILE} ${GPU_NUM} [optional arguments]

@@ -46,6 +51,20 @@ python tools/train.py ${CONFIG_FILE} [optional arguments]

If you want to specify the working directory in the command, you can add an argument `--work_dir ${YOUR_WORK_DIR}`.

### Train on CPU

Training on the CPU follows the same procedure as single-GPU training; the only extra step is disabling the GPUs before launching the training process.

```shell
export CUDA_VISIBLE_DEVICES=-1
python tools/train.py ${CONFIG_FILE} [optional arguments]
```

**Note**:

We do not recommend training on CPU because it is very slow; this feature is supported mainly so that users can conveniently debug on machines without a GPU.


### Train with multiple GPUs

```shell
19 changes: 19 additions & 0 deletions docs/zh_cn/get_started.md
@@ -1,6 +1,7 @@
## Test a model

- single GPU
- CPU
- single node multiple GPU
- multiple node

@@ -10,6 +11,10 @@ You can use the following commands to infer a dataset.
# single-gpu
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]

# CPU: disable GPUs and run single-gpu testing script
export CUDA_VISIBLE_DEVICES=-1
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]

# multi-gpu
./tools/dist_test.sh ${CONFIG_FILE} ${CHECKPOINT_FILE} ${GPU_NUM} [optional arguments]

@@ -46,6 +51,20 @@ python tools/train.py ${CONFIG_FILE} [optional arguments]

If you want to specify the working directory in the command, you can add an argument `--work_dir ${YOUR_WORK_DIR}`.

### Train on CPU

Training on the CPU follows the same procedure as single-GPU training; the only extra step is disabling the GPUs before launching the training process.

```shell
export CUDA_VISIBLE_DEVICES=-1
python tools/train.py ${CONFIG_FILE} [optional arguments]
```

**Note**:

We do not recommend training on CPU because it is very slow; this feature is supported mainly so that users can conveniently debug on machines without a GPU.


### Train with multiple GPUs

```shell
2 changes: 1 addition & 1 deletion mmfewshot/__init__.py
@@ -32,7 +32,7 @@ def digit_version(version_str):
f'Please install mmcv>={mmcv_minimum_version}, <={mmcv_maximum_version}.'

mmdet_minimum_version = '2.16.0'
mmdet_maximum_version = '2.21.0'
mmdet_maximum_version = '2.23.0'
mmdet_version = digit_version(mmdet.__version__)


14 changes: 8 additions & 6 deletions mmfewshot/classification/apis/train.py
@@ -1,4 +1,5 @@
# Copyright (c) OpenMMLab. All rights reserved.
import warnings
from typing import Dict, Union

import torch
@@ -22,7 +23,7 @@ def train_model(model: Union[MMDataParallel, MMDistributedDataParallel],
distributed: bool = False,
validate: bool = False,
timestamp: str = None,
device: str = 'cuda',
device: str = None,
meta: Dict = None) -> None:
logger = get_root_logger(log_level=cfg.log_level)

@@ -54,13 +55,14 @@ def train_model(model: Union[MMDataParallel, MMDistributedDataParallel],
broadcast_buffers=False,
find_unused_parameters=find_unused_parameters)
else:
if device == 'cuda':
model = MMDataParallel(
model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)
elif device == 'cpu':
if device == 'cpu':
warnings.warn(
'The argument `device` is deprecated. To use cpu to train, '
'please refers to https://mmclassification.readthedocs.io/en'
'/latest/getting_started.html#train-a-model')
model = model.cpu()
else:
raise ValueError(F'unsupported device name {device}.')
model = MMDataParallel(model, device_ids=cfg.gpu_ids)

# build runner
optimizer = build_optimizer(model, cfg.optimizer)
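
This hunk drops the explicit `model.cuda(cfg.gpu_ids[0])` call and leaves device placement to `MMDataParallel`. A minimal sketch of the resulting non-distributed path, assuming MMCV >= 1.4.4 (as the detection counterpart of this change notes); the wrapper function name below is illustrative, not part of the commit:

```python
# Sketch of the non-distributed wrapping after this change; assumes
# MMCV >= 1.4.4, since older MMDataParallel cannot run on CPU.
from mmcv.parallel import MMDataParallel


def wrap_non_distributed(model, gpu_ids):
    # With CUDA_VISIBLE_DEVICES=-1 torch reports no GPUs, DataParallel then
    # ignores device_ids and the module stays on CPU; with a GPU available,
    # the single id in gpu_ids is used and the module is moved onto it.
    return MMDataParallel(model, device_ids=gpu_ids)
```

CPU training is therefore selected purely through `CUDA_VISIBLE_DEVICES=-1`; the deprecated `device` argument is kept only to emit the warning above.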
4 changes: 2 additions & 2 deletions mmfewshot/detection/apis/train.py
@@ -55,8 +55,8 @@ def train_detector(model: nn.Module,
broadcast_buffers=False,
find_unused_parameters=find_unused_parameters)
else:
model = MMDataParallel(
model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)
# Please use MMCV >= 1.4.4 for CPU training!
model = MMDataParallel(model, device_ids=cfg.gpu_ids)

# build runner
optimizer = build_optimizer(model, cfg.optimizer)
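
The inline comment above pins the MMCV requirement for CPU runs. A hypothetical guard, not part of this commit, that fails fast with a clearer message could look like this:

```python
# Hypothetical helper, not part of this commit: refuse CPU training when the
# installed MMCV is too old for MMDataParallel to run without a GPU.
import torch
import mmcv
from mmcv.utils import digit_version


def check_cpu_training_support():
    if not torch.cuda.is_available() and \
            digit_version(mmcv.__version__) < digit_version('1.4.4'):
        raise RuntimeError(
            'CPU training/testing requires MMCV >= 1.4.4, '
            f'but MMCV {mmcv.__version__} is installed.')
```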
36 changes: 31 additions & 5 deletions tools/classification/test.py
@@ -3,6 +3,7 @@
import os
import os.path as osp
import time
import warnings

import mmcv
import torch
@@ -64,17 +65,35 @@ def parse_args():
help='whether to set deterministic options for CUDNN backend.')
parser.add_argument('--local_rank', type=int, default=0)
parser.add_argument(
'--device',
choices=['cpu', 'cuda'],
default='cuda',
help='device used for testing')
'--device', default=None, help='device used for testing. (Deprecated)')
parser.add_argument(
'--gpu-ids',
type=int,
nargs='+',
help='(Deprecated, please use --gpu-id) ids of gpus to use '
'(only applicable to non-distributed testing)')
parser.add_argument(
'--gpu-id',
type=int,
default=0,
help='id of gpu to use '
'(only applicable to non-distributed testing)')
parser.add_argument(
'--show_task_results',
action='store_true',
help='whether to record eval result of each task.')
args = parser.parse_args()
if 'LOCAL_RANK' not in os.environ:
os.environ['LOCAL_RANK'] = str(args.local_rank)

if args.device:
warnings.warn(
'--device is deprecated. To use cpu to test, please '
'refers to https://mmclassification.readthedocs.io/en/latest/'
'getting_started.html#inference-with-pretrained-models')

assert args.metrics or args.out, \
'Please specify at least one of output path and evaluation metrics.'
return args


@@ -96,7 +115,14 @@ def main():
# use config filename as default work_dir if cfg.work_dir is None
cfg.work_dir = osp.join('./work_dirs',
osp.splitext(osp.basename(args.config))[0])

if args.gpu_ids is not None:
cfg.gpu_ids = args.gpu_ids[0:1]
warnings.warn('`--gpu-ids` is deprecated, please use `--gpu-id`. '
'Because we only support single GPU mode in '
'non-distributed testing. Use the first GPU '
'in `gpu_ids` now.')
else:
cfg.gpu_ids = [args.gpu_id]
# init distributed env first, since logger depends on the dist info.
if args.launcher == 'none':
distributed = False
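
The net effect of the new testing arguments is that `cfg.gpu_ids` always ends up as a single-element list: `--gpu-ids` is still accepted but only its first entry is honoured, otherwise `--gpu-id` (default 0) is used. An illustrative sketch of that precedence with simplified names, not the script's own code:

```python
import warnings


def resolve_gpu_ids(gpu_ids=None, gpu_id=0):
    """Return the single-element list assigned to cfg.gpu_ids."""
    if gpu_ids is not None:
        warnings.warn('`--gpu-ids` is deprecated, please use `--gpu-id`; '
                      'only the first id is used in non-distributed testing.')
        return gpu_ids[:1]
    return [gpu_id]


# e.g. resolve_gpu_ids() -> [0]; resolve_gpu_ids(gpu_ids=[2, 3]) -> [2]
```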
24 changes: 20 additions & 4 deletions tools/classification/train.py
@@ -4,6 +4,7 @@
import os
import os.path as osp
import time
import warnings

import cv2
import mmcv
@@ -45,7 +46,13 @@ def parse_args():
'--gpu-ids',
type=int,
nargs='+',
help='ids of gpus to use '
help='(Deprecated, please use --gpu-id) ids of gpus to use '
'(only applicable to non-distributed training)')
group_gpus.add_argument(
'--gpu-id',
type=int,
default=0,
help='id of gpu to use '
'(only applicable to non-distributed training)')
parser.add_argument('--seed', type=int, default=None, help='random seed')
parser.add_argument(
@@ -87,10 +94,19 @@ def main():
osp.splitext(osp.basename(args.config))[0])
if args.resume_from is not None:
cfg.resume_from = args.resume_from
if args.gpus is not None:
cfg.gpu_ids = range(1)
warnings.warn('`--gpus` is deprecated because we only support '
'single GPU mode in non-distributed training. '
'Use `gpus=1` now.')
if args.gpu_ids is not None:
cfg.gpu_ids = args.gpu_ids
else:
cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus)
cfg.gpu_ids = args.gpu_ids[0:1]
warnings.warn('`--gpu-ids` is deprecated, please use `--gpu-id`. '
'Because we only support single GPU mode in '
'non-distributed training. Use the first GPU '
'in `gpu_ids` now.')
if args.gpus is None and args.gpu_ids is None:
cfg.gpu_ids = [args.gpu_id]

# init distributed env first, since logger depends on the dist info.
if args.launcher == 'none':
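
The training script adds one more layer because the even older `--gpus` flag is kept for compatibility: both `--gpus` and `--gpu-ids` warn and collapse to a single GPU, and `--gpu-id` applies only when neither is given. A condensed, illustrative sketch of that precedence, not a drop-in replacement for the script:

```python
import warnings


def resolve_train_gpu_ids(gpus=None, gpu_ids=None, gpu_id=0):
    # --gpus: deprecated, only single-GPU non-distributed training is supported
    if gpus is not None:
        warnings.warn('`--gpus` is deprecated; using gpus=1.')
        resolved = range(1)
    # --gpu-ids: deprecated, keep only the first id (overrides --gpus if both given)
    if gpu_ids is not None:
        warnings.warn('`--gpu-ids` is deprecated, please use `--gpu-id`; '
                      'using the first id only.')
        resolved = gpu_ids[:1]
    # --gpu-id: the non-deprecated way to pick the single GPU
    if gpus is None and gpu_ids is None:
        resolved = [gpu_id]
    return resolved
```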
24 changes: 22 additions & 2 deletions tools/detection/test.py
@@ -27,6 +27,18 @@ def parse_args():
nargs='+',
help='evaluation metrics, which depends on the dataset, e.g., "bbox",'
' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC')
parser.add_argument(
'--gpu-ids',
type=int,
nargs='+',
help='(Deprecated, please use --gpu-id) ids of gpus to use '
'(only applicable to non-distributed testing)')
parser.add_argument(
'--gpu-id',
type=int,
default=0,
help='id of gpu to use '
'(only applicable to non-distributed testing)')
parser.add_argument('--show', action='store_true', help='show results')
parser.add_argument(
'--show-dir', help='directory where painted images will be saved')
@@ -116,7 +128,14 @@ def main():
# currently only support single images testing
samples_per_gpu = cfg.data.test.pop('samples_per_gpu', 1)
assert samples_per_gpu == 1, 'currently only support single images testing'

if args.gpu_ids is not None:
cfg.gpu_ids = args.gpu_ids[0:1]
warnings.warn('`--gpu-ids` is deprecated, please use `--gpu-id`. '
'Because we only support single GPU mode in '
'non-distributed testing. Use the first GPU '
'in `gpu_ids` now.')
else:
cfg.gpu_ids = [args.gpu_id]
# init distributed env first, since logger depends on the dist info.
if args.launcher == 'none':
distributed = False
@@ -176,7 +195,8 @@ def main():
shuffle=False)

if not distributed:
model = MMDataParallel(model, device_ids=[0])
# Please use MMCV >= 1.4.4 for CPU testing!
model = MMDataParallel(model, device_ids=cfg.gpu_ids)
show_kwargs = dict(show_score_thr=args.show_score_thr)
if cfg.data.get('model_init', None) is not None:
from mmfewshot.detection.apis import (single_gpu_model_init,
25 changes: 20 additions & 5 deletions tools/detection/train.py
@@ -48,8 +48,14 @@ def parse_args():
'--gpu-ids',
type=int,
nargs='+',
help='ids of gpus to use '
help='(Deprecated, please use --gpu-id) ids of gpus to use '
'(only applicable to non-distributed training)')
parser.add_argument(
'--gpu-id',
type=int,
default=0,
help='id of gpu to use '
'(only applicable to non-distributed training)')
parser.add_argument('--seed', type=int, default=None, help='random seed')
parser.add_argument(
'--deterministic',
@@ -119,15 +125,24 @@ def main():
osp.splitext(osp.basename(args.config))[0])
if args.resume_from is not None:
cfg.resume_from = args.resume_from
if args.gpus is not None:
cfg.gpu_ids = range(1)
warnings.warn('`--gpus` is deprecated because we only support '
'single GPU mode in non-distributed training. '
'Use `gpus=1` now.')
if args.gpu_ids is not None:
cfg.gpu_ids = args.gpu_ids
else:
cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus)
cfg.gpu_ids = args.gpu_ids[0:1]
warnings.warn('`--gpu-ids` is deprecated, please use `--gpu-id`. '
'Because we only support single GPU mode in '
'non-distributed training. Use the first GPU '
'in `gpu_ids` now.')
if args.gpus is None and args.gpu_ids is None:
cfg.gpu_ids = [args.gpu_id]

# init distributed env first, since logger depends on the dist info.
if args.launcher == 'none':
distributed = False
rank, world_size = get_dist_info()
rank = 0
else:
distributed = True
init_dist(args.launcher, **cfg.dist_params)
