[WIP] continue PR #784 (#1221)
* Add gradient cumulative optimizer

fixes #190

* Update optimizer.py

* Update optimizer.py

* Fix improper loss scaling in the last equivalent_iter

* Add `GradientCumulativeOptimizerHook` in `__init__.py`.

* Add docstring of `GradientCumulativeOptimizerHook`.

* Add type check, BN warning and resume warning; fix typos and lint the code.

* Add unit test

* Update docstring example.

* Change GradientCumulativeOptimizerHook `__init__` arguments.

* Add GradientCumulativeOptimizerHook unit tests with IterBasedRunner.

* Add GradientCumulativeFp16OptimizerHook.

* Add unit tests of GradientCumulativeFp16OptimizerHook

* Use '!=' instead of '>' to determine resume

Co-authored-by: Zhiyuan Chen <this@zyc.ai>
mzr1996 and ZhiyuanChen committed Aug 23, 2021
1 parent 9f0cc5a commit 357b0df
Showing 4 changed files with 433 additions and 9 deletions.
13 changes: 8 additions & 5 deletions mmcv/runner/__init__.py
@@ -11,10 +11,12 @@
from .fp16_utils import LossScaler, auto_fp16, force_fp32, wrap_fp16_model
from .hooks import (HOOKS, CheckpointHook, ClosureHook, DistEvalHook,
DistSamplerSeedHook, DvcliveLoggerHook, EMAHook, EvalHook,
Fp16OptimizerHook, Hook, IterTimerHook, LoggerHook,
LrUpdaterHook, MlflowLoggerHook, NeptuneLoggerHook,
OptimizerHook, PaviLoggerHook, SyncBuffersHook,
TensorboardLoggerHook, TextLoggerHook, WandbLoggerHook)
Fp16OptimizerHook, GradientCumulativeFp16OptimizerHook,
GradientCumulativeOptimizerHook, Hook, IterTimerHook,
LoggerHook, LrUpdaterHook, MlflowLoggerHook,
NeptuneLoggerHook, OptimizerHook, PaviLoggerHook,
SyncBuffersHook, TensorboardLoggerHook, TextLoggerHook,
WandbLoggerHook)
from .iter_based_runner import IterBasedRunner, IterLoader
from .log_buffer import LogBuffer
from .optimizer import (OPTIMIZER_BUILDERS, OPTIMIZERS,
@@ -39,5 +41,6 @@
'SyncBuffersHook', 'EMAHook', 'build_runner', 'RUNNERS', 'allreduce_grads',
'allreduce_params', 'LossScaler', 'CheckpointLoader', 'BaseModule',
'_load_checkpoint_with_prefix', 'EvalHook', 'DistEvalHook', 'Sequential',
'ModuleList'
'ModuleList', 'GradientCumulativeOptimizerHook',
'GradientCumulativeFp16OptimizerHook'
]
6 changes: 4 additions & 2 deletions mmcv/runner/hooks/__init__.py
@@ -11,7 +11,8 @@
from .lr_updater import LrUpdaterHook
from .memory import EmptyCacheHook
from .momentum_updater import MomentumUpdaterHook
from .optimizer import Fp16OptimizerHook, OptimizerHook
from .optimizer import (Fp16OptimizerHook, GradientCumulativeFp16OptimizerHook,
GradientCumulativeOptimizerHook, OptimizerHook)
from .profiler import ProfilerHook
from .sampler_seed import DistSamplerSeedHook
from .sync_buffer import SyncBuffersHook
@@ -23,5 +24,6 @@
'PaviLoggerHook', 'TextLoggerHook', 'TensorboardLoggerHook',
'NeptuneLoggerHook', 'WandbLoggerHook', 'DvcliveLoggerHook',
'MomentumUpdaterHook', 'SyncBuffersHook', 'EMAHook', 'EvalHook',
'DistEvalHook', 'ProfilerHook'
'DistEvalHook', 'ProfilerHook', 'GradientCumulativeOptimizerHook',
'GradientCumulativeFp16OptimizerHook'
]
213 changes: 212 additions & 1 deletion mmcv/runner/hooks/optimizer.py
@@ -5,7 +5,7 @@

from torch.nn.utils import clip_grad

from mmcv.utils import TORCH_VERSION, digit_version
from mmcv.utils import TORCH_VERSION, _BatchNorm, digit_version
from ..dist_utils import allreduce_grads
from ..fp16_utils import LossScaler, wrap_fp16_model
from .hook import HOOKS, Hook
@@ -42,6 +42,91 @@ def after_train_iter(self, runner):
runner.optimizer.step()


@HOOKS.register_module()
class GradientCumulativeOptimizerHook(OptimizerHook):
"""Optimizer Hook implements multi-iters gradient cumulating.
Args:
cumulative_iters (int, optional): Num of gradient cumulative iters.
The optimizer will step every `cumulative_iters` iters.
Defaults to 1.
Examples:
>>> # Use cumulative_iters to simulate a large batch size
>>> # It is helpful when the hardware cannot handle a large batch size.
>>> loader = DataLoader(data, batch_size=64)
>>> optim_hook = GradientCumulativeOptimizerHook(cumulative_iters=4)
>>> # almost equals to
>>> loader = DataLoader(data, batch_size=256)
>>> optim_hook = OptimizerHook()
"""

def __init__(self, cumulative_iters=1, **kwargs):
super(GradientCumulativeOptimizerHook, self).__init__(**kwargs)

assert isinstance(cumulative_iters, int) and cumulative_iters > 0, \
f'cumulative_iters only accepts positive int, but got ' \
            f'{cumulative_iters} of type {type(cumulative_iters)} instead.'

self.cumulative_iters = cumulative_iters
self.divisible_iters = 0
self.remainder_iters = 0
self.initialized = False

def has_batch_norm(self, module):
if isinstance(module, _BatchNorm):
return True
for m in module.children():
if self.has_batch_norm(m):
return True
return False

def _init(self, runner):
if runner.iter % self.cumulative_iters != 0:
runner.logger.warning(
'Resume iter number is not divisible by cumulative_iters in '
'GradientCumulativeOptimizerHook, which means the gradient of '
'some iters is lost and the result may be influenced slightly.'
)

if self.has_batch_norm(runner.model) and self.cumulative_iters > 1:
runner.logger.warning(
'GradientCumulativeOptimizerHook may slightly decrease '
'performance if the model has BatchNorm layers.')

residual_iters = runner.max_iters - runner.iter

self.divisible_iters = (
residual_iters // self.cumulative_iters * self.cumulative_iters)
self.remainder_iters = residual_iters - self.divisible_iters

self.initialized = True

def after_train_iter(self, runner):
if not self.initialized:
self._init(runner)

if runner.iter < self.divisible_iters:
loss_factor = self.cumulative_iters
else:
loss_factor = self.remainder_iters
loss = runner.outputs['loss']
loss = loss / loss_factor
loss.backward()

if (self.every_n_iters(runner, self.cumulative_iters)
or self.is_last_iter(runner)):

if self.grad_clip is not None:
grad_norm = self.clip_grads(runner.model.parameters())
if grad_norm is not None:
# Add grad norm to the logger
runner.log_buffer.update({'grad_norm': float(grad_norm)},
runner.outputs['num_samples'])
runner.optimizer.step()
runner.optimizer.zero_grad()
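
For readers unfamiliar with gradient accumulation, the plain-PyTorch loop below sketches the pattern that the hook above automates inside `after_train_iter`. It is an illustration only and not part of this commit; the toy model and random data are made up:

import torch
import torch.nn as nn

model = nn.Linear(10, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
cumulative_iters = 4  # same role as the hook's `cumulative_iters` argument

for i in range(100):
    x = torch.randn(16, 10)
    loss = model(x).pow(2).mean()
    # Average the loss over the accumulation window, as the hook does with
    # `loss_factor`, then accumulate gradients with backward().
    (loss / cumulative_iters).backward()
    if (i + 1) % cumulative_iters == 0:
        # Step and clear gradients only once per window, roughly emulating
        # a 4x larger batch size.
        optimizer.step()
        optimizer.zero_grad()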


if (TORCH_VERSION != 'parrots'
and digit_version(TORCH_VERSION) >= digit_version('1.6.0')):

@@ -152,6 +237,60 @@ def after_train_iter(self, runner):
# save state_dict of loss_scaler
runner.meta.setdefault(
'fp16', {})['loss_scaler'] = self.loss_scaler.state_dict()

@HOOKS.register_module()
class GradientCumulativeFp16OptimizerHook(GradientCumulativeOptimizerHook,
Fp16OptimizerHook):
"""Fp16 optimizer Hook (using PyTorch's implementation) implements
multi-iters gradient cumulating.
If you are using PyTorch >= 1.6, torch.cuda.amp is used as the backend,
to take care of the optimization procedure.
"""

def __init__(self, *args, **kwargs):
super(GradientCumulativeFp16OptimizerHook,
self).__init__(*args, **kwargs)

def after_train_iter(self, runner):
if not self.initialized:
self._init(runner)

if runner.iter < self.divisible_iters:
loss_factor = self.cumulative_iters
else:
loss_factor = self.remainder_iters
loss = runner.outputs['loss']
loss = loss / loss_factor

self.loss_scaler.scale(loss).backward()

if (self.every_n_iters(runner, self.cumulative_iters)
or self.is_last_iter(runner)):

# copy fp16 grads in the model to fp32 params in the optimizer
self.loss_scaler.unscale_(runner.optimizer)

if self.grad_clip is not None:
grad_norm = self.clip_grads(runner.model.parameters())
if grad_norm is not None:
# Add grad norm to the logger
runner.log_buffer.update(
{'grad_norm': float(grad_norm)},
runner.outputs['num_samples'])

# backward and update scaler
self.loss_scaler.step(runner.optimizer)
self.loss_scaler.update(self._scale_update_param)

# save state_dict of loss_scaler
runner.meta.setdefault(
'fp16', {})['loss_scaler'] = self.loss_scaler.state_dict()

# clear grads
runner.model.zero_grad()
runner.optimizer.zero_grad()
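
For comparison, a minimal standalone sketch of the same accumulation pattern written directly against torch.cuda.amp, which this hook wraps. Illustration only, not part of this commit; it assumes a CUDA device and uses a toy model:

import torch
import torch.nn as nn

model = nn.Linear(10, 2).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scaler = torch.cuda.amp.GradScaler()
cumulative_iters = 4

for i in range(100):
    x = torch.randn(16, 10, device='cuda')
    with torch.cuda.amp.autocast():
        loss = model(x).pow(2).mean()
    # Scale the averaged loss and accumulate gradients, as the hook does.
    scaler.scale(loss / cumulative_iters).backward()
    if (i + 1) % cumulative_iters == 0:
        scaler.unscale_(optimizer)  # unscale before gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=35)
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()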

else:

@HOOKS.register_module()
@@ -295,3 +434,75 @@ def after_train_iter(self, runner):
# save state_dict of loss_scaler
runner.meta.setdefault(
'fp16', {})['loss_scaler'] = self.loss_scaler.state_dict()

@HOOKS.register_module()
class GradientCumulativeFp16OptimizerHook(GradientCumulativeOptimizerHook,
Fp16OptimizerHook):
"""Fp16 optimizer Hook (using mmcv implementation) implements multi-
iters gradient cumulating."""

def __init__(self, *args, **kwargs):
super(GradientCumulativeFp16OptimizerHook,
self).__init__(*args, **kwargs)

def after_train_iter(self, runner):
if not self.initialized:
self._init(runner)

if runner.iter < self.divisible_iters:
loss_factor = self.cumulative_iters
else:
loss_factor = self.remainder_iters

loss = runner.outputs['loss']
loss = loss / loss_factor

# scale the loss value
scaled_loss = loss * self.loss_scaler.loss_scale
scaled_loss.backward()

if (self.every_n_iters(runner, self.cumulative_iters)
or self.is_last_iter(runner)):

# copy fp16 grads in the model to fp32 params in the optimizer
fp32_weights = []
for param_group in runner.optimizer.param_groups:
fp32_weights += param_group['params']
self.copy_grads_to_fp32(runner.model, fp32_weights)
# allreduce grads
if self.distributed:
allreduce_grads(fp32_weights, self.coalesce,
self.bucket_size_mb)

has_overflow = self.loss_scaler.has_overflow(fp32_weights)
# if has overflow, skip this iteration
if not has_overflow:
# scale the gradients back
for param in fp32_weights:
if param.grad is not None:
param.grad.div_(self.loss_scaler.loss_scale)
if self.grad_clip is not None:
grad_norm = self.clip_grads(fp32_weights)
if grad_norm is not None:
# Add grad norm to the logger
runner.log_buffer.update(
{'grad_norm': float(grad_norm)},
runner.outputs['num_samples'])
# update fp32 params
runner.optimizer.step()
# copy fp32 params to the fp16 model
self.copy_params_to_fp16(runner.model, fp32_weights)
else:
runner.logger.warning(
'Check overflow, downscale loss scale '
f'to {self.loss_scaler.cur_scale}')

self.loss_scaler.update_scale(has_overflow)

# save state_dict of loss_scaler
runner.meta.setdefault(
'fp16', {})['loss_scaler'] = self.loss_scaler.state_dict()

# clear grads
runner.model.zero_grad()
runner.optimizer.zero_grad()
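
A hypothetical end-to-end usage sketch (not part of this commit) of how the new fp16 hook might be registered; `model`, `optimizer` and `logger` are placeholders assumed to exist already:

from mmcv.runner import (GradientCumulativeFp16OptimizerHook, IterBasedRunner,
                         wrap_fp16_model)

wrap_fp16_model(model)  # prepare the model for fp16 training
runner = IterBasedRunner(
    model, optimizer=optimizer, work_dir='./work_dir', logger=logger,
    max_iters=1000)
# Accumulate gradients over 4 iterations and use a static loss scale of 512;
# `distributed=False` keeps the sketch single-GPU.
runner.register_hook(
    GradientCumulativeFp16OptimizerHook(
        cumulative_iters=4, loss_scale=512., distributed=False))

A call to `runner.run(...)` with the data loaders and workflow would then start training as usual.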
