From 1562415f0e3c98aaf664d47be948af9ea8fd375b Mon Sep 17 00:00:00 2001
From: Iden Kalemaj
Date: Thu, 3 Apr 2025 15:05:18 -0700
Subject: [PATCH 1/2] Replace register_backward_hook with register_full_backward_hook (#720)

Summary:
Pull Request resolved: https://github.com/pytorch/opacus/pull/720

`register_backward_hook` is deprecated and may lead to errors in gradient calculation. We switch to the supported `register_full_backward_hook`.

Differential Revision: D68562558

Reviewed By: HuanyuZhang
---
 opacus/grad_sample/grad_sample_module.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/opacus/grad_sample/grad_sample_module.py b/opacus/grad_sample/grad_sample_module.py
index 19b5ffa6..745bb98b 100644
--- a/opacus/grad_sample/grad_sample_module.py
+++ b/opacus/grad_sample/grad_sample_module.py
@@ -207,7 +207,7 @@ def add_hooks(
             )
 
             self.autograd_grad_sample_hooks.append(
-                module.register_backward_hook(
+                module.register_full_backward_hook(
                     partial(
                         self.capture_backprops_hook,
                         loss_reduction=loss_reduction,
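A note on the API adopted above: `Module.register_full_backward_hook` follows the same registration pattern as the deprecated `register_backward_hook`, including use with `functools.partial`, and returns a handle that can later be removed, which is why the Opacus code above appends the returned handles to `autograd_grad_sample_hooks`. The sketch below shows that pattern on a toy layer; the module and hook names are illustrative only and are not Opacus internals.

    # Minimal, self-contained sketch of the hook API this patch switches to.
    # The toy module and the capture_backprops hook are illustrative, not Opacus code.
    from functools import partial

    import torch
    import torch.nn as nn


    def capture_backprops(module, grad_input, grad_output, loss_reduction="mean"):
        # grad_output holds gradients w.r.t. the module's outputs; Opacus-style code
        # combines these with saved activations to build per-sample gradients.
        print(type(module).__name__, loss_reduction, grad_output[0].shape)


    model = nn.Linear(4, 2)
    # register_full_backward_hook replaces the deprecated register_backward_hook;
    # it returns a RemovableHandle that can be stored and later .remove()-d.
    handle = model.register_full_backward_hook(
        partial(capture_backprops, loss_reduction="mean")
    )

    out = model(torch.randn(8, 4))
    out.sum().backward()  # the hook fires during this backward pass
    handle.remove()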
From 658b9edbee608f78ccc19bd228ca65fd85b72e4a Mon Sep 17 00:00:00 2001
From: Iden Kalemaj
Date: Thu, 3 Apr 2025 15:49:49 -0700
Subject: [PATCH 2/2] Use SimpleDistributedPerLayerClipping optimizer in hooks mode (#750)

Summary:
Pull Request resolved: https://github.com/pytorch/opacus/pull/750

We use SimpleDistributedPerLayerOptimizer instead of DistributedPerLayerOptimizer. The latter breaks when switching to `register_full_backward_hook`: DistributedPerLayerOptimizer registers per-parameter hooks on top of the per-module hooks, and during the backward pass the per-parameter hooks fire before the per-module hooks. Since per-sample gradients are only computed when the per-module hooks fire, the per-parameter hooks raise an error when they try to access per-sample gradients that have not been computed yet. PyTorch does not provide a way to force the order in which hooks are called.

Differential Revision: D72420168
---
 opacus/optimizers/__init__.py      | 10 ++--------
 opacus/tests/multigpu_gradcheck.py | 10 ++--------
 2 files changed, 4 insertions(+), 16 deletions(-)

diff --git a/opacus/optimizers/__init__.py b/opacus/optimizers/__init__.py
index 88f79a8d..bac211d0 100644
--- a/opacus/optimizers/__init__.py
+++ b/opacus/optimizers/__init__.py
@@ -13,10 +13,7 @@
 # limitations under the License.
 
 from .adaclipoptimizer import AdaClipDPOptimizer
-from .ddp_perlayeroptimizer import (
-    DistributedPerLayerOptimizer,
-    SimpleDistributedPerLayerOptimizer,
-)
+from .ddp_perlayeroptimizer import SimpleDistributedPerLayerOptimizer
 from .ddpoptimizer import DistributedDPOptimizer
 from .ddpoptimizer_fast_gradient_clipping import (
     DistributedDPOptimizerFastGradientClipping,
@@ -28,7 +25,6 @@
 
 __all__ = [
     "AdaClipDPOptimizer",
-    "DistributedPerLayerOptimizer",
     "DistributedDPOptimizer",
     "DPOptimizer",
     "DPOptimizerFastGradientClipping",
@@ -55,9 +51,7 @@ def get_optimizer_class(clipping: str, distributed: bool, grad_sample_mode: str
     elif clipping == "per_layer" and distributed is False:
         return DPPerLayerOptimizer
     elif clipping == "per_layer" and distributed is True:
-        if grad_sample_mode == "hooks":
-            return DistributedPerLayerOptimizer
-        elif grad_sample_mode == "ew":
+        if grad_sample_mode == "hooks" or grad_sample_mode == "ew":
             return SimpleDistributedPerLayerOptimizer
         else:
             raise ValueError(f"Unexpected grad_sample_mode: {grad_sample_mode}")
diff --git a/opacus/tests/multigpu_gradcheck.py b/opacus/tests/multigpu_gradcheck.py
index 6242d8e1..1e8e8456 100644
--- a/opacus/tests/multigpu_gradcheck.py
+++ b/opacus/tests/multigpu_gradcheck.py
@@ -26,10 +26,7 @@
 from opacus import PrivacyEngine
 from opacus.distributed import DifferentiallyPrivateDistributedDataParallel as DPDDP
 from opacus.grad_sample import GradSampleModuleFastGradientClipping
-from opacus.optimizers.ddp_perlayeroptimizer import (
-    DistributedPerLayerOptimizer,
-    SimpleDistributedPerLayerOptimizer,
-)
+from opacus.optimizers.ddp_perlayeroptimizer import SimpleDistributedPerLayerOptimizer
 from opacus.optimizers.ddpoptimizer import DistributedDPOptimizer
 from opacus.optimizers.ddpoptimizer_fast_gradient_clipping import (
     DistributedDPOptimizerFastGradientClipping,
@@ -165,10 +162,7 @@ def demo_basic(rank, weight, world_size, dp, clipping, grad_sample_mode):
         grad_sample_mode=grad_sample_mode,
     )
     if clipping == "per_layer":
-        assert isinstance(
-            optimizer,
-            (DistributedPerLayerOptimizer, SimpleDistributedPerLayerOptimizer),
-        )
+        assert isinstance(optimizer, SimpleDistributedPerLayerOptimizer)
     else:
         assert isinstance(optimizer, DistributedDPOptimizer)
 
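To make the ordering problem described in the summary concrete, the sketch below registers both kinds of hooks on one toy layer: a per-module full backward hook and a per-parameter gradient hook. The layer, the hook bodies, and the events list are illustrative only, not Opacus code; the point is that their relative firing order during backward() is decided by autograd and cannot be forced from user code.

    # Illustrative sketch (not Opacus code) of the two hook types discussed above.
    import torch
    import torch.nn as nn

    events = []

    model = nn.Linear(4, 2)

    # Per-module hook: fires with gradients w.r.t. the module's outputs/inputs.
    model.register_full_backward_hook(
        lambda module, grad_input, grad_output: events.append("module hook")
    )

    # Per-parameter hook: fires with the gradient of this specific parameter.
    model.weight.register_hook(lambda grad: events.append("weight hook"))

    model(torch.randn(8, 4)).sum().backward()
    print(events)  # the relative order is up to autograd; user code cannot force it

Routing grad_sample_mode="hooks" to SimpleDistributedPerLayerOptimizer, as the change above does, removes the dependency on that unspecified ordering.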