1 change: 1 addition & 0 deletions opacus/grad_sample/embedding.py
@@ -15,6 +15,7 @@

from typing import Dict, List

from opacus.grad_sample import embedding_norm_sample
import torch
import torch.nn as nn
from opacus.grad_sample import embedding_norm_sample
6 changes: 6 additions & 0 deletions opacus/grad_sample/embedding_norm_sample.py
@@ -46,6 +46,12 @@ def compute_embedding_norm_sample(
activations: [tensor([[1, 1],
[2, 0],
[2, 0]])]
backprops: tensor([[0.2000],
[0.2000],
[0.3000],
[0.1000],
[0.3000],
[0.1000]])
backprops: tensor([[[0.2], [0.2]],
[[0.3], [0.1]],
[[0.3], [0.1]]])
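For reference, the per-example norms implied by the docstring example above can be reproduced with a small brute-force check: accumulate the output gradients by token id within each example, then take the L2 norm of the resulting per-example weight gradient. The sketch below only illustrates the expected values; the helper name per_sample_embedding_norms is made up here and is not part of Opacus, which computes the same norms without materializing the per-sample gradients.

import torch

def per_sample_embedding_norms(input_ids, backprops, num_embeddings):
    # Brute-force reference: build the per-example gradient of the embedding
    # weight by summing gradient rows per token id, then take its L2 norm.
    norms = []
    for ids_i, grads_i in zip(input_ids, backprops):
        grad = torch.zeros(num_embeddings, grads_i.shape[-1])
        for token_id, grad_row in zip(ids_i, grads_i):
            grad[token_id] += grad_row
        norms.append(grad.norm())
    return torch.stack(norms)

# Values taken from the docstring example above.
input_ids = torch.tensor([[1, 1], [2, 0], [2, 0]])
backprops = torch.tensor([[[0.2], [0.2]], [[0.3], [0.1]], [[0.3], [0.1]]])
print(per_sample_embedding_norms(input_ids, backprops, num_embeddings=3))
# -> tensor([0.4000, 0.3162, 0.3162])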
16 changes: 13 additions & 3 deletions opacus/privacy_engine.py
@@ -309,7 +309,7 @@ def make_private(
noise_generator=None,
grad_sample_mode: str = "hooks",
**kwargs,
) -> Tuple[GradSampleModule, DPOptimizer, DataLoader]:
):
"""
Add privacy-related responsibilities to the main PyTorch training objects:
model, optimizer, and the data loader.
@@ -359,12 +359,15 @@
details

Returns:
Tuple of (model, optimizer, data_loader).
Tuple of (model, optimizer, criterion (if grad_sample_mode="ghost"), data_loader).

Model is a wrapper around the original model that also computes per sample
gradients
Optimizer is a wrapper around the original optimizer that also does
gradient clipping and noise addition to the gradients
Criterion is a wrapper around the original criterion that does two
backward passes under the hood. Returned if grad_sample_mode is
"ghost".
DataLoader is a brand new DataLoader object, constructed to behave as
equivalent to the original data loader, possibly with updated
sampling mechanism. Points to the same dataset object.
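As a usage illustration of the return-value change documented above, here is a minimal, self-contained sketch. The toy model, optimizer, and loader are placeholders, and passing criterion into make_private is an assumption based on this docstring rather than something shown in the diff: with the default "hooks" mode three objects come back, while grad_sample_mode="ghost" additionally returns the wrapped criterion.

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from opacus import PrivacyEngine

# Toy training objects, just to exercise the API.
model = nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
criterion = nn.CrossEntropyLoss()
data_loader = DataLoader(
    TensorDataset(torch.randn(8, 4), torch.randint(0, 2, (8,))), batch_size=4
)

privacy_engine = PrivacyEngine()
use_ghost_clipping = True

if use_ghost_clipping:
    # Ghost clipping: the wrapped criterion is returned too and runs the two
    # backward passes internally when called.
    model, optimizer, criterion, data_loader = privacy_engine.make_private(
        module=model,
        optimizer=optimizer,
        data_loader=data_loader,
        criterion=criterion,  # assumed to be accepted alongside grad_sample_mode="ghost"
        noise_multiplier=1.0,
        max_grad_norm=1.0,
        grad_sample_mode="ghost",
    )
else:
    # Default "hooks" mode: three return values, as before.
    model, optimizer, data_loader = privacy_engine.make_private(
        module=model,
        optimizer=optimizer,
        data_loader=data_loader,
        noise_multiplier=1.0,
        max_grad_norm=1.0,
    )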
@@ -497,17 +500,23 @@ def make_private_with_epsilon(
details

Returns:
Tuple of (model, optimizer, data_loader).
Tuple of (model, optimizer, criterion (if grad_sample_mode="ghost"), data_loader).

Model is a wrapper around the original model that also computes per sample
gradients
Optimizer is a wrapper around the original optimizer that also does
gradient clipping and noise addition to the gradients
Criterion is a wrapper around the original criterion that does two
backward passes under the hood. Returned if grad_sample_mode is
"ghost".
DataLoader is a brand new DataLoader object, constructed to behave as
equivalent to the original data loader, possibly with updated
sampling mechanism. Points to the same dataset object.
"""
sample_rate = 1 / len(data_loader)
epsilon_tolerance = kwargs.get(
"epsilon_tolerance", 0.01
) # same default as in get_noise_multiplier

if len(self.accountant) > 0:
warnings.warn(
@@ -527,6 +536,7 @@ def make_private_with_epsilon(
sample_rate=sample_rate,
epochs=epochs,
accountant=self.accountant.mechanism(),
epsilon_tolerance=epsilon_tolerance,
**kwargs,
),
max_grad_norm=max_grad_norm,
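Similarly, a minimal sketch of the make_private_with_epsilon path touched above (hyperparameter values are placeholders): the engine derives the noise multiplier from the target budget via get_noise_multiplier, and the optional epsilon_tolerance keyword now reaches that call, with the same 0.01 default.

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from opacus import PrivacyEngine

model = nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
data_loader = DataLoader(
    TensorDataset(torch.randn(8, 4), torch.randint(0, 2, (8,))), batch_size=4
)

privacy_engine = PrivacyEngine()
model, optimizer, data_loader = privacy_engine.make_private_with_epsilon(
    module=model,
    optimizer=optimizer,
    data_loader=data_loader,
    epochs=5,
    target_epsilon=3.0,
    target_delta=1e-5,
    max_grad_norm=1.0,
    epsilon_tolerance=0.01,  # optional; forwarded to get_noise_multiplier (default 0.01)
)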
@@ -351,6 +351,7 @@ def test_norm_calculation(self):
diff = flat_norms_normal - flat_norms_gc

logging.info(f"Diff = {diff}")

msg = "Fail: Gradient norms from vanilla DP-SGD and from fast gradient clipping are different"
assert torch.allclose(flat_norms_normal, flat_norms_gc, atol=1e-3), msg

21 changes: 15 additions & 6 deletions opacus/tests/grad_samples/embedding_norm_sample_test.py
@@ -15,6 +15,7 @@

import unittest


import torch
import torch.nn as nn
from opacus.grad_sample import embedding_norm_sample
@@ -36,11 +37,15 @@ def test_compute_embedding_norm_sample(self):
# Example input ids (activations). Shape: [3, 2]
input_ids = torch.tensor([[1, 1], [2, 0], [2, 0]], dtype=torch.long)

# Example backprops. Shape: [3, 2, 1]
backprops = torch.tensor(
[[[0.2], [0.2]], [[0.3], [0.1]], [[0.3], [0.1]]], dtype=torch.float32
# Example gradients with respect to the embedding output (backprops).
# Shape: [6, 1]
grad_values = torch.tensor(
[[0.2], [0.2], [0.3], [0.1], [0.3], [0.1]], dtype=torch.float32
)

# Simulate backprop through embedding layer
backprops = grad_values

# Wrap input_ids in a list as expected by the norm sample function
activations = [input_ids]

@@ -66,17 +71,17 @@ def test_compute_embedding_norm_sample_with_non_one_embedding_dim(self):

# Manually set weights for the embedding layer for testing
embedding_layer.weight = nn.Parameter(
torch.tensor([[0.1, 0.1], [0.2, 0.2], [0.3, 0.3]], dtype=torch.float32)
torch.tensor([[0.1], [0.2], [0.3]], dtype=torch.float32)
)

# Example input ids (activations). Shape: [6, 1, 1].
input_ids = torch.tensor(
[[[1]], [[1]], [[2]], [[0]], [[2]], [[0]]], dtype=torch.long
)

# Example backprops per input id, with embedding_dim=2.
# Example gradients per input id, with embedding_dim=2.
# Shape: [6, 1, 1, 2]
backprops = torch.tensor(
grad_values = torch.tensor(
[
[[[0.2, 0.2]]],
[[[0.2, 0.2]]],
@@ -88,6 +93,9 @@ def test_compute_embedding_norm_sample_with_non_one_embedding_dim(self):
dtype=torch.float32,
)

# Simulate backprop through embedding layer
backprops = grad_values

# Wrap input_ids in a list as expected by the grad norm function
activations = [input_ids]

@@ -204,6 +212,7 @@ def test_compute_embedding_norm_sample_with_extra_activations_per_example(self):
expected_norms = torch.tensor(
[0.0150, 0.0071, 0.0005, 0.0081, 0.0039], dtype=torch.float32
)
print("expected_norms: ", expected_norms)
computed_norms = result[embedding_layer.weight]

# Verify the computed norms match the expected norms