[Enhance] Support flat param decay mult and fix bias decay mult of depth-wise conv. #771

Merged · 6 commits · Dec 16, 2022
Changes from 4 commits
32 changes: 20 additions & 12 deletions mmengine/optim/optimizer/default_constructor.py
@@ -184,12 +184,13 @@ def add_params(self,
         # first sort with alphabet order and then sort with reversed len of str
         sorted_keys = sorted(sorted(custom_keys.keys()), key=len, reverse=True)

-        bias_lr_mult = self.paramwise_cfg.get('bias_lr_mult', 1.)
-        bias_decay_mult = self.paramwise_cfg.get('bias_decay_mult', 1.)
-        norm_decay_mult = self.paramwise_cfg.get('norm_decay_mult', 1.)
-        dwconv_decay_mult = self.paramwise_cfg.get('dwconv_decay_mult', 1.)
+        bias_lr_mult = self.paramwise_cfg.get('bias_lr_mult', None)
+        bias_decay_mult = self.paramwise_cfg.get('bias_decay_mult', None)
+        norm_decay_mult = self.paramwise_cfg.get('norm_decay_mult', None)
+        dwconv_decay_mult = self.paramwise_cfg.get('dwconv_decay_mult', None)
+        flat_decay_mult = self.paramwise_cfg.get('flat_decay_mult', None)
         bypass_duplicate = self.paramwise_cfg.get('bypass_duplicate', False)
-        dcn_offset_lr_mult = self.paramwise_cfg.get('dcn_offset_lr_mult', 1.)
+        dcn_offset_lr_mult = self.paramwise_cfg.get('dcn_offset_lr_mult', None)

         # special rules for norm layers and depth-wise conv layers
         is_norm = isinstance(module,
@@ -225,29 +226,36 @@ def add_params(self,
             if not is_custom:
                 # bias_lr_mult affects all bias parameters
                 # except for norm.bias dcn.conv_offset.bias
-                if name == 'bias' and not (is_norm or is_dcn_module):
+                if name == 'bias' and not (
+                        is_norm or is_dcn_module) and bias_lr_mult is not None:
                     param_group['lr'] = self.base_lr * bias_lr_mult

                 if (prefix.find('conv_offset') != -1 and is_dcn_module
+                        and dcn_offset_lr_mult is not None
                         and isinstance(module, torch.nn.Conv2d)):
                     # deal with both dcn_offset's bias & weight
                     param_group['lr'] = self.base_lr * dcn_offset_lr_mult

                 # apply weight decay policies
                 if self.base_wd is not None:
                     # norm decay
-                    if is_norm:
+                    if is_norm and norm_decay_mult is not None:
                         param_group[
                             'weight_decay'] = self.base_wd * norm_decay_mult
+                    # bias lr and decay
+                    elif (name == 'bias' and not is_dcn_module
+                          and bias_decay_mult is not None):
+                        param_group[
+                            'weight_decay'] = self.base_wd * bias_decay_mult
                     # depth-wise conv
-                    elif is_dwconv:
+                    elif is_dwconv and dwconv_decay_mult is not None:
                         param_group[
                             'weight_decay'] = self.base_wd * dwconv_decay_mult
-                    # bias lr and decay
-                    elif name == 'bias' and not is_dcn_module:
-                        # TODO: current bias_decay_mult will have affect on DCN
+                    # flatten parameters except dcn offset
+                    elif (param.ndim == 1 and not is_dcn_module
+                          and flat_decay_mult is not None):
                         param_group[
-                            'weight_decay'] = self.base_wd * bias_decay_mult
+                            'weight_decay'] = self.base_wd * flat_decay_mult
             params.append(param_group)
             for key, value in param_group.items():
                 if key == 'params':
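With the new `None` defaults, a multiplier only takes effect when it is set explicitly in `paramwise_cfg`; anything left unset keeps the optimizer's base learning rate and weight decay. A minimal usage sketch of the new `flat_decay_mult` option (the toy model and the chosen values are illustrative assumptions, not taken from this PR):

```python
# Illustrative sketch only; requires torch and mmengine to be installed.
import torch.nn as nn
from mmengine.optim import DefaultOptimWrapperConstructor

model = nn.Sequential(
    nn.Conv2d(3, 8, 3),             # regular conv: weight keeps the base weight decay
    nn.Conv2d(8, 8, 3, groups=8),   # depth-wise conv (in_channels == groups)
    nn.BatchNorm2d(8),              # norm layer
)

optim_wrapper_cfg = dict(
    type='OptimWrapper',
    optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=1e-4))

# Only options that are set take effect; options left at the new default (None)
# keep the base lr / weight decay for the corresponding parameter groups.
paramwise_cfg = dict(
    norm_decay_mult=0.,    # no decay on norm weights and biases
    dwconv_decay_mult=0.,  # no decay on depth-wise conv parameters
    flat_decay_mult=0.,    # no decay on remaining 1-D params (e.g. plain biases)
)

optim_wrapper = DefaultOptimWrapperConstructor(optim_wrapper_cfg, paramwise_cfg)(model)
for group in optim_wrapper.optimizer.param_groups:
    print(group['weight_decay'], [p.shape for p in group['params']])
```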
21 changes: 13 additions & 8 deletions tests/test_optim/test_optimizer/test_optimizer.py
@@ -123,6 +123,7 @@ def _check_sgd_optimizer(self,
                              norm_decay_mult=1,
                              dwconv_decay_mult=1,
                              dcn_offset_lr_mult=1,
+                             flat_decay_mult=1,
                              bypass_duplicate=False):
         param_groups = optimizer.param_groups
         assert isinstance(optimizer, torch.optim.SGD)
@@ -139,7 +140,7 @@ def _check_sgd_optimizer(self,
         # param1
         param1 = param_groups[0]
         assert param1['lr'] == self.base_lr
-        assert param1['weight_decay'] == self.base_wd
+        assert param1['weight_decay'] == self.base_wd * flat_decay_mult
         # conv1.weight
         conv1_weight = param_groups[1]
         assert conv1_weight['lr'] == self.base_lr
@@ -163,7 +164,7 @@ def _check_sgd_optimizer(self,
         # sub.param1
         sub_param1 = param_groups[6]
         assert sub_param1['lr'] == self.base_lr
-        assert sub_param1['weight_decay'] == self.base_wd
+        assert sub_param1['weight_decay'] == self.base_wd * flat_decay_mult
         # sub.conv1.weight
         sub_conv1_weight = param_groups[7]
         assert sub_conv1_weight['lr'] == self.base_lr
@@ -172,8 +173,7 @@ def _check_sgd_optimizer(self,
         # sub.conv1.bias
         sub_conv1_bias = param_groups[8]
         assert sub_conv1_bias['lr'] == self.base_lr * bias_lr_mult
-        assert sub_conv1_bias[
-            'weight_decay'] == self.base_wd * dwconv_decay_mult
+        assert sub_conv1_bias['weight_decay'] == self.base_wd * bias_decay_mult
         # sub.gn.weight
         sub_gn_weight = param_groups[9]
         assert sub_gn_weight['lr'] == self.base_lr
@@ -258,7 +258,8 @@ def test_build_default_optimizer_constructor(self):
             bias_decay_mult=0.5,
             norm_decay_mult=0,
             dwconv_decay_mult=0.1,
-            dcn_offset_lr_mult=0.1)
+            dcn_offset_lr_mult=0.1,
+            flat_decay_mult=0.3)
         optim_constructor_cfg = dict(
             type='DefaultOptimWrapperConstructor',
             optim_wrapper_cfg=optim_wrapper,
@@ -390,7 +391,8 @@ def test_default_optimizer_constructor_with_model_wrapper(self):
             bias_decay_mult=0.5,
             norm_decay_mult=0,
             dwconv_decay_mult=0.1,
-            dcn_offset_lr_mult=0.1)
+            dcn_offset_lr_mult=0.1,
+            flat_decay_mult=0.3)
         optim_constructor = DefaultOptimWrapperConstructor(
             optim_wrapper_cfg, paramwise_cfg)
         optim_wrapper = optim_constructor(model)
@@ -429,7 +431,8 @@ def test_default_optimizer_constructor_with_model_wrapper(self):
             bias_decay_mult=0.5,
             norm_decay_mult=0,
             dwconv_decay_mult=0.1,
-            dcn_offset_lr_mult=0.1)
+            dcn_offset_lr_mult=0.1,
+            flat_decay_mult=0.3)
         optim_constructor = DefaultOptimWrapperConstructor(
             optim_wrapper_cfg, paramwise_cfg)
         optim_wrapper = optim_constructor(model)
@@ -484,7 +487,8 @@ def test_default_optimizer_constructor_with_paramwise_cfg(self):
             bias_decay_mult=0.5,
             norm_decay_mult=0,
             dwconv_decay_mult=0.1,
-            dcn_offset_lr_mult=0.1)
+            dcn_offset_lr_mult=0.1,
+            flat_decay_mult=0.3)
         optim_constructor = DefaultOptimWrapperConstructor(
             optim_wrapper_cfg, paramwise_cfg)
         optim_wrapper = optim_constructor(self.model)
@@ -554,6 +558,7 @@ def test_default_optimizer_constructor_bypass_duplicate(self):
             norm_decay_mult=0,
             dwconv_decay_mult=0.1,
             dcn_offset_lr_mult=0.1,
+            flat_decay_mult=0.3,
             bypass_duplicate=True)
         optim_constructor = DefaultOptimWrapperConstructor(
             optim_wrapper_cfg, paramwise_cfg)
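The updated `sub.conv1.bias` assertion reflects the fix in the PR title: the bias of a depth-wise conv now follows `bias_decay_mult` instead of `dwconv_decay_mult`. A standalone sketch of that behaviour (the module and values are illustrative assumptions, not the test fixture used above):

```python
# Illustrative check, not part of the test suite: a depth-wise conv bias
# should now pick up bias_decay_mult rather than dwconv_decay_mult.
import torch.nn as nn
from mmengine.optim import DefaultOptimWrapperConstructor

dwconv = nn.Conv2d(8, 8, 3, groups=8)  # in_channels == groups -> depth-wise conv

optim_wrapper_cfg = dict(
    type='OptimWrapper',
    optimizer=dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=1e-4))
paramwise_cfg = dict(bias_decay_mult=0.5, dwconv_decay_mult=0.1)

optim_wrapper = DefaultOptimWrapperConstructor(
    optim_wrapper_cfg, paramwise_cfg)(dwconv)

weight_group, bias_group = optim_wrapper.optimizer.param_groups
assert weight_group['weight_decay'] == 1e-4 * 0.1  # weight: dwconv_decay_mult
assert bias_group['weight_decay'] == 1e-4 * 0.5    # bias: bias_decay_mult (the fix)
```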