Skip to content

Commit

Permalink
move dynamic control to Quant Args
Browse files Browse the repository at this point in the history
  • Loading branch information
bfineran committed Apr 25, 2024
1 parent 5082aad commit 7ca861b
Show file tree
Hide file tree
Showing 7 changed files with 19 additions and 52 deletions.
2 changes: 1 addition & 1 deletion src/compressed_tensors/quantization/lifecycle/forward.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ def _maybe_calibrate_or_quantize(
return value

observer = getattr(module, f"{base_name}_observer")
if observer.DYNAMIC:
if args.dynamic:
# dynamic quantization - get scale and zero point directly from observer
scale, zero_point = observer(value)
else:
Expand Down
21 changes: 9 additions & 12 deletions src/compressed_tensors/quantization/lifecycle/frozen.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,20 +30,17 @@ def freeze_module_quantization(module: Module):
:param module: module to freeze quantization for
"""
if not getattr(module, "quantization_scheme", None):
scheme = getattr(module, "quantization_scheme", None)
if not scheme:
# no quantization scheme nothing to do
return

# delete observers from module
observer_names = []
for submodule_name, submodule in module.named_modules():
if "." not in submodule_name and submodule_name.endswith("_observer"):
if getattr(submodule, "DYNAMIC", False):
continue # do not delete dynamic observers

# delete any non-dynamic observers that belong directly to this module
observer_names.append(submodule_name)
for observer_name in observer_names:
delattr(module, observer_name)
# delete observers from module if not dynamic
if scheme.input_activations and not scheme.input_activations.dynamic:
delattr(module, "input_observer")
if scheme.weights and not scheme.weights.dynamic:
delattr(module, "weight_observer")
if scheme.output_activations and not scheme.output_activations.dynamic:
delattr(module, "output_observer")

module.quantization_status = QuantizationStatus.FROZEN
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def _initialize_scale_zero_point_observer(
observer = quantization_args.get_observer()
module.register_module(f"{base_name}_observer", observer)

if observer.DYNAMIC:
if quantization_args.dynamic:
return # no need to register a scale and zero point for a dynamic observer

device = next(module.parameters()).device
Expand Down
1 change: 0 additions & 1 deletion src/compressed_tensors/quantization/observers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,3 @@
from .base import *
from .memoryless import *
from .min_max import *
from .dynamic import *
35 changes: 0 additions & 35 deletions src/compressed_tensors/quantization/observers/dynamic.py

This file was deleted.

4 changes: 2 additions & 2 deletions src/compressed_tensors/quantization/observers/memoryless.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,10 @@
__all__ = ["MemorylessObserver"]


@Observer.register("memoryless")
@Observer.register("memoryless", alias=["dynamic"])
class MemorylessObserver(Observer):
"""
Implements a dynamic quantization observer that sets the scale and
Implements a quantization observer that sets the scale and
zero point based on the latest observed value without tracking state
"""

Expand Down
6 changes: 6 additions & 0 deletions src/compressed_tensors/quantization/quant_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ class QuantizationArgs(BaseModel):
strategy: QuantizationStrategy = QuantizationStrategy.TENSOR
group_size: Optional[int] = None
block_structure: Optional[str] = None
dynamic: bool = False
observer: str = Field(
default="minmax",
description=(
Expand All @@ -82,4 +83,9 @@ def get_observer(self):
"""
from compressed_tensors.quantization.observers.base import Observer

if self.observer == "minmax" and self.dynamic:
# override default observer for dynamic quantization; minmax is never
# wanted here because it keeps state across samples
self.observer = "memoryless"

return Observer.load_from_registry(self.observer, quantization_args=self)

0 comments on commit 7ca861b

Please sign in to comment.