Add attribute output_dtype to QuantizeLinear (#5956)
The purpose of this change is to allow setting the quantized type
without providing the zero-point tensor for symmetric quantization.
This reduces model size (most importantly for block quantization, where the
zero-point tensor dimensions are large) and reduces backend runtime.

This implements issue #5943

---------

Signed-off-by: Gal Hubara Agam <ghubaraagam@nvidia.com>
galagam committed Feb 25, 2024
1 parent 7ddb57c commit c95a59c
Showing 31 changed files with 500 additions and 79 deletions.
2 changes: 2 additions & 0 deletions docs/Changelog.md
@@ -25409,6 +25409,8 @@ This version of the operator has been available since version 21 of the default
<dd>(Optional) The axis of the dequantizing dimension of the input tensor. Used for per-axis and blocked quantization. Negative value means counting dimensions from the back. Accepted range is `[-r, r-1]` where `r = rank(input)`.</dd>
<dt><tt>block_size</tt> : int (default is 0)</dt>
<dd>(Optional) The size of the quantization block (number of times every scale is replicated). Used only for blocked quantization. The block size is a positive integer. Given `x` shape `(D0, ..., Di, ..., Dn)`, `y_scale` shape `(S0, ... Si, ...Sn)` and `axis=i`, the accepted range is `[ceil(Di/Si), ceil(Di/(Si-1))-1]`</dd>
<dt><tt>output_dtype</tt> : int (default is 0)</dt>
<dd>(Optional) The output data type. If not supplied, the output data type is inferred from the `y_zero_point` data type (`T2`). If neither `output_dtype` nor `y_zero_point` is supplied, the output data type is uint8. If both `output_dtype` and `y_zero_point` are specified, `output_dtype` must match `T2`.</dd>
<dt><tt>saturate</tt> : int (default is 1)</dt>
<dd>This parameter defines how the conversion behaves if an input value is out of range of the destination type. It applies only to float 8 quantization (float8e4m3fn, float8e4m3fnuz, float8e5m2, float8e5m2fnuz) and is true by default. All cases are fully described in two tables in the operator description.</dd>
</dl>
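
As a quick illustration (a sketch, not part of the committed docs), symmetric quantization can now be expressed without a `y_zero_point` input at all:

```python
import onnx
from onnx import TensorProto

# Symmetric quantization: only x and y_scale are wired in; the quantized
# output type comes from the output_dtype attribute instead of a
# zero-point tensor.
node = onnx.helper.make_node(
    "QuantizeLinear",
    inputs=["x", "y_scale"],
    outputs=["y"],
    output_dtype=TensorProto.INT8,
)
```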
73 changes: 71 additions & 2 deletions docs/Operators.md
@@ -20275,6 +20275,8 @@ Other versions of this operator: <a href="Changelog.md#QuantizeLinear-10">10</a>
<dd>(Optional) The axis of the dequantizing dimension of the input tensor. Used for per-axis and blocked quantization. Negative value means counting dimensions from the back. Accepted range is `[-r, r-1]` where `r = rank(input)`.</dd>
<dt><tt>block_size</tt> : int (default is 0)</dt>
<dd>(Optional) The size of the quantization block (number of times every scale is replicated). Used only for blocked quantization. The block size is a positive integer. Given `x` shape `(D0, ..., Di, ..., Dn)`, `y_scale` shape `(S0, ... Si, ...Sn)` and `axis=i`, the accepted range is `[ceil(Di/Si), ceil(Di/(Si-1))-1]`</dd>
<dt><tt>output_dtype</tt> : int (default is 0)</dt>
<dd>(Optional) The output data type. If not supplied, the output data type is inferred from the `y_zero_point` data type (`T2`). If neither `output_dtype` nor `y_zero_point` is supplied, the output data type is uint8. If both `output_dtype` and `y_zero_point` are specified, `output_dtype` must match `T2`.</dd>
<dt><tt>saturate</tt> : int (default is 1)</dt>
<dd>This parameter defines how the conversion behaves if an input value is out of range of the destination type. It applies only to float 8 quantization (float8e4m3fn, float8e4m3fnuz, float8e5m2, float8e5m2fnuz) and is true by default. All cases are fully described in two tables in the operator description.</dd>
</dl>
@@ -20347,7 +20349,7 @@ expect(


<details>
<summary>blocked</summary>
<summary>blocked_asymmetric</summary>

```python
node = onnx.helper.make_node(
@@ -20407,7 +20409,74 @@ expect(
node,
inputs=[x, y_scale, y_zero_point],
outputs=[y],
name="test_quantizelinear_blocked",
name="test_quantizelinear_blocked_asymmetric",
)
```

</details>


<details>
<summary>blocked_symmetric</summary>

```python
node = onnx.helper.make_node(
"QuantizeLinear",
inputs=["x", "y_scale"],
outputs=["y"],
axis=1,
block_size=2,
output_dtype=TensorProto.INT16,
)

x = np.array(
[
[6.0, -8, -10, 5.0],
[1.0, 8.0, 4.0, 5.0],
[0.0, 20.0, 10.0, 4.0],
],
dtype=np.float32,
)

y_scale = np.array(
[
[1.5, 2.5],
[3.0, 4.9],
[5.1, 6.9],
],
dtype=np.float32,
)

# x.shape = (3, 4)
# y_scale.shape = (3, 2)

block_axis = 1
# The block shape is [x.shape[i] // y_scale.shape[i] for i in range(len(x.shape))] = (1, 2)
assert all(
x.shape[i] == y_scale.shape[i]
for i in range(len(x.shape))
if i != block_axis
)
assert x.shape[block_axis] % y_scale.shape[block_axis] == 0
repeats = x.shape[block_axis] // y_scale.shape[block_axis]

# Create the element-wise scale (symmetric quantization needs no zero point)
y_scale_elementwise = np.repeat(y_scale, repeats=repeats, axis=block_axis)

y_val = np.clip(
np.rint(x / y_scale_elementwise), a_min=-32768, a_max=32767
).astype(np.int16)
y = make_tensor(
"y",
TensorProto.INT16,
x.shape,
y_val,
)
expect(
node,
inputs=[x, y_scale],
outputs=[y],
name="test_quantizelinear_blocked_symmetric",
)
```
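
In this example `Di = x.shape[1] = 4` and `Si = y_scale.shape[1] = 2`, so the accepted `block_size` range from the attribute description is `[ceil(4/2), ceil(4/(2-1)) - 1] = [2, 3]`; `block_size=2` is therefore valid.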

71 changes: 68 additions & 3 deletions docs/TestCoverage.md
@@ -13778,7 +13778,7 @@ for quant_type_name in ["uint8", "int8"]:


### QuantizeLinear
There are 9 test cases, listed as follows:
There are 10 test cases, listed as follows:
<details>
<summary>axis</summary>

@@ -13815,7 +13815,7 @@ expect(

</details>
<details>
<summary>blocked</summary>
<summary>blocked_asymmetric</summary>

```python
node = onnx.helper.make_node(
@@ -13875,7 +13875,72 @@ expect(
node,
inputs=[x, y_scale, y_zero_point],
outputs=[y],
name="test_quantizelinear_blocked",
name="test_quantizelinear_blocked_asymmetric",
)
```

</details>
<details>
<summary>blocked_symmetric</summary>

```python
node = onnx.helper.make_node(
"QuantizeLinear",
inputs=["x", "y_scale"],
outputs=["y"],
axis=1,
block_size=2,
output_dtype=TensorProto.INT16,
)

x = np.array(
[
[6.0, -8, -10, 5.0],
[1.0, 8.0, 4.0, 5.0],
[0.0, 20.0, 10.0, 4.0],
],
dtype=np.float32,
)

y_scale = np.array(
[
[1.5, 2.5],
[3.0, 4.9],
[5.1, 6.9],
],
dtype=np.float32,
)

# x.shape = (3, 4)
# y_scale.shape = (3, 2)

block_axis = 1
# The block shape is [x.shape[i] // y_scale.shape[i] for i in range(len(x.shape))] = (1, 2)
assert all(
x.shape[i] == y_scale.shape[i]
for i in range(len(x.shape))
if i != block_axis
)
assert x.shape[block_axis] % y_scale.shape[block_axis] == 0
repeats = x.shape[block_axis] // y_scale.shape[block_axis]

# Create the element-wise scale (symmetric quantization needs no zero point)
y_scale_elementwise = np.repeat(y_scale, repeats=repeats, axis=block_axis)

y_val = np.clip(
np.rint(x / y_scale_elementwise), a_min=-32768, a_max=32767
).astype(np.int16)
y = make_tensor(
"y",
TensorProto.INT16,
x.shape,
y_val,
)
expect(
node,
inputs=[x, y_scale],
outputs=[y],
name="test_quantizelinear_blocked_symmetric",
)
```

65 changes: 63 additions & 2 deletions onnx/backend/test/case/node/quantizelinear.py
@@ -276,7 +276,7 @@ def export_int4() -> None:
)

@staticmethod
def export_blocked() -> None:
def export_blocked_asymmetric() -> None:
node = onnx.helper.make_node(
"QuantizeLinear",
inputs=["x", "y_scale", "y_zero_point"],
@@ -334,5 +334,66 @@ def export_blocked() -> None:
node,
inputs=[x, y_scale, y_zero_point],
outputs=[y],
name="test_quantizelinear_blocked",
name="test_quantizelinear_blocked_asymmetric",
)

@staticmethod
def export_blocked_symmetric() -> None:
node = onnx.helper.make_node(
"QuantizeLinear",
inputs=["x", "y_scale"],
outputs=["y"],
axis=1,
block_size=2,
output_dtype=TensorProto.INT16,
)

x = np.array(
[
[6.0, -8, -10, 5.0],
[1.0, 8.0, 4.0, 5.0],
[0.0, 20.0, 10.0, 4.0],
],
dtype=np.float32,
)

y_scale = np.array(
[
[1.5, 2.5],
[3.0, 4.9],
[5.1, 6.9],
],
dtype=np.float32,
)

# x.shape = (3, 4)
# y_scale.shape = (3, 2)

block_axis = 1
# The block shape is [x.shape[i] // y_scale.shape[i] for i in range(len(x.shape))] = (1, 2)
assert all(
x.shape[i] == y_scale.shape[i]
for i in range(len(x.shape))
if i != block_axis
)
assert x.shape[block_axis] % y_scale.shape[block_axis] == 0
repeats = x.shape[block_axis] // y_scale.shape[block_axis]

# Create the element-wise scale (symmetric quantization needs no zero point)
y_scale_elementwise = np.repeat(y_scale, repeats=repeats, axis=block_axis)

y_val = np.clip(
np.rint(x / y_scale_elementwise), a_min=-32768, a_max=32767
).astype(np.int16)
y = make_tensor(
"y",
TensorProto.INT16,
x.shape,
y_val,
)
expect(
node,
inputs=[x, y_scale],
outputs=[y],
name="test_quantizelinear_blocked_symmetric",
)
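
These `export_*` cases feed the backend-test data generator, which is why serialized test-data files also change in this commit.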
(Generated backend test-data files updated: several binary files not shown, plus small byte-level changes in serialized tensor files.)
3 changes: 2 additions & 1 deletion onnx/common/interned_strings.h
@@ -190,7 +190,8 @@ namespace ONNX_NAMESPACE {
_(end) \
_(num_groups) \
_(stash_type) \
_(block_size)
_(block_size) \
_(output_dtype)

enum BuiltinSymbol {
#define DEFINE_SYMBOL(s) k##s,
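For context: the `_(...)` entries above form an X-macro list, and `#define DEFINE_SYMBOL(s) k##s,` expands each entry into a `k`-prefixed enum constant, so adding `_(output_dtype)` makes `koutput_dtype` available as an interned-string symbol.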
23 changes: 22 additions & 1 deletion onnx/defs/quantization/defs.cc
@@ -78,6 +78,13 @@ ONNX_OPERATOR_SET_SCHEMA(
"`[ceil(Di/Si), ceil(Di/(Si-1))-1]`",
AttributeProto::INT,
static_cast<int64_t>(0))
.Attr(
"output_dtype",
"(Optional) The output data type. If not supplied, the output data type is inferred from `y_zero_point` data type (`T2`). "
"If neither `output_dtype` nor `y_zero_point` are supplied, output data type is uint8. "
"If both `output_dtype` and `y_zero_point` are specified, `output_dtype` must be `T2`.",
AttributeProto::INT,
static_cast<int64_t>(0))
.TypeConstraint(
"T1",
{"tensor(float)", "tensor(float16)", "tensor(bfloat16)", "tensor(int32)"},
@@ -97,8 +104,22 @@ ONNX_OPERATOR_SET_SCHEMA(
"The type of the input `y_zero_point` and the output `y`.")
.SetDoc(QuantizeLinear_ver21_doc)
.TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
if (ctx.hasInput(2)) {
auto const zp_type = ctx.hasInput(2) ? ctx.getInputType(2) : nullptr;
auto const output_dtype =
static_cast<TensorProto_DataType>(getAttribute(ctx, "output_dtype", TensorProto::UNDEFINED));
if (zp_type != nullptr) {
auto const zp_elem_type = static_cast<TensorProto_DataType>(getTensorElementType(*zp_type));
if (output_dtype != TensorProto::UNDEFINED && output_dtype != zp_elem_type) {
fail_type_inference(
"output_dtype ",
TensorProto_DataType_Name(output_dtype),
" does not match y_zero_point type ",
TensorProto_DataType_Name(zp_elem_type),
".");
}
propagateElemTypeFromInputToOutput(ctx, 2, 0);
} else if (output_dtype != TensorProto::UNDEFINED) {
propagateElemTypeFromAttributeToOutput(ctx, "output_dtype", 0);
} else {
updateOutputElemType(ctx, 0, TensorProto::UINT8);
}