Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 46 additions & 0 deletions onnxruntime/core/optimizer/nhwc_transformer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ class NhwcTransformerImpl {
void TransformQLinearActivation(Node& node);
void TransformQLinearGlobalAveragePool(Node& node);
void TransformSplit(Node& node);
void TransformPad(Node& node);

Graph& graph_;

Expand Down Expand Up @@ -282,6 +283,49 @@ void NhwcTransformerImpl::TransformSplit(Node& node) {
CreateNhwcArgument(node, node, nhwc_input->rank_);
}

// Converts a Pad node operating on NCHW data to operate on NHWC data by
// permuting its constant "pads" initializer from NCHW axis order to NHWC.
void NhwcTransformerImpl::TransformPad(Node& node) {
  auto& input_defs = node.MutableInputDefs();

  auto* nhwc_input = LookupNhwcArgument(input_defs[0]);
  if (nhwc_input == nullptr) {
    return;
  }

  // The pads input must be a constant 1-D initializer with 2 * rank entries,
  // and the tensor must have spatial axes for an NHWC permutation to matter.
  const ONNX_NAMESPACE::TensorProto* pads_tensor_proto = nullptr;
  if (!graph_utils::NodeArgIsConstant(graph_, *input_defs[1]) ||
      !graph_.GetInitializedTensor(input_defs[1]->Name(), pads_tensor_proto) ||
      (pads_tensor_proto->dims_size() != 1) ||
      (pads_tensor_proto->dims(0) != nhwc_input->rank_ * 2) ||
      (nhwc_input->rank_ <= 2)) {  // batch/channel axes only, no spatial axes
    return;
  }

  // Permute the pads from NCHW order [N, C, spatial...] to NHWC order
  // [N, spatial..., C], independently for the "begin" and "end" halves.
  Initializer pads_initializer{*pads_tensor_proto, graph_.ModelPath()};
  const int64_t* nchw_pads_data = pads_initializer.data<int64_t>();
  const size_t rank = static_cast<size_t>(pads_tensor_proto->dims(0)) / 2;
  std::vector<int64_t> nhwc_pads(rank * 2);
  for (size_t half = 0; half < 2; ++half) {
    const size_t base = half * rank;
    nhwc_pads[base] = nchw_pads_data[base];                 // batch pad stays first
    nhwc_pads[base + rank - 1] = nchw_pads_data[base + 1];  // channel pad moves last
    for (size_t i = 0; i < rank - 2; ++i) {
      nhwc_pads[base + 1 + i] = nchw_pads_data[base + 2 + i];  // spatial pads shift left
    }
  }

  ONNX_NAMESPACE::TensorProto nhwc_pads_tensor_proto;
  nhwc_pads_tensor_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64);
  nhwc_pads_tensor_proto.set_name(graph_.GenerateNodeArgName("nhwc_permutated_pads"));
  nhwc_pads_tensor_proto.set_raw_data(nhwc_pads.data(), rank * 2 * sizeof(int64_t));
  nhwc_pads_tensor_proto.add_dims(rank * 2);
  NodeArg* nhwc_pads_arg = &graph_utils::AddInitializer(graph_, nhwc_pads_tensor_proto);

  // Update the node to directly use the NHWC inputs and decrement the original
  // use counts of the NHWC inputs.
  input_defs[1] = nhwc_pads_arg;
  input_defs[0] = nhwc_input->nhwc_arg_;
  nhwc_input->remaining_original_uses_--;

  CreateNhwcArgument(node, node, nhwc_input->rank_);
}

void NhwcTransformerImpl::Transform(Node& node) {
if (graph_utils::IsSupportedOptypeVersionAndDomain(node, "QLinearConv", {10})) {
TransformQLinearConv(node);
Expand All @@ -295,6 +339,8 @@ void NhwcTransformerImpl::Transform(Node& node) {
TransformQLinearGlobalAveragePool(node);
} else if (graph_utils::IsSupportedOptypeVersionAndDomain(node, "Split", {2, 11, 13})) {
TransformSplit(node);
} else if (graph_utils::IsSupportedOptypeVersionAndDomain(node, "Pad", {11, 13})) {
TransformPad(node);
}
}

Expand Down
21 changes: 17 additions & 4 deletions onnxruntime/core/providers/cpu/tensor/pad.cc
Original file line number Diff line number Diff line change
Expand Up @@ -322,8 +322,15 @@ static Status PadImpl(OpKernelContext* ctx,

int64_t prePad = reshaped_pad[inner_axis];
int64_t postPad = reshaped_pad[inner_axis + new_dims_count];
PadAxisConstant(axisStart - prePad, *axisStart, prePad);
PadAxisConstant(output, *(output - 1), postPad);
if (inner_no_pad_size == 1) {
PadAxisConstant(axisStart - prePad, *axisStart, prePad);
PadAxisConstant(output, *(output - 1), postPad);
} else {
// When the innermost axis(es) need no padding, the PadAxisConstant() calls above do not fit the Edge mode.
// The general loop below also works fine once the first pad axis is combined with the non-padded axes.
PadAxis(axisStart - prePad, axisStart, 1, -ptrdiff_t(inner_no_pad_size), inner_no_pad_size, pads[inner_axis]);
PadAxis(output, output - inner_no_pad_size, 1, -ptrdiff_t(inner_no_pad_size), inner_no_pad_size, pads[inner_axis + data_rank]);
}
output += postPad;
alignSkip = prePad;
}
Expand Down Expand Up @@ -353,8 +360,14 @@ static Status PadImpl(OpKernelContext* ctx,

int64_t prePad = reshaped_pad[inner_axis];
int64_t postPad = reshaped_pad[inner_axis + new_dims_count];
PadInnermostAxis(axisStart - prePad, axisStart + prePad, -1 /* inputDelta */, prePad);
PadInnermostAxis(output, output - 2, -1 /* inputDelta */, postPad);
if (inner_no_pad_size == 1) {
PadInnermostAxis(axisStart - prePad, axisStart + prePad, -1 /* inputDelta */, prePad);
PadInnermostAxis(output, output - 2, -1 /* inputDelta */, postPad);
} else {
// When the innermost axis(es) need no padding, the PadInnermostAxis() calls above do not fit the Reflect mode.
PadAxis(axisStart - prePad, axisStart + prePad, 1, -ptrdiff_t(inner_no_pad_size * 2), inner_no_pad_size, pads[inner_axis]);
PadAxis(output, output - 2 * inner_no_pad_size, 1, -ptrdiff_t(inner_no_pad_size * 2), inner_no_pad_size, pads[inner_axis + data_rank]);
}
output += postPad;
alignSkip = prePad;
}
Expand Down
12 changes: 6 additions & 6 deletions onnxruntime/python/tools/quantization/calibrate.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,14 +139,14 @@ def get_intermediate_outputs(self, calib_mode='naive', providers=None, ort_graph
'''

#conduct inference session and get intermediate outputs
sess_options = onnxruntime.SessionOptions()
if ort_graph_optimization_enable:
session = onnxruntime.InferenceSession(self.augmented_model_path, None)
sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_BASIC
else:
sess_options = onnxruntime.SessionOptions()
sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL #ORT_ENABLE_BASIC
session = onnxruntime.InferenceSession(self.augmented_model_path,
sess_options=sess_options,
providers=providers)
sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL
session = onnxruntime.InferenceSession(self.augmented_model_path,
sess_options=sess_options,
providers=providers)

#number of outputs in original model
num_model_outputs = len(self.model.graph.output)
Expand Down
16 changes: 10 additions & 6 deletions onnxruntime/python/tools/quantization/onnx_quantizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from onnxruntime import SessionOptions, InferenceSession, GraphOptimizationLevel

from .quant_utils import QuantizationMode, QuantizedValueType, QuantizedInitializer, QuantizedValue, quantization_modes
from .quant_utils import find_by_name, get_elem_index, get_mul_node, generate_identified_filename, attribute_to_kwarg, type_to_name
from .quant_utils import find_by_name, get_elem_index, get_mul_node, generate_identified_filename, attribute_to_kwarg, type_to_name, quantize_nparray
from .quant_utils import QuantType, onnx_domain, __producer__, __version__

from .registry import CreateOpQuantizer, CreateDefaultOpQuantizer
Expand Down Expand Up @@ -48,11 +48,11 @@ def quantize_data(data, quantize_range, qType):
scale = (float(max_range) * 2) / quantize_range if max_range > 0 else 1
zero_point = 0
# signed byte type
quantized_data = (np.asarray(data) / scale).round().astype('b')
quantized_data = quantize_nparray(QuantType.QInt8, np.asarray(data), scale, zero_point)
elif qType == onnx_proto.TensorProto.UINT8:
scale = (float(rmax) - rmin) / quantize_range if rmin != rmax else 1
zero_point = round((0 - rmin) / scale) # round to nearest integer
quantized_data = ((np.asarray(data) / scale).round() + zero_point).astype('B') # unsigned byte type
quantized_data = quantize_nparray(QuantType.QUInt8, np.asarray(data), scale, zero_point)
else:
raise ValueError("Unexpected data type {} requested. Only INT8 and UINT8 are supported.".format(qType))

Expand Down Expand Up @@ -500,21 +500,25 @@ def _get_quantization_params(self, param_name):

return True, scale_name, zero_point_name, scale_shape, zero_point_shape

def _get_quantize_input_nodes(self, node, input_index, qType):
def _get_quantize_input_nodes(self, node, input_index, qType, given_scale_name = None, given_zp_name = None):
'''
Given an input for a node (which is not a initializer), this function
- add nodes to compute zero point and scale for this input if they don't exist.
- add new QuantizeLinear node to quantize the input.
parameter node: node being quantized in NodeProto format.
parameter input_index: index of input in node.input.
parameter qType: type to quantize to.
parameter given_scale_name: if provided, quantize the input using this scale tensor.
parameter given_zp_name: if provided, quantize the input using this zero-point tensor.
return: List of newly created nodes in NodeProto format.
'''
input_name = node.input[input_index]
output_name = input_name + "_quantized"

data_found, scale_name, zp_name, _, _ = \
self._get_quantization_params(input_name)
if (given_scale_name is not None) and (given_zp_name is not None):
data_found, scale_name, zp_name = (True, given_scale_name, given_zp_name)
else:
data_found, scale_name, zp_name, _, _ = self._get_quantization_params(input_name)

if self.static:
if data_found == False:
Expand Down
68 changes: 68 additions & 0 deletions onnxruntime/python/tools/quantization/operators/pad.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import onnx
import numpy as np
from .base_operator import QuantOperatorBase
from ..quant_utils import QuantizedValue, QuantizedValueType, attribute_to_kwarg, quantize_nparray


class QPad(QuantOperatorBase):
    """Quantizer for the Pad operator.

    Rewrites a Pad node whose data input has already been quantized so that it
    pads the quantized tensor directly.  For 'constant' mode the optional
    'constant_value' input is quantized with the same scale/zero-point as the
    data input; when 'constant_value' is absent, the zero-point tensor is
    appended as the pad value (the quantized representation of 0.0).
    """

    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node
        assert (node.op_type == "Pad")

        # Only opset 11+ Pad has the optional 'constant_value' input.
        # If input[0] is not quantized, do not quantize this node.
        if (self.quantizer.opset_version < 11) or (node.input[0] not in self.quantizer.quantized_value_map):
            super().quantize()
            return
        quantized_input_value = self.quantizer.quantized_value_map[node.input[0]]

        kwargs = {}
        for attribute in node.attribute:
            kv = attribute_to_kwarg(attribute)
            kwargs.update(kv)

        if 'mode' not in kwargs or kwargs['mode'] == b'constant':
            if len(node.input) > 2:  # There is a 3rd input 'constant_value'
                zp_tensor = self.quantizer.model.get_initializer(quantized_input_value.zp_name)
                scale_tensor = self.quantizer.model.get_initializer(quantized_input_value.scale_name)
                if zp_tensor is None or scale_tensor is None:
                    # Cannot recover the data input's quantization parameters.
                    super().quantize()
                    return

                padding_constant_initializer = self.quantizer.model.get_initializer(node.input[2])
                if padding_constant_initializer is not None:
                    # Statically quantize the constant pad value with the data
                    # input's scale and zero-point.
                    zp_array = onnx.numpy_helper.to_array(zp_tensor)
                    zp_value = zp_array.item() if zp_array.ndim == 0 else zp_array[0]
                    scale_array = onnx.numpy_helper.to_array(scale_tensor)
                    scale_value = scale_array.item() if scale_array.ndim == 0 else scale_array[0]
                    padding_constant_array = onnx.numpy_helper.to_array(padding_constant_initializer)
                    quantized_padding_constant_array = quantize_nparray(
                        quantized_input_value.qType, padding_constant_array, scale_value, zp_value)
                    quantized_padding_constant_name = node.input[2] + "_quantized"
                    quantized_padding_constant_initializer = onnx.numpy_helper.from_array(
                        quantized_padding_constant_array, quantized_padding_constant_name)
                    # Suppose this padding constant initializer is only used by this node
                    self.quantizer.model.remove_initializer(padding_constant_initializer)
                    self.quantizer.model.add_initializer(quantized_padding_constant_initializer)
                    node.input[2] = quantized_padding_constant_name
                else:
                    # The pad value is not an initializer: insert nodes to
                    # quantize it at runtime, reusing the data input's
                    # scale/zero-point.  _get_quantize_input_nodes returns a
                    # LIST of nodes whose last element is the QuantizeLinear
                    # producing the quantized value, so extend new_nodes with
                    # the list and wire in that node's output.
                    pad_value_qnodes = self.quantizer._get_quantize_input_nodes(
                        node, 2, quantized_input_value.qType,
                        quantized_input_value.scale_name, quantized_input_value.zp_name)
                    self.quantizer.new_nodes += pad_value_qnodes
                    node.input[2] = pad_value_qnodes[-1].output[0]
            else:
                node.input.extend([quantized_input_value.zp_name])  # pad zero_point for original zero

        # Create an entry for output quantized value; the output shares the
        # input's scale and zero-point since Pad only copies/replicates values.
        quantized_output_value = QuantizedValue(node.output[0], node.output[0] + "_quantized",
                                                quantized_input_value.scale_name, quantized_input_value.zp_name,
                                                QuantizedValueType.Input)
        self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value

        node.input[0] = quantized_input_value.q_name
        node.output[0] = quantized_output_value.q_name
        self.quantizer.new_nodes += [node]
9 changes: 9 additions & 0 deletions onnxruntime/python/tools/quantization/quant_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,15 @@ class QuantType(Enum):
}


def quantize_nparray(qtype, arr, scale, zero_point, low = None, high = None):
    """Linearly quantize a float array: round(arr / scale) + zero_point,
    clipped to the target type's range and cast to that numpy dtype.

    `low`/`high` optionally narrow the clip range; they can never widen it
    beyond the type's own limits (uint8: [0, 255]; int8: [-127, 127] — note
    -128 is deliberately excluded).
    """
    dtype = QUANT_TYPE_TO_NP_TYPE[qtype]
    if dtype == numpy.uint8:
        type_low, type_high = 0, 255
    else:
        type_low, type_high = -127, 127
    cliplow = max(type_low, -127 if low is None else low)
    cliphigh = min(type_high, 255 if high is None else high)
    quantized = numpy.asarray((arr.astype(numpy.float32) / scale).round() + zero_point)
    numpy.clip(quantized, cliplow, cliphigh, out=quantized)
    return quantized.astype(dtype)


class QuantizedInitializer:
'''
Represents a linearly quantized weight input from ONNX operators
Expand Down
2 changes: 1 addition & 1 deletion onnxruntime/python/tools/quantization/quantize.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def optimize_model(model_path: Path):
sess_option = SessionOptions()
sess_option.optimized_model_filepath = opt_model_path.as_posix()
sess_option.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_BASIC
_ = InferenceSession(model_path.as_posix(), sess_option)
_ = InferenceSession(model_path.as_posix(), sess_option, providers=['CPUExecutionProvider'])
optimized_model = onnx.load(opt_model_path.as_posix())
return optimized_model

Expand Down
4 changes: 3 additions & 1 deletion onnxruntime/python/tools/quantization/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from .operators.gavgpool import QGlobalAveragePool
from .operators.lstm import LSTMQuant
from .operators.split import QSplit
from .operators.pad import QPad

CommonOpsRegistry = {"Gather": GatherQuant, "EmbedLayerNormalization": EmbedLayerNormalizationQuant}

Expand All @@ -33,7 +34,8 @@
"Sigmoid": QLinearActivation,
"MaxPool": QMaxPool,
"GlobalAveragePool": QGlobalAveragePool,
"Split": QSplit,
"Split" : QSplit,
"Pad" : QPad,
}
QLinearOpsRegistry.update(CommonOpsRegistry)

Expand Down
34 changes: 34 additions & 0 deletions onnxruntime/test/optimizer/nhwc_transformer_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -455,6 +455,40 @@ TEST(NhwcTransformerTests, ConvSplit) {
}
}

// Exercises the NHWC transform on a QLinearConv -> Pad -> QLinearConv chain
// for each Pad mode; the transformed graph must contain both convs as
// com.microsoft.QLinearConv with exactly two layout Transposes remaining.
TEST(NhwcTransformerTests, ConvPad) {
  const std::vector<std::string> pad_modes = {"constant", "reflect", "edge"};
  for (const auto& pad_mode : pad_modes) {
    auto build_test_case = [&](NhwcTestHelper& helper) {
      auto* input_arg = helper.MakeInput<uint8_t>({1, 23, 13, 13});
      auto* conv1_output_arg = helper.MakeIntermediate();

      Node& conv1_node = helper.AddQLinearConvNode<uint8_t>(input_arg, .01f, 135,
                                                            {30, 23, 3, 3}, .02f, 126,
                                                            conv1_output_arg, .37f, 131);
      conv1_node.AddAttribute("pads", std::vector<int64_t>{1, 1, 1, 1});

      // Pads are given in NCHW axis order: begin values then end values.
      auto* pads_const = helper.MakeScalarInitializer<uint8_t>(131);
      auto* pads_arg = helper.Make1DInitializer<int64_t>({0, 0, 1, 2, 0, 0, 3, 4});
      auto* pad_output_arg = helper.MakeIntermediate();
      Node& pad_node = helper.AddNode("Pad", {conv1_output_arg, pads_arg, pads_const}, {pad_output_arg});
      pad_node.AddAttribute("mode", pad_mode);

      auto* conv2_output_arg = helper.MakeIntermediate();
      helper.AddQLinearConvNode<uint8_t>(pad_output_arg, .37f, 131,
                                         {16, 30, 3, 3}, .015f, 129,
                                         conv2_output_arg, .37f, 131);

      auto* output_arg = helper.MakeOutput();
      helper.AddDequantizeLinearNode(conv2_output_arg, .37f, 131, output_arg);
    };

    auto check_nhwc_graph = [&](InferenceSessionWrapper& session) {
      auto op_to_count = CountOpsInGraph(session.GetGraph());
      EXPECT_EQ(op_to_count["com.microsoft.QLinearConv"], 2);
      EXPECT_EQ(op_to_count["Transpose"], 2);
    };

    NhwcTransformerTester(build_test_case, check_nhwc_graph);
  }
}

TEST(NhwcTransformerTests, ConvBlockActivation) {
auto test_case = [&](uint32_t extra_edges) {
auto build_test_case = [&](NhwcTestHelper& helper) {
Expand Down
Loading