151 changes: 131 additions & 20 deletions onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc
@@ -7,6 +7,7 @@

#include <core/common/safeint.h>
#include <core/common/logging/logging.h>
#include <core/framework/tensorprotoutils.h>
#include <core/graph/graph.h>
#include <core/graph/graph_viewer.h>
#include <core/providers/common.h>
@@ -64,15 +65,42 @@ QLinearOpType GetQLinearOpType(const onnxruntime::Node& node) {
return QLinearOpType::Unknown;
}

ConvType GetConvType(const onnxruntime::Node& node, const InitializedTensorSet& initializers) {
const auto& op_type = node.OpType();
bool is_qlinear_conv = (op_type == "QLinearConv");
ORT_ENFORCE(op_type == "Conv" || is_qlinear_conv);

NodeAttrHelper helper(node);
const auto group = helper.Get("group", 1);

size_t w_idx = is_qlinear_conv ? 3 : 1;
const auto& weight = node.InputDefs()[w_idx]->Name();
const auto& weight_tensor = *initializers.at(weight);

// ONNX has a single Conv op
// NNAPI has 3 conv variants
// Input is (N, C, H, W)
// group == 1, --> regular conv
// group != 1 && weight is (M, 1, kH, kW), --> depthwise conv
// group != 1 && weight is (M, C/group, kH, kW), --> grouped conv
if (group == 1)
return ConvType::Regular;
else if (weight_tensor.dims()[1] == 1)
return ConvType::Depthwise;
else
return ConvType::Grouped;
}
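
For illustration, a minimal standalone sketch of the same classification rule, operating directly on the group attribute and the weight shape; the ClassifyConv free function is hypothetical and not part of this change (it assumes ConvType from helper.h and <vector>/<cstdint> are in scope):

// Classify a 2D conv from its group attribute and its (M, C/group, kH, kW) weight shape.
ConvType ClassifyConv(int64_t group, const std::vector<int64_t>& weight_dims) {
  if (group == 1)
    return ConvType::Regular;    // e.g. group=1,  weight (64, 3, 3, 3)
  if (weight_dims[1] == 1)
    return ConvType::Depthwise;  // e.g. group=32, weight (32, 1, 3, 3)
  return ConvType::Grouped;      // e.g. group=2,  weight (64, 16, 3, 3)
}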

bool IsQLinearBinaryOp(QLinearOpType qlinear_op_type) {
return qlinear_op_type == QLinearOpType::QLinearConv ||
qlinear_op_type == QLinearOpType::QLinearMatMul ||
qlinear_op_type == QLinearOpType::QLinearAdd;
}

bool HasValidBinaryOpQuantizedInputs(const Node& node) {
auto op_type = GetQLinearOpType(node);
int32_t a_input_type, b_input_type;
if (!IsQLinearBinaryOp(GetQLinearOpType(node))) {
if (!IsQLinearBinaryOp(op_type)) {
LOGS_DEFAULT(VERBOSE) << "[" << node.OpType() << "] is not a binary qlinear op";
return false;
}
@@ -83,7 +111,16 @@ bool HasValidBinaryOpQuantizedInputs(const Node& node) {
if (!GetType(*input_defs[3], b_input_type))
return false;

if (a_input_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8 || a_input_type != b_input_type) {
// QLinearConv supports u8u8 or u8s8
// QLinearMatMul/QLinearAdd support u8u8 only
bool is_qlinear_conv = op_type == QLinearOpType::QLinearConv;
bool has_valid_qlinear_conv_weight =
(b_input_type == ONNX_NAMESPACE::TensorProto_DataType_UINT8 ||
b_input_type == ONNX_NAMESPACE::TensorProto_DataType_INT8);

if (a_input_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8 ||
(!is_qlinear_conv && a_input_type != b_input_type) ||
(is_qlinear_conv && !has_valid_qlinear_conv_weight)) {
LOGS_DEFAULT(VERBOSE) << "[" << node.OpType()
<< "] A Input type: [" << a_input_type
<< "] B Input type: [" << b_input_type
@@ -95,8 +132,9 @@ bool HasValidBinaryOpQuantizedInputs(const Node& node) {
}
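
Condensed, the accepted type combinations can be read off as a small predicate; this helper is a hypothetical restatement of the check above, not code from the PR:

// Input A must always be uint8. QLinearConv also accepts an int8 input B (u8s8);
// QLinearMatMul and QLinearAdd require B to be uint8 as well (u8u8 only).
bool IsValidQuantizedTypePair(bool is_qlinear_conv, int32_t a_type, int32_t b_type) {
  if (a_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8)
    return false;
  if (is_qlinear_conv)
    return b_type == ONNX_NAMESPACE::TensorProto_DataType_UINT8 ||
           b_type == ONNX_NAMESPACE::TensorProto_DataType_INT8;
  return b_type == a_type;
}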

bool HasValidQuantizationScales(const InitializedTensorSet& initializers, const Node& node,
const std::vector<size_t>& indices) {
const auto& op = node.OpType();
const std::vector<size_t>& indices, const OpSupportCheckParams& params) {
const auto& op_type = node.OpType();
bool is_qlinear_conv = (op_type == "QLinearConv");
const auto input_defs(node.InputDefs());
for (const auto idx : indices) {
if (idx >= input_defs.size()) {
@@ -106,13 +144,42 @@ bool HasValidQuantizationScales(const InitializedTensorSet& initializers, const
}
const auto scale_name = input_defs[idx]->Name();
if (Contains(initializers, scale_name)) {
const auto& tensor = *initializers.at(scale_name);
if (!tensor.dims().empty() && tensor.dims()[0] != 1) {
LOGS_DEFAULT(VERBOSE) << op << " does not support per-channel quantization";
return false;
const auto& scale_tensor = *initializers.at(scale_name);
int64_t scales_dim = scale_tensor.dims().empty() ? 1 : scale_tensor.dims()[0];
bool is_conv_weight = is_qlinear_conv && idx == 4;
bool is_conv_u8s8_weight = false;

if (is_conv_weight) {
const auto& weight_tensor = *initializers.at(node.InputDefs()[3]->Name());
is_conv_u8s8_weight = weight_tensor.data_type() == ONNX_NAMESPACE::TensorProto_DataType_INT8;
}

// For u8s8 QLinearConv we need to check the dimensions of the per-channel quantization scales
// Per-channel quantization is only supported for u8s8
// For all other cases, the scale must be a scalar
if (is_conv_u8s8_weight) {
if (params.android_sdk_ver < 29) {
LOGS_DEFAULT(VERBOSE) << op_type << " only supports per-channel quantization on Android API 29+, "
<< "system API level: " << params.android_sdk_ver;
return false;
}

const auto& weight_tensor = *initializers.at(node.InputDefs()[3]->Name());
if (weight_tensor.dims()[0] != scales_dim) {
LOGS_DEFAULT(VERBOSE) << op_type << " mismatch int8 per-channel quantization weight,"
<< " weight dimension[0] " << weight_tensor.dims()[0]
<< " scale dimension " << scales_dim;
return false;
}
} else {
if (scales_dim != 1) {
LOGS_DEFAULT(VERBOSE) << op_type << " does not support per-channel quantization, "
<< " for now, only u8s8 QlinearConv supports per-channel quantization on API 29+";
return false;
}
}
} else {
LOGS_DEFAULT(VERBOSE) << "The scale of " << op << " must be known";
LOGS_DEFAULT(VERBOSE) << "The scale of " << op_type << " must be known";
return false;
}
}
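
The scale-shape rule being enforced, restated as a hypothetical helper (not code from the PR): a u8s8 QLinearConv weight may carry one scale per output channel (weight dimension 0) on Android API 29+, while every other scale must be a scalar:

bool IsValidScaleShape(bool is_conv_u8s8_weight, int64_t scales_dim,
                       int64_t weight_dim0, int32_t android_sdk_ver) {
  if (!is_conv_u8s8_weight)
    return scales_dim == 1;  // scalar scale only
  // per-channel scales: one per output channel, supported by NNAPI from API 29
  return android_sdk_ver >= 29 && scales_dim == weight_dim0;
}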
@@ -122,28 +189,72 @@ bool HasValidQuantizationScales(const InitializedTensorSet& initializers, const

bool HasValidQuantizationZeroPoints(const InitializedTensorSet& initializers, const Node& node,
const std::vector<size_t>& indices) {
const auto& op = node.OpType();
const auto& op_type = node.OpType();
bool is_qlinear_conv = (op_type == "QLinearConv");
const auto input_defs(node.InputDefs());
for (const auto idx : indices) {
if (idx >= input_defs.size()) {
LOGS_DEFAULT(VERBOSE) << "HasValidQuantizationZeroPoints, Input index, " << idx
<< " >= input number, " << input_defs.size();
return false;
}
const auto zero_point_name = node.InputDefs()[idx]->Name();

const auto zero_point_name = input_defs[idx]->Name();
if (Contains(initializers, zero_point_name)) {
const auto& tensor = *initializers.at(zero_point_name);
if (!tensor.dims().empty() && tensor.dims()[0] != 1) {
LOGS_DEFAULT(VERBOSE) << op << " does not support per-channel quantization";
return false;
bool is_conv_weight = is_qlinear_conv && idx == 5;
bool is_conv_u8s8_weight = false;
if (is_conv_weight) {
const auto& weight_tensor = *initializers.at(node.InputDefs()[3]->Name());
is_conv_u8s8_weight = weight_tensor.data_type() == ONNX_NAMESPACE::TensorProto_DataType_INT8;
}
if (tensor.data_type() != ONNX_NAMESPACE::TensorProto_DataType_UINT8) {
LOGS_DEFAULT(VERBOSE) << op << " does not support zero point data type "
<< std::to_string(tensor.data_type());
return false;

const auto& zero_tensor = *initializers.at(zero_point_name);
int64_t zero_dim = zero_tensor.dims().empty() ? 1 : zero_tensor.dims()[0];
if (is_conv_u8s8_weight) {
if (zero_tensor.data_type() != ONNX_NAMESPACE::TensorProto_DataType_INT8) {
LOGS_DEFAULT(VERBOSE) << "u8s8 QlinearConv only supports int8 zero point for weight, "
<< "actual zero point type: [" << zero_tensor.data_type() << "]";
return false;
}

// For ONNX u8s8 QLinearConv, the weight zero point can be a scalar,
// or a tensor with the same channel count as the weight. For NNAPI we
// only support it being 0 (scalar) or all 0s (tensor), since NNAPI has no
// input for the per-channel quantization zero point and assumes it is 0
const auto& weight_tensor = *initializers.at(node.InputDefs()[3]->Name());
if (weight_tensor.dims()[0] != zero_dim && zero_dim != 1) {
LOGS_DEFAULT(VERBOSE) << op_type << " mismatch int8 per-channel quantization weight,"
<< " weight dimension[0] " << weight_tensor.dims()[0]
<< " zero point dimension " << zero_dim;
return false;
}

std::unique_ptr<uint8_t[]> unpacked_tensor;
size_t tensor_byte_size;
auto status = onnxruntime::utils::UnpackInitializerData(zero_tensor, unpacked_tensor, tensor_byte_size);
if (!status.IsOK()) {
LOGS_DEFAULT(ERROR) << "QLinearConv: error when unpacking zero point tensor: " << status.ErrorMessage();
return false;
}

// Verify that all ONNX weight zero point(s) are 0
const int8_t* zero_points = reinterpret_cast<const int8_t*>(unpacked_tensor.get());
for (size_t i = 0; i < tensor_byte_size; i++) {
if (zero_points[i] != 0) {
LOGS_DEFAULT(VERBOSE) << "QLinearConv only support 0 as zero point, "
<< "zero_points[" << i << "] has value: " << zero_points[i];
return false;
}
}
} else {
if (zero_dim != 1) {
LOGS_DEFAULT(VERBOSE) << op_type << " does not support per-channel quantization, "
<< " for now, only u8s8 QlinearConv supports per-channel quantization on API 29+";
return false;
}
}
} else {
LOGS_DEFAULT(VERBOSE) << "The zero point of " << op << " must be known";
LOGS_DEFAULT(VERBOSE) << "The zero point of " << op_type << " must be known";
return false;
}
}
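
The all-zeros requirement in miniature, as a hypothetical helper over the unpacked int8 buffer (a restatement of the loop above, not code from the PR):

// NNAPI assumes a weight zero point of 0 for symmetric per-channel quantization,
// so every unpacked int8 zero point must be 0.
bool AllZeroPointsAreZero(const int8_t* zero_points, size_t count) {
  for (size_t i = 0; i < count; ++i)
    if (zero_points[i] != 0)
      return false;
  return true;
}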
12 changes: 11 additions & 1 deletion onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h
@@ -87,8 +97,18 @@ enum class QLinearOpType : uint8_t {
// QLinearReduceMean,
};

enum class ConvType : uint8_t {
Regular,
Depthwise,
Grouped,
};

QLinearOpType GetQLinearOpType(const onnxruntime::Node& node);

// Returns the type of the Conv op
// This function assumes the node is a 2D Conv
ConvType GetConvType(const onnxruntime::Node& node, const InitializedTensorSet& initializers);

// A QLinear binary op is an operator that takes 2 inputs and produces 1 output,
// such as QLinearConv, QLinearMatMul, QLinearAdd, ...
bool IsQLinearBinaryOp(QLinearOpType qlinear_op_type);
@@ -97,7 +107,7 @@ bool IsQLinearBinaryOp(QLinearOpType qlinear_op_type);
bool HasValidBinaryOpQuantizedInputs(const Node& node);
// Check if a qlinear op has valid scales for given indices
bool HasValidQuantizationScales(const InitializedTensorSet& initializers, const Node& node,
const std::vector<size_t>& indices);
const std::vector<size_t>& indices, const OpSupportCheckParams& params);
// Check if a qlinear op has valid zero points for given indices
bool HasValidQuantizationZeroPoints(const InitializedTensorSet& initializers, const Node& node,
const std::vector<size_t>& indices);
@@ -26,7 +26,7 @@ int32_t ModelBuilder::GetAndroidSdkVer() const {
// Scalar operand is copied into the model, no need to persist
#define DEFINE_ADD_OPERAND_FROM_SCALAR(scalar_type, op_type) \
Status ModelBuilder::AddOperandFromScalar(scalar_type value, uint32_t& index) { \
OperandType operandType(Type::op_type); \
OperandType operandType(Type::op_type, vector<uint32_t>{}); \
ORT_RETURN_IF_ERROR(AddNewNNAPIOperand(operandType, index)); \
RETURN_STATUS_ON_ERROR_WITH_NOTE( \
nnapi_->ANeuralNetworksModel_setOperandValue( \
@@ -377,6 +377,18 @@ Status ModelBuilder::AddNewNNAPIOperand(const OperandType& operand_type, uint32_
RETURN_STATUS_ON_ERROR(
nnapi_->ANeuralNetworksModel_addOperand(nnapi_model_->model_, &operand_type.operandType));
index = next_index_++;

if (operand_type.channelQuant) {
if (GetAndroidSdkVer() < 29) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
"Per-channel quantization is only supported on Android API level 29+,",
" system API level: ", GetAndroidSdkVer());
}

RETURN_STATUS_ON_ERROR(nnapi_->ANeuralNetworksModel_setOperandSymmPerChannelQuantParams(
nnapi_model_->model_, index, &operand_type.channelQuant->params));
}

return Status::OK();
}
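
For context, ANeuralNetworksSymmPerChannelQuantParams (NDK, API 29+) carries only a channel dimension and one scale per channel; it has no zero-point field, which is why the checks above require all weight zero points to be 0. A hedged sketch of populating it, with hypothetical scale values:

// One scale per output channel; channelDim 0 selects the output-channel
// dimension of the filter. Zero points are implicitly 0 in NNAPI symmetric
// per-channel quantization.
std::vector<float> per_channel_scales = {0.1f, 0.2f};  // hypothetical values
ANeuralNetworksSymmPerChannelQuantParams channel_quant = {
    /* .channelDim = */ 0,
    /* .scaleCount = */ static_cast<uint32_t>(per_channel_scales.size()),
    /* .scales = */ per_channel_scales.data(),
};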
