diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc index b4241b92d77a4..55739d12df059 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -64,6 +65,32 @@ QLinearOpType GetQLinearOpType(const onnxruntime::Node& node) { return QLinearOpType::Unknown; } +ConvType GetConvType(const onnxruntime::Node& node, const InitializedTensorSet& initializers) { + const auto& op_type = node.OpType(); + bool is_qlinear_conv = (op_type == "QLinearConv"); + ORT_ENFORCE(op_type == "Conv" || is_qlinear_conv); + + NodeAttrHelper helper(node); + const auto group = helper.Get("group", 1); + + size_t w_idx = is_qlinear_conv ? 3 : 1; + const auto& weight = node.InputDefs()[w_idx]->Name(); + const auto& weight_tensor = *initializers.at(weight); + + // For ONNX we only have 1 conv ops + // For NNAPI we have 3 + // Input is (N, C, H, W) + // group == 1, --> regular conv + // group != 1 && weight is (M, 1, kH, kW), --> depthwise conv + // group != 1 && weight is (M, C/group, kH, kW), --> grouped conv + if (group == 1) + return ConvType::Regular; + else if ((weight_tensor.dims()[1] == 1)) + return ConvType::Depthwise; + else + return ConvType::Grouped; +} + bool IsQLinearBinaryOp(QLinearOpType qlinear_op_type) { return qlinear_op_type == QLinearOpType::QLinearConv || qlinear_op_type == QLinearOpType::QLinearMatMul || @@ -71,8 +98,9 @@ bool IsQLinearBinaryOp(QLinearOpType qlinear_op_type) { } bool HasValidBinaryOpQuantizedInputs(const Node& node) { + auto op_type = GetQLinearOpType(node); int32_t a_input_type, b_input_type; - if (!IsQLinearBinaryOp(GetQLinearOpType(node))) { + if (!IsQLinearBinaryOp(op_type)) { LOGS_DEFAULT(VERBOSE) << "[" << node.OpType() << "] is not a binary qlinear op"; return false; } @@ -83,7 +111,16 @@ bool 
HasValidBinaryOpQuantizedInputs(const Node& node) { if (!GetType(*input_defs[3], b_input_type)) return false; - if (a_input_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8 || a_input_type != b_input_type) { + // QlinearConv supports u8u8 or u8s8 + // QLinearMatMul/Add only support u8u8 + bool is_qlinear_conv = op_type == QLinearOpType::QLinearConv; + bool has_valid_qlinear_conv_weight = + (b_input_type == ONNX_NAMESPACE::TensorProto_DataType_UINT8 || + b_input_type == ONNX_NAMESPACE::TensorProto_DataType_INT8); + + if (a_input_type != ONNX_NAMESPACE::TensorProto_DataType_UINT8 || + (!is_qlinear_conv && a_input_type != b_input_type) || + (is_qlinear_conv && !has_valid_qlinear_conv_weight)) { LOGS_DEFAULT(VERBOSE) << "[" << node.OpType() << "] A Input type: [" << a_input_type << "] B Input type: [" << b_input_type @@ -95,8 +132,9 @@ bool HasValidBinaryOpQuantizedInputs(const Node& node) { } bool HasValidQuantizationScales(const InitializedTensorSet& initializers, const Node& node, - const std::vector& indices) { - const auto& op = node.OpType(); + const std::vector& indices, const OpSupportCheckParams& params) { + const auto& op_type = node.OpType(); + bool is_qlinear_conv = (op_type == "QLinearConv"); const auto input_defs(node.InputDefs()); for (const auto idx : indices) { if (idx >= input_defs.size()) { @@ -106,13 +144,42 @@ bool HasValidQuantizationScales(const InitializedTensorSet& initializers, const } const auto scale_name = input_defs[idx]->Name(); if (Contains(initializers, scale_name)) { - const auto& tensor = *initializers.at(scale_name); - if (!tensor.dims().empty() && tensor.dims()[0] != 1) { - LOGS_DEFAULT(VERBOSE) << op << " does not support per-channel quantization"; - return false; + const auto& scale_tensor = *initializers.at(scale_name); + int64_t scales_dim = scale_tensor.dims().empty() ? 
1 : scale_tensor.dims()[0]; + bool is_conv_weight = is_qlinear_conv && idx == 4; + bool is_conv_u8s8_weight = false; + + if (is_conv_weight) { + const auto& weight_tensor = *initializers.at(node.InputDefs()[3]->Name()); + is_conv_u8s8_weight = weight_tensor.data_type() == ONNX_NAMESPACE::TensorProto_DataType_INT8; + } + + // We need to check the per-channel quantization scales dimensions for u8s8 QlinearConv + // We only support per-channel quantization for u8s8 + // For all other cases, the scales should be a scalar + if (is_conv_u8s8_weight) { + if (params.android_sdk_ver < 29) { + LOGS_DEFAULT(VERBOSE) << op_type << " only supports per-channel quantization on Android API 29+, " + << "system API level: " << params.android_sdk_ver; + return false; + } + + const auto& weight_tensor = *initializers.at(node.InputDefs()[3]->Name()); + if (weight_tensor.dims()[0] != scales_dim) { + LOGS_DEFAULT(VERBOSE) << op_type << " mismatch int8 per-channel quantization weight," + << " weight dimension[0] " << weight_tensor.dims()[0] + << " scale dimension " << scales_dim; + return false; + } + } else { + if (scales_dim != 1) { + LOGS_DEFAULT(VERBOSE) << op_type << " does not support per-channel quantization, " + << " for now, only u8s8 QlinearConv supports per-channel quantization on API 29+"; + return false; + } } } else { - LOGS_DEFAULT(VERBOSE) << "The scale of " << op << " must be known"; + LOGS_DEFAULT(VERBOSE) << "The scale of " << op_type << " must be known"; return false; } } @@ -122,7 +189,8 @@ bool HasValidQuantizationScales(const InitializedTensorSet& initializers, const bool HasValidQuantizationZeroPoints(const InitializedTensorSet& initializers, const Node& node, const std::vector& indices) { - const auto& op = node.OpType(); + const auto& op_type = node.OpType(); + bool is_qlinear_conv = (op_type == "QLinearConv"); const auto input_defs(node.InputDefs()); for (const auto idx : indices) { if (idx >= input_defs.size()) { @@ -130,20 +198,63 @@ bool 
HasValidQuantizationZeroPoints(const InitializedTensorSet& initializers, co << " >= input number, " << input_defs.size(); return false; } - const auto zero_point_name = node.InputDefs()[idx]->Name(); + + const auto zero_point_name = input_defs[idx]->Name(); if (Contains(initializers, zero_point_name)) { - const auto& tensor = *initializers.at(zero_point_name); - if (!tensor.dims().empty() && tensor.dims()[0] != 1) { - LOGS_DEFAULT(VERBOSE) << op << " does not support per-channel quantization"; - return false; + bool is_conv_weight = is_qlinear_conv && idx == 5; + bool is_conv_u8s8_weight = false; + if (is_conv_weight) { + const auto& weight_tensor = *initializers.at(node.InputDefs()[3]->Name()); + is_conv_u8s8_weight = weight_tensor.data_type() == ONNX_NAMESPACE::TensorProto_DataType_INT8; } - if (tensor.data_type() != ONNX_NAMESPACE::TensorProto_DataType_UINT8) { - LOGS_DEFAULT(VERBOSE) << op << " does not support zero point data type " - << std::to_string(tensor.data_type()); - return false; + + const auto& zero_tensor = *initializers.at(zero_point_name); + int64_t zero_dim = zero_tensor.dims().empty() ? 
1 : zero_tensor.dims()[0]; + if (is_conv_u8s8_weight) { + if (zero_tensor.data_type() != ONNX_NAMESPACE::TensorProto_DataType_INT8) { + LOGS_DEFAULT(VERBOSE) << "u8s8 QlinearConv only supports int8 zero point for weight, " + << "actual zero point type: [" << zero_tensor.data_type() << "]"; + return false; + } + + // For onnx, u8s8 QlinearConv, the weight zero point can be a scalar, + // or a tensor with same channel as weight, for NNAPI we only support it be + // 0 (scalar) or all 0 (tensor), NNAPI will assume the zero point for per-channel + // quantization is 0 there is no input for it + const auto& weight_tensor = *initializers.at(node.InputDefs()[3]->Name()); + if (weight_tensor.dims()[0] != zero_dim && zero_dim != 1) { + LOGS_DEFAULT(VERBOSE) << op_type << " mismatch int8 per-channel quantization weight," + << " weight dimension[0] " << weight_tensor.dims()[0] + << " zero point dimension " << zero_dim; + return false; + } + + std::unique_ptr unpacked_tensor; + size_t tensor_byte_size; + auto status = onnxruntime::utils::UnpackInitializerData(zero_tensor, unpacked_tensor, tensor_byte_size); + if (!status.IsOK()) { + LOGS_DEFAULT(ERROR) << "QLinearConv erro when unpack zero tensor:" << status.ErrorMessage(); + return false; + } + + // Verify all onnx weight zero point(s) are 0(s) + const int8_t* zero_points = reinterpret_cast(unpacked_tensor.get()); + for (size_t i = 0; i < tensor_byte_size; i++) { + if (zero_points[i] != 0) { + LOGS_DEFAULT(VERBOSE) << "QLinearConv only support 0 as zero point, " + << "zero_points[" << i << "] has value: " << zero_points[i]; + return false; + } + } + } else { + if (zero_dim != 1) { + LOGS_DEFAULT(VERBOSE) << op_type << " does not support per-channel quantization, " + << " for now, only u8s8 QlinearConv supports per-channel quantization on API 29+"; + return false; + } } } else { - LOGS_DEFAULT(VERBOSE) << "The zero point of " << op << " must be known"; + LOGS_DEFAULT(VERBOSE) << "The zero point of " << op_type << " must be 
known"; return false; } } diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h index 44825599bc5c8..35e016930d69c 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h @@ -87,8 +87,18 @@ enum class QLinearOpType : uint8_t { // QLinearReduceMean, }; +enum class ConvType : uint8_t { + Regular, + Depthwise, + Grouped, +}; + QLinearOpType GetQLinearOpType(const onnxruntime::Node& node); +// Return the type of the conv ops, +// This function assumes the input is a 2d conv node +ConvType GetConvType(const onnxruntime::Node& node, const InitializedTensorSet& initializers); + // This qlinear op is an operator takes 2 input and produces 1 output // Such as QLinearConv, QLinearMatMul, QLinearAdd, ... bool IsQLinearBinaryOp(QLinearOpType qlinear_op_type); @@ -97,7 +107,7 @@ bool IsQLinearBinaryOp(QLinearOpType qlinear_op_type); bool HasValidBinaryOpQuantizedInputs(const Node& node); // Check if a qlinear op has valid scales for given indices bool HasValidQuantizationScales(const InitializedTensorSet& initializers, const Node& node, - const std::vector& indices); + const std::vector& indices, const OpSupportCheckParams& params); // Check if a qlinear op has valid zero points for given indices bool HasValidQuantizationZeroPoints(const InitializedTensorSet& initializers, const Node& node, const std::vector& indices); diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc index 9bb7e16179408..68f70dfe588bb 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc @@ -26,7 +26,7 @@ int32_t ModelBuilder::GetAndroidSdkVer() const { // Scalar operand is copied into the model, no need to 
persist #define DEFINE_ADD_OPERAND_FROM_SCALAR(scalar_type, op_type) \ Status ModelBuilder::AddOperandFromScalar(scalar_type value, uint32_t& index) { \ - OperandType operandType(Type::op_type); \ + OperandType operandType(Type::op_type, vector{}); \ ORT_RETURN_IF_ERROR(AddNewNNAPIOperand(operandType, index)); \ RETURN_STATUS_ON_ERROR_WITH_NOTE( \ nnapi_->ANeuralNetworksModel_setOperandValue( \ @@ -377,6 +377,18 @@ Status ModelBuilder::AddNewNNAPIOperand(const OperandType& operand_type, uint32_ RETURN_STATUS_ON_ERROR( nnapi_->ANeuralNetworksModel_addOperand(nnapi_model_->model_, &operand_type.operandType)); index = next_index_++; + + if (operand_type.channelQuant) { + if (GetAndroidSdkVer() < 29) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Per-channel quantization is only supported on Android API level 29+,", + " system API level: ", GetAndroidSdkVer()); + } + + RETURN_STATUS_ON_ERROR(nnapi_->ANeuralNetworksModel_setOperandSymmPerChannelQuantParams( + nnapi_model_->model_, index, &operand_type.channelQuant->params)); + } + return Status::OK(); } diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc index b6e5a039c9518..1e5badf13fcd6 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc @@ -458,6 +458,7 @@ static Status HandleAutoPad(const Shape& input_shape, vector& onnx_pads, int32_t& nnapi_padding_code, bool& use_auto_pad) { + use_auto_pad = false; if (auto_pad_type != AutoPadType::NOTSET) { ORT_RETURN_IF_ERROR(ComputeConvPads(input_shape, weight_size_y, weight_size_x, onnx_pads, onnx_strides, onnx_dilations, @@ -524,6 +525,47 @@ static Status GetBinaryOpQuantizationScaleAndZeroPoint( return Status::OK(); } +static Status GetConvOpQuantizationScaleAndZeroPoint( + const ModelBuilder& model_builder, const Node& node, + float& a_scale, float& 
w_scale, float& y_scale, + int32_t& a_zero_point, int32_t& w_zero_point, int32_t& y_zero_point, + optional>& w_scales) ORT_MUST_USE_RESULT; +static Status GetConvOpQuantizationScaleAndZeroPoint( + const ModelBuilder& model_builder, const Node& node, + float& a_scale, float& w_scale, float& y_scale, + int32_t& a_zero_point, int32_t& w_zero_point, int32_t& y_zero_point, + optional>& w_scales) { + // Get scale and zero points + // We will handle per-channel weight scale and zero point later + ORT_RETURN_IF_ERROR( + GetBinaryOpQuantizationScaleAndZeroPoint(model_builder, node, + a_scale, w_scale, y_scale, + a_zero_point, w_zero_point, y_zero_point)); + + const auto input_defs = node.InputDefs(); + const auto& initializers(model_builder.GetInitializerTensors()); + const auto& weight_tensor = *initializers.at(input_defs[3]->Name()); + + // We are done here if this is u8u8 QLinearConv + if (weight_tensor.data_type() == ONNX_NAMESPACE::TensorProto_DataType_UINT8) + return Status::OK(); + + // Now we have u8s8 QlinearConv + // u8s8 QlinearConv always has 0 as zero point so we are not getting it here + // and we do not use w_scale here, so we reset them back to 0 + w_scale = 0.0f; + w_zero_point = 0; + + // We need to copy the 1d scales array for per-channel quantization + const auto& scale_tensor = *initializers.at(input_defs[4]->Name()); + const auto* scales = GetTensorFloatData(scale_tensor); + size_t scales_size = scale_tensor.dims().empty() ? 
1 : scale_tensor.dims()[0]; + vector scales_vec(scales_size, 0.0f); + memcpy(scales_vec.data(), scales, sizeof(float) * scales_size); + w_scales = onnxruntime::make_optional(std::move(scales_vec)); + return Status::OK(); +} + // NNAPI has the quantization scale and zero point embedded in the ANeuralNetworksOperandType // ONNX has the quantization scale and zero point as the inputs of the qlinear operators // We want to verify the scale and zeropoint of the ONNX inputs matches the values embedded in the NNAPI inputs @@ -553,6 +595,35 @@ static Status IsValidInputQuantizedType(const ModelBuilder& model_builder, return Status::OK(); } +static Status IsValidConvWeightQuantizedType(const ModelBuilder& model_builder, + const std::string& input_name, + float scale, + int32_t zero_point, + const optional>& scales) ORT_MUST_USE_RESULT; +static Status IsValidConvWeightQuantizedType(const ModelBuilder& model_builder, + const std::string& input_name, + float scale, + int32_t zero_point, + const optional>& scales) { + // first verify as the weight has no per-channel quantization + ORT_RETURN_IF_ERROR(IsValidInputQuantizedType(model_builder, input_name, scale, zero_point)); + + if (scales) { + const OperandType& input_operand_type = model_builder.GetOperandTypes().at(input_name); + if (!input_operand_type.channelQuant) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Input [", input_name, "] has no channelQuant"); + } + + if (input_operand_type.channelQuant.value().scales != scales.value()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Input [", input_name, "] has mismatch scales between onnx and NNAPI"); + } + } + + return Status::OK(); +} + static void AddBinaryOpQuantizationScaleAndZeroPointToSkip(ModelBuilder& model_builder, const Node& node) { const auto input_defs(node.InputDefs()); model_builder.AddInitializerToSkip(input_defs[1]->Name()); // a_scale @@ -1253,6 +1324,13 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, 
const N } } + const auto& weight = input_defs[w_idx]->Name(); + const auto& weight_tensor = *initializers.at(weight); + auto conv_type = GetConvType(node, model_builder.GetGraphViewer().GetAllInitializedTensors()); + bool conv_2d = (conv_type == ConvType::Regular), + depthwise_conv_2d = (conv_type == ConvType::Depthwise), + grouped_conv_2d = (conv_type == ConvType::Grouped); + float x_scale = 0.0f, w_scale = 0.0f, y_scale = 0.0f; @@ -1260,31 +1338,16 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N w_zero_point = 0, y_zero_point = 0; + // this is for per-channel quantization weights + optional> w_scales; + if (is_qlinear_conv) { - ORT_RETURN_IF_ERROR(GetBinaryOpQuantizationScaleAndZeroPoint(model_builder, node, - x_scale, w_scale, y_scale, - x_zero_point, w_zero_point, y_zero_point)); + ORT_RETURN_IF_ERROR(GetConvOpQuantizationScaleAndZeroPoint(model_builder, node, + x_scale, w_scale, y_scale, + x_zero_point, w_zero_point, y_zero_point, + w_scales)); } - const auto& weight = input_defs[w_idx]->Name(); - const auto& weight_tensor = *initializers.at(weight); - bool conv_2d = false, - depthwise_conv_2d = false, - grouped_conv_2d = false; - - // For ONNX we only have 1 conv ops - // For NNAPI we have 3 - // Input is (N, C, H, W) - // group == 1, --> regular conv - // group != 1 && weight is (M, 1, kH, kW), --> depthwise conv - // group != 1 && weight is (M, C/group, kH, kW), --> grouped conv - if (group == 1) - conv_2d = true; - else if ((weight_tensor.dims()[1] == 1)) - depthwise_conv_2d = true; - else - grouped_conv_2d = true; - Shape onnx_weight_shape; for (auto dim : weight_tensor.dims()) onnx_weight_shape.push_back(SafeInt(dim)); @@ -1297,12 +1360,22 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N case ONNX_NAMESPACE::TensorProto_DataType_UINT8: onnx_weight_type = Type::TENSOR_QUANT8_ASYMM; break; + case ONNX_NAMESPACE::TensorProto_DataType_INT8: + onnx_weight_type = 
Type::TENSOR_QUANT8_SYMM_PER_CHANNEL; + break; default: return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "The initializer of graph ", weight, " doesn't have valid type: ", weight_tensor.data_type()); } - OperandType onnx_weight_operand_type(onnx_weight_type, onnx_weight_shape, w_scale, w_zero_point); + // Get weight operand type + // Per-channel quantized weight is handled differently + OperandType onnx_weight_operand_type = + (is_qlinear_conv && w_scales.has_value()) + ? OperandType{onnx_weight_type, onnx_weight_shape, + SymmPerChannelQuantParams{w_scales.value(), + depthwise_conv_2d ? 3u : 0u}} // channelDim is 3 for depthwise-conv + : OperandType{onnx_weight_type, onnx_weight_shape, w_scale, w_zero_point}; // Pre-process weights if (conv_2d || grouped_conv_2d) { @@ -1314,7 +1387,7 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N if (is_qlinear_conv) { // Verify if the scale and zero point matchs from onnx input/weight and nnapi input/weight ORT_RETURN_IF_ERROR(IsValidInputQuantizedType(model_builder, input, x_scale, x_zero_point)); - ORT_RETURN_IF_ERROR(IsValidInputQuantizedType(model_builder, weight, w_scale, w_zero_point)); + ORT_RETURN_IF_ERROR(IsValidConvWeightQuantizedType(model_builder, weight, w_scale, w_zero_point, w_scales)); } bool hasBias = (input_defs.size() > b_idx); @@ -1332,14 +1405,15 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N vector buffer(bias_dimen[0], 0.0f); OperandType bias_operand_type(Type::TENSOR_FLOAT32, bias_dimen, x_scale * w_scale); ORT_RETURN_IF_ERROR(model_builder.AddOperandFromPersistMemoryBuffer(bias, buffer.data(), bias_operand_type)); - } else if (weight_type == Type::TENSOR_QUANT8_ASYMM) { + } else if (weight_type == Type::TENSOR_QUANT8_ASYMM || weight_type == Type::TENSOR_QUANT8_SYMM_PER_CHANNEL) { vector buffer(bias_dimen[0], 0); OperandType bias_operand_type(Type::TENSOR_INT32, bias_dimen, x_scale * w_scale); 
ORT_RETURN_IF_ERROR(model_builder.AddOperandFromPersistMemoryBuffer(bias, buffer.data(), bias_operand_type)); } else { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Unknown weight type ", TypeToStr(weight_type)); } - } else if (is_qlinear_conv) { // QLinearConv's bias type need special handling + } else if (is_qlinear_conv) { + // QLinearConv's bias type need special handling to add scale for quantization input const auto& bias_tensor = *model_builder.GetInitializerTensors().at(bias); ORT_RETURN_IF_NOT(bias_tensor.data_type() == ONNX_NAMESPACE::TensorProto_DataType_INT32, "bias of QLinearConv should be int32, actual type: ", bias_tensor.data_type()); diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc index 43e68fd98f92f..47d2be430b7e1 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc @@ -228,7 +228,7 @@ bool BinaryOpSupportChecker::HasSupportedInputsImpl(const Node& node) const { } bool BinaryOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, - const OpSupportCheckParams& /* params */) const { + const OpSupportCheckParams& params) const { const auto& op_type(node.OpType()); const auto input_defs(node.InputDefs()); bool op_is_qlinear = op_type == "QLinearAdd"; @@ -265,7 +265,7 @@ bool BinaryOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initi // All scale/zero points are initializer scalars // a/b/y_scale - if (!HasValidQuantizationScales(initializers, node, {1, 4, 6})) + if (!HasValidQuantizationScales(initializers, node, {1, 4, 6}, params)) return false; // a/b/y_zero_point @@ -599,7 +599,7 @@ bool ConvOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initial } // a/b/y_scale - if (!HasValidQuantizationScales(initializers, node, {1, 4, 6})) 
+ if (!HasValidQuantizationScales(initializers, node, {1, 4, 6}, params)) return false; // a/b/y_zero_point @@ -860,7 +860,7 @@ bool GemmOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initial // All scale/zero points are initializer scalars // a/b/y_scale - if (!HasValidQuantizationScales(initializers, node, {1, 4, 6})) + if (!HasValidQuantizationScales(initializers, node, {1, 4, 6}, params)) return false; // a/b/y_zero_point @@ -1003,7 +1003,7 @@ class QuantizeLinearOpSupportChecker : public BaseOpSupportChecker { }; bool QuantizeLinearOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, - const OpSupportCheckParams& /* params */) const { + const OpSupportCheckParams& params) const { const auto input_defs(node.InputDefs()); const auto output_defs(node.OutputDefs()); @@ -1018,7 +1018,7 @@ bool QuantizeLinearOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSe return false; } - if (!HasValidQuantizationScales(initializers, node, {1})) + if (!HasValidQuantizationScales(initializers, node, {1}, params)) return false; if (input_defs.size() == 3) { // has zero_point input @@ -1045,9 +1045,9 @@ class DequantizeLinearOpSupportChecker : public BaseOpSupportChecker { }; bool DequantizeLinearOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, - const OpSupportCheckParams& /* params */) const { + const OpSupportCheckParams& params) const { const auto input_defs(node.InputDefs()); - if (!HasValidQuantizationScales(initializers, node, {1})) + if (!HasValidQuantizationScales(initializers, node, {1}, params)) return false; if (input_defs.size() == 3) { // has zero_point input diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_lib/NeuralNetworksWrapper.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_lib/NeuralNetworksWrapper.cc index e44da9635a1e6..ff5a619cb3827 100644 --- 
a/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_lib/NeuralNetworksWrapper.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_lib/NeuralNetworksWrapper.cc @@ -32,10 +32,22 @@ OperandType::OperandType(Type type, const std::vector& d, float scale, }; } -OperandType::OperandType(const OperandType& other) { - type = other.type; - dimensions = other.dimensions; - operandType = other.operandType; +OperandType::OperandType(Type type, const std::vector& d, SymmPerChannelQuantParams&& channelQuant) + : type(type), dimensions(d), channelQuant(std::move(channelQuant)) { + operandType = { + .type = static_cast(type), + .dimensionCount = static_cast(dimensions.size()), + .dimensions = dimensions.size() > 0 ? dimensions.data() : nullptr, + .scale = 0.0f, + .zeroPoint = 0, + }; +} + +OperandType::OperandType(const OperandType& other) + : operandType(other.operandType), + type(other.type), + dimensions(other.dimensions), + channelQuant(other.channelQuant) { operandType.dimensions = dimensions.size() > 0 ? dimensions.data() : nullptr; } @@ -44,6 +56,7 @@ OperandType& OperandType::operator=(const OperandType& other) { type = other.type; dimensions = other.dimensions; operandType = other.operandType; + channelQuant = other.channelQuant; operandType.dimensions = dimensions.size() > 0 ? 
dimensions.data() : nullptr; } diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_lib/NeuralNetworksWrapper.h b/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_lib/NeuralNetworksWrapper.h index c75e301e5ddce..2836a034b4013 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_lib/NeuralNetworksWrapper.h +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_lib/NeuralNetworksWrapper.h @@ -21,6 +21,9 @@ #include "NeuralNetworksTypes.h" +// Move to std::optional when we switch to c++ 17 +#include "core/common/optional.h" + template T Product(const std::vector& v) { return static_cast( @@ -99,12 +102,40 @@ inline std::string TypeToStr(const Type& type) { } } +struct SymmPerChannelQuantParams { + ANeuralNetworksSymmPerChannelQuantParams params; + std::vector scales; + SymmPerChannelQuantParams(std::vector scalesVec, uint32_t channelDim) + : scales(std::move(scalesVec)) { + params = { + .channelDim = channelDim, + .scaleCount = static_cast(scales.size()), + .scales = scales.size() > 0 ? scales.data() : nullptr, + }; + } + SymmPerChannelQuantParams(const SymmPerChannelQuantParams& other) + : params(other.params), scales(other.scales) { + params.scales = scales.size() > 0 ? scales.data() : nullptr; + } + SymmPerChannelQuantParams& operator=(const SymmPerChannelQuantParams& other) { + if (this != &other) { + params = other.params; + scales = other.scales; + params.scales = scales.size() > 0 ? 
scales.data() : nullptr; + } + return *this; + } +}; + struct OperandType { ANeuralNetworksOperandType operandType; Type type; std::vector dimensions; + onnxruntime::optional channelQuant; + + explicit OperandType(Type type, const std::vector& d, float scale = 0.0f, int32_t zeroPoint = 0); + explicit OperandType(Type type, const std::vector& d, SymmPerChannelQuantParams&& channelQuant); - explicit OperandType(Type type, const std::vector& d = {}, float scale = 0.0f, int32_t zeroPoint = 0); OperandType(const OperandType& other); OperandType& operator=(const OperandType& other); diff --git a/onnxruntime/test/providers/cpu/nn/qlinearconv_op_test.cc b/onnxruntime/test/providers/cpu/nn/qlinearconv_op_test.cc index d34a7774530a3..c3a29213ddcc9 100644 --- a/onnxruntime/test/providers/cpu/nn/qlinearconv_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/qlinearconv_op_test.cc @@ -389,7 +389,9 @@ class QLinearConvOpTester { Y_shape.push_back(output_channels); for (size_t n = 0; n < kernel_rank; n++) { Y_shape.push_back(((input_shape[n] + pads[n] + pads[kernel_rank + n]) - - (dilations[n] * (kernel_shape[n] - 1) + 1)) / strides[n] + 1); + (dilations[n] * (kernel_shape[n] - 1) + 1)) / + strides[n] + + 1); } const int64_t* output_shape = Y_shape.data() + 2; Y_data.resize(ShapeSize(Y_shape)); @@ -464,22 +466,38 @@ class QLinearConvOpTester { test.AddInput("x", X_.shape_, X_.data_); test.AddInput("x_scale", {}, X_.scale_, all_input_initializer_except_x); - test.AddInput("x_zero_point", {}, {X_.zero_point_}); + test.AddInput("x_zero_point", {}, {X_.zero_point_}, all_input_initializer_except_x); const std::vector W_scale_shape{static_cast(W_.scale_.size())}; test.AddInput("w", W_.shape_, W_.data_, all_input_initializer_except_x); test.AddInput("w_scale", W_scale_shape, W_.scale_, all_input_initializer_except_x); - test.AddInput("w_zero_point", {}, {W_.zero_point_}); + test.AddInput("w_zero_point", {}, {W_.zero_point_}, all_input_initializer_except_x); test.AddInput("y_scale", {}, 
{output_scale_}, all_input_initializer_except_x); - test.AddInput("y_zero_point", {}, {output_zero_point_}); + test.AddInput("y_zero_point", {}, {output_zero_point_}, all_input_initializer_except_x); if (!B_.empty()) { const std::vector B_shape{static_cast(B_.size())}; - test.AddInput("b", B_shape, B_); + test.AddInput("b", B_shape, B_, all_input_initializer_except_x); } - test.AddOutput("y", Y_shape, Y_data); + float abs_error = 0.0f; + + // For quantized models, NNAPI's rounding is different than CPU provider + // Sometimes the result is within +/-1 of result of CPU provider + // For ONNX, we use rounding to nearest ties to even. + // For NNAPI, it is using std::round which is HALF_AWAY_FROM_ZERO, see + // https://android.googlesource.com/platform/frameworks/ml/+/refs/heads/master/nn/common/operations/Quantize.cpp + // Use 1 as abs_error which is the smallest possible for uint8_t + // + // NOTE, for now the tolerance will only apply if the NNAPI is actually used, + // if for any reason the execution falls back to CPU, we still expect an exact match + // See, 'void Check(...' 
in onnxruntime/test/providers/provider_test_utils.cc +#ifdef USE_NNAPI + abs_error = 1.0f; +#endif + + test.AddOutput("y", Y_shape, Y_data, false /* sort_output */, 0.0f /* rel_error */, abs_error); if (!pads_.empty()) { test.AddAttribute("pads", pads_); diff --git a/onnxruntime/test/providers/provider_test_utils.cc b/onnxruntime/test/providers/provider_test_utils.cc index ee12ff542e42b..43e9858e0c13c 100644 --- a/onnxruntime/test/providers/provider_test_utils.cc +++ b/onnxruntime/test/providers/provider_test_utils.cc @@ -68,6 +68,53 @@ void Check(const OpTester::Data& expected_data, const Tensor& output_tensor, } } +template <> +void Check(const OpTester::Data& expected_data, + const Tensor& output_tensor, + const std::string& provider_type) { + auto& expected_tensor = expected_data.data_.Get(); + auto* expected = expected_tensor.template Data(); + auto* output = output_tensor.template Data(); + auto size = output_tensor.Shape().Size(); + + bool has_abs_err = expected_data.absolute_error_.has_value(); + bool has_rel_err = expected_data.relative_error_.has_value(); + + if (expected_data.sort_output_) { + // if order can be jumbled in the output of an operator, sort both the + // expected and output buffers prior to + // comparison this is a "best-effort" algo and should satisfy the + // requirement for the few ops that do require this + // support without investing in a more sophisticated infrastructure for the + // same + sort_expected_and_actual_buffers(expected, output, size); + } + + // For uint8_t results, we only allow NNAPI EP to have an error tolerance, see below for the reason + // For any other EPs, we still expect an exact match for the results + if (provider_type == kNnapiExecutionProvider && (has_abs_err || has_rel_err)) { + double threshold = has_abs_err + ? 
expected_data.absolute_error_.value() + : 0.0; + + for (int i = 0; i < size; ++i) { + if (has_rel_err) { + EXPECT_NEAR(expected[i], output[i], + expected_data.relative_error_.value() * expected[i]) // expected[i] is unsigned, can't be negative + << "i:" << i << ", provider_type: " << provider_type; + } else { // has_abs_err + EXPECT_NEAR(expected[i], output[i], threshold) + << "i:" << i << ", provider_type: " << provider_type; + } + } + } else { + for (int i = 0; i < size; ++i) { + EXPECT_EQ(expected[i], output[i]) << "i:" << i + << ", provider_type: " << provider_type; + } + } +} + template <> void Check(const OpTester::Data& expected_data, const Tensor& output_tensor, @@ -747,8 +794,7 @@ void OpTester::Run( kAclExecutionProvider, kArmNNExecutionProvider, kNnapiExecutionProvider, - kRocmExecutionProvider - }; + kRocmExecutionProvider}; bool has_run = false; @@ -844,8 +890,7 @@ void OpTester::Run( } } - if (!valid) - { + if (!valid) { std::cerr << "No kernel registered from EP: " << provider_type << "for node: " << node.OpType() << std::endl; break; } diff --git a/onnxruntime/test/providers/provider_test_utils.h b/onnxruntime/test/providers/provider_test_utils.h index 2f4c579bb6272..f15c0cb4a3848 100644 --- a/onnxruntime/test/providers/provider_test_utils.h +++ b/onnxruntime/test/providers/provider_test_utils.h @@ -324,20 +324,24 @@ class OpTester { template void AddOutput(const char* name, const std::vector& dims, const std::initializer_list& expected_values, - bool sort_output = false) { - AddData(output_data_, name, dims, expected_values.begin(), expected_values.size(), false, sort_output); + bool sort_output = false, float rel_error = 0.0f, float abs_error = 0.0f) { + AddData(output_data_, name, dims, expected_values.begin(), expected_values.size(), false, + sort_output, nullptr /* dim_params */, rel_error, abs_error); } // This function doesn't work for vector because const vector cannot invoke its data(). 
template void AddOutput(const char* name, const std::vector& dims, const std::vector& expected_values, - bool sort_output = false) { - AddData(output_data_, name, dims, expected_values.data(), expected_values.size(), false, sort_output); + bool sort_output = false, float rel_error = 0.0f, float abs_error = 0.0f) { + AddData(output_data_, name, dims, expected_values.data(), expected_values.size(), false, + sort_output, nullptr /* dim_params */, rel_error, abs_error); } template - void AddOutput(const char* name, const std::vector& dims, const T* p_values, const size_t size) { - AddData(output_data_, name, dims, p_values, size); + void AddOutput(const char* name, const std::vector& dims, const T* p_values, const size_t size, + bool sort_output = false, float rel_error = 0.0f, float abs_error = 0.0f) { + AddData(output_data_, name, dims, p_values, size, false, + sort_output, nullptr /* dim_params */, rel_error, abs_error); } template @@ -521,7 +525,8 @@ class OpTester { template void AddData(std::vector& data, const char* name, const std::vector& dims, const T* values, int64_t values_count, bool is_initializer = false, bool sort_output = false, - const std::vector* dim_params = nullptr) { + const std::vector* dim_params = nullptr, + float rel_error = 0.0f, float abs_error = 0.0f) { ORT_TRY { TensorShape shape{dims}; ORT_ENFORCE(shape.Size() == values_count, values_count, " input values doesn't match tensor size of ", @@ -565,7 +570,19 @@ class OpTester { } node_arg.SetShape(new_shape); } - data.push_back(Data(std::move(node_arg), std::move(value), optional(), optional(), sort_output)); + + optional rel; + optional abs; + + if (rel_error != 0.0f) { + rel = rel_error; + } + + if (abs_error != 0.0f) { + abs = abs_error; + } + + data.push_back(Data(std::move(node_arg), std::move(value), std::move(rel), std::move(abs), sort_output)); if (is_initializer) initializer_index_.push_back(data.size() - 1); } ORT_CATCH(const std::exception& ex) {