Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 46 additions & 0 deletions onnxruntime/core/optimizer/nhwc_transformer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ class NhwcTransformerImpl {
void TransformQLinearActivation(Node& node);
void TransformQLinearGlobalAveragePool(Node& node);
void TransformSplit(Node& node);
void TransformPad(Node& node);

Graph& graph_;

Expand Down Expand Up @@ -282,6 +283,49 @@ void NhwcTransformerImpl::TransformSplit(Node& node) {
CreateNhwcArgument(node, node, nhwc_input->rank_);
}

// Converts a Pad node operating on NCHW data to operate on NHWC data by
// permuting its constant "pads" initializer from NCHW axis order to NHWC.
void NhwcTransformerImpl::TransformPad(Node& node) {
  auto& input_defs = node.MutableInputDefs();

  auto* nhwc_input = LookupNhwcArgument(input_defs[0]);
  if (nhwc_input == nullptr) {
    return;
  }

  // The pads input must be a constant 1-D initializer with 2 * rank entries,
  // and the tensor must have spatial axes for an NHWC permutation to matter.
  const ONNX_NAMESPACE::TensorProto* pads_tensor_proto = nullptr;
  if (!graph_utils::NodeArgIsConstant(graph_, *input_defs[1]) ||
      !graph_.GetInitializedTensor(input_defs[1]->Name(), pads_tensor_proto) ||
      (pads_tensor_proto->dims_size() != 1) ||
      (pads_tensor_proto->dims(0) != nhwc_input->rank_ * 2) ||
      (nhwc_input->rank_ <= 2)) {  // batch/channel axes only, no spatial axes
    return;
  }

  // Permute the pads from NCHW order [N, C, spatial...] to NHWC order
  // [N, spatial..., C], independently for the "begin" and "end" halves.
  Initializer pads_initializer{*pads_tensor_proto, graph_.ModelPath()};
  const int64_t* nchw_pads_data = pads_initializer.data<int64_t>();
  const size_t rank = static_cast<size_t>(pads_tensor_proto->dims(0)) / 2;
  std::vector<int64_t> nhwc_pads(rank * 2);
  for (size_t half = 0; half < 2; ++half) {
    const size_t base = half * rank;
    nhwc_pads[base] = nchw_pads_data[base];                 // batch pad stays first
    nhwc_pads[base + rank - 1] = nchw_pads_data[base + 1];  // channel pad moves last
    for (size_t i = 0; i < rank - 2; ++i) {
      nhwc_pads[base + 1 + i] = nchw_pads_data[base + 2 + i];  // spatial pads shift left
    }
  }

  ONNX_NAMESPACE::TensorProto nhwc_pads_tensor_proto;
  nhwc_pads_tensor_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64);
  nhwc_pads_tensor_proto.set_name(graph_.GenerateNodeArgName("nhwc_permutated_pads"));
  nhwc_pads_tensor_proto.set_raw_data(nhwc_pads.data(), rank * 2 * sizeof(int64_t));
  nhwc_pads_tensor_proto.add_dims(rank * 2);
  NodeArg* nhwc_pads_arg = &graph_utils::AddInitializer(graph_, nhwc_pads_tensor_proto);

  // Update the node to directly use the NHWC inputs and decrement the original
  // use counts of the NHWC inputs.
  input_defs[1] = nhwc_pads_arg;
  input_defs[0] = nhwc_input->nhwc_arg_;
  nhwc_input->remaining_original_uses_--;

  CreateNhwcArgument(node, node, nhwc_input->rank_);
}

void NhwcTransformerImpl::Transform(Node& node) {
if (graph_utils::IsSupportedOptypeVersionAndDomain(node, "QLinearConv", {10})) {
TransformQLinearConv(node);
Expand All @@ -295,6 +339,8 @@ void NhwcTransformerImpl::Transform(Node& node) {
TransformQLinearGlobalAveragePool(node);
} else if (graph_utils::IsSupportedOptypeVersionAndDomain(node, "Split", {2, 11, 13})) {
TransformSplit(node);
} else if (graph_utils::IsSupportedOptypeVersionAndDomain(node, "Pad", {11, 13})) {
TransformPad(node);
}
}

Expand Down
21 changes: 17 additions & 4 deletions onnxruntime/core/providers/cpu/tensor/pad.cc
Original file line number Diff line number Diff line change
Expand Up @@ -322,8 +322,15 @@ static Status PadImpl(OpKernelContext* ctx,

int64_t prePad = reshaped_pad[inner_axis];
int64_t postPad = reshaped_pad[inner_axis + new_dims_count];
PadAxisConstant(axisStart - prePad, *axisStart, prePad);
PadAxisConstant(output, *(output - 1), postPad);
if (inner_no_pad_size == 1) {
PadAxisConstant(axisStart - prePad, *axisStart, prePad);
PadAxisConstant(output, *(output - 1), postPad);
} else {
// When the innermost axis(es) need no padding, the PadAxisConstant() calls above do not fit the Edge mode.
// The general loop below also works fine once the first pad axis is combined with the non-padded axes.
PadAxis(axisStart - prePad, axisStart, 1, -ptrdiff_t(inner_no_pad_size), inner_no_pad_size, pads[inner_axis]);
PadAxis(output, output - inner_no_pad_size, 1, -ptrdiff_t(inner_no_pad_size), inner_no_pad_size, pads[inner_axis + data_rank]);
}
output += postPad;
alignSkip = prePad;
}
Expand Down Expand Up @@ -353,8 +360,14 @@ static Status PadImpl(OpKernelContext* ctx,

int64_t prePad = reshaped_pad[inner_axis];
int64_t postPad = reshaped_pad[inner_axis + new_dims_count];
PadInnermostAxis(axisStart - prePad, axisStart + prePad, -1 /* inputDelta */, prePad);
PadInnermostAxis(output, output - 2, -1 /* inputDelta */, postPad);
if (inner_no_pad_size == 1) {
PadInnermostAxis(axisStart - prePad, axisStart + prePad, -1 /* inputDelta */, prePad);
PadInnermostAxis(output, output - 2, -1 /* inputDelta */, postPad);
} else {
// When the innermost axis(es) need no padding, the PadInnermostAxis() calls above do not fit the Reflect mode.
PadAxis(axisStart - prePad, axisStart + prePad, 1, -ptrdiff_t(inner_no_pad_size * 2), inner_no_pad_size, pads[inner_axis]);
PadAxis(output, output - 2 * inner_no_pad_size, 1, -ptrdiff_t(inner_no_pad_size * 2), inner_no_pad_size, pads[inner_axis + data_rank]);
}
output += postPad;
alignSkip = prePad;
}
Expand Down
12 changes: 6 additions & 6 deletions onnxruntime/python/tools/quantization/calibrate.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,14 +139,14 @@ def get_intermediate_outputs(self, calib_mode='naive', providers=None, ort_graph
'''

#conduct inference session and get intermediate outputs
sess_options = onnxruntime.SessionOptions()
if ort_graph_optimization_enable:
session = onnxruntime.InferenceSession(self.augmented_model_path, None)
sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_BASIC
else:
sess_options = onnxruntime.SessionOptions()
sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL #ORT_ENABLE_BASIC
session = onnxruntime.InferenceSession(self.augmented_model_path,
sess_options=sess_options,
providers=providers)
sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL
session = onnxruntime.InferenceSession(self.augmented_model_path,
sess_options=sess_options,
providers=providers)

#number of outputs in original model
num_model_outputs = len(self.model.graph.output)
Expand Down
16 changes: 10 additions & 6 deletions onnxruntime/python/tools/quantization/onnx_quantizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from onnxruntime import SessionOptions, InferenceSession, GraphOptimizationLevel

from .quant_utils import QuantizationMode, QuantizedValueType, QuantizedInitializer, QuantizedValue, quantization_modes
from .quant_utils import find_by_name, get_elem_index, get_mul_node, generate_identified_filename, attribute_to_kwarg, type_to_name
from .quant_utils import find_by_name, get_elem_index, get_mul_node, generate_identified_filename, attribute_to_kwarg, type_to_name, quantize_nparray
from .quant_utils import QuantType, onnx_domain, __producer__, __version__

from .registry import CreateOpQuantizer, CreateDefaultOpQuantizer
Expand Down Expand Up @@ -48,11 +48,11 @@ def quantize_data(data, quantize_range, qType):
scale = (float(max_range) * 2) / quantize_range if max_range > 0 else 1
zero_point = 0
# signed byte type
quantized_data = (np.asarray(data) / scale).round().astype('b')
quantized_data = quantize_nparray(QuantType.QInt8, np.asarray(data), scale, zero_point)
elif qType == onnx_proto.TensorProto.UINT8:
scale = (float(rmax) - rmin) / quantize_range if rmin != rmax else 1
zero_point = round((0 - rmin) / scale) # round to nearest integer
quantized_data = ((np.asarray(data) / scale).round() + zero_point).astype('B') # unsigned byte type
quantized_data = quantize_nparray(QuantType.QUInt8, np.asarray(data), scale, zero_point)
else:
raise ValueError("Unexpected data type {} requested. Only INT8 and UINT8 are supported.".format(qType))

Expand Down Expand Up @@ -500,21 +500,25 @@ def _get_quantization_params(self, param_name):

return True, scale_name, zero_point_name, scale_shape, zero_point_shape

def _get_quantize_input_nodes(self, node, input_index, qType):
def _get_quantize_input_nodes(self, node, input_index, qType, given_scale_name = None, given_zp_name = None):
'''
Given an input for a node (which is not a initializer), this function
- add nodes to compute zero point and scale for this input if they don't exist.
- add new QuantizeLinear node to quantize the input.
parameter node: node being quantized in NodeProto format.
parameter input_index: index of input in node.input.
parameter qType: type to quantize to.
parameter given_scale_name: if provided, quantize the input using this scale tensor.
parameter given_zp_name: if provided, quantize the input using this zero-point tensor.
return: List of newly created nodes in NodeProto format.
'''
input_name = node.input[input_index]
output_name = input_name + "_quantized"

data_found, scale_name, zp_name, _, _ = \
self._get_quantization_params(input_name)
if (given_scale_name is not None) and (given_zp_name is not None):
data_found, scale_name, zp_name = (True, given_scale_name, given_zp_name)
else:
data_found, scale_name, zp_name, _, _ = self._get_quantization_params(input_name)

if self.static:
if data_found == False:
Expand Down
68 changes: 68 additions & 0 deletions onnxruntime/python/tools/quantization/operators/pad.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import onnx
import numpy as np
from .base_operator import QuantOperatorBase
from ..quant_utils import QuantizedValue, QuantizedValueType, attribute_to_kwarg, quantize_nparray


class QPad(QuantOperatorBase):
    """Quantizer for the Pad operator.

    Rewrites a Pad node whose data input has already been quantized so that it
    pads the quantized tensor directly.  For 'constant' mode the optional
    'constant_value' input is quantized with the same scale/zero-point as the
    data input; when 'constant_value' is absent, the zero-point tensor is
    appended as the pad value (the quantized representation of 0.0).
    """

    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node
        assert (node.op_type == "Pad")

        # Only opset 11+ Pad has the optional 'constant_value' input.
        # If input[0] is not quantized, do not quantize this node.
        if (self.quantizer.opset_version < 11) or (node.input[0] not in self.quantizer.quantized_value_map):
            super().quantize()
            return
        quantized_input_value = self.quantizer.quantized_value_map[node.input[0]]

        kwargs = {}
        for attribute in node.attribute:
            kv = attribute_to_kwarg(attribute)
            kwargs.update(kv)

        if 'mode' not in kwargs or kwargs['mode'] == b'constant':
            if len(node.input) > 2:  # There is a 3rd input 'constant_value'
                zp_tensor = self.quantizer.model.get_initializer(quantized_input_value.zp_name)
                scale_tensor = self.quantizer.model.get_initializer(quantized_input_value.scale_name)
                if zp_tensor is None or scale_tensor is None:
                    # Cannot recover the data input's quantization parameters.
                    super().quantize()
                    return

                padding_constant_initializer = self.quantizer.model.get_initializer(node.input[2])
                if padding_constant_initializer is not None:
                    # Statically quantize the constant pad value with the data
                    # input's scale and zero-point.
                    zp_array = onnx.numpy_helper.to_array(zp_tensor)
                    zp_value = zp_array.item() if zp_array.ndim == 0 else zp_array[0]
                    scale_array = onnx.numpy_helper.to_array(scale_tensor)
                    scale_value = scale_array.item() if scale_array.ndim == 0 else scale_array[0]
                    padding_constant_array = onnx.numpy_helper.to_array(padding_constant_initializer)
                    quantized_padding_constant_array = quantize_nparray(
                        quantized_input_value.qType, padding_constant_array, scale_value, zp_value)
                    quantized_padding_constant_name = node.input[2] + "_quantized"
                    quantized_padding_constant_initializer = onnx.numpy_helper.from_array(
                        quantized_padding_constant_array, quantized_padding_constant_name)
                    # Suppose this padding constant initializer is only used by this node
                    self.quantizer.model.remove_initializer(padding_constant_initializer)
                    self.quantizer.model.add_initializer(quantized_padding_constant_initializer)
                    node.input[2] = quantized_padding_constant_name
                else:
                    # The pad value is not an initializer: insert nodes to
                    # quantize it at runtime, reusing the data input's
                    # scale/zero-point.  _get_quantize_input_nodes returns a
                    # LIST of nodes whose last element is the QuantizeLinear
                    # producing the quantized value, so extend new_nodes with
                    # the list and wire in that node's output.
                    pad_value_qnodes = self.quantizer._get_quantize_input_nodes(
                        node, 2, quantized_input_value.qType,
                        quantized_input_value.scale_name, quantized_input_value.zp_name)
                    self.quantizer.new_nodes += pad_value_qnodes
                    node.input[2] = pad_value_qnodes[-1].output[0]
            else:
                node.input.extend([quantized_input_value.zp_name])  # pad zero_point for original zero

        # Create an entry for output quantized value; the output shares the
        # input's scale and zero-point since Pad only copies/replicates values.
        quantized_output_value = QuantizedValue(node.output[0], node.output[0] + "_quantized",
                                                quantized_input_value.scale_name, quantized_input_value.zp_name,
                                                QuantizedValueType.Input)
        self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value

        node.input[0] = quantized_input_value.q_name
        node.output[0] = quantized_output_value.q_name
        self.quantizer.new_nodes += [node]
9 changes: 9 additions & 0 deletions onnxruntime/python/tools/quantization/quant_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,15 @@ class QuantType(Enum):
}


def quantize_nparray(qtype, arr, scale, zero_point, low = None, high = None):
    """Linearly quantize a float array: round(arr / scale) + zero_point,
    clipped to the target type's range and cast to that numpy dtype.

    `low`/`high` optionally narrow the clip range; they can never widen it
    beyond the type's own limits (uint8: [0, 255]; int8: [-127, 127] — note
    -128 is deliberately excluded).
    """
    dtype = QUANT_TYPE_TO_NP_TYPE[qtype]
    if dtype == numpy.uint8:
        type_low, type_high = 0, 255
    else:
        type_low, type_high = -127, 127
    cliplow = max(type_low, -127 if low is None else low)
    cliphigh = min(type_high, 255 if high is None else high)
    quantized = numpy.asarray((arr.astype(numpy.float32) / scale).round() + zero_point)
    numpy.clip(quantized, cliplow, cliphigh, out=quantized)
    return quantized.astype(dtype)


class QuantizedInitializer:
'''
Represents a linearly quantized weight input from ONNX operators
Expand Down
2 changes: 1 addition & 1 deletion onnxruntime/python/tools/quantization/quantize.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def optimize_model(model_path: Path):
sess_option = SessionOptions()
sess_option.optimized_model_filepath = opt_model_path.as_posix()
sess_option.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_BASIC
_ = InferenceSession(model_path.as_posix(), sess_option)
_ = InferenceSession(model_path.as_posix(), sess_option, providers=['CPUExecutionProvider'])
optimized_model = onnx.load(opt_model_path.as_posix())
return optimized_model

Expand Down
4 changes: 3 additions & 1 deletion onnxruntime/python/tools/quantization/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from .operators.gavgpool import QGlobalAveragePool
from .operators.lstm import LSTMQuant
from .operators.split import QSplit
from .operators.pad import QPad

CommonOpsRegistry = {"Gather": GatherQuant, "EmbedLayerNormalization": EmbedLayerNormalizationQuant}

Expand All @@ -33,7 +34,8 @@
"Sigmoid": QLinearActivation,
"MaxPool": QMaxPool,
"GlobalAveragePool": QGlobalAveragePool,
"Split": QSplit,
"Split" : QSplit,
"Pad" : QPad,
}
QLinearOpsRegistry.update(CommonOpsRegistry)

Expand Down
34 changes: 34 additions & 0 deletions onnxruntime/test/optimizer/nhwc_transformer_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -455,6 +455,40 @@ TEST(NhwcTransformerTests, ConvSplit) {
}
}

// Exercises the NHWC transform on a QLinearConv -> Pad -> QLinearConv chain
// for each Pad mode; the transformed graph must contain both convs as
// com.microsoft.QLinearConv with exactly two layout Transposes remaining.
TEST(NhwcTransformerTests, ConvPad) {
  const std::vector<std::string> pad_modes = {"constant", "reflect", "edge"};
  for (const auto& pad_mode : pad_modes) {
    auto build_test_case = [&](NhwcTestHelper& helper) {
      auto* input_arg = helper.MakeInput<uint8_t>({1, 23, 13, 13});
      auto* conv1_output_arg = helper.MakeIntermediate();

      Node& conv1_node = helper.AddQLinearConvNode<uint8_t>(input_arg, .01f, 135,
                                                            {30, 23, 3, 3}, .02f, 126,
                                                            conv1_output_arg, .37f, 131);
      conv1_node.AddAttribute("pads", std::vector<int64_t>{1, 1, 1, 1});

      // Pads are given in NCHW axis order: begin values then end values.
      auto* pads_const = helper.MakeScalarInitializer<uint8_t>(131);
      auto* pads_arg = helper.Make1DInitializer<int64_t>({0, 0, 1, 2, 0, 0, 3, 4});
      auto* pad_output_arg = helper.MakeIntermediate();
      Node& pad_node = helper.AddNode("Pad", {conv1_output_arg, pads_arg, pads_const}, {pad_output_arg});
      pad_node.AddAttribute("mode", pad_mode);

      auto* conv2_output_arg = helper.MakeIntermediate();
      helper.AddQLinearConvNode<uint8_t>(pad_output_arg, .37f, 131,
                                         {16, 30, 3, 3}, .015f, 129,
                                         conv2_output_arg, .37f, 131);

      auto* output_arg = helper.MakeOutput();
      helper.AddDequantizeLinearNode(conv2_output_arg, .37f, 131, output_arg);
    };

    auto check_nhwc_graph = [&](InferenceSessionWrapper& session) {
      auto op_to_count = CountOpsInGraph(session.GetGraph());
      EXPECT_EQ(op_to_count["com.microsoft.QLinearConv"], 2);
      EXPECT_EQ(op_to_count["Transpose"], 2);
    };

    NhwcTransformerTester(build_test_case, check_nhwc_graph);
  }
}

TEST(NhwcTransformerTests, ConvBlockActivation) {
auto test_case = [&](uint32_t extra_edges) {
auto build_test_case = [&](NhwcTestHelper& helper) {
Expand Down
Loading