Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
bfd35c9
Draft code to remove Q/DQ ops from node units in OpenVINO EP
adrianlizarraga May 1, 2024
787e512
remove unnecessary code
adrianlizarraga May 1, 2024
8590e6c
Rename function, lintrunner
adrianlizarraga May 1, 2024
47d48f6
Add rulesets for Q and DQ removal
sspintel May 1, 2024
0e71fb4
Handle cases for unsupported QDQ targets
sspintel May 2, 2024
96cccfb
Detect and skip duplicated DQs to dst graph
sspintel May 3, 2024
e059ff3
Add QDQ stripping to separate files
sspintel May 3, 2024
49a2b60
Fix resource access bug in duplicate DQ removal
sspintel May 3, 2024
0cd32c4
Add extended rule sets for each Q and DQ in a NodeUnit
sspintel May 5, 2024
2788b20
Remove unreachable code + NPU can take FLOAT for unsupported initiali…
sspintel May 6, 2024
6b16007
Implement a better way to dump stripped models from OVEP
sspintel May 6, 2024
257b041
Fix rulesets
sspintel May 7, 2024
f3c3bbe
Add OV session option for PTQ model
preetha-intel May 7, 2024
9d78b6c
Enable qdq stripping only for PTQ models
preetha-intel May 7, 2024
f378f8e
Enable is_ptq for python APIs
sspintel May 7, 2024
e3060ac
Fix to ignore unused initializers from dst graph
sspintel May 8, 2024
b46adee
Revert the logic and always keep initializers for nodes that are adde…
sspintel May 8, 2024
4970fff
Rename flag to enable qdq optimizer; Fix bug in dst graph inputs orde…
sspintel May 9, 2024
cc3dd38
Make enable_qdq_optimizer change in contexts.h
sspintel May 9, 2024
09ba129
Enable Q ruleset for standalone Qs & Handle standalone duplicate DQs
sspintel May 10, 2024
e5344c2
Add check for QDQ model; Address PR review comments
sspintel May 13, 2024
19a6af4
Dump graph name is unknown when input model is serialized
sspintel May 13, 2024
e833cfe
Fix case of a StandAlone DQ feeding to a supported Op
sspintel May 13, 2024
351f74b
Verbose logging of qdq optimizer status and duration
sspintel May 13, 2024
e156246
Fix logging of qdq optimizer status
sspintel May 13, 2024
4b9974b
Add standalone duplicate DQ DT check
sspintel May 13, 2024
c8c55cb
Fix for Linux build
sspintel May 14, 2024
7b4acfa
Fix case when Qs have const init inputs
sspintel May 14, 2024
96fc477
Fix review comments
sspintel May 16, 2024
1e920b2
Fix for Pad op with no dimensions
sspintel May 17, 2024
22ae1a7
Formatting fix
sspintel May 17, 2024
980e0bd
Coverity Issues Fixed
sfatimar Apr 25, 2024
d62aaf2
fix coverity issues
saurabhkale17 May 13, 2024
bf99ed2
Rewrite Q ruleset for Conv and MatMul
sspintel May 20, 2024
ed61165
Fix for node return type in debug mode
sspintel May 20, 2024
2e9bb81
Exception for dynamic shape models with qdq stripping
sspintel May 21, 2024
2575b58
Revert "Rewrite Q ruleset for Conv and MatMul"
sspintel May 21, 2024
4d3f82a
Fix lint issues
sspintel May 22, 2024
f76fca4
Fix cpplint issues
sspintel May 24, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 85 additions & 17 deletions onnxruntime/core/providers/openvino/backend_manager.cc
Original file line number Diff line number Diff line change
@@ -1,15 +1,21 @@
// Copyright (C) Intel Corporation
// Licensed under the MIT License

#include <array>
#include <algorithm>
#include <cassert>
#include <fstream>
#include <sstream>
#include <unordered_map>
#include <unordered_set>
#include <utility>

#include "core/providers/shared_library/provider_api.h"
#include "core/providers/openvino/contexts.h"
#include "core/providers/openvino/backend_manager.h"
#include "core/providers/openvino/ibackend.h"
#include "core/providers/openvino/backend_utils.h"
#include "core/providers/openvino/qdq_transformations/qdq_stripping.h"

namespace onnxruntime {
namespace openvino_ep {
Expand All @@ -33,8 +39,6 @@ BackendManager::BackendManager(const GlobalContext& global_context,
ORT_THROW("Import blob from model failed");
}

auto prec_str = GetGlobalContext().precision_str;

// Save the indexes of graph inputs among fused_node's inputDefs
// (which also contains initializers).
auto node_input_defs = fused_node.InputDefs();
Expand All @@ -44,7 +48,7 @@ BackendManager::BackendManager(const GlobalContext& global_context,
i++;
}

auto graph_inputs = subgraph.GetInputs();
const std::vector<const NodeArg*>& graph_inputs = subgraph.GetInputs();
for (auto input : graph_inputs) {
auto it = subgraph_context_.input_names.find(input->Name());
if (it == subgraph_context_.input_names.end()) {
Expand All @@ -67,6 +71,9 @@ BackendManager::BackendManager(const GlobalContext& global_context,
if (ModelHasSymbolicInputDims(subgraph)) {
subgraph_context_.has_dynamic_input_shape = true;
LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Model has symbolic input dims";
ORT_ENFORCE(!global_context_.enable_qdq_optimizer,
"QDQ stripping should not be enabled for models with dynamic input shapes. "
"Set enable_qdq_optimizer to False");
if (GetGlobalContext().device_type.find("CPU") != std::string::npos ||
GetGlobalContext().device_type.find("GPU") != std::string::npos) {
if (!GetGlobalContext().disable_dynamic_shapes) {
Expand Down Expand Up @@ -218,27 +225,88 @@ bool BackendManager::ModelHasSymbolicInputDims(const onnxruntime::GraphViewer& s
return has_sym_dims;
}

// Returns true if the graph contains at least one QuantizeLinear or
// DequantizeLinear node, i.e. it is a QDQ-quantized graph.
static bool IsQDQGraph(const onnxruntime::GraphViewer& graph_viewer) {
  // Built once: the op-type set never changes between calls.
  static const std::unordered_set<std::string> qdq_ops = {"QuantizeLinear", "DequantizeLinear"};

  for (const auto node_index : graph_viewer.GetNodesInTopologicalOrder()) {
    gsl::not_null<const onnxruntime::Node*> node(graph_viewer.GetNode(node_index));
    if (qdq_ops.count(node->OpType()) != 0) {
      return true;
    }
  }
  return false;
}

// Dumps the model proto handed to OpenVINO to disk as
// "<model_basename><fused_node_suffix>.onnx" in the current working directory.
// No-op unless OVEP debug dumping (ORT_OPENVINO_ENABLE_DEBUG) is enabled.
static void DumpOpenVINOEPModel(std::string onnx_model_path_name,
                                ONNX_NAMESPACE::ModelProto* model_proto,
                                const onnxruntime::Node& fused_node) {
  if (openvino_ep::backend_utils::IsDebugEnabled()) {
    auto model_name = onnx_model_path_name.empty() ? "unknown.onnx" : std::move(onnx_model_path_name);
#ifdef _WIN32
    size_t slash = model_name.find_last_of("\\");
#else
    size_t slash = model_name.find_last_of("/");
#endif
    // npos + 1 == 0, so a path with no separator is used unchanged.
    model_name = model_name.substr(slash + 1, std::string::npos);
    // Strip the extension; substr(0, npos) is the whole string when no '.' exists.
    size_t dot = model_name.find_last_of(".");
    model_name = model_name.substr(0, dot);

    std::string subgraph_name = fused_node.Name();
    size_t dash = subgraph_name.find_last_of("-");
    // Guard: substr(npos) throws std::out_of_range. If the fused node name has
    // no '-', keep the full name as the suffix instead of crashing.
    if (dash != std::string::npos) {
      subgraph_name = subgraph_name.substr(dash, std::string::npos);
    }

    const std::string name = model_name + subgraph_name + ".onnx";

    std::fstream dump(name, std::ios::out | std::ios::trunc | std::ios::binary);
    model_proto->SerializeToOstream(dump);
  }
}

std::unique_ptr<ONNX_NAMESPACE::ModelProto>
BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node,
const onnxruntime::GraphViewer& subgraph,
const logging::Logger& logger) const {
auto model = subgraph.CreateModel(logger);

auto model_proto = model->ToProto();
model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION);
subgraph.ToProto(*model_proto->mutable_graph(), true, true);

#ifndef NDEBUG
std::chrono::time_point<std::chrono::high_resolution_clock> model_proto_create_start_, model_proto_create_end_;
if (openvino_ep::backend_utils::IsDebugEnabled()) {
const std::string& name = fused_node.Name();
std::fstream dump(name + ".onnx", std::ios::out | std::ios::trunc | std::ios::binary);
model_proto->SerializeToOstream(dump);
model_proto_create_start_ = std::chrono::high_resolution_clock::now();
}
#else
ORT_UNUSED_PARAMETER(fused_node);
#endif

return model_proto;
auto print_model_proto_duration = [&]() {
if (openvino_ep::backend_utils::IsDebugEnabled()) {
model_proto_create_end_ = std::chrono::high_resolution_clock::now();
auto model_proto_create_duration =
std::chrono::duration_cast<std::chrono::milliseconds>(
model_proto_create_end_ - model_proto_create_start_)
.count();
LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Model Proto creation took: " << model_proto_create_duration << " ms.";
}
};

// QDQ stripping enabled only for the NPU
if (global_context_.device_type.find("NPU") != std::string::npos &&
global_context_.enable_qdq_optimizer &&
IsQDQGraph(subgraph)) {
LOGS_DEFAULT(INFO) << "[OpenVINO-EP] QDQ optimization pass status: 1";
std::unique_ptr<onnxruntime::Model> model;
Status status = CreateModelWithStrippedQDQNodes(subgraph, logger, model);
auto model_proto = model->ToProto();
model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION);
print_model_proto_duration();
DumpOpenVINOEPModel(global_context_.onnx_model_path_name, model_proto.get(), fused_node);
ORT_ENFORCE(status.IsOK(), status.ErrorMessage());
return model_proto;
} else {
LOGS_DEFAULT(INFO) << "[OpenVINO-EP] QDQ optimization pass status: 0";
auto model = subgraph.CreateModel(logger);
auto model_proto = model->ToProto();
model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION);
subgraph.ToProto(*model_proto->mutable_graph(), true, true);
print_model_proto_duration();
DumpOpenVINOEPModel(global_context_.onnx_model_path_name, model_proto.get(), fused_node);
return model_proto;
}
}

std::vector<std::vector<int64_t>> GetInputTensorShapes(const Ort::KernelContext& context) {
Expand Down
1 change: 1 addition & 0 deletions onnxruntime/core/providers/openvino/backend_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ class BackendManager {
const onnxruntime::Node& fused_node,
const onnxruntime::GraphViewer& subgraph,
const logging::Logger& logger) const;

bool ModelHasSymbolicInputDims(const onnxruntime::GraphViewer& subgraph) const;
bool ModelHasBatchedInputs(const ONNX_NAMESPACE::ModelProto& model_proto) const;

Expand Down
5 changes: 2 additions & 3 deletions onnxruntime/core/providers/openvino/backend_utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include <algorithm>
#include <sstream>
#include <fstream>
#include <utility>

#include "openvino/pass/convert_fp32_to_fp16.hpp"
#include "openvino/pass/constant_folding.hpp"
Expand All @@ -17,15 +18,13 @@ namespace onnxruntime {
namespace openvino_ep {
namespace backend_utils {

#ifndef NDEBUG
bool IsDebugEnabled() {
const std::string env_name = onnxruntime::GetEnvironmentVar("ORT_OPENVINO_ENABLE_DEBUG");
if (!env_name.empty()) {
return true;
}
return false;
}
#endif

bool IsCILogEnabled() {
const std::string env_name = onnxruntime::GetEnvironmentVar("ORT_OPENVINO_ENABLE_CI_LOG");
Expand Down Expand Up @@ -265,7 +264,7 @@ void printPerformanceCounts(const std::vector<OVProfilingInfo>& performanceMap,

void printPerformanceCounts(OVInferRequestPtr request, std::ostream& stream, std::string deviceName) {
auto performanceMap = request->GetNewObj().get_profiling_info();
printPerformanceCounts(performanceMap, stream, deviceName);
printPerformanceCounts(performanceMap, stream, std::move(deviceName));
}

} // namespace backend_utils
Expand Down
2 changes: 0 additions & 2 deletions onnxruntime/core/providers/openvino/backend_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,7 @@ namespace openvino_ep {
namespace backend_utils {
const std::string log_tag = "[OpenVINO-EP] ";

#ifndef NDEBUG
bool IsDebugEnabled();
#endif

// Internal diagnostic function.
bool IsCILogEnabled();
Expand Down
42 changes: 26 additions & 16 deletions onnxruntime/core/providers/openvino/backends/basic_backend.cc
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,7 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque
}
// using the input name retrieved from ONNX original to match with the input names returned by OV tensors
if (input_names.find(onnx_input_name) != input_names.end()) {
input_name = onnx_input_name;
input_name = std::move(onnx_input_name);
} else {
ORT_THROW(log_tag +
"Input names mismatch between OpenVINO and ONNX. " + onnx_input_name +
Expand Down Expand Up @@ -285,7 +285,7 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque
} catch (const char* msg) {
ORT_THROW(msg);
}
FillInputBlob(graph_input_blob, batch_slice_idx, input_name, context, subgraph_context_);
FillInputBlob(std::move(graph_input_blob), batch_slice_idx, std::move(input_name), context, subgraph_context_);
}
input_idx++;
}
Expand Down Expand Up @@ -373,7 +373,11 @@ void BasicBackend::StartRemoteAsyncInference(Ort::KernelContext& context, OVInfe
}

size_t batch_size = 1;
auto tensor = GetOutputTensor(context, batch_size, infer_request, output_name, subgraph_context_.output_names);
Ort::UnownedValue tensor = GetOutputTensor(context,
batch_size,
infer_request,
output_name,
subgraph_context_.output_names);
auto mem_info = tensor.GetTensorMemoryInfo();
// Check if ORT Value wraps a device pointer
if (mem_info.GetAllocatorName() == OpenVINO_GPU) {
Expand Down Expand Up @@ -440,27 +444,30 @@ void BasicBackend::CompleteAsyncInference(Ort::KernelContext& context, OVInferRe
ORT_THROW(msg);
}
size_t batch_size = 1;
auto output_tensor =
GetOutputTensor(context, batch_size, infer_request, output_name, subgraph_context_.output_names);
Ort::UnownedValue output_tensor =
GetOutputTensor(context, batch_size, infer_request, std::move(output_name), subgraph_context_.output_names);
auto mem_info = output_tensor.GetTensorMemoryInfo();
if (mem_info.GetAllocatorName() == OpenVINO_GPU) {
return;
} else {
size_t batch_slice = 0;
FillOutputBlob(graph_output_blob, output_tensor, batch_slice);
FillOutputBlob(std::move(graph_output_blob), output_tensor, batch_slice);
}
}

if (!const_outputs_map_.empty()) {
for (auto item : const_outputs_map_) {
auto out_name = item.first;
for (const auto& item : const_outputs_map_) {
const auto& out_name = item.first;
auto node = item.second;
auto output_tensor = GetOutputTensor(context, out_name, subgraph_context_.output_names, node);
Ort::UnownedValue output_tensor = GetOutputTensor(context,
std::move(out_name),
subgraph_context_.output_names,
node);
auto mem_info = output_tensor.GetTensorMemoryInfo();
if (mem_info.GetAllocatorName() == OpenVINO_GPU) {
ORT_THROW(log_tag + "IO Buffering is not supported for constant subgraphs");
} else {
FillOutputsWithConstantData(node, output_tensor);
FillOutputsWithConstantData(std::move(node), output_tensor);
}
}
}
Expand All @@ -478,12 +485,15 @@ void BasicBackend::Infer(OrtKernelContext* ctx) {
LOGS_DEFAULT(INFO) << log_tag << "In Infer";

if (subgraph_context_.is_constant) {
for (auto item : const_outputs_map_) {
auto out_name = item.first;
auto node = item.second;
for (const auto& item : const_outputs_map_) {
std::string out_name = item.first;
std::shared_ptr<ov::Node> node = item.second;
try {
auto output_tensor = GetOutputTensor(context, out_name, subgraph_context_.output_names, node);
FillOutputsWithConstantData(node, output_tensor);
Ort::UnownedValue output_tensor = GetOutputTensor(context,
std::move(out_name),
subgraph_context_.output_names,
node);
FillOutputsWithConstantData(std::move(node), output_tensor);
} catch (std::string const& msg) {
ORT_THROW(msg);
}
Expand Down Expand Up @@ -536,7 +546,7 @@ void BasicBackend::Infer(OrtKernelContext* ctx) {
}

// Once the inference is completed, the infer_request becomes free and is placed back into pool of infer_requests_
inferRequestsQueue_->putIdleRequest(infer_request);
inferRequestsQueue_->putIdleRequest(std::move(infer_request));
#ifndef NDEBUG
#ifndef IO_BUFFER_ENABLED // Printing performance counts is disabled when IO_BUFFER_ENABLED
if (openvino_ep::backend_utils::IsDebugEnabled()) {
Expand Down
1 change: 1 addition & 0 deletions onnxruntime/core/providers/openvino/contexts.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ struct GlobalContext {
bool disable_dynamic_shapes = false;
bool ep_context_embed_mode = true;
bool export_ep_ctx_blob = false;
bool enable_qdq_optimizer = false;
size_t num_of_threads;
std::string device_type;
std::string precision_str;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// Copyright (C) Intel Corporation
// Licensed under the MIT License
#include <filesystem>
#include <utility>

#include "core/providers/shared_library/provider_api.h"
#include "core/providers/openvino/openvino_execution_provider.h"
Expand Down Expand Up @@ -31,20 +32,21 @@ OpenVINOExecutionProvider::OpenVINOExecutionProvider(const OpenVINOExecutionProv
global_context_->num_of_threads = info.num_of_threads_;
global_context_->OpenVINO_Version = {OPENVINO_VERSION_MAJOR, OPENVINO_VERSION_MINOR};
global_context_->export_ep_ctx_blob = info.export_ep_ctx_blob_;
global_context_->enable_qdq_optimizer = info.enable_qdq_optimizer_;

// to check if target device is available
// using ie_core capability GetAvailableDevices to fetch list of devices plugged in
if (info.cache_dir_.empty()) {
bool device_found = false;
auto available_devices = global_context_->ie_core.GetAvailableDevices();
std::vector<std::string> available_devices = global_context_->ie_core.GetAvailableDevices();
// Checking for device_type configuration
if (info.device_type_ != "") {
if (info.device_type_.find("HETERO") != std::string::npos ||
info.device_type_.find("MULTI") != std::string::npos ||
info.device_type_.find("AUTO") != std::string::npos) {
device_found = true;
} else {
for (auto device : available_devices) {
for (std::string device : available_devices) {
if (device.rfind(info.device_type_, 0) == 0) {
if (info.device_type_.find("GPU") != std::string::npos && (info.precision_ == "FP32" ||
info.precision_ == "FP16" ||
Expand Down Expand Up @@ -79,7 +81,7 @@ OpenVINOExecutionProvider::GetCapability(const GraphViewer& graph_viewer,
std::to_string(global_context_->OpenVINO_Version.at(1));

// Check for valid ctx node and maintain state for validity
if (ep_ctx_handle_.CheckForOVEPCtxNode(graph_viewer, openvino_sdk_version))
if (ep_ctx_handle_.CheckForOVEPCtxNode(graph_viewer, std::move(openvino_sdk_version)))
ORT_ENFORCE(graph_viewer.NumberOfNodes() == 1,
"[Invalid Graph] EPContext Model with OpenVINO compiled blob should not have more than one node.");

Expand Down Expand Up @@ -118,7 +120,8 @@ OpenVINOExecutionProvider::GetCapability(const GraphViewer& graph_viewer,
}(graph_viewer);

openvino_ep::GetCapability obj(graph_viewer,
global_context_->device_type);
global_context_->device_type,
global_context_->enable_qdq_optimizer);
result = obj.Execute();

global_context_->is_wholly_supported_graph = obj.IsWhollySupportedGraph();
Expand All @@ -129,7 +132,7 @@ OpenVINOExecutionProvider::GetCapability(const GraphViewer& graph_viewer,
common::Status OpenVINOExecutionProvider::Compile(
const std::vector<FusedNodeAndGraph>& fused_nodes,
std::vector<NodeComputeInfo>& node_compute_funcs) {
for (const auto& fused_node_graph : fused_nodes) {
for (const FusedNodeAndGraph& fused_node_graph : fused_nodes) {
const GraphViewer& graph_body_viewer = fused_node_graph.filtered_graph;
const Node& fused_node = fused_node_graph.fused_node;

Expand Down
Loading