Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
bfd35c9
Draft code to remove Q/DQ ops from node units in OpenVINO EP
adrianlizarraga May 1, 2024
787e512
remove unnecessary code
adrianlizarraga May 1, 2024
8590e6c
Rename function, lintrunner
adrianlizarraga May 1, 2024
47d48f6
Add rulesets for Q and DQ removal
sspintel May 1, 2024
0e71fb4
Handle cases for unsupported QDQ targets
sspintel May 2, 2024
96cccfb
Detect and skip duplicated DQs to dst graph
sspintel May 3, 2024
e059ff3
Add QDQ stripping to separate files
sspintel May 3, 2024
49a2b60
Fix resource access bug in duplicate DQ removal
sspintel May 3, 2024
0cd32c4
Add extended rule sets for each Q and DQ in a NodeUnit
sspintel May 5, 2024
2788b20
Remove unreachable code + NPU can take FLOAT for unsupported initiali…
sspintel May 6, 2024
6b16007
Implement a better way to dump stripped models from OVEP
sspintel May 6, 2024
257b041
Fix rulesets
sspintel May 7, 2024
f3c3bbe
Add OV session option for PTQ model
preetha-intel May 7, 2024
9d78b6c
Enable qdq stripping only for PTQ models
preetha-intel May 7, 2024
f378f8e
Enable is_ptq for python APIs
sspintel May 7, 2024
e3060ac
Fix to ignore unused initializers from dst graph
sspintel May 8, 2024
b46adee
Revert the logic and always keep initializers for nodes that are adde…
sspintel May 8, 2024
4970fff
Rename flag to enable qdq optimizer; Fix bug in dst graph inputs orde…
sspintel May 9, 2024
cc3dd38
Make enable_qdq_optimizer change in contexts.h
sspintel May 9, 2024
09ba129
Enable Q ruleset for standalone Qs & Handle standalone duplicate DQs
sspintel May 10, 2024
e5344c2
Add check for QDQ model; Address PR review comments
sspintel May 13, 2024
19a6af4
Dump graph name is unknown when input model is serialized
sspintel May 13, 2024
e833cfe
Fix case of a StandAlone DQ feeding to a supported Op
sspintel May 13, 2024
351f74b
Verbose logging of qdq optimizer status and duration
sspintel May 13, 2024
e156246
Fix logging of qdq optimizer status
sspintel May 13, 2024
4b9974b
Add standalone duplicate DQ DT check
sspintel May 13, 2024
c8c55cb
Fix for Linux build
sspintel May 14, 2024
7b4acfa
Fix case when Qs have const init inputs
sspintel May 14, 2024
96fc477
Fix review comments
sspintel May 16, 2024
1e920b2
Fix for Pad op with no dimensions
sspintel May 17, 2024
22ae1a7
Formatting fix
sspintel May 17, 2024
980e0bd
Coverity Issues Fixed
sfatimar Apr 25, 2024
d62aaf2
fix coverity issues
saurabhkale17 May 13, 2024
bf99ed2
Rewrite Q ruleset for Conv and MatMul
sspintel May 20, 2024
ed61165
Fix for node return type in debug mode
sspintel May 20, 2024
2e9bb81
Exception for dynamic shape models with qdq stripping
sspintel May 21, 2024
2575b58
Revert "Rewrite Q ruleset for Conv and MatMul"
sspintel May 21, 2024
4d3f82a
Fix lint issues
sspintel May 22, 2024
f76fca4
Fix cpplint issues
sspintel May 24, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 85 additions & 17 deletions onnxruntime/core/providers/openvino/backend_manager.cc
Original file line number Diff line number Diff line change
@@ -1,15 +1,21 @@
// Copyright (C) Intel Corporation
// Licensed under the MIT License

#include <array>
#include <algorithm>
#include <cassert>
#include <fstream>
#include <sstream>
#include <unordered_map>
#include <unordered_set>
#include <utility>

#include "core/providers/shared_library/provider_api.h"
#include "core/providers/openvino/contexts.h"
#include "core/providers/openvino/backend_manager.h"
#include "core/providers/openvino/ibackend.h"
#include "core/providers/openvino/backend_utils.h"
#include "core/providers/openvino/qdq_transformations/qdq_stripping.h"

namespace onnxruntime {
namespace openvino_ep {
Expand All @@ -33,8 +39,6 @@ BackendManager::BackendManager(const GlobalContext& global_context,
ORT_THROW("Import blob from model failed");
}

auto prec_str = GetGlobalContext().precision_str;

// Save the indexes of graph inputs among fused_node's inputDefs
// (which also contains initializers).
auto node_input_defs = fused_node.InputDefs();
Expand All @@ -44,7 +48,7 @@ BackendManager::BackendManager(const GlobalContext& global_context,
i++;
}

auto graph_inputs = subgraph.GetInputs();
const std::vector<const NodeArg*>& graph_inputs = subgraph.GetInputs();
for (auto input : graph_inputs) {
auto it = subgraph_context_.input_names.find(input->Name());
if (it == subgraph_context_.input_names.end()) {
Expand All @@ -67,6 +71,9 @@ BackendManager::BackendManager(const GlobalContext& global_context,
if (ModelHasSymbolicInputDims(subgraph)) {
subgraph_context_.has_dynamic_input_shape = true;
LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Model has symbolic input dims";
ORT_ENFORCE(!global_context_.enable_qdq_optimizer,
"QDQ stripping should not be enabled for models with dynamic input shapes. "
"Set enable_qdq_optimizer to False");
if (GetGlobalContext().device_type.find("CPU") != std::string::npos ||
GetGlobalContext().device_type.find("GPU") != std::string::npos) {
if (!GetGlobalContext().disable_dynamic_shapes) {
Expand Down Expand Up @@ -218,27 +225,88 @@ bool BackendManager::ModelHasSymbolicInputDims(const onnxruntime::GraphViewer& s
return has_sym_dims;
}

// Returns true if the graph contains at least one QuantizeLinear or
// DequantizeLinear node, i.e. it is a QDQ-quantized graph.
static bool IsQDQGraph(const onnxruntime::GraphViewer& graph_viewer) {
  // Built once: the op-type set never changes between calls.
  static const std::unordered_set<std::string> qdq_ops = {"QuantizeLinear", "DequantizeLinear"};

  for (const auto node_index : graph_viewer.GetNodesInTopologicalOrder()) {
    gsl::not_null<const onnxruntime::Node*> node(graph_viewer.GetNode(node_index));
    if (qdq_ops.count(node->OpType()) != 0) {
      return true;
    }
  }
  return false;
}

// Dumps the model proto handed to OpenVINO to disk as
// "<model_basename><fused_node_suffix>.onnx" in the current working directory.
// No-op unless OVEP debug dumping (ORT_OPENVINO_ENABLE_DEBUG) is enabled.
static void DumpOpenVINOEPModel(std::string onnx_model_path_name,
                                ONNX_NAMESPACE::ModelProto* model_proto,
                                const onnxruntime::Node& fused_node) {
  if (openvino_ep::backend_utils::IsDebugEnabled()) {
    auto model_name = onnx_model_path_name.empty() ? "unknown.onnx" : std::move(onnx_model_path_name);
#ifdef _WIN32
    size_t slash = model_name.find_last_of("\\");
#else
    size_t slash = model_name.find_last_of("/");
#endif
    // npos + 1 == 0, so a path with no separator is used unchanged.
    model_name = model_name.substr(slash + 1, std::string::npos);
    // Strip the extension; substr(0, npos) is the whole string when no '.' exists.
    size_t dot = model_name.find_last_of(".");
    model_name = model_name.substr(0, dot);

    std::string subgraph_name = fused_node.Name();
    size_t dash = subgraph_name.find_last_of("-");
    // Guard: substr(npos) throws std::out_of_range. If the fused node name has
    // no '-', keep the full name as the suffix instead of crashing.
    if (dash != std::string::npos) {
      subgraph_name = subgraph_name.substr(dash, std::string::npos);
    }

    const std::string name = model_name + subgraph_name + ".onnx";

    std::fstream dump(name, std::ios::out | std::ios::trunc | std::ios::binary);
    model_proto->SerializeToOstream(dump);
  }
}

std::unique_ptr<ONNX_NAMESPACE::ModelProto>
BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node,
const onnxruntime::GraphViewer& subgraph,
const logging::Logger& logger) const {
auto model = subgraph.CreateModel(logger);

auto model_proto = model->ToProto();
model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION);
subgraph.ToProto(*model_proto->mutable_graph(), true, true);

#ifndef NDEBUG
std::chrono::time_point<std::chrono::high_resolution_clock> model_proto_create_start_, model_proto_create_end_;
if (openvino_ep::backend_utils::IsDebugEnabled()) {
const std::string& name = fused_node.Name();
std::fstream dump(name + ".onnx", std::ios::out | std::ios::trunc | std::ios::binary);
model_proto->SerializeToOstream(dump);
model_proto_create_start_ = std::chrono::high_resolution_clock::now();
}
#else
ORT_UNUSED_PARAMETER(fused_node);
#endif

return model_proto;
auto print_model_proto_duration = [&]() {
if (openvino_ep::backend_utils::IsDebugEnabled()) {
model_proto_create_end_ = std::chrono::high_resolution_clock::now();
auto model_proto_create_duration =
std::chrono::duration_cast<std::chrono::milliseconds>(
model_proto_create_end_ - model_proto_create_start_)
.count();
LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Model Proto creation took: " << model_proto_create_duration << " ms.";
}
};

// QDQ stripping enabled only for the NPU
if (global_context_.device_type.find("NPU") != std::string::npos &&
global_context_.enable_qdq_optimizer &&
IsQDQGraph(subgraph)) {
LOGS_DEFAULT(INFO) << "[OpenVINO-EP] QDQ optimization pass status: 1";
std::unique_ptr<onnxruntime::Model> model;
Status status = CreateModelWithStrippedQDQNodes(subgraph, logger, model);
auto model_proto = model->ToProto();
model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION);
print_model_proto_duration();
DumpOpenVINOEPModel(global_context_.onnx_model_path_name, model_proto.get(), fused_node);
ORT_ENFORCE(status.IsOK(), status.ErrorMessage());
return model_proto;
} else {
LOGS_DEFAULT(INFO) << "[OpenVINO-EP] QDQ optimization pass status: 0";
auto model = subgraph.CreateModel(logger);
auto model_proto = model->ToProto();
model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION);
subgraph.ToProto(*model_proto->mutable_graph(), true, true);
print_model_proto_duration();
DumpOpenVINOEPModel(global_context_.onnx_model_path_name, model_proto.get(), fused_node);
return model_proto;
}
}

std::vector<std::vector<int64_t>> GetInputTensorShapes(const Ort::KernelContext& context) {
Expand Down
1 change: 1 addition & 0 deletions onnxruntime/core/providers/openvino/backend_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ class BackendManager {
const onnxruntime::Node& fused_node,
const onnxruntime::GraphViewer& subgraph,
const logging::Logger& logger) const;

bool ModelHasSymbolicInputDims(const onnxruntime::GraphViewer& subgraph) const;
bool ModelHasBatchedInputs(const ONNX_NAMESPACE::ModelProto& model_proto) const;

Expand Down
5 changes: 2 additions & 3 deletions onnxruntime/core/providers/openvino/backend_utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include <algorithm>
#include <sstream>
#include <fstream>
#include <utility>

#include "openvino/pass/convert_fp32_to_fp16.hpp"
#include "openvino/pass/constant_folding.hpp"
Expand All @@ -17,15 +18,13 @@ namespace onnxruntime {
namespace openvino_ep {
namespace backend_utils {

#ifndef NDEBUG
bool IsDebugEnabled() {
const std::string env_name = onnxruntime::GetEnvironmentVar("ORT_OPENVINO_ENABLE_DEBUG");
if (!env_name.empty()) {
return true;
}
return false;
}
#endif

bool IsCILogEnabled() {
const std::string env_name = onnxruntime::GetEnvironmentVar("ORT_OPENVINO_ENABLE_CI_LOG");
Expand Down Expand Up @@ -265,7 +264,7 @@ void printPerformanceCounts(const std::vector<OVProfilingInfo>& performanceMap,

void printPerformanceCounts(OVInferRequestPtr request, std::ostream& stream, std::string deviceName) {
auto performanceMap = request->GetNewObj().get_profiling_info();
printPerformanceCounts(performanceMap, stream, deviceName);
printPerformanceCounts(performanceMap, stream, std::move(deviceName));
}

} // namespace backend_utils
Expand Down
2 changes: 0 additions & 2 deletions onnxruntime/core/providers/openvino/backend_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,7 @@ namespace openvino_ep {
namespace backend_utils {
const std::string log_tag = "[OpenVINO-EP] ";

#ifndef NDEBUG
bool IsDebugEnabled();
#endif

// Internal diagnostic function.
bool IsCILogEnabled();
Expand Down
42 changes: 26 additions & 16 deletions onnxruntime/core/providers/openvino/backends/basic_backend.cc
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,7 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque
}
// using the input name retrieved from ONNX original to match with the input names returned by OV tensors
if (input_names.find(onnx_input_name) != input_names.end()) {
input_name = onnx_input_name;
input_name = std::move(onnx_input_name);
} else {
ORT_THROW(log_tag +
"Input names mismatch between OpenVINO and ONNX. " + onnx_input_name +
Expand Down Expand Up @@ -285,7 +285,7 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque
} catch (const char* msg) {
ORT_THROW(msg);
}
FillInputBlob(graph_input_blob, batch_slice_idx, input_name, context, subgraph_context_);
FillInputBlob(std::move(graph_input_blob), batch_slice_idx, std::move(input_name), context, subgraph_context_);
}
input_idx++;
}
Expand Down Expand Up @@ -373,7 +373,11 @@ void BasicBackend::StartRemoteAsyncInference(Ort::KernelContext& context, OVInfe
}

size_t batch_size = 1;
auto tensor = GetOutputTensor(context, batch_size, infer_request, output_name, subgraph_context_.output_names);
Ort::UnownedValue tensor = GetOutputTensor(context,
batch_size,
infer_request,
output_name,
subgraph_context_.output_names);
auto mem_info = tensor.GetTensorMemoryInfo();
// Check if ORT Value wraps a device pointer
if (mem_info.GetAllocatorName() == OpenVINO_GPU) {
Expand Down Expand Up @@ -440,27 +444,30 @@ void BasicBackend::CompleteAsyncInference(Ort::KernelContext& context, OVInferRe
ORT_THROW(msg);
}
size_t batch_size = 1;
auto output_tensor =
GetOutputTensor(context, batch_size, infer_request, output_name, subgraph_context_.output_names);
Ort::UnownedValue output_tensor =
GetOutputTensor(context, batch_size, infer_request, std::move(output_name), subgraph_context_.output_names);
auto mem_info = output_tensor.GetTensorMemoryInfo();
if (mem_info.GetAllocatorName() == OpenVINO_GPU) {
return;
} else {
size_t batch_slice = 0;
FillOutputBlob(graph_output_blob, output_tensor, batch_slice);
FillOutputBlob(std::move(graph_output_blob), output_tensor, batch_slice);
}
}

if (!const_outputs_map_.empty()) {
for (auto item : const_outputs_map_) {
auto out_name = item.first;
for (const auto& item : const_outputs_map_) {
const auto& out_name = item.first;
auto node = item.second;
auto output_tensor = GetOutputTensor(context, out_name, subgraph_context_.output_names, node);
Ort::UnownedValue output_tensor = GetOutputTensor(context,
std::move(out_name),
subgraph_context_.output_names,
node);
auto mem_info = output_tensor.GetTensorMemoryInfo();
if (mem_info.GetAllocatorName() == OpenVINO_GPU) {
ORT_THROW(log_tag + "IO Buffering is not supported for constant subgraphs");
} else {
FillOutputsWithConstantData(node, output_tensor);
FillOutputsWithConstantData(std::move(node), output_tensor);
}
}
}
Expand All @@ -478,12 +485,15 @@ void BasicBackend::Infer(OrtKernelContext* ctx) {
LOGS_DEFAULT(INFO) << log_tag << "In Infer";

if (subgraph_context_.is_constant) {
for (auto item : const_outputs_map_) {
auto out_name = item.first;
auto node = item.second;
for (const auto& item : const_outputs_map_) {
std::string out_name = item.first;
std::shared_ptr<ov::Node> node = item.second;
try {
auto output_tensor = GetOutputTensor(context, out_name, subgraph_context_.output_names, node);
FillOutputsWithConstantData(node, output_tensor);
Ort::UnownedValue output_tensor = GetOutputTensor(context,
std::move(out_name),
subgraph_context_.output_names,
node);
FillOutputsWithConstantData(std::move(node), output_tensor);
} catch (std::string const& msg) {
ORT_THROW(msg);
}
Expand Down Expand Up @@ -536,7 +546,7 @@ void BasicBackend::Infer(OrtKernelContext* ctx) {
}

// Once the inference is completed, the infer_request becomes free and is placed back into pool of infer_requests_
inferRequestsQueue_->putIdleRequest(infer_request);
inferRequestsQueue_->putIdleRequest(std::move(infer_request));
#ifndef NDEBUG
#ifndef IO_BUFFER_ENABLED // Printing performance counts is disabled when IO_BUFFER_ENABLED
if (openvino_ep::backend_utils::IsDebugEnabled()) {
Expand Down
1 change: 1 addition & 0 deletions onnxruntime/core/providers/openvino/contexts.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ struct GlobalContext {
bool disable_dynamic_shapes = false;
bool ep_context_embed_mode = true;
bool export_ep_ctx_blob = false;
bool enable_qdq_optimizer = false;
size_t num_of_threads;
std::string device_type;
std::string precision_str;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// Copyright (C) Intel Corporation
// Licensed under the MIT License
#include <filesystem>
#include <utility>

#include "core/providers/shared_library/provider_api.h"
#include "core/providers/openvino/openvino_execution_provider.h"
Expand Down Expand Up @@ -31,20 +32,21 @@ OpenVINOExecutionProvider::OpenVINOExecutionProvider(const OpenVINOExecutionProv
global_context_->num_of_threads = info.num_of_threads_;
global_context_->OpenVINO_Version = {OPENVINO_VERSION_MAJOR, OPENVINO_VERSION_MINOR};
global_context_->export_ep_ctx_blob = info.export_ep_ctx_blob_;
global_context_->enable_qdq_optimizer = info.enable_qdq_optimizer_;

// to check if target device is available
// using ie_core capability GetAvailableDevices to fetch list of devices plugged in
if (info.cache_dir_.empty()) {
bool device_found = false;
auto available_devices = global_context_->ie_core.GetAvailableDevices();
std::vector<std::string> available_devices = global_context_->ie_core.GetAvailableDevices();
// Checking for device_type configuration
if (info.device_type_ != "") {
if (info.device_type_.find("HETERO") != std::string::npos ||
info.device_type_.find("MULTI") != std::string::npos ||
info.device_type_.find("AUTO") != std::string::npos) {
device_found = true;
} else {
for (auto device : available_devices) {
for (std::string device : available_devices) {
if (device.rfind(info.device_type_, 0) == 0) {
if (info.device_type_.find("GPU") != std::string::npos && (info.precision_ == "FP32" ||
info.precision_ == "FP16" ||
Expand Down Expand Up @@ -79,7 +81,7 @@ OpenVINOExecutionProvider::GetCapability(const GraphViewer& graph_viewer,
std::to_string(global_context_->OpenVINO_Version.at(1));

// Check for valid ctx node and maintain state for validity
if (ep_ctx_handle_.CheckForOVEPCtxNode(graph_viewer, openvino_sdk_version))
if (ep_ctx_handle_.CheckForOVEPCtxNode(graph_viewer, std::move(openvino_sdk_version)))
ORT_ENFORCE(graph_viewer.NumberOfNodes() == 1,
"[Invalid Graph] EPContext Model with OpenVINO compiled blob should not have more than one node.");

Expand Down Expand Up @@ -118,7 +120,8 @@ OpenVINOExecutionProvider::GetCapability(const GraphViewer& graph_viewer,
}(graph_viewer);

openvino_ep::GetCapability obj(graph_viewer,
global_context_->device_type);
global_context_->device_type,
global_context_->enable_qdq_optimizer);
result = obj.Execute();

global_context_->is_wholly_supported_graph = obj.IsWhollySupportedGraph();
Expand All @@ -129,7 +132,7 @@ OpenVINOExecutionProvider::GetCapability(const GraphViewer& graph_viewer,
common::Status OpenVINOExecutionProvider::Compile(
const std::vector<FusedNodeAndGraph>& fused_nodes,
std::vector<NodeComputeInfo>& node_compute_funcs) {
for (const auto& fused_node_graph : fused_nodes) {
for (const FusedNodeAndGraph& fused_node_graph : fused_nodes) {
const GraphViewer& graph_body_viewer = fused_node_graph.filtered_graph;
const Node& fused_node = fused_node_graph.fused_node;

Expand Down
Loading