microsoft · hariharans29 · Nov 19, 2020 · Nov 18, 2020 · Nov 18, 2020 · Nov 19, 2020
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
@@ -62,6 +62,7 @@ option(onnxruntime_USE_EIGEN_FOR_BLAS "Use eign for blas" ON)
 option(onnxruntime_USE_NNAPI_BUILTIN "Build with builtin NNAPI lib for Android NNAPI support" OFF)
 option(onnxruntime_USE_RKNPU "Build with RKNPU support" OFF)
 option(onnxruntime_USE_DNNL "Build with DNNL support" OFF)
+option(onnxruntime_USE_MKLML "Build the default cpu provider with MKL-ML binary dependency" OFF)
 option(onnxruntime_USE_FEATURIZERS "Build ML Featurizers support" OFF)
 option(onnxruntime_USE_NGRAPH "Build with nGraph support" OFF)
 option(onnxruntime_USE_OPENBLAS "Use openblas" OFF)
@@ -199,7 +200,12 @@ if(onnxruntime_USE_OPENMP)
   if (OPENMP_FOUND)
     set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
     set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
-    include_directories(${OpenMP_CXX_INCLUDE_DIR})    
+    include_directories(${OpenMP_CXX_INCLUDE_DIR})
+    # MKLML and nGraph each depend on their own OpenMP library, which may differ from the compiler's.
+    # Refuse to build MKLML (on Windows/macOS) or nGraph together with OpenMP.
+    if((WIN32 OR APPLE) AND onnxruntime_USE_MKLML)
+      message(FATAL_ERROR "Please use only one of onnxruntime_USE_MKLML, onnxruntime_USE_OPENMP")
+    endif()
     if(onnxruntime_USE_NGRAPH)
       message(FATAL_ERROR "Please use only one of onnxruntime_USE_NGRAPH, onnxruntime_USE_OPENMP")
     endif()
@@ -756,7 +762,8 @@ if (onnxruntime_USE_ARMNN)
   list(APPEND onnxruntime_EXTERNAL_LIBRARIES armnn pthread arm_compute_core arm_compute arm_compute_graph)
 endif()
 
-if (onnxruntime_USE_DNNL)
+# dnnl.cmake also sets up the MKLML download (project_mklml), so include it for either option.
+if (onnxruntime_USE_DNNL OR onnxruntime_USE_MKLML)
   include(dnnl)
 endif()
 
@@ -772,13 +779,29 @@ if (onnxruntime_USE_TVM)
   if (onnxruntime_USE_OPENMP)
     set(USE_OPENMP "gnu")
   endif()
+  if (onnxruntime_USE_MKLML AND NOT onnxruntime_USE_OPENMP)  # with onnxruntime_USE_OPENMP, MKLML is libmklml_gnu: keep "gnu" from above
+    set(USE_OPENMP "intel")
+    # Make TVM use the Intel OpenMP runtime shipped with ORT's MKLML; IOMP5MD_SHARED_LIB is not defined for the libmklml_gnu flavour.
+    if (WIN32)
+      set(OMP_LIBRARY ${MKLML_LIB_DIR}/${IOMP5MD_IMPORT_LIB})
+    else()
+      set(OMP_LIBRARY ${MKLML_LIB_DIR}/${IOMP5MD_SHARED_LIB})
+    endif()
+  endif()
 
   add_subdirectory(${PROJECT_SOURCE_DIR}/external/tvm EXCLUDE_FROM_ALL)
   set_target_properties(tvm PROPERTIES FOLDER "External/tvm")
   set_target_properties(tvm_topi PROPERTIES FOLDER "External/tvm")
   set_target_properties(tvm_runtime PROPERTIES FOLDER "External/tvm")
   set_target_properties(nnvm_compiler PROPERTIES FOLDER "External/tvm")
 
+  if (onnxruntime_USE_MKLML)
+    add_dependencies(tvm project_mklml)
+    add_dependencies(tvm_topi project_mklml)
+    add_dependencies(tvm_runtime project_mklml)
+    add_dependencies(nnvm_compiler project_mklml)
+  endif()
+
   set(TVM_INCLUDES ${PROJECT_SOURCE_DIR}/external/tvm/include
     ${PROJECT_SOURCE_DIR}/external/tvm/3rdparty/dmlc-core/include
     ${PROJECT_SOURCE_DIR}/external/tvm/3rdparty/dlpack/include
@@ -827,6 +850,22 @@ endif()
 set_target_properties(onnx PROPERTIES FOLDER "External/ONNX")
 set_target_properties(onnx_proto PROPERTIES FOLDER "External/ONNX")
 
+if (onnxruntime_USE_MKLML)  # wire the prebuilt MKL-ML binaries into the build
+  add_definitions(-DUSE_MKLML=1 -DUSE_MKLML_FOR_BLAS=1)  # directory-scoped: applies to all targets in this directory
+  if (WIN32 OR APPLE)
+    list(APPEND onnxruntime_EXTERNAL_LIBRARIES mklml)
+  else()
+    if(onnxruntime_USE_OPENMP)  # Linux: flavour mirrors MKLML_SHARED_LIB chosen in cmake/external/dnnl.cmake
+      list(APPEND onnxruntime_EXTERNAL_LIBRARIES mklml_gnu)
+    else()
+      list(APPEND onnxruntime_EXTERNAL_LIBRARIES mklml_intel)
+    endif()
+  endif()
+  list(APPEND onnxruntime_EXTERNAL_DEPENDENCIES project_mklml)  # ExternalProject target that fetches the MKL-ML archive
+  include_directories(${MKLML_INCLUDE_DIR})
+  link_directories(${MKLML_LIB_DIR})  # lets the bare library names appended above resolve at link time
+endif()
+
 if (onnxruntime_USE_NGRAPH)
   if (NOT onnxruntime_USE_FULL_PROTOBUF)
     message(FATAL_ERROR "Please set onnxruntime_USE_FULL_PROTOBUF=ON for nGraph execution provider.")

diff --git a/cmake/external/dnnl.cmake b/cmake/external/dnnl.cmake
@@ -3,23 +3,62 @@ include (ExternalProject)
 set(DNNL_URL https://github.com/oneapi-src/onednn)
 # If DNNL_TAG is updated, check if MKLML_VERSION and platform.cmake.patch need to be updated.
 set(DNNL_TAG v1.7)
+set(MKLML_VERSION 2019.0.5.20190502)
 
 if(WIN32)
   set(MKLML_OS_VERSION_STR "win")
   set(MKLML_FILE_EXTENSION "zip")
   set(DNNL_SHARED_LIB dnnl.dll)
-  set(DNNL_IMPORT_LIB dnnl.lib)  
+  set(DNNL_IMPORT_LIB dnnl.lib)
+  if(onnxruntime_USE_MKLML)
+    # Windows-only updated MKLML binary which contains fix for thread cleanup hang.
+    set(MKLML_VERSION 2020.0.20190813)
+    set(MKLML_SHARED_LIB mklml.dll)
+    set(MKLML_IMPORT_LIB mklml.lib)
+    set(IOMP5MD_SHARED_LIB libiomp5md.dll)
+    set(IOMP5MD_IMPORT_LIB libiomp5md.lib)
+  endif()
 else()
   set(MKLML_FILE_EXTENSION "tgz")
   if (APPLE)
     set(DNNL_SHARED_LIB libdnnl.1.dylib)
-    set(MKLML_OS_VERSION_STR "mac")    
+    set(MKLML_OS_VERSION_STR "mac")
+    if(onnxruntime_USE_MKLML)
+      set(MKLML_SHARED_LIB libmklml.dylib)
+      set(IOMP5MD_SHARED_LIB libiomp5.dylib)
+    endif()
   else()
     set(DNNL_SHARED_LIB libdnnl.so.1)
-    set(MKLML_OS_VERSION_STR "lnx")    
+    set(MKLML_OS_VERSION_STR "lnx")
+    if(onnxruntime_USE_MKLML)  # Linux: pick the MKL-ML flavour that matches the OpenMP runtime in use
+      if(onnxruntime_USE_OPENMP)
+        set(MKLML_SHARED_LIB libmklml_gnu.so)  # NOTE(review): IOMP5MD_SHARED_LIB is not set on this path; consumers must not assume it is defined
+      else()
+        set(MKLML_SHARED_LIB libmklml_intel.so)
+        set(IOMP5MD_SHARED_LIB libiomp5.so)
+      endif()
+    endif()
   endif()  
 endif()
 
+if (onnxruntime_USE_MKLML)  # fetch the prebuilt MKL-ML archive; nothing is compiled here
+  set(MKLDNN_VERSION_SHORT v0.20)  # mkl-dnn release tag used in the download URL below
+  set(MKLML_URL https://github.com/intel/mkl-dnn/releases/download/${MKLDNN_VERSION_SHORT}/mklml_${MKLML_OS_VERSION_STR}_${MKLML_VERSION}.${MKLML_FILE_EXTENSION})
+
+  ExternalProject_Add(project_mklml
+    PREFIX mklml
+    URL ${MKLML_URL}
+    CONFIGURE_COMMAND ""  # all step commands are empty: download and extract only
+    BUILD_COMMAND ""
+    UPDATE_COMMAND ""
+    INSTALL_COMMAND ""  )
+
+  set(MKML_DIR ${CMAKE_CURRENT_BINARY_DIR}/mklml/src/project_mklml)  # <PREFIX>/src/<name>, ExternalProject's default source dir (variable is spelled MKML, not MKLML)
+  set(MKLML_INCLUDE_DIR "${MKML_DIR}/include")
+  set(MKLML_LIB_DIR "${MKML_DIR}/lib")
+  link_directories(${MKLML_LIB_DIR})  # directory-scoped library search path
+endif()
+
 if (onnxruntime_USE_DNNL AND onnxruntime_DNNL_GPU_RUNTIME STREQUAL "ocl" AND onnxruntime_DNNL_OPENCL_ROOT STREQUAL "")
   message(FATAL_ERROR "onnxruntime_DNNL_OPENCL_ROOT required for onnxruntime_DNNL_GPU_RUNTIME")
 elseif(onnxruntime_USE_DNNL AND onnxruntime_DNNL_GPU_RUNTIME STREQUAL "ocl")
@@ -57,4 +96,7 @@ if (onnxruntime_USE_DNNL)
     CMAKE_ARGS -DDNNL_BUILD_TESTS=OFF -DDNNL_BUILD_EXAMPLES=OFF -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=${DNNL_INSTALL} ${DNNL_GPU_CMAKE_ARGS}
   )
   link_directories(${DNNL_LIB_DIR})
+  #if (onnxruntime_USE_MKLML)
+  #  add_dependencies(project_dnnl project_mklml)
+  #endif()
 endif()
diff --git a/cmake/onnxruntime_python.cmake b/cmake/onnxruntime_python.cmake
@@ -379,6 +379,23 @@ if (onnxruntime_USE_TVM)
   )
 endif()
 
+if (onnxruntime_USE_MKLML)
+  # Copy the MKL-ML runtime libraries into the Python package's capi directory.
+  set(onnxruntime_python_mklml_runtime_libs ${MKLML_LIB_DIR}/${MKLML_SHARED_LIB})
+  # IOMP5MD_SHARED_LIB is only set when MKL-ML uses the Intel OpenMP runtime; on Linux with
+  # onnxruntime_USE_OPENMP (libmklml_gnu.so) it is unset, and expanding it unconditionally
+  # would pass the bare ${MKLML_LIB_DIR}/ directory to "cmake -E copy" as a source.
+  if (IOMP5MD_SHARED_LIB)
+    list(APPEND onnxruntime_python_mklml_runtime_libs ${MKLML_LIB_DIR}/${IOMP5MD_SHARED_LIB})
+  endif()
+  add_custom_command(
+    TARGET onnxruntime_pybind11_state POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E copy
+        ${onnxruntime_python_mklml_runtime_libs}
+        $<TARGET_FILE_DIR:${test_data_target}>/onnxruntime/capi/
+  )
+endif()
+
 if (onnxruntime_USE_NUPHAR)
   file(GLOB onnxruntime_python_nuphar_python_srcs CONFIGURE_DEPENDS
     "${ONNXRUNTIME_ROOT}/core/providers/nuphar/scripts/*"

diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake
@@ -678,6 +678,22 @@ if (onnxruntime_USE_DNNL)
     COMMAND ${CMAKE_COMMAND} -E copy ${DNNL_DLL_PATH} $<TARGET_FILE_DIR:${test_data_target}>
     )
 endif()
+if (onnxruntime_USE_MKLML)
+  # Copy the MKL-ML runtime libraries into the output directory of ${test_data_target}.
+  set(onnxruntime_test_mklml_runtime_libs ${MKLML_LIB_DIR}/${MKLML_SHARED_LIB})
+  # IOMP5MD_SHARED_LIB is only set when MKL-ML uses the Intel OpenMP runtime; on Linux with
+  # onnxruntime_USE_OPENMP (libmklml_gnu.so) it is unset, and expanding it unconditionally
+  # would pass the bare ${MKLML_LIB_DIR}/ directory to "cmake -E copy" as a source.
+  if (IOMP5MD_SHARED_LIB)
+    list(APPEND onnxruntime_test_mklml_runtime_libs ${MKLML_LIB_DIR}/${IOMP5MD_SHARED_LIB})
+  endif()
+  add_custom_command(
+    TARGET ${test_data_target} POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E copy
+    ${onnxruntime_test_mklml_runtime_libs}
+    $<TARGET_FILE_DIR:${test_data_target}>
+  )
+endif()
 if(WIN32)
   if (onnxruntime_USE_NGRAPH)
     add_custom_command(

diff --git a/onnxruntime/contrib_ops/cpu/bert/attention.cc b/onnxruntime/contrib_ops/cpu/bert/attention.cc
@@ -21,7 +21,9 @@ class Attention : public OpKernel, public AttentionCPUBase {
   explicit Attention(const OpKernelInfo& info);
 
   Status Compute(OpKernelContext* context) const override;
+#if !defined(USE_MKLML_FOR_BLAS)
   Status PrePack(const Tensor& tensor, int input_idx, bool& is_packed) override;
+#endif
 
  private:
   BufferUniquePtr packed_weights_;
@@ -173,6 +175,7 @@ template <typename T>
 Attention<T>::Attention(const OpKernelInfo& info) : OpKernel(info), AttentionCPUBase(info) {
 }
 
+#if !defined(USE_MKLML_FOR_BLAS)
 
 template <typename T>
 Status Attention<T>::PrePack(const Tensor& weights, int input_idx, bool& is_packed) {
@@ -219,6 +222,8 @@ Status Attention<T>::PrePack(const Tensor& weights, int input_idx, bool& is_pack
   return Status::OK();
 }
 
+#endif
+
 template <typename T>
 Status Attention<T>::Compute(OpKernelContext* context) const {
   const Tensor* input = context->Input<Tensor>(0);

diff --git a/onnxruntime/contrib_ops/cpu/cdist.cc b/onnxruntime/contrib_ops/cpu/cdist.cc
@@ -56,8 +56,8 @@ static void CalculateSqeuclidean(const Tensor& a, const Tensor& b, Tensor& c, co
   // in Xij and Yjk are very similar, so subtracting can be problematic.
   // Due to that we calculate -2*sum_k(Xik*Yjk) using GEMM, add sum_k(Xik**2) next, and add sum_k(Yjk**2) last.
 
-// use MLAS on 64-bit (no 32-bit dgemm)
-#if defined(_M_AMD64) || defined(__x86_64__)
+// use MLAS on 64-bit (no 32-bit dgemm), or MKL on 32-bit or 64-bit
+#if defined(_M_AMD64) || defined(__x86_64__) || defined(USE_MKLML_FOR_BLAS)
   // Use GEMM of A and B^T with -2 as alpha to calculate -2*sum_k(Xik*Yjk)
   math::Gemm<T>(CBLAS_TRANSPOSE::CblasNoTrans, CBLAS_TRANSPOSE::CblasTrans,
                 m, n, k,

diff --git a/onnxruntime/core/providers/cpu/math/matmul.cc b/onnxruntime/core/providers/cpu/math/matmul.cc
@@ -126,6 +126,7 @@ Status MatMul<T>::Compute(OpKernelContext* ctx) const {
   return Status::OK();
 }
 
+#if !defined(USE_MKLML_FOR_BLAS)
 Status MatMul<float>::PrePack(const Tensor& tensor, int input_idx, bool& is_packed) {
   is_packed = false;
 
@@ -135,6 +136,7 @@ Status MatMul<float>::PrePack(const Tensor& tensor, int input_idx, bool& is_pack
   }
   return Status::OK();
 }
+#endif
 
 Status MatMul<float>::Compute(OpKernelContext* ctx) const {
   concurrency::ThreadPool* thread_pool = ctx->GetOperatorThreadPool();
@@ -162,6 +164,7 @@ Status MatMul<float>::Compute(OpKernelContext* ctx) const {
   // TODO: replace it with GemmBatch for performance, it's OK for now as GemmBatch unrolls as well
   size_t max_len = helper.OutputOffsets().size();
   for (size_t i = 0; i < max_len; i++) {
+#if !defined(USE_MKLML_FOR_BLAS)
     if (packed_b_) {
       MlasGemm(
           trans_a ? CblasTrans : CblasNoTrans,
@@ -178,6 +181,7 @@ Status MatMul<float>::Compute(OpKernelContext* ctx) const {
           thread_pool);
       continue;
     }
+#endif
     math::Gemm<float, concurrency::ThreadPool>(
         trans_a ? CblasTrans : CblasNoTrans,
         trans_b ? CblasTrans : CblasNoTrans,

diff --git a/onnxruntime/core/providers/cpu/math/matmul.h b/onnxruntime/core/providers/cpu/math/matmul.h
@@ -24,7 +24,9 @@ class MatMul<float> final : public OpKernel {
     info.GetAttrOrDefault<float>("alpha", &alpha_attr_, 1.0);
   }
 
+#if !defined(USE_MKLML_FOR_BLAS)
   Status PrePack(const Tensor& tensor, int input_idx, bool& is_packed) override;
+#endif
 
   Status Compute(OpKernelContext* context) const override;
 

diff --git a/onnxruntime/core/providers/cpu/rnn/deep_cpu_lstm.cc b/onnxruntime/core/providers/cpu/rnn/deep_cpu_lstm.cc
@@ -204,6 +204,7 @@ Status DeepCpuLstmOp::TryPackWeights(const Tensor& weights, PackedWeights& packe
   return Status::OK();
 }
 
+#if !defined(USE_MKLML_FOR_BLAS)
 Status DeepCpuLstmOp::PrePack(const Tensor& tensor, int input_idx, bool& is_packed) {
   is_packed = false;
 
@@ -217,6 +218,7 @@ Status DeepCpuLstmOp::PrePack(const Tensor& tensor, int input_idx, bool& is_pack
 
   return Status::OK();
 }
+#endif
 
 Status DeepCpuLstmOp::Compute(OpKernelContext* context) const {
   const Tensor& X = *context->Input<Tensor>(0);  // inputs. [seq_length, batch_size, input_size]

diff --git a/onnxruntime/core/providers/cpu/rnn/deep_cpu_lstm.h b/onnxruntime/core/providers/cpu/rnn/deep_cpu_lstm.h
@@ -18,7 +18,9 @@ class DeepCpuLstmOp final : public OpKernel, public LSTMBase {
  public:
   DeepCpuLstmOp(const OpKernelInfo& info) : OpKernel(info), LSTMBase(info) {}
 
+#if !defined(USE_MKLML_FOR_BLAS)
   Status PrePack(const Tensor& tensor, int input_idx, bool& is_packed) override;
+#endif
   Status Compute(OpKernelContext* context) const override;
 
   ~DeepCpuLstmOp() override = default;

diff --git a/onnxruntime/core/providers/nuphar/common/nuphar_settings.cc b/onnxruntime/core/providers/nuphar/common/nuphar_settings.cc
@@ -58,7 +58,7 @@ void SetDefaultOptions(std::map<std::string, std::string>& options) {
   options.insert(std::make_pair(cache_so_name_opt, cache_so_name_default));
 
   std::string parallel_min_workloads_opt(kNupharParallelMinWorkloads);
-#if defined(_OPENMP)
+#if defined(_OPENMP) || defined(USE_MKLML)
   // a rough estimate of workloads based on static dimensions for each thread, when using parallel schedule
   // user may change it to 0 to turn it off,
   // or use OMP_NUM_THREADS to control TVM thread pool similar to control MKL

diff --git a/onnxruntime/core/providers/nuphar/extern/igemv_mkl.cc b/onnxruntime/core/providers/nuphar/extern/igemv_mkl.cc
@@ -0,0 +1,36 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "igemv_mkl.h"
+
+namespace onnxruntime {
+#ifdef NUPHAR_USE_MKL
+void MKLIntGemvS16S16S32R(  // int16 x int16 -> int32 product, forwarded to MKL's cblas_gemm_s16s16s32
+    int16_t* matrixA,  // passed as MKL operand A with CblasTrans and leading dimension K
+    int16_t* matrixB,  // passed as MKL operand B with CblasNoTrans and leading dimension K
+    int M,
+    int N,
+    int K,
+    int32_t* output) {  // passed as MKL result matrix C with leading dimension M
+  MKL_INT32 co = 0;  // zero C offset, handed to MKL by address together with CblasFixOffset
+  cblas_gemm_s16s16s32(CBLAS_LAYOUT::CblasColMajor, CBLAS_TRANSPOSE::CblasTrans, CBLAS_TRANSPOSE::CblasNoTrans, CBLAS_OFFSET::CblasFixOffset,
+                       M, N, K,
+                       1, matrixA, K,  // per the MKL argument order: alpha = 1, A, lda = K
+                       0, matrixB, K, 0, 0, output, M, &co);  // ao = 0, B, ldb = K, bo = 0, beta = 0, C, ldc = M, co
+}
+void MKLIntGemvS8U8S32R(  // int8 x uint8 -> int32 product, forwarded to MKL's cblas_gemm_s8u8s32
+    int8_t* matrixA,  // passed as MKL operand A with CblasTrans and leading dimension K
+    uint8_t* matrixB,  // passed as MKL operand B with CblasNoTrans and leading dimension K
+    int M,
+    int N,
+    int K,
+    int32_t* output) {  // passed as MKL result matrix C with leading dimension M
+  MKL_INT32 co = 0;  // zero C offset, handed to MKL by address together with CblasFixOffset
+  cblas_gemm_s8u8s32(CBLAS_LAYOUT::CblasColMajor, CBLAS_TRANSPOSE::CblasTrans, CBLAS_TRANSPOSE::CblasNoTrans, CBLAS_OFFSET::CblasFixOffset,
+                     M, N, K,
+                     1, matrixA, K,  // per the MKL argument order: alpha = 1, A, lda = K
+                     0, matrixB, K, 0, 0, output, M, &co);  // ao = 0, B, ldb = K, bo = 0, beta = 0, C, ldc = M, co
+}
+#endif
+
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/nuphar/extern/igemv_mkl.h b/onnxruntime/core/providers/nuphar/extern/igemv_mkl.h
@@ -0,0 +1,30 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+#include <stdint.h>
+
+#ifdef NUPHAR_USE_MKL
+// Need to build with USE_MKLML
+#include <mkl_cblas.h>
+#endif  // NUPHAR_USE_MKL
+
+namespace onnxruntime {
+#ifdef NUPHAR_USE_MKL
+void MKLIntGemvS16S16S32R(  // declared only under NUPHAR_USE_MKL; igemv_mkl.cc forwards it to cblas_gemm_s16s16s32
+    int16_t* matrixA,  // int16 input, given to MKL as operand A
+    int16_t* matrixB,  // int16 input, given to MKL as operand B
+    int M,
+    int N,
+    int K,
+    int32_t* output);  // int32 buffer that MKL writes the result into
+
+void MKLIntGemvS8U8S32R(  // declared only under NUPHAR_USE_MKL; igemv_mkl.cc forwards it to cblas_gemm_s8u8s32
+    int8_t* matrixA,  // int8 input, given to MKL as operand A
+    uint8_t* matrixB,  // uint8 input, given to MKL as operand B
+    int M,
+    int N,
+    int K,
+    int32_t* output);  // int32 buffer that MKL writes the result into
+#endif
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/nuphar/mti_x86/quantize/imatmul16_extern.cc b/onnxruntime/core/providers/nuphar/mti_x86/quantize/imatmul16_extern.cc
@@ -5,6 +5,7 @@
 
 #include "core/common/common.h"
 #include "core/codegen/mti/mti_tvm_utils.h"
+#include "core/providers/nuphar/extern/igemv_mkl.h"
 #include "core/providers/nuphar/extern/igemv_avx2.h"
 #include <topi/detail/extern.h>
 

diff --git a/onnxruntime/core/providers/nuphar/mti_x86/quantize/imatmul_extern.cc b/onnxruntime/core/providers/nuphar/mti_x86/quantize/imatmul_extern.cc
@@ -5,6 +5,7 @@
 
 #include "core/common/common.h"
 #include "core/codegen/mti/mti_tvm_utils.h"
+#include "core/providers/nuphar/extern/igemv_mkl.h"
 #include "core/providers/nuphar/extern/igemv_avx2.h"
 #include <topi/detail/extern.h>