Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 41 additions & 2 deletions cmake/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ option(onnxruntime_USE_EIGEN_FOR_BLAS "Use eign for blas" ON)
option(onnxruntime_USE_NNAPI_BUILTIN "Build with builtin NNAPI lib for Android NNAPI support" OFF)
option(onnxruntime_USE_RKNPU "Build with RKNPU support" OFF)
option(onnxruntime_USE_DNNL "Build with DNNL support" OFF)
option(onnxruntime_USE_MKLML "Build the default cpu provider with MKL-ML binary dependency" OFF)
option(onnxruntime_USE_FEATURIZERS "Build ML Featurizers support" OFF)
option(onnxruntime_USE_NGRAPH "Build with nGraph support" OFF)
option(onnxruntime_USE_OPENBLAS "Use openblas" OFF)
Expand Down Expand Up @@ -199,7 +200,12 @@ if(onnxruntime_USE_OPENMP)
if (OPENMP_FOUND)
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
include_directories(${OpenMP_CXX_INCLUDE_DIR})
include_directories(${OpenMP_CXX_INCLUDE_DIR})
# MKLML and nGraph depend on their own OpenMP library, which may differ from the compiler's.
# Reject building nGraph (any platform) or MKLML (Windows/macOS) together with OpenMP.
if((WIN32 OR APPLE) AND onnxruntime_USE_MKLML)
message(FATAL_ERROR "Please use only one of onnxruntime_USE_MKLML, onnxruntime_USE_OPENMP")
endif()
if(onnxruntime_USE_NGRAPH)
message(FATAL_ERROR "Please use only one of onnxruntime_USE_NGRAPH, onnxruntime_USE_OPENMP")
endif()
Expand Down Expand Up @@ -756,7 +762,8 @@ if (onnxruntime_USE_ARMNN)
list(APPEND onnxruntime_EXTERNAL_LIBRARIES armnn pthread arm_compute_core arm_compute arm_compute_graph)
endif()

if (onnxruntime_USE_DNNL)
# MKLML
if (onnxruntime_USE_DNNL OR onnxruntime_USE_MKLML)
include(dnnl)
endif()

Expand All @@ -772,13 +779,29 @@ if (onnxruntime_USE_TVM)
if (onnxruntime_USE_OPENMP)
set(USE_OPENMP "gnu")
endif()
# When MKLML is enabled, switch USE_OPENMP to "intel" (overriding the "gnu" value
# set just above for onnxruntime_USE_OPENMP) and point OMP_LIBRARY at the OpenMP
# library inside MKLML_LIB_DIR. Both variables are presumably read by the TVM
# build added via add_subdirectory() below - TODO confirm.
if (onnxruntime_USE_MKLML)
set(USE_OPENMP "intel")
# make sure MKLML in ORT is used by TVM
if (WIN32)
# Windows uses the import library name (IOMP5MD_IMPORT_LIB).
set(OMP_LIBRARY ${MKLML_LIB_DIR}/${IOMP5MD_IMPORT_LIB})
else()
# Other platforms use the shared library name (IOMP5MD_SHARED_LIB).
# NOTE(review): cmake/external/dnnl.cmake appears to leave IOMP5MD_SHARED_LIB unset
# on Linux when onnxruntime_USE_OPENMP is ON; OMP_LIBRARY would then be just the
# lib directory - confirm that combination is intended.
set(OMP_LIBRARY ${MKLML_LIB_DIR}/${IOMP5MD_SHARED_LIB})
endif()
endif()

add_subdirectory(${PROJECT_SOURCE_DIR}/external/tvm EXCLUDE_FROM_ALL)
set_target_properties(tvm PROPERTIES FOLDER "External/tvm")
set_target_properties(tvm_topi PROPERTIES FOLDER "External/tvm")
set_target_properties(tvm_runtime PROPERTIES FOLDER "External/tvm")
set_target_properties(nnvm_compiler PROPERTIES FOLDER "External/tvm")

# Every TVM target must wait for the MKLML external project, so the MKLML
# files exist before any of them is built.
if (onnxruntime_USE_MKLML)
  foreach(onnxruntime_tvm_mklml_dependent tvm tvm_topi tvm_runtime nnvm_compiler)
    add_dependencies(${onnxruntime_tvm_mklml_dependent} project_mklml)
  endforeach()
endif()

set(TVM_INCLUDES ${PROJECT_SOURCE_DIR}/external/tvm/include
${PROJECT_SOURCE_DIR}/external/tvm/3rdparty/dmlc-core/include
${PROJECT_SOURCE_DIR}/external/tvm/3rdparty/dlpack/include
Expand Down Expand Up @@ -827,6 +850,22 @@ endif()
set_target_properties(onnx PROPERTIES FOLDER "External/ONNX")
set_target_properties(onnx_proto PROPERTIES FOLDER "External/ONNX")

# MKL-ML backed default CPU provider: compile definitions, the library to link,
# the external-project dependency, and include/link search paths.
if (onnxruntime_USE_MKLML)
  # USE_MKLML_FOR_BLAS is tested by CPU kernel sources (e.g. to compile out PrePack).
  add_definitions(-DUSE_MKLML=1 -DUSE_MKLML_FOR_BLAS=1)

  # Windows and macOS link "mklml"; elsewhere the library name depends on
  # whether OpenMP is enabled (mklml_gnu) or not (mklml_intel).
  if (WIN32 OR APPLE)
    list(APPEND onnxruntime_EXTERNAL_LIBRARIES mklml)
  elseif(onnxruntime_USE_OPENMP)
    list(APPEND onnxruntime_EXTERNAL_LIBRARIES mklml_gnu)
  else()
    list(APPEND onnxruntime_EXTERNAL_LIBRARIES mklml_intel)
  endif()

  list(APPEND onnxruntime_EXTERNAL_DEPENDENCIES project_mklml)
  include_directories(${MKLML_INCLUDE_DIR})
  link_directories(${MKLML_LIB_DIR})
endif()

if (onnxruntime_USE_NGRAPH)
if (NOT onnxruntime_USE_FULL_PROTOBUF)
message(FATAL_ERROR "Please set onnxruntime_USE_FULL_PROTOBUF=ON for nGraph execution provider.")
Expand Down
48 changes: 45 additions & 3 deletions cmake/external/dnnl.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -3,23 +3,62 @@ include (ExternalProject)
set(DNNL_URL https://github.com/oneapi-src/onednn)
# If DNNL_TAG is updated, check if MKLML_VERSION and platform.cmake.patch need to be updated.
set(DNNL_TAG v1.7)
set(MKLML_VERSION 2019.0.5.20190502)

# Per-platform file names for the DNNL libraries and, when onnxruntime_USE_MKLML
# is ON, for the MKLML / Intel OpenMP libraries. Also selects the OS tag and
# archive extension used to build the MKLML download URL.
if(WIN32)
  set(MKLML_OS_VERSION_STR "win")
  set(MKLML_FILE_EXTENSION "zip")
  set(DNNL_SHARED_LIB dnnl.dll)
  set(DNNL_IMPORT_LIB dnnl.lib)
  if(onnxruntime_USE_MKLML)
    # Windows alone uses a newer MKLML binary; it carries a fix for a hang
    # during thread cleanup.
    set(MKLML_VERSION 2020.0.20190813)
    set(MKLML_SHARED_LIB mklml.dll)
    set(MKLML_IMPORT_LIB mklml.lib)
    set(IOMP5MD_SHARED_LIB libiomp5md.dll)
    set(IOMP5MD_IMPORT_LIB libiomp5md.lib)
  endif()
elseif(APPLE)
  set(MKLML_FILE_EXTENSION "tgz")
  set(DNNL_SHARED_LIB libdnnl.1.dylib)
  set(MKLML_OS_VERSION_STR "mac")
  if(onnxruntime_USE_MKLML)
    set(MKLML_SHARED_LIB libmklml.dylib)
    set(IOMP5MD_SHARED_LIB libiomp5.dylib)
  endif()
else()
  set(MKLML_FILE_EXTENSION "tgz")
  set(DNNL_SHARED_LIB libdnnl.so.1)
  set(MKLML_OS_VERSION_STR "lnx")
  if(onnxruntime_USE_MKLML)
    if(onnxruntime_USE_OPENMP)
      # With OpenMP enabled only the GNU flavour of MKLML is named;
      # IOMP5MD_SHARED_LIB is deliberately not set in this branch.
      set(MKLML_SHARED_LIB libmklml_gnu.so)
    else()
      set(MKLML_SHARED_LIB libmklml_intel.so)
      set(IOMP5MD_SHARED_LIB libiomp5.so)
    endif()
  endif()
endif()

# Fetch the prebuilt MKL-ML archive as an external project and publish its
# include/lib directories for the rest of the build.
if (onnxruntime_USE_MKLML)
# The archive is hosted under the mkl-dnn v0.20 GitHub release; the file name is
# assembled from the OS tag, MKLML version and extension chosen above.
set(MKLDNN_VERSION_SHORT v0.20)
set(MKLML_URL https://github.com/intel/mkl-dnn/releases/download/${MKLDNN_VERSION_SHORT}/mklml_${MKLML_OS_VERSION_STR}_${MKLML_VERSION}.${MKLML_FILE_EXTENSION})

# Download/unpack only: configure, build, update and install steps are all empty.
# NOTE(review): no URL_HASH is given, so the downloaded binary is not integrity-checked.
ExternalProject_Add(project_mklml
PREFIX mklml
URL ${MKLML_URL}
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
UPDATE_COMMAND ""
INSTALL_COMMAND "" )

# <PREFIX>/src/<name> is ExternalProject's default source (unpack) directory.
set(MKML_DIR ${CMAKE_CURRENT_BINARY_DIR}/mklml/src/project_mklml)
set(MKLML_INCLUDE_DIR "${MKML_DIR}/include")
set(MKLML_LIB_DIR "${MKML_DIR}/lib")
link_directories(${MKLML_LIB_DIR})
endif()

if (onnxruntime_USE_DNNL AND onnxruntime_DNNL_GPU_RUNTIME STREQUAL "ocl" AND onnxruntime_DNNL_OPENCL_ROOT STREQUAL "")
message(FATAL_ERROR "onnxruntime_DNNL_OPENCL_ROOT required for onnxruntime_DNNL_GPU_RUNTIME")
elseif(onnxruntime_USE_DNNL AND onnxruntime_DNNL_GPU_RUNTIME STREQUAL "ocl")
Expand Down Expand Up @@ -57,4 +96,7 @@ if (onnxruntime_USE_DNNL)
CMAKE_ARGS -DDNNL_BUILD_TESTS=OFF -DDNNL_BUILD_EXAMPLES=OFF -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=${DNNL_INSTALL} ${DNNL_GPU_CMAKE_ARGS}
)
link_directories(${DNNL_LIB_DIR})
#if (onnxruntime_USE_MKLML)
# add_dependencies(project_dnnl project_mklml)
#endif()
endif()
9 changes: 9 additions & 0 deletions cmake/onnxruntime_python.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -379,6 +379,15 @@ if (onnxruntime_USE_TVM)
)
endif()

# Copy the MKLML runtime libraries next to the Python bindings after
# onnxruntime_pybind11_state is built.
if (onnxruntime_USE_MKLML)
  set(onnxruntime_python_mklml_runtime_libs "${MKLML_LIB_DIR}/${MKLML_SHARED_LIB}")
  # IOMP5MD_SHARED_LIB is not set on Linux when onnxruntime_USE_OPENMP is ON
  # (the GNU flavour of MKLML is used there). Only add the Intel OpenMP runtime
  # when it is actually named, so the copy never receives the bare lib directory.
  if (IOMP5MD_SHARED_LIB)
    list(APPEND onnxruntime_python_mklml_runtime_libs "${MKLML_LIB_DIR}/${IOMP5MD_SHARED_LIB}")
  endif()
  add_custom_command(
    TARGET onnxruntime_pybind11_state POST_BUILD
    COMMAND ${CMAKE_COMMAND} -E copy
        ${onnxruntime_python_mklml_runtime_libs}
        $<TARGET_FILE_DIR:${test_data_target}>/onnxruntime/capi/
  )
endif()

if (onnxruntime_USE_NUPHAR)
file(GLOB onnxruntime_python_nuphar_python_srcs CONFIGURE_DEPENDS
"${ONNXRUNTIME_ROOT}/core/providers/nuphar/scripts/*"
Expand Down
8 changes: 8 additions & 0 deletions cmake/onnxruntime_unittests.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -678,6 +678,14 @@ if (onnxruntime_USE_DNNL)
COMMAND ${CMAKE_COMMAND} -E copy ${DNNL_DLL_PATH} $<TARGET_FILE_DIR:${test_data_target}>
)
endif()
# Copy the MKLML runtime libraries into the test binary directory after
# ${test_data_target} is built.
if (onnxruntime_USE_MKLML)
  set(onnxruntime_test_mklml_runtime_libs "${MKLML_LIB_DIR}/${MKLML_SHARED_LIB}")
  # IOMP5MD_SHARED_LIB is not set on Linux when onnxruntime_USE_OPENMP is ON
  # (the GNU flavour of MKLML is used there). Only add the Intel OpenMP runtime
  # when it is actually named, so the copy never receives the bare lib directory.
  if (IOMP5MD_SHARED_LIB)
    list(APPEND onnxruntime_test_mklml_runtime_libs "${MKLML_LIB_DIR}/${IOMP5MD_SHARED_LIB}")
  endif()
  add_custom_command(
    TARGET ${test_data_target} POST_BUILD
    COMMAND ${CMAKE_COMMAND} -E copy
        ${onnxruntime_test_mklml_runtime_libs}
        $<TARGET_FILE_DIR:${test_data_target}>
  )
endif()
if(WIN32)
if (onnxruntime_USE_NGRAPH)
add_custom_command(
Expand Down
5 changes: 5 additions & 0 deletions onnxruntime/contrib_ops/cpu/bert/attention.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,9 @@ class Attention : public OpKernel, public AttentionCPUBase {
explicit Attention(const OpKernelInfo& info);

Status Compute(OpKernelContext* context) const override;
#if !defined(USE_MKLML_FOR_BLAS)
Status PrePack(const Tensor& tensor, int input_idx, bool& is_packed) override;
#endif

private:
BufferUniquePtr packed_weights_;
Expand Down Expand Up @@ -173,6 +175,7 @@ template <typename T>
Attention<T>::Attention(const OpKernelInfo& info) : OpKernel(info), AttentionCPUBase(info) {
}

#if !defined(USE_MKLML_FOR_BLAS)

template <typename T>
Status Attention<T>::PrePack(const Tensor& weights, int input_idx, bool& is_packed) {
Expand Down Expand Up @@ -219,6 +222,8 @@ Status Attention<T>::PrePack(const Tensor& weights, int input_idx, bool& is_pack
return Status::OK();
}

#endif

template <typename T>
Status Attention<T>::Compute(OpKernelContext* context) const {
const Tensor* input = context->Input<Tensor>(0);
Expand Down
4 changes: 2 additions & 2 deletions onnxruntime/contrib_ops/cpu/cdist.cc
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,8 @@ static void CalculateSqeuclidean(const Tensor& a, const Tensor& b, Tensor& c, co
// in Xij and Yjk are very similar, so subtracting can be problematic.
// Due to that we calculate -2*sum_k(Xik*Yjk) using GEMM, add sum_k(Xik**2) next, and add sum_k(Yjk**2) last.

// use MLAS on 64-bit (no 32-bit dgemm)
#if defined(_M_AMD64) || defined(__x86_64__)
// use MLAS on 64-bit (no 32-bit dgemm), or MKL on 32-bit or 64-bit
#if defined(_M_AMD64) || defined(__x86_64__) || defined(USE_MKLML_FOR_BLAS)
// Use GEMM of A and B^T with -2 as alpha to calculate -2*sum_k(Xik*Yjk)
math::Gemm<T>(CBLAS_TRANSPOSE::CblasNoTrans, CBLAS_TRANSPOSE::CblasTrans,
m, n, k,
Expand Down
4 changes: 4 additions & 0 deletions onnxruntime/core/providers/cpu/math/matmul.cc
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ Status MatMul<T>::Compute(OpKernelContext* ctx) const {
return Status::OK();
}

#if !defined(USE_MKLML_FOR_BLAS)
Status MatMul<float>::PrePack(const Tensor& tensor, int input_idx, bool& is_packed) {
is_packed = false;

Expand All @@ -135,6 +136,7 @@ Status MatMul<float>::PrePack(const Tensor& tensor, int input_idx, bool& is_pack
}
return Status::OK();
}
#endif

Status MatMul<float>::Compute(OpKernelContext* ctx) const {
concurrency::ThreadPool* thread_pool = ctx->GetOperatorThreadPool();
Expand Down Expand Up @@ -162,6 +164,7 @@ Status MatMul<float>::Compute(OpKernelContext* ctx) const {
// TODO: replace it with GemmBatch for performance, it's OK for now as GemmBatch unrolls as well
size_t max_len = helper.OutputOffsets().size();
for (size_t i = 0; i < max_len; i++) {
#if !defined(USE_MKLML_FOR_BLAS)
if (packed_b_) {
MlasGemm(
trans_a ? CblasTrans : CblasNoTrans,
Expand All @@ -178,6 +181,7 @@ Status MatMul<float>::Compute(OpKernelContext* ctx) const {
thread_pool);
continue;
}
#endif
math::Gemm<float, concurrency::ThreadPool>(
trans_a ? CblasTrans : CblasNoTrans,
trans_b ? CblasTrans : CblasNoTrans,
Expand Down
2 changes: 2 additions & 0 deletions onnxruntime/core/providers/cpu/math/matmul.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,9 @@ class MatMul<float> final : public OpKernel {
info.GetAttrOrDefault<float>("alpha", &alpha_attr_, 1.0);
}

#if !defined(USE_MKLML_FOR_BLAS)
Status PrePack(const Tensor& tensor, int input_idx, bool& is_packed) override;
#endif

Status Compute(OpKernelContext* context) const override;

Expand Down
2 changes: 2 additions & 0 deletions onnxruntime/core/providers/cpu/rnn/deep_cpu_lstm.cc
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,7 @@ Status DeepCpuLstmOp::TryPackWeights(const Tensor& weights, PackedWeights& packe
return Status::OK();
}

#if !defined(USE_MKLML_FOR_BLAS)
Status DeepCpuLstmOp::PrePack(const Tensor& tensor, int input_idx, bool& is_packed) {
is_packed = false;

Expand All @@ -217,6 +218,7 @@ Status DeepCpuLstmOp::PrePack(const Tensor& tensor, int input_idx, bool& is_pack

return Status::OK();
}
#endif

Status DeepCpuLstmOp::Compute(OpKernelContext* context) const {
const Tensor& X = *context->Input<Tensor>(0); // inputs. [seq_length, batch_size, input_size]
Expand Down
2 changes: 2 additions & 0 deletions onnxruntime/core/providers/cpu/rnn/deep_cpu_lstm.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@ class DeepCpuLstmOp final : public OpKernel, public LSTMBase {
public:
DeepCpuLstmOp(const OpKernelInfo& info) : OpKernel(info), LSTMBase(info) {}

#if !defined(USE_MKLML_FOR_BLAS)
Status PrePack(const Tensor& tensor, int input_idx, bool& is_packed) override;
#endif
Status Compute(OpKernelContext* context) const override;

~DeepCpuLstmOp() override = default;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ void SetDefaultOptions(std::map<std::string, std::string>& options) {
options.insert(std::make_pair(cache_so_name_opt, cache_so_name_default));

std::string parallel_min_workloads_opt(kNupharParallelMinWorkloads);
#if defined(_OPENMP)
#if defined(_OPENMP) || defined(USE_MKLML)
// a rough estimate of workloads based on static dimensions for each thread, when using parallel schedule
// user may change it to 0 to turn it off,
// or use OMP_NUM_THREADS to control TVM thread pool similar to control MKL
Expand Down
36 changes: 36 additions & 0 deletions onnxruntime/core/providers/nuphar/extern/igemv_mkl.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "igemv_mkl.h"

namespace onnxruntime {
#ifdef NUPHAR_USE_MKL

// Thin wrapper over MKL's int16 x int16 -> int32 GEMM.
// Despite the "Gemv" name this forwards to cblas_gemm_s16s16s32 with a
// column-major layout, A transposed and B not transposed. A and B both use
// leading dimension K; the M x N result is written to `output` with leading
// dimension M. Argument roles noted below follow the Intel MKL signature.
void MKLIntGemvS16S16S32R(int16_t* matrixA, int16_t* matrixB, int M, int N, int K, int32_t* output) {
  MKL_INT32 c_offset = 0;  // fixed offset added to C (CblasFixOffset); zero here
  cblas_gemm_s16s16s32(
      CBLAS_LAYOUT::CblasColMajor,
      CBLAS_TRANSPOSE::CblasTrans,
      CBLAS_TRANSPOSE::CblasNoTrans,
      CBLAS_OFFSET::CblasFixOffset,
      M, N, K,
      1,            // alpha
      matrixA, K,   // A, lda
      0,            // offset applied to A
      matrixB, K,   // B, ldb
      0,            // offset applied to B
      0,            // beta
      output, M,    // C, ldc
      &c_offset);
}

// Thin wrapper over MKL's int8 x uint8 -> int32 GEMM (cblas_gemm_s8u8s32).
// Same layout, transposition, leading dimensions and zero offsets as the
// int16 variant above.
void MKLIntGemvS8U8S32R(int8_t* matrixA, uint8_t* matrixB, int M, int N, int K, int32_t* output) {
  MKL_INT32 c_offset = 0;  // fixed offset added to C (CblasFixOffset); zero here
  cblas_gemm_s8u8s32(
      CBLAS_LAYOUT::CblasColMajor,
      CBLAS_TRANSPOSE::CblasTrans,
      CBLAS_TRANSPOSE::CblasNoTrans,
      CBLAS_OFFSET::CblasFixOffset,
      M, N, K,
      1,            // alpha
      matrixA, K,   // A, lda
      0,            // offset applied to A
      matrixB, K,   // B, ldb
      0,            // offset applied to B
      0,            // beta
      output, M,    // C, ldc
      &c_offset);
}

#endif  // NUPHAR_USE_MKL

}  // namespace onnxruntime
30 changes: 30 additions & 0 deletions onnxruntime/core/providers/nuphar/extern/igemv_mkl.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once
#include <stdint.h>

#ifdef NUPHAR_USE_MKL
// Need to build with USE_MKLML
#include <mkl_cblas.h>
#endif // NUPHAR_USE_MKL

namespace onnxruntime {
#ifdef NUPHAR_USE_MKL

// int16 x int16 -> int32 matrix product, implemented with MKL
// (see igemv_mkl.cc). Writes the result into `output`.
void MKLIntGemvS16S16S32R(int16_t* matrixA, int16_t* matrixB, int M, int N, int K, int32_t* output);

// int8 x uint8 -> int32 matrix product, implemented with MKL
// (see igemv_mkl.cc). Writes the result into `output`.
void MKLIntGemvS8U8S32R(int8_t* matrixA, uint8_t* matrixB, int M, int N, int K, int32_t* output);

#endif  // NUPHAR_USE_MKL
}  // namespace onnxruntime
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

#include "core/common/common.h"
#include "core/codegen/mti/mti_tvm_utils.h"
#include "core/providers/nuphar/extern/igemv_mkl.h"
#include "core/providers/nuphar/extern/igemv_avx2.h"
#include <topi/detail/extern.h>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

#include "core/common/common.h"
#include "core/codegen/mti/mti_tvm_utils.h"
#include "core/providers/nuphar/extern/igemv_mkl.h"
#include "core/providers/nuphar/extern/igemv_avx2.h"
#include <topi/detail/extern.h>

Expand Down
Loading