Skip to content

Commit

Permalink
Merge pull request #9765 from microsoft/user/dwayner/DML1.8forORT1.10
Browse files Browse the repository at this point in the history
Update DirectML from 1.5.1 to 1.8.0 for ORT 1.10
  • Loading branch information
fdwr committed Nov 20, 2021
2 parents 6856619 + f28d7ec commit 7396689
Show file tree
Hide file tree
Showing 33 changed files with 2,300 additions and 1,083 deletions.
2 changes: 1 addition & 1 deletion cmake/external/dml.cmake
Expand Up @@ -20,7 +20,7 @@ if (NOT onnxruntime_USE_CUSTOM_DIRECTML)
set(NUGET_CONFIG ${PROJECT_SOURCE_DIR}/../NuGet.config)
set(PACKAGES_CONFIG ${PROJECT_SOURCE_DIR}/../packages.config)
get_filename_component(PACKAGES_DIR ${CMAKE_CURRENT_BINARY_DIR}/../packages ABSOLUTE)
set(DML_PACKAGE_DIR ${PACKAGES_DIR}/Microsoft.AI.DirectML.1.5.1)
set(DML_PACKAGE_DIR ${PACKAGES_DIR}/Microsoft.AI.DirectML.1.8.0)
set(DML_SHARED_LIB DirectML.dll)

# Restore nuget packages, which will pull down the DirectML redist package
Expand Down
Expand Up @@ -122,6 +122,10 @@ namespace Windows::AI::MachineLearning::Adapter
// Operator supports true 64-bit tensors directly, no strides needed.
// So fallback to strided 32-bit only occurs when the device lacks 64-bit support.
bool prefer64BitTensorsDirectly = false;

// The operator supports emulation for uint64/int64 even if the hardware doesn't
// support native uint64/int64 data types.
bool support64BitTensorsViaEmulation = false;
};

using InternalRegistrationInfoMap = std::unordered_map<onnxruntime::KernelDef*, std::shared_ptr<InternalRegistrationInfo>>;
Expand Down
Expand Up @@ -345,6 +345,7 @@ HRESULT STDMETHODCALLTYPE AbiCustomRegistry::RegisterOperatorKernel(
bool supportedWith64BitTensorsVia32BitStrides,
bool supportedWith64BitTensorsVia32BitStridesFromAnyEp,
bool prefer64BitTensorsDirectly,
bool support64BitTensorsViaEmulation,
_In_reads_(constantCpuInputCount) const uint32_t* requiredConstantCpuInputs,
uint32_t constantCpuInputCount) const noexcept
{
Expand Down Expand Up @@ -472,6 +473,7 @@ HRESULT STDMETHODCALLTYPE AbiCustomRegistry::RegisterOperatorKernel(
regInfo->supportedWith64BitTensorsVia32BitStrides = supportedWith64BitTensorsVia32BitStrides;
regInfo->supportedWith64BitTensorsVia32BitStridesFromAnyEp = supportedWith64BitTensorsVia32BitStridesFromAnyEp;
regInfo->prefer64BitTensorsDirectly = prefer64BitTensorsDirectly;
regInfo->support64BitTensorsViaEmulation = support64BitTensorsViaEmulation;

// Only internal operators support usage in DML graphs
if (supportsGraph)
Expand Down Expand Up @@ -546,7 +548,8 @@ HRESULT STDMETHODCALLTYPE AbiCustomRegistry::RegisterOperatorKernel(
requiredConstantCpuInputs ||
supportedWith64BitTensorsVia32BitStrides ||
supportedWith64BitTensorsVia32BitStridesFromAnyEp ||
prefer64BitTensorsDirectly)
prefer64BitTensorsDirectly ||
support64BitTensorsViaEmulation)
{
ORT_THROW_HR(E_INVALIDARG);
}
Expand Down
Expand Up @@ -44,6 +44,7 @@ class AbiCustomRegistry : public WRL::Base<IMLOperatorRegistry, IMLOperatorRegis
bool supportedWith64BitTensorsVia32BitStrides = false,
bool supportedWith64BitTensorsVia32BitStridesFromAnyEp = false,
bool prefer64BitTensorsDirectly = false,
bool support64BitTensorsViaEmulation = false,
_In_reads_(constantCpuInputCount) const uint32_t* requiredConstantCpuInputs = nullptr,
uint32_t constantCpuInputCount = 0) const noexcept override;

Expand Down
Expand Up @@ -20,7 +20,7 @@ DML_TENSOR_DATA_TYPE GetDmlDataTypeFromMlDataTypeNoThrow(MLOperatorTensorDataTyp
case MLOperatorTensorDataType::String: return DML_TENSOR_DATA_TYPE_UNKNOWN;
case MLOperatorTensorDataType::Bool: return DML_TENSOR_DATA_TYPE_UINT8;
case MLOperatorTensorDataType::Float16: return DML_TENSOR_DATA_TYPE_FLOAT16;
case MLOperatorTensorDataType::Double: return DML_TENSOR_DATA_TYPE_UNKNOWN;
case MLOperatorTensorDataType::Double: return DML_TENSOR_DATA_TYPE_FLOAT64;
case MLOperatorTensorDataType::UInt32: return DML_TENSOR_DATA_TYPE_UINT32;
case MLOperatorTensorDataType::UInt64: return DML_TENSOR_DATA_TYPE_UINT64;
case MLOperatorTensorDataType::Complex64: return DML_TENSOR_DATA_TYPE_UNKNOWN;
Expand Down Expand Up @@ -119,7 +119,7 @@ uint32_t GetSupportedDeviceDataTypeMask(IDMLDevice* dmlDevice)
uint32_t deviceTypeMask = 0u;

// Form the bitmask of all supported data types.
for (uint32_t i = 0; i <= DML_TENSOR_DATA_TYPE_INT8; ++i)
for (uint32_t i = 0; i <= DML_TENSOR_DATA_TYPE_INT64; ++i)
{
DML_FEATURE_QUERY_TENSOR_DATA_TYPE_SUPPORT dataTypeQuery = { static_cast<DML_TENSOR_DATA_TYPE>(i) };
DML_FEATURE_DATA_TENSOR_DATA_TYPE_SUPPORT dataTypeSupport = {};
Expand Down
Expand Up @@ -96,4 +96,24 @@ namespace Dml

return minimumImpliedSizeInBytes;
}

template <typename T>
void CastToClampedScalarUnion(DML_TENSOR_DATA_TYPE dataType, T value, DML_SCALAR_UNION* outputValue)
{
    // Saturate 'value' into the numeric range of 'dataType' via clamp_cast and
    // store the result in the corresponding field of the scalar union.
    DML_SCALAR_UNION& scalar = *outputValue;

    switch (dataType)
    {
    // Signed integer destinations.
    case DML_TENSOR_DATA_TYPE_INT8:   scalar.Int8   = clamp_cast<int8_t, T>(value);   break;
    case DML_TENSOR_DATA_TYPE_INT16:  scalar.Int16  = clamp_cast<int16_t, T>(value);  break;
    case DML_TENSOR_DATA_TYPE_INT32:  scalar.Int32  = clamp_cast<int32_t, T>(value);  break;
    case DML_TENSOR_DATA_TYPE_INT64:  scalar.Int64  = clamp_cast<int64_t, T>(value);  break;

    // Unsigned integer destinations.
    case DML_TENSOR_DATA_TYPE_UINT8:  scalar.UInt8  = clamp_cast<uint8_t, T>(value);  break;
    case DML_TENSOR_DATA_TYPE_UINT16: scalar.UInt16 = clamp_cast<uint16_t, T>(value); break;
    case DML_TENSOR_DATA_TYPE_UINT32: scalar.UInt32 = clamp_cast<uint32_t, T>(value); break;
    case DML_TENSOR_DATA_TYPE_UINT64: scalar.UInt64 = clamp_cast<uint64_t, T>(value); break;

    // Floating-point destinations.
    // NOTE(review): the FLOAT16 case writes the Float32 field, since
    // DML_SCALAR_UNION exposes no half-precision member — confirm that every
    // consumer of FLOAT16 scalars expects a 32-bit float here rather than
    // packed half bits.
    case DML_TENSOR_DATA_TYPE_FLOAT16: scalar.Float32 = clamp_cast<float, T>(value);  break;
    case DML_TENSOR_DATA_TYPE_FLOAT32: scalar.Float32 = clamp_cast<float, T>(value);  break;
    case DML_TENSOR_DATA_TYPE_FLOAT64: scalar.Float64 = clamp_cast<double, T>(value); break;

    default:
        assert(false); // Unrecognized / unsupported tensor data type.
    }
}
} // namespace Dml

0 comments on commit 7396689

Please sign in to comment.