
Commit

Merge branch 'main' of github.com:microsoft/onnxruntime into user/pavignol/add-embed-layer-norm
PatriceVignola committed Dec 8, 2022
2 parents 254adc9 + c1cc1d5 commit 06da5ab
Showing 59 changed files with 585 additions and 730 deletions.
2 changes: 1 addition & 1 deletion cgmanifests/generated/cgmanifest.json
@@ -292,7 +292,7 @@
      "component": {
        "type": "git",
        "git": {
-         "commitHash": "8de7772cc72daca8e947b79b83fea46214931604",
+         "commitHash": "80dc998efced8ceb2be59756668a7e90e8bef917",
          "repositoryUrl": "https://github.com/pybind/pybind11.git"
        },
        "comments": "pybind11"
1 change: 1 addition & 0 deletions cmake/CMakeLists.txt
@@ -1224,6 +1224,7 @@ endif()
if (onnxruntime_ENABLE_TRAINING)
add_compile_definitions(ENABLE_TRAINING)
add_compile_definitions(ENABLE_TRAINING_OPS)
+add_compile_definitions(ENABLE_STRIDED_TENSORS)

if (UNIX)
if (EXISTS "${onnxruntime_MPI_HOME}")
2 changes: 1 addition & 1 deletion cmake/deps.txt
@@ -28,7 +28,7 @@ onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/87c7a70688fd98fb355b
protobuf;https://github.com/protocolbuffers/protobuf/archive/refs/tags/v3.18.3.zip;b95bf7e9de9c2249b6c1f2ca556ace49999e90bd
psimd;https://github.com/Maratyszcza/psimd/archive/072586a71b55b7f8c584153d223e95687148a900.zip;1f5454b01f06f9656b77e4a5e2e31d7422487013
pthreadpool;https://github.com/Maratyszcza/pthreadpool/archive/1787867f6183f056420e532eec640cba25efafea.zip;e43e80781560c5ab404a4da20f34d846f5f5d101
-pybind11;https://github.com/pybind/pybind11/archive/refs/tags/v2.6.2.zip;950b3b319384a1a36b252cc821953e5f9be14840
+pybind11;https://github.com/pybind/pybind11/archive/refs/tags/v2.10.1.zip;769b6aa67a77f17a770960f604b727645b6f6a13
pytorch_cpuinfo;https://github.com/pytorch/cpuinfo/archive/5916273f79a21551890fd3d56fc5375a78d1598d.zip;2be4d2ae321fada97cb39eaf4eeba5f8c85597cf
re2;https://github.com/google/re2/archive/refs/tags/2022-06-01.zip;aa77313b76e91b531ee7f3e45f004c6a502a5374
safeint;https://github.com/dcleblanc/SafeInt/archive/ff15c6ada150a5018c5ef2172401cb4529eac9c0.zip;913a4046e5274d329af2806cb53194f617d8c0ab
4 changes: 4 additions & 0 deletions cmake/onnxruntime_session.cmake
@@ -66,3 +66,7 @@ if (NOT onnxruntime_BUILD_SHARED_LIB)
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
FRAMEWORK DESTINATION ${CMAKE_INSTALL_BINDIR})
endif()

+if (onnxruntime_USE_NCCL AND onnxruntime_USE_ROCM)
+  add_dependencies(onnxruntime_session generate_hipified_files)
+endif()
@@ -34,7 +34,7 @@ if [ $RunTestCsharp = "true" ]; then

if [ $PACKAGENAME = "Microsoft.ML.OnnxRuntime.Gpu" ]; then
export TESTONGPU=ON
-dotnet test -p:DefineConstants=USE_CUDA $BUILD_SOURCESDIRECTORY/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/Microsoft.ML.OnnxRuntime.EndToEndTests.csproj --no-restore --verbosity detailed --filter "DisplayName!=TestCUDAProviderOptions"
+dotnet test -p:DefineConstants=USE_CUDA $BUILD_SOURCESDIRECTORY/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/Microsoft.ML.OnnxRuntime.EndToEndTests.csproj --no-restore --verbosity detailed
if [ $? -ne 0 ]; then
echo "Failed to build or execute the end-to-end test"
exit 1
2 changes: 2 additions & 0 deletions docs/ContribOperators.md
@@ -1708,6 +1708,8 @@ This version of the operator has been available since version 1 of the 'com.micr
<dd>no repeat ngrams size</dd>
<dt><tt>pad_token_id</tt> : int (required)</dt>
<dd>The id of the padding token</dd>
+<dt><tt>vocab_size</tt> : int</dt>
+<dd>Size of the vocabulary. If not provided, it will be inferred from the decoder subgraph's output shape</dd>
</dl>

#### Inputs (2 - 7)
1 change: 1 addition & 0 deletions docs/OperatorKernels.md
@@ -1129,6 +1129,7 @@ Do not modify directly.*
|QLinearAdd|*in* A:**T**<br> *in* A_scale:**tensor(float)**<br> *in* A_zero_point:**T**<br> *in* B:**T**<br> *in* B_scale:**tensor(float)**<br> *in* B_zero_point:**T**<br> *in* C_scale:**tensor(float)**<br> *in* C_zero_point:**T**<br> *out* C:**T**|1+|**T** = tensor(int8), tensor(uint8)|
|QLinearSigmoid|*in* X:**T**<br> *in* X_scale:**tensor(float)**<br> *in* X_zero_point:**T**<br> *in* Y_scale:**tensor(float)**<br> *in* Y_zero_point:**T**<br> *out* Y:**T**|1+|**T** = tensor(int8), tensor(uint8)|
|QuantizeLinear|*in* x:**T1**<br> *in* y_scale:**T1**<br> *in* y_zero_point:**T2**<br> *out* y:**T2**|1+|**T1** = tensor(float)<br/> **T2** = tensor(uint8)|
+|SkipLayerNormalization|*in* input:**T**<br> *in* skip:**T**<br> *in* gamma:**T**<br> *in* beta:**T**<br> *in* bias:**T**<br> *out* output:**T**<br> *out* mean:**U**<br> *out* inv_std_var:**U**|1+|**T** = tensor(float), tensor(float16)|
| |
| |
|**Operator Domain:** *com.microsoft.dml*||||
6 changes: 3 additions & 3 deletions include/onnxruntime/core/framework/kernel_def_builder.h
@@ -85,7 +85,7 @@ class KernelDef {

bool HasExternalOutputs() const { return external_outputs_; }

-#ifdef ENABLE_TRAINING
+#ifdef ENABLE_STRIDED_TENSORS
const std::vector<int>& MayStridedInput() const { return may_strided_inputs_; }
const std::vector<std::pair<int, int>>& MayStridedOutput() const { return may_strided_output_map_; }
#endif
@@ -143,7 +143,7 @@ class KernelDef {
// Whether the outputs are from external.
bool external_outputs_ = false;

-#ifdef ENABLE_TRAINING
+#ifdef ENABLE_STRIDED_TENSORS
// An element i means i-th input can be strided tensor.
std::vector<int> may_strided_inputs_;

@@ -261,7 +261,7 @@ class KernelDefBuilder {
return *this;
}

-#ifdef ENABLE_TRAINING
+#ifdef ENABLE_STRIDED_TENSORS
/**
Specify that the input_index-th input can be strided tensor.
*/
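For context, a minimal sketch of how a kernel opts in to strided tensors through this builder API. The kernel name "MyExpand" is hypothetical, not from this commit; MayStridedInput/MayStridedOutput are the methods whose guard this diff changes:

#ifdef ENABLE_STRIDED_TENSORS
// Hypothetical registration: input 0 may arrive as a strided tensor, and
// output 0 may be returned as a strided view sharing data with input 0
// (the pair recorded in may_strided_output_map_).
KernelDefBuilder builder;
builder.SetName("MyExpand")
    .MayStridedInput(0)
    .MayStridedOutput(/*input_index=*/0, /*output_index=*/0);
#endif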
6 changes: 3 additions & 3 deletions include/onnxruntime/core/framework/tensor.h
@@ -248,7 +248,7 @@ class Tensor final {
*/
size_t SizeInBytes() const;

-#ifdef ENABLE_TRAINING
+#ifdef ENABLE_STRIDED_TENSORS
/**
* Get the strides of the tensor.
*/
@@ -276,7 +276,7 @@

void ReleaseBuffer();

-#ifdef ENABLE_TRAINING
+#ifdef ENABLE_STRIDED_TENSORS
bool CheckIsContiguous() const;
#endif

@@ -289,7 +289,7 @@
AllocatorPtr buffer_deleter_;

TensorShape shape_;
-#ifdef ENABLE_TRAINING
+#ifdef ENABLE_STRIDED_TENSORS
mutable TensorShapeVector strides_;
bool is_contiguous_ = true;
#endif
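As background for CheckIsContiguous() and is_contiguous_ above, a standalone sketch of the usual contiguity rule (an assumption about the semantics, not code from this commit): a tensor is contiguous when its strides match the canonical row-major strides derived from its shape.

#include <cstdint>
#include <vector>

// Returns true if `strides` are the canonical row-major strides for `shape`.
// Dimensions of size 1 are skipped: their stride never affects indexing.
bool IsContiguous(const std::vector<int64_t>& shape, const std::vector<int64_t>& strides) {
  int64_t expected = 1;
  for (size_t i = shape.size(); i-- > 0;) {
    if (shape[i] != 1 && strides[i] != expected) return false;
    expected *= shape[i];
  }
  return true;
}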
33 changes: 27 additions & 6 deletions js/web/package-lock.json

Some generated files are not rendered by default.

@@ -14,6 +14,7 @@ void GreedySearchParameters::ParseFromAttributes(const OpKernelInfo& info) {
pad_token_id = static_cast<int>(info.GetAttrOrDefault<int64_t>("pad_token_id", -1));
decoder_start_token_id = static_cast<int>(info.GetAttrOrDefault<int64_t>("decoder_start_token_id", -1));
no_repeat_ngram_size = static_cast<int>(info.GetAttrOrDefault<int64_t>("no_repeat_ngram_size", 0));
+vocab_size = static_cast<int>(info.GetAttrOrDefault<int64_t>("vocab_size", -1));
}

void GreedySearchParameters::ParseFromInputs(OpKernelContext* context) {
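The -1 default acts as a sentinel: per the ContribOperators.md entry above, a missing vocab_size is later inferred from the decoder subgraph's output shape. A hypothetical sketch of that fallback (variable names assumed, not from this commit):

// decoder_logits_shape is (batch_size, sequence_length, vocab_size) per the docs above.
if (vocab_size == -1) {
  vocab_size = static_cast<int>(decoder_logits_shape[2]);  // hypothetical shape variable
}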
@@ -244,7 +244,7 @@ Status ProcessLogits(const OrtValue& logits, //
// NOTE: `padded_vocab_size` MAY be different from `vocab_size`.
// But the following implementation should work correctly if they are the same
// or different.
-int padded_vocab_size = static_cast<int>(logits_shape[2]);
+auto padded_vocab_size = static_cast<int>(logits_shape[2]);

cudaStream_t cuda_stream = reinterpret_cast<cudaStream_t>(stream);

@@ -475,12 +475,17 @@ Status GreedySearchProcessLogits(
typedef typename ToCudaType<T>::MappedType CudaT;
const CudaT* logits_data = reinterpret_cast<const CudaT*>(logits.Get<Tensor>().Data<T>());

-// Logits has shape (batch_size, input_length, vocab_size),
+// Logits has shape (batch_size, input_length, padded_vocab_size),
// where input_length equals to parameters_->sequence_length for first subgraph call, and 1 for the remaining calls.
const TensorShape& logits_shape = logits.Get<Tensor>().Shape();
ORT_ENFORCE(logits_shape.NumDimensions() == 3);
auto input_length = logits_shape[1];

+// NOTE: `padded_vocab_size` MAY be different from `vocab_size`.
+// But the following implementation should work correctly if they are the same
+// or different.
+auto padded_vocab_size = static_cast<int>(logits_shape[2]);

cudaStream_t cuda_stream = reinterpret_cast<cudaStream_t>(stream);

// Get logits for the last token:
@@ -489,13 +494,18 @@
gsl::span<T>& next_token_scores = greedy_state->next_token_scores;

// TODO(tianleiwu): use one kernel to replace a loop of memory copy.
-const CudaT* current_logits = logits_data + (input_length - 1) * vocab_size;
+// Move the pointer in increments of padded_vocab_size to account for any
+// padding in the logits weights of the MatMul.
+const CudaT* current_logits = logits_data + (input_length - 1) * padded_vocab_size;
for (int i = 0; i < batch_beam_size; i++) {
+// We only copy what is relevant (i.e., vocab_size entries), as padded_vocab_size
+// may include logits for "padded" vocabulary slots, which we ignore for token
+// generation.
gsl::span<const T> source(reinterpret_cast<const T*>(current_logits), vocab_size);
gsl::span<T> target = next_token_scores.subspan(i * vocab_size, vocab_size);
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(target.data(), source.data(), sizeof(T) * vocab_size,
cudaMemcpyDeviceToDevice, cuda_stream));
-current_logits += input_length * vocab_size;
+current_logits += input_length * padded_vocab_size;
}

#ifdef DEBUG_GENERATION
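A CPU-side sketch of the copy pattern above (hypothetical helper; the real code issues one cudaMemcpyAsync per batch entry): advance the read pointer by padded_vocab_size, but copy and index the scores by vocab_size.

#include <cstddef>

template <typename T>
void CopyLastTokenScores(const T* logits, T* next_token_scores,
                         int batch_beam_size, int input_length,
                         int vocab_size, int padded_vocab_size) {
  // Start at the last position of the first sequence.
  const T* current = logits + static_cast<std::ptrdiff_t>(input_length - 1) * padded_vocab_size;
  for (int i = 0; i < batch_beam_size; ++i) {
    // Only the first vocab_size entries are real logits; the padded tail is ignored.
    for (int j = 0; j < vocab_size; ++j) {
      next_token_scores[static_cast<std::ptrdiff_t>(i) * vocab_size + j] = current[j];
    }
    current += static_cast<std::ptrdiff_t>(input_length) * padded_vocab_size;
  }
}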
50 changes: 31 additions & 19 deletions onnxruntime/contrib_ops/rocm/bert/fast_gelu_tunable_op.h
@@ -11,6 +11,9 @@
#include "core/providers/rocm/cu_inc/common.cuh"
#include "contrib_ops/rocm/bert/fast_gelu_impl_kernel.h"

+using onnxruntime::rocm::CeilDiv;
+using onnxruntime::rocm::GPU_WARP_SIZE;

namespace onnxruntime {
namespace contrib {
namespace rocm {
@@ -33,19 +36,28 @@ struct FastGeluParams : onnxruntime::rocm::tunable::OpParams {
};

template <typename T, int ThreadsPerBlock, int VecSize>
-Status FastGeluOp(const FastGeluParams<T>* params) {
-  // TODO(anyone): Add tail handling for FastGelu
-  TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF(
-      !((params->bias_length > 0 && params->bias_length % VecSize == 0 && params->input_length % VecSize == 0) ||
-        (params->bias_length == 0 && params->input_length % VecSize == 0)));
-
-  FastGeluKernelVec<T, ThreadsPerBlock, VecSize>
-      <<<dim3(onnxruntime::rocm::CeilDiv(params->input_length, ThreadsPerBlock * VecSize)),
-         dim3(ThreadsPerBlock),
-         0, params->stream>>>(
-          params->input_length, params->bias_length, params->input, params->bias, params->output);
-  return HIP_CALL(hipGetLastError());
-}
+class FastGeluOp {
+ public:
+  Status operator()(const FastGeluParams<T>* params) {
+    FastGeluKernelVec<T, ThreadsPerBlock, VecSize>
+        <<<dim3(CeilDiv(params->input_length, ThreadsPerBlock * VecSize)),
+           dim3(ThreadsPerBlock),
+           0, params->stream>>>(
+            params->input_length, params->bias_length, params->input, params->bias, params->output);
+    return HIP_CALL(hipGetLastError());
+  }
+
+  Status IsSupported(const FastGeluParams<T>* params) {
+    // TODO(anyone): Add tail handling for FastGelu
+    TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF(
+        !((params->bias_length > 0 && params->bias_length % VecSize == 0 && params->input_length % VecSize == 0) ||
+          (params->bias_length == 0 && params->input_length % VecSize == 0)));
+    // Avoid redundant configurations
+    TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF(!(params->input_length > (ThreadsPerBlock - GPU_WARP_SIZE) * VecSize));
+
+    return Status::OK();
+  }
+};

template <typename T>
Status FastGeluStaticSelection(const FastGeluParams<T>* params) {
@@ -99,12 +111,12 @@ Status FastGeluStaticSelection(const FastGeluParams<half>* params) {
return HIP_CALL(hipGetLastError());
}

-#define ADD_OP(threads_per_block) \
-  this->ops_.emplace_back(FastGeluOp<T, threads_per_block, 1>); \
-  this->ops_.emplace_back(FastGeluOp<T, threads_per_block, 2>); \
-  this->ops_.emplace_back(FastGeluOp<T, threads_per_block, 4>); \
-  this->ops_.emplace_back(FastGeluOp<T, threads_per_block, 8>); \
-  this->ops_.emplace_back(FastGeluOp<T, threads_per_block, 16>);
+#define ADD_OP(threads_per_block)                                   \
+  this->ops_.emplace_back(FastGeluOp<T, threads_per_block, 1>{});   \
+  this->ops_.emplace_back(FastGeluOp<T, threads_per_block, 2>{});   \
+  this->ops_.emplace_back(FastGeluOp<T, threads_per_block, 4>{});   \
+  this->ops_.emplace_back(FastGeluOp<T, threads_per_block, 8>{});   \
+  this->ops_.emplace_back(FastGeluOp<T, threads_per_block, 16>{});

template <typename T>
class FastGeluTunableOp : public onnxruntime::rocm::tunable::TunableOp<FastGeluParams<T>> {
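The refactor above splits each candidate kernel into a callable (operator()) and a feasibility check (IsSupported), so the tuning framework can reject configurations before timing them. A simplified sketch of that selection idea, not the actual TunableOp implementation:

#include <vector>

// Assumes OpT exposes IsSupported(params) returning an ORT Status, plus
// operator()(params) to launch. Returns the index of the first candidate
// whose IsSupported() passes, or -1 if none fits these shapes.
template <typename ParamsT, typename OpT>
int PickFirstSupported(std::vector<OpT>& ops, const ParamsT* params) {
  for (size_t i = 0; i < ops.size(); ++i) {
    if (ops[i].IsSupported(params).IsOK()) {
      return static_cast<int>(i);  // a real tuner times all supported candidates
    }
  }
  return -1;
}

The new GPU_WARP_SIZE condition drops launch configurations whose block is guaranteed to be partially idle for the given input length, since a candidate with a smaller ThreadsPerBlock already covers those sizes.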
6 changes: 3 additions & 3 deletions onnxruntime/core/framework/allocation_planner.cc
@@ -363,7 +363,7 @@ class PlannerImpl {
}
}

-#ifdef ENABLE_TRAINING
+#ifdef ENABLE_STRIDED_TENSORS
// If any output of the kernel can support strided tensor, and all its consumers' inputs also support
// strided tensors at the corresponding position, this output will generate a strided tensor
// and share the data from the corresponding input specified in MayStridedOutputsMap.
@@ -1018,11 +1018,11 @@ class PlannerImpl {
// and optional types if the kernel has marked certain inputs as
// possible candidates for re-use
Reuse(reused, current, AllocKind::kReuse);
-#ifdef ENABLE_TRAINING
+#ifdef ENABLE_STRIDED_TENSORS
if (is_strided_tensor) AllocPlan(current).is_strided_tensor = true;
#else
ORT_ENFORCE(!is_strided_tensor, "Strided tensor is not supported in non-training build for now.");
-#endif // ENABLE_TRAINING
+#endif // ENABLE_STRIDED_TENSORS
#if !defined(ORT_MINIMAL_BUILD) && defined(ORT_MEMORY_PROFILE)
InplaceReuse(reused, current);
#endif
6 changes: 3 additions & 3 deletions onnxruntime/core/framework/data_transfer.cc
@@ -7,7 +7,7 @@
#include "core/framework/sparse_tensor.h"
#endif
#include "core/framework/ortdevice.h"
-#ifdef ENABLE_TRAINING
+#ifdef ENABLE_STRIDED_TENSORS
#include "core/framework/copy.h"
#include "core/session/environment.h"
#include "core/common/logging/logging.h"
@@ -50,7 +50,7 @@ common::Status CPUDataTransfer::CopyTensor(const Tensor& src, Tensor& dst, int /
return Status::OK();
}

-#ifdef ENABLE_TRAINING
+#ifdef ENABLE_STRIDED_TENSORS
if (!src.IsContiguous() || !dst.IsContiguous()) {
auto dst_stride_vec = dst.Strides();
auto src_stride_vec = src.Strides();
@@ -71,7 +71,7 @@ }
}

return Status::OK();
-#ifdef ENABLE_TRAINING
+#ifdef ENABLE_STRIDED_TENSORS
}
#endif
}
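When src or dst is non-contiguous, a flat memcpy is no longer valid; each element must be located through its strides. A minimal 2-D sketch of such a strided copy (illustration only, not ORT's actual StridedCopy):

#include <cstdint>
#include <vector>

template <typename T>
void StridedCopy2D(const T* src, const std::vector<int64_t>& src_strides,
                   T* dst, const std::vector<int64_t>& dst_strides,
                   int64_t rows, int64_t cols) {
  for (int64_t r = 0; r < rows; ++r) {
    for (int64_t c = 0; c < cols; ++c) {
      // Offsets are in elements: index i in dimension d contributes i * strides[d].
      dst[r * dst_strides[0] + c * dst_strides[1]] =
          src[r * src_strides[0] + c * src_strides[1]];
    }
  }
}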
8 changes: 4 additions & 4 deletions onnxruntime/core/framework/execution_frame.cc
@@ -580,9 +580,9 @@ Status ExecutionFrame::AllocateMLValueTensorPreAllocateBuffer(OrtValue& ort_valu

// Training starts to support strided tensor that the shape size may be larger (like Expand), smaller (like Split) or
// equal (like Transpose) to the shared tensor's shape size, so below check is no longer valid.
-#ifndef ENABLE_TRAINING
+#ifndef ENABLE_STRIDED_TENSORS
ORT_ENFORCE(!is_strided_tensor);
-#endif // ENABLE_TRAINING
+#endif // ENABLE_STRIDED_TENSORS
if (!is_strided_tensor) {
auto buffer_num_elements = reuse_tensor->Shape().Size();
auto required_num_elements = shape.Size();
@@ -733,9 +733,9 @@ Status ExecutionFrame::AllocateAsPerAllocationPlan(OrtValue& ort_value, int ort_
ORT_RETURN_IF_ERROR(AllocateReusedOrtValueIfNotAllocatedHelper(reuse_mlvalue_index, shape));

bool is_strided_tensor = false;
-#ifdef ENABLE_TRAINING
+#ifdef ENABLE_STRIDED_TENSORS
is_strided_tensor = per_alloc_plan.is_strided_tensor;
-#endif // ENABLE_TRAINING
+#endif // ENABLE_STRIDED_TENSORS
ORT_RETURN_IF_ERROR(
AllocateMLValueTensorPreAllocateBuffer(ort_value, reuse_mlvalue_index, ml_data_type, alloc_info, *shape,
per_alloc_plan.create_fence_if_async, is_strided_tensor));
2 changes: 1 addition & 1 deletion onnxruntime/core/framework/kernel_def_builder.cc
@@ -179,7 +179,7 @@ KernelDefBuilder& KernelDefBuilder::VariadicAlias(int input_offset, int output_o
return *this;
}

-#ifdef ENABLE_TRAINING
+#ifdef ENABLE_STRIDED_TENSORS
KernelDefBuilder& KernelDefBuilder::MayStridedInput(int input_index) {
kernel_def_->may_strided_inputs_.emplace_back(input_index);
return *this;
2 changes: 1 addition & 1 deletion onnxruntime/core/framework/sequential_execution_plan.h
@@ -41,7 +41,7 @@ struct AllocPlanPerValue {
IntervalT allocate_interval{0, 0};
OrtValueIndex inplace_reuse{-1}; //No in-place reuse
#endif
-#ifdef ENABLE_TRAINING
+#ifdef ENABLE_STRIDED_TENSORS
// is_strided_tensor indicates if this OrtValue is strided tensor.
// If alloc_kind is kReuse, it reuses one of the node inputs (like Expand),
// if alloc_kind is kAllocate, it will only allocate required buffer size (like ConstantOfShape).
