
Commit

Merge branch 'main' of github.com:microsoft/onnxruntime into user/pavignol/add-embed-layer-norm
PatriceVignola committed Dec 8, 2022
2 parents 254adc9 + c1cc1d5 commit 06da5ab
Showing 59 changed files with 585 additions and 730 deletions.
2 changes: 1 addition & 1 deletion cgmanifests/generated/cgmanifest.json
@@ -292,7 +292,7 @@
      "component": {
        "type": "git",
        "git": {
-         "commitHash": "8de7772cc72daca8e947b79b83fea46214931604",
+         "commitHash": "80dc998efced8ceb2be59756668a7e90e8bef917",
          "repositoryUrl": "https://github.com/pybind/pybind11.git"
        },
        "comments": "pybind11"
1 change: 1 addition & 0 deletions cmake/CMakeLists.txt
@@ -1224,6 +1224,7 @@ endif()
if (onnxruntime_ENABLE_TRAINING)
add_compile_definitions(ENABLE_TRAINING)
add_compile_definitions(ENABLE_TRAINING_OPS)
+add_compile_definitions(ENABLE_STRIDED_TENSORS)

if (UNIX)
if (EXISTS "${onnxruntime_MPI_HOME}")
2 changes: 1 addition & 1 deletion cmake/deps.txt
@@ -28,7 +28,7 @@ onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/87c7a70688fd98fb355b
protobuf;https://github.com/protocolbuffers/protobuf/archive/refs/tags/v3.18.3.zip;b95bf7e9de9c2249b6c1f2ca556ace49999e90bd
psimd;https://github.com/Maratyszcza/psimd/archive/072586a71b55b7f8c584153d223e95687148a900.zip;1f5454b01f06f9656b77e4a5e2e31d7422487013
pthreadpool;https://github.com/Maratyszcza/pthreadpool/archive/1787867f6183f056420e532eec640cba25efafea.zip;e43e80781560c5ab404a4da20f34d846f5f5d101
-pybind11;https://github.com/pybind/pybind11/archive/refs/tags/v2.6.2.zip;950b3b319384a1a36b252cc821953e5f9be14840
+pybind11;https://github.com/pybind/pybind11/archive/refs/tags/v2.10.1.zip;769b6aa67a77f17a770960f604b727645b6f6a13
pytorch_cpuinfo;https://github.com/pytorch/cpuinfo/archive/5916273f79a21551890fd3d56fc5375a78d1598d.zip;2be4d2ae321fada97cb39eaf4eeba5f8c85597cf
re2;https://github.com/google/re2/archive/refs/tags/2022-06-01.zip;aa77313b76e91b531ee7f3e45f004c6a502a5374
safeint;https://github.com/dcleblanc/SafeInt/archive/ff15c6ada150a5018c5ef2172401cb4529eac9c0.zip;913a4046e5274d329af2806cb53194f617d8c0ab
4 changes: 4 additions & 0 deletions cmake/onnxruntime_session.cmake
@@ -66,3 +66,7 @@ if (NOT onnxruntime_BUILD_SHARED_LIB)
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
FRAMEWORK DESTINATION ${CMAKE_INSTALL_BINDIR})
endif()

+if (onnxruntime_USE_NCCL AND onnxruntime_USE_ROCM)
+  add_dependencies(onnxruntime_session generate_hipified_files)
+endif()
@@ -34,7 +34,7 @@ if [ $RunTestCsharp = "true" ]; then

if [ $PACKAGENAME = "Microsoft.ML.OnnxRuntime.Gpu" ]; then
export TESTONGPU=ON
-dotnet test -p:DefineConstants=USE_CUDA $BUILD_SOURCESDIRECTORY/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/Microsoft.ML.OnnxRuntime.EndToEndTests.csproj --no-restore --verbosity detailed --filter "DisplayName!=TestCUDAProviderOptions"
+dotnet test -p:DefineConstants=USE_CUDA $BUILD_SOURCESDIRECTORY/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/Microsoft.ML.OnnxRuntime.EndToEndTests.csproj --no-restore --verbosity detailed
if [ $? -ne 0 ]; then
echo "Failed to build or execute the end-to-end test"
exit 1
2 changes: 2 additions & 0 deletions docs/ContribOperators.md
@@ -1708,6 +1708,8 @@ This version of the operator has been available since version 1 of the 'com.micr
<dd>no repeat ngrams size</dd>
<dt><tt>pad_token_id</tt> : int (required)</dt>
<dd>The id of the padding token</dd>
+<dt><tt>vocab_size</tt> : int</dt>
+<dd>Size of the vocabulary. If not provided, it will be inferred from the decoder subgraph's output shape</dd>
</dl>

#### Inputs (2 - 7)
1 change: 1 addition & 0 deletions docs/OperatorKernels.md
@@ -1129,6 +1129,7 @@ Do not modify directly.*
|QLinearAdd|*in* A:**T**<br> *in* A_scale:**tensor(float)**<br> *in* A_zero_point:**T**<br> *in* B:**T**<br> *in* B_scale:**tensor(float)**<br> *in* B_zero_point:**T**<br> *in* C_scale:**tensor(float)**<br> *in* C_zero_point:**T**<br> *out* C:**T**|1+|**T** = tensor(int8), tensor(uint8)|
|QLinearSigmoid|*in* X:**T**<br> *in* X_scale:**tensor(float)**<br> *in* X_zero_point:**T**<br> *in* Y_scale:**tensor(float)**<br> *in* Y_zero_point:**T**<br> *out* Y:**T**|1+|**T** = tensor(int8), tensor(uint8)|
|QuantizeLinear|*in* x:**T1**<br> *in* y_scale:**T1**<br> *in* y_zero_point:**T2**<br> *out* y:**T2**|1+|**T1** = tensor(float)<br/> **T2** = tensor(uint8)|
+|SkipLayerNormalization|*in* input:**T**<br> *in* skip:**T**<br> *in* gamma:**T**<br> *in* beta:**T**<br> *in* bias:**T**<br> *out* output:**T**<br> *out* mean:**U**<br> *out* inv_std_var:**U**|1+|**T** = tensor(float), tensor(float16)|
| |
| |
|**Operator Domain:** *com.microsoft.dml*||||
6 changes: 3 additions & 3 deletions include/onnxruntime/core/framework/kernel_def_builder.h
@@ -85,7 +85,7 @@ class KernelDef {

bool HasExternalOutputs() const { return external_outputs_; }

-#ifdef ENABLE_TRAINING
+#ifdef ENABLE_STRIDED_TENSORS
const std::vector<int>& MayStridedInput() const { return may_strided_inputs_; }
const std::vector<std::pair<int, int>>& MayStridedOutput() const { return may_strided_output_map_; }
#endif
@@ -143,7 +143,7 @@ class KernelDef {
// Whether the outputs are from external.
bool external_outputs_ = false;

-#ifdef ENABLE_TRAINING
+#ifdef ENABLE_STRIDED_TENSORS
// An element i means i-th input can be strided tensor.
std::vector<int> may_strided_inputs_;

@@ -261,7 +261,7 @@ class KernelDefBuilder {
return *this;
}

-#ifdef ENABLE_TRAINING
+#ifdef ENABLE_STRIDED_TENSORS
/**
Specify that the input_index-th input can be strided tensor.
*/
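For context, a minimal sketch of how a kernel opts in to strided tensors through this builder API. The kernel name "MyExpand" is hypothetical, not from this commit; MayStridedInput/MayStridedOutput are the methods whose guard this diff changes:

#ifdef ENABLE_STRIDED_TENSORS
// Hypothetical registration: input 0 may arrive as a strided tensor, and
// output 0 may be returned as a strided view sharing data with input 0
// (the pair recorded in may_strided_output_map_).
KernelDefBuilder builder;
builder.SetName("MyExpand")
    .MayStridedInput(0)
    .MayStridedOutput(/*input_index=*/0, /*output_index=*/0);
#endif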
6 changes: 3 additions & 3 deletions include/onnxruntime/core/framework/tensor.h
@@ -248,7 +248,7 @@ class Tensor final {
*/
size_t SizeInBytes() const;

-#ifdef ENABLE_TRAINING
+#ifdef ENABLE_STRIDED_TENSORS
/**
* Get the strides of the tensor.
*/
@@ -276,7 +276,7 @@

void ReleaseBuffer();

-#ifdef ENABLE_TRAINING
+#ifdef ENABLE_STRIDED_TENSORS
bool CheckIsContiguous() const;
#endif

@@ -289,7 +289,7 @@
AllocatorPtr buffer_deleter_;

TensorShape shape_;
-#ifdef ENABLE_TRAINING
+#ifdef ENABLE_STRIDED_TENSORS
mutable TensorShapeVector strides_;
bool is_contiguous_ = true;
#endif
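As background for CheckIsContiguous() and is_contiguous_ above, a standalone sketch of the usual contiguity rule (an assumption about the semantics, not code from this commit): a tensor is contiguous when its strides match the canonical row-major strides derived from its shape.

#include <cstdint>
#include <vector>

// Returns true if `strides` are the canonical row-major strides for `shape`.
// Dimensions of size 1 are skipped: their stride never affects indexing.
bool IsContiguous(const std::vector<int64_t>& shape, const std::vector<int64_t>& strides) {
  int64_t expected = 1;
  for (size_t i = shape.size(); i-- > 0;) {
    if (shape[i] != 1 && strides[i] != expected) return false;
    expected *= shape[i];
  }
  return true;
}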
33 changes: 27 additions & 6 deletions js/web/package-lock.json

Some generated files are not rendered by default.

@@ -14,6 +14,7 @@ void GreedySearchParameters::ParseFromAttributes(const OpKernelInfo& info) {
pad_token_id = static_cast<int>(info.GetAttrOrDefault<int64_t>("pad_token_id", -1));
decoder_start_token_id = static_cast<int>(info.GetAttrOrDefault<int64_t>("decoder_start_token_id", -1));
no_repeat_ngram_size = static_cast<int>(info.GetAttrOrDefault<int64_t>("no_repeat_ngram_size", 0));
+vocab_size = static_cast<int>(info.GetAttrOrDefault<int64_t>("vocab_size", -1));
}

void GreedySearchParameters::ParseFromInputs(OpKernelContext* context) {
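The -1 default acts as a sentinel: per the ContribOperators.md entry above, a missing vocab_size is later inferred from the decoder subgraph's output shape. A hypothetical sketch of that fallback (variable names assumed, not from this commit):

// decoder_logits_shape is (batch_size, sequence_length, vocab_size) per the docs above.
if (vocab_size == -1) {
  vocab_size = static_cast<int>(decoder_logits_shape[2]);  // hypothetical shape variable
}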
@@ -244,7 +244,7 @@ Status ProcessLogits(const OrtValue& logits, //
// NOTE: `padded_vocab_size` MAY be different from `vocab_size`.
// But the following implementation should work correctly if they are the same
// or different.
-int padded_vocab_size = static_cast<int>(logits_shape[2]);
+auto padded_vocab_size = static_cast<int>(logits_shape[2]);

cudaStream_t cuda_stream = reinterpret_cast<cudaStream_t>(stream);

@@ -475,12 +475,17 @@ Status GreedySearchProcessLogits(
typedef typename ToCudaType<T>::MappedType CudaT;
const CudaT* logits_data = reinterpret_cast<const CudaT*>(logits.Get<Tensor>().Data<T>());

-// Logits has shape (batch_size, input_length, vocab_size),
+// Logits has shape (batch_size, input_length, padded_vocab_size),
// where input_length equals to parameters_->sequence_length for first subgraph call, and 1 for the remaining calls.
const TensorShape& logits_shape = logits.Get<Tensor>().Shape();
ORT_ENFORCE(logits_shape.NumDimensions() == 3);
auto input_length = logits_shape[1];

+// NOTE: `padded_vocab_size` MAY be different from `vocab_size`.
+// But the following implementation should work correctly if they are the same
+// or different.
+auto padded_vocab_size = static_cast<int>(logits_shape[2]);

cudaStream_t cuda_stream = reinterpret_cast<cudaStream_t>(stream);

// Get logits for the last token:
@@ -489,13 +494,18 @@
gsl::span<T>& next_token_scores = greedy_state->next_token_scores;

// TODO(tianleiwu): use one kernel to replace a loop of memory copy.
-const CudaT* current_logits = logits_data + (input_length - 1) * vocab_size;
+// Move the pointer in increments of padded_vocab_size to account for any
+// padding in the logits weights of the MatMul.
+const CudaT* current_logits = logits_data + (input_length - 1) * padded_vocab_size;
for (int i = 0; i < batch_beam_size; i++) {
+// We only copy what is relevant (i.e., vocab_size entries), as padded_vocab_size
+// may include logits for "padded" vocabulary slots, which we ignore for token
+// generation.
gsl::span<const T> source(reinterpret_cast<const T*>(current_logits), vocab_size);
gsl::span<T> target = next_token_scores.subspan(i * vocab_size, vocab_size);
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(target.data(), source.data(), sizeof(T) * vocab_size,
cudaMemcpyDeviceToDevice, cuda_stream));
-current_logits += input_length * vocab_size;
+current_logits += input_length * padded_vocab_size;
}

#ifdef DEBUG_GENERATION
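A CPU-side sketch of the copy pattern above (hypothetical helper; the real code issues one cudaMemcpyAsync per batch entry): advance the read pointer by padded_vocab_size, but copy and index the scores by vocab_size.

#include <cstddef>

template <typename T>
void CopyLastTokenScores(const T* logits, T* next_token_scores,
                         int batch_beam_size, int input_length,
                         int vocab_size, int padded_vocab_size) {
  // Start at the last position of the first sequence.
  const T* current = logits + static_cast<std::ptrdiff_t>(input_length - 1) * padded_vocab_size;
  for (int i = 0; i < batch_beam_size; ++i) {
    // Only the first vocab_size entries are real logits; the padded tail is ignored.
    for (int j = 0; j < vocab_size; ++j) {
      next_token_scores[static_cast<std::ptrdiff_t>(i) * vocab_size + j] = current[j];
    }
    current += static_cast<std::ptrdiff_t>(input_length) * padded_vocab_size;
  }
}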
50 changes: 31 additions & 19 deletions onnxruntime/contrib_ops/rocm/bert/fast_gelu_tunable_op.h
@@ -11,6 +11,9 @@
#include "core/providers/rocm/cu_inc/common.cuh"
#include "contrib_ops/rocm/bert/fast_gelu_impl_kernel.h"

+using onnxruntime::rocm::CeilDiv;
+using onnxruntime::rocm::GPU_WARP_SIZE;

namespace onnxruntime {
namespace contrib {
namespace rocm {
@@ -33,19 +36,28 @@ struct FastGeluParams : onnxruntime::rocm::tunable::OpParams {
};

template <typename T, int ThreadsPerBlock, int VecSize>
-Status FastGeluOp(const FastGeluParams<T>* params) {
-  // TODO(anyone): Add tail handling for FastGelu
-  TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF(
-      !((params->bias_length > 0 && params->bias_length % VecSize == 0 && params->input_length % VecSize == 0) ||
-        (params->bias_length == 0 && params->input_length % VecSize == 0)));
-
-  FastGeluKernelVec<T, ThreadsPerBlock, VecSize>
-      <<<dim3(onnxruntime::rocm::CeilDiv(params->input_length, ThreadsPerBlock * VecSize)),
-         dim3(ThreadsPerBlock),
-         0, params->stream>>>(
-          params->input_length, params->bias_length, params->input, params->bias, params->output);
-  return HIP_CALL(hipGetLastError());
-}
+class FastGeluOp {
+ public:
+  Status operator()(const FastGeluParams<T>* params) {
+    FastGeluKernelVec<T, ThreadsPerBlock, VecSize>
+        <<<dim3(CeilDiv(params->input_length, ThreadsPerBlock * VecSize)),
+           dim3(ThreadsPerBlock),
+           0, params->stream>>>(
+            params->input_length, params->bias_length, params->input, params->bias, params->output);
+    return HIP_CALL(hipGetLastError());
+  }
+
+  Status IsSupported(const FastGeluParams<T>* params) {
+    // TODO(anyone): Add tail handling for FastGelu
+    TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF(
+        !((params->bias_length > 0 && params->bias_length % VecSize == 0 && params->input_length % VecSize == 0) ||
+          (params->bias_length == 0 && params->input_length % VecSize == 0)));
+    // Avoid redundant configurations
+    TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF(!(params->input_length > (ThreadsPerBlock - GPU_WARP_SIZE) * VecSize));
+
+    return Status::OK();
+  }
+};

template <typename T>
Status FastGeluStaticSelection(const FastGeluParams<T>* params) {
@@ -99,12 +111,12 @@ Status FastGeluStaticSelection(const FastGeluParams<half>* params) {
return HIP_CALL(hipGetLastError());
}

-#define ADD_OP(threads_per_block) \
-  this->ops_.emplace_back(FastGeluOp<T, threads_per_block, 1>); \
-  this->ops_.emplace_back(FastGeluOp<T, threads_per_block, 2>); \
-  this->ops_.emplace_back(FastGeluOp<T, threads_per_block, 4>); \
-  this->ops_.emplace_back(FastGeluOp<T, threads_per_block, 8>); \
-  this->ops_.emplace_back(FastGeluOp<T, threads_per_block, 16>);
+#define ADD_OP(threads_per_block)                                   \
+  this->ops_.emplace_back(FastGeluOp<T, threads_per_block, 1>{});   \
+  this->ops_.emplace_back(FastGeluOp<T, threads_per_block, 2>{});   \
+  this->ops_.emplace_back(FastGeluOp<T, threads_per_block, 4>{});   \
+  this->ops_.emplace_back(FastGeluOp<T, threads_per_block, 8>{});   \
+  this->ops_.emplace_back(FastGeluOp<T, threads_per_block, 16>{});

template <typename T>
class FastGeluTunableOp : public onnxruntime::rocm::tunable::TunableOp<FastGeluParams<T>> {
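The refactor above splits each candidate kernel into a callable (operator()) and a feasibility check (IsSupported), so the tuning framework can reject configurations before timing them. A simplified sketch of that selection idea, not the actual TunableOp implementation:

#include <vector>

// Assumes OpT exposes IsSupported(params) returning an ORT Status, plus
// operator()(params) to launch. Returns the index of the first candidate
// whose IsSupported() passes, or -1 if none fits these shapes.
template <typename ParamsT, typename OpT>
int PickFirstSupported(std::vector<OpT>& ops, const ParamsT* params) {
  for (size_t i = 0; i < ops.size(); ++i) {
    if (ops[i].IsSupported(params).IsOK()) {
      return static_cast<int>(i);  // a real tuner times all supported candidates
    }
  }
  return -1;
}

The new GPU_WARP_SIZE condition drops launch configurations whose block is guaranteed to be partially idle for the given input length, since a candidate with a smaller ThreadsPerBlock already covers those sizes.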
6 changes: 3 additions & 3 deletions onnxruntime/core/framework/allocation_planner.cc
@@ -363,7 +363,7 @@ class PlannerImpl {
}
}

-#ifdef ENABLE_TRAINING
+#ifdef ENABLE_STRIDED_TENSORS
// If any output of the kernel can support strided tensor, and all its consumers' inputs also support
// strided tensors at the corresponding position, this output will generate a strided tensor
// and share the data from the corresponding input specified in MayStridedOutputsMap.
@@ -1018,11 +1018,11 @@ class PlannerImpl {
// and optional types if the kernel has marked certain inputs as
// possible candidates for re-use
Reuse(reused, current, AllocKind::kReuse);
-#ifdef ENABLE_TRAINING
+#ifdef ENABLE_STRIDED_TENSORS
if (is_strided_tensor) AllocPlan(current).is_strided_tensor = true;
#else
ORT_ENFORCE(!is_strided_tensor, "Strided tensor is not supported in non-training build for now.");
-#endif // ENABLE_TRAINING
+#endif // ENABLE_STRIDED_TENSORS
#if !defined(ORT_MINIMAL_BUILD) && defined(ORT_MEMORY_PROFILE)
InplaceReuse(reused, current);
#endif
6 changes: 3 additions & 3 deletions onnxruntime/core/framework/data_transfer.cc
@@ -7,7 +7,7 @@
#include "core/framework/sparse_tensor.h"
#endif
#include "core/framework/ortdevice.h"
-#ifdef ENABLE_TRAINING
+#ifdef ENABLE_STRIDED_TENSORS
#include "core/framework/copy.h"
#include "core/session/environment.h"
#include "core/common/logging/logging.h"
@@ -50,7 +50,7 @@ common::Status CPUDataTransfer::CopyTensor(const Tensor& src, Tensor& dst, int /
return Status::OK();
}

-#ifdef ENABLE_TRAINING
+#ifdef ENABLE_STRIDED_TENSORS
if (!src.IsContiguous() || !dst.IsContiguous()) {
auto dst_stride_vec = dst.Strides();
auto src_stride_vec = src.Strides();
@@ -71,7 +71,7 @@ }
}

return Status::OK();
-#ifdef ENABLE_TRAINING
+#ifdef ENABLE_STRIDED_TENSORS
}
#endif
}
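When src or dst is non-contiguous, a flat memcpy is no longer valid; each element must be located through its strides. A minimal 2-D sketch of such a strided copy (illustration only, not ORT's actual StridedCopy):

#include <cstdint>
#include <vector>

template <typename T>
void StridedCopy2D(const T* src, const std::vector<int64_t>& src_strides,
                   T* dst, const std::vector<int64_t>& dst_strides,
                   int64_t rows, int64_t cols) {
  for (int64_t r = 0; r < rows; ++r) {
    for (int64_t c = 0; c < cols; ++c) {
      // Offsets are in elements: index i in dimension d contributes i * strides[d].
      dst[r * dst_strides[0] + c * dst_strides[1]] =
          src[r * src_strides[0] + c * src_strides[1]];
    }
  }
}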
8 changes: 4 additions & 4 deletions onnxruntime/core/framework/execution_frame.cc
@@ -580,9 +580,9 @@ Status ExecutionFrame::AllocateMLValueTensorPreAllocateBuffer(OrtValue& ort_valu

// Training starts to support strided tensor that the shape size may be larger (like Expand), smaller (like Split) or
// equal (like Transpose) to the shared tensor's shape size, so below check is no longer valid.
-#ifndef ENABLE_TRAINING
+#ifndef ENABLE_STRIDED_TENSORS
ORT_ENFORCE(!is_strided_tensor);
-#endif // ENABLE_TRAINING
+#endif // ENABLE_STRIDED_TENSORS
if (!is_strided_tensor) {
auto buffer_num_elements = reuse_tensor->Shape().Size();
auto required_num_elements = shape.Size();
@@ -733,9 +733,9 @@ Status ExecutionFrame::AllocateAsPerAllocationPlan(OrtValue& ort_value, int ort_
ORT_RETURN_IF_ERROR(AllocateReusedOrtValueIfNotAllocatedHelper(reuse_mlvalue_index, shape));

bool is_strided_tensor = false;
-#ifdef ENABLE_TRAINING
+#ifdef ENABLE_STRIDED_TENSORS
is_strided_tensor = per_alloc_plan.is_strided_tensor;
-#endif // ENABLE_TRAINING
+#endif // ENABLE_STRIDED_TENSORS
ORT_RETURN_IF_ERROR(
AllocateMLValueTensorPreAllocateBuffer(ort_value, reuse_mlvalue_index, ml_data_type, alloc_info, *shape,
per_alloc_plan.create_fence_if_async, is_strided_tensor));
2 changes: 1 addition & 1 deletion onnxruntime/core/framework/kernel_def_builder.cc
@@ -179,7 +179,7 @@ KernelDefBuilder& KernelDefBuilder::VariadicAlias(int input_offset, int output_o
return *this;
}

-#ifdef ENABLE_TRAINING
+#ifdef ENABLE_STRIDED_TENSORS
KernelDefBuilder& KernelDefBuilder::MayStridedInput(int input_index) {
kernel_def_->may_strided_inputs_.emplace_back(input_index);
return *this;
2 changes: 1 addition & 1 deletion onnxruntime/core/framework/sequential_execution_plan.h
@@ -41,7 +41,7 @@ struct AllocPlanPerValue {
IntervalT allocate_interval{0, 0};
OrtValueIndex inplace_reuse{-1}; //No in-place reuse
#endif
-#ifdef ENABLE_TRAINING
+#ifdef ENABLE_STRIDED_TENSORS
// is_strided_tensor indicates if this OrtValue is strided tensor.
// If alloc_kind is kReuse, it reuses one of the node inputs (like Expand),
// if alloc_kind is kAllocate, it will only allocate required buffer size (like ConstantOfShape).
