Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions include/onnxruntime/core/framework/tensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,8 @@ class Tensor final {
MLDataType DataType() const { return dtype_; }

/**
Returns the data type enum contant
Returns the data type enum constant
@remarks Use utils::ToTensorProtoElementType<T> for comparison.
*/
int32_t GetElementType() const {
return dtype_->GetDataType();
Expand All @@ -104,8 +105,8 @@ class Tensor final {
}

// Checks if the Tensor contains data type T
template<class T>
bool IsDataType () const {
template <class T>
bool IsDataType() const {
return utils::IsPrimitiveDataType<T>(dtype_);
}

Expand Down
320 changes: 314 additions & 6 deletions onnxruntime/core/providers/cpu/tensor/transpose.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
// Licensed under the MIT License.

#include "core/providers/cpu/tensor/transpose.h"

#include "core/framework/utils.h"
namespace onnxruntime {

/* A permutation [a,b,c,...] indicates that
Expand Down Expand Up @@ -238,6 +238,296 @@ static Status DoUntypedTranspose(const std::vector<size_t>& permutations, const
return Status::OK();
}

/*
Optimizations for moving a single axis either inwards or outwards.

If moving outwards we can use a single reader and multiple writers. The number of writers is equal to the value of
the axis being moved.

e.g. if the input is NHWC with shape {N, 300, 300, 3}, we can transpose to NCHW by reading once and having
one writer for each of the 3 channels at a different offset in the output, updating the offset for each item
in the batch of N.

Similarly if one axis is moving inwards we can use a single writer and multiple readers. The number of readers is equal
to the value of the axis being moved.

e.g. if the input is NCHW with shape {N, 3, 300, 300}, we can transpose to NHWC by writing once using one reader for
each of the 3 channels at a different offset in the input, updating the read offset for each item in the batch
of N.

This can be generalized for any input where only one axis is being moved, with the block size for each read/write
being dependent on which axis is moving, what direction it's moving in, and where it's moving to.

We use simple pointer arithmetic when the size of each read/write is a power of 2 between 8 and 64 bits,
and fall back to memcpy for any other block size.

We fall back to the default implementation in all other cases, and if the input is std::string.
*/

// Moving a single axis outwards where the read/write size is a power of 2 and between 8 and 64 bits.
// Reads the input sequentially and scatters each element to one of `num_writers` output cursors, each
// cursor offset by `writes_per_writer_per_loop` elements. Per loop iteration this is equivalent to
// transposing a (writes_per_writer_per_loop x num_writers) matrix.
template <typename T>
static void SimpleTransposeSingleAxisOutwards(const T* input_data, T* output_data,
                                              int64_t num_loops, int64_t num_writers,
                                              int64_t writes_per_loop, int64_t writes_per_writer_per_loop) {
  std::vector<T*> writers;
  writers.resize(num_writers);

  for (int64_t l = 0; l < num_loops; ++l) {
    // reset each writer to the start of its slice of this loop's output
    for (int64_t w = 0; w < num_writers; ++w) {
      writers[w] = output_data + (w * writes_per_writer_per_loop);
    }

    // int64_t counters throughout: `auto x = 0` deduced `int`, which could overflow (UB) when a
    // bound exceeds INT32_MAX, and was inconsistent with the int64_t inner counter below
    for (int64_t wwpl = 0; wwpl < writes_per_writer_per_loop; ++wwpl) {
      for (int64_t w = 0; w < num_writers; ++w) {
        *(writers[w]++) = *input_data++;
      }
    }

    output_data += writes_per_loop;
  }
}

// Moves a single axis toward the outer (leftmost) dimensions: input dim `from` becomes output dim `to`,
// with to < from. A single sequential reader feeds `num_writers` output cursors.
// Dispatches to SimpleTransposeSingleAxisOutwards when each contiguous block is exactly 1, 2, 4 or
// 8 bytes; any other block size copies one block at a time with memcpy.
// NOTE(review): name keeps the existing 'Tranpose' spelling so callers are unaffected.
static void TranposeSingleAxisOutwards(const std::vector<size_t>& permutations, const Tensor& input, Tensor& output,
                                       int64_t from, int64_t to) {
  ORT_UNUSED_PARAMETER(permutations);

  const auto& input_shape = input.Shape();
  const auto& input_dims = input_shape.GetDims();

  const auto element_size = input.DataType()->Size();

  const auto* input_data = reinterpret_cast<const uint8_t*>(input.DataRaw());
  auto* output_data = reinterpret_cast<uint8_t*>(output.MutableDataRaw());

  auto num_loops = input_shape.SizeToDimension(to);
  auto num_writers = input_dims[from];  // one writer per entry of the moving axis
  auto block_size = input_shape.SizeFromDimension(from + 1);
  auto writes_per_loop = int64_t(input_shape.Size() / num_loops / block_size);
  auto writes_per_writer_per_loop = int64_t(writes_per_loop / num_writers);
  const int64_t bytes_per_write = block_size * element_size;

  switch (bytes_per_write) {
    case (sizeof(uint8_t)): {
      SimpleTransposeSingleAxisOutwards(input_data, output_data,
                                        num_loops, num_writers, writes_per_loop, writes_per_writer_per_loop);
      break;
    }
    case (sizeof(uint16_t)): {
      SimpleTransposeSingleAxisOutwards(reinterpret_cast<const uint16_t*>(input_data),
                                        reinterpret_cast<uint16_t*>(output_data),
                                        num_loops, num_writers, writes_per_loop, writes_per_writer_per_loop);
      break;
    }
    case (sizeof(uint32_t)): {
      SimpleTransposeSingleAxisOutwards(reinterpret_cast<const uint32_t*>(input_data),
                                        reinterpret_cast<uint32_t*>(output_data),
                                        num_loops, num_writers, writes_per_loop, writes_per_writer_per_loop);
      break;
    }
    case (sizeof(uint64_t)): {
      SimpleTransposeSingleAxisOutwards(reinterpret_cast<const uint64_t*>(input_data),
                                        reinterpret_cast<uint64_t*>(output_data),
                                        num_loops, num_writers, writes_per_loop, writes_per_writer_per_loop);
      break;
    }
    default: {
      // arbitrary block size: memcpy each block.
      // int64_t counters: `auto x = 0` deduced `int`, which could overflow for bounds > INT32_MAX
      std::vector<uint8_t*> writers;
      writers.resize(num_writers);

      for (int64_t l = 0; l < num_loops; ++l) {
        for (int64_t w = 0; w < num_writers; ++w) {
          writers[w] = output_data + (w * writes_per_writer_per_loop * bytes_per_write);
        }

        for (int64_t wwpl = 0; wwpl < writes_per_writer_per_loop; ++wwpl) {
          for (int64_t w = 0; w < num_writers; ++w) {
            memcpy(writers[w], input_data, bytes_per_write);
            writers[w] += bytes_per_write;
            input_data += bytes_per_write;
          }
        }

        output_data += writes_per_loop * bytes_per_write;
      }
    }
  }
}

// Moving a single axis inwards where the read/write size is a power of 2 and between 8 and 64 bits.
// Writes the output sequentially by draining `num_readers` input cursors in round-robin order, each
// cursor offset by `reads_per_reader_per_loop` elements. Per loop iteration this is equivalent to
// transposing a (num_readers x reads_per_reader_per_loop) matrix.
template <typename T>
static void SimpleTransposeSingleAxisInwards(const T* input_data, T* output_data,
                                             int64_t num_loops, int64_t num_readers,
                                             int64_t reads_per_loop, int64_t reads_per_reader_per_loop) {
  std::vector<const T*> readers;
  readers.resize(num_readers);

  for (int64_t l = 0; l < num_loops; ++l) {
    // reset each reader to the start of its slice of this loop's input
    for (int64_t r = 0; r < num_readers; ++r) {
      readers[r] = input_data + (r * reads_per_reader_per_loop);
    }

    // int64_t counters throughout: `auto x = 0` deduced `int`, which could overflow (UB) when a
    // bound exceeds INT32_MAX, and was inconsistent with the int64_t inner counter below
    for (int64_t rrpl = 0; rrpl < reads_per_reader_per_loop; ++rrpl) {
      for (int64_t r = 0; r < num_readers; ++r) {
        *output_data++ = *(readers[r]++);
      }
    }

    input_data += reads_per_loop;
  }
}

// Moves a single axis toward the inner (rightmost) dimensions: input dim `from` becomes output dim `to`,
// with from < to. A single sequential writer drains `num_readers` input cursors.
// (Previous comment here wrongly claimed the power-of-2 8-64 bit restriction; that applies only to the
// SimpleTransposeSingleAxisInwards fast path — this dispatcher handles all block sizes via memcpy.)
// NOTE(review): name keeps the existing 'Tranpose' spelling so callers are unaffected.
static void TranposeSingleAxisInwards(const std::vector<size_t>& permutations, const Tensor& input, Tensor& output,
                                      int64_t from, int64_t to) {
  ORT_UNUSED_PARAMETER(permutations);

  const auto& input_shape = input.Shape();
  const auto& input_dims = input_shape.GetDims();

  const auto element_size = input.DataType()->Size();

  const auto* input_data = reinterpret_cast<const uint8_t*>(input.DataRaw());
  auto* output_data = reinterpret_cast<uint8_t*>(output.MutableDataRaw());

  auto num_loops = input_shape.SizeToDimension(from);
  auto num_readers = input_dims[from];  // one reader per entry of the moving axis
  auto block_size = input_shape.SizeFromDimension(to + 1);
  auto reads_per_loop = int64_t(input_shape.Size() / num_loops / block_size);
  auto reads_per_reader_per_loop = int64_t(reads_per_loop / num_readers);
  const int64_t bytes_per_read = block_size * element_size;

  switch (bytes_per_read) {
    case (sizeof(uint8_t)): {
      SimpleTransposeSingleAxisInwards(input_data, output_data,
                                       num_loops, num_readers, reads_per_loop, reads_per_reader_per_loop);
      break;
    }
    case (sizeof(uint16_t)): {
      SimpleTransposeSingleAxisInwards(reinterpret_cast<const uint16_t*>(input_data),
                                       reinterpret_cast<uint16_t*>(output_data),
                                       num_loops, num_readers, reads_per_loop, reads_per_reader_per_loop);
      break;
    }
    case (sizeof(uint32_t)): {
      SimpleTransposeSingleAxisInwards(reinterpret_cast<const uint32_t*>(input_data),
                                       reinterpret_cast<uint32_t*>(output_data),
                                       num_loops, num_readers, reads_per_loop, reads_per_reader_per_loop);
      break;
    }
    case (sizeof(uint64_t)): {
      SimpleTransposeSingleAxisInwards(reinterpret_cast<const uint64_t*>(input_data),
                                       reinterpret_cast<uint64_t*>(output_data),
                                       num_loops, num_readers, reads_per_loop, reads_per_reader_per_loop);
      break;
    }
    default: {
      // arbitrary block size: memcpy each block.
      // int64_t counters: `auto x = 0` deduced `int`, which could overflow for bounds > INT32_MAX
      std::vector<const uint8_t*> readers;
      readers.resize(num_readers);

      for (int64_t l = 0; l < num_loops; ++l) {
        for (int64_t r = 0; r < num_readers; ++r) {
          readers[r] = input_data + (r * reads_per_reader_per_loop * bytes_per_read);
        }

        for (int64_t rrpl = 0; rrpl < reads_per_reader_per_loop; ++rrpl) {
          for (int64_t r = 0; r < num_readers; ++r) {
            memcpy(output_data, readers[r], bytes_per_read);
            readers[r] += bytes_per_read;
            output_data += bytes_per_read;
          }
        }

        input_data += reads_per_loop * bytes_per_read;
      }
    }
  }
}

// Dispatch a single-axis move to the outwards or inwards specialization.
// TODO: We may want to fall back to the default implementation if the size of the axis being moved is large
//       compared to the other axes.
//       e.g. transpose {3, 2048} with permutation of {1, 0} would result in 2048 writers being created
//       (std::vector<uint8_t*> of size 2048) but only used in 3 loops. however that may still be cheaper than
//       calling ComputeOffset and IncrementIndex 6K times.
static void SingleAxisTranspose(const std::vector<size_t>& permutations, const Tensor& input, Tensor& output,
                                size_t from, size_t to) {
  const bool moving_outwards = from > to;

  if (moving_outwards) {
    TranposeSingleAxisOutwards(permutations, input, output, from, to);
  } else {
    TranposeSingleAxisInwards(permutations, input, output, from, to);
  }
}

// Returns true if 'permutations' moves exactly one axis while leaving the relative order of all other
// axes intact. On success 'from' and 'to' receive the source and destination slots of the moved axis.
static bool IsMovingSingleAxis(const std::vector<size_t>& permutations, size_t& from, size_t& to) {
  const size_t rank = permutations.size();

  // A single axis moved outwards leaves every later slot holding (index - 1) until we pass the slot the
  // axis came from, and (index) afterwards.
  // e.g. axis 3 moving out to slot 1 gives: 0, 3, 1, 2, 4
  auto moved_outwards = [&permutations, rank](size_t slot, size_t source) {
    // we start one past the landing slot, so values initially lag the index by one
    size_t want = slot - 1;

    while (slot < rank) {
      if (permutations[slot] != want) {
        return false;
      }

      // after the vacated slot the values catch back up to their index
      want += (slot == source) ? 2 : 1;
      ++slot;
    }

    return true;
  };

  // A single axis moved inwards leaves every slot holding (index + 1) until the slot it landed in,
  // which holds the moved axis itself; slots after that match their index.
  // e.g. axis 1 moving in to slot 3 gives: 0, 2, 3, 1, 4
  auto moved_inwards = [&permutations, rank](size_t slot, size_t& dest) {
    const size_t source = slot;
    size_t want = slot + 1;
    dest = std::numeric_limits<size_t>::max();

    for (; slot < rank; ++slot) {
      if (permutations[slot] == want) {
        ++want;
      } else {
        // any mismatch must be the moved axis itself landing here
        if (permutations[slot] != source) {
          return false;
        }

        dest = slot;
      }
    }

    return dest != std::numeric_limits<size_t>::max();
  };

  // find the first slot whose value differs from its index; that is where any single move begins
  for (size_t i = 0; i < rank; ++i) {
    const size_t axis = permutations[i];

    if (axis == i) {
      continue;
    }

    // earlier entry in permutations == axis moved outwards
    if (moved_outwards(i + 1, axis)) {
      from = axis;
      to = i;
      return true;
    }

    if (moved_inwards(i, to)) {
      from = i;
      return true;
    }

    return false;
  }

  return false;
}

Status TransposeBase::DoTranspose(const std::vector<size_t>& permutations, const Tensor& input, Tensor& output) {
Status status = Status::OK();

Expand All @@ -248,14 +538,21 @@ Status TransposeBase::DoTranspose(const std::vector<size_t>& permutations, const
status = ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Mismatched data types between input and output Tensors. ",
input_type, " != ", output_type);
} else {
status = DoUntypedTranspose(permutations, input, output);
size_t from = 0, to = 0;
bool moving_single_axis = IsMovingSingleAxis(permutations, from, to);

if (moving_single_axis && !input.IsDataTypeString()) {
SingleAxisTranspose(permutations, input, output, from, to);
} else {
// fall back to default implementation
status = DoUntypedTranspose(permutations, input, output);
}
}

return status;
}

Status Transpose::Compute(OpKernelContext* ctx) const {
// Get input and output:
const auto* input_tensor_ptr = ctx->Input<Tensor>(0);
ORT_ENFORCE(input_tensor_ptr != nullptr);
const Tensor& X = *input_tensor_ptr;
Expand All @@ -266,16 +563,27 @@ Status Transpose::Compute(OpKernelContext* ctx) const {
std::vector<int64_t> output_dims(rank);
const std::vector<size_t>* p_perm;
std::vector<size_t> default_perm(rank);
const auto& status = ComputeOutputShape(X, output_dims, default_perm, p_perm);
Status status = ComputeOutputShape(X, output_dims, default_perm, p_perm);
if (!status.IsOK())
return status;

TensorShape output_shape{output_dims};
Tensor& Y = *ctx->Output(0, output_shape);

DoUntypedTranspose(*p_perm, X, Y);
if (output_shape.Size() == 0)
return Status::OK();

return Status::OK();
size_t from = 0, to = 0;
bool moving_single_axis = IsMovingSingleAxis(*p_perm, from, to);

if (moving_single_axis && !X.IsDataTypeString()) {
SingleAxisTranspose(*p_perm, X, Y, from, to);
} else {
// fall back to default implementation
status = DoUntypedTranspose(*p_perm, X, Y);
}

return status;
}

ONNX_CPU_OPERATOR_KERNEL(
Expand Down
Loading