[mlir][bufferize] Avoid tensor copies when the data is not read
There are various shortcuts in `BufferizationState::getBuffer` that avoid a buffer copy when we just need an allocation (and no initialization). This change adds those shortcuts to the TensorCopyInsertion pass, so that `getBuffer` can be simplified in a subsequent change.

Differential Revision: https://reviews.llvm.org/D126821
matthias-springer committed Jun 10, 2022
1 parent 914e30c commit 79f1159
Showing 3 changed files with 115 additions and 6 deletions.
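For illustration only, here is a rough sketch of the effect on the IR produced by TensorCopyInsertion for an out-of-place operand that is completely overwritten (based on the new test cases further below; the value names and the exact alloc_tensor assembly are approximations, not output copied from the pass):

  // Previously: the allocation always carried a copy of the original tensor.
  %0 = bufferization.alloc_tensor() copy(%t) {escape = false} : tensor<5xf32>
  %r = linalg.generic ... outs(%0 : tensor<5xf32>) ...

  // With this change: no copy operand, because the old contents are never read.
  %0 = bufferization.alloc_tensor() {escape = false} : tensor<5xf32>
  %r = linalg.generic ... outs(%0 : tensor<5xf32>) ...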
@@ -356,6 +356,10 @@ class AnalysisState {
   /// an alias. Return false if the op is not bufferizable.
   bool bufferizesToAliasOnly(OpOperand &opOperand) const;
 
+  /// Return true if a copy can always be avoided when allocating a new tensor
+  /// for the given OpOperand.
+  bool canOmitTensorCopy(OpOperand &opOperand) const;
+
   /// Return true if the given value is read by an op that bufferizes to a
   /// memory read. Also takes into account ops that create an alias but do not
   /// read by themselves (e.g., ExtractSliceOp).
mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp: 66 changes (60 additions, 6 deletions)
@@ -10,6 +10,7 @@
 #include "mlir/Dialect/Bufferization/IR/Bufferization.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/IR/AsmState.h"
 #include "mlir/IR/BlockAndValueMapping.h"
 #include "mlir/IR/BuiltinOps.h"
@@ -42,12 +43,40 @@ using namespace bufferization;
 constexpr const ::llvm::StringLiteral
     bufferization::BufferizableOpInterface::kInplaceableAttrName;
 
+/// Create an AllocTensorOp for the given shaped value. Only ranked tensors are
+/// supported at the moment. If `copy` is set, the shaped value is copied.
+/// Otherwise, a tensor with undefined contents is allocated.
+static Value allocateTensorForShapedValue(OpBuilder &b, Location loc,
+                                          Value shapedValue, bool escape,
+                                          bool copy = true) {
+  auto tensorType = shapedValue.getType().dyn_cast<RankedTensorType>();
+  assert(tensorType && "only RankedTensorType supported at the moment");
+  Value alloc;
+  if (!copy) {
+    // No copy needed: Just allocate.
+    SmallVector<Value> dynamicSizes;
+    for (int64_t i = 0; i < tensorType.getRank(); ++i)
+      if (tensorType.isDynamicDim(i))
+        dynamicSizes.push_back(b.create<tensor::DimOp>(loc, shapedValue, i));
+    alloc = b.create<AllocTensorOp>(loc, tensorType, dynamicSizes,
+                                    /*copy=*/Value(), escape);
+  } else {
+    // Allocate and copy.
+    alloc = b.create<AllocTensorOp>(loc, tensorType,
+                                    /*dynamicSizes=*/ValueRange(), shapedValue,
+                                    escape);
+  }
+  return alloc;
+}
+
 LogicalResult BufferizableOpInterface::resolveTensorOpOperandConflicts(
     RewriterBase &rewriter, const AnalysisState &state) {
   OpBuilder::InsertionGuard g(rewriter);
   Operation *op = getOperation();
   SmallVector<OpOperand *> outOfPlaceOpOperands;
+  DenseSet<OpOperand *> copiedOpOperands;
   SmallVector<OpResult> outOfPlaceOpResults;
+  DenseSet<OpResult> copiedOpResults;
 
   // Find all out-of-place OpOperands.
   for (OpOperand &opOperand : op->getOpOperands()) {
@@ -69,32 +98,36 @@ LogicalResult BufferizableOpInterface::resolveTensorOpOperandConflicts(
       // be smaller than the OpOperand (e.g., in the case of an extract_slice,
       // where the result is usually a smaller part of the source).
       outOfPlaceOpResults.push_back(aliasingOpResults.front());
+      if (!state.canOmitTensorCopy(opOperand))
+        copiedOpResults.insert(aliasingOpResults.front());
     } else {
       // In all other cases, make a copy of the OpOperand.
       outOfPlaceOpOperands.push_back(&opOperand);
+      if (!state.canOmitTensorCopy(opOperand))
+        copiedOpOperands.insert(&opOperand);
     }
   }
 
   // Insert copies of OpOperands.
   rewriter.setInsertionPoint(op);
   for (OpOperand *opOperand : outOfPlaceOpOperands) {
-    auto tensorType = opOperand->get().getType().cast<RankedTensorType>();
     SmallVector<OpResult> aliasingOpResults =
         state.getAliasingOpResult(*opOperand);
     bool escape = llvm::any_of(
         aliasingOpResults, [&](Value v) { return state.isTensorYielded(v); });
-    Value copy = rewriter.create<AllocTensorOp>(
-        op->getLoc(), tensorType, ValueRange(), opOperand->get(), escape);
+    Value copy = allocateTensorForShapedValue(
+        rewriter, op->getLoc(), opOperand->get(), escape,
+        copiedOpOperands.contains(opOperand));
     rewriter.updateRootInPlace(op, [&]() { opOperand->set(copy); });
   }
 
   // Insert copies of OpResults.
   rewriter.setInsertionPointAfter(op);
   for (OpResult opResult : outOfPlaceOpResults) {
-    auto tensorType = opResult.getType().cast<RankedTensorType>();
     bool escape = state.isTensorYielded(opResult);
-    Value copy = rewriter.create<AllocTensorOp>(op->getLoc(), tensorType,
-                                                ValueRange(), opResult, escape);
+    Value copy =
+        allocateTensorForShapedValue(rewriter, op->getLoc(), opResult, escape,
+                                     copiedOpResults.count(opResult));
     SmallVector<OpOperand *> uses = llvm::to_vector(llvm::map_range(
         opResult.getUses(), [](OpOperand &use) { return &use; }));
     for (OpOperand *use : uses) {
@@ -313,6 +346,27 @@ AnalysisState::AnalysisState(const BufferizationOptions &options)
     fn(*this);
 }
 
+bool AnalysisState::canOmitTensorCopy(OpOperand &opOperand) const {
+  // Do not copy if the tensor has undefined contents.
+  if (hasUndefinedContents(&opOperand))
+    return true;
+
+  // Do not copy if the buffer of the tensor is entirely overwritten (with
+  // values that do not depend on the old tensor).
+  if (bufferizesToMemoryWrite(opOperand) && !bufferizesToMemoryRead(opOperand))
+    return true;
+
+  // Do not copy if the tensor is never read.
+  SmallVector<OpResult> aliasingOpResults = getAliasingOpResult(opOperand);
+  if (!bufferizesToMemoryRead(opOperand) &&
+      llvm::none_of(aliasingOpResults,
+                    [&](OpResult opResult) { return isValueRead(opResult); }))
+    return true;
+
+  // Default: Cannot omit the copy.
+  return false;
+}
+
 // bufferization.to_memref is not allowed to change the rank.
 static void ensureToMemrefOpIsValid(Value tensor, Type memrefType) {
 #ifndef NDEBUG
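The new tests below exercise these cases. As a rough sketch of the two allocation forms that the allocateTensorForShapedValue helper emits, assuming a dynamically shaped value %t : tensor<?xf32> (the value names and exact assembly are approximations):

  // copy == false: gather dynamic sizes via tensor.dim; contents stay undefined.
  %c0 = arith.constant 0 : index
  %d0 = tensor.dim %t, %c0 : tensor<?xf32>
  %a = bufferization.alloc_tensor(%d0) {escape = false} : tensor<?xf32>

  // copy == true: pass the shaped value as the copy operand instead.
  %b = bufferization.alloc_tensor() copy(%t) {escape = false} : tensor<?xf32>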
@@ -25,3 +25,54 @@ func.func @return_alloc_tensor() -> (tensor<5xf32>) {
   %0 = bufferization.alloc_tensor() : tensor<5xf32>
   return %0 : tensor<5xf32>
 }
+
+// -----
+
+// CHECK-LABEL: func @do_not_copy_undefined_tensor
+func.func @do_not_copy_undefined_tensor(%f: f32, %idx: index)
+  -> (tensor<5xf32>, tensor<5xf32>)
+{
+  // CHECK: bufferization.alloc_tensor() {escape = false} : tensor<5xf32>
+  // The second alloc_tensor should not have a copy operand.
+  // CHECK: bufferization.alloc_tensor() {escape = false} : tensor<5xf32>
+  %0 = bufferization.alloc_tensor() : tensor<5xf32>
+  %1 = tensor.insert %f into %0[%idx] : tensor<5xf32>
+  return %0, %1 : tensor<5xf32>, tensor<5xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @do_not_copy_when_overwritten
+func.func @do_not_copy_when_overwritten(%t: tensor<5xf32>, %f: f32)
+  -> (tensor<5xf32>, tensor<5xf32>)
+{
+  // CHECK: %[[alloc:.*]] = bufferization.alloc_tensor() {escape = false} : tensor<5xf32>
+  // CHECK: linalg.generic {{.*}} outs(%[[alloc]] : tensor<5xf32>)
+  %r = linalg.generic {
+    indexing_maps = [affine_map<(d0) -> (d0)>],
+    iterator_types = ["parallel"]}
+    outs(%t : tensor<5xf32>) {
+  ^bb0(%arg0 : f32) :
+    linalg.yield %f : f32
+  } -> tensor<5xf32>
+  return %t, %r : tensor<5xf32>, tensor<5xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @do_not_copy_when_result_not_read
+func.func @do_not_copy_when_result_not_read(%t: tensor<5xf32>, %f: f32)
+  -> (tensor<3xf32>)
+{
+  %0 = tensor.extract_slice %t[0][3][1] : tensor<5xf32> to tensor<3xf32>
+  // CHECK: %[[alloc:.*]] = bufferization.alloc_tensor() {escape = false} : tensor<3xf32>
+  // CHECK: linalg.generic {{.*}} outs(%[[alloc]] : tensor<3xf32>)
+  %r = linalg.generic {
+    indexing_maps = [affine_map<(d0) -> (d0)>],
+    iterator_types = ["parallel"]}
+    outs(%0 : tensor<3xf32>) {
+  ^bb0(%arg0 : f32) :
+    linalg.yield %f : f32
+  } -> tensor<3xf32>
+  return %r : tensor<3xf32>
+}
