[mlir][bufferize] Avoid tensor copies when the data is not read
There are various shortcuts in `BufferizationState::getBuffer` that avoid a buffer copy when we just need an allocation (and no initialization). This change adds those shortcuts to the TensorCopyInsertion pass, so that `getBuffer` can be simplified in a subsequent change.

Differential Revision: https://reviews.llvm.org/D126821
matthias-springer committed Jun 10, 2022
1 parent 914e30c commit 79f1159
Showing 3 changed files with 115 additions and 6 deletions.
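For illustration only, here is a rough sketch of the effect on the IR produced by TensorCopyInsertion for an out-of-place operand that is completely overwritten (based on the new test cases further below; the value names and the exact alloc_tensor assembly are approximations, not output copied from the pass):

  // Previously: the allocation always carried a copy of the original tensor.
  %0 = bufferization.alloc_tensor() copy(%t) {escape = false} : tensor<5xf32>
  %r = linalg.generic ... outs(%0 : tensor<5xf32>) ...

  // With this change: no copy operand, because the old contents are never read.
  %0 = bufferization.alloc_tensor() {escape = false} : tensor<5xf32>
  %r = linalg.generic ... outs(%0 : tensor<5xf32>) ...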
@@ -356,6 +356,10 @@ class AnalysisState {
   /// an alias. Return false if the op is not bufferizable.
   bool bufferizesToAliasOnly(OpOperand &opOperand) const;
 
+  /// Return true if a copy can always be avoided when allocating a new tensor
+  /// for the given OpOperand.
+  bool canOmitTensorCopy(OpOperand &opOperand) const;
+
   /// Return true if the given value is read by an op that bufferizes to a
   /// memory read. Also takes into account ops that create an alias but do not
   /// read by themselves (e.g., ExtractSliceOp).
mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp: 66 changes (60 additions, 6 deletions)
@@ -10,6 +10,7 @@
 #include "mlir/Dialect/Bufferization/IR/Bufferization.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/IR/AsmState.h"
 #include "mlir/IR/BlockAndValueMapping.h"
 #include "mlir/IR/BuiltinOps.h"
@@ -42,12 +43,40 @@ using namespace bufferization;
 constexpr const ::llvm::StringLiteral
     bufferization::BufferizableOpInterface::kInplaceableAttrName;
 
+/// Create an AllocTensorOp for the given shaped value. Only ranked tensors are
+/// supported at the moment. If `copy` is set, the shaped value is copied.
+/// Otherwise, a tensor with undefined contents is allocated.
+static Value allocateTensorForShapedValue(OpBuilder &b, Location loc,
+                                          Value shapedValue, bool escape,
+                                          bool copy = true) {
+  auto tensorType = shapedValue.getType().dyn_cast<RankedTensorType>();
+  assert(tensorType && "only RankedTensorType supported at the moment");
+  Value alloc;
+  if (!copy) {
+    // No copy needed: Just allocate.
+    SmallVector<Value> dynamicSizes;
+    for (int64_t i = 0; i < tensorType.getRank(); ++i)
+      if (tensorType.isDynamicDim(i))
+        dynamicSizes.push_back(b.create<tensor::DimOp>(loc, shapedValue, i));
+    alloc = b.create<AllocTensorOp>(loc, tensorType, dynamicSizes,
+                                    /*copy=*/Value(), escape);
+  } else {
+    // Allocate and copy.
+    alloc = b.create<AllocTensorOp>(loc, tensorType,
+                                    /*dynamicSizes=*/ValueRange(), shapedValue,
+                                    escape);
+  }
+  return alloc;
+}
+
 LogicalResult BufferizableOpInterface::resolveTensorOpOperandConflicts(
     RewriterBase &rewriter, const AnalysisState &state) {
   OpBuilder::InsertionGuard g(rewriter);
   Operation *op = getOperation();
   SmallVector<OpOperand *> outOfPlaceOpOperands;
+  DenseSet<OpOperand *> copiedOpOperands;
   SmallVector<OpResult> outOfPlaceOpResults;
+  DenseSet<OpResult> copiedOpResults;
 
   // Find all out-of-place OpOperands.
   for (OpOperand &opOperand : op->getOpOperands()) {
@@ -69,32 +98,36 @@ LogicalResult BufferizableOpInterface::resolveTensorOpOperandConflicts(
       // be smaller than the OpOperand (e.g., in the case of an extract_slice,
       // where the result is usually a smaller part of the source).
       outOfPlaceOpResults.push_back(aliasingOpResults.front());
+      if (!state.canOmitTensorCopy(opOperand))
+        copiedOpResults.insert(aliasingOpResults.front());
     } else {
       // In all other cases, make a copy of the OpOperand.
       outOfPlaceOpOperands.push_back(&opOperand);
+      if (!state.canOmitTensorCopy(opOperand))
+        copiedOpOperands.insert(&opOperand);
     }
   }
 
   // Insert copies of OpOperands.
   rewriter.setInsertionPoint(op);
   for (OpOperand *opOperand : outOfPlaceOpOperands) {
-    auto tensorType = opOperand->get().getType().cast<RankedTensorType>();
     SmallVector<OpResult> aliasingOpResults =
         state.getAliasingOpResult(*opOperand);
     bool escape = llvm::any_of(
         aliasingOpResults, [&](Value v) { return state.isTensorYielded(v); });
-    Value copy = rewriter.create<AllocTensorOp>(
-        op->getLoc(), tensorType, ValueRange(), opOperand->get(), escape);
+    Value copy = allocateTensorForShapedValue(
+        rewriter, op->getLoc(), opOperand->get(), escape,
+        copiedOpOperands.contains(opOperand));
     rewriter.updateRootInPlace(op, [&]() { opOperand->set(copy); });
   }
 
   // Insert copies of OpResults.
   rewriter.setInsertionPointAfter(op);
   for (OpResult opResult : outOfPlaceOpResults) {
-    auto tensorType = opResult.getType().cast<RankedTensorType>();
     bool escape = state.isTensorYielded(opResult);
-    Value copy = rewriter.create<AllocTensorOp>(op->getLoc(), tensorType,
-                                                ValueRange(), opResult, escape);
+    Value copy =
+        allocateTensorForShapedValue(rewriter, op->getLoc(), opResult, escape,
+                                     copiedOpResults.count(opResult));
     SmallVector<OpOperand *> uses = llvm::to_vector(llvm::map_range(
         opResult.getUses(), [](OpOperand &use) { return &use; }));
     for (OpOperand *use : uses) {
@@ -313,6 +346,27 @@ AnalysisState::AnalysisState(const BufferizationOptions &options)
     fn(*this);
 }
 
+bool AnalysisState::canOmitTensorCopy(OpOperand &opOperand) const {
+  // Do not copy if the tensor has undefined contents.
+  if (hasUndefinedContents(&opOperand))
+    return true;
+
+  // Do not copy if the buffer of the tensor is entirely overwritten (with
+  // values that do not depend on the old tensor).
+  if (bufferizesToMemoryWrite(opOperand) && !bufferizesToMemoryRead(opOperand))
+    return true;
+
+  // Do not copy if the tensor is never read.
+  SmallVector<OpResult> aliasingOpResults = getAliasingOpResult(opOperand);
+  if (!bufferizesToMemoryRead(opOperand) &&
+      llvm::none_of(aliasingOpResults,
+                    [&](OpResult opResult) { return isValueRead(opResult); }))
+    return true;
+
+  // Default: Cannot omit the copy.
+  return false;
+}
+
 // bufferization.to_memref is not allowed to change the rank.
 static void ensureToMemrefOpIsValid(Value tensor, Type memrefType) {
 #ifndef NDEBUG
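The new tests below exercise these cases. As a rough sketch of the two allocation forms that the allocateTensorForShapedValue helper emits, assuming a dynamically shaped value %t : tensor<?xf32> (the value names and exact assembly are approximations):

  // copy == false: gather dynamic sizes via tensor.dim; contents stay undefined.
  %c0 = arith.constant 0 : index
  %d0 = tensor.dim %t, %c0 : tensor<?xf32>
  %a = bufferization.alloc_tensor(%d0) {escape = false} : tensor<?xf32>

  // copy == true: pass the shaped value as the copy operand instead.
  %b = bufferization.alloc_tensor() copy(%t) {escape = false} : tensor<?xf32>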
@@ -25,3 +25,54 @@ func.func @return_alloc_tensor() -> (tensor<5xf32>) {
   %0 = bufferization.alloc_tensor() : tensor<5xf32>
   return %0 : tensor<5xf32>
 }
+
+// -----
+
+// CHECK-LABEL: func @do_not_copy_undefined_tensor
+func.func @do_not_copy_undefined_tensor(%f: f32, %idx: index)
+  -> (tensor<5xf32>, tensor<5xf32>)
+{
+  // CHECK: bufferization.alloc_tensor() {escape = false} : tensor<5xf32>
+  // The second alloc_tensor should not have a copy operand.
+  // CHECK: bufferization.alloc_tensor() {escape = false} : tensor<5xf32>
+  %0 = bufferization.alloc_tensor() : tensor<5xf32>
+  %1 = tensor.insert %f into %0[%idx] : tensor<5xf32>
+  return %0, %1 : tensor<5xf32>, tensor<5xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @do_not_copy_when_overwritten
+func.func @do_not_copy_when_overwritten(%t: tensor<5xf32>, %f: f32)
+  -> (tensor<5xf32>, tensor<5xf32>)
+{
+  // CHECK: %[[alloc:.*]] = bufferization.alloc_tensor() {escape = false} : tensor<5xf32>
+  // CHECK: linalg.generic {{.*}} outs(%[[alloc]] : tensor<5xf32>)
+  %r = linalg.generic {
+    indexing_maps = [affine_map<(d0) -> (d0)>],
+    iterator_types = ["parallel"]}
+    outs(%t : tensor<5xf32>) {
+  ^bb0(%arg0 : f32) :
+    linalg.yield %f : f32
+  } -> tensor<5xf32>
+  return %t, %r : tensor<5xf32>, tensor<5xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @do_not_copy_when_result_not_read
+func.func @do_not_copy_when_result_not_read(%t: tensor<5xf32>, %f: f32)
+  -> (tensor<3xf32>)
+{
+  %0 = tensor.extract_slice %t[0][3][1] : tensor<5xf32> to tensor<3xf32>
+  // CHECK: %[[alloc:.*]] = bufferization.alloc_tensor() {escape = false} : tensor<3xf32>
+  // CHECK: linalg.generic {{.*}} outs(%[[alloc]] : tensor<3xf32>)
+  %r = linalg.generic {
+    indexing_maps = [affine_map<(d0) -> (d0)>],
+    iterator_types = ["parallel"]}
+    outs(%0 : tensor<3xf32>) {
+  ^bb0(%arg0 : f32) :
+    linalg.yield %f : f32
+  } -> tensor<3xf32>
+  return %r : tensor<3xf32>
+}
