[mlir][bufferize] Allow non-equivalent yields from scf.for loops
This removes a restriction on scf.for loops during One-Shot Bufferization: loops whose yielded values do not bufferize to buffers equivalent to the corresponding iter_args were previously rejected. Such IR is still rejected by default because the bufferized IR could be slow, but it can now be bufferized with `allow-return-allocs`.

Differential Revision: https://reviews.llvm.org/D121529
matthias-springer committed Mar 16, 2022
1 parent ac64d0d commit 1e1eeae
Showing 3 changed files with 161 additions and 18 deletions.
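For illustration, an scf.for loop of the following shape, which yields a value that is not equivalent to its iter_arg, was previously rejected by One-Shot Bufferization. With this change it bufferizes when `allow-return-allocs` is enabled, at the cost of an extra alloc + copy per iteration. The function below is a sketch modeled on the new test cases added in this commit; the name is illustrative.

func @yield_non_equivalent(%t: tensor<?xf32>, %lb: index, %ub: index,
                           %step: index) -> tensor<?xf32> {
  %r = scf.for %i = %lb to %ub step %step iter_args(%a = %t) -> tensor<?xf32> {
    // %t is yielded instead of the iter_arg %a, so the yielded buffer is not
    // guaranteed to be equivalent to the corresponding bbArg.
    scf.yield %t : tensor<?xf32>
  }
  return %r : tensor<?xf32>
}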
72 changes: 55 additions & 17 deletions mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -10,6 +10,7 @@

#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
#include "mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/IR/Dialect.h"
#include "mlir/IR/Operation.h"
@@ -272,17 +273,13 @@ struct ForOpInterface

bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand,
const AnalysisState &state) const {
// Tensor iter_args of scf::ForOps are always considered as a write. This is
// to simplify the analysis.
// TODO: Consider doing sth. like isValueWritten.
// Tensor iter_args of scf::ForOps are always considered as a write.
return true;
}

SmallVector<OpResult> getAliasingOpResult(Operation *op, OpOperand &opOperand,
const AnalysisState &state) const {
auto forOp = cast<scf::ForOp>(op);
if (!opOperand.get().getType().isa<RankedTensorType>())
return {};
return {forOp.getResultForOpOperand(opOperand)};
}

@@ -293,7 +290,8 @@ struct ForOpInterface
auto forOp = cast<scf::ForOp>(op);
OpOperand &forOperand = forOp.getOpOperandForResult(opResult);
auto bbArg = forOp.getRegionIterArgForOpOperand(forOperand);
auto yieldOp = cast<scf::YieldOp>(&forOp.getLoopBody().front().back());
auto yieldOp =
cast<scf::YieldOp>(forOp.getLoopBody().front().getTerminator());
bool equivalentYield = state.areEquivalentBufferizedValues(
bbArg, yieldOp->getOperand(opResult.getResultNumber()));
return equivalentYield ? BufferRelation::Equivalent : BufferRelation::None;
@@ -313,14 +311,25 @@ struct ForOpInterface
LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
BufferizationState &state) const {
auto forOp = cast<scf::ForOp>(op);
auto bufferizableOp = cast<BufferizableOpInterface>(op);
Block *oldLoopBody = &forOp.getLoopBody().front();

// Indices of all iter_args that have tensor type. These are the ones that
// are bufferized.
DenseSet<int64_t> indices;
for (const auto &it : llvm::enumerate(forOp.getInitArgs()))
if (it.value().getType().isa<TensorType>())
// For every yielded value, is the value equivalent to its corresponding
// bbArg?
SmallVector<bool> equivalentYields;
for (const auto &it : llvm::enumerate(forOp.getInitArgs())) {
if (it.value().getType().isa<TensorType>()) {
indices.insert(it.index());
BufferRelation relation = bufferizableOp.bufferRelation(
forOp->getResult(it.index()), state.getAnalysisState());
equivalentYields.push_back(relation == BufferRelation::Equivalent);
} else {
equivalentYields.push_back(false);
}
}

// Given a range of values, apply `func` to those marked in `indices`.
// Otherwise, store the unmodified value in the result vector.
@@ -374,8 +383,35 @@ struct ForOpInterface
SmallVector<Value> yieldValues =
convert(yieldOp.getResults(), [&](Value val, int64_t index) {
ensureToMemrefOpIsValid(val, initArgs[index].getType());
return rewriter.create<bufferization::ToMemrefOp>(
Value yieldedVal = rewriter.create<bufferization::ToMemrefOp>(
val.getLoc(), initArgs[index].getType(), val);

if (equivalentYields[index])
// Yielded value is equivalent to the corresponding iter_arg bbArg.
// Yield the value directly. Most IR should be like that. Everything
// else must be resolved with copies and is potentially inefficient.
// By default, such problematic IR would already have been rejected
// during `verifyAnalysis`, unless `allow-return-allocs` is set.
return yieldedVal;

// It is not certain that the yielded value and the iter_arg bbArg
// have the same buffer. Allocate a new buffer and copy. The yielded
// buffer will get deallocated by `deallocateBuffers`.

// TODO: There are cases in which it is not necessary to return a new
// buffer allocation. E.g., when equivalent values are yielded in a
// different order. This could be resolved with copies.
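// In buffer terms, the rewritten loop body then ends roughly as follows (a
// sketch; see the new test cases for the exact output, which may also insert
// a memref.cast to match the iter_arg's layout):
//   %yielded = bufferization.to_memref %val
//   %new = memref.alloc(...)
//   memref.copy %yielded, %new
//   scf.yield %new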
Optional<Value> yieldedAlloc = state.createAlloc(
rewriter, val.getLoc(), yieldedVal, /*deallocMemref=*/false);
// TODO: We should roll back, but for now just assume that this always
// succeeds.
assert(yieldedAlloc.hasValue() && "could not create alloc");
LogicalResult copyStatus =
bufferization::createMemCpy(rewriter, val.getLoc(), yieldedVal,
*yieldedAlloc, state.getOptions());
(void)copyStatus;
assert(succeeded(copyStatus) && "could not create memcpy");
return *yieldedAlloc;
});
yieldOp.getResultsMutable().assign(yieldValues);

@@ -385,12 +421,17 @@ struct ForOpInterface
return success();
}

/// Assert that yielded values of an scf.for op are aliasing with their
/// corresponding bbArgs. This is required because the i-th OpResult of an
/// scf.for op is currently assumed to alias with the i-th iter_arg (in the
/// absence of conflicts).
/// Assert that yielded values of an scf.for op are equivalent to their
/// corresponding bbArgs. Otherwise, an alloc + copy is inserted and yielded
/// from the loop. This can be a performance problem, so such IR must be
/// explicitly allowed with `allow-return-allocs`.
LogicalResult verifyAnalysis(Operation *op,
const AnalysisState &state) const {
const auto &options =
static_cast<const OneShotBufferizationOptions &>(state.getOptions());
if (options.allowReturnAllocs)
return success();

auto forOp = cast<scf::ForOp>(op);
auto yieldOp =
cast<scf::YieldOp>(forOp.getLoopBody().front().getTerminator());
@@ -405,13 +446,10 @@ struct ForOpInterface
// Note: This is overly strict. We should check for aliasing bufferized
// values. But we don't have a "must-alias" analysis yet.
if (!state.areEquivalentBufferizedValues(operand.get(), bbArg))
// TODO: this could get resolved with copies but it can also turn into
// swaps so we need to be careful about order of copies.
return yieldOp->emitError()
<< "Yield operand #" << operand.getOperandNumber()
<< " does not bufferize to a buffer that is aliasing the "
"matching"
<< " enclosing scf::for operand";
"matching enclosing scf::for operand";
}
return success();
}
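The relaxed behavior is opted into through the `allowReturnAllocs` field read in `verifyAnalysis` above. A minimal sketch of setting it from C++ follows; the helper name is illustrative, and the surrounding One-Shot Bufferize driver call is omitted because it is not part of this diff.

#include "mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h"

// Illustrative helper: allow non-equivalent scf.for yields. Such loops then
// bufferize with an extra alloc + copy per iteration instead of being
// rejected by verifyAnalysis.
static void allowNonEquivalentYields(
    mlir::bufferization::OneShotBufferizationOptions &options) {
  options.allowReturnAllocs = true;
}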
2 changes: 1 addition & 1 deletion mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt
@@ -23,7 +23,7 @@ add_mlir_dialect_library(MLIRSCFTransforms
MLIRArithmetic
MLIRBufferization
MLIRBufferizationTransforms
MLIRDialectUtils
MLIRDialectUtils
MLIRIR
MLIRMemRef
MLIRPass
105 changes: 105 additions & 0 deletions mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir
@@ -1218,3 +1218,108 @@ func @scf_execute_region_yield_non_equivalent(%i: index, %j: index) -> f32 {
%f = tensor.extract %r[%j] : tensor<?xf32>
return %f : f32
}

// -----

// Note: This bufferizes to inefficient code, but bufferization should not see
// such IR in the first place: the iter_arg would canonicalize away (see the
// sketch after this function). This test case just ensures that the
// bufferization generates correct code.

// CHECK-LABEL: func @scf_for_yield_non_equivalent(
// CHECK-SAME: %[[t:.*]]: memref<?xf32
// CHECK: %[[alloc:.*]] = memref.alloc(%{{.*}})
// CHECK: %[[casted:.*]] = memref.cast %[[alloc]]
// CHECK: %[[for:.*]] = scf.for {{.*}} iter_args(%[[iter:.*]] = %[[casted]])
// CHECK: memref.dealloc %[[iter]]
// CHECK: %[[alloc2:.*]] = memref.alloc(%{{.*}})
// CHECK: memref.copy %[[t]], %[[alloc2]]
// CHECK: %[[casted2:.*]] = memref.cast %[[alloc2]]
// CHECK: scf.yield %[[casted2]]
// CHECK: return %[[for]]
func @scf_for_yield_non_equivalent(
%t: tensor<?xf32>, %lb : index, %ub : index, %step : index) -> tensor<?xf32> {
%r = scf.for %i = %lb to %ub step %step iter_args(%a = %t) -> tensor<?xf32> {
scf.yield %t : tensor<?xf32>
}

return %r : tensor<?xf32>
}
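// For reference, canonicalization would fold the iter_arg above away: the
// init value and the yielded value are both %t, so the loop result is %t
// regardless of the trip count. A sketch of the canonical form (whether an
// empty, zero-result loop remains depends on the canonicalization patterns):
//   func @scf_for_yield_non_equivalent(%t: tensor<?xf32>, %lb: index,
//                                      %ub: index, %step: index) -> tensor<?xf32> {
//     return %t : tensor<?xf32>
//   }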

// -----

// Note: This bufferizes to inefficient code, but bufferization should not see
// such IR in the first place. The iter_arg would canonicalize away. This test
// case is just to ensure that the bufferization generates correct code.

// CHECK-LABEL: func @scf_for_yield_allocation(
// CHECK-SAME: %[[t:.*]]: memref<?xf32
// CHECK: %[[cloned:.*]] = bufferization.clone %[[t]]
// CHECK: %[[for:.*]] = scf.for {{.*}} iter_args(%[[iter:.*]] = %[[cloned]])
// This alloc is for the linalg.init_tensor.
// CHECK: %[[alloc2:.*]] = memref.alloc(%{{.*}})
// CHECK: memref.dealloc %[[iter]]
// This alloc is for the scf.yield.
// CHECK: %[[alloc3:.*]] = memref.alloc(%{{.*}})
// CHECK: memref.copy %[[alloc2]], %[[alloc3]]
// CHECK: memref.dealloc %[[alloc2]]
// CHECK: %[[casted3:.*]] = memref.cast %[[alloc3]]
// CHECK: scf.yield %[[casted3]]
// CHECK: return %[[for]]
func @scf_for_yield_allocation(%t: tensor<?xf32>, %lb : index, %ub : index,
%step : index) -> tensor<?xf32> {
%r = scf.for %i = %lb to %ub step %step iter_args(%a = %t) -> tensor<?xf32> {
%t2 = linalg.init_tensor [%i] : tensor<?xf32>
scf.yield %t2 : tensor<?xf32>
}

return %r : tensor<?xf32>
}

// -----

// TODO: The scf.yield could bufferize to 1 alloc and 2 copies (instead of
// 2 allocs and 2 copies).

// CHECK-LABEL: func @scf_for_swapping_yields(
// CHECK-SAME: %[[A:.*]]: memref<?xf32, #{{.*}}>, %[[B:.*]]: memref<?xf32, #{{.*}}>

func @scf_for_swapping_yields(
%A : tensor<?xf32>, %B : tensor<?xf32> {linalg.inplaceable = true},
%C : tensor<4xf32>, %lb : index, %ub : index, %step : index)
-> (f32, f32)
{
// CHECK-DAG: %[[clone1:.*]] = bufferization.clone %[[A]]
// CHECK-DAG: %[[clone2:.*]] = bufferization.clone %[[B]]
// CHECK: %[[for:.*]]:2 = scf.for {{.*}} iter_args(%[[iter1:.*]] = %[[clone1]], %[[iter2:.*]] = %[[clone2]])
%r0:2 = scf.for %i = %lb to %ub step %step iter_args(%tA = %A, %tB = %B)
-> (tensor<?xf32>, tensor<?xf32>)
{
// CHECK: %[[sv1:.*]] = memref.subview %[[iter1]]
// CHECK: memref.copy %{{.*}}, %[[sv1]]
%ttA = tensor.insert_slice %C into %tA[0][4][1] : tensor<4xf32> into tensor<?xf32>
// CHECK: %[[sv2:.*]] = memref.subview %[[iter2]]
// CHECK: memref.copy %{{.*}}, %[[sv2]]
%ttB = tensor.insert_slice %C into %tB[0][4][1] : tensor<4xf32> into tensor<?xf32>

// CHECK: %[[alloc2:.*]] = memref.alloc(%{{.*}})
// CHECK: memref.copy %[[iter2]], %[[alloc2]]
// CHECK: memref.dealloc %[[iter2]]
// CHECK: %[[alloc1:.*]] = memref.alloc(%{{.*}})
// CHECK: memref.copy %[[iter1]], %[[alloc1]]
// CHECK: memref.dealloc %[[iter1]]
// CHECK: %[[casted1:.*]] = memref.cast %[[alloc1]]
// CHECK: %[[casted2:.*]] = memref.cast %[[alloc2]]
// CHECK: scf.yield %[[casted2]], %[[casted1]]
// Yield tensors in a different order.
scf.yield %ttB, %ttA : tensor<?xf32>, tensor<?xf32>
}

// CHECK: %[[r0:.*]] = memref.load %[[for]]#0
// CHECK: memref.dealloc %[[for]]#0
// CHECK: %[[r1:.*]] = memref.load %[[for]]#1
// CHECK: memref.dealloc %[[for]]#1
%f0 = tensor.extract %r0#0[%step] : tensor<?xf32>
%f1 = tensor.extract %r0#1[%step] : tensor<?xf32>
// CHECK: return %[[r0]], %[[r1]]
return %f0, %f1: f32, f32
}
