Revert "[mlir][Linalg] Replace SimplePad with PadTensor in hoist-padd…
Browse files Browse the repository at this point in the history
…ing"

This reverts commit d9b953d.

This commit resulted in build bot failures and the author is away from a
computer, so I am reverting on their behalf until they have a chance to
look into this.
tpopp committed Feb 1, 2021
1 parent a4b7d52 commit 2790cbe
Showing 8 changed files with 140 additions and 76 deletions.
39 changes: 32 additions & 7 deletions mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td
@@ -194,13 +194,6 @@ def Linalg_PadTensorOp : Linalg_Op<"pad_tensor",
return "static_high";
}

RankedTensorType getSourceType() {
return source().getType().cast<RankedTensorType>();
}
RankedTensorType getResultType() {
return getResult().getType().cast<RankedTensorType>();
}

// Infer the shape of the result tensor given the static shapes
// and element type of the result tensor.
static RankedTensorType inferResultType(RankedTensorType sourceType,
@@ -494,6 +487,38 @@ def Linalg_SliceOp : Linalg_Op<"slice", [
let hasFolder = 1;
}

def Linalg_SimplePadOp : Linalg_Op<"simple_pad", [NoSideEffect]> {
let summary = "TODO: replace with pad_tensors when ready.";

let description = [{
`linalg.simple_pad` is a tmp placeholder for padding and packing on tensors.
Its semantics are to pad a partially dynamic tensor to a fully static tensor
where the static sizes are assumed to be greater than the dynamic sizes. The
op performs "high" padding (i.e. it adds trailing padding values until the
desired size is met).
}];

let arguments = (ins AnyRankedTensor:$tensor, AnyType:$padding);
let results = (outs AnyRankedTensor:$result);

// TODO: verify all static result, some dynamic input, static shapes match,
// element types match, ranks match etc. Use pad_tensors when ready but for
// now just let it be fully specified by traits.
let verifier = ?;

let extraClassDeclaration = [{
RankedTensorType getSourceType() {
return tensor().getType().cast<RankedTensorType>(); }
RankedTensorType getResultType() {
return getResult().getType().cast<RankedTensorType>(); }
}];

let assemblyFormat = [{
$tensor `pad` $padding attr-dict `:`
type($tensor) `to` type($result) `pad` type($padding)
}];
}
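For reference, a minimal sketch of the textual form implied by the assemblyFormat above. The value names and shapes are illustrative; the syntax matches the uses restored in mlir/test/Dialect/Linalg/hoist-padding.mlir later in this diff.

  // Pad a partially dynamic tensor %t up to the static shape 2x4 with the
  // scalar %cst; the padding is "high", i.e. appended after the existing values.
  %padded = linalg.simple_pad %t pad %cst : tensor<?x?xf32> to tensor<2x4xf32> pad f32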

def Linalg_YieldOp : Linalg_Op<"yield", [NoSideEffect, ReturnLike, Terminator]>,
Arguments<(ins Variadic<AnyType>:$values)> {
let summary = "Linalg yield operation";
21 changes: 8 additions & 13 deletions mlir/include/mlir/Dialect/Linalg/Transforms/Hoisting.h
@@ -14,7 +14,7 @@ class FuncOp;
struct LogicalResult;

namespace linalg {
class PadTensorOp;
class SimplePadOp;

/// Hoist alloc/dealloc pairs and alloca op out of immediately enclosing
/// scf::ForOp if both conditions are true:
@@ -44,7 +44,7 @@ void hoistRedundantVectorTransfersOnTensor(FuncOp func);

/// Mechanically hoist padding operations on tensors by `nLoops` into a new,
/// generally larger tensor. This achieves packing of multiple padding ops into
/// a larger tensor. On success, `padTensorOp` is replaced by the cloned version
/// a larger tensor. On success, `simplePadOp` is replaced by the cloned version
/// in the packing loop so the caller can continue reasoning about the padding
/// operation.
///
@@ -55,10 +55,8 @@ void hoistRedundantVectorTransfersOnTensor(FuncOp func);
/// ```
/// scf.for (%i, %j, %k)
/// %st0 = subtensor f(%i, %k) : ... to tensor<?x?xf32>
/// %0 = linalg.pad_tensor %st0 low[0, 0] high[...] {
/// ^bb0( ... ):
/// linalg.yield %pad
/// } : tensor<?x?xf32> to tensor<4x8xf32>
/// %0 = linalg.simple_pad %st0 pad %pad :
/// tensor<?x?xf32> to tensor<4x8xf32>
/// compute(%0)
/// ```
///
@@ -67,13 +65,10 @@ void hoistRedundantVectorTransfersOnTensor(FuncOp func);
/// ```
/// scf.for (%i) {
/// %packed_init = linalg.init_tensor range(%j) : tensor<?x4x8xf32>
/// %packed = scf.for (%k) iter_args(%p : %packed_init) {
/// %packed = scf.for (%k) iter_args(%p : %packed_init)
/// %st0 = subtensor f(%i, %k) : ... to tensor<?x?xf32>
/// %0 = linalg.pad_tensor %st0 low[0, 0] high[...] {
/// ^bb0( ... ):
/// linalg.yield %pad
/// } : tensor<?x?xf32> to tensor<4x8xf32>
/// %1 = subtensor_insert %0 ... : tensor<4x8xf32> to tensor<?x4x8xf32>
/// %0 = linalg.simple_pad %st0 pad %pad :
/// tensor<?x?xf32> to tensor<4x8xf32>
/// scf.yield %1: tensor<?x4x8xf32>
/// } -> tensor<?x4x8xf32>
/// scf.for (%j, %k) {
@@ -83,7 +78,7 @@ void hoistRedundantVectorTransfersOnTensor(FuncOp func);
/// }
/// }
/// ```
LogicalResult hoistPaddingOnTensors(PadTensorOp &padTensorOp, unsigned nLoops);
LogicalResult hoistPaddingOnTensors(SimplePadOp &simplePadOp, unsigned nLoops);

} // namespace linalg
} // namespace mlir
1 change: 0 additions & 1 deletion mlir/lib/Analysis/CMakeLists.txt
@@ -54,7 +54,6 @@ add_mlir_library(MLIRLoopAnalysis
MLIRCallInterfaces
MLIRControlFlowInterfaces
MLIRInferTypeOpInterface
MLIRLinalg
MLIRPresburger
MLIRSCF
)
3 changes: 1 addition & 2 deletions mlir/lib/Analysis/SliceAnalysis.cpp
@@ -86,8 +86,7 @@ static void getBackwardSliceImpl(Operation *op,
return;

assert((op->getNumRegions() == 0 ||
isa<AffineForOp, scf::ForOp, linalg::LinalgOp, linalg::PadTensorOp>(
op)) &&
isa<AffineForOp, scf::ForOp, linalg::LinalgOp>(op)) &&
"unexpected generic op with regions");

// Evaluate whether we should keep this def.
99 changes: 76 additions & 23 deletions mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp
@@ -337,7 +337,7 @@ void mlir::linalg::hoistRedundantVectorTransfers(FuncOp func) {

/// Ensure prerequisites that guarantee pad op hoisting can occur.
/// Return failure in the cases when we cannot perform hoisting; i.e. if either:
/// 1. There exists a use of `padTensorOp` that is not a linalg input operand.
/// 1. There exists a use of `simplePadOp` that is not a linalg input operand.
/// 2. There isn't an enclosing `outermostEnclosingForOp` loop.
/// 3. There exists an op with a region that is dominated by
/// `outermostEnclosingForOp` and that isn't a LoopLikeInterface or a
@@ -353,12 +353,12 @@ void mlir::linalg::hoistRedundantVectorTransfers(FuncOp func) {
/// remain in `backwardSlice` but that are not in `packingLoops` are
/// dimensions of reuse.
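As a hypothetical illustration of condition 1 (value names and shapes are invented, in the same pseudo-IR style as the comment above): if the padded value only feeds an output operand of a linalg op, isInputTensor is false for that use, the prerequisites return failure, and no hoisting is attempted.

  scf.for %k = %c0 to %ub step %c1 iter_args(%acc = %init) -> (tensor<?x?xf32>) {
    %st = subtensor %acc[0, 0] [%m, %n] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
    %p = linalg.simple_pad %st pad %cst : tensor<?x?xf32> to tensor<2x3xf32> pad f32
    // %p is only used as `outs`, never as `ins`, so this simple_pad is not hoisted.
    %r = linalg.matmul ins(%a, %b : tensor<2x4xf32>, tensor<4x3xf32>)
                       outs(%p : tensor<2x3xf32>) -> tensor<2x3xf32>
    %res = subtensor_insert %r into %acc[0, 0] [2, 3] [1, 1] : tensor<2x3xf32> into tensor<?x?xf32>
    scf.yield %res : tensor<?x?xf32>
  }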
static LogicalResult
hoistPaddingOnTensorsPrerequisites(linalg::PadTensorOp padTensorOp, int nLevels,
hoistPaddingOnTensorsPrerequisites(linalg::SimplePadOp simplePadOp, int nLevels,
llvm::SetVector<Operation *> &backwardSlice,
llvm::SetVector<Operation *> &packingLoops) {
// Bail on any use that isn't an input of a Linalg op.
// Hoisting of inplace updates happens after vectorization.
for (OpOperand &use : padTensorOp.result().getUses()) {
for (OpOperand &use : simplePadOp.result().getUses()) {
auto linalgUser = dyn_cast<linalg::LinalgOp>(use.getOwner());
if (!linalgUser || !linalgUser.isInputTensor(&use))
return failure();
@@ -368,7 +368,7 @@ hoistPaddingOnTensorsPrerequisites(linalg::PadTensorOp padTensorOp, int nLevels,
SmallVector<LoopLikeOpInterface> reverseEnclosingLoops;
Operation *outermostEnclosingForOp = nullptr,
*nextEnclosingForOp =
padTensorOp->getParentOfType<LoopLikeOpInterface>();
simplePadOp->getParentOfType<LoopLikeOpInterface>();
while (nLevels-- > 0 && nextEnclosingForOp) {
outermostEnclosingForOp = nextEnclosingForOp;
reverseEnclosingLoops.push_back(outermostEnclosingForOp);
@@ -378,20 +378,37 @@ hoistPaddingOnTensorsPrerequisites(linalg::PadTensorOp padTensorOp, int nLevels,
if (!outermostEnclosingForOp)
return failure();

// Get the backwards slice from `padTensorOp` that is dominated by the
// Get the backwards slice from `simplePadOp` that is dominated by the
// outermost enclosing loop.
DominanceInfo domInfo(outermostEnclosingForOp);
getBackwardSlice(padTensorOp, &backwardSlice, [&](Operation *op) {
getBackwardSlice(simplePadOp, &backwardSlice, [&](Operation *op) {
return domInfo.dominates(outermostEnclosingForOp, op);
});

#if 0

// Bail on any op with a region that is not a LoopLikeInterface or a LinalgOp.
// Bail on any op with side effects that is not a LoopLikeInterface.
if (llvm::any_of(backwardSlice, [](Operation *op) {
if (isa<LoopLikeOpInterface>(op))
return false;
if (!MemoryEffectOpInterface::hasNoEffect(op))
return true;
return op->getNumRegions() > 0 && !isa<LinalgOp>(op);
}))
return failure();

#else

// Bail on any op with a region that is not a LoopLikeInterface or a LinalgOp.
if (llvm::any_of(backwardSlice, [](Operation *op) {
return op->getNumRegions() > 0 && !isa<LoopLikeOpInterface>(op) &&
!isa<LinalgOp>(op);
}))
return failure();

#endif
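A hypothetical sketch of an op that trips the region-based check in the #else branch above (names invented): an scf.if in the backward slice has a region but is neither a LoopLikeOpInterface nor a LinalgOp, so the prerequisites return failure.

  %sz = scf.if %cond -> (index) {
    scf.yield %a : index
  } else {
    scf.yield %b : index
  }
  %st = subtensor %t[0, 0] [%sz, %n] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
  %p = linalg.simple_pad %st pad %cst : tensor<?x?xf32> to tensor<2x4xf32> pad f32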

// Filter out the loops whose induction variable is not used to compute the
// padded result. As a first approximation, just look for IVs that have no use
// in the backwardSlice.
@@ -427,18 +444,54 @@ static Value buildLoopTripCount(OpBuilder &b, Operation *op) {
ValueRange{forOp.lowerBound(), forOp.upperBound(), forOp.step()});
}

LogicalResult mlir::linalg::hoistPaddingOnTensors(PadTensorOp &padTensorOp,
/// Mechanically hoist padding operations on tensors by at most `nLoops` into a
/// new, generally larger tensor. This achieves packing of multiple padding ops
/// into a larger tensor. On success, `simplePadOp` is replaced by the cloned
/// version in the packing loop so the caller can continue reasoning about the
/// padding operation.
///
/// Example in pseudo-mlir:
/// =======================
///
/// If hoistPaddingOnTensors is called with `nLoops` = 2 on the following IR.
/// ```
/// scf.for (%i, %j, %k)
/// %st0 = subtensor f(%i, %k) : ... to tensor<?x?xf32>
/// %0 = linalg.simple_pad %st0 pad %pad :
/// tensor<?x?xf32> to tensor<4x8xf32>
/// compute(%0)
/// ```
///
/// IR resembling the following is produced:
///
/// ```
/// scf.for (%i) {
/// %packed_init = linalg.init_tensor range(%j) : tensor<?x4x8xf32>
/// %packed = scf.for (%k) iter_args(%p : %packed_init)
/// %st0 = subtensor f(%i, %k) : ... to tensor<?x?xf32>
/// %0 = linalg.simple_pad %st0 pad %pad :
/// tensor<?x?xf32> to tensor<4x8xf32>
/// scf.yield %1: tensor<?x4x8xf32>
/// } -> tensor<?x4x8xf32>
/// scf.for (%j, %k) {
/// %st0 = subtensor %packed [%k, 0, 0][1, 4, 8][1, 1, 1] :
/// tensor<?x4x8xf32> to tensor<4x8xf32>
/// compute(%st0)
/// }
/// }
/// ```
LogicalResult mlir::linalg::hoistPaddingOnTensors(SimplePadOp &simplePadOp,
unsigned nLoops) {
llvm::SetVector<Operation *> backwardSlice, packingLoops;
if (failed(hoistPaddingOnTensorsPrerequisites(padTensorOp, nLoops,
if (failed(hoistPaddingOnTensorsPrerequisites(simplePadOp, nLoops,
backwardSlice, packingLoops)))
return failure();

// Update actual number of loops, which may be smaller.
nLoops = packingLoops.size();

Location loc = padTensorOp->getLoc();
RankedTensorType paddedTensorType = padTensorOp.getResultType();
Location loc = simplePadOp->getLoc();
RankedTensorType paddedTensorType = simplePadOp.getResultType();
unsigned paddedRank = paddedTensorType.getRank();

// Backward slice is a topologically sorted list of ops starting at
@@ -450,7 +503,7 @@ LogicalResult mlir::linalg::hoistPaddingOnTensors(PadTensorOp &padTensorOp,
// Create the packed tensor<?x?x..?xpadded_shape> into which we amortize
// padding.
SmallVector<int64_t> packedShape(nLoops, ShapedType::kDynamicSize);
// TODO: go grab dims when necessary, for now PadTensorOp returns a static
// TODO: go grab dims when necessary, for now SimplePadOp returns a static
// tensor.
llvm::append_range(packedShape, paddedTensorType.getShape());
auto packedTensorType =
@@ -473,10 +526,10 @@ LogicalResult mlir::linalg::hoistPaddingOnTensors(PadTensorOp &padTensorOp,
clonedLoopIvs.reserve(nLoops);
BlockAndValueMapping bvm;
// Stack step 1. iteratively clone loops and push `packedTensor`.
// Insert `padTensorOp` into the backwardSlice so we clone it too.
backwardSlice.insert(padTensorOp);
// Insert `simplePadOp` into the backwardSlice so we clone it too.
backwardSlice.insert(simplePadOp);
for (Operation *op : backwardSlice) {
if (op->getNumRegions() == 0 || isa<linalg::PadTensorOp>(op)) {
if (op->getNumRegions() == 0) {
b.clone(*op, bvm);
continue;
}
@@ -503,7 +556,7 @@ LogicalResult mlir::linalg::hoistPaddingOnTensors(PadTensorOp &padTensorOp,
// sizes = [1 .. 1, paddedShape].
SmallVector<OpFoldResult> sizes(nLoops, b.getIndexAttr(1));
for (int64_t sz : paddedTensorType.getShape()) {
// TODO: go grab dims when necessary, for now PadTensorOp returns a static
// TODO: go grab dims when necessary, for now SimplePadOp returns a static
// tensor.
assert(!ShapedType::isDynamic(sz) && "padded tensor needs static sizes");
sizes.push_back(b.getIndexAttr(sz));
@@ -512,7 +565,7 @@ LogicalResult mlir::linalg::hoistPaddingOnTensors(PadTensorOp &padTensorOp,
SmallVector<OpFoldResult> strides(nLoops + paddedRank, b.getIndexAttr(1));

Value inserted =
b.create<SubTensorInsertOp>(loc, bvm.lookup(padTensorOp.result()),
b.create<SubTensorInsertOp>(loc, bvm.lookup(simplePadOp.result()),
packedTensor, offsets, sizes, strides);

// Stack step 3. iteratively pop the stack and propagate the yield.
@@ -526,7 +579,7 @@ LogicalResult mlir::linalg::hoistPaddingOnTensors(PadTensorOp &padTensorOp,

// Now the packed tensor is ready, replace the original padding op by a
// 1x..x1 SubTensor [originalLoopIvs, 0 .. 0][1 .. 1, paddedShape][1 .. 1].
b.setInsertionPoint(padTensorOp);
b.setInsertionPoint(simplePadOp);
SmallVector<Value> originalLoopIvs =
llvm::to_vector<4>(llvm::map_range(packingLoops, [](Operation *loop) {
return cast<scf::ForOp>(loop).getInductionVar();
@@ -538,16 +591,16 @@ LogicalResult mlir::linalg::hoistPaddingOnTensors(PadTensorOp &padTensorOp,
// strides = [1 .. 1] (defined above)
packedTensor =
scf::getForInductionVarOwner(clonedLoopIvs.front())->getResult(0);
padTensorOp.replaceAllUsesWith(
b.create<SubTensorOp>(loc, padTensorOp.getResultType(), packedTensor,
simplePadOp.replaceAllUsesWith(
b.create<SubTensorOp>(loc, simplePadOp.getResultType(), packedTensor,
offsets, sizes, strides)
->getResult(0));

Operation *toErase = padTensorOp;
Operation *toErase = simplePadOp;

// Make the newly cloned `padTensorOp` available to the caller.
padTensorOp =
cast<PadTensorOp>(bvm.lookup(padTensorOp.result()).getDefiningOp());
// Make the newly cloned `simplePadOp` available to the caller.
simplePadOp =
cast<SimplePadOp>(bvm.lookup(simplePadOp.result()).getDefiningOp());

toErase->erase();

39 changes: 11 additions & 28 deletions mlir/test/Dialect/Linalg/hoist-padding.mlir
@@ -27,8 +27,7 @@ func @matmul_tensors(
// CHECK: %[[A:.*]] = scf.for
// CHECK-NOT: scf.for
// CHECK: subtensor %{{.*}} [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
// CHECK: linalg.pad_tensor %{{.*}}
// CHECK: : tensor<?x?xf32> to tensor<2x4xf32>
// CHECK: linalg.simple_pad %{{.*}} : tensor<?x?xf32> to tensor<2x4xf32> pad f32
// CHECK: subtensor_insert %{{.*}} into %{{.*}}[%{{.*}}, 0, 0]
// CHECK-SAME: [1, 2, 4] [1, 1, 1] : tensor<2x4xf32> into tensor<?x2x4xf32>
// 2-D loop
@@ -37,8 +36,7 @@ func @matmul_tensors(
// CHECK: scf.for
// CHECK-NOT: scf.for
// CHECK: subtensor %{{.*}} [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
// CHECK: linalg.pad_tensor %{{.*}}
// CHECK: : tensor<?x?xf32> to tensor<4x3xf32>
// CHECK: linalg.simple_pad %{{.*}} : tensor<?x?xf32> to tensor<4x3xf32> pad f32
// CHECK: subtensor_insert %{{.*}} into %{{.*}}[%{{.*}}, %{{.*}}, 0, 0]
// CHECK-SAME: [1, 1, 4, 3] [1, 1, 1, 1] : tensor<4x3xf32> into tensor<?x?x4x3xf32>
// 2-D loop
@@ -49,8 +47,8 @@ func @matmul_tensors(
// CHECK-SAME: tensor<?x2x4xf32> to tensor<2x4xf32>
// CHECK: %[[stB:.*]] = subtensor %[[B]][%[[K]], %[[J]], 0, 0] [1, 1, 4, 3] [1, 1, 1, 1] :
// CHECK-SAME: tensor<?x?x4x3xf32> to tensor<4x3xf32>
// CHECK: %[[stC:.*]] = linalg.pad_tensor %{{.*}}
// CHECK: : tensor<?x?xf32> to tensor<2x3xf32>
// CHECK: %[[stC:.*]] = linalg.simple_pad %{{.*}} pad %{{.*}} :
// CHECK-SAME: tensor<?x?xf32> to tensor<2x3xf32> pad f32
// CHECK: linalg.matmul ins(%[[stA]], %[[stB]] : tensor<2x4xf32>, tensor<4x3xf32>)
// CHECK-SAME: outs(%[[stC]] : tensor<2x3xf32>) -> tensor<2x3xf32>
%3 = scf.for %arg3 = %c0 to %0 step %c2 iter_args(%arg4 = %arg2) -> (tensor<?x?xf32>) {
@@ -71,28 +69,13 @@ func @matmul_tensors(
%18 = dim %arg8, %c1 : tensor<?x?xf32>
%19 = affine.min #map4(%18, %arg5)
%20 = subtensor %arg8[%arg3, %arg5] [%17, %19] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
%21 = subi %c2, %7 : index
%22 = subi %c4, %9 : index
%23 = linalg.pad_tensor %10 low[%c0, %c0] high[%21, %22] {
^bb0(%arg9: index, %arg10: index): // no predecessors
linalg.yield %cst : f32
} : tensor<?x?xf32> to tensor<2x4xf32>
%24 = subi %c4, %12 : index
%25 = subi %c3, %14 : index
%26 = linalg.pad_tensor %15 low[%c0, %c0] high[%24, %25] {
^bb0(%arg9: index, %arg10: index): // no predecessors
linalg.yield %cst : f32
} : tensor<?x?xf32> to tensor<4x3xf32>
%27 = subi %c2, %17 : index
%28 = subi %c3, %19 : index
%29 = linalg.pad_tensor %20 low[%c0, %c0] high[%27, %28] {
^bb0(%arg9: index, %arg10: index): // no predecessors
linalg.yield %cst : f32
} : tensor<?x?xf32> to tensor<2x3xf32>
%30 = linalg.matmul ins(%23, %26 : tensor<2x4xf32>, tensor<4x3xf32>) outs(%29 : tensor<2x3xf32>) -> tensor<2x3xf32>
%31 = subtensor %30[0, 0] [%7, %14] [1, 1] : tensor<2x3xf32> to tensor<?x?xf32>
%32 = subtensor_insert %31 into %arg8[%arg3, %arg5] [%17, %19] [%c1, %c1] : tensor<?x?xf32> into tensor<?x?xf32>
scf.yield %32 : tensor<?x?xf32>
%21 = linalg.simple_pad %10 pad %cst : tensor<?x?xf32> to tensor<2x4xf32> pad f32
%22 = linalg.simple_pad %15 pad %cst : tensor<?x?xf32> to tensor<4x3xf32> pad f32
%23 = linalg.simple_pad %20 pad %cst : tensor<?x?xf32> to tensor<2x3xf32> pad f32
%24 = linalg.matmul ins(%21, %22 : tensor<2x4xf32>, tensor<4x3xf32>) outs(%23 : tensor<2x3xf32>) -> tensor<2x3xf32>
%25 = subtensor %24[0, 0] [%7, %14] [1, 1] : tensor<2x3xf32> to tensor<?x?xf32>
%26 = subtensor_insert %25 into %arg8[%arg3, %arg5] [%17, %19] [%c1, %c1] : tensor<?x?xf32> into tensor<?x?xf32>
scf.yield %26 : tensor<?x?xf32>
}
scf.yield %5 : tensor<?x?xf32>
}
