70 changes: 50 additions & 20 deletions mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
@@ -164,6 +164,47 @@ static Value insertSliceIntoTensor(OpBuilder &b, Location loc,
sliceOp.static_sizes(), sliceOp.static_strides());
}

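// Collects the values to yield from the tiled loop body. The generic version,
// used for scf.for/scf.parallel nests, re-inserts each computed tile into the
// corresponding output tensor via tensor.insert_slice (when the output was
// sliced), yields the resulting full tensors, and mirrors them into
// `tensorResults` for the caller. The TiledLoopOp specialization below instead
// yields each tile together with the tiled output it updates, matching the
// tile/output pairs of linalg.tiled_yield.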
template <typename LoopTy>
static SmallVector<Value, 4>
collectLoopYieldArgs(OpBuilder &b, LinalgOp clonedOp,
ArrayRef<Value> tiledOperands,
SmallVectorImpl<Value> &tensorResults) {

Location loc = clonedOp.getLoc();
SmallVector<Value, 4> yieldArgs;
unsigned resultIdx = 0;
for (OpOperand *opOperand : clonedOp.getOutputTensorOperands()) {
// TODO: use an interface/adaptor to avoid leaking position in
// `tiledOperands`.
Value outputTensor = tiledOperands[opOperand->getOperandNumber()];
// Insert an insert_slice for each output tensor.
if (auto sliceOp = outputTensor.getDefiningOp<tensor::ExtractSliceOp>()) {
yieldArgs.push_back(insertSliceIntoTensor(
b, loc, sliceOp, clonedOp->getResult(resultIdx), sliceOp.source()));
} else {
yieldArgs.push_back(clonedOp->getResult(resultIdx));
}
++resultIdx;
}
tensorResults = yieldArgs;
return yieldArgs;
}

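// TiledLoopOp keeps the tile/output pairing in its terminator, so no
// insert_slice is created here: yield the computed tiles first, followed by
// the tiled output operands they update.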
template <>
SmallVector<Value, 4>
collectLoopYieldArgs<TiledLoopOp>(OpBuilder &b, LinalgOp clonedOp,
ArrayRef<Value> tiledOperands,
SmallVectorImpl<Value> &tensorResults) {
auto outputTensorOperands = clonedOp.getOutputTensorOperands();
size_t numOutputTensors = outputTensorOperands.size();

SmallVector<Value, 4> yieldArgs(clonedOp->getResults());
auto tiledOutputOperands = tiledOperands.take_back(numOutputTensors);
yieldArgs.append(tiledOutputOperands.begin(), tiledOutputOperands.end());

return yieldArgs;
}

template <typename LoopTy>
static Optional<TiledLinalgOp>
tileLinalgOpImpl(OpBuilder &b, LinalgOp op, ValueRange tileSizes,
Expand Down Expand Up @@ -224,7 +265,7 @@ tileLinalgOpImpl(OpBuilder &b, LinalgOp op, ValueRange tileSizes,
}

// 2. Create the tiled loops.
LinalgOp res = op;
LinalgOp clonedOp = op;
SmallVector<Value, 4> ivs, tensorResults;
auto tiledLoopBodyBuilder = [&](OpBuilder &b, Location loc,
ValueRange localIvs,
Expand Down Expand Up @@ -262,30 +303,18 @@ tileLinalgOpImpl(OpBuilder &b, LinalgOp op, ValueRange tileSizes,
resultTensorTypes.push_back(
tiledOperands[opOperand->getOperandNumber()].getType());

res = op.clone(b, loc, resultTensorTypes, tiledOperands);
clonedOp = op.clone(b, loc, resultTensorTypes, tiledOperands);

// Insert a insert_slice for each output tensor.
unsigned resultIdx = 0;
for (OpOperand *opOperand : op.getOutputTensorOperands()) {
// TODO: use an interface/adaptor to avoid leaking position in
// `tiledOperands`.
Value outputTensor = tiledOperands[opOperand->getOperandNumber()];
if (auto sliceOp = outputTensor.getDefiningOp<tensor::ExtractSliceOp>()) {
tensorResults.push_back(insertSliceIntoTensor(
b, loc, sliceOp, res->getResult(resultIdx), sliceOp.source()));
} else {
tensorResults.push_back(res->getResult(resultIdx));
}
++resultIdx;
}
return scf::ValueVector(tensorResults.begin(), tensorResults.end());
auto yieldArgs =
collectLoopYieldArgs<LoopTy>(b, clonedOp, tiledOperands, tensorResults);
return {yieldArgs.begin(), yieldArgs.end()};
};
GenerateLoopNest<LoopTy>::doit(b, op.getLoc(), loopRanges, op, iteratorTypes,
tiledLoopBodyBuilder, options.distribution,
options.distributionTypes);

// 3. Transform IndexOp results w.r.t. the tiling.
transformIndexOps(b, res, ivs, loopIndexToRangeIndex);
transformIndexOps(b, clonedOp, ivs, loopIndexToRangeIndex);

// 4. Gather the newly created loops and return them with the new op.
SmallVector<Operation *, 8> loops;
@@ -308,8 +337,9 @@ tileLinalgOpImpl(OpBuilder &b, LinalgOp op, ValueRange tileSizes,
if ((outermostLoop = loop))
break;

return TiledLinalgOp{
res, loops, outermostLoop ? outermostLoop->getResults() : tensorResults};
return TiledLinalgOp{clonedOp, loops,
outermostLoop ? outermostLoop->getResults()
: tensorResults};
}

template <typename LoopTy>
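Note: for context, a minimal sketch of the tensor-form IR the new terminator
produces. It is illustrative only, not taken from the patch; the function name
and shapes are made up, and the body uses the fill-based pattern from the tests
below.

func @tiled_fill(%out: tensor<8xf32>) -> tensor<8xf32> {
  %c0 = constant 0 : index
  %c4 = constant 4 : index
  %c8 = constant 8 : index
  %cst = constant 0.0 : f32
  %result = linalg.tiled_loop (%i) = (%c0) to (%c8) step (%c4)
      outs (%out_ = %out: tensor<8xf32>) {
    %out_sub = tensor.extract_slice %out_[%i][4][1]
        : tensor<8xf32> to tensor<4xf32>
    %tile = linalg.fill(%cst, %out_sub) : f32, tensor<4xf32> -> tensor<4xf32>
    // Each tile is yielded together with the subset of the output it updates;
    // the verifier checks that %out_sub is a subset of the block arg %out_.
    linalg.tiled_yield %tile in %out_sub : tensor<4xf32>
  }
  return %result : tensor<8xf32>
}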
7 changes: 5 additions & 2 deletions mlir/lib/Dialect/Linalg/Utils/Utils.cpp
@@ -311,9 +311,12 @@ void GenerateLoopNest<TiledLoopOp>::doit(
ValueRange ivs, ValueRange inputs,
ValueRange outputs) {
SmallVector<Value> outputTensors = linalgOp.getOutputTensorOperands();
scf::ValueVector results =
scf::ValueVector yieldArgs =
bodyBuilderFn(nestedBuilder, nestedLoc, ivs, outputTensors);
nestedBuilder.create<linalg::YieldOp>(nestedLoc, results);
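// The body builder returns the tiles followed by their destination outputs;
// split them to form the tile/output pairs of linalg.tiled_yield.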
auto yieldArgsRef = llvm::makeArrayRef(yieldArgs);
nestedBuilder.create<linalg::TiledYieldOp>(
nestedLoc, yieldArgsRef.take_front(outputTensors.size()),
yieldArgsRef.drop_front(outputTensors.size()));
};

SmallVector<Value> inputOperands = linalgOp.getInputOperands();
11 changes: 6 additions & 5 deletions mlir/test/Dialect/Linalg/canonicalize.mlir
@@ -36,7 +36,7 @@ func @memref_cast_into_tiled_loop(%arg0: memref<192xf32>) {
%16 = memref.subview %out[%arg3] [%14] [1]
: memref<192xf32, #map> to memref<?xf32, #map>
linalg.fill(%cst, %16) : f32, memref<?xf32, #map>
linalg.yield
linalg.tiled_yield
}
return
}
@@ -706,8 +706,9 @@ func @fold_tiled_loop_results(%A: memref<48xf32>, %B: tensor<48xf32>,
%CT_ = %C_tensor: tensor<48xf32>,
%C_ = %C: memref<48xf32>) {
%result = call @foo(%A_, %B_, %C_)
: (memref<48xf32>, tensor<48xf32>, memref<48xf32>)-> (tensor<48xf32>)
linalg.yield %result, %CT_ : tensor<48xf32>, tensor<48xf32>
: (memref<48xf32>, tensor<48xf32>, memref<48xf32>) -> (tensor<48xf32>)
linalg.tiled_yield %result in %B_ : tensor<48xf32>,
%CT_ in %CT_ : tensor<48xf32>
}
return %useful : tensor<48xf32>
}
@@ -726,7 +727,7 @@ func @fold_tiled_loop_results(%A: memref<48xf32>, %B: tensor<48xf32>,
// CHECK-SAME: ins (%[[A_:.*]] = %[[A]]: [[BUF_TY]])
// CHECK-SAME: outs (%[[B_:.*]] = %[[B]]: [[TY]], %[[C_:.*]] = %[[C]]: [[BUF_TY]]) {
// CHECK-NEXT: %[[RES:.*]] = call @foo(%[[A_]], %[[B_]], %[[C_]])
// CHECK-NEXT: linalg.yield %[[RES]] :
// CHECK-NEXT: linalg.tiled_yield %[[RES]] in %[[B_]]

// CHECK: return %[[RESULT]]

@@ -743,7 +744,7 @@ func @fold_tiled_loop_inputs(%A: memref<192xf32>, %A_tensor: tensor<192xf32>,
ins (%A_ = %A: memref<192xf32>, %AT_ = %A_tensor: tensor<192xf32>)
outs (%BT_ = %B_tensor: tensor<192xf32>) {
%0 = call @foo(%A_, %BT_) : (memref<192xf32>, tensor<192xf32>) -> tensor<192xf32>
linalg.yield %0 : tensor<192xf32>
linalg.tiled_yield %0 in %BT_ : tensor<192xf32>
}
return %result : tensor<192xf32>
}
@@ -507,25 +507,25 @@ func @scf_for_deps(%A : tensor<?xf32> {linalg.inplaceable = true},
// of %r3 is read.
// CHECK: linalg.tiled_loop
// CHECK-NEXT: call
// CHECK-NEXT: linalg.yield
// CHECK-NEXT: linalg.tiled_yield
// CHECK-NEXT: {__inplace_results_attr__ = ["false"]}
%r2 = linalg.tiled_loop (%i) = (%lb) to (%ub) step (%step)
ins()
outs(%t = %B: tensor<?xf32>) {
call @some_use(%t) : (tensor<?xf32>) -> ()
linalg.yield %t : tensor<?xf32>
linalg.tiled_yield %t in %t : tensor<?xf32>
}

// %r3 bufferizes inplace fine.
// CHECK: linalg.tiled_loop
// CHECK-NEXT: call
// CHECK-NEXT: linalg.yield
// CHECK-NEXT: linalg.tiled_yield
// CHECK-NEXT: {__inplace_results_attr__ = ["true"]}
%r3 = linalg.tiled_loop (%i) = (%lb) to (%ub) step (%step)
ins()
outs(%t = %B: tensor<?xf32>) {
call @some_use(%t) : (tensor<?xf32>) -> ()
linalg.yield %t : tensor<?xf32>
linalg.tiled_yield %t in %t : tensor<?xf32>
}

return %r1, %r3: tensor<?xf32>, tensor<?xf32>
11 changes: 6 additions & 5 deletions mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir
@@ -550,10 +550,11 @@ func @tiled_dot(%A: tensor<?xf32>, %B: tensor<?xf32>, %c: tensor<f32> {linalg.in

// CHECK: linalg.tiled_loop {{.*}} to (%[[M]]) {{.*}} %[[A]]{{.*}}%[[B]]{{.*}}outs{{.*}}%[[c]]
%1 = linalg.tiled_loop (%arg3) = (%c0) to (%0) step (%c3)
ins (%arg4 = %A: tensor<?xf32>, %use = %effecting : memref<?xf32>, %arg5 = %B: tensor<?xf32>)
ins (%arg4 = %A: tensor<?xf32>,
%use = %effecting : memref<?xf32>,
%arg5 = %B: tensor<?xf32>)
outs (%arg6 = %c: tensor<f32>)
iterators["reduction"]
{
iterators["reduction"] {
// CHECK-NOT: alloc

%2 = tensor.dim %arg4, %c0 : tensor<?xf32>
@@ -573,8 +574,8 @@ func @tiled_dot(%A: tensor<?xf32>, %B: tensor<?xf32>, %c: tensor<f32> {linalg.in
// CHECK: call @some_use(%{{.*}}) : (memref<?xf32>) -> ()
call @some_use(%use) : (memref<?xf32>) -> ()

linalg.yield %8 : tensor<f32>
// CHECK: linalg.yield
linalg.tiled_yield %8 in %arg6 : tensor<f32>
// CHECK: linalg.tiled_yield
// CHECK-NOT: tensor
}

2 changes: 1 addition & 1 deletion mlir/test/Dialect/Linalg/distribute-tiled-loop.mlir
@@ -14,7 +14,7 @@ func @distribute_for_gpu(%A: tensor<64x64xf32>,
distribution ["block_x", "block_y"] {
%0 = call @foo(%A_, %B_)
: (tensor<64x64xf32>, tensor<64x64xf32>) -> tensor<64x64xf32>
linalg.yield %0 : tensor<64x64xf32>
linalg.tiled_yield %0 in %B_ : tensor<64x64xf32>
}
return %0 : tensor<64x64xf32>
}
38 changes: 15 additions & 23 deletions mlir/test/Dialect/Linalg/fusion-tensor-pattern.mlir
@@ -1,5 +1,5 @@
// RUN: mlir-opt %s -test-linalg-tensor-fusion-transform-patterns -resolve-shaped-type-result-dims -canonicalize -cse --split-input-file | FileCheck %s
// RUN: mlir-opt %s -test-linalg-tiled-loop-fusion-transform-patterns -resolve-shaped-type-result-dims -canonicalize -cse --split-input-file | FileCheck %s --check-prefix=TLOOP
// RUN: mlir-opt %s -test-linalg-tiled-loop-fusion-transform-patterns -resolve-shaped-type-result-dims -canonicalize -cse --split-input-file --mlir-disable-threading | FileCheck %s --check-prefix=TLOOP

module {
func @matmul_fusion(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>,
@@ -124,7 +124,7 @@ module {
// TLOOP: %[[DIM_B_1:.*]] = tensor.dim %[[B_]], %[[C1]] : [[TY]]
// TLOOP: %[[DIM_C_1:.*]] = tensor.dim %[[C_]], %[[C1]] : [[TY]]

// TLOOP: %[[ABC_SUB_:.*]] = linalg.tiled_loop (%[[IV1:.*]], %[[IV2:.*]]) =
// TLOOP: %[[ABC_SUB:.*]] = linalg.tiled_loop (%[[IV1:.*]], %[[IV2:.*]]) =
// TLOOP-SAME: (%[[C0]], %[[C0]]) to (%[[DIM_C_1]], %[[DIM_B_1]])
// TLOOP-SAME: step (%[[C64]], %[[C16]])
// TLOOP-SAME: ins (%[[AB_SUB_:.*]] = %[[AB_SUB]]: [[TY]],
Expand All @@ -134,18 +134,15 @@ module {

// TLOOP: %[[AB_SUB_SUB:.*]] = tensor.extract_slice %[[AB_SUB_]][0, %[[IV2]]]
// TLOOP: %[[C__SUB:.*]] = tensor.extract_slice %[[C__]][%[[IV2]], %[[IV1]]]
// TLOOP: %[[ABS_INIT_SUB_SUB:.*]] = tensor.extract_slice %[[ABC_INIT_SUB_]][0, %[[IV1]]]
// TLOOP: %[[ABC_INIT_SUB_SUB:.*]] = tensor.extract_slice %[[ABC_INIT_SUB_]][0, %[[IV1]]]

// TLOOP: %[[ABC_SUB_SUB:.*]] = linalg.matmul
// TLOOP-SAME: ins(%[[AB_SUB_SUB]], %[[C__SUB]] : [[TY]], [[TY]])
// TLOOP-SAME: outs(%[[ABS_INIT_SUB_SUB]] : [[TY]]) -> [[TY]]
// TLOOP-SAME: outs(%[[ABC_INIT_SUB_SUB]] : [[TY]]) -> [[TY]]

// TLOOP: %[[RES0:.*]] = tensor.insert_slice %[[ABC_SUB_SUB]]
// TLOOP-SAME: into %[[ABC_INIT_SUB_]][0, %[[IV1]]]
// TLOOP: linalg.yield %[[RES0]] : [[TY]]
// TLOOP: linalg.tiled_yield %[[ABC_SUB_SUB]] in %[[ABC_INIT_SUB_SUB]] : [[TY]]
// TLOOP: }
// TLOOP: %[[RES1:.*]] = tensor.insert_slice %[[ABC_SUB_]] into %[[ABC_INIT_]][%[[IV0]], 0]
// TLOOP: linalg.yield %[[RES1]] : [[TY]]
// TLOOP: linalg.tiled_yield %[[ABC_SUB]] in %[[ABC_INIT_SUB]] : [[TY]]
// TLOOP: }
// TLOOP: return %[[ABC]] : [[TY]]

@@ -238,10 +235,7 @@ module {
// TLOOP: %[[DOUBLE_AB:.*]] = linalg.generic
// TLOOP-SAME: ins(%[[AB_SUB]] : [[TY]]) outs(%[[INIT_SUB]] : [[TY]])

// TLOOP: %[[RESULT_SUB:.*]] = tensor.insert_slice
// TLOOP-SAME: %[[DOUBLE_AB:.*]] into %[[INIT_]][%[[IV0]], %[[IV1]]]

// TLOOP: linalg.yield %[[RESULT_SUB]] : [[TY]]
// TLOOP: linalg.tiled_yield %[[DOUBLE_AB]] in %[[INIT_SUB]] : [[TY]]
// TLOOP: }
// TLOOP: return %[[RESULT]] : [[TY]]

@@ -304,7 +298,8 @@ module {
// TLOOP: %[[A_SUB:.*]] = tensor.extract_slice %[[A_]][%[[I]], 0]
// TLOOP: %[[B_SUB:.*]] = tensor.extract_slice %[[B_]][0, %[[J]]]
// TLOOP: %[[OUT_SUB:.*]] = tensor.extract_slice %[[OUT_]][%[[I]], %[[J]]]
// TLOOP: %[[INIT_SUB:.*]] = linalg.fill(%[[C0_F32_]], %[[OUT_SUB]])
// TLOOP: %[[OUT_SUB_2:.*]] = tensor.extract_slice %[[OUT_]][%[[I]], %[[J]]]
// TLOOP: %[[INIT_SUB:.*]] = linalg.fill(%[[C0_F32_]], %[[OUT_SUB_2]])

// TLOOP: %[[AB_SUB:.*]] = linalg.tiled_loop (%[[K:.*]]) = (%[[C0]])
// TLOOP-SAME: to (%[[DIM_A__1]]) step (%[[C16]])
@@ -319,11 +314,9 @@ module {
// TLOOP: %[[AB_SUB_SUB:.*]] = linalg.matmul
// TLOOP-SAME: ins(%[[A_SUB_SUB]], %[[B_SUB_SUB]] : [[TY]], [[TY]])
// TLOOP-SAME: outs(%[[INIT_SUB_]] : [[TY]]) -> [[TY]]
// TLOOP: linalg.yield %[[AB_SUB_SUB]] : [[TY]]
// TLOOP: linalg.tiled_yield %[[AB_SUB_SUB]] in %[[INIT_SUB_]] : [[TY]]
// TLOOP: }
// TLOOP: %[[SUB_RESULT:.*]] = tensor.insert_slice %[[AB_SUB]]
// TLOOP-SAME: into %[[OUT_]][%[[I]], %[[J]]]
// TLOOP: linalg.yield %[[SUB_RESULT]] : [[TY]]
// TLOOP: linalg.tiled_yield %[[AB_SUB]] in %[[OUT_SUB]] : [[TY]]
// TLOOP: }
// TLOOP: return %[[AB]] : [[TY]]

@@ -375,9 +368,10 @@ module {
// TLOOP: %[[A_SUB:.*]] = tensor.extract_slice %[[A_]][%[[I]], 0]
// TLOOP: %[[B_SUB:.*]] = tensor.extract_slice %[[B_]][0, %[[J]]]
// TLOOP: %[[OUT_SUB:.*]] = tensor.extract_slice %[[OUT_]][%[[I]], %[[J]]]
// TLOOP: %[[OUT_SUB_2:.*]] = tensor.extract_slice %[[OUT_]][%[[I]], %[[J]]]
// TLOOP: %[[INIT_SUB:.*]] = linalg.generic
// TLOOP-SAME: ins(%[[C0_F32_]]
// TLOOP-SAME: outs(%[[OUT_SUB]]
// TLOOP-SAME: outs(%[[OUT_SUB_2]]

// TLOOP: %[[AB_SUB:.*]] = linalg.tiled_loop (%[[K:.*]]) = (%[[C0]])
// TLOOP-SAME: to (%[[DIM_A__1]]) step (%[[C16]])
@@ -392,11 +386,9 @@ module {
// TLOOP: %[[AB_SUB_SUB:.*]] = linalg.matmul
// TLOOP-SAME: ins(%[[A_SUB_SUB]], %[[B_SUB_SUB]] : [[TY]], [[TY]])
// TLOOP-SAME: outs(%[[INIT_SUB_]] : [[TY]]) -> [[TY]]
// TLOOP: linalg.yield %[[AB_SUB_SUB]] : [[TY]]
// TLOOP: linalg.tiled_yield %[[AB_SUB_SUB]] in %[[INIT_SUB_]] : [[TY]]
// TLOOP: }
// TLOOP: %[[SUB_RESULT:.*]] = tensor.insert_slice %[[AB_SUB]]
// TLOOP-SAME: into %[[OUT_]][%[[I]], %[[J]]]
// TLOOP: linalg.yield %[[SUB_RESULT]] : [[TY]]
// TLOOP: linalg.tiled_yield %[[AB_SUB]] in %[[OUT_SUB]] : [[TY]]
// TLOOP: }
// TLOOP: return %[[AB]] : [[TY]]

44 changes: 32 additions & 12 deletions mlir/test/Dialect/Linalg/invalid.mlir
@@ -582,10 +582,6 @@ func @invalid_static_2d_conv(%input : memref<1x3x4x2xf32>, %filter: memref<3x2x2

// -----

#map0 = affine_map<(d0) -> (24, -d0 + 192)>
#map1 = affine_map<(d0, d1)[s0] -> (d0 * 192 + s0 + d1)>
#map2 = affine_map<(d0) -> (16, -d0 + 192)>

func private @foo(%A: memref<192x192xf32>, %B: memref<192x192xf32>,
%C: memref<192x192xf32>) -> ()

@@ -603,11 +599,34 @@ func @tiled_loop_incorrent_num_yield_operands(%A: memref<192x192xf32>,
call @foo(%A_, %B_, %C_)
: (memref<192x192xf32>, memref<192x192xf32>, memref<192x192xf32>)-> ()
// expected-error @+1 {{expected number of tensor output args = 1 to match the number of yield operands = 0}}
linalg.yield
linalg.tiled_yield
}
return
}

// -----

func @tiled_loop_incorrect_destination_for_tile(%A: tensor<4xf32>,
%B: tensor<4xf32>) {
%c2 = constant 2 : index
%c4 = constant 4 : index
%c0 = constant 0 : index
%0 = linalg.tiled_loop (%i) = (%c0) to (%c4) step (%c2)
ins (%A_ = %A: tensor<4xf32>)
outs (%B_ = %B: tensor<4xf32>) {
%A_sub = tensor.extract_slice %A_[%i][2][1]
: tensor<4xf32> to tensor<2xf32>
%B_sub = tensor.extract_slice %B_[%i][2][1]
: tensor<4xf32> to tensor<2xf32>
%c0_f32 = constant 0.0 : f32
%tile = linalg.fill(%c0_f32, %A_sub) : f32, tensor<2xf32> -> tensor<2xf32>
// expected-error @+1 {{expected output 0 to be a subset of the corresponding block argument}}
linalg.tiled_yield %tile in %A_sub : tensor<2xf32>
}
return
}


// -----

#map0 = affine_map<(d0) -> (24, -d0 + 192)>
@@ -630,16 +649,16 @@ func @tiled_loop_incorrent_yield_operand_type(%A: memref<192x192xf32>,
%C_ = %C: memref<192x192xf32>) {
%1 = call @foo(%A_, %B_, %C_)
: (memref<192x192xf32>, memref<192x192xf32>, memref<192x192xf32>)-> tensor<f32>
// expected-error @+1 {{expected yield operand 0 with type = 'tensor<f32>' to match output arg type = 'tensor<192x192xf32>}}
linalg.yield %1 : tensor<f32>
// expected-error @+1 {{expected tile operand with type = 'tensor<f32>' to match output type = 'tensor<192x192xf32>'}}
"linalg.tiled_yield" (%1, %CT_) : (tensor<f32>, tensor<192x192xf32>) -> ()
}
return
}

// -----

func private @foo(%A: memref<192x192xf32>, %B: memref<192x192xf32>,
%C: memref<192x192xf32>) -> ()
%C: memref<192x192xf32>) -> (tensor<192x192xf32>)

func @tiled_loop_incorrent_iterator_types_count(%A: memref<192x192xf32>,
%B: memref<192x192xf32>, %C: memref<192x192xf32>,
@@ -652,9 +671,10 @@ func @tiled_loop_incorrent_iterator_types_count(%A: memref<192x192xf32>,
^bb0(%arg4: index, %arg5: index, %A_: memref<192x192xf32>,
%B_: memref<192x192xf32>, %CT_: tensor<192x192xf32>,
%C_: memref<192x192xf32>):
call @foo(%A_, %B_, %C_)
: (memref<192x192xf32>, memref<192x192xf32>, memref<192x192xf32>)-> ()
linalg.yield %CT_ : tensor<192x192xf32>
%tile = call @foo(%A_, %B_, %C_)
: (memref<192x192xf32>, memref<192x192xf32>, memref<192x192xf32>)
-> (tensor<192x192xf32>)
linalg.tiled_yield %tile in %CT_ : tensor<192x192xf32>
}) {
iterator_types = ["parallel"],
operand_segment_sizes = dense<2> : vector<5xi32>
@@ -676,7 +696,7 @@ func @tiled_loop_incorrent_block_arg_type(%A: memref<192xf32>) {
"linalg.tiled_loop"(%c0, %c192, %c24, %A) ( {
^bb0(%arg4: index, %A_: memref<100xf32>):
call @foo(%A_) : (memref<100xf32>)-> ()
linalg.yield
linalg.tiled_yield
}) {
iterator_types = ["parallel"],
operand_segment_sizes = dense<[1, 1, 1, 0, 1]> : vector<5xi32>
10 changes: 3 additions & 7 deletions mlir/test/Dialect/Linalg/roundtrip.mlir
@@ -648,9 +648,7 @@ func @tiled_loop(%lhs: tensor<24x64xi8>, %rhs: tensor<24x64xi8>,
linalg.yield %s : i8
} -> tensor<?x?xi8>

%sum_sub = tensor.insert_slice %sum into %out_[%i, 0][%c4, %c64][1, 1]
: tensor<?x?xi8> into tensor<24x64xi8>
linalg.yield %sum_sub : tensor<24x64xi8>
linalg.tiled_yield %sum in %out_sub : tensor<?x?xi8>
}
return %prod : tensor<24x64xi8>
}
@@ -711,9 +709,7 @@ func @tiled_loop_reduction(%input_3d: tensor<16x24x32xf32>,
linalg.yield %1 : f32
} -> tensor<4xf32>

%sum_sub = tensor.insert_slice %acc into %o_[%j][%c4][1]
: tensor<4xf32> into tensor<24xf32>
linalg.yield %sum_sub : tensor<24xf32>
linalg.tiled_yield %acc in %sub_out : tensor<4xf32>
}
return %result : tensor<24xf32>
}
@@ -773,7 +769,7 @@ func @tiled_loop_on_buffers(%input_3d: memref<16x24x32xf32>,
%1 = addf %0, %i1d : f32
linalg.yield %1 : f32
}
linalg.yield
linalg.tiled_yield
}
return
}
3 changes: 1 addition & 2 deletions mlir/test/Dialect/Linalg/tile-tensors.mlir
@@ -58,8 +58,7 @@ func @matmul_tensors(
// TLOOP: %[[PROD:.*]] = linalg.matmul ins(%[[SUB_ARG_0]], %[[SUB_ARG_1]]
// TLOOP-SAME: outs(%[[SUB_ARG_2]] : [[TY]]) -> [[TY]]

// TLOOP: %[[O:.*]] = tensor.insert_slice %[[PROD]] into %[[A2]][%[[I]], %[[J]]]
// TLOOP: linalg.yield %[[O]] : [[TY]]
// TLOOP: linalg.tiled_yield %[[PROD]] in %[[SUB_ARG_2]] : [[TY]]

// -----

4 changes: 2 additions & 2 deletions mlir/test/Dialect/Linalg/tiled-loops.mlir
@@ -29,7 +29,7 @@ func @tiled_loop(%A: memref<192x192xf32>,
linalg.matmul ins(%1, %3 : memref<?x192xf32, #map1>,
memref<192x?xf32, #map1>)
outs(%4 : memref<?x?xf32, #map1>)
linalg.yield
linalg.tiled_yield
}
return
}
@@ -64,7 +64,7 @@ func @tiled_loop_reduction(%A: memref<192x192xf32>,
outs (%C_ = %C: memref<f32>)
iterators["reduction", "reduction"] {
linalg.fill(%cst, %A_) : f32, memref<192x192xf32>
linalg.yield
linalg.tiled_yield
}
return
}
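Note: for completeness, a minimal sketch of the buffer form, again illustrative
only (the function name and shapes are made up). On memrefs the loop body
updates subviews in place, so linalg.tiled_yield takes no operands.

#map = affine_map<(d0)[s0] -> (d0 + s0)>

func @fill_loop_on_buffers(%A: memref<192xf32>) {
  %c0 = constant 0 : index
  %c24 = constant 24 : index
  %c192 = constant 192 : index
  %cst = constant 0.0 : f32
  linalg.tiled_loop (%i) = (%c0) to (%c192) step (%c24)
      outs (%A_ = %A: memref<192xf32>) {
    %sub = memref.subview %A_[%i] [24] [1]
        : memref<192xf32> to memref<24xf32, #map>
    linalg.fill(%cst, %sub) : f32, memref<24xf32, #map>
    // Nothing to yield on buffers: the subview was written in place.
    linalg.tiled_yield
  }
  return
}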