Skip to content
18 changes: 16 additions & 2 deletions mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -453,10 +453,24 @@ static LogicalResult processParallelLoop(
1, 2,
rewriter.getAffineDimExpr(0) * rewriter.getAffineSymbolExpr(0) +
rewriter.getAffineSymbolExpr(1));
// Map through cloningMap first so we use values valid at the launch
// scope, then ensure they are launch-independent (or cloned constants).
Value mappedStep = cloningMap.lookupOrDefault(step);
Value mappedLowerBound = cloningMap.lookupOrDefault(lowerBound);

mappedStep = ensureLaunchIndependent(mappedStep);
mappedLowerBound = ensureLaunchIndependent(mappedLowerBound);

// If either cannot be made available above the launch, fail gracefully.
if (!mappedStep || !mappedLowerBound) {
return rewriter.notifyMatchFailure(
parallelOp, "lower bound / step must be constant or defined above "
"the gpu.launch");
}

newIndex = AffineApplyOp::create(
rewriter, loc, annotation.getMap().compose(lowerAndStep),
ValueRange{operand, ensureLaunchIndependent(step),
ensureLaunchIndependent(lowerBound)});
ValueRange{operand, mappedStep, mappedLowerBound});
// If there was also a bound, insert that, too.
// TODO: Check that we do not assign bounds twice.
if (annotation.getBound()) {
Expand Down
48 changes: 48 additions & 0 deletions mlir/test/Conversion/SCFToGPU/parallel_loop.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -673,3 +673,51 @@ func.func @nested_parallel_with_side_effect() {

// CHECK: gpu.launch
// CHECK-NOT: scf.parallel

// -----

// Input IR: one two-dimensional scf.parallel whose first dimension carries a
// block_x mapping and whose second dimension carries a thread_x mapping.
func.func @scf2gpu_index_creation_2d() {
  %zero = arith.constant 0 : index
  %one = arith.constant 1 : index
  %extent = arith.constant 32 : index

  // Each induction variable is an operand of its own arith.addi, so neither
  // of them is left without a use inside the loop body.
  scf.parallel (%blk, %thr) = (%zero, %zero) to (%extent, %extent) step (%one, %one) {
    %blk_sum = arith.addi %blk, %zero : index
    %thr_sum = arith.addi %thr, %zero : index
  } {
    mapping = [
      #gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
      #gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>
    ]
  }
  return
}

// CHECK-LABEL: func{{(\.func)?}} @scf2gpu_index_creation_2d
// CHECK: gpu.launch
// CHECK: %[[IDX:.*]] = affine.apply
// CHECK: arith.addi %[[IDX]],

// -----

// Input IR: one one-dimensional scf.parallel whose only dimension carries a
// thread_x mapping.
func.func @scf2gpu_index_creation_1d() {
  %zero = arith.constant 0 : index
  %one = arith.constant 1 : index
  %extent = arith.constant 64 : index

  // The induction variable is an operand of the arith.addi in the body.
  scf.parallel (%thr) = (%zero) to (%extent) step (%one) {
    %thr_sum = arith.addi %thr, %zero : index
  } {
    mapping = [
      #gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>
    ]
  }
  return
}

// CHECK-LABEL: func{{(\.func)?}} @scf2gpu_index_creation_1d
// CHECK: gpu.launch
// CHECK: %[[IDX:.*]] = affine.apply
// CHECK: arith.addi %[[IDX]],