diff --git a/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp b/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp index 76a822b05a652..309121f520811 100644 --- a/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp +++ b/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp @@ -453,10 +453,24 @@ static LogicalResult processParallelLoop( 1, 2, rewriter.getAffineDimExpr(0) * rewriter.getAffineSymbolExpr(0) + rewriter.getAffineSymbolExpr(1)); + // Map through cloningMap first so we use values valid at the launch + // scope, then ensure they are launch-independent (or cloned constants). + Value mappedStep = cloningMap.lookupOrDefault(step); + Value mappedLowerBound = cloningMap.lookupOrDefault(lowerBound); + + mappedStep = ensureLaunchIndependent(mappedStep); + mappedLowerBound = ensureLaunchIndependent(mappedLowerBound); + + // If either cannot be made available above the launch, fail gracefully. + if (!mappedStep || !mappedLowerBound) { + return rewriter.notifyMatchFailure( + parallelOp, "lower bound / step must be constant or defined above " + "the gpu.launch"); + } + newIndex = AffineApplyOp::create( rewriter, loc, annotation.getMap().compose(lowerAndStep), - ValueRange{operand, ensureLaunchIndependent(step), - ensureLaunchIndependent(lowerBound)}); + ValueRange{operand, mappedStep, mappedLowerBound}); // If there was also a bound, insert that, too. // TODO: Check that we do not assign bounds twice. 
if (annotation.getBound()) { diff --git a/mlir/test/Conversion/SCFToGPU/parallel_loop.mlir b/mlir/test/Conversion/SCFToGPU/parallel_loop.mlir index 26f5a3e1f0ac0..e5fe5ec543bbc 100644 --- a/mlir/test/Conversion/SCFToGPU/parallel_loop.mlir +++ b/mlir/test/Conversion/SCFToGPU/parallel_loop.mlir @@ -673,3 +673,51 @@ func.func @nested_parallel_with_side_effect() { // CHECK: gpu.launch // CHECK-NOT: scf.parallel + +// ----- + +func.func @scf2gpu_index_creation_2d() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c32 = arith.constant 32 : index + + // Single 2-D scf.parallel mapped to block_x and thread_x. + // Use both IVs so the conversion must compute indices. + scf.parallel (%bx, %tx) = (%c0, %c0) to (%c32, %c32) step (%c1, %c1) { + %u = arith.addi %bx, %c0 : index + %v = arith.addi %tx, %c0 : index + } { + mapping = [ + #gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>, + #gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)> + ] + } + return +} + +// CHECK-LABEL: func.func @scf2gpu_index_creation_2d +// CHECK: gpu.launch +// CHECK: affine.apply + +// ----- + +func.func @scf2gpu_index_creation_1d() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + + scf.parallel (%t) = (%c0) to (%c64) step (%c1) { + %w = arith.addi %t, %c0 : index + } { + mapping = [ + #gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)> + ] + } + return +} + +// CHECK-LABEL: func.func @scf2gpu_index_creation_1d +// CHECK: gpu.launch +// CHECK: affine.apply + +