diff --git a/mlir/lib/Conversion/SCFToGPU/CMakeLists.txt b/mlir/lib/Conversion/SCFToGPU/CMakeLists.txt index b7853634bc44e..f8112bbdd548f 100644 --- a/mlir/lib/Conversion/SCFToGPU/CMakeLists.txt +++ b/mlir/lib/Conversion/SCFToGPU/CMakeLists.txt @@ -9,6 +9,7 @@ add_mlir_conversion_library(MLIRSCFToGPU MLIRConversionPassIncGen LINK_LIBS PUBLIC + MLIRAffineAnalysis MLIRAffineDialect MLIRAffineToStandard MLIRArithDialect diff --git a/mlir/lib/Conversion/SCFToGPU/SCFToGPUPass.cpp b/mlir/lib/Conversion/SCFToGPU/SCFToGPUPass.cpp index c816356cf3f96..4c1fd59bb9042 100644 --- a/mlir/lib/Conversion/SCFToGPU/SCFToGPUPass.cpp +++ b/mlir/lib/Conversion/SCFToGPU/SCFToGPUPass.cpp @@ -9,6 +9,7 @@ #include "mlir/Conversion/SCFToGPU/SCFToGPUPass.h" #include "mlir/Conversion/SCFToGPU/SCFToGPU.h" +#include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Transforms/DialectConversion.h" @@ -29,11 +30,33 @@ namespace { struct ForLoopMapper : public impl::ConvertAffineForToGPUPassBase { using Base::Base; + /// Returns true if the first `numMappedLoops` loops of the nest rooted at + /// `forOp` are parallel, descending via the first nested loop at each level. + static bool areMappedLoopsParallel(affine::AffineForOp forOp, + unsigned numMappedLoops) { + affine::AffineForOp currentLoop = forOp; + for (unsigned i = 0; i < numMappedLoops; ++i) { + if (!affine::isLoopParallel(currentLoop)) + return false; + if (i + 1 < numMappedLoops) { + auto nestedLoops = currentLoop.getBody()->getOps(); + if (nestedLoops.empty()) + // Return true here to let the conversion fail later on structural + // mismatch if the nest is not deep enough. + return true; + // Descend into the first nested loop only; siblings are not followed. 
+ currentLoop = *nestedLoops.begin(); + } + } + return true; + } void runOnOperation() override { for (Operation &op : llvm::make_early_inc_range( getOperation().getFunctionBody().getOps())) { if (auto forOp = dyn_cast(&op)) { + if (!areMappedLoopsParallel(forOp, numBlockDims + numThreadDims)) + continue; if (failed(convertAffineLoopNestToGPULaunch(forOp, numBlockDims, numThreadDims))) signalPassFailure(); diff --git a/mlir/test/Conversion/SCFToGPU/no_blocks_no_threads.mlir b/mlir/test/Conversion/SCFToGPU/no_blocks_no_threads.mlir index 79eef8ae7eb85..107a6c3838010 100644 --- a/mlir/test/Conversion/SCFToGPU/no_blocks_no_threads.mlir +++ b/mlir/test/Conversion/SCFToGPU/no_blocks_no_threads.mlir @@ -21,14 +21,13 @@ func.func @one_d_loop(%A : memref, %B : memref) { // CHECK-BLOCKS-NEXT: gpu.launch blocks(%[[B0:.*]], %[[B1:.*]], %[[B2:.*]]) in (%{{.*}} = %[[BOUND]], %{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]]) threads(%[[T0:.*]], %[[T1:.*]], %[[T2:.*]]) in (%{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]]) affine.for %i = 0 to 42 { // CHECK-THREADS-NEXT: %[[INDEX:.*]] = arith.addi %{{.*}}, %[[T0]] - // CHECK-THREADS-NEXT: memref.load %{{.*}}[%[[INDEX]]] + // CHECK-THREADS-NEXT: affine.load %{{.*}}[%[[INDEX]]] // CHECK-BLOCKS-NEXT: %[[INDEX:.*]] = arith.addi %{{.*}}, %[[B0]] - // CHECK-BLOCKS-NEXT: memref.load %{{.*}}[%[[INDEX]]] - %0 = memref.load %A[%i] : memref - memref.store %0, %B[%i] : memref + // CHECK-BLOCKS-NEXT: affine.load %{{.*}}[%[[INDEX]]] + %0 = affine.load %A[%i] : memref + affine.store %0, %B[%i] : memref // CHECK-THREADS: gpu.terminator // CHECK-BLOCKS: gpu.terminator } return } - diff --git a/mlir/test/Conversion/SCFToGPU/reduction-loop.mlir b/mlir/test/Conversion/SCFToGPU/reduction-loop.mlir new file mode 100644 index 0000000000000..16770ccba58c3 --- /dev/null +++ b/mlir/test/Conversion/SCFToGPU/reduction-loop.mlir @@ -0,0 +1,105 @@ +// RUN: mlir-opt -pass-pipeline="builtin.module(func.func(convert-affine-for-to-gpu{gpu-block-dims=1 
gpu-thread-dims=1}))" %s | FileCheck %s + +/// Test parallelization legality checks in affine-for-to-gpu conversion. +/// The pass is configured to map the first 2 loops to GPU block (depth 0) and +/// GPU thread (depth 1) respectively. + +// CHECK-LABEL: func @map_to_gpu_inner_dep_unmapped +// CHECK-SAME: %[[MEM:.*]]: memref<10x10x10xf32> +func.func @map_to_gpu_inner_dep_unmapped(%mem: memref<10x10x10xf32>) { + /// The inner loop 'k' (depth=2) carries a dependency. However, since the + /// mapping only covers depth 0 and 1, 'k' remains sequential inside the + /// GPU kernel. The outer loops are dependency-free and safe to map. + + // CHECK: gpu.launch + affine.for %i = 0 to 10 { + affine.for %j = 0 to 10 { + // CHECK: affine.for %{{.*}} = 1 to 10 + // CHECK: affine.load %[[MEM]] + // CHECK: affine.store %{{.*}}, %[[MEM]] + affine.for %k = 1 to 10 { + %0 = affine.load %mem[%i, %j, %k - 1] : memref<10x10x10xf32> + affine.store %0, %mem[%i, %j, %k] : memref<10x10x10xf32> + } + } + } + return +} + +// CHECK-LABEL: func @negative_map_to_gpu_block_dep +func.func @negative_map_to_gpu_block_dep(%mem: memref<10xf32>) { + /// The loop 'i' is mapped to a block dimension (depth=0). + /// The loop-carried dependency makes parallelization unsafe. + + // CHECK-NOT: gpu.launch + // CHECK: affine.for + affine.for %i = 1 to 10 { + %0 = affine.load %mem[%i - 1] : memref<10xf32> + affine.store %0, %mem[%i] : memref<10xf32> + } + return +} + +// CHECK-LABEL: func @negative_map_to_gpu_thread_dep +func.func @negative_map_to_gpu_thread_dep(%mem: memref<10x10xf32>) { + /// The inner loop 'j' is mapped to a thread dimension (depth=1). + /// A dependency in any mapped loop invalidates the entire nest conversion. 
+ + // CHECK-NOT: gpu.launch + // CHECK: affine.for + affine.for %i = 0 to 10 { + // CHECK: affine.for + affine.for %j = 1 to 10 { + %0 = affine.load %mem[%i, %j - 1] : memref<10x10xf32> + affine.store %0, %mem[%i, %j] : memref<10x10xf32> + } + } + return +} + +// CHECK-LABEL: func @negative_map_to_gpu_imperfect_nest_dep +func.func @negative_map_to_gpu_imperfect_nest_dep(%mem: memref<10x10xf32>) { + /// Imperfect nest: The first inner loop 'j' has a dependency and is mapped + /// to a thread dimension, so the whole nest is left unconverted. + + // CHECK-NOT: gpu.launch + // CHECK: affine.for + affine.for %i = 0 to 10 { + // CHECK: affine.for + affine.for %j = 1 to 10 { + %0 = affine.load %mem[%i, %j - 1] : memref<10x10xf32> + affine.store %0, %mem[%i, %j] : memref<10x10xf32> + } + // CHECK: affine.for + affine.for %k = 0 to 10 { + %1 = affine.load %mem[%i, %k] : memref<10x10xf32> + affine.store %1, %mem[%i, %k] : memref<10x10xf32> + } + } + return +} + +// CHECK-LABEL: func @mixed_parallel_and_seq_siblings +func.func @mixed_parallel_and_seq_siblings(%mem: memref<10x10xf32>) { + /// Sibling top-level loops are analyzed independently. The first nest is + /// safe; the second has a dependency in a mapped loop (thread dim). 
+ + // CHECK: gpu.launch + affine.for %i = 0 to 10 { + affine.for %j = 0 to 10 { + %0 = affine.load %mem[%i, %j] : memref<10x10xf32> + affine.store %0, %mem[%i, %j] : memref<10x10xf32> + } + } + + // CHECK-NOT: gpu.launch + // CHECK: affine.for + affine.for %i2 = 0 to 10 { + // CHECK: affine.for + affine.for %j2 = 1 to 10 { + %1 = affine.load %mem[%i2, %j2 - 1] : memref<10x10xf32> + affine.store %1, %mem[%i2, %j2] : memref<10x10xf32> + } + } + return +} diff --git a/mlir/test/Conversion/SCFToGPU/step_one.mlir b/mlir/test/Conversion/SCFToGPU/step_one.mlir index be6fadfbd0ad3..3f8a1a847bc22 100644 --- a/mlir/test/Conversion/SCFToGPU/step_one.mlir +++ b/mlir/test/Conversion/SCFToGPU/step_one.mlir @@ -64,12 +64,12 @@ func.func @step_1(%A : memref, %B : memref) { // CHECK-22-NEXT: %[[jj:.*]] = arith.addi %{{.*}}, %{{.*}} : index // Using remapped values instead of loop iterators. - // CHECK-11: {{.*}} = memref.load %{{.*}}[%[[i]], %[[j]], %[[ii]], %[[jj]]] : memref - // CHECK-22: {{.*}} = memref.load %{{.*}}[%[[i]], %[[j]], %[[ii]], %[[jj]]] : memref - %0 = memref.load %A[%i, %j, %ii, %jj] : memref - // CHECK-11-NEXT: memref.store {{.*}}, %{{.*}}[%[[i]], %[[j]], %[[ii]], %[[jj]]] : memref - // CHECK-22-NEXT: memref.store {{.*}}, %{{.*}}[%[[i]], %[[j]], %[[ii]], %[[jj]]] : memref - memref.store %0, %B[%i, %j, %ii, %jj] : memref + // CHECK-11: {{.*}} = affine.load %{{.*}}[%[[i]], %[[j]], %[[ii]], %[[jj]]] : memref + // CHECK-22: {{.*}} = affine.load %{{.*}}[%[[i]], %[[j]], %[[ii]], %[[jj]]] : memref + %0 = affine.load %A[%i, %j, %ii, %jj] : memref + // CHECK-11-NEXT: affine.store {{.*}}, %{{.*}}[%[[i]], %[[j]], %[[ii]], %[[jj]]] : memref + // CHECK-22-NEXT: affine.store {{.*}}, %{{.*}}[%[[i]], %[[j]], %[[ii]], %[[jj]]] : memref + affine.store %0, %B[%i, %j, %ii, %jj] : memref // CHECK-11: gpu.terminator // CHECK-22: gpu.terminator @@ -79,4 +79,3 @@ func.func @step_1(%A : memref, %B : memref) { } return } - diff --git a/mlir/test/Conversion/SCFToGPU/step_positive.mlir 
b/mlir/test/Conversion/SCFToGPU/step_positive.mlir index 84e8454e56171..4da3a458a9152 100644 --- a/mlir/test/Conversion/SCFToGPU/step_positive.mlir +++ b/mlir/test/Conversion/SCFToGPU/step_positive.mlir @@ -18,10 +18,10 @@ func.func @step_var(%A : memref, %B : memref) { // CHECK-NEXT: %[[prod_j:.*]] = arith.muli %{{.*}}, %{{.*}} : index // CHECK-NEXT: %[[j:.*]] = arith.addi %{{.*}}, %[[prod_j]] : index - // CHECK: {{.*}} = memref.load %{{.*}}[%[[i]], %[[j]]] : memref - %0 = memref.load %A[%i, %j] : memref - // CHECK: memref.store {{.*}}, %{{.*}}[%[[i]], %[[j]]] : memref - memref.store %0, %B[%i, %j] : memref + // CHECK: {{.*}} = affine.load %{{.*}}[%[[i]], %[[j]]] : memref + %0 = affine.load %A[%i, %j] : memref + // CHECK: affine.store {{.*}}, %{{.*}}[%[[i]], %[[j]]] : memref + affine.store %0, %B[%i, %j] : memref } } return