From 4bd408995cd92f4ada8e9a8093116f1a100ee114 Mon Sep 17 00:00:00 2001
From: Georgios Pinitas
Date: Thu, 25 Sep 2025 04:10:42 +0100
Subject: [PATCH 1/6] [mlir][gpu] Add innermost-first policy when mapping
 loops to GPU IDs

Extend the `GpuMapParallelLoopsPass` with a new `--mapping-policy` option.
The existing behavior is retained as the default policy, namely
`outermost-first`; a new `innermost-first` policy is added that maps the
loops in reverse order.

Signed-off-by: Georgios Pinitas
---
 .../mlir/Dialect/GPU/Transforms/Passes.td     | 11 +++
 .../GPU/Transforms/ParallelLoopMapper.cpp     | 62 +++++++++++---
 mlir/test/Dialect/GPU/mapping.mlir            | 80 +++++++++++++------
 3 files changed, 117 insertions(+), 36 deletions(-)

diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
index 187ac9aa18aac..0c8a0c7a677ab 100644
--- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
@@ -39,8 +39,19 @@ def GpuMapParallelLoopsPass
     encountered to the local workgroup. Within each mapping, the first three
     dimensions are mapped to x/y/z hardware ids and all following dimensions
     are mapped to sequential loops.
+
+    The order in which loops are assigned to the hardware dimensions is
+    controlled by the `mapping-policy` option.
+    Two policies are supported:
+    1. `outermost-first` (default): the outermost loop maps to X, then Y
+       and finally Z.
+    2. `innermost-first`: the innermost loop maps to X, then Y and finally Z.
   }];
   let dependentDialects = ["mlir::gpu::GPUDialect"];
+  let options = [Option<"mappingPolicyStr", "mapping-policy", "std::string",
+                        /*default=*/"\"outermost-first\"",
+                        "Policy outlining how to assign loops to GPU dimensions. "
+                        "Supported values are `outermost-first` and `innermost-first`.">];
 }
 
 def GpuEliminateBarriers
diff --git a/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp b/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp
index a098e721303a8..7b335d1d5f224 100644
--- a/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp
@@ -52,6 +52,7 @@ gpu::setMappingAttr(ParallelOp ploopOp,
 namespace gpu {
 namespace {
 enum MappingLevel { MapGrid = 0, MapBlock = 1, Sequential = 2 };
+enum class MappingPolicy { OutermostFirst, InnermostFirst };
 } // namespace
 
 static constexpr int kNumHardwareIds = 3;
@@ -65,16 +66,29 @@ static MappingLevel &operator++(MappingLevel &mappingLevel) {
   return mappingLevel;
 }
 
+// Map the policy string to a typed mapping policy.
+static FailureOr<MappingPolicy> getMappingPolicyFromStr(StringRef policy) {
+  std::string policyCanonical = policy.trim().lower();
+
+  auto option =
+      llvm::StringSwitch<std::optional<MappingPolicy>>(policyCanonical)
+          .Case("innermost-first", MappingPolicy::InnermostFirst)
+          .Case("outermost-first", MappingPolicy::OutermostFirst)
+          .Default(std::nullopt);
+
+  if (!option)
+    return failure();
+  return *option;
+}
+
 /// Computed the hardware id to use for a given mapping level. Will
 /// assign x,y and z hardware ids for the first 3 dimensions and use
 /// sequential after.
-/// TODO: Make this use x for the inner-most loop that is
-/// distributed to map to x, the next innermost to y and the next innermost to
-/// z.
 static Processor getHardwareIdForMapping(MappingLevel level, int dimension) {
   if (dimension >= kNumHardwareIds || level == Sequential)
     return Processor::Sequential;
+
   switch (level) {
   case MapGrid:
     switch (dimension) {
@@ -107,20 +121,34 @@ static Processor getHardwareIdForMapping(MappingLevel level, int dimension) {
 /// Add mapping information to the given parallel loop. Do not add
 /// mapping information if the loop already has it. Also, don't
 /// start a mapping at a nested loop.
-static void mapParallelOp(ParallelOp parallelOp,
-                          MappingLevel mappingLevel = MapGrid) {
+static void
+mapParallelOp(ParallelOp parallelOp, MappingLevel mappingLevel = MapGrid,
+              MappingPolicy mappingPolicy = MappingPolicy::OutermostFirst) {
   // Do not try to add a mapping to already mapped loops or nested loops.
   if (parallelOp->getAttr(getMappingAttrName()) ||
       ((mappingLevel == MapGrid) && parallelOp->getParentOfType<ParallelOp>()))
     return;
 
+  const int numLoops = static_cast<int>(parallelOp.getNumLoops());
+  const int loopsToMap = std::min(numLoops, kNumHardwareIds);
+
   MLIRContext *ctx = parallelOp.getContext();
   Builder b(ctx);
   SmallVector<ParallelLoopDimMappingAttr, 4> attrs;
-  attrs.reserve(parallelOp.getNumLoops());
-  for (int i = 0, e = parallelOp.getNumLoops(); i < e; ++i) {
+  attrs.reserve(numLoops);
+
+  for (int i = 0, e = numLoops; i < e; ++i) {
+
+    // Determine the mapping to use for this loop.
+    int hwMapping = kNumHardwareIds;
+    if (i < loopsToMap) {
+      hwMapping = (mappingPolicy == MappingPolicy::OutermostFirst)
+                      ? i
+                      : (loopsToMap - 1 - i);
+    }
+
     attrs.push_back(b.getAttr<ParallelLoopDimMappingAttr>(
-        getHardwareIdForMapping(mappingLevel, i), b.getDimIdentityMap(),
+        getHardwareIdForMapping(mappingLevel, hwMapping), b.getDimIdentityMap(),
         b.getDimIdentityMap()));
   }
   (void)setMappingAttr(parallelOp, attrs);
@@ -129,16 +157,30 @@ static void mapParallelOp(ParallelOp parallelOp,
   // walk but just iterate over the operations.
   for (Operation &op : *parallelOp.getBody()) {
     if (ParallelOp nested = dyn_cast<ParallelOp>(op))
-      mapParallelOp(nested, mappingLevel);
+      mapParallelOp(nested, mappingLevel, mappingPolicy);
   }
 }
 
 namespace {
 struct GpuMapParallelLoopsPass
     : public impl::GpuMapParallelLoopsPassBase<GpuMapParallelLoopsPass> {
+  using Base::Base;
+
   void runOnOperation() override {
+    // Parse the mapping policy.
+    auto policyOrFailure = getMappingPolicyFromStr(mappingPolicyStr);
+    if (failed(policyOrFailure)) {
+      getOperation()->emitError() << "Invalid mapping policy specified.";
+      return signalPassFailure();
+    }
+
+    auto policy = *policyOrFailure;
+    MappingLevel topLevel = MappingLevel::MapGrid;
+
     for (Region &region : getOperation()->getRegions()) {
-      region.walk([](ParallelOp parallelOp) { mapParallelOp(parallelOp); });
+      region.walk([&](ParallelOp parallelOp) {
+        mapParallelOp(parallelOp, topLevel, policy);
+      });
     }
   }
 };
diff --git a/mlir/test/Dialect/GPU/mapping.mlir b/mlir/test/Dialect/GPU/mapping.mlir
index 395987317a1e6..b313ab69cc001 100644
--- a/mlir/test/Dialect/GPU/mapping.mlir
+++ b/mlir/test/Dialect/GPU/mapping.mlir
@@ -1,4 +1,5 @@
-// RUN: mlir-opt -gpu-map-parallel-loops -split-input-file %s | FileCheck %s
+// RUN: mlir-opt -gpu-map-parallel-loops -split-input-file %s | FileCheck %s --check-prefix=OUTER
+// RUN: mlir-opt -gpu-map-parallel-loops="mapping-policy=innermost-first" -split-input-file %s | FileCheck %s --check-prefix=INNER
 
 func.func @parallel_loop(%arg0 : index, %arg1 : index, %arg2 : index,
                          %arg3 : index) {
@@ -14,14 +15,23 @@ func.func @parallel_loop(%arg0 : index, %arg1 : index, %arg2 : index,
   return
 }
 
-// CHECK-LABEL: func @parallel_loop(
-// CHECK:       scf.parallel
-// CHECK:       scf.parallel
-// CHECK:       {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
-// CHECK-SAME:              #gpu.loop_dim_map<processor = thread_y, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
-// CHECK:       {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
-// CHECK-SAME:              #gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
-// CHECK-NOT:   mapping
+// OUTER-LABEL: func @parallel_loop(
+// OUTER:       scf.parallel
+// OUTER:       scf.parallel
+// OUTER:       {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// OUTER-SAME:              #gpu.loop_dim_map<processor = thread_y, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+// OUTER:       {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// OUTER-SAME:              #gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+// OUTER-NOT:   mapping
+
+// INNER-LABEL: func @parallel_loop(
+// INNER:       scf.parallel
+// INNER:       scf.parallel
+// INNER:       {mapping = [#gpu.loop_dim_map<processor = thread_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// INNER-SAME:              #gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+// INNER:       {mapping = [#gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// INNER-SAME:              #gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+// INNER-NOT:   mapping
 
 // -----
 
@@ -42,20 +52,38 @@ func.func @parallel_loop_4d(%arg0 : index, %arg1 : index, %arg2 : index,
   return
 }
 
-// CHECK-LABEL: func @parallel_loop_4d(
-// CHECK:       scf.parallel
-// CHECK:       scf.parallel
-// CHECK:       scf.parallel
-// CHECK:       {mapping = [#gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>,
-// CHECK-SAME:              #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>,
-// CHECK-SAME:              #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>,
-// CHECK-SAME:              #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
-// CHECK:       {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
-// CHECK-SAME:              #gpu.loop_dim_map<processor = thread_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
-// CHECK-SAME:              #gpu.loop_dim_map<processor = thread_z, map = (d0) -> (d0), bound = (d0) -> (d0)>,
-// CHECK-SAME:              #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
-// CHECK:       {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
-// CHECK-SAME:              #gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
-// CHECK-SAME:              #gpu.loop_dim_map<processor = block_z, map = (d0) -> (d0), bound = (d0) -> (d0)>,
-// CHECK-SAME:              #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
-// CHECK-NOT:   mapping
+// OUTER-LABEL: func @parallel_loop_4d(
+// OUTER:       scf.parallel
+// OUTER:       scf.parallel
+// OUTER:       scf.parallel
+// OUTER:       {mapping = [#gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// OUTER-SAME:              #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// OUTER-SAME:              #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// OUTER-SAME:              #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+// OUTER:       {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// OUTER-SAME:              #gpu.loop_dim_map<processor = thread_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// OUTER-SAME:              #gpu.loop_dim_map<processor = thread_z, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// OUTER-SAME:              #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+// OUTER:       {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// OUTER-SAME:              #gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// OUTER-SAME:              #gpu.loop_dim_map<processor = block_z, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// OUTER-SAME:              #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+// OUTER-NOT:   mapping
+
+// INNER-LABEL: func @parallel_loop_4d(
+// INNER:       scf.parallel
+// INNER:       scf.parallel
+// INNER:       scf.parallel
+// INNER:       {mapping = [#gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// INNER-SAME:              #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// INNER-SAME:              #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// INNER-SAME:              #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+// INNER:       {mapping = [#gpu.loop_dim_map<processor = thread_z, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// INNER-SAME:              #gpu.loop_dim_map<processor = thread_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// INNER-SAME:              #gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// INNER-SAME:              #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+// INNER:       {mapping = [#gpu.loop_dim_map<processor = block_z, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// INNER-SAME:              #gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// INNER-SAME:              #gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// INNER-SAME:              #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+// INNER-NOT:   mapping
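Illustrative note (not part of the patch series): a minimal sketch of what the two policies produce for a single grid-level loop nest. The function name and bounds below are hypothetical; the attribute syntax follows the `#gpu.loop_dim_map` form checked in the tests above.

  func.func @example(%n : index, %m : index) {
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    // %i is the outermost induction variable, %j the innermost.
    scf.parallel (%i, %j) = (%c0, %c0) to (%n, %m) step (%c1, %c1) {
      scf.reduce
    }
    return
  }

  // mapping-policy=outermost-first (default): %i -> block_x, %j -> block_y
  // mapping-policy=innermost-first:           %j -> block_x, %i -> block_y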
From 28babcc94b635bb3b37c5f550464f8ba12e635a2 Mon Sep 17 00:00:00 2001
From: Georgios Pinitas
Date: Thu, 25 Sep 2025 11:35:24 +0100
Subject: [PATCH 2/6] [mlir][gpu] Add innermost-first policy when mapping
 loops to GPU IDs

Address review comments.

Signed-off-by: Georgios Pinitas
---
 mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp b/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp
index 7b335d1d5f224..1afca9d180e22 100644
--- a/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp
@@ -137,9 +137,10 @@ mapParallelOp(ParallelOp parallelOp, MappingLevel mappingLevel = MapGrid,
   SmallVector<ParallelLoopDimMappingAttr, 4> attrs;
   attrs.reserve(numLoops);
 
-  for (int i = 0, e = numLoops; i < e; ++i) {
+  for (int i = 0; i < numLoops; ++i) {
 
     // Determine the mapping to use for this loop.
+    // If the are more loops to map than HW IDs map to map to sequential.
    int hwMapping = kNumHardwareIds;
    if (i < loopsToMap) {
      hwMapping = (mappingPolicy == MappingPolicy::OutermostFirst)
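Illustrative note (not part of the patch series): a worked instance of the `hwMapping` computation above, assuming a 4-deep loop nest, i.e. numLoops = 4 and loopsToMap = min(4, kNumHardwareIds) = 3:

  // i | outermost-first | innermost-first
  // --+-----------------+----------------
  // 0 |      0 (X)      |      2 (Z)
  // 1 |      1 (Y)      |      1 (Y)
  // 2 |      2 (Z)      |      0 (X)
  // 3 |    sequential   |    sequential
  //
  // For i = 3 we have i >= loopsToMap, so hwMapping keeps its initial value
  // kNumHardwareIds and getHardwareIdForMapping returns Processor::Sequential.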
From 308a2f472e5407b0f08b934ea9294dce6176a4a2 Mon Sep 17 00:00:00 2001
From: Georgios Pinitas
Date: Thu, 25 Sep 2025 12:00:26 +0100
Subject: [PATCH 3/6] Update ParallelLoopMapper.cpp

---
 mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp b/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp
index 1afca9d180e22..c80800c4f7832 100644
--- a/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp
@@ -140,7 +140,7 @@ mapParallelOp(ParallelOp parallelOp, MappingLevel mappingLevel = MapGrid,
   for (int i = 0; i < numLoops; ++i) {
 
     // Determine the mapping to use for this loop.
-    // If the are more loops to map than HW IDs map to map to sequential.
+    // If there are more loops to map than HW IDs, map to sequential.
     int hwMapping = kNumHardwareIds;
     if (i < loopsToMap) {
       hwMapping = (mappingPolicy == MappingPolicy::OutermostFirst)

From 956bcbe44a7cb466d31b4f16960653737d0d01ce Mon Sep 17 00:00:00 2001
From: Georgios Pinitas
Date: Thu, 25 Sep 2025 13:36:04 +0100
Subject: [PATCH 4/6] Update ParallelLoopMapper.cpp

---
 mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp b/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp
index c80800c4f7832..5e7125148bbba 100644
--- a/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp
@@ -1,4 +1,4 @@
-//===- ParallelLoopMapper.cpp - Utilities for mapping parallel loops to GPU =//
+f//===- ParallelLoopMapper.cpp - Utilities for mapping parallel loops to GPU =//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -175,7 +175,7 @@ struct GpuMapParallelLoopsPass
       return signalPassFailure();
     }
 
-    auto policy = *policyOrFailure;
+    MappingPolicy policy = *policyOrFailure;
     MappingLevel topLevel = MappingLevel::MapGrid;
 
     for (Region &region : getOperation()->getRegions()) {

From 52c05545849b57f9b10edfb76fb6aba9917dbae3 Mon Sep 17 00:00:00 2001
From: Georgios Pinitas
Date: Thu, 25 Sep 2025 13:36:55 +0100
Subject: [PATCH 5/6] Update ParallelLoopMapper.cpp

---
 mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp b/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp
index 5e7125148bbba..66d6078202c3e 100644
--- a/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp
@@ -1,4 +1,4 @@
-f//===- ParallelLoopMapper.cpp - Utilities for mapping parallel loops to GPU =//
+//===- ParallelLoopMapper.cpp - Utilities for mapping parallel loops to GPU =//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.

From 1dc199174b294b2ffa087a4ee8c54e2cf8d2f01f Mon Sep 17 00:00:00 2001
From: Georgios Pinitas
Date: Thu, 25 Sep 2025 15:19:34 +0100
Subject: [PATCH 6/6] [mlir][gpu] Add innermost-first policy when mapping
 loops to GPU IDs

Add a TODO to potentially use a loop interchange pass in the future.
Signed-off-by: Georgios Pinitas
---
 mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp b/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp
index 66d6078202c3e..594c7a265667e 100644
--- a/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp
@@ -67,10 +67,11 @@ static MappingLevel &operator++(MappingLevel &mappingLevel) {
 }
 
 // Map the policy string to a typed mapping policy.
+// TODO: Revisit this and possibly use a loop interchange pass instead.
 static FailureOr<MappingPolicy> getMappingPolicyFromStr(StringRef policy) {
   std::string policyCanonical = policy.trim().lower();
 
-  auto option =
+  std::optional<MappingPolicy> option =
       llvm::StringSwitch<std::optional<MappingPolicy>>(policyCanonical)
           .Case("innermost-first", MappingPolicy::InnermostFirst)
           .Case("outermost-first", MappingPolicy::OutermostFirst)
           .Default(std::nullopt);
@@ -169,7 +170,8 @@ struct GpuMapParallelLoopsPass
 
   void runOnOperation() override {
     // Parse the mapping policy.
-    auto policyOrFailure = getMappingPolicyFromStr(mappingPolicyStr);
+    FailureOr<MappingPolicy> policyOrFailure =
+        getMappingPolicyFromStr(mappingPolicyStr);
     if (failed(policyOrFailure)) {
       getOperation()->emitError() << "Invalid mapping policy specified.";
       return signalPassFailure();
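Illustrative note (not part of the patch series): invocation mirrors the RUN lines in the updated test; the input file name below is hypothetical.

  # Default policy (outermost loop -> x).
  mlir-opt -gpu-map-parallel-loops input.mlir

  # Reverse assignment order (innermost loop -> x).
  mlir-opt -gpu-map-parallel-loops="mapping-policy=innermost-first" input.mlir

Any value other than `outermost-first` or `innermost-first` fails to parse in getMappingPolicyFromStr, so the pass emits "Invalid mapping policy specified." and signals failure.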