From a19415fefaacb8d171711097f0bafa73510f600c Mon Sep 17 00:00:00 2001 From: Alan Li Date: Mon, 28 Apr 2025 12:27:06 -0400 Subject: [PATCH 1/4] [MLIR][GPU] Add a pattern to rewrite gpu.subgroup_id This patch implements a rewrite pattern for transforming `gpu.subgroup_id` to: ``` subgroup_id = linearized_thread_id / gpu.subgroup_size ``` where: ``` linearized_thread_id = thread_id.x + block_dim.x * (thread_id.y + block_dim.y * thread_id.z) ``` --- .../mlir/Dialect/GPU/Transforms/Passes.h | 5 ++ mlir/lib/Dialect/GPU/CMakeLists.txt | 1 + .../GPU/Transforms/SubgroupIdRewriter.cpp | 82 +++++++++++++++++++ mlir/test/Dialect/GPU/subgroupId-rewrite.mlir | 26 ++++++ 4 files changed, 114 insertions(+) create mode 100644 mlir/lib/Dialect/GPU/Transforms/SubgroupIdRewriter.cpp create mode 100644 mlir/test/Dialect/GPU/subgroupId-rewrite.mlir diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h index a13ad33df29cd..cbb990e603a38 100644 --- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h @@ -39,6 +39,10 @@ class FuncOp; /// Collect a set of patterns to rewrite GlobalIdOp op within the GPU dialect. void populateGpuGlobalIdPatterns(RewritePatternSet &patterns); +/// Collect a set of patterns to rewrite SubgroupIdOp op within the GPU +/// dialect. +void populateGpuSubgroupIdPatterns(RewritePatternSet &patterns); + /// Collect a set of patterns to rewrite shuffle ops within the GPU dialect. 
void populateGpuShufflePatterns(RewritePatternSet &patterns); @@ -88,6 +92,7 @@ inline void populateGpuRewritePatterns(RewritePatternSet &patterns) { populateGpuAllReducePatterns(patterns); populateGpuGlobalIdPatterns(patterns); populateGpuShufflePatterns(patterns); + populateGpuSubgroupIdPatterns(patterns); } namespace gpu { diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt index be6492a22f34f..e21fa501bae6b 100644 --- a/mlir/lib/Dialect/GPU/CMakeLists.txt +++ b/mlir/lib/Dialect/GPU/CMakeLists.txt @@ -40,6 +40,7 @@ add_mlir_dialect_library(MLIRGPUTransforms Transforms/ROCDLAttachTarget.cpp Transforms/ShuffleRewriter.cpp Transforms/SPIRVAttachTarget.cpp + Transforms/SubgroupIdRewriter.cpp Transforms/SubgroupReduceLowering.cpp OBJECT diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupIdRewriter.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupIdRewriter.cpp new file mode 100644 index 0000000000000..1c322c1016c01 --- /dev/null +++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupIdRewriter.cpp @@ -0,0 +1,82 @@ +//===- SubgroupIdRewriter.cpp - Implementation of SugroupId rewriting ----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements in-dialect rewriting of the gpu.subgroup_id op for archs +// where: +// subgroup_id = (tid.x + dim.x * (tid.y + dim.y * tid.z)) / subgroup_size +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/GPU/Transforms/Passes.h" +#include "mlir/Dialect/Index/IR/IndexOps.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Pass/Pass.h" + +using namespace mlir; + +namespace { +struct GpuSubgroupIdRewriter final : OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(gpu::SubgroupIdOp op, + PatternRewriter &rewriter) const override { + // Calculation of the thread's subgroup identifier. + // + // The process involves mapping the thread's 3D identifier within its + // block (tid.x, tid.y, tid.z) to a 1D linear index. + // This linearization assumes a layout where the x-dimension (tid.x) + // varies most rapidly (i.e., it is the innermost dimension). + // + // The formula for the linearized thread index is: + // L = tid.x + dim.x * (tid.y + (dim.y * tid.z)) + // + // Subsequently, the range of linearized indices [0, N_threads-1] is + // divided into consecutive, non-overlapping segments, each representing + // a subgroup of size 'subgroup_size'. + // + // Example Partitioning (N = subgroup_size): + // | Subgroup 0 | Subgroup 1 | Subgroup 2 | ... | + // | Indices 0..N-1 | Indices N..2N-1 | Indices 2N..3N-1| ... | + // + // The subgroup identifier is obtained via integer division of the + // linearized thread index by the predefined 'subgroup_size'. 
+ // + // subgroup_id = floor( L / subgroup_size ) + // = (tid.x + dim.x * (tid.y + dim.y * tid.z)) / + // subgroup_size + + auto loc = op->getLoc(); + + Value dimX = rewriter.create(loc, gpu::Dimension::x); + Value dimY = rewriter.create(loc, gpu::Dimension::y); + Value tidX = rewriter.create(loc, gpu::Dimension::x); + Value tidY = rewriter.create(loc, gpu::Dimension::y); + Value tidZ = rewriter.create(loc, gpu::Dimension::z); + + Value dimYxIdZ = rewriter.create(loc, dimY, tidZ); + Value dimYxIdZPlusIdY = rewriter.create(loc, dimYxIdZ, tidY); + Value dimYxIdZPlusIdYTimesDimX = + rewriter.create(loc, dimX, dimYxIdZPlusIdY); + Value IdXPlusDimYxIdZPlusIdYTimesDimX = + rewriter.create(loc, tidX, dimYxIdZPlusIdYTimesDimX); + Value subgroupSize = rewriter.create( + loc, rewriter.getIndexType(), /*upper_bound = */ nullptr); + Value subgroupIdOp = rewriter.create( + loc, IdXPlusDimYxIdZPlusIdYTimesDimX, subgroupSize); + rewriter.replaceOp(op, {subgroupIdOp}); + return success(); + } +}; + +} // namespace + +void mlir::populateGpuSubgroupIdPatterns(RewritePatternSet &patterns) { + patterns.add(patterns.getContext()); +} diff --git a/mlir/test/Dialect/GPU/subgroupId-rewrite.mlir b/mlir/test/Dialect/GPU/subgroupId-rewrite.mlir new file mode 100644 index 0000000000000..02fcb2ba21dad --- /dev/null +++ b/mlir/test/Dialect/GPU/subgroupId-rewrite.mlir @@ -0,0 +1,26 @@ +// RUN: mlir-opt --test-gpu-rewrite -split-input-file %s | FileCheck %s + +module { + // CHECK-LABEL: func.func @subgroupId + // CHECK-SAME: (%[[SZ:.*]]: index, %[[MEM:.*]]: memref) { + func.func @subgroupId(%sz : index, %mem: memref) { + gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %sz, %grid_y = %sz, %grid_z = %sz) + threads(%tx, %ty, %tz) in (%block_x = %sz, %block_y = %sz, %block_z = %sz) { + // CHECK: %[[DIMX:.*]] = gpu.block_dim x + // CHECK-NEXT: %[[DIMY:.*]] = gpu.block_dim y + // CHECK-NEXT: %[[TIDX:.*]] = gpu.thread_id x + // CHECK-NEXT: %[[TIDY:.*]] = gpu.thread_id y + // CHECK-NEXT: %[[TIDZ:.*]] = 
gpu.thread_id z + // CHECK-NEXT: %[[T0:.*]] = index.mul %[[DIMY]], %[[TIDZ]] + // CHECK-NEXT: %[[T1:.*]] = index.add %[[T0]], %[[TIDY]] + // CHECK-NEXT: %[[T2:.*]] = index.mul %[[DIMX]], %[[T1]] + // CHECK-NEXT: %[[T3:.*]] = index.add %[[TIDX]], %[[T2]] + // CHECK-NEXT: %[[T4:.*]] = gpu.subgroup_size : index + // CHECK-NEXT: %[[T5:.*]] = index.divu %[[T3]], %[[T4]] + %idz = gpu.subgroup_id : index + memref.store %idz, %mem[] : memref + gpu.terminator + } + return + } +} From fbe3bd189e6d26cb5f2bc68143238d1024d4aa7e Mon Sep 17 00:00:00 2001 From: Alan Li Date: Mon, 28 Apr 2025 16:21:20 -0400 Subject: [PATCH 2/4] Update mlir/lib/Dialect/GPU/Transforms/SubgroupIdRewriter.cpp Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- mlir/lib/Dialect/GPU/Transforms/SubgroupIdRewriter.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupIdRewriter.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupIdRewriter.cpp index 1c322c1016c01..5fa095564c545 100644 --- a/mlir/lib/Dialect/GPU/Transforms/SubgroupIdRewriter.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupIdRewriter.cpp @@ -1,4 +1,4 @@ -//===- SubgroupIdRewriter.cpp - Implementation of SugroupId rewriting ----===// +//===- SubgroupIdRewriter.cpp - Implementation of SubgroupId rewriting ----===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
From e316a6e39a37796af44798a03bb3cc8b051b999b Mon Sep 17 00:00:00 2001 From: Alan Li Date: Mon, 28 Apr 2025 16:44:00 -0400 Subject: [PATCH 3/4] remove --- .../mlir/Dialect/GPU/Transforms/Passes.h | 1 - .../GPU/Transforms/GlobalIdRewriter.cpp | 2 +- .../GPU/Transforms/SubgroupIdRewriter.cpp | 2 +- mlir/test/Dialect/GPU/subgroupId-rewrite.mlir | 42 +++++++++---------- mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp | 1 + 5 files changed, 23 insertions(+), 25 deletions(-) diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h index cbb990e603a38..6cd6f03253aea 100644 --- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h @@ -92,7 +92,6 @@ inline void populateGpuRewritePatterns(RewritePatternSet &patterns) { populateGpuAllReducePatterns(patterns); populateGpuGlobalIdPatterns(patterns); populateGpuShufflePatterns(patterns); - populateGpuSubgroupIdPatterns(patterns); } namespace gpu { diff --git a/mlir/lib/Dialect/GPU/Transforms/GlobalIdRewriter.cpp b/mlir/lib/Dialect/GPU/Transforms/GlobalIdRewriter.cpp index 0c730df73b519..c40ddd9b15afc 100644 --- a/mlir/lib/Dialect/GPU/Transforms/GlobalIdRewriter.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/GlobalIdRewriter.cpp @@ -26,7 +26,7 @@ struct GpuGlobalIdRewriter : public OpRewritePattern { LogicalResult matchAndRewrite(gpu::GlobalIdOp op, PatternRewriter &rewriter) const override { - auto loc = op.getLoc(); + Location loc = op.getLoc(); auto dim = op.getDimension(); auto blockId = rewriter.create(loc, dim); auto blockDim = rewriter.create(loc, dim); diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupIdRewriter.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupIdRewriter.cpp index 5fa095564c545..72099371c0700 100644 --- a/mlir/lib/Dialect/GPU/Transforms/SubgroupIdRewriter.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupIdRewriter.cpp @@ -1,4 +1,4 @@ -//===- SubgroupIdRewriter.cpp - Implementation of SubgroupId 
rewriting ----===// +//===- SubgroupIdRewriter.cpp - Implementation of SubgroupId rewriting ----===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/mlir/test/Dialect/GPU/subgroupId-rewrite.mlir b/mlir/test/Dialect/GPU/subgroupId-rewrite.mlir index 02fcb2ba21dad..a0c852f6fbe88 100644 --- a/mlir/test/Dialect/GPU/subgroupId-rewrite.mlir +++ b/mlir/test/Dialect/GPU/subgroupId-rewrite.mlir @@ -1,26 +1,24 @@ // RUN: mlir-opt --test-gpu-rewrite -split-input-file %s | FileCheck %s -module { - // CHECK-LABEL: func.func @subgroupId - // CHECK-SAME: (%[[SZ:.*]]: index, %[[MEM:.*]]: memref) { - func.func @subgroupId(%sz : index, %mem: memref) { - gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %sz, %grid_y = %sz, %grid_z = %sz) - threads(%tx, %ty, %tz) in (%block_x = %sz, %block_y = %sz, %block_z = %sz) { - // CHECK: %[[DIMX:.*]] = gpu.block_dim x - // CHECK-NEXT: %[[DIMY:.*]] = gpu.block_dim y - // CHECK-NEXT: %[[TIDX:.*]] = gpu.thread_id x - // CHECK-NEXT: %[[TIDY:.*]] = gpu.thread_id y - // CHECK-NEXT: %[[TIDZ:.*]] = gpu.thread_id z - // CHECK-NEXT: %[[T0:.*]] = index.mul %[[DIMY]], %[[TIDZ]] - // CHECK-NEXT: %[[T1:.*]] = index.add %[[T0]], %[[TIDY]] - // CHECK-NEXT: %[[T2:.*]] = index.mul %[[DIMX]], %[[T1]] - // CHECK-NEXT: %[[T3:.*]] = index.add %[[TIDX]], %[[T2]] - // CHECK-NEXT: %[[T4:.*]] = gpu.subgroup_size : index - // CHECK-NEXT: %[[T5:.*]] = index.divu %[[T3]], %[[T4]] - %idz = gpu.subgroup_id : index - memref.store %idz, %mem[] : memref - gpu.terminator - } - return +// CHECK-LABEL: func.func @subgroupId +// CHECK-SAME: (%[[SZ:.*]]: index, %[[MEM:.*]]: memref) { +func.func @subgroupId(%sz : index, %mem: memref) { + gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %sz, %grid_y = %sz, %grid_z = %sz) + threads(%tx, %ty, %tz) in (%block_x = %sz, %block_y = %sz, %block_z = %sz) { + // CHECK: %[[DIMX:.*]] = gpu.block_dim x + // CHECK-NEXT: %[[DIMY:.*]] = 
gpu.block_dim y + // CHECK-NEXT: %[[TIDX:.*]] = gpu.thread_id x + // CHECK-NEXT: %[[TIDY:.*]] = gpu.thread_id y + // CHECK-NEXT: %[[TIDZ:.*]] = gpu.thread_id z + // CHECK-NEXT: %[[T0:.*]] = index.mul %[[DIMY]], %[[TIDZ]] + // CHECK-NEXT: %[[T1:.*]] = index.add %[[T0]], %[[TIDY]] + // CHECK-NEXT: %[[T2:.*]] = index.mul %[[DIMX]], %[[T1]] + // CHECK-NEXT: %[[T3:.*]] = index.add %[[TIDX]], %[[T2]] + // CHECK-NEXT: %[[T4:.*]] = gpu.subgroup_size : index + // CHECK-NEXT: %[[T5:.*]] = index.divu %[[T3]], %[[T4]] + %idz = gpu.subgroup_id : index + memref.store %idz, %mem[] : memref + gpu.terminator } + return } diff --git a/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp b/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp index fe402da4cc105..616f458e4824c 100644 --- a/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp +++ b/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp @@ -41,6 +41,7 @@ struct TestGpuRewritePass void runOnOperation() override { RewritePatternSet patterns(&getContext()); populateGpuRewritePatterns(patterns); + populateGpuSubgroupIdPatterns(patterns); (void)applyPatternsGreedily(getOperation(), std::move(patterns)); } }; From 74912ac6c027f6c6756f7f9060ebb2217f8c6477 Mon Sep 17 00:00:00 2001 From: Alan Li Date: Tue, 29 Apr 2025 09:23:05 -0400 Subject: [PATCH 4/4] Update mlir/lib/Dialect/GPU/Transforms/SubgroupIdRewriter.cpp Co-authored-by: Jakub Kuderski --- mlir/lib/Dialect/GPU/Transforms/SubgroupIdRewriter.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupIdRewriter.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupIdRewriter.cpp index 72099371c0700..0f0df08919553 100644 --- a/mlir/lib/Dialect/GPU/Transforms/SubgroupIdRewriter.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupIdRewriter.cpp @@ -52,7 +52,7 @@ struct GpuSubgroupIdRewriter final : OpRewritePattern { // = (tid.x + dim.x * (tid.y + dim.y * tid.z)) / // subgroup_size - auto loc = op->getLoc(); + Location loc = op->getLoc(); Value dimX = 
rewriter.create(loc, gpu::Dimension::x); Value dimY = rewriter.create(loc, gpu::Dimension::y);