
Conversation

Hardcode84
Contributor

@Hardcode84 Hardcode84 commented Sep 9, 2025

Introducing a dedicated op instead of broadcast any_lane per discussion #152808

@Hardcode84 Hardcode84 marked this pull request as ready for review September 10, 2025 15:04
@llvmbot
Member

llvmbot commented Sep 10, 2025

@llvm/pr-subscribers-mlir-gpu

Author: Ivan Butygin (Hardcode84)

Changes

Introducing a dedicated op instead of broadcast any_lane per discussion #152808


Full diff: https://github.com/llvm/llvm-project/pull/157743.diff

5 Files Affected:

  • (modified) mlir/include/mlir/Dialect/GPU/IR/GPUOps.td (+33)
  • (modified) mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp (+18-1)
  • (modified) mlir/lib/Dialect/GPU/IR/GPUDialect.cpp (+9)
  • (modified) mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir (+12)
  • (modified) mlir/test/Dialect/GPU/ops.mlir (+8)
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 987fc13e0508d..3e2fa6b43d5fd 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -3255,4 +3255,37 @@ def GPU_SubgroupBroadcastOp : GPU_Op<"subgroup_broadcast",
   let hasVerifier = 1;
 }
 
+def GPU_SubgroupUniformOp : GPU_Op<"subgroup_uniform",
+    [Pure, AllTypesMatch<["result", "src"]>,
+    DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>] #
+    ElementwiseMappable.traits>,
+  Arguments<(ins AnyType:$src)> {
+  let summary = "Assumes value is uniform across the lanes in subgroup";
+  let description = [{
+    The "subgroup_uniform" op assumes that the value is uniform across all lanes
+    in a subgroup. This means that all active lanes in the subgroup are expected
+    to have the same value.
+
+    This op can be used to inform the compiler that a value is uniform across
+    the subgroup, enabling optimizations. The result is poison if the value
+    is not actually uniform.
+
+    This op is functionally no-op as no valid program should change its
+    semantics if this op is removed. Backends can choose to ignore it or do
+    some optimizations (e.g. put value into scalar registers).
+
+    This op can be freely speculated across structured control flow as parent
+    active mask is always superset of current mask and if can hoist input
+    calculation you can hoist the operation itself as well.
+
+    Example:
+
+    ```mlir
+    %1 = gpu.subgroup_uniform %0 : f32
+    ```
+  }];
+  let results = (outs AnyType:$result);
+  let assemblyFormat = "$src attr-dict `:` type($result)";
+}
+
 #endif // GPU_OPS
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index 807d1f52ee69b..5377ce709497e 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -201,6 +201,22 @@ struct GPUSubgroupBroadcastOpToROCDL
   }
 };
 
+struct GPUSubgroupUniformOpToROCDL
+    : public ConvertOpToLLVMPattern<gpu::SubgroupUniformOp> {
+  using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
+
+  LogicalResult
+  matchAndRewrite(gpu::SubgroupUniformOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    Value src = adaptor.getSrc();
+    if (!isSupportedReadLaneType(src.getType()))
+      return rewriter.notifyMatchFailure(op, "unsupported readlane type");
+
+    rewriter.replaceOpWithNewOp<ROCDL::ReadfirstlaneOp>(op, src.getType(), src);
+    return success();
+  }
+};
+
 struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
   using ConvertOpToLLVMPattern<gpu::ShuffleOp>::ConvertOpToLLVMPattern;
 
@@ -494,7 +510,8 @@ void mlir::populateGpuToROCDLConversionPatterns(
   patterns.add<GPUDynamicSharedMemoryOpLowering>(converter);
 
   patterns.add<GPUShuffleOpLowering, GPULaneIdOpToROCDL,
-               GPUSubgroupBroadcastOpToROCDL>(converter);
+               GPUSubgroupBroadcastOpToROCDL, GPUSubgroupUniformOpToROCDL>(
+      converter);
   patterns.add<GPUSubgroupSizeOpToROCDL>(converter, chipset);
 
   populateMathToROCDLConversionPatterns(converter, patterns);
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index 43b02f16aa829..6022fa517421a 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -2550,6 +2550,15 @@ LogicalResult gpu::SubgroupBroadcastOp::verify() {
   }
 }
 
+//===----------------------------------------------------------------------===//
+// GPU_SubgroupUniformOp
+//===----------------------------------------------------------------------===//
+
+void gpu::SubgroupUniformOp::inferResultRanges(
+    ArrayRef<ConstantIntRanges> argRanges, SetIntRangeFn setResultRange) {
+  setResultRange(getResult(), argRanges.front());
+}
+
 //===----------------------------------------------------------------------===//
 // GPU KernelMetadataAttr
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
index c6261b37ef8f2..69fb45d9097b8 100644
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
@@ -816,3 +816,15 @@ func.func @broadcast(%arg0 : index, %arg1 : i32) -> (index, index) {
   func.return %0, %1 : index, index
 }
 }
+
+// -----
+
+gpu.module @test_module {
+// CHECK-LABEL: func @uniform
+//  CHECK-SAME:   (%[[ARG:.*]]: i64)
+func.func @uniform(%arg0 : index) -> index {
+//       CHECK:   %{{.*}} = rocdl.readfirstlane %[[ARG]] : i64
+  %0 = gpu.subgroup_uniform %arg0 : index
+  func.return %0 : index
+}
+}
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
index e3e2474d917c8..d45d1cf52d91d 100644
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -552,3 +552,11 @@ func.func @subgroup_broadcast(%arg0 : f32, %arg1 : i32) -> (f32, f32) {
   %1 = gpu.subgroup_broadcast %arg0, specific_lane %arg1 : f32
   func.return %0, %1 : f32, f32
 }
+
+// CHECK-LABEL: func @subgroup_uniform
+//  CHECK-SAME: (%[[ARG:.*]]: f32)
+func.func @subgroup_uniform(%arg0 : f32) -> f32 {
+  // CHECK: gpu.subgroup_uniform %[[ARG]] : f32
+  %0 = gpu.subgroup_uniform %arg0 : f32
+  func.return %0 : f32
+}

@llvmbot
Member

llvmbot commented Sep 10, 2025

@llvm/pr-subscribers-mlir

Member

@kuhar kuhar left a comment

Maybe also add an example and a test case with a vector type?
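For illustration, such a test could mirror the existing scalar one; the function name below is made up, but the op's AnyType operand and AllTypesMatch constraint should already admit vector types as written:

```mlir
// Hypothetical round-trip test with a vector-typed operand.
// CHECK-LABEL: func @subgroup_uniform_vector
//  CHECK-SAME: (%[[ARG:.*]]: vector<4xf32>)
func.func @subgroup_uniform_vector(%arg0 : vector<4xf32>) -> vector<4xf32> {
  // CHECK: gpu.subgroup_uniform %[[ARG]] : vector<4xf32>
  %0 = gpu.subgroup_uniform %arg0 : vector<4xf32>
  func.return %0 : vector<4xf32>
}
```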


This op is functionally no-op as no valid program should change its
semantics if this op is removed. Backends can choose to ignore it or do
some optimizations (e.g. put value into scalar registers).
Member

Suggested change
some optimizations (e.g. put value into scalar registers).
some optimizations (e.g., put value into scalar registers).

Contributor

@fabianmcg fabianmcg left a comment

Please add a pattern for replacing the op by the input, allowing it to be truly a nop. You can then add this pattern to gpu-to-rocdl to handle the cases where it's not possible to use readfirstlane
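One possible shape for that fallback, written in the style of the patterns already in LowerGpuOpsToROCDLOps.cpp (a sketch only; the struct name and the guard are illustrative and not part of this patch):

```c++
// Illustrative fallback: when readfirstlane cannot be used, legalize the op
// by forwarding its converted operand, i.e. treat the hint as a no-op.
struct GPUSubgroupUniformOpAsNop final
    : public ConvertOpToLLVMPattern<gpu::SubgroupUniformOp> {
  using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;

  LogicalResult
  matchAndRewrite(gpu::SubgroupUniformOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    // Leave the types that the readfirstlane lowering supports to that
    // pattern; only act as a fallback for everything else.
    if (isSupportedReadLaneType(adaptor.getSrc().getType()))
      return rewriter.notifyMatchFailure(op, "handled by readfirstlane");

    // Dropping the uniformity hint is always correct; it only loses
    // information.
    rewriter.replaceOp(op, adaptor.getSrc());
    return success();
  }
};
```

Outside of the LLVM conversion, a plain OpRewritePattern doing rewriter.replaceOp(op, op.getSrc()) would serve the same purpose.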

}];
let results = (outs AnyType:$result);
let assemblyFormat = "$src attr-dict `:` type($result)";
}
Contributor

Please add a verifier checking %src has one use. This ensures a consistent view of the effects of uniformity.

Member

I don't think this fits the local verification only rule: https://mlir.llvm.org/getting_started/DeveloperGuide/#ir-verifier

Contributor

And we don't actually need a has-one-use constraint?

A value can be made/marked uniform but also not - that's not an error, that's just a missed optimization opportunity

Contributor

To me, gpu.subgroup_uniform is a metadata op signaling that a value is uniform, so either:

  1. It produces no results, and it's up to an analysis to determine whether the value is uniform by looking for an occurrence of the op, in which case the op cannot be pure (otherwise it can get elided), and it becomes harder to promote to something like a read first lane.
  2. We ensure that all the uses know this comes from a uniform value to have a consistent view that a value is uniform, and the op is pure.
  3. We need to specify what happens in the following case:
%u0 = gpu.subgroup_uniform %v0 : index 
%u1 = gpu.subgroup_uniform %v1 : index 
// ...
%v2 = arith.addi %v0, %v1 : index
%v3 = arith.addi %u0, %u1 : index

In %v2, is %v0 uniform? Are %v2 and %v3 the same value? Can one be marked as uniform and the other not?
These can be defined, but I'd rather avoid defining what happens in those cases.

However, I do agree this is stretching what the guide says about the verifier being local, because while we are not looking at the producer of an operand, nor traversing the use-def chain, the effect of the constraint is not local.

Contributor

%u0 = gpu.subgroup_uniform %v0 : index 
%u1 = gpu.subgroup_uniform %v1 : index 
// ...
%v2 = arith.addi %v0, %v1 : index
%v3 = arith.addi %u0, %u1 : index

Has the exact same one-of-these-is-underannotated semantics as

%m : memref<?xf32>
%m1 = memref.assume_alignment %m, 16 : memref<?xf32>
// ...
%l1 = memref.load %m1[%c0] : memref<?xf32>
%l2 = memref.load %m[%c0] : memref<?xf32>

Here, %l1 can be analyzed to be loaded with an alignment of 16 bytes, %l2 need not be.

One could, of course, have a rewrite that maximizes assumptions - replacing %m with %m1 in %l2 is correct, but not required.

In terms of analysis, in your example, the value %v0 is uniform, but may not be known to be uniform. An analyzer can (probably should) conclude that %v2 is not known to be uniform because it doesn't know what %v0's deal is. %v3 does know, however, because it's using %u0, which is annotated.

That same interchangeability logic from memref.assume_alignment applies - you can "freely" swap between %v0 / %u0, gaining and losing assumptions as you go.

(Operationally, gpu.subgroup_uniform is expected to lower to material changes in register class on targets where that's a thing, but, semantically, that's not all that relevant)

Contributor

@fabianmcg fabianmcg Sep 12, 2025

Always the same thing: that breaks something fundamental about SSA. The same examples from the guideline apply: how is a transformation that seems harmless supposed to not break this verifier?
Think of a simple case: the result of the assume is used in an scf.for loop with 2 iterations; I unroll the loop (the assume is outside of the loop), and now plain loop unrolling breaks the one-use verifier... boom.

Thank you for the explanation, that makes it clear, and I'm dropping the has-one-use proposal.

I don't follow, can you expand? This looks to me exactly like how all the variants of "assume" in a dataflow context I've seen worked out: you tag an SSA Value and add information in the dataflow. How is it different from memref.assume, for example?

I argue that assume semantics imply things about the operands, not the results. This works well with dataflow, the semantics are clear, and it is the model of llvm.assume.

Taking the case of memref.assume, or gpu.subgroup_uniform, the implication is on the result of the op, not the operands. I claim these are not really compiler assumptions, as it's a new value with new information; you are not assuming anything about the previous value. Thus, their role is an active one, and they have semantics closer to a no-op cast.
(To be consistent, I'd argue memref.assume_alignment should probably be called memref.has_alignment or memref.add_alignment.) So it's not really tagging, but creating a new value with added semantics.

Let's take a memref.assume_alignment that returns results and one that doesn't:

1.
%1 = memref.assume_alignment %0, 8 : memref<4x4xf16>
%2 = memref.assume_alignment %0, 32 : memref<4x4xf16>

2.
memref.assume_alignment %0, 8 : memref<4x4xf16>
memref.assume_alignment %0, 32 : memref<4x4xf16>

On 2, it's clear the assumption is on %0, so you could potentially combine those assumptions and say %0 has a 32 alignment.

On the other hand, on 1, I don't think one should combine them. They have different effects that should be scoped to the uses of each of the operations, thus they are more like value promotions.

Just to be clear, I'm not saying we can't have that, I'm saying that's not the description of the op.

However, since, it appears there's consensus on accepting assumes in such form, I'm not blocking the PR on this issue.

Collaborator

@joker-eph joker-eph Sep 12, 2025

I argue that assume semantics imply things about the operands, not the results. This works well with dataflow, the semantics are clear

I don't see the argument here? You seem to be repeatedly making claims whose rationale I don't understand, and I don't see much explanation of the fundamentals behind them.

and it is the model of llvm.assume.

Actually no: llvm.assume is not dataflow based, it is a path-sensitive representation. You want to add information about a condition at a specific point, which is at odds with the properties you'd want from a "dataflow based" approach.

. I claim these are not really compiler assumptions

How so? We encode an assumption on a value; I don't quite see why this makes it "not a compiler assumption". You yourself identify later that the operation is a "nop": its only effect is on the compiler itself, as metadata inserted in the dataflow.

On the other hand, on 1, I don't think one should combine them.

Right, one probably shouldn't: the assumption is dataflow based instead of path-based.
This is in line with the deferred UB provided by poison here.
You could be smart and combine some of these if you reach a point where you're using a value in a context where it creates UB to violate the assumption.
Basically from a:

%1 = memref.assume_alignment %0, 8 : memref<4x4xf16>

which isn't an immediate UB when violated, you can generate later:

memref.assume_alignment %0, 32 : memref<4x4xf16>

at a point where %1 is used in a manner that triggers UB.

This dataflow vs. path sensitivity, along with deferred UB, is pretty key to understanding the design space here.

Member

I think the property we want to capture is something like this: assume_uniform requires the operand to be uniform at definition, and then both the use in assume_uniform and the result are also uniform at that program point. This avoids talking about active/inactive lanes.

If the operand is not uniform at definition, you should probably use subgroup_broadcast.

Collaborator

result are also uniform at that program point

... or poison.
(otherwise, agreed)

Contributor Author

After some internal discussion, even the uniform-at-definition requirement may not be enough. Consider the following example:

%cond = cmp thread_id > 10
%v = select %cond %c10, %c20
// %v is not uniform
scf.if %cond {
    %v1 = arith.add %v, %c1
    // %v1 is uniform at def currently, but won't be if `arith.add` is hoisted outside
    %u = assume_uniform %v1
}

We can, conservatively, request all lanes to be active at def in addition to the input being uniform at def (and that even covers the original use case I intended this op for, as it requires all lanes to be active anyway for unrelated reasons), but I feel there should be a less restrictive definition that is still useful.

[Pure, AllTypesMatch<["result", "src"]>,
DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>] #
ElementwiseMappable.traits>,
Arguments<(ins AnyType:$src)> {
Contributor

NIT: use let arguments = (ins ...)

Contributor

... Since when?

Contributor

Since never, but I would like to standardize on a single form of expressing these upstream. So maybe NIT is not the right request, but rather a "would you consider?".

Contributor

Ah

Yeah, we might want to discuss that somewhere other than PR comments

Collaborator

The Arguments<...> construct exists, I believe, to help when defining classes rather than definitions. I find it quite surprising to see it on an operation definition like this; I'm not sure what the motivation is for doing this instead of the normal let arguments = (ins ...)?

Contributor

A lot of the code I've seen upstream uses Arguments<>, so I figured that was the local style when I started working with MLIR.

some optimizations (e.g. put value into scalar registers).

This op can be freely speculated across structured control flow as parent
active mask is always superset of current mask and if can hoist input
Contributor

Suggested change
active mask is always superset of current mask and if can hoist input
active mask is always a superset of the current mask, and if it is possible to the hoist input

semantics if this op is removed. Backends can choose to ignore it or do
some optimizations (e.g. put value into scalar registers).

This op can be freely speculated across structured control flow as parent
Contributor

Suggested change
This op can be freely speculated across structured control flow as parent
This op can be freely speculated across structured control flow as the parent


This op can be freely speculated across structured control flow as parent
active mask is always superset of current mask and if can hoist input
calculation you can hoist the operation itself as well.
Contributor

Suggested change
calculation you can hoist the operation itself as well.
calculation then the operation can be hoisted.

let hasVerifier = 1;
}

def GPU_SubgroupUniformOp : GPU_Op<"subgroup_uniform",
Contributor

@Jianhui-Li Jianhui-Li Sep 11, 2025

Consider
%ret_val = assume_uniform @scope=subgroup %uniform_val?

Rationale: 1) the assume* prefix implies a compiler hint, consistent with other dialects like memref.assume*; 2) an extensible scope for potential future needs.

let description = [{
The "subgroup_uniform" op assumes that the value is uniform across all lanes
in a subgroup. This means that all active lanes in the subgroup are expected
to have the same value.
Contributor

Can you clarify whether the uniformity is for the input or the output? And is it for all lanes, or only for all active lanes?
My understanding was that it is for the input and for all lanes, regardless of whether they are active or not. That way we can lower it to a scalar register for all lanes, and we can always speculate it, since speculation means the computation yields the same result even if it is executed by a broader range of threads, including threads that may not use the output (as they become inactive at the use point).
But the second statement reads as if it is for the output, with only active lanes getting the same value. That invites questions like the ones Fabian raised in https://github.com/llvm/llvm-project/pull/157743/files#r2337838103

}];
let results = (outs AnyType:$result);
let assemblyFormat = "$src attr-dict `:` type($result)";
}
Contributor

It produces no results, and it's up to an analysis to determine whether the value is uniform by looking for an occurrence of the op, in which case the op cannot be pure (otherwise it can get elided), and it becomes harder to promote to something like a read first lane.

I prefer no result.

Let's say you can't speculate my.special_op, can an analysis conclude %v0 use in [tag] is uniform? Why or why not? What's the criteria.

My understanding is that the use case above means that %v0 will be lowered to a scalar register shared by all lanes, thus it should be uniform regardless of what my.special_op's semantics are. The op definition and description should support the lowering to a scalar register.

// CHECK-LABEL: func @subgroup_uniform
// CHECK-SAME: (%[[ARG:.*]]: f32)
func.func @subgroup_uniform(%arg0 : f32) -> f32 {
// CHECK: gpu.subgroup_uniform %[[ARG]] : f32
Contributor

Could you please add the following before the operation - just to remind what a typical use case looks like:

%arg0 = [thread ID x] / [subgroup size]; // instead of receiving from argument.
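A sketch of what that could look like (illustrative only; the function name is made up, and whether thread_id x divided by the subgroup size is really subgroup-uniform depends on how lanes map to subgroups, which is exactly the assumption the op would encode):

```mlir
// Hypothetical test where the operand is derived from lane-invariant
// quantities instead of being passed in as a function argument.
func.func @subgroup_uniform_from_ids() -> index {
  %tid = gpu.thread_id x
  %sg_size = gpu.subgroup_size : index
  // Uniform across the subgroup under the usual contiguous lane mapping.
  %sg_id = arith.divui %tid, %sg_size : index
  %u = gpu.subgroup_uniform %sg_id : index
  func.return %u : index
}
```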

func.func @subgroup_uniform(%arg0 : f32) -> f32 {
// CHECK: gpu.subgroup_uniform %[[ARG]] : f32
%0 = gpu.subgroup_uniform %arg0 : f32
func.return %0 : f32
Contributor

Also: Could you add one more test to show the effects of speculation in the related pass test?

Arguments<(ins AnyType:$src)> {
let summary = "Assumes value is unform across the lanes in subgroup";
let description = [{
The "subgroup_uniform" op assumes that the value is uniform across all lanes
Collaborator

I'd include "assume" in the op name to make it clear what this op is about.

@krzysz00
Contributor

Just to get a sense of how people are feeling, how would folks feel if this thing was called gpu.as_subgroup_uniform or gpu.make_subgroup_uniform?

@kuhar
Member

kuhar commented Sep 12, 2025

Just to get a sense of how people are feeling, how would folks feel if this thing was called gpu.as_subgroup_uniform or gpu.make_subgroup_uniform?

gpu.subgroup_assume_uniform?

@Jianhui-Li
Contributor

Just to get a sense of how people are feeling, how would folks feel if this thing was called gpu.as_subgroup_uniform or gpu.make_subgroup_uniform?

gpu.subgroup_assume_uniform?

I was thinking gpu.assume_uniform with subgroup as an attribute.

The assume_ prefix indicates that the op works as a compiler hint. The subgroup_ prefix implies that it is a subgroup operation, with each lane cooperating with its neighbouring lanes.
From the motivating example @krzysz00 shared, "[thread ID x] / [subgroup size]", the value could equally be uniform across a workgroup or a cluster if the example used the workgroup size or cluster size. So why not treat subgroup as an attribute instead of part of the name?

}

def GPU_SubgroupUniformOp : GPU_Op<"subgroup_uniform",
[Pure, AllTypesMatch<["result", "src"]>,
Collaborator

Suggested change
[Pure, AllTypesMatch<["result", "src"]>,
[NoMemoryEffect, AllTypesMatch<["result", "src"]>,

It's not clear to me that this is speculatable at all with respect to the property it provides.

Contributor Author

See my other comment on uniformity, but yeah, I think we can either make it speculatable if we require all lanes to be active at the definition, or make it non-speculatable without requiring all lanes to be active.
