[mlir][gpu] Add gpu.subgroup_uniform op #157743
Conversation
@llvm/pr-subscribers-mlir-gpu

Author: Ivan Butygin (Hardcode84)

Changes: Introducing a dedicated op instead of broadcast

Full diff: https://github.com/llvm/llvm-project/pull/157743.diff

5 Files Affected:
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 987fc13e0508d..3e2fa6b43d5fd 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -3255,4 +3255,37 @@ def GPU_SubgroupBroadcastOp : GPU_Op<"subgroup_broadcast",
let hasVerifier = 1;
}
+def GPU_SubgroupUniformOp : GPU_Op<"subgroup_uniform",
+ [Pure, AllTypesMatch<["result", "src"]>,
+ DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>] #
+ ElementwiseMappable.traits>,
+ Arguments<(ins AnyType:$src)> {
+ let summary = "Assumes value is unform across the lanes in subgroup";
+ let description = [{
+ The "subgroup_uniform" op assumes that the value is uniform across all lanes
+ in a subgroup. This means that all active lanes in the subgroup are expected
+ to have the same value.
+
+ This op can be used to inform the compiler that a value is uniform across
+ the subgroup, enabling optimizations. The result is poison if the value
+ is not actually uniform.
+
+ This op is functionally no-op as no valid program should change its
+ semantics if this op is removed. Backends can choose to ignore it or do
+ some optimizations (e.g. put value into scalar registers).
+
+ This op can be freely speculated across structured control flow as parent
+ active mask is always superset of current mask and if can hoist input
+ calculation you can hoist the operation itself as well.
+
+ Example:
+
+ ```mlir
+ %1 = gpu.subgroup_uniform %0 : f32
+ ```
+ }];
+ let results = (outs AnyType:$result);
+ let assemblyFormat = "$src attr-dict `:` type($result)";
+}
+
#endif // GPU_OPS
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index 807d1f52ee69b..5377ce709497e 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -201,6 +201,22 @@ struct GPUSubgroupBroadcastOpToROCDL
}
};
+struct GPUSubgroupUniformOpToROCDL
+ : public ConvertOpToLLVMPattern<gpu::SubgroupUniformOp> {
+ using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
+
+ LogicalResult
+ matchAndRewrite(gpu::SubgroupUniformOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ Value src = adaptor.getSrc();
+ if (!isSupportedReadLaneType(src.getType()))
+ return rewriter.notifyMatchFailure(op, "unsupported readlane type");
+
+ rewriter.replaceOpWithNewOp<ROCDL::ReadfirstlaneOp>(op, src.getType(), src);
+ return success();
+ }
+};
+
struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
using ConvertOpToLLVMPattern<gpu::ShuffleOp>::ConvertOpToLLVMPattern;
@@ -494,7 +510,8 @@ void mlir::populateGpuToROCDLConversionPatterns(
patterns.add<GPUDynamicSharedMemoryOpLowering>(converter);
patterns.add<GPUShuffleOpLowering, GPULaneIdOpToROCDL,
- GPUSubgroupBroadcastOpToROCDL>(converter);
+ GPUSubgroupBroadcastOpToROCDL, GPUSubgroupUniformOpToROCDL>(
+ converter);
patterns.add<GPUSubgroupSizeOpToROCDL>(converter, chipset);
populateMathToROCDLConversionPatterns(converter, patterns);
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index 43b02f16aa829..6022fa517421a 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -2550,6 +2550,15 @@ LogicalResult gpu::SubgroupBroadcastOp::verify() {
}
}
+//===----------------------------------------------------------------------===//
+// GPU_SubgroupUniformOp
+//===----------------------------------------------------------------------===//
+
+void gpu::SubgroupUniformOp::inferResultRanges(
+ ArrayRef<ConstantIntRanges> argRanges, SetIntRangeFn setResultRange) {
+ setResultRange(getResult(), argRanges.front());
+}
+
//===----------------------------------------------------------------------===//
// GPU KernelMetadataAttr
//===----------------------------------------------------------------------===//
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
index c6261b37ef8f2..69fb45d9097b8 100644
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
@@ -816,3 +816,15 @@ func.func @broadcast(%arg0 : index, %arg1 : i32) -> (index, index) {
func.return %0, %1 : index, index
}
}
+
+// -----
+
+gpu.module @test_module {
+// CHECK-LABEL: func @unifprm
+// CHECK-SAME: (%[[ARG:.*]]: i64)
+func.func @unifprm(%arg0 : index) -> index {
+// CHECK: %{{.*}} = rocdl.readfirstlane %[[ARG]] : i64
+ %0 = gpu.subgroup_uniform %arg0 : index
+ func.return %0 : index
+}
+}
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
index e3e2474d917c8..d45d1cf52d91d 100644
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -552,3 +552,11 @@ func.func @subgroup_broadcast(%arg0 : f32, %arg1 : i32) -> (f32, f32) {
%1 = gpu.subgroup_broadcast %arg0, specific_lane %arg1 : f32
func.return %0, %1 : f32, f32
}
+
+// CHECK-LABEL: func @subgroup_uniform
+// CHECK-SAME: (%[[ARG:.*]]: f32)
+func.func @subgroup_uniform(%arg0 : f32) -> f32 {
+ // CHECK: gpu.subgroup_uniform %[[ARG]] : f32
+ %0 = gpu.subgroup_uniform %arg0 : f32
+ func.return %0 : f32
+}
Maybe also add an example and a test case with a vector type?
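For reference, such an example would presumably look like this (a hedged sketch, not taken from the PR; the vector shape is arbitrary):

```mlir
%1 = gpu.subgroup_uniform %0 : vector<4xf32>
```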
This op is functionally no-op as no valid program should change its
semantics if this op is removed. Backends can choose to ignore it or do
some optimizations (e.g. put value into scalar registers).
Suggested change:
- some optimizations (e.g. put value into scalar registers).
+ some optimizations (e.g., put value into scalar registers).
Please add a pattern for replacing the op by the input, allowing it to be truly a nop. You can then add this pattern to gpu-to-rocdl to handle the cases where it's not possible to use readfirstlane
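A hedged before/after sketch of what such a fallback pattern would do; the `"use"` op is just a placeholder, and the choice of a vector type as the case readfirstlane cannot handle is an assumption:

```mlir
// Before: the source type is not supported by the readfirstlane lowering.
%0 = gpu.subgroup_uniform %arg0 : vector<4xf32>
"use"(%0) : (vector<4xf32>) -> ()

// After: the op is simply replaced by its input, so it is a true no-op.
"use"(%arg0) : (vector<4xf32>) -> ()
```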
}];
let results = (outs AnyType:$result);
let assemblyFormat = "$src attr-dict `:` type($result)";
}
Please add a verifier checking `%src` has one use. This ensures a consistent view of the effects of uniformity.
I don't think this fits the local verification only rule: https://mlir.llvm.org/getting_started/DeveloperGuide/#ir-verifier
And we don't actually need a has-one-use constraint?
A value can be made/marked uniform but also not - that's not an error, that's just a missed optimization opportunity
To me, `gpu.subgroup_uniform` is a metadata op signaling that a value is uniform, so either:
- It produces no results, and it's up to an analysis to determine whether the value is uniform by looking for an occurrence of the op, in which case the op cannot be pure (otherwise it can get elided), and it becomes harder to promote to something like a read first lane.
- We ensure that all the uses know this comes from a uniform value to have a consistent view that a value is uniform, and the op is pure.
- We need to specify what happens in the following case:
```mlir
%u0 = gpu.subgroup_uniform %v0 : index
%u1 = gpu.subgroup_uniform %v1 : index
// ...
%v2 = arith.addi %v0, %v1 : index
%v3 = arith.addi %u0, %u1 : index
```
In `%v2`, is `%v0` uniform? Are `%v2` and `%v3` the same value? Can one be marked as uniform and the other not?
These can be defined, but I'd rather avoid defining what happens in those cases.
However, I do agree this is stretching what the guide says about the verifier being local, because while we are not looking at the producer of an operand, nor traversing the use-def chain, the effect of the constraint is not local.
```mlir
%u0 = gpu.subgroup_uniform %v0 : index
%u1 = gpu.subgroup_uniform %v1 : index
// ...
%v2 = arith.addi %v0, %v1 : index
%v3 = arith.addi %u0, %u1 : index
```
Has the exact same one-of-these-is-underannotated semantics as
```mlir
%m : memref<?xf32>
%m1 = memref.assume_align %m, 16 : memref<?xf32>
// ...
%l1 = memref.load %m1[%c0] : f32
%l2 = memref.load %m[%c0] : f32
```
Here, `%l1` can be analyzed to be loaded with an alignment of 16 bytes; `%l2` need not be.
One could, of course, have a rewrite that maximizes assumptions - replacing `%m` with `%m1` in `%l2` is correct, but not required.
In terms of analysis, in your example, the value `%v0` is uniform, but may not be known to be uniform. An analyzer can (probably should) conclude that `%v2` is not known to be uniform because it doesn't know what `%v0`'s deal is. `%v3` does know, however, because it's using `%u0`, which is annotated.
That same interchangeability logic from `memref.assume_alignment` applies - you can "freely" swap between `%v0` / `%u0`, gaining and losing assumptions as you go.
(Operationally, `gpu.subgroup_uniform` is expected to lower to material changes in register class on targets where that's a thing, but, semantically, that's not all that relevant.)
Always the same thing: that breaks something fundamental about SSA. The same examples from the guideline apply: how is a transformation which seems harmless supposed to not break this verifier?
Think of simple cases: the result of the assume is used in an `scf.for` loop with 2 iterations, I unroll the loop (the assume is outside of the loop), now loop-unrolling and boom...
Thank you for the explanation, that makes it clear, and I'm dropping the has-one-use proposal.
I don't follow, can you expand? This looks to me exactly like how all the variants of "assume" in a dataflow context I've seen worked out: you tag an SSA Value and add information in the dataflow. How is it different from memref.assume, for example?
I argue that `assume` semantics imply things about the operands, not the results. This works well with dataflow, the semantics are clear, and it is the model of `llvm.assume`.
Taking the case of `memref.assume`, or `gpu.subgroup_uniform`, the implication is on the result of the op, and not the operands. I claim these are not really compiler assumptions, as it's a new value with new information; you are not assuming anything on the previous value. Thus, their role is an active one, and they have closer semantics to a `nop` cast.
(To be consistent, I'd argue `memref.assume_alignment` should probably be called `memref.has_alignment` or `memref.add_alignment`.) So it's not really tagging, but creating a new value with added semantics.
Let's take a `memref.assume_alignment` that returns results and one that doesn't:
1.
```mlir
%1 = memref.assume_alignment %0, 8 : memref<4x4xf16>
%2 = memref.assume_alignment %0, 32 : memref<4x4xf16>
```
2.
```mlir
memref.assume_alignment %0, 8 : memref<4x4xf16>
memref.assume_alignment %0, 32 : memref<4x4xf16>
```
On 2, it's clear the assumption is on `%0`, so you could potentially combine those assumptions and say `%0` has an alignment of 32.
On the other hand, on 1, I don't think one should combine them. They have different effects that should be scoped to the uses of each of the operations, thus they are more like value promotions.
Just to be clear, I'm not saying we can't have that, I'm saying that's not the description of the op.
However, since it appears there's consensus on accepting assumes in this form, I'm not blocking the PR on this issue.
> I argue that assume semantics imply things about the operands, not the results. This works well with dataflow, the semantics are clear
I don't see the argument here? You seem to be repeatedly making claims that I don't understand the rationale behind, and I don't see much explanation of the fundamentals behind them.
> and it is the model of llvm.assume.
Actually no: `llvm.assume` is not dataflow based, it is a path-sensitive representation. You want to add information about a condition at a specific point, which is at odds with the properties you'd want from a "dataflow based" approach.
> I claim these are not really compiler assumptions
How so? We encode an assumption on a value; I don't quite see why this makes it "not a compiler assumption". You yourself identify later that the operation is a "nop": its only effect is on the compiler itself, as metadata that is inserted in the dataflow.
> On the other hand, on 1, I don't think one should combine them.
Right, one probably shouldn't: the assumption is dataflow based instead of path-based.
This is in line with the deferred UB provided by poison here.
You could be smart and combine some of these if you reach a point where you're using a value in a context where it creates UB to violate the assumption.
Basically, from a:
```mlir
%1 = memref.assume_alignment %0, 8 : memref<4x4xf16>
```
which isn't immediate UB when violated, you can generate later:
```mlir
memref.assume_alignment %0, 32 : memref<4x4xf16>
```
at a point where `%1` is used in a manner that triggers UB.
This dataflow vs. path sensitivity, along with deferred UB, is pretty key to understanding the design space here.
I think the property we want to capture is something like this: `assume_uniform` requires the operand to be uniform at definition, and then both the use in `assume_uniform` and the result are also uniform at that program point. This avoids talking about active/inactive lanes.
If the operand is not uniform at definition, you should probably use `subgroup_broadcast`.
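A hedged illustration of that split (hypothetical IR; the surrounding ops are only examples, and the `specific_lane` syntax is taken from the existing broadcast test above):

```mlir
// %sz is uniform at its definition, so annotating it is enough.
%sz = gpu.subgroup_size : index
%u = gpu.subgroup_uniform %sz : index

// %id differs per lane; making it uniform requires an actual broadcast.
%id = gpu.lane_id
%lane = arith.constant 0 : i32
%b = gpu.subgroup_broadcast %id, specific_lane %lane : index
```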
> result are also uniform at that program point
... or poison.
(otherwise, agreed)
After some internal discussion, even the uniform-at-definition requirement may not be enough. Consider the following example:
```mlir
%cond = cmp thread_id > 10
%v = select %cond %c10, %c20
// %v is not uniform
scf.if %cond {
  %v1 = arith.add %v, %c1
  // %v1 is uniform at def currently, but won't be if `arith.add` is hoisted outside
  %u = assume_uniform %v1
}
```
We can, conservatively, require all lanes to be active at def in addition to the input being uniform at def (and it even covers the original use case I intended this op for, as it requires all lanes active anyway for unrelated reasons), but I feel there should be a less restrictive definition which is still useful.
[Pure, AllTypesMatch<["result", "src"]>,
DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>] #
ElementwiseMappable.traits>,
Arguments<(ins AnyType:$src)> {
NIT: use let arguments = (ins ...)
... Since when?
Since never, but I would like to standardize a single form of expressing these upstream. So maybe NIT is not the right request, but rather would you consider?
Ah
Yeah, we might want to discuss that somewhere other than PR comments
The `Arguments<...>` construct exists, I believe, to help define classes rather than definitions; I find it quite surprising to see it on an operation definition like this. I'm not sure what the motivation is for doing this instead of the normal `let arguments = (ins ...)`?
A lot of the code I've seen upstream uses `Arguments<>`, so I figured that was local style when I started working with MLIR.
some optimizations (e.g. put value into scalar registers).

This op can be freely speculated across structured control flow as parent
active mask is always superset of current mask and if can hoist input
Suggested change:
- active mask is always superset of current mask and if can hoist input
+ active mask is always a superset of the current mask, and if it is possible to hoist the input
semantics if this op is removed. Backends can choose to ignore it or do
some optimizations (e.g. put value into scalar registers).

This op can be freely speculated across structured control flow as parent
Suggested change:
- This op can be freely speculated across structured control flow as parent
+ This op can be freely speculated across structured control flow as the parent
This op can be freely speculated across structured control flow as parent
active mask is always superset of current mask and if can hoist input
calculation you can hoist the operation itself as well.
Suggested change:
- calculation you can hoist the operation itself as well.
+ calculation then the operation can be hoisted.
let hasVerifier = 1;
}

def GPU_SubgroupUniformOp : GPU_Op<"subgroup_uniform",
Consider `%ret_val = assume_uniform @scope=subgroup %uniform_val`?
Rationale: 1) the `assume_` prefix implies a compiler hint, to be consistent with other dialects like `memref.assume*`; 2) an extensible scope for potential needs.
let description = [{
The "subgroup_uniform" op assumes that the value is uniform across all lanes
in a subgroup. This means that all active lanes in the subgroup are expected
to have the same value.
Can you clarify whether the uniformity is for the input or the output, and whether it is for all lanes or only for all active lanes?
My understanding was that it is for the input and for all lanes, regardless of whether they are active or inactive. That way we can lower it to a scalar register for all lanes, and we can always speculate it - speculation means the computation yields the same result even if it is executed in a broader range of threads, including threads which may not use the output (because they become inactive at the use point).
But the second statement is written as if it is for the output, with only active lanes getting the same value. It invites questions like the one Fabian raised in https://github.com/llvm/llvm-project/pull/157743/files#r2337838103
}];
let results = (outs AnyType:$result);
let assemblyFormat = "$src attr-dict `:` type($result)";
}
> It produces no results, and it's up to an analysis to determine whether the value is uniform by looking for an occurrence of the op, in which case the op cannot be pure (otherwise it can get elided), and it becomes harder to promote to something like a read first lane.
I prefer no result.
Let's say you can't speculate `my.special_op` - can an analysis conclude the `%v0` use in [tag] is uniform? Why or why not? What are the criteria?
My understanding is that the use case above means that `%v0` will be lowered to a scalar register which is shared by all lanes, thus it should be uniform regardless of what `my.special_op`'s semantics are. The op definition and description should support the lowering to scalar.
// CHECK-LABEL: func @subgroup_uniform
// CHECK-SAME: (%[[ARG:.*]]: f32)
func.func @subgroup_uniform(%arg0 : f32) -> f32 {
// CHECK: gpu.subgroup_uniform %[[ARG]] : f32
Could you please add the following before the operation - just to remind what a typical use case looks like:
%arg0 = [thread ID x] / [subgroup size]; // instead of receiving from argument.
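In IR terms that might look roughly like the following (a hedged sketch; the specific ops used to derive the value are an assumption, not taken from the PR):

```mlir
%tid = gpu.thread_id x
%sz = gpu.subgroup_size : index
// Every lane in the same subgroup computes the same quotient.
%val = arith.divui %tid, %sz : index
%0 = gpu.subgroup_uniform %val : index
```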
func.func @subgroup_uniform(%arg0 : f32) -> f32 {
// CHECK: gpu.subgroup_uniform %[[ARG]] : f32
%0 = gpu.subgroup_uniform %arg0 : f32
func.return %0 : f32
Also: could you add one more test to show the effects of speculation in a related pass test?
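A hedged sketch of the kind of before/after such a test could check (hypothetical pass output, not part of this PR; `"use"` is a placeholder op):

```mlir
// Before:
scf.if %cond {
  %u = gpu.subgroup_uniform %v : f32
  "use"(%u) : (f32) -> ()
}

// After hoisting, which the op description permits because the parent
// active mask is a superset of the mask inside the scf.if:
%u = gpu.subgroup_uniform %v : f32
scf.if %cond {
  "use"(%u) : (f32) -> ()
}
```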
Arguments<(ins AnyType:$src)> {
let summary = "Assumes value is unform across the lanes in subgroup";
let description = [{
The "subgroup_uniform" op assumes that the value is uniform across all lanes
I'd include "assume" in the op name to make it clear what this op is about.
Just to get a sense of how people are feeling, how would folks feel if this thing was called
I was thinking gpu.assume_uniform with subgroup as an attribute. The assume_ prefix indicates that the op works as a compiler hint. The subgroup_ prefix implies that it is a subgroup operation, and the lane is cooperating with neighbouring lanes.
}

def GPU_SubgroupUniformOp : GPU_Op<"subgroup_uniform",
[Pure, AllTypesMatch<["result", "src"]>,
Suggested change:
- [Pure, AllTypesMatch<["result", "src"]>,
+ [NoMemoryEffect, AllTypesMatch<["result", "src"]>,
It's not clear to me that this is all speculatable with respect to the property it provides?
See my other comment on uniformity, but yeah, I think we can either make it speculatable if we require all lanes to be active at definition or make it non-speculatable but without requiring all lanes
Introducing a dedicated op instead of broadcast `any_lane`, per discussion #152808