[mlir][gpu] Revert gpu.subgroup_broadcast with any_lane #157373
Conversation
This partially reverts llvm#152808. Post-commit comments revealed that the `any_lane` variant had not been fully agreed upon at the time of landing.
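For context, the variant being removed next to the two that remain, as a minimal sketch based on the ops.mlir test updated in this diff (`%arg0`/`%arg1` are placeholder SSA names):

```mlir
// Kept by this revert:
%0 = gpu.subgroup_broadcast %arg0, first_active_lane : f32
%1 = gpu.subgroup_broadcast %arg0, specific_lane %arg1 : f32

// Removed by this revert (the form under discussion):
%2 = gpu.subgroup_broadcast %arg0, any_lane : f32
```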
@llvm/pr-subscribers-mlir @llvm/pr-subscribers-mlir-gpu

Author: Jakub Kuderski (kuhar)

Changes

This partially reverts #152808. Post-commit comments revealed that the `any_lane` variant had not been fully agreed upon at the time of landing.

Full diff: https://github.com/llvm/llvm-project/pull/157373.diff

7 Files Affected:
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 3fb0cfe7e2a54..987fc13e0508d 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -3219,8 +3219,7 @@ def GPU_BroadcastType : I32EnumAttr<"BroadcastType",
"a lane to broadcast from",
[
I32EnumAttrCase<"first_active_lane", 0>,
- I32EnumAttrCase<"any_lane", 1>,
- I32EnumAttrCase<"specific_lane", 2>
+ I32EnumAttrCase<"specific_lane", 1>
]>{
let genSpecializedAttr = 0;
let cppNamespace = "::mlir::gpu";
@@ -3248,13 +3247,6 @@ def GPU_SubgroupBroadcastOp : GPU_Op<"subgroup_broadcast",
must be uniform and within the subgroup size. The result is poison if the
lane index is invalid, non subgroup-uniform, or if the source lane is not
active.
- * `any_lane` - broadcasts the value from any lane of the subgroup,
- assuming the input is already subgroup uniform. The result is poison if
- the input is not uniform. This is useful to convey uniformity to the
- compiler to enable more optimizations. Also, it allows more speculation
- opportunities than `first_active_lane` since `first_active_lane` results
- can depend on active lanes which may change during speculation across
- control flow.
}];
let results = (outs AnyType:$result);
let assemblyFormat = [{
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index 8994905e45ad9..807d1f52ee69b 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -193,9 +193,7 @@ struct GPUSubgroupBroadcastOpToROCDL
if (adaptor.getBroadcastType() == gpu::BroadcastType::specific_lane) {
rewriter.replaceOpWithNewOp<ROCDL::ReadlaneOp>(op, src.getType(), src,
adaptor.getLane());
- } else { // first_active_lane or any_lane
- // any_lane is lowered to readfirstlane too, to force value into scalar
- // register.
+ } else { // first_active_lane
rewriter.replaceOpWithNewOp<ROCDL::ReadfirstlaneOp>(op, src.getType(),
src);
}
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index b87b4f4b5f088..43b02f16aa829 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -2529,8 +2529,6 @@ Speculation::Speculatability gpu::SubgroupBroadcastOp::getSpeculatability() {
// Cannot speculate first_lane broadcast, because speculating it across
// control flow can change the active lanes.
return Speculation::NotSpeculatable;
- case BroadcastType::any_lane:
- LLVM_FALLTHROUGH;
case BroadcastType::specific_lane:
// Speculation should be safe as long as we inside structured control flow.
return Speculation::Speculatable;
@@ -2540,8 +2538,6 @@ Speculation::Speculatability gpu::SubgroupBroadcastOp::getSpeculatability() {
LogicalResult gpu::SubgroupBroadcastOp::verify() {
switch (getBroadcastType()) {
case BroadcastType::first_active_lane:
- LLVM_FALLTHROUGH;
- case BroadcastType::any_lane:
if (getLane())
return emitOpError()
<< "lane can only be specified for `specific_lane` broadcast";
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
index 6dd03b1c68554..c6261b37ef8f2 100644
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
@@ -808,13 +808,11 @@ gpu.module @test_module {
gpu.module @test_module {
// CHECK-LABEL: func @broadcast
// CHECK-SAME: (%[[ARG:.*]]: i64, %[[IDX:.*]]: i32)
-func.func @broadcast(%arg0 : index, %arg1 : i32) -> (index, index, index) {
-// CHECK: %{{.*}} = rocdl.readfirstlane %[[ARG]] : i64
+func.func @broadcast(%arg0 : index, %arg1 : i32) -> (index, index) {
// CHECK: %{{.*}} = rocdl.readfirstlane %[[ARG]] : i64
// CHECK: %{{.*}} = rocdl.readlane %[[ARG]], %[[IDX]] : (i64, i32) -> i64
%0 = gpu.subgroup_broadcast %arg0, first_active_lane : index
- %1 = gpu.subgroup_broadcast %arg0, any_lane : index
- %2 = gpu.subgroup_broadcast %arg0, specific_lane %arg1 : index
- func.return %0, %1, %2 : index, index, index
+ %1 = gpu.subgroup_broadcast %arg0, specific_lane %arg1 : index
+ func.return %0, %1 : index, index
}
}
diff --git a/mlir/test/Dialect/GPU/broadcast-speculatability.mlir b/mlir/test/Dialect/GPU/broadcast-speculatability.mlir
index ea32d62756c35..3cf4853effee5 100644
--- a/mlir/test/Dialect/GPU/broadcast-speculatability.mlir
+++ b/mlir/test/Dialect/GPU/broadcast-speculatability.mlir
@@ -1,24 +1,22 @@
// RUN: mlir-opt %s --loop-invariant-code-motion | FileCheck %s
-func.func private @side_effect(%arg0 : f32, %arg1 : f32, %arg2 : f32)
+func.func private @side_effect(%arg0 : f32, %arg1 : f32)
// CHECK-LABEL: func @broadcast_hoisting
// CHECK-SAME: (%[[ARG:.*]]: f32, %[[IDX:.*]]: i32, {{.*}}: index)
func.func @broadcast_hoisting(%arg0 : f32, %arg1 : i32, %arg2 : index) {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
-// `any_lane` and `specific_lane` can be speculated across the control flow, but
+// `specific_lane` can be speculated across the control flow, but
// `first_active_lane` cannot as active lanes can change.
-// CHECK: %[[V1:.*]] = gpu.subgroup_broadcast %[[ARG]], any_lane : f32
-// CHECK: %[[V2:.*]] = gpu.subgroup_broadcast %[[ARG]], specific_lane %[[IDX]] : f32
+// CHECK: %[[V1:.*]] = gpu.subgroup_broadcast %[[ARG]], specific_lane %[[IDX]] : f32
// CHECK: scf.for
// CHECK: %[[V0:.*]] = gpu.subgroup_broadcast %[[ARG]], first_active_lane : f32
-// CHECK: func.call @side_effect(%[[V0]], %[[V1]], %[[V2]])
+// CHECK: func.call @side_effect(%[[V0]], %[[V1]])
scf.for %i = %c0 to %arg2 step %c1 {
%0 = gpu.subgroup_broadcast %arg0, first_active_lane : f32
- %1 = gpu.subgroup_broadcast %arg0, any_lane : f32
- %2 = gpu.subgroup_broadcast %arg0, specific_lane %arg1 : f32
- func.call @side_effect(%0, %1, %2) : (f32, f32, f32) -> ()
+ %1 = gpu.subgroup_broadcast %arg0, specific_lane %arg1 : f32
+ func.call @side_effect(%0, %1) : (f32, f32) -> ()
}
func.return
}
diff --git a/mlir/test/Dialect/GPU/int-range-interface.mlir b/mlir/test/Dialect/GPU/int-range-interface.mlir
index 2e92db0f342aa..628b668a65969 100644
--- a/mlir/test/Dialect/GPU/int-range-interface.mlir
+++ b/mlir/test/Dialect/GPU/int-range-interface.mlir
@@ -336,15 +336,12 @@ module attributes {gpu.container_module} {
func.func @broadcast(%idx: i32) {
%0 = test.with_bounds { umin = 0 : index, umax = 10 : index, smin = 0 : index, smax = 10 : index } : index
%1 = gpu.subgroup_broadcast %0, first_active_lane : index
- %2 = gpu.subgroup_broadcast %0, any_lane : index
- %3 = gpu.subgroup_broadcast %0, specific_lane %idx : index
+ %2 = gpu.subgroup_broadcast %0, specific_lane %idx : index
- // CHECK: test.reflect_bounds {smax = 10 : index, smin = 0 : index, umax = 10 : index, umin = 0 : index}
// CHECK: test.reflect_bounds {smax = 10 : index, smin = 0 : index, umax = 10 : index, umin = 0 : index}
// CHECK: test.reflect_bounds {smax = 10 : index, smin = 0 : index, umax = 10 : index, umin = 0 : index}
%4 = test.reflect_bounds %1 : index
%5 = test.reflect_bounds %2 : index
- %6 = test.reflect_bounds %3 : index
return
}
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
index cd889f8025e6f..e3e2474d917c8 100644
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -545,12 +545,10 @@ func.func @warp_operand_result(%laneid: index, %v0 : vector<4xi32>) -> (vector<4
// CHECK-LABEL: func @subgroup_broadcast
// CHECK-SAME: (%[[ARG:.*]]: f32, %[[IDX:.*]]: i32)
-func.func @subgroup_broadcast(%arg0 : f32, %arg1 : i32) -> (f32, f32, f32) {
+func.func @subgroup_broadcast(%arg0 : f32, %arg1 : i32) -> (f32, f32) {
// CHECK: gpu.subgroup_broadcast %[[ARG]], first_active_lane : f32
%0 = gpu.subgroup_broadcast %arg0, first_active_lane : f32
- // CHECK: gpu.subgroup_broadcast %[[ARG]], any_lane : f32
- %1 = gpu.subgroup_broadcast %arg0, any_lane : f32
// CHECK: gpu.subgroup_broadcast %[[ARG]], specific_lane %[[IDX]] : f32
- %2 = gpu.subgroup_broadcast %arg0, specific_lane %arg1 : f32
- func.return %0, %1, %2 : f32, f32, f32
+ %1 = gpu.subgroup_broadcast %arg0, specific_lane %arg1 : f32
+ func.return %0, %1 : f32, f32
}
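For quick reference, the two remaining modes lower to ROCDL as exercised by the updated gpu-to-rocdl.mlir test above. A minimal sketch of that lowering, following the CHECK lines in the test and assuming an `i64` operand so the index-to-i64 conversion can be elided (`%arg0`/`%idx` are placeholder names):

```mlir
// Input (gpu dialect):
%0 = gpu.subgroup_broadcast %arg0, first_active_lane : i64
%1 = gpu.subgroup_broadcast %arg0, specific_lane %idx : i64

// After -convert-gpu-to-rocdl, per the CHECK lines above:
%2 = rocdl.readfirstlane %arg0 : i64
%3 = rocdl.readlane %arg0, %idx : (i64, i32) -> i64
```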
Err, I forgot that LLVM's auto-merge doesn't wait for at least one approving review... I think this is a straightforward revert we agreed on, but let me know if I should clean something up.