diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 3fb0cfe7e2a54..987fc13e0508d 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -3219,8 +3219,7 @@ def GPU_BroadcastType : I32EnumAttr<"BroadcastType",
     "a lane to broadcast from", [
       I32EnumAttrCase<"first_active_lane", 0>,
-      I32EnumAttrCase<"any_lane", 1>,
-      I32EnumAttrCase<"specific_lane", 2>
+      I32EnumAttrCase<"specific_lane", 1>
   ]>{
   let genSpecializedAttr = 0;
   let cppNamespace = "::mlir::gpu";
@@ -3248,13 +3247,6 @@ def GPU_SubgroupBroadcastOp : GPU_Op<"subgroup_broadcast",
       must be uniform and within the subgroup size. The result is poison if
       the lane index is invalid, non subgroup-uniform, or if the source lane
       is not active.
-    * `any_lane` - broadcasts the value from any lane of the subgroup,
-      assuming the input is already subgroup uniform. The result is poison if
-      the input is not uniform. This is useful to convey uniformity to the
-      compiler to enable more optimizations. Also, it allows more speculation
-      opportunities than `first_active_lane` since `first_active_lane` results
-      can depend on active lanes which may change during speculation across
-      control flow.
   }];
   let results = (outs AnyType:$result);
   let assemblyFormat = [{
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index 8994905e45ad9..807d1f52ee69b 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -193,9 +193,7 @@ struct GPUSubgroupBroadcastOpToROCDL
     if (adaptor.getBroadcastType() == gpu::BroadcastType::specific_lane) {
       rewriter.replaceOpWithNewOp<ROCDL::ReadlaneOp>(op, src.getType(), src,
                                                      adaptor.getLane());
-    } else { // first_active_lane or any_lane
-      // any_lane is lowered to readfirstlane too, to force value into scalar
-      // register.
+    } else { // first_active_lane
       rewriter.replaceOpWithNewOp<ROCDL::ReadfirstlaneOp>(op, src.getType(),
                                                           src);
     }
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index b87b4f4b5f088..43b02f16aa829 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -2529,8 +2529,6 @@ Speculation::Speculatability gpu::SubgroupBroadcastOp::getSpeculatability() {
     // Cannot speculate first_lane broadcast, because speculating it across
     // control flow can change the active lanes.
     return Speculation::NotSpeculatable;
-  case BroadcastType::any_lane:
-    LLVM_FALLTHROUGH;
   case BroadcastType::specific_lane:
     // Speculation should be safe as long as we inside structured control flow.
    return Speculation::Speculatable;
@@ -2540,8 +2538,6 @@ Speculation::Speculatability gpu::SubgroupBroadcastOp::getSpeculatability() {
 LogicalResult gpu::SubgroupBroadcastOp::verify() {
   switch (getBroadcastType()) {
   case BroadcastType::first_active_lane:
-    LLVM_FALLTHROUGH;
-  case BroadcastType::any_lane:
     if (getLane())
       return emitOpError()
              << "lane can only be specified for `specific_lane` broadcast";
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
index 6dd03b1c68554..c6261b37ef8f2 100644
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
@@ -808,13 +808,11 @@ gpu.module @test_module {
 gpu.module @test_module {
 // CHECK-LABEL: func @broadcast
 // CHECK-SAME: (%[[ARG:.*]]: i64, %[[IDX:.*]]: i32)
-func.func @broadcast(%arg0 : index, %arg1 : i32) -> (index, index, index) {
-// CHECK: %{{.*}} = rocdl.readfirstlane %[[ARG]] : i64
+func.func @broadcast(%arg0 : index, %arg1 : i32) -> (index, index) {
 // CHECK: %{{.*}} = rocdl.readfirstlane %[[ARG]] : i64
 // CHECK: %{{.*}} = rocdl.readlane %[[ARG]], %[[IDX]] : (i64, i32) -> i64
   %0 = gpu.subgroup_broadcast %arg0, first_active_lane : index
-  %1 = gpu.subgroup_broadcast %arg0, any_lane : index
-  %2 = gpu.subgroup_broadcast %arg0, specific_lane %arg1 : index
-  func.return %0, %1, %2 : index, index, index
+  %1 = gpu.subgroup_broadcast %arg0, specific_lane %arg1 : index
+  func.return %0, %1 : index, index
 }
 }
diff --git a/mlir/test/Dialect/GPU/broadcast-speculatability.mlir b/mlir/test/Dialect/GPU/broadcast-speculatability.mlir
index ea32d62756c35..3cf4853effee5 100644
--- a/mlir/test/Dialect/GPU/broadcast-speculatability.mlir
+++ b/mlir/test/Dialect/GPU/broadcast-speculatability.mlir
@@ -1,24 +1,22 @@
 // RUN: mlir-opt %s --loop-invariant-code-motion | FileCheck %s

-func.func private @side_effect(%arg0 : f32, %arg1 : f32, %arg2 : f32)
+func.func private @side_effect(%arg0 : f32, %arg1 : f32)

 // CHECK-LABEL: func @broadcast_hoisting
 // CHECK-SAME: (%[[ARG:.*]]: f32, %[[IDX:.*]]: i32, {{.*}}: index)
 func.func @broadcast_hoisting(%arg0 : f32, %arg1 : i32, %arg2 : index) {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
-// `any_lane` and `specific_lane` can be speculated across the control flow, but
+// `specific_lane` can be speculated across the control flow, but
 // `first_active_lane` cannot as active lanes can change.
-// CHECK: %[[V1:.*]] = gpu.subgroup_broadcast %[[ARG]], any_lane : f32
-// CHECK: %[[V2:.*]] = gpu.subgroup_broadcast %[[ARG]], specific_lane %[[IDX]] : f32
+// CHECK: %[[V1:.*]] = gpu.subgroup_broadcast %[[ARG]], specific_lane %[[IDX]] : f32
 // CHECK: scf.for
 // CHECK: %[[V0:.*]] = gpu.subgroup_broadcast %[[ARG]], first_active_lane : f32
-// CHECK: func.call @side_effect(%[[V0]], %[[V1]], %[[V2]])
+// CHECK: func.call @side_effect(%[[V0]], %[[V1]])
   scf.for %i = %c0 to %arg2 step %c1 {
     %0 = gpu.subgroup_broadcast %arg0, first_active_lane : f32
-    %1 = gpu.subgroup_broadcast %arg0, any_lane : f32
-    %2 = gpu.subgroup_broadcast %arg0, specific_lane %arg1 : f32
-    func.call @side_effect(%0, %1, %2) : (f32, f32, f32) -> ()
+    %1 = gpu.subgroup_broadcast %arg0, specific_lane %arg1 : f32
+    func.call @side_effect(%0, %1) : (f32, f32) -> ()
   }
   func.return
 }
diff --git a/mlir/test/Dialect/GPU/int-range-interface.mlir b/mlir/test/Dialect/GPU/int-range-interface.mlir
index 2e92db0f342aa..628b668a65969 100644
--- a/mlir/test/Dialect/GPU/int-range-interface.mlir
+++ b/mlir/test/Dialect/GPU/int-range-interface.mlir
@@ -336,15 +336,12 @@ module attributes {gpu.container_module} {
   func.func @broadcast(%idx: i32) {
     %0 = test.with_bounds { umin = 0 : index, umax = 10 : index, smin = 0 : index, smax = 10 : index } : index
     %1 = gpu.subgroup_broadcast %0, first_active_lane : index
-    %2 = gpu.subgroup_broadcast %0, any_lane : index
-    %3 = gpu.subgroup_broadcast %0, specific_lane %idx : index
+    %2 = gpu.subgroup_broadcast %0, specific_lane %idx : index

-    // CHECK: test.reflect_bounds {smax = 10 : index, smin = 0 : index, umax = 10 : index, umin = 0 : index}
     // CHECK: test.reflect_bounds {smax = 10 : index, smin = 0 : index, umax = 10 : index, umin = 0 : index}
     // CHECK: test.reflect_bounds {smax = 10 : index, smin = 0 : index, umax = 10 : index, umin = 0 : index}
     %4 = test.reflect_bounds %1 : index
     %5 = test.reflect_bounds %2 : index
-    %6 = test.reflect_bounds %3 : index
     return
   }
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
index cd889f8025e6f..e3e2474d917c8 100644
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -545,12 +545,10 @@ func.func @warp_operand_result(%laneid: index, %v0 : vector<4xi32>) -> (vector<4

 // CHECK-LABEL: func @subgroup_broadcast
 // CHECK-SAME: (%[[ARG:.*]]: f32, %[[IDX:.*]]: i32)
-func.func @subgroup_broadcast(%arg0 : f32, %arg1 : i32) -> (f32, f32, f32) {
+func.func @subgroup_broadcast(%arg0 : f32, %arg1 : i32) -> (f32, f32) {
   // CHECK: gpu.subgroup_broadcast %[[ARG]], first_active_lane : f32
   %0 = gpu.subgroup_broadcast %arg0, first_active_lane : f32
-  // CHECK: gpu.subgroup_broadcast %[[ARG]], any_lane : f32
-  %1 = gpu.subgroup_broadcast %arg0, any_lane : f32
   // CHECK: gpu.subgroup_broadcast %[[ARG]], specific_lane %[[IDX]] : f32
-  %2 = gpu.subgroup_broadcast %arg0, specific_lane %arg1 : f32
-  func.return %0, %1, %2 : f32, f32, f32
+  %1 = gpu.subgroup_broadcast %arg0, specific_lane %arg1 : f32
+  func.return %0, %1 : f32, f32
 }
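Note (illustrative, not part of the patch): the removed `any_lane` variant was lowered to `rocdl.readfirstlane` anyway, so IR that used it can be rewritten with `first_active_lane`, giving up only the extra speculation freedom the removed documentation described. A minimal sketch of such a rewrite, using a hypothetical @migrate function:

// Sketch only: a former `any_lane` user falls back to `first_active_lane`,
// which also lowers to rocdl.readfirstlane on the ROCDL path.
func.func @migrate(%arg0 : f32) -> f32 {
  // Previously: %0 = gpu.subgroup_broadcast %arg0, any_lane : f32
  %0 = gpu.subgroup_broadcast %arg0, first_active_lane : f32
  func.return %0 : f32
}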