diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 987fc13e0508d..3e2fa6b43d5fd 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -3255,4 +3255,37 @@ def GPU_SubgroupBroadcastOp : GPU_Op<"subgroup_broadcast",
   let hasVerifier = 1;
 }
 
+def GPU_SubgroupUniformOp : GPU_Op<"subgroup_uniform",
+    [Pure, AllTypesMatch<["result", "src"]>,
+     DeclareOpInterfaceMethods<InferIntRangeInterface,
+                               ["inferResultRanges"]>] #
+    ElementwiseMappable.traits>,
+    Arguments<(ins AnyType:$src)> {
+  let summary = "Assumes value is uniform across the lanes in subgroup";
+  let description = [{
+    The "subgroup_uniform" op assumes that the value is uniform across all
+    lanes in a subgroup. This means that all active lanes in the subgroup are
+    expected to have the same value.
+
+    This op can be used to inform the compiler that a value is uniform across
+    the subgroup, enabling optimizations. The result is poison if the value
+    is not actually uniform.
+
+    This op is functionally a no-op, as no valid program should change its
+    semantics if this op is removed. Backends can choose to ignore it or to
+    apply optimizations (e.g. put the value into scalar registers).
+
+    This op can be freely speculated across structured control flow, as the
+    parent active mask is always a superset of the current mask; if the input
+    calculation can be hoisted, the operation itself can be hoisted as well.
+
+    Example:
+
+    ```mlir
+    %1 = gpu.subgroup_uniform %0 : f32
+    ```
+  }];
+  let results = (outs AnyType:$result);
+  let assemblyFormat = "$src attr-dict `:` type($result)";
+}
+
 #endif // GPU_OPS
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index 807d1f52ee69b..5377ce709497e 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -201,6 +201,22 @@ struct GPUSubgroupBroadcastOpToROCDL
   }
 };
 
+struct GPUSubgroupUniformOpToROCDL
+    : public ConvertOpToLLVMPattern<gpu::SubgroupUniformOp> {
+  using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
+
+  LogicalResult
+  matchAndRewrite(gpu::SubgroupUniformOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    Value src = adaptor.getSrc();
+    if (!isSupportedReadLaneType(src.getType()))
+      return rewriter.notifyMatchFailure(op, "unsupported readlane type");
+
+    rewriter.replaceOpWithNewOp<ROCDL::ReadfirstlaneOp>(op, src.getType(), src);
+    return success();
+  }
+};
+
 struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
   using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
 
@@ -494,7 +510,8 @@ void mlir::populateGpuToROCDLConversionPatterns(
   patterns.add<GPUDynamicSharedMemoryOpLowering>(converter);
   patterns.add<GPUShuffleOpLowering, GPULaneIdOpToROCDL,
-               GPUSubgroupBroadcastOpToROCDL>(converter);
+               GPUSubgroupBroadcastOpToROCDL, GPUSubgroupUniformOpToROCDL>(
+      converter);
   patterns.add<GPUSubgroupReduceOpLowering>(converter, chipset);
   populateMathToROCDLConversionPatterns(converter, patterns);
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index 43b02f16aa829..6022fa517421a 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -2550,6 +2550,15 @@ LogicalResult gpu::SubgroupBroadcastOp::verify() {
   }
 }
 
+//===----------------------------------------------------------------------===//
+// GPU_SubgroupUniformOp
+//===----------------------------------------------------------------------===//
+
+void
+gpu::SubgroupUniformOp::inferResultRanges(
+    ArrayRef<ConstantIntRanges> argRanges, SetIntRangeFn setResultRange) {
+  setResultRange(getResult(), argRanges.front());
+}
+
 //===----------------------------------------------------------------------===//
 // GPU KernelMetadataAttr
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
index c6261b37ef8f2..69fb45d9097b8 100644
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
@@ -816,3 +816,15 @@ func.func @broadcast(%arg0 : index, %arg1 : i32) -> (index, index) {
   func.return %0, %1 : index, index
 }
 }
+
+// -----
+
+gpu.module @test_module {
+// CHECK-LABEL: func @uniform
+// CHECK-SAME: (%[[ARG:.*]]: i64)
+func.func @uniform(%arg0 : index) -> index {
+// CHECK: %{{.*}} = rocdl.readfirstlane %[[ARG]] : i64
+  %0 = gpu.subgroup_uniform %arg0 : index
+  func.return %0 : index
+}
+}
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
index e3e2474d917c8..d45d1cf52d91d 100644
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -552,3 +552,11 @@ func.func @subgroup_broadcast(%arg0 : f32, %arg1 : i32) -> (f32, f32) {
   %1 = gpu.subgroup_broadcast %arg0, specific_lane %arg1 : f32
   func.return %0, %1 : f32, f32
 }
+
+// CHECK-LABEL: func @subgroup_uniform
+// CHECK-SAME: (%[[ARG:.*]]: f32)
+func.func @subgroup_uniform(%arg0 : f32) -> f32 {
+  // CHECK: gpu.subgroup_uniform %[[ARG]] : f32
+  %0 = gpu.subgroup_uniform %arg0 : f32
+  func.return %0 : f32
+}