mlir/include/mlir/Dialect/GPU/IR/GPUOps.td (33 additions, 0 deletions)

@@ -3255,4 +3255,37 @@ def GPU_SubgroupBroadcastOp : GPU_Op<"subgroup_broadcast",
let hasVerifier = 1;
}

def GPU_SubgroupUniformOp : GPU_Op<"subgroup_uniform",
**@Jianhui-Li** (Contributor), Sep 11, 2025:

Consider:

```mlir
%ret_val = assume_uniform @scope=subgroup %uniform_val
```

Rationale: 1) the `assume*` prefix implies a compiler hint, consistent with other dialects such as `memref.assume*`; 2) an explicit scope is extensible for potential future needs.

[Pure, AllTypesMatch<["result", "src"]>,
**Collaborator:**

Suggested change:

```suggestion
-[Pure, AllTypesMatch<["result", "src"]>,
+[NoMemoryEffect, AllTypesMatch<["result", "src"]>,
```

It's not clear to me that this op is speculatable with respect to the property it provides?

**Contributor Author:**

See my other comment on uniformity, but yes: I think we can either make it speculatable if we require all lanes to be active at the definition, or make it non-speculatable without requiring all lanes.

DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>] #
ElementwiseMappable.traits>,
Arguments<(ins AnyType:$src)> {
**Contributor:**

NIT: use `let arguments = (ins ...)`.

**Contributor:**

... Since when?

**Contributor:**

Since never, but I would like to standardize on a single form of expressing these upstream. So maybe "NIT" is not the right label; rather: would you consider it?

**Contributor:**

Ah. Yeah, we might want to discuss that somewhere other than PR comments.

**Collaborator:**

The `Arguments<...>` construct exists, I believe, to help define classes rather than definitions. I find it quite surprising to see it on an operation definition like this; I'm not sure what the motivation is for using it instead of the normal `let arguments = (ins ...)`.

**Contributor:**

A lot of the code I've seen upstream uses `Arguments<>`, so I figured that was the local style when I started working with MLIR.

let summary = "Assumes value is uniform across the lanes in subgroup";
let description = [{
The "subgroup_uniform" op assumes that the value is uniform across all lanes
**Collaborator:**

I'd include "assume" in the op name to make it clear what this op is about.

in a subgroup. This means that all active lanes in the subgroup are expected
to have the same value.
**Contributor:**

Can you clarify whether the uniformity applies to the input or the output, and whether it covers all lanes or only the active lanes? My understanding was that it applies to the input and to all lanes, active or not. That way the value can be lowered to a scalar register for all lanes, and the op can always be speculated: speculation means the computation yields the same result even when executed by a broader set of threads, including threads that never use the output (because they are inactive at the use point). But the second sentence reads as if the property applies to the output, with only the active lanes guaranteed to see the same value. That invites questions like the one Fabian raised in https://github.com/llvm/llvm-project/pull/157743/files#r2337838103.
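To make the ambiguity concrete, here is a minimal hypothetical sketch (the value names and the divergent condition are assumed for illustration, not taken from the patch):

```mlir
// Every lane loads the same address, so %v is uniform across *all* lanes.
%v = memref.load %m[%c0] : memref<1xf32>
scf.if %cond {  // divergent branch: only some lanes are active here
  // If the guarantee is "all active lanes see the same value", the
  // annotation says nothing about the lanes that never reach this point;
  // if it is "uniform across all lanes", it constrains them too.
  %u = gpu.subgroup_uniform %v : f32
}
```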


This op can be used to inform the compiler that a value is uniform across
the subgroup, enabling optimizations. The result is poison if the value
is not actually uniform.

This op is functionally a no-op, as removing it should not change the
semantics of any valid program. Backends can choose to ignore it or do
some optimizations (e.g. put value into scalar registers).
**Member:**

Suggested change:

```suggestion
-some optimizations (e.g. put value into scalar registers).
+some optimizations (e.g., put value into scalar registers).
```


This op can be freely speculated across structured control flow as parent
**Contributor:**

Suggested change:

```suggestion
-This op can be freely speculated across structured control flow as parent
+This op can be freely speculated across structured control flow as the parent
```

active mask is always superset of current mask and if can hoist input
**Contributor:**

Suggested change:

```suggestion
-active mask is always superset of current mask and if can hoist input
+active mask is always a superset of the current mask, and if it is possible to hoist the input
```

calculation you can hoist the operation itself as well.
**Contributor:**

Suggested change:

```suggestion
-calculation you can hoist the operation itself as well.
+calculation then the operation can be hoisted as well.
```
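To make the hoisting argument concrete, here is a hypothetical before/after sketch (loop structure and names assumed, not part of the patch). Before:

```mlir
scf.for %i = %lb to %ub step %step {
  // %x is defined outside the loop and is loop-invariant.
  %u = gpu.subgroup_uniform %x : f32
  "test.use"(%u) : (f32) -> ()
}
```

After hoisting, the parent region's active mask is a superset of the loop body's, so the annotation still holds:

```mlir
%u = gpu.subgroup_uniform %x : f32
scf.for %i = %lb to %ub step %step {
  "test.use"(%u) : (f32) -> ()
}
```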


Example:

```mlir
%1 = gpu.subgroup_uniform %0 : f32
```
}];
let results = (outs AnyType:$result);
let assemblyFormat = "$src attr-dict `:` type($result)";
}
**Contributor:**

Please add a verifier checking that `%src` has one use. This ensures a consistent view of the effects of uniformity.

**Member:**

I don't think this fits the local-verification-only rule: https://mlir.llvm.org/getting_started/DeveloperGuide/#ir-verifier

**Contributor:**

And we don't actually need a has-one-use constraint. A value can be marked uniform in one place but not in another; that's not an error, just a missed optimization opportunity.

**Contributor:**

To me, `gpu.subgroup_uniform` is a metadata op signaling that a value is uniform, so either:

1. It produces no results, and it is up to an analysis to determine whether a value is uniform by looking for an occurrence of the op. In that case the op cannot be pure (otherwise it can get elided), and it becomes harder to promote to something like a read-first-lane.
2. We ensure that all the uses know the value comes from a uniform source, so there is a consistent view that the value is uniform, and the op is pure.
3. We need to specify what happens in the following case:

```mlir
%u0 = gpu.subgroup_uniform %v0 : index
%u1 = gpu.subgroup_uniform %v1 : index
// ...
%v2 = arith.addi %v0, %v1 : index
%v3 = arith.addi %u0, %u1 : index
```

In `%v2`, is `%v0` uniform? Are `%v2` and `%v3` the same value? Can one be marked as uniform and the other not? These can be defined, but I'd rather avoid defining what happens in those cases.

However, I do agree this is stretching what the guide says about the verifier being local: while we are not looking at the producer of an operand, nor traversing the use-def chain, the effect of the constraint is not local.

**Contributor:**

```mlir
%u0 = gpu.subgroup_uniform %v0 : index
%u1 = gpu.subgroup_uniform %v1 : index
// ...
%v2 = arith.addi %v0, %v1 : index
%v3 = arith.addi %u0, %u1 : index
```

has the exact same one-of-these-is-underannotated semantics as

```mlir
// %m : memref<?xf32>
%m1 = memref.assume_alignment %m, 16 : memref<?xf32>
// ...
%l1 = memref.load %m1[%c0] : memref<?xf32>
%l2 = memref.load %m[%c0] : memref<?xf32>
```

Here, `%l1` can be analyzed to be loaded with an alignment of 16 bytes; `%l2` need not be.

One could, of course, have a rewrite that maximizes assumptions: replacing `%m` with `%m1` in `%l2` is correct, but not required.

In terms of analysis, in your example, the value `%v0` is uniform but may not be *known* to be uniform. An analyzer can (and probably should) conclude that `%v2` is not known to be uniform, because it doesn't know what `%v0`'s deal is. `%v3` is known to be uniform, however, because it uses `%u0`, which is annotated.

The same interchangeability logic from `memref.assume_alignment` applies: you can "freely" swap between `%v0` and `%u0`, gaining and losing assumptions as you go.

(Operationally, `gpu.subgroup_uniform` is expected to lower to material changes in register class on targets where that's a thing, but, semantically, that's not all that relevant.)
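As a hedged sketch of that optional assumption-maximizing rewrite applied to the uniform example (not something this PR implements):

```mlir
// Before: only %v3 benefits from the annotations.
%v2 = arith.addi %v0, %v1 : index
%v3 = arith.addi %u0, %u1 : index
```

could be rewritten to

```mlir
// After: both additions use the annotated values and become CSE-able.
%v2 = arith.addi %u0, %u1 : index
%v3 = arith.addi %u0, %u1 : index
```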

**@fabianmcg** (Contributor), Sep 12, 2025:

> Always the same thing: that breaks something fundamental about SSA. The same examples from the guideline apply: how is a transformation that seems harmless supposed to not break this verifier? Think of simple cases: the result of the assume is used in an `scf.for` loop with 2 iterations, I unroll the loop (the assume is outside of the loop), now loop-unrolling and boom...

Thank you for the explanation, that makes it clear, and I'm dropping the has-one-use proposal.

> I don't follow, can you expand? This looks to me exactly like how all the variants of "assume" in a dataflow context I've seen work out: you tag an SSA value and add information in the dataflow. How is it different from `memref.assume`, for example?

I argue that assume semantics imply things about the operands, not the results. This works well with dataflow, the semantics are clear, and it is the model of `llvm.assume`.

In the case of `memref.assume`, or `gpu.subgroup_uniform`, the implication is on the result of the op, not on the operands. I claim these are not really compiler assumptions, as each creates a new value with new information; you are not assuming anything about the previous value. Thus their role is an active one, and they have semantics closer to a no-op cast. (To be consistent, I'd argue `memref.assume_alignment` should probably be called `memref.has_alignment` or `memref.add_alignment`.) So it's not really tagging, but creating a new value with added semantics.

Let's take a `memref.assume_alignment` that returns results and one that doesn't:

1.
```mlir
%1 = memref.assume_alignment %0, 8 : memref<4x4xf16>
%2 = memref.assume_alignment %0, 32 : memref<4x4xf16>
```
2.
```mlir
memref.assume_alignment %0, 8 : memref<4x4xf16>
memref.assume_alignment %0, 32 : memref<4x4xf16>
```

In case 2 it's clear the assumption is on `%0`, so you could potentially combine those assumptions and say `%0` has a 32-byte alignment.

In case 1, on the other hand, I don't think one should combine them. They have different effects that should be scoped to the uses of each of the operations; thus they are more like value promotions.

Just to be clear, I'm not saying we can't have that; I'm saying that's not the description of the op.

However, since there appears to be consensus on accepting assumes in this form, I'm not blocking the PR on this issue.

**@joker-eph** (Collaborator), Sep 12, 2025:

> I argue that assume semantics imply things about the operands, not the results. This works well with dataflow, the semantics are clear

I don't see the argument here? You seem to be repeatedly making claims whose rationale I don't understand, and I don't see much explanation of the fundamentals behind them.

> and it is the model of llvm.assume.

Actually, no: `llvm.assume` is not dataflow-based; it is a path-sensitive representation. You want to add information about a condition at a specific point, which is at odds with the properties you'd want from a dataflow-based approach.

> I claim these are not really compiler assumptions

How so? We encode an assumption on a value; I don't quite see why this makes it "not a compiler assumption". You identify yourself later that the operation is a "nop": its only effect is on the compiler itself, as metadata inserted in the dataflow.

> On the other hand, on 1, I don't think one should combine them.

Right, one probably shouldn't: the assumption is dataflow-based instead of path-based. This is in line with the deferred UB provided by poison here. You could be smart and combine some of these if you reach a point where a value is used in a context where violating the assumption creates UB. Basically, from a

```mlir
%1 = memref.assume_alignment %0, 8 : memref<4x4xf16>
```

which isn't immediate UB when violated, you can later generate

```mlir
memref.assume_alignment %0, 32 : memref<4x4xf16>
```

at a point where `%1` is used in a manner that triggers UB.

This dataflow-vs-path-sensitivity distinction, together with deferred UB, is pretty key to understanding the design space here.

**Member:**

I think the property we want to capture is something like this: `assume_uniform` requires the operand to be uniform at its definition, and then both the use in `assume_uniform` and the result are also uniform at that program point. This avoids talking about active/inactive lanes.

If the operand is not uniform at its definition, you should probably use `subgroup_broadcast` instead.
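A hedged sketch of the two cases under that reading (names assumed; the `subgroup_broadcast` syntax follows the tests later in this PR):

```mlir
%c0 = arith.constant 0 : i32

// Case 1: %x is already uniform at its definition; the annotation merely
// records that fact so the backend may, e.g., use scalar registers.
%u = gpu.subgroup_uniform %x : f32

// Case 2: %y is divergent; materialize a uniform value explicitly instead.
%b = gpu.subgroup_broadcast %y, specific_lane %c0 : f32
```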

**Collaborator:**

> result are also uniform at that program point

... or poison. (Otherwise, agreed.)

**Contributor Author:**

After some internal discussion, even a uniform-at-definition requirement may not be enough. Consider the following example:

```mlir
%cond = cmp thread_id > 10   // pseudocode
%v = select %cond, %c10, %c20
// %v is not uniform
scf.if %cond {
  %v1 = arith.add %v, %c1
  // %v1 is uniform at its def currently, but won't be if `arith.add` is hoisted outside
  %u = assume_uniform %v1
}
```

We can, conservatively, require all lanes to be active at the definition in addition to the input being uniform at the definition (this even covers the original use case I intended this op for, as it requires all lanes to be active anyway for unrelated reasons), but I feel there should be a less restrictive definition that is still useful.


#endif // GPU_OPS
mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp (18 additions, 1 deletion)

@@ -201,6 +201,22 @@ struct GPUSubgroupBroadcastOpToROCDL
}
};

struct GPUSubgroupUniformOpToROCDL
: public ConvertOpToLLVMPattern<gpu::SubgroupUniformOp> {
using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;

LogicalResult
matchAndRewrite(gpu::SubgroupUniformOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
Value src = adaptor.getSrc();
if (!isSupportedReadLaneType(src.getType()))
return rewriter.notifyMatchFailure(op, "unsupported readlane type");

rewriter.replaceOpWithNewOp<ROCDL::ReadfirstlaneOp>(op, src.getType(), src);
return success();
}
};

struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
using ConvertOpToLLVMPattern<gpu::ShuffleOp>::ConvertOpToLLVMPattern;

@@ -494,7 +510,8 @@ void mlir::populateGpuToROCDLConversionPatterns(
patterns.add<GPUDynamicSharedMemoryOpLowering>(converter);

patterns.add<GPUShuffleOpLowering, GPULaneIdOpToROCDL,
GPUSubgroupBroadcastOpToROCDL>(converter);
GPUSubgroupBroadcastOpToROCDL, GPUSubgroupUniformOpToROCDL>(
converter);
patterns.add<GPUSubgroupSizeOpToROCDL>(converter, chipset);

populateMathToROCDLConversionPatterns(converter, patterns);
mlir/lib/Dialect/GPU/IR/GPUDialect.cpp (9 additions, 0 deletions)

@@ -2550,6 +2550,15 @@ LogicalResult gpu::SubgroupBroadcastOp::verify() {
}
}

//===----------------------------------------------------------------------===//
// GPU_SubgroupUniformOp
//===----------------------------------------------------------------------===//

void gpu::SubgroupUniformOp::inferResultRanges(
ArrayRef<ConstantIntRanges> argRanges, SetIntRangeFn setResultRange) {
setResultRange(getResult(), argRanges.front());
}

//===----------------------------------------------------------------------===//
// GPU KernelMetadataAttr
//===----------------------------------------------------------------------===//
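The `inferResultRanges` hook above simply forwards the operand's range to the result. As a hedged illustration of what that enables (the `test.with_bounds` op and attribute syntax are assumed from MLIR's integer-range tests, not part of this PR), range-based passes can then reason through the annotation:

```mlir
// Hypothetical: with bounds known for %x, a range analysis can conclude
// %u < 20 even though the value passes through gpu.subgroup_uniform.
%x = test.with_bounds { umin = 0 : index, umax = 10 : index,
                        smin = 0 : index, smax = 10 : index } : index
%u = gpu.subgroup_uniform %x : index
%c20 = arith.constant 20 : index
%cmp = arith.cmpi ult, %u, %c20 : index
```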
mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir (12 additions, 0 deletions)

@@ -816,3 +816,15 @@ func.func @broadcast(%arg0 : index, %arg1 : i32) -> (index, index) {
func.return %0, %1 : index, index
}
}

// -----

gpu.module @test_module {
// CHECK-LABEL: func @uniform
// CHECK-SAME: (%[[ARG:.*]]: i64)
func.func @uniform(%arg0 : index) -> index {
// CHECK: %{{.*}} = rocdl.readfirstlane %[[ARG]] : i64
%0 = gpu.subgroup_uniform %arg0 : index
func.return %0 : index
}
}
mlir/test/Dialect/GPU/ops.mlir (8 additions, 0 deletions)

@@ -552,3 +552,11 @@ func.func @subgroup_broadcast(%arg0 : f32, %arg1 : i32) -> (f32, f32) {
%1 = gpu.subgroup_broadcast %arg0, specific_lane %arg1 : f32
func.return %0, %1 : f32, f32
}

// CHECK-LABEL: func @subgroup_uniform
// CHECK-SAME: (%[[ARG:.*]]: f32)
func.func @subgroup_uniform(%arg0 : f32) -> f32 {
// CHECK: gpu.subgroup_uniform %[[ARG]] : f32
**Contributor:**

Could you please add the following before the operation, just to remind readers what a typical use case looks like:

```
%arg0 = [thread ID x] / [subgroup size]  // pseudocode: compute %arg0 instead of receiving it as an argument
```

%0 = gpu.subgroup_uniform %arg0 : f32
func.return %0 : f32
**Contributor:**

Also: could you add one more test showing the effects of speculation in the related pass test?

}
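As a rough idea of what such a speculation test could look like, a hypothetical sketch (the pass choice, `test.use` op, and CHECK lines are assumptions, not part of this PR; whether LICM actually hoists the op depends on how its speculatability is finally defined):

```mlir
// RUN: mlir-opt %s -loop-invariant-code-motion | FileCheck %s

// CHECK-LABEL: func @hoist_uniform
// CHECK:       gpu.subgroup_uniform
// CHECK-NEXT:  scf.for
func.func @hoist_uniform(%arg0 : f32, %lb : index, %ub : index, %step : index) {
  scf.for %i = %lb to %ub step %step {
    %u = gpu.subgroup_uniform %arg0 : f32
    "test.use"(%u) : (f32) -> ()
  }
  return
}
```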