diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index f576972183eca..9528fb2b446bc 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -292,6 +292,8 @@ def int_amdgcn_s_barrier_signal : ClangBuiltin<"__builtin_amdgcn_s_barrier_signa // void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) %barrier, i32 %memberCnt) // The %barrier and %memberCnt argument must be uniform, otherwise behavior is undefined. +// If %memberCnt is 0, the member count is retained from the previous +// s_barrier_init or s_barrier_signal operation. def int_amdgcn_s_barrier_signal_var : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal_var">, Intrinsic<[], [local_ptr_ty, llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index f5747488225c5..ffb2a12892ca2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -7145,6 +7145,24 @@ bool AMDGPUInstructionSelector::selectNamedBarrierInit( const MachineOperand &BarOp = I.getOperand(1); const MachineOperand &CntOp = I.getOperand(2); + // A member count of 0 means "keep existing member count". That plus a known + // constant value for the barrier ID lets us use the immarg form. 
+ if (IntrID == Intrinsic::amdgcn_s_barrier_signal_var) { + std::optional CntImm = + getIConstantVRegSExtVal(CntOp.getReg(), *MRI); + if (CntImm && *CntImm == 0) { + std::optional BarValImm = + getIConstantVRegSExtVal(BarOp.getReg(), *MRI); + if (BarValImm) { + auto BarID = ((*BarValImm) >> 4) & 0x3F; + BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM)) + .addImm(BarID); + I.eraseFromParent(); + return true; + } + } + } + // BarID = (BarOp >> 4) & 0x3F Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 2e631d2f4a55e..b1e3334b2e32b 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -12054,8 +12054,38 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other, Op->getOperand(2), Chain), 0); - case Intrinsic::amdgcn_s_barrier_init: case Intrinsic::amdgcn_s_barrier_signal_var: { + // Member count of 0 means to re-use a previous member count, + // which, if the named barrier is statically chosen, means we can use + // the immarg form. Otherwise, fall through to constructing M0 as for + // s_barrier_init. 
+ SDValue CntOp = Op->getOperand(3); + auto *CntC = dyn_cast(CntOp); + if (CntC && CntC->isZero()) { + SDValue Chain = Op->getOperand(0); + SDValue BarOp = Op->getOperand(2); + SmallVector Ops; + + std::optional BarVal; + if (auto *C = dyn_cast(BarOp)) + BarVal = C->getZExtValue(); + else if (auto *GA = dyn_cast(BarOp)) + if (auto Addr = AMDGPUMachineFunctionInfo::getLDSAbsoluteAddress( + *GA->getGlobal())) + BarVal = *Addr + GA->getOffset(); + + if (BarVal) { + unsigned BarID = (*BarVal >> 4) & 0x3F; + Ops.push_back(DAG.getTargetConstant(BarID, DL, MVT::i32)); + Ops.push_back(Chain); + auto *NewMI = DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL, + Op->getVTList(), Ops); + return SDValue(NewMI, 0); + } + } + [[fallthrough]]; + } + case Intrinsic::amdgcn_s_barrier_init: { // these two intrinsics have two operands: barrier pointer and member count SDValue Chain = Op->getOperand(0); SmallVector Ops; diff --git a/llvm/test/CodeGen/AMDGPU/s-barrier.ll b/llvm/test/CodeGen/AMDGPU/s-barrier.ll index 35b86998c9cac..6c438ed94c863 100644 --- a/llvm/test/CodeGen/AMDGPU/s-barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/s-barrier.ll @@ -258,6 +258,37 @@ define amdgpu_kernel void @kernel2(ptr addrspace(1) %out, ptr addrspace(3) %in) ret void } +define void @signal_var_cnt0_const_bar() { +; GFX12-LABEL: signal_var_cnt0_const_bar: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_barrier_signal 2 +; GFX12-NEXT: s_setpc_b64 s[30:31] + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 0) + ret void +} + +define void @signal_var_cnt0_dynamic_bar(ptr addrspace(3) inreg %bar) { +; GFX12-LABEL: signal_var_cnt0_dynamic_bar: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 
0x0 +; GFX12-NEXT: s_lshr_b32 s0, s0, 4 +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-NEXT: s_and_b32 m0, s0, 63 +; GFX12-NEXT: s_barrier_signal m0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) %bar, i32 0) + ret void +} + define amdgpu_ps void @test_barrier_leave_write_to_scc(i32 inreg %val, ptr addrspace(1) %out) { ; GFX12-LABEL: test_barrier_leave_write_to_scc: ; GFX12: ; %bb.0: diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td index 7305de2049ee2..b13206ce5c342 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td @@ -346,7 +346,7 @@ def ROCDL_DsBpermuteOp : ROCDL_ConcreteNonMemIntrOp<"ds_bpermute", [], 1>, Each lane reads the value of `src` from the lane whose byte address is given by `index` (i.e. lane id = `index / 4`). - + This is “backward” (pull) in contrast to `ds_permute_b32`, which is “forward” (push/scatter). @@ -574,6 +574,8 @@ def ROCDL_BarrierSignalVarOp : ROCDL_IntrOp<"s.barrier.signal.var", [], [], [], let description = [{ Available on gfx1250+. + If `memberCnt` is 0, the member count is retained from a previous initialization. + Example: ```mlir // Signal a named barrier with variable ID.