diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 8d2bcfeca2a41..d1b3dc1b7c61b 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -7160,6 +7160,18 @@ treated as non-atomic. A memory synchronization scope wider than work-group is not meaningful for the group (LDS) address space and is treated as work-group. +When a work-group's maximum flat work-group size does not exceed the wavefront +size, the work-group fits within a single wavefront. In this case, LLVM +``workgroup`` synchronization scope is equivalent to ``wavefront`` scope. + +If the compiler can determine this bound (e.g., via ``amdgpu-flat-work-group-size``), +the AMDGPU backend optimizes ``workgroup`` scope operations by lowering them to +``wavefront``-scoped machine instructions. + +This optimization applies to atomic ``load``, ``store``, ``atomicrmw``, and ``cmpxchg`` +instructions, and to ``fence`` instructions, when they use synchronizing memory +orderings (``acquire``, ``release``, ``acq_rel``, or ``seq_cst``). + +The memory model does not support the region address space which is treated as non-atomic. 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp index d490788a97685..e089498693b2f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp @@ -94,10 +94,8 @@ bool AMDGPULowerIntrinsicsImpl::visitBarrier(IntrinsicInst &I) { const GCNSubtarget &ST = TM.getSubtarget(*I.getFunction()); bool IsSingleWaveWG = false; - if (TM.getOptLevel() > CodeGenOptLevel::None) { - unsigned WGMaxSize = ST.getFlatWorkGroupSizes(*I.getFunction()).second; - IsSingleWaveWG = WGMaxSize <= ST.getWavefrontSize(); - } + if (TM.getOptLevel() > CodeGenOptLevel::None) + IsSingleWaveWG = ST.isSingleWavefrontWorkgroup(*I.getFunction()); IRBuilder<> B(&I); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index fdb6b2eb2caf5..fb4728609c877 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -173,6 +173,10 @@ std::pair AMDGPUSubtarget::getFlatWorkGroupSizes( return Requested; } +bool AMDGPUSubtarget::isSingleWavefrontWorkgroup(const Function &F) const { + return getFlatWorkGroupSizes(F).second <= getWavefrontSize(); +} + std::pair AMDGPUSubtarget::getEffectiveWavesPerEU( std::pair RequestedWavesPerEU, std::pair FlatWorkGroupSizes, unsigned LDSBytes) const { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index d23f94243a459..07746c087904d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -80,6 +80,10 @@ class AMDGPUSubtarget { /// be converted to integer, or violate subtarget's specifications. std::pair getFlatWorkGroupSizes(const Function &F) const; + /// \returns true if the maximum flat work-group size for \p F is at most the + /// wavefront size, so a work-group may fit in a single wavefront. 
+ bool isSingleWavefrontWorkgroup(const Function &F) const; + /// \returns The required size of workgroups that will be used to execute \p F /// in the \p Dim dimension, if it is known (from `!reqd_work_group_size` /// metadata. Otherwise, returns std::nullopt. diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 2a2862ed24da2..fe87c5afda1b8 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -159,7 +159,8 @@ class SIMemOpInfo final { bool IsCrossAddressSpaceOrdering = true, AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent, bool IsVolatile = false, bool IsNonTemporal = false, - bool IsLastUse = false, bool IsCooperative = false) + bool IsLastUse = false, bool IsCooperative = false, + bool CanDemoteWorkgroupToWavefront = false) : Ordering(Ordering), FailureOrdering(FailureOrdering), Scope(Scope), OrderingAddrSpace(OrderingAddrSpace), InstrAddrSpace(InstrAddrSpace), IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering), @@ -207,6 +208,17 @@ class SIMemOpInfo final { // AGENT scope as a conservatively correct alternative. if (this->Scope == SIAtomicScope::CLUSTER && !ST.hasClusters()) this->Scope = SIAtomicScope::AGENT; + + // When max flat work-group size is at most the wavefront size, the + // work-group fits in a single wave, so LLVM workgroup scope matches + // wavefront scope. Demote workgroup → wavefront here for fences and for + // atomics with ordering stronger than monotonic. 
+ if (CanDemoteWorkgroupToWavefront && + this->Scope == SIAtomicScope::WORKGROUP && + (llvm::isStrongerThan(this->Ordering, AtomicOrdering::Monotonic) || + llvm::isStrongerThan(this->FailureOrdering, + AtomicOrdering::Monotonic))) + this->Scope = SIAtomicScope::WAVEFRONT; } public: @@ -277,6 +289,7 @@ class SIMemOpAccess final { private: const AMDGPUMachineModuleInfo *MMI = nullptr; const GCNSubtarget &ST; + const bool CanDemoteWorkgroupToWavefront; /// Reports unsupported message \p Msg for \p MI to LLVM context. void reportUnsupported(const MachineBasicBlock::iterator &MI, @@ -300,7 +313,8 @@ class SIMemOpAccess final { public: /// Construct class to support accessing the machine memory operands /// of instructions in the machine function \p MF. - SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI, const GCNSubtarget &ST); + SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI, const GCNSubtarget &ST, + const Function &F); /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise. std::optional @@ -779,9 +793,13 @@ SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const { return SIAtomicAddrSpace::OTHER; } +// TODO: Consider moving single-wave workgroup->wavefront scope relaxation to an +// IR pass (and extending it to other scoped operations), so middle-end +// optimizations see wavefront scope earlier. 
SIMemOpAccess::SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI_, - const GCNSubtarget &ST) - : MMI(&MMI_), ST(ST) {} + const GCNSubtarget &ST, const Function &F) + : MMI(&MMI_), ST(ST), + CanDemoteWorkgroupToWavefront(ST.isSingleWavefrontWorkgroup(F)) {} std::optional SIMemOpAccess::constructFromMIWithMMO( const MachineBasicBlock::iterator &MI) const { @@ -851,7 +869,8 @@ std::optional SIMemOpAccess::constructFromMIWithMMO( } return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace, InstrAddrSpace, IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile, - IsNonTemporal, IsLastUse, IsCooperative); + IsNonTemporal, IsLastUse, IsCooperative, + CanDemoteWorkgroupToWavefront); } std::optional @@ -920,7 +939,8 @@ SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const { return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering, - AtomicOrdering::NotAtomic); + AtomicOrdering::NotAtomic, false, false, false, false, + CanDemoteWorkgroupToWavefront); } std::optional SIMemOpAccess::getAtomicCmpxchgOrRmwInfo( @@ -2533,7 +2553,8 @@ bool SIMemoryLegalizer::run(MachineFunction &MF) { bool Changed = false; const GCNSubtarget &ST = MF.getSubtarget(); - SIMemOpAccess MOA(MMI.getObjFileInfo(), ST); + SIMemOpAccess MOA(MMI.getObjFileInfo(), ST, + MF.getFunction()); CC = SICacheControl::create(ST); for (auto &MBB : MF) { diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll index 275825a973415..9d0d43d900026 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll @@ -5916,7 +5916,6 @@ define amdgpu_ps void @flat_max_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: flat_atomic_max_i32 v0, v1, s[2:3] -; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: 
s_endpgm ; ; GFX950-SDAG-LABEL: flat_max_saddr_i32_nortn: @@ -5925,7 +5924,6 @@ define amdgpu_ps void @flat_max_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: flat_atomic_smax v[0:1], v2 -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_max_saddr_i32_nortn: @@ -5935,7 +5933,6 @@ define amdgpu_ps void @flat_max_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: flat_atomic_smax v[2:3], v1 -; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset @@ -5948,7 +5945,6 @@ define amdgpu_ps void @flat_max_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: flat_atomic_max_i32 v0, v1, s[2:3] offset:-128 -; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_max_saddr_i32_nortn_neg128: @@ -5960,7 +5956,6 @@ define amdgpu_ps void @flat_max_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-SDAG-NEXT: flat_atomic_smax v[0:1], v2 -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_max_saddr_i32_nortn_neg128: @@ -5973,7 +5968,6 @@ define amdgpu_ps void @flat_max_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc ; GFX950-GISEL-NEXT: flat_atomic_smax v[2:3], v1 -; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = 
getelementptr inbounds i8, ptr %sbase, i64 %zext.offset @@ -6002,16 +5996,18 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB58_4 ; GFX1250-SDAG-NEXT: .LBB58_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB58_5 ; GFX1250-SDAG-NEXT: .LBB58_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_max_i64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB58_2 ; GFX1250-SDAG-NEXT: .LBB58_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo @@ -6045,16 +6041,18 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB58_4 ; GFX1250-GISEL-NEXT: .LBB58_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB58_5 ; GFX1250-GISEL-NEXT: .LBB58_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_max_i64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB58_2 ; GFX1250-GISEL-NEXT: .LBB58_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: 
v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo @@ -6084,11 +6082,10 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB58_4 ; GFX950-SDAG-NEXT: .LBB58_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: s_branch .LBB58_5 ; GFX950-SDAG-NEXT: .LBB58_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: flat_atomic_smax_x2 v[0:1], v[4:5], v[2:3] sc0 -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] @@ -6097,6 +6094,7 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3] @@ -6128,11 +6126,10 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB58_4 ; GFX950-GISEL-NEXT: .LBB58_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: s_branch .LBB58_5 ; GFX950-GISEL-NEXT: .LBB58_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[4:5] sc0 -; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 ; 
GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] @@ -6141,6 +6138,7 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[4:5] @@ -6182,16 +6180,18 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB59_4 ; GFX1250-SDAG-NEXT: .LBB59_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB59_5 ; GFX1250-SDAG-NEXT: .LBB59_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_max_i64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB59_2 ; GFX1250-SDAG-NEXT: .LBB59_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo @@ -6228,16 +6228,18 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB59_4 ; GFX1250-GISEL-NEXT: .LBB59_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB59_5 ; 
GFX1250-GISEL-NEXT: .LBB59_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_max_i64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB59_2 ; GFX1250-GISEL-NEXT: .LBB59_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo @@ -6270,11 +6272,10 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB59_4 ; GFX950-SDAG-NEXT: .LBB59_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: s_branch .LBB59_5 ; GFX950-SDAG-NEXT: .LBB59_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: flat_atomic_smax_x2 v[0:1], v[4:5], v[2:3] sc0 -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] @@ -6283,6 +6284,7 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3] @@ -6317,11 +6319,10 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg 
%sbase, i3 ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB59_4 ; GFX950-GISEL-NEXT: .LBB59_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: s_branch .LBB59_5 ; GFX950-GISEL-NEXT: .LBB59_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[4:5] sc0 -; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] @@ -6330,6 +6331,7 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[4:5] @@ -6372,7 +6374,6 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB60_2 @@ -6410,7 +6411,6 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: flat_atomic_max_i64 v0, v[4:5], s[2:3] ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB60_2 @@ -6443,7 +6443,6 @@ define amdgpu_ps void 
@flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-SDAG-NEXT: s_endpgm ; GFX950-SDAG-NEXT: .LBB60_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] @@ -6480,7 +6479,6 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-GISEL-NEXT: s_endpgm ; GFX950-GISEL-NEXT: .LBB60_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: flat_atomic_smax_x2 v[0:1], v[4:5] -; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] @@ -6528,7 +6526,6 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB61_2 @@ -6569,7 +6566,6 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: flat_atomic_max_i64 v0, v[4:5], s[2:3] offset:-128 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB61_2 @@ -6605,7 +6601,6 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX950-SDAG-NEXT: s_endpgm ; GFX950-SDAG-NEXT: .LBB61_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] -; GFX950-SDAG-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] @@ -6646,7 +6641,6 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX950-GISEL-NEXT: s_endpgm ; GFX950-GISEL-NEXT: .LBB61_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: flat_atomic_smax_x2 v[0:1], v[4:5] -; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] @@ -6752,7 +6746,6 @@ define amdgpu_ps void @flat_min_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: flat_atomic_min_i32 v0, v1, s[2:3] -; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_min_saddr_i32_nortn: @@ -6761,7 +6754,6 @@ define amdgpu_ps void @flat_min_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: flat_atomic_smin v[0:1], v2 -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_min_saddr_i32_nortn: @@ -6771,7 +6763,6 @@ define amdgpu_ps void @flat_min_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: flat_atomic_smin v[2:3], v1 -; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset @@ -6784,7 +6775,6 @@ define amdgpu_ps void @flat_min_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 
src1=0 src2=0 ; GFX1250-NEXT: flat_atomic_min_i32 v0, v1, s[2:3] offset:-128 -; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_min_saddr_i32_nortn_neg128: @@ -6796,7 +6786,6 @@ define amdgpu_ps void @flat_min_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-SDAG-NEXT: flat_atomic_smin v[0:1], v2 -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_min_saddr_i32_nortn_neg128: @@ -6809,7 +6798,6 @@ define amdgpu_ps void @flat_min_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc ; GFX950-GISEL-NEXT: flat_atomic_smin v[2:3], v1 -; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset @@ -6838,16 +6826,18 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB66_4 ; GFX1250-SDAG-NEXT: .LBB66_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB66_5 ; GFX1250-SDAG-NEXT: .LBB66_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_min_i64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB66_2 ; GFX1250-SDAG-NEXT: .LBB66_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: 
s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo @@ -6881,16 +6871,18 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB66_4 ; GFX1250-GISEL-NEXT: .LBB66_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB66_5 ; GFX1250-GISEL-NEXT: .LBB66_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_min_i64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB66_2 ; GFX1250-GISEL-NEXT: .LBB66_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo @@ -6920,11 +6912,10 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB66_4 ; GFX950-SDAG-NEXT: .LBB66_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: s_branch .LBB66_5 ; GFX950-SDAG-NEXT: .LBB66_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: flat_atomic_smin_x2 v[0:1], v[4:5], v[2:3] sc0 -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] @@ -6933,6 +6924,7 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; 
GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3] @@ -6964,11 +6956,10 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB66_4 ; GFX950-GISEL-NEXT: .LBB66_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: s_branch .LBB66_5 ; GFX950-GISEL-NEXT: .LBB66_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[4:5] sc0 -; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] @@ -6977,6 +6968,7 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[4:5] @@ -7018,16 +7010,18 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB67_4 ; GFX1250-SDAG-NEXT: .LBB67_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB67_5 ; GFX1250-SDAG-NEXT: .LBB67_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_min_i64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; 
GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB67_2 ; GFX1250-SDAG-NEXT: .LBB67_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo @@ -7064,16 +7058,18 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB67_4 ; GFX1250-GISEL-NEXT: .LBB67_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB67_5 ; GFX1250-GISEL-NEXT: .LBB67_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_min_i64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB67_2 ; GFX1250-GISEL-NEXT: .LBB67_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo @@ -7106,11 +7102,10 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB67_4 ; GFX950-SDAG-NEXT: .LBB67_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; 
GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: s_branch .LBB67_5 ; GFX950-SDAG-NEXT: .LBB67_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: flat_atomic_smin_x2 v[0:1], v[4:5], v[2:3] sc0 -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] @@ -7119,6 +7114,7 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3] @@ -7153,11 +7149,10 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB67_4 ; GFX950-GISEL-NEXT: .LBB67_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: s_branch .LBB67_5 ; GFX950-GISEL-NEXT: .LBB67_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[4:5] sc0 -; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] @@ -7166,6 +7161,7 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[4:5] @@ -7208,7 +7204,6 @@ define 
amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB68_2 @@ -7246,7 +7241,6 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: flat_atomic_min_i64 v0, v[4:5], s[2:3] ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB68_2 @@ -7279,7 +7273,6 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-SDAG-NEXT: s_endpgm ; GFX950-SDAG-NEXT: .LBB68_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] @@ -7316,7 +7309,6 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-GISEL-NEXT: s_endpgm ; GFX950-GISEL-NEXT: .LBB68_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: flat_atomic_smin_x2 v[0:1], v[4:5] -; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] @@ -7364,7 +7356,6 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; 
GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB69_2 @@ -7405,7 +7396,6 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: flat_atomic_min_i64 v0, v[4:5], s[2:3] offset:-128 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB69_2 @@ -7441,7 +7431,6 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX950-SDAG-NEXT: s_endpgm ; GFX950-SDAG-NEXT: .LBB69_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] @@ -7482,7 +7471,6 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX950-GISEL-NEXT: s_endpgm ; GFX950-GISEL-NEXT: .LBB69_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: flat_atomic_smin_x2 v[0:1], v[4:5] -; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] @@ -7588,7 +7576,6 @@ define amdgpu_ps void @flat_umax_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: flat_atomic_max_u32 v0, v1, s[2:3] -; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_umax_saddr_i32_nortn: @@ -7597,7 +7584,6 @@ define amdgpu_ps void @flat_umax_saddr_i32_nortn(ptr inreg %sbase, 
i32 %voffset, ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: flat_atomic_umax v[0:1], v2 -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_umax_saddr_i32_nortn: @@ -7607,7 +7593,6 @@ define amdgpu_ps void @flat_umax_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: flat_atomic_umax v[2:3], v1 -; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset @@ -7620,7 +7605,6 @@ define amdgpu_ps void @flat_umax_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: flat_atomic_max_u32 v0, v1, s[2:3] offset:-128 -; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_umax_saddr_i32_nortn_neg128: @@ -7632,7 +7616,6 @@ define amdgpu_ps void @flat_umax_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-SDAG-NEXT: flat_atomic_umax v[0:1], v2 -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_umax_saddr_i32_nortn_neg128: @@ -7645,7 +7628,6 @@ define amdgpu_ps void @flat_umax_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc ; GFX950-GISEL-NEXT: flat_atomic_umax v[2:3], v1 -; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset @@ -7674,16 +7656,18 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, 
i32 %vof ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB74_4 ; GFX1250-SDAG-NEXT: .LBB74_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB74_5 ; GFX1250-SDAG-NEXT: .LBB74_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_max_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB74_2 ; GFX1250-SDAG-NEXT: .LBB74_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo @@ -7717,16 +7701,18 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB74_4 ; GFX1250-GISEL-NEXT: .LBB74_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB74_5 ; GFX1250-GISEL-NEXT: .LBB74_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_max_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB74_2 ; GFX1250-GISEL-NEXT: .LBB74_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 ; 
GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo @@ -7756,11 +7742,10 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB74_4 ; GFX950-SDAG-NEXT: .LBB74_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: s_branch .LBB74_5 ; GFX950-SDAG-NEXT: .LBB74_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: flat_atomic_umax_x2 v[0:1], v[4:5], v[2:3] sc0 -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] @@ -7769,6 +7754,7 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] @@ -7800,11 +7786,10 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB74_4 ; GFX950-GISEL-NEXT: .LBB74_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: s_branch .LBB74_5 ; GFX950-GISEL-NEXT: .LBB74_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[4:5] sc0 -; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] @@ -7813,6 +7798,7 @@ define amdgpu_ps <2 x float> 
@flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5] @@ -7854,16 +7840,18 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB75_4 ; GFX1250-SDAG-NEXT: .LBB75_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB75_5 ; GFX1250-SDAG-NEXT: .LBB75_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_max_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB75_2 ; GFX1250-SDAG-NEXT: .LBB75_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo @@ -7900,16 +7888,18 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB75_4 ; GFX1250-GISEL-NEXT: .LBB75_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB75_5 ; GFX1250-GISEL-NEXT: .LBB75_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_max_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN ; GFX1250-GISEL-NEXT: 
; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB75_2 ; GFX1250-GISEL-NEXT: .LBB75_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo @@ -7942,11 +7932,10 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB75_4 ; GFX950-SDAG-NEXT: .LBB75_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: s_branch .LBB75_5 ; GFX950-SDAG-NEXT: .LBB75_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: flat_atomic_umax_x2 v[0:1], v[4:5], v[2:3] sc0 -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] @@ -7955,6 +7944,7 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] @@ -7989,11 +7979,10 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB75_4 ; GFX950-GISEL-NEXT: .LBB75_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-GISEL-NEXT: 
s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: s_branch .LBB75_5 ; GFX950-GISEL-NEXT: .LBB75_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[4:5] sc0 -; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] @@ -8002,6 +7991,7 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5] @@ -8044,7 +8034,6 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB76_2 @@ -8082,7 +8071,6 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: flat_atomic_max_u64 v0, v[4:5], s[2:3] ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB76_2 @@ -8115,7 +8103,6 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-SDAG-NEXT: s_endpgm ; GFX950-SDAG-NEXT: .LBB76_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] -; 
GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] @@ -8152,7 +8139,6 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-GISEL-NEXT: s_endpgm ; GFX950-GISEL-NEXT: .LBB76_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: flat_atomic_umax_x2 v[0:1], v[4:5] -; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] @@ -8200,7 +8186,6 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-SDAG-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB77_2 @@ -8241,7 +8226,6 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL-NEXT: flat_atomic_max_u64 v0, v[4:5], s[2:3] offset:-128 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB77_2 @@ -8277,7 +8261,6 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX950-SDAG-NEXT: s_endpgm ; GFX950-SDAG-NEXT: .LBB77_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] @@ -8318,7 +8301,6 @@ define 
amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX950-GISEL-NEXT: s_endpgm ; GFX950-GISEL-NEXT: .LBB77_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: flat_atomic_umax_x2 v[0:1], v[4:5] -; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] @@ -8424,7 +8406,6 @@ define amdgpu_ps void @flat_umin_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: flat_atomic_min_u32 v0, v1, s[2:3] -; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_umin_saddr_i32_nortn: @@ -8433,7 +8414,6 @@ define amdgpu_ps void @flat_umin_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: flat_atomic_umin v[0:1], v2 -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_umin_saddr_i32_nortn: @@ -8443,7 +8423,6 @@ define amdgpu_ps void @flat_umin_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: flat_atomic_umin v[2:3], v1 -; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset @@ -8456,7 +8435,6 @@ define amdgpu_ps void @flat_umin_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: flat_atomic_min_u32 v0, v1, s[2:3] offset:-128 -; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: 
flat_umin_saddr_i32_nortn_neg128: @@ -8468,7 +8446,6 @@ define amdgpu_ps void @flat_umin_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-SDAG-NEXT: flat_atomic_umin v[0:1], v2 -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_umin_saddr_i32_nortn_neg128: @@ -8481,7 +8458,6 @@ define amdgpu_ps void @flat_umin_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc ; GFX950-GISEL-NEXT: flat_atomic_umin v[2:3], v1 -; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset @@ -8510,16 +8486,18 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB82_4 ; GFX1250-SDAG-NEXT: .LBB82_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB82_5 ; GFX1250-SDAG-NEXT: .LBB82_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_min_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB82_2 ; GFX1250-SDAG-NEXT: .LBB82_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo @@ -8553,16 +8531,18 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr 
inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB82_4 ; GFX1250-GISEL-NEXT: .LBB82_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB82_5 ; GFX1250-GISEL-NEXT: .LBB82_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_min_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB82_2 ; GFX1250-GISEL-NEXT: .LBB82_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo @@ -8592,11 +8572,10 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB82_4 ; GFX950-SDAG-NEXT: .LBB82_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: s_branch .LBB82_5 ; GFX950-SDAG-NEXT: .LBB82_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: flat_atomic_umin_x2 v[0:1], v[4:5], v[2:3] sc0 -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] @@ -8605,6 +8584,7 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3] @@ -8636,11 +8616,10 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB82_4 ; GFX950-GISEL-NEXT: .LBB82_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: s_branch .LBB82_5 ; GFX950-GISEL-NEXT: .LBB82_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[4:5] sc0 -; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] @@ -8649,6 +8628,7 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5] @@ -8690,16 +8670,18 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB83_4 ; GFX1250-SDAG-NEXT: .LBB83_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB83_5 ; GFX1250-SDAG-NEXT: .LBB83_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_min_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB83_2 ; GFX1250-SDAG-NEXT: .LBB83_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo @@ -8736,16 +8718,18 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB83_4 ; GFX1250-GISEL-NEXT: .LBB83_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB83_5 ; GFX1250-GISEL-NEXT: .LBB83_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_min_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB83_2 ; GFX1250-GISEL-NEXT: .LBB83_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo @@ -8778,11 +8762,10 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB83_4 ; GFX950-SDAG-NEXT: .LBB83_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: s_branch .LBB83_5 ; GFX950-SDAG-NEXT: .LBB83_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: flat_atomic_umin_x2 v[0:1], v[4:5], v[2:3] 
sc0 -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] @@ -8791,6 +8774,7 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3] @@ -8825,11 +8809,10 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB83_4 ; GFX950-GISEL-NEXT: .LBB83_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: s_branch .LBB83_5 ; GFX950-GISEL-NEXT: .LBB83_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[4:5] sc0 -; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] @@ -8838,6 +8821,7 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5] @@ -8880,7 +8864,6 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: flat_atomic_min_u64 v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; 
GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB84_2 @@ -8918,7 +8901,6 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: flat_atomic_min_u64 v0, v[4:5], s[2:3] ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB84_2 @@ -8951,7 +8933,6 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-SDAG-NEXT: s_endpgm ; GFX950-SDAG-NEXT: .LBB84_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] @@ -8988,7 +8969,6 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX950-GISEL-NEXT: s_endpgm ; GFX950-GISEL-NEXT: .LBB84_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: flat_atomic_umin_x2 v[0:1], v[4:5] -; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] @@ -9036,7 +9016,6 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-SDAG-NEXT: flat_atomic_min_u64 v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB85_2 @@ 
-9077,7 +9056,6 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL-NEXT: flat_atomic_min_u64 v0, v[4:5], s[2:3] offset:-128 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB85_2 @@ -9113,7 +9091,6 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX950-SDAG-NEXT: s_endpgm ; GFX950-SDAG-NEXT: .LBB85_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] @@ -9154,7 +9131,6 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX950-GISEL-NEXT: s_endpgm ; GFX950-GISEL-NEXT: .LBB85_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: flat_atomic_umin_x2 v[0:1], v[4:5] -; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll index 58f7c4340276d..466d7152a0d84 100644 --- a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll @@ -2142,31 +2142,22 @@ define amdgpu_ps void @global_xor_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; -------------------------------------------------------------------------------- define amdgpu_ps float @global_max_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) { -; GFX9-LABEL: global_max_saddr_i32_rtn: -; GFX9: ; %bb.0: -; GFX9-NEXT: global_atomic_smax v0, v0, v1, s[2:3] 
glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_max_saddr_i32_rtn: -; GFX10: ; %bb.0: -; GFX10-NEXT: global_atomic_smax v0, v0, v1, s[2:3] glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_max_saddr_i32_rtn: +; GCN: ; %bb.0: +; GCN-NEXT: global_atomic_smax v0, v0, v1, s[2:3] glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: global_max_saddr_i32_rtn: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_max_i32 v0, v0, v1, s[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog ; ; GFX12-LABEL: global_max_saddr_i32_rtn: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_atomic_max_i32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: global_atomic_max_i32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2176,31 +2167,22 @@ define amdgpu_ps float @global_max_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, } define amdgpu_ps float @global_max_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) { -; GFX9-LABEL: global_max_saddr_i32_rtn_neg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: global_atomic_smax v0, v0, v1, s[2:3] offset:-128 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_max_saddr_i32_rtn_neg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: global_atomic_smax v0, v0, v1, s[2:3] offset:-128 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_max_saddr_i32_rtn_neg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_atomic_smax v0, v0, v1, s[2:3] 
offset:-128 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: global_max_saddr_i32_rtn_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_max_i32 v0, v0, v1, s[2:3] offset:-128 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog ; ; GFX12-LABEL: global_max_saddr_i32_rtn_neg128: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_atomic_max_i32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: global_atomic_max_i32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2211,30 +2193,19 @@ define amdgpu_ps float @global_max_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % } define amdgpu_ps void @global_max_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) { -; GFX9-LABEL: global_max_saddr_i32_nortn: -; GFX9: ; %bb.0: -; GFX9-NEXT: global_atomic_smax v0, v1, s[2:3] -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_max_saddr_i32_nortn: -; GFX10: ; %bb.0: -; GFX10-NEXT: global_atomic_smax v0, v1, s[2:3] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_max_saddr_i32_nortn: +; GCN: ; %bb.0: +; GCN-NEXT: global_atomic_smax v0, v1, s[2:3] +; GCN-NEXT: s_endpgm ; ; GFX11-LABEL: global_max_saddr_i32_nortn: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_max_i32 v0, v1, s[2:3] -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_max_saddr_i32_nortn: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_atomic_max_i32 v0, v1, s[2:3] scope:SCOPE_SE -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: global_atomic_max_i32 v0, v1, s[2:3] ; GFX12-NEXT: 
s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2243,30 +2214,19 @@ define amdgpu_ps void @global_max_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, } define amdgpu_ps void @global_max_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) { -; GFX9-LABEL: global_max_saddr_i32_nortn_neg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: global_atomic_smax v0, v1, s[2:3] offset:-128 -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_max_saddr_i32_nortn_neg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: global_atomic_smax v0, v1, s[2:3] offset:-128 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_max_saddr_i32_nortn_neg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_atomic_smax v0, v1, s[2:3] offset:-128 +; GCN-NEXT: s_endpgm ; ; GFX11-LABEL: global_max_saddr_i32_nortn_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_max_i32 v0, v1, s[2:3] offset:-128 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_max_saddr_i32_nortn_neg128: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_atomic_max_i32 v0, v1, s[2:3] offset:-128 scope:SCOPE_SE -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: global_atomic_max_i32 v0, v1, s[2:3] offset:-128 ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2276,31 +2236,22 @@ define amdgpu_ps void @global_max_saddr_i32_nortn_neg128(ptr addrspace(1) inreg } define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) { -; GFX9-LABEL: global_max_saddr_i64_rtn: -; GFX9: ; %bb.0: -; GFX9-NEXT: global_atomic_smax_x2 v[0:1], v0, v[1:2], s[2:3] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_max_saddr_i64_rtn: 
-; GFX10: ; %bb.0: -; GFX10-NEXT: global_atomic_smax_x2 v[0:1], v0, v[1:2], s[2:3] glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_max_saddr_i64_rtn: +; GCN: ; %bb.0: +; GCN-NEXT: global_atomic_smax_x2 v[0:1], v0, v[1:2], s[2:3] glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: global_max_saddr_i64_rtn: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_max_i64 v[0:1], v0, v[1:2], s[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog ; ; GFX12-LABEL: global_max_saddr_i64_rtn: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_atomic_max_i64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: global_atomic_max_i64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2310,31 +2261,22 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn(ptr addrspace(1) inreg %s } define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) { -; GFX9-LABEL: global_max_saddr_i64_rtn_neg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: global_atomic_smax_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_max_saddr_i64_rtn_neg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: global_atomic_smax_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_max_saddr_i64_rtn_neg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_atomic_smax_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; 
GCN-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: global_max_saddr_i64_rtn_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_max_i64 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog ; ; GFX12-LABEL: global_max_saddr_i64_rtn_neg128: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_atomic_max_i64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: global_atomic_max_i64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2345,30 +2287,19 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn_neg128(ptr addrspace(1) i } define amdgpu_ps void @global_max_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) { -; GFX9-LABEL: global_max_saddr_i64_nortn: -; GFX9: ; %bb.0: -; GFX9-NEXT: global_atomic_smax_x2 v0, v[1:2], s[2:3] -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_max_saddr_i64_nortn: -; GFX10: ; %bb.0: -; GFX10-NEXT: global_atomic_smax_x2 v0, v[1:2], s[2:3] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_max_saddr_i64_nortn: +; GCN: ; %bb.0: +; GCN-NEXT: global_atomic_smax_x2 v0, v[1:2], s[2:3] +; GCN-NEXT: s_endpgm ; ; GFX11-LABEL: global_max_saddr_i64_nortn: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_max_i64 v0, v[1:2], s[2:3] -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_max_saddr_i64_nortn: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_atomic_max_i64 v0, v[1:2], s[2:3] scope:SCOPE_SE -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: global_atomic_max_i64 v0, v[1:2], s[2:3] ; 
GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2377,30 +2308,19 @@ define amdgpu_ps void @global_max_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, } define amdgpu_ps void @global_max_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) { -; GFX9-LABEL: global_max_saddr_i64_nortn_neg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: global_atomic_smax_x2 v0, v[1:2], s[2:3] offset:-128 -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_max_saddr_i64_nortn_neg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: global_atomic_smax_x2 v0, v[1:2], s[2:3] offset:-128 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_max_saddr_i64_nortn_neg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_atomic_smax_x2 v0, v[1:2], s[2:3] offset:-128 +; GCN-NEXT: s_endpgm ; ; GFX11-LABEL: global_max_saddr_i64_nortn_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_max_i64 v0, v[1:2], s[2:3] offset:-128 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_max_saddr_i64_nortn_neg128: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_atomic_max_i64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_SE -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: global_atomic_max_i64 v0, v[1:2], s[2:3] offset:-128 ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2414,31 +2334,22 @@ define amdgpu_ps void @global_max_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; -------------------------------------------------------------------------------- define amdgpu_ps float @global_min_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) { -; GFX9-LABEL: global_min_saddr_i32_rtn: -; GFX9: ; %bb.0: -; GFX9-NEXT: global_atomic_smin v0, v0, v1, s[2:3] glc -; GFX9-NEXT: 
s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_min_saddr_i32_rtn: -; GFX10: ; %bb.0: -; GFX10-NEXT: global_atomic_smin v0, v0, v1, s[2:3] glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_min_saddr_i32_rtn: +; GCN: ; %bb.0: +; GCN-NEXT: global_atomic_smin v0, v0, v1, s[2:3] glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: global_min_saddr_i32_rtn: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_min_i32 v0, v0, v1, s[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog ; ; GFX12-LABEL: global_min_saddr_i32_rtn: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_atomic_min_i32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: global_atomic_min_i32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2448,31 +2359,22 @@ define amdgpu_ps float @global_min_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, } define amdgpu_ps float @global_min_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) { -; GFX9-LABEL: global_min_saddr_i32_rtn_neg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: global_atomic_smin v0, v0, v1, s[2:3] offset:-128 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_min_saddr_i32_rtn_neg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: global_atomic_smin v0, v0, v1, s[2:3] offset:-128 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_min_saddr_i32_rtn_neg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_atomic_smin v0, v0, v1, s[2:3] offset:-128 glc +; 
GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: global_min_saddr_i32_rtn_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_min_i32 v0, v0, v1, s[2:3] offset:-128 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog ; ; GFX12-LABEL: global_min_saddr_i32_rtn_neg128: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_atomic_min_i32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: global_atomic_min_i32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2483,30 +2385,19 @@ define amdgpu_ps float @global_min_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % } define amdgpu_ps void @global_min_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) { -; GFX9-LABEL: global_min_saddr_i32_nortn: -; GFX9: ; %bb.0: -; GFX9-NEXT: global_atomic_smin v0, v1, s[2:3] -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_min_saddr_i32_nortn: -; GFX10: ; %bb.0: -; GFX10-NEXT: global_atomic_smin v0, v1, s[2:3] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_min_saddr_i32_nortn: +; GCN: ; %bb.0: +; GCN-NEXT: global_atomic_smin v0, v1, s[2:3] +; GCN-NEXT: s_endpgm ; ; GFX11-LABEL: global_min_saddr_i32_nortn: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_min_i32 v0, v1, s[2:3] -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_min_saddr_i32_nortn: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_atomic_min_i32 v0, v1, s[2:3] scope:SCOPE_SE -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: global_atomic_min_i32 v0, v1, s[2:3] ; GFX12-NEXT: s_endpgm 
%zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2515,30 +2406,19 @@ define amdgpu_ps void @global_min_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, } define amdgpu_ps void @global_min_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) { -; GFX9-LABEL: global_min_saddr_i32_nortn_neg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: global_atomic_smin v0, v1, s[2:3] offset:-128 -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_min_saddr_i32_nortn_neg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: global_atomic_smin v0, v1, s[2:3] offset:-128 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_min_saddr_i32_nortn_neg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_atomic_smin v0, v1, s[2:3] offset:-128 +; GCN-NEXT: s_endpgm ; ; GFX11-LABEL: global_min_saddr_i32_nortn_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_min_i32 v0, v1, s[2:3] offset:-128 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_min_saddr_i32_nortn_neg128: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_atomic_min_i32 v0, v1, s[2:3] offset:-128 scope:SCOPE_SE -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: global_atomic_min_i32 v0, v1, s[2:3] offset:-128 ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2548,31 +2428,22 @@ define amdgpu_ps void @global_min_saddr_i32_nortn_neg128(ptr addrspace(1) inreg } define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) { -; GFX9-LABEL: global_min_saddr_i64_rtn: -; GFX9: ; %bb.0: -; GFX9-NEXT: global_atomic_smin_x2 v[0:1], v0, v[1:2], s[2:3] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_min_saddr_i64_rtn: -; GFX10: 
; %bb.0: -; GFX10-NEXT: global_atomic_smin_x2 v[0:1], v0, v[1:2], s[2:3] glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_min_saddr_i64_rtn: +; GCN: ; %bb.0: +; GCN-NEXT: global_atomic_smin_x2 v[0:1], v0, v[1:2], s[2:3] glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: global_min_saddr_i64_rtn: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_min_i64 v[0:1], v0, v[1:2], s[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog ; ; GFX12-LABEL: global_min_saddr_i64_rtn: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_atomic_min_i64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: global_atomic_min_i64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2582,31 +2453,22 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn(ptr addrspace(1) inreg %s } define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) { -; GFX9-LABEL: global_min_saddr_i64_rtn_neg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: global_atomic_smin_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_min_saddr_i64_rtn_neg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: global_atomic_smin_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_min_saddr_i64_rtn_neg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_atomic_smin_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: 
; return to shader part epilog ; ; GFX11-LABEL: global_min_saddr_i64_rtn_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_min_i64 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog ; ; GFX12-LABEL: global_min_saddr_i64_rtn_neg128: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_atomic_min_i64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: global_atomic_min_i64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2617,30 +2479,19 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn_neg128(ptr addrspace(1) i } define amdgpu_ps void @global_min_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) { -; GFX9-LABEL: global_min_saddr_i64_nortn: -; GFX9: ; %bb.0: -; GFX9-NEXT: global_atomic_smin_x2 v0, v[1:2], s[2:3] -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_min_saddr_i64_nortn: -; GFX10: ; %bb.0: -; GFX10-NEXT: global_atomic_smin_x2 v0, v[1:2], s[2:3] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_min_saddr_i64_nortn: +; GCN: ; %bb.0: +; GCN-NEXT: global_atomic_smin_x2 v0, v[1:2], s[2:3] +; GCN-NEXT: s_endpgm ; ; GFX11-LABEL: global_min_saddr_i64_nortn: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_min_i64 v0, v[1:2], s[2:3] -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_min_saddr_i64_nortn: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_atomic_min_i64 v0, v[1:2], s[2:3] scope:SCOPE_SE -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: global_atomic_min_i64 v0, v[1:2], s[2:3] ; GFX12-NEXT: 
s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2649,30 +2500,19 @@ define amdgpu_ps void @global_min_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, } define amdgpu_ps void @global_min_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) { -; GFX9-LABEL: global_min_saddr_i64_nortn_neg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: global_atomic_smin_x2 v0, v[1:2], s[2:3] offset:-128 -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_min_saddr_i64_nortn_neg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: global_atomic_smin_x2 v0, v[1:2], s[2:3] offset:-128 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_min_saddr_i64_nortn_neg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_atomic_smin_x2 v0, v[1:2], s[2:3] offset:-128 +; GCN-NEXT: s_endpgm ; ; GFX11-LABEL: global_min_saddr_i64_nortn_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_min_i64 v0, v[1:2], s[2:3] offset:-128 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_min_saddr_i64_nortn_neg128: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_atomic_min_i64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_SE -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: global_atomic_min_i64 v0, v[1:2], s[2:3] offset:-128 ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2686,31 +2526,22 @@ define amdgpu_ps void @global_min_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; -------------------------------------------------------------------------------- define amdgpu_ps float @global_umax_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) { -; GFX9-LABEL: global_umax_saddr_i32_rtn: -; GFX9: ; %bb.0: -; GFX9-NEXT: global_atomic_umax v0, v0, v1, s[2:3] glc -; GFX9-NEXT: s_waitcnt 
vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_umax_saddr_i32_rtn: -; GFX10: ; %bb.0: -; GFX10-NEXT: global_atomic_umax v0, v0, v1, s[2:3] glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_umax_saddr_i32_rtn: +; GCN: ; %bb.0: +; GCN-NEXT: global_atomic_umax v0, v0, v1, s[2:3] glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: global_umax_saddr_i32_rtn: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_max_u32 v0, v0, v1, s[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog ; ; GFX12-LABEL: global_umax_saddr_i32_rtn: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_atomic_max_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: global_atomic_max_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2720,31 +2551,22 @@ define amdgpu_ps float @global_umax_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, } define amdgpu_ps float @global_umax_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) { -; GFX9-LABEL: global_umax_saddr_i32_rtn_neg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: global_atomic_umax v0, v0, v1, s[2:3] offset:-128 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_umax_saddr_i32_rtn_neg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: global_atomic_umax v0, v0, v1, s[2:3] offset:-128 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_umax_saddr_i32_rtn_neg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_atomic_umax v0, v0, v1, s[2:3] offset:-128 glc +; GCN-NEXT: 
s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: global_umax_saddr_i32_rtn_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_max_u32 v0, v0, v1, s[2:3] offset:-128 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog ; ; GFX12-LABEL: global_umax_saddr_i32_rtn_neg128: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_atomic_max_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: global_atomic_max_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2755,30 +2577,19 @@ define amdgpu_ps float @global_umax_saddr_i32_rtn_neg128(ptr addrspace(1) inreg } define amdgpu_ps void @global_umax_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) { -; GFX9-LABEL: global_umax_saddr_i32_nortn: -; GFX9: ; %bb.0: -; GFX9-NEXT: global_atomic_umax v0, v1, s[2:3] -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_umax_saddr_i32_nortn: -; GFX10: ; %bb.0: -; GFX10-NEXT: global_atomic_umax v0, v1, s[2:3] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_umax_saddr_i32_nortn: +; GCN: ; %bb.0: +; GCN-NEXT: global_atomic_umax v0, v1, s[2:3] +; GCN-NEXT: s_endpgm ; ; GFX11-LABEL: global_umax_saddr_i32_nortn: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_max_u32 v0, v1, s[2:3] -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_umax_saddr_i32_nortn: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_atomic_max_u32 v0, v1, s[2:3] scope:SCOPE_SE -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: global_atomic_max_u32 v0, v1, s[2:3] ; GFX12-NEXT: s_endpgm %zext.offset 
= zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2787,30 +2598,19 @@ define amdgpu_ps void @global_umax_saddr_i32_nortn(ptr addrspace(1) inreg %sbase } define amdgpu_ps void @global_umax_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) { -; GFX9-LABEL: global_umax_saddr_i32_nortn_neg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: global_atomic_umax v0, v1, s[2:3] offset:-128 -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_umax_saddr_i32_nortn_neg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: global_atomic_umax v0, v1, s[2:3] offset:-128 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_umax_saddr_i32_nortn_neg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_atomic_umax v0, v1, s[2:3] offset:-128 +; GCN-NEXT: s_endpgm ; ; GFX11-LABEL: global_umax_saddr_i32_nortn_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_max_u32 v0, v1, s[2:3] offset:-128 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_umax_saddr_i32_nortn_neg128: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_atomic_max_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_SE -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: global_atomic_max_u32 v0, v1, s[2:3] offset:-128 ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2820,31 +2620,22 @@ define amdgpu_ps void @global_umax_saddr_i32_nortn_neg128(ptr addrspace(1) inreg } define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) { -; GFX9-LABEL: global_umax_saddr_i64_rtn: -; GFX9: ; %bb.0: -; GFX9-NEXT: global_atomic_umax_x2 v[0:1], v0, v[1:2], s[2:3] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_umax_saddr_i64_rtn: -; GFX10: ; 
%bb.0: -; GFX10-NEXT: global_atomic_umax_x2 v[0:1], v0, v[1:2], s[2:3] glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_umax_saddr_i64_rtn: +; GCN: ; %bb.0: +; GCN-NEXT: global_atomic_umax_x2 v[0:1], v0, v[1:2], s[2:3] glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: global_umax_saddr_i64_rtn: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_max_u64 v[0:1], v0, v[1:2], s[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog ; ; GFX12-LABEL: global_umax_saddr_i64_rtn: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_atomic_max_u64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: global_atomic_max_u64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2854,31 +2645,22 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn(ptr addrspace(1) inreg % } define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) { -; GFX9-LABEL: global_umax_saddr_i64_rtn_neg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: global_atomic_umax_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_umax_saddr_i64_rtn_neg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: global_atomic_umax_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_umax_saddr_i64_rtn_neg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_atomic_umax_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; 
GCN-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: global_umax_saddr_i64_rtn_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_max_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog ; ; GFX12-LABEL: global_umax_saddr_i64_rtn_neg128: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_atomic_max_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: global_atomic_max_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2889,30 +2671,19 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn_neg128(ptr addrspace(1) } define amdgpu_ps void @global_umax_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) { -; GFX9-LABEL: global_umax_saddr_i64_nortn: -; GFX9: ; %bb.0: -; GFX9-NEXT: global_atomic_umax_x2 v0, v[1:2], s[2:3] -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_umax_saddr_i64_nortn: -; GFX10: ; %bb.0: -; GFX10-NEXT: global_atomic_umax_x2 v0, v[1:2], s[2:3] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_umax_saddr_i64_nortn: +; GCN: ; %bb.0: +; GCN-NEXT: global_atomic_umax_x2 v0, v[1:2], s[2:3] +; GCN-NEXT: s_endpgm ; ; GFX11-LABEL: global_umax_saddr_i64_nortn: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_max_u64 v0, v[1:2], s[2:3] -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_umax_saddr_i64_nortn: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_atomic_max_u64 v0, v[1:2], s[2:3] scope:SCOPE_SE -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: global_atomic_max_u64 v0, v[1:2], s[2:3] ; 
GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2921,30 +2692,19 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn(ptr addrspace(1) inreg %sbase } define amdgpu_ps void @global_umax_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) { -; GFX9-LABEL: global_umax_saddr_i64_nortn_neg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: global_atomic_umax_x2 v0, v[1:2], s[2:3] offset:-128 -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_umax_saddr_i64_nortn_neg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: global_atomic_umax_x2 v0, v[1:2], s[2:3] offset:-128 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_umax_saddr_i64_nortn_neg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_atomic_umax_x2 v0, v[1:2], s[2:3] offset:-128 +; GCN-NEXT: s_endpgm ; ; GFX11-LABEL: global_umax_saddr_i64_nortn_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_max_u64 v0, v[1:2], s[2:3] offset:-128 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_umax_saddr_i64_nortn_neg128: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_atomic_max_u64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_SE -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: global_atomic_max_u64 v0, v[1:2], s[2:3] offset:-128 ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2958,31 +2718,22 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; -------------------------------------------------------------------------------- define amdgpu_ps float @global_umin_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) { -; GFX9-LABEL: global_umin_saddr_i32_rtn: -; GFX9: ; %bb.0: -; GFX9-NEXT: global_atomic_umin v0, v0, v1, s[2:3] glc -; 
GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_umin_saddr_i32_rtn: -; GFX10: ; %bb.0: -; GFX10-NEXT: global_atomic_umin v0, v0, v1, s[2:3] glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_umin_saddr_i32_rtn: +; GCN: ; %bb.0: +; GCN-NEXT: global_atomic_umin v0, v0, v1, s[2:3] glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: global_umin_saddr_i32_rtn: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_min_u32 v0, v0, v1, s[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog ; ; GFX12-LABEL: global_umin_saddr_i32_rtn: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_atomic_min_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: global_atomic_min_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -2992,31 +2743,22 @@ define amdgpu_ps float @global_umin_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, } define amdgpu_ps float @global_umin_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) { -; GFX9-LABEL: global_umin_saddr_i32_rtn_neg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: global_atomic_umin v0, v0, v1, s[2:3] offset:-128 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_umin_saddr_i32_rtn_neg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: global_atomic_umin v0, v0, v1, s[2:3] offset:-128 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_umin_saddr_i32_rtn_neg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_atomic_umin v0, v0, v1, s[2:3] 
offset:-128 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: global_umin_saddr_i32_rtn_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_min_u32 v0, v0, v1, s[2:3] offset:-128 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog ; ; GFX12-LABEL: global_umin_saddr_i32_rtn_neg128: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_atomic_min_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: global_atomic_min_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -3027,30 +2769,19 @@ define amdgpu_ps float @global_umin_saddr_i32_rtn_neg128(ptr addrspace(1) inreg } define amdgpu_ps void @global_umin_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) { -; GFX9-LABEL: global_umin_saddr_i32_nortn: -; GFX9: ; %bb.0: -; GFX9-NEXT: global_atomic_umin v0, v1, s[2:3] -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_umin_saddr_i32_nortn: -; GFX10: ; %bb.0: -; GFX10-NEXT: global_atomic_umin v0, v1, s[2:3] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_umin_saddr_i32_nortn: +; GCN: ; %bb.0: +; GCN-NEXT: global_atomic_umin v0, v1, s[2:3] +; GCN-NEXT: s_endpgm ; ; GFX11-LABEL: global_umin_saddr_i32_nortn: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_min_u32 v0, v1, s[2:3] -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_umin_saddr_i32_nortn: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_atomic_min_u32 v0, v1, s[2:3] scope:SCOPE_SE -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: global_atomic_min_u32 v0, v1, s[2:3] ; 
GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -3059,30 +2790,19 @@ define amdgpu_ps void @global_umin_saddr_i32_nortn(ptr addrspace(1) inreg %sbase } define amdgpu_ps void @global_umin_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) { -; GFX9-LABEL: global_umin_saddr_i32_nortn_neg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: global_atomic_umin v0, v1, s[2:3] offset:-128 -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_umin_saddr_i32_nortn_neg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: global_atomic_umin v0, v1, s[2:3] offset:-128 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_umin_saddr_i32_nortn_neg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_atomic_umin v0, v1, s[2:3] offset:-128 +; GCN-NEXT: s_endpgm ; ; GFX11-LABEL: global_umin_saddr_i32_nortn_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_min_u32 v0, v1, s[2:3] offset:-128 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_umin_saddr_i32_nortn_neg128: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_atomic_min_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_SE -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: global_atomic_min_u32 v0, v1, s[2:3] offset:-128 ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -3092,31 +2812,22 @@ define amdgpu_ps void @global_umin_saddr_i32_nortn_neg128(ptr addrspace(1) inreg } define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) { -; GFX9-LABEL: global_umin_saddr_i64_rtn: -; GFX9: ; %bb.0: -; GFX9-NEXT: global_atomic_umin_x2 v[0:1], v0, v[1:2], s[2:3] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: 
global_umin_saddr_i64_rtn: -; GFX10: ; %bb.0: -; GFX10-NEXT: global_atomic_umin_x2 v[0:1], v0, v[1:2], s[2:3] glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_umin_saddr_i64_rtn: +; GCN: ; %bb.0: +; GCN-NEXT: global_atomic_umin_x2 v[0:1], v0, v[1:2], s[2:3] glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: global_umin_saddr_i64_rtn: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_min_u64 v[0:1], v0, v[1:2], s[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog ; ; GFX12-LABEL: global_umin_saddr_i64_rtn: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_atomic_min_u64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: global_atomic_min_u64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -3126,31 +2837,22 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn(ptr addrspace(1) inreg % } define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) { -; GFX9-LABEL: global_umin_saddr_i64_rtn_neg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: global_atomic_umin_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_umin_saddr_i64_rtn_neg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: global_atomic_umin_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_umin_saddr_i64_rtn_neg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_atomic_umin_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc 
+; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: global_umin_saddr_i64_rtn_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_min_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog ; ; GFX12-LABEL: global_umin_saddr_i64_rtn_neg128: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_atomic_min_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: global_atomic_min_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -3161,30 +2863,19 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn_neg128(ptr addrspace(1) } define amdgpu_ps void @global_umin_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) { -; GFX9-LABEL: global_umin_saddr_i64_nortn: -; GFX9: ; %bb.0: -; GFX9-NEXT: global_atomic_umin_x2 v0, v[1:2], s[2:3] -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_umin_saddr_i64_nortn: -; GFX10: ; %bb.0: -; GFX10-NEXT: global_atomic_umin_x2 v0, v[1:2], s[2:3] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_umin_saddr_i64_nortn: +; GCN: ; %bb.0: +; GCN-NEXT: global_atomic_umin_x2 v0, v[1:2], s[2:3] +; GCN-NEXT: s_endpgm ; ; GFX11-LABEL: global_umin_saddr_i64_nortn: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_min_u64 v0, v[1:2], s[2:3] -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_umin_saddr_i64_nortn: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_atomic_min_u64 v0, v[1:2], s[2:3] scope:SCOPE_SE -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: 
global_atomic_min_u64 v0, v[1:2], s[2:3] ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -3193,30 +2884,19 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn(ptr addrspace(1) inreg %sbase } define amdgpu_ps void @global_umin_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) { -; GFX9-LABEL: global_umin_saddr_i64_nortn_neg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: global_atomic_umin_x2 v0, v[1:2], s[2:3] offset:-128 -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_umin_saddr_i64_nortn_neg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: global_atomic_umin_x2 v0, v[1:2], s[2:3] offset:-128 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_umin_saddr_i64_nortn_neg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_atomic_umin_x2 v0, v[1:2], s[2:3] offset:-128 +; GCN-NEXT: s_endpgm ; ; GFX11-LABEL: global_umin_saddr_i64_nortn_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_min_u64 v0, v[1:2], s[2:3] offset:-128 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_umin_saddr_i64_nortn_neg128: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_atomic_min_u64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_SE -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: global_atomic_min_u64 v0, v[1:2], s[2:3] offset:-128 ; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-single-wave-workgroup-memops.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-single-wave-workgroup-memops.ll new file mode 100644 index 0000000000000..aaa295992c361 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-single-wave-workgroup-memops.ll @@ -0,0 +1,2759 @@ +; NOTE: Assertions have been 
autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -stop-after=si-memory-legalizer < %s -o - | FileCheck %s --check-prefixes=GFX9 +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -stop-after=si-memory-legalizer < %s -o - | FileCheck %s --check-prefixes=GFX942 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -stop-after=si-memory-legalizer < %s -o - | FileCheck %s --check-prefixes=GFX10,GFX10-W32 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -stop-after=si-memory-legalizer < %s -o - | FileCheck %s --check-prefixes=GFX10,GFX10-W64 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -stop-after=si-memory-legalizer < %s -o - | FileCheck %s --check-prefixes=GFX12,GFX12-W32 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -stop-after=si-memory-legalizer < %s -o - | FileCheck %s --check-prefixes=GFX12,GFX12-W64 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -stop-after=si-memory-legalizer < %s -o - | FileCheck %s --check-prefixes=GFX1250 + + +define amdgpu_kernel void @wg_fence_acq_rel_single32() #0 { + ; GFX9-LABEL: name: wg_fence_acq_rel_single32 + ; GFX9: bb.0 (%ir-block.0): + ; GFX9-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: wg_fence_acq_rel_single32 + ; GFX942: bb.0 (%ir-block.0): + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX10-LABEL: name: wg_fence_acq_rel_single32 + ; GFX10: bb.0 (%ir-block.0): + ; GFX10-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: wg_fence_acq_rel_single32 + ; GFX12: bb.0 (%ir-block.0): + ; GFX12-NEXT: S_ENDPGM 0 + ; + ; GFX1250-LABEL: name: wg_fence_acq_rel_single32 + ; GFX1250: bb.0 (%ir-block.0): + ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode + ; GFX1250-NEXT: S_ENDPGM 0 + fence syncscope("workgroup") acq_rel + ret void +} + +define amdgpu_kernel void @wg_fence_acq_rel_single64() #1 { + ; GFX9-LABEL: name: wg_fence_acq_rel_single64 + ; GFX9: bb.0 (%ir-block.0): + ; GFX9-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: wg_fence_acq_rel_single64 + ; 
GFX942: bb.0 (%ir-block.0): + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX10-W32-LABEL: name: wg_fence_acq_rel_single64 + ; GFX10-W32: bb.0 (%ir-block.0): + ; GFX10-W32-NEXT: S_WAITCNT_soft 112 + ; GFX10-W32-NEXT: S_WAITCNT_lds_direct + ; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 + ; GFX10-W32-NEXT: BUFFER_GL0_INV implicit $exec + ; GFX10-W32-NEXT: S_ENDPGM 0 + ; + ; GFX10-W64-LABEL: name: wg_fence_acq_rel_single64 + ; GFX10-W64: bb.0 (%ir-block.0): + ; GFX10-W64-NEXT: S_ENDPGM 0 + ; + ; GFX12-W32-LABEL: name: wg_fence_acq_rel_single64 + ; GFX12-W32: bb.0 (%ir-block.0): + ; GFX12-W32-NEXT: S_WAIT_BVHCNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_SAMPLECNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0 + ; GFX12-W32-NEXT: GLOBAL_INV 8, implicit $exec + ; GFX12-W32-NEXT: S_ENDPGM 0 + ; + ; GFX12-W64-LABEL: name: wg_fence_acq_rel_single64 + ; GFX12-W64: bb.0 (%ir-block.0): + ; GFX12-W64-NEXT: S_ENDPGM 0 + ; + ; GFX1250-LABEL: name: wg_fence_acq_rel_single64 + ; GFX1250: bb.0 (%ir-block.0): + ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode + ; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0 + ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX1250-NEXT: S_WAIT_DSCNT_soft 0 + ; GFX1250-NEXT: S_ENDPGM 0 + fence syncscope("workgroup") acq_rel + ret void +} + +define amdgpu_kernel void @wg_fence_acq_rel_multi() #2 { + ; GFX9-LABEL: name: wg_fence_acq_rel_multi + ; GFX9: bb.0 (%ir-block.0): + ; GFX9-NEXT: S_WAITCNT_soft 49279 + ; GFX9-NEXT: S_WAITCNT_lds_direct + ; GFX9-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: wg_fence_acq_rel_multi + ; GFX942: bb.0 (%ir-block.0): + ; GFX942-NEXT: S_WAITCNT_soft 49279 + ; GFX942-NEXT: S_WAITCNT_lds_direct + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX10-LABEL: name: wg_fence_acq_rel_multi + ; GFX10: bb.0 (%ir-block.0): + ; GFX10-NEXT: S_WAITCNT_soft 112 + ; GFX10-NEXT: S_WAITCNT_lds_direct + ; GFX10-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 + 
; GFX10-NEXT: BUFFER_GL0_INV implicit $exec + ; GFX10-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: wg_fence_acq_rel_multi + ; GFX12: bb.0 (%ir-block.0): + ; GFX12-NEXT: S_WAIT_BVHCNT_soft 0 + ; GFX12-NEXT: S_WAIT_SAMPLECNT_soft 0 + ; GFX12-NEXT: S_WAIT_LOADCNT_soft 0 + ; GFX12-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX12-NEXT: S_WAIT_DSCNT_soft 0 + ; GFX12-NEXT: GLOBAL_INV 8, implicit $exec + ; GFX12-NEXT: S_ENDPGM 0 + ; + ; GFX1250-LABEL: name: wg_fence_acq_rel_multi + ; GFX1250: bb.0 (%ir-block.0): + ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode + ; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0 + ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX1250-NEXT: S_WAIT_DSCNT_soft 0 + ; GFX1250-NEXT: S_ENDPGM 0 + fence syncscope("workgroup") acq_rel + ret void +} + +define amdgpu_kernel void @wg_fence_acquire_single64() #1 { + ; GFX9-LABEL: name: wg_fence_acquire_single64 + ; GFX9: bb.0 (%ir-block.0): + ; GFX9-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: wg_fence_acquire_single64 + ; GFX942: bb.0 (%ir-block.0): + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX10-W32-LABEL: name: wg_fence_acquire_single64 + ; GFX10-W32: bb.0 (%ir-block.0): + ; GFX10-W32-NEXT: S_WAITCNT_soft 112 + ; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 + ; GFX10-W32-NEXT: BUFFER_GL0_INV implicit $exec + ; GFX10-W32-NEXT: S_ENDPGM 0 + ; + ; GFX10-W64-LABEL: name: wg_fence_acquire_single64 + ; GFX10-W64: bb.0 (%ir-block.0): + ; GFX10-W64-NEXT: S_ENDPGM 0 + ; + ; GFX12-W32-LABEL: name: wg_fence_acquire_single64 + ; GFX12-W32: bb.0 (%ir-block.0): + ; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0 + ; GFX12-W32-NEXT: GLOBAL_INV 8, implicit $exec + ; GFX12-W32-NEXT: S_ENDPGM 0 + ; + ; GFX12-W64-LABEL: name: wg_fence_acquire_single64 + ; GFX12-W64: bb.0 (%ir-block.0): + ; GFX12-W64-NEXT: S_ENDPGM 0 + ; + ; GFX1250-LABEL: name: wg_fence_acquire_single64 + ; GFX1250: bb.0 (%ir-block.0): + ; GFX1250-NEXT: 
S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode + ; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0 + ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX1250-NEXT: S_WAIT_DSCNT_soft 0 + ; GFX1250-NEXT: S_ENDPGM 0 + fence syncscope("workgroup") acquire + ret void +} + +define amdgpu_kernel void @wg_fence_release_single64() #1 { + ; GFX9-LABEL: name: wg_fence_release_single64 + ; GFX9: bb.0 (%ir-block.0): + ; GFX9-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: wg_fence_release_single64 + ; GFX942: bb.0 (%ir-block.0): + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX10-W32-LABEL: name: wg_fence_release_single64 + ; GFX10-W32: bb.0 (%ir-block.0): + ; GFX10-W32-NEXT: S_WAITCNT_soft 112 + ; GFX10-W32-NEXT: S_WAITCNT_lds_direct + ; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 + ; GFX10-W32-NEXT: S_ENDPGM 0 + ; + ; GFX10-W64-LABEL: name: wg_fence_release_single64 + ; GFX10-W64: bb.0 (%ir-block.0): + ; GFX10-W64-NEXT: S_ENDPGM 0 + ; + ; GFX12-W32-LABEL: name: wg_fence_release_single64 + ; GFX12-W32: bb.0 (%ir-block.0): + ; GFX12-W32-NEXT: S_WAIT_BVHCNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_SAMPLECNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0 + ; GFX12-W32-NEXT: S_ENDPGM 0 + ; + ; GFX12-W64-LABEL: name: wg_fence_release_single64 + ; GFX12-W64: bb.0 (%ir-block.0): + ; GFX12-W64-NEXT: S_ENDPGM 0 + ; + ; GFX1250-LABEL: name: wg_fence_release_single64 + ; GFX1250: bb.0 (%ir-block.0): + ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode + ; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0 + ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX1250-NEXT: S_WAIT_DSCNT_soft 0 + ; GFX1250-NEXT: S_ENDPGM 0 + fence syncscope("workgroup") release + ret void +} + +define amdgpu_kernel void @wg_fence_seq_cst_single64() #1 { + ; GFX9-LABEL: name: wg_fence_seq_cst_single64 + ; GFX9: bb.0 (%ir-block.0): + ; GFX9-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: wg_fence_seq_cst_single64 + ; GFX942: bb.0 
(%ir-block.0): + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX10-W32-LABEL: name: wg_fence_seq_cst_single64 + ; GFX10-W32: bb.0 (%ir-block.0): + ; GFX10-W32-NEXT: S_WAITCNT_soft 112 + ; GFX10-W32-NEXT: S_WAITCNT_lds_direct + ; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 + ; GFX10-W32-NEXT: BUFFER_GL0_INV implicit $exec + ; GFX10-W32-NEXT: S_ENDPGM 0 + ; + ; GFX10-W64-LABEL: name: wg_fence_seq_cst_single64 + ; GFX10-W64: bb.0 (%ir-block.0): + ; GFX10-W64-NEXT: S_ENDPGM 0 + ; + ; GFX12-W32-LABEL: name: wg_fence_seq_cst_single64 + ; GFX12-W32: bb.0 (%ir-block.0): + ; GFX12-W32-NEXT: S_WAIT_BVHCNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_SAMPLECNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0 + ; GFX12-W32-NEXT: GLOBAL_INV 8, implicit $exec + ; GFX12-W32-NEXT: S_ENDPGM 0 + ; + ; GFX12-W64-LABEL: name: wg_fence_seq_cst_single64 + ; GFX12-W64: bb.0 (%ir-block.0): + ; GFX12-W64-NEXT: S_ENDPGM 0 + ; + ; GFX1250-LABEL: name: wg_fence_seq_cst_single64 + ; GFX1250: bb.0 (%ir-block.0): + ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode + ; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0 + ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX1250-NEXT: S_WAIT_DSCNT_soft 0 + ; GFX1250-NEXT: S_ENDPGM 0 + fence syncscope("workgroup") seq_cst + ret void +} + +define amdgpu_kernel void @wg_ld_seq_cst_single32(ptr addrspace(1) %p) #0 { + ; GFX9-LABEL: name: wg_ld_seq_cst_single32 + ; GFX9: bb.0 (%ir-block.0): + ; GFX9-NEXT: liveins: $sgpr4_sgpr5 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX9-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX9-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" 
load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1) + ; GFX9-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: wg_ld_seq_cst_single32 + ; GFX942: bb.0 (%ir-block.0): + ; GFX942-NEXT: liveins: $sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX942-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX942-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1) + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX10-LABEL: name: wg_ld_seq_cst_single32 + ; GFX10: bb.0 (%ir-block.0): + ; GFX10-NEXT: liveins: $sgpr4_sgpr5 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX10-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX10-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1) + ; GFX10-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: wg_ld_seq_cst_single32 + ; GFX12: bb.0 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr4_sgpr5 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX12-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX12-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: 
("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1) + ; GFX12-NEXT: S_ENDPGM 0 + ; + ; GFX1250-LABEL: name: wg_ld_seq_cst_single32 + ; GFX1250: bb.0 (%ir-block.0): + ; GFX1250-NEXT: liveins: $sgpr4_sgpr5 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode + ; GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX1250-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX1250-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1) + ; GFX1250-NEXT: S_ENDPGM 0 + %v = load atomic i32, ptr addrspace(1) %p syncscope("workgroup") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @wg_ld_seq_cst_single64(ptr addrspace(1) %p) #1 { + ; GFX9-LABEL: name: wg_ld_seq_cst_single64 + ; GFX9: bb.0 (%ir-block.0): + ; GFX9-NEXT: liveins: $sgpr4_sgpr5 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX9-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX9-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1) + ; GFX9-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: wg_ld_seq_cst_single64 + ; GFX942: bb.0 (%ir-block.0): + ; GFX942-NEXT: liveins: $sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: 
(dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX942-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX942-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1) + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX10-W32-LABEL: name: wg_ld_seq_cst_single64 + ; GFX10-W32: bb.0 (%ir-block.0): + ; GFX10-W32-NEXT: liveins: $sgpr4_sgpr5 + ; GFX10-W32-NEXT: {{ $}} + ; GFX10-W32-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX10-W32-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX10-W32-NEXT: S_WAITCNT_soft 112 + ; GFX10-W32-NEXT: S_WAITCNT_lds_direct + ; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 + ; GFX10-W32-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 1, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1) + ; GFX10-W32-NEXT: S_WAITCNT_soft 16240 + ; GFX10-W32-NEXT: BUFFER_GL0_INV implicit $exec + ; GFX10-W32-NEXT: S_ENDPGM 0 + ; + ; GFX10-W64-LABEL: name: wg_ld_seq_cst_single64 + ; GFX10-W64: bb.0 (%ir-block.0): + ; GFX10-W64-NEXT: liveins: $sgpr4_sgpr5 + ; GFX10-W64-NEXT: {{ $}} + ; GFX10-W64-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX10-W64-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX10-W64-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") 
seq_cst (s32) from %ir.p.load, addrspace 1) + ; GFX10-W64-NEXT: S_ENDPGM 0 + ; + ; GFX12-W32-LABEL: name: wg_ld_seq_cst_single64 + ; GFX12-W32: bb.0 (%ir-block.0): + ; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5 + ; GFX12-W32-NEXT: {{ $}} + ; GFX12-W32-NEXT: renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX12-W32-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX12-W32-NEXT: S_WAIT_BVHCNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_SAMPLECNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0 + ; GFX12-W32-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 8, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1) + ; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0 + ; GFX12-W32-NEXT: GLOBAL_INV 8, implicit $exec + ; GFX12-W32-NEXT: S_ENDPGM 0 + ; + ; GFX12-W64-LABEL: name: wg_ld_seq_cst_single64 + ; GFX12-W64: bb.0 (%ir-block.0): + ; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5 + ; GFX12-W64-NEXT: {{ $}} + ; GFX12-W64-NEXT: renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX12-W64-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX12-W64-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1) + ; GFX12-W64-NEXT: S_ENDPGM 0 + ; + ; GFX1250-LABEL: name: wg_ld_seq_cst_single64 + ; GFX1250: bb.0 (%ir-block.0): + ; GFX1250-NEXT: liveins: $sgpr4_sgpr5 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode + ; 
GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX1250-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0 + ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX1250-NEXT: S_WAIT_DSCNT_soft 0 + ; GFX1250-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1) + ; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0 + ; GFX1250-NEXT: S_ENDPGM 0 + %v = load atomic i32, ptr addrspace(1) %p syncscope("workgroup") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @wg_ld_seq_cst_multi(ptr addrspace(1) %p) #2 { + ; GFX9-LABEL: name: wg_ld_seq_cst_multi + ; GFX9: bb.0 (%ir-block.0): + ; GFX9-NEXT: liveins: $sgpr4_sgpr5 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX9-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX9-NEXT: S_WAITCNT_soft 49279 + ; GFX9-NEXT: S_WAITCNT_lds_direct + ; GFX9-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1) + ; GFX9-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: wg_ld_seq_cst_multi + ; GFX942: bb.0 (%ir-block.0): + ; GFX942-NEXT: liveins: $sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX942-NEXT: renamable $vgpr0 = 
V_MOV_B32_e32 0, implicit $exec + ; GFX942-NEXT: S_WAITCNT_soft 49279 + ; GFX942-NEXT: S_WAITCNT_lds_direct + ; GFX942-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 1, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1) + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX10-LABEL: name: wg_ld_seq_cst_multi + ; GFX10: bb.0 (%ir-block.0): + ; GFX10-NEXT: liveins: $sgpr4_sgpr5 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX10-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX10-NEXT: S_WAITCNT_soft 112 + ; GFX10-NEXT: S_WAITCNT_lds_direct + ; GFX10-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 + ; GFX10-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 1, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1) + ; GFX10-NEXT: S_WAITCNT_soft 16240 + ; GFX10-NEXT: BUFFER_GL0_INV implicit $exec + ; GFX10-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: wg_ld_seq_cst_multi + ; GFX12: bb.0 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr4_sgpr5 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX12-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX12-NEXT: S_WAIT_BVHCNT_soft 0 + ; GFX12-NEXT: S_WAIT_SAMPLECNT_soft 0 + ; GFX12-NEXT: S_WAIT_LOADCNT_soft 0 + ; GFX12-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX12-NEXT: S_WAIT_DSCNT_soft 0 + ; GFX12-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 8, implicit $exec :: 
("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1) + ; GFX12-NEXT: S_WAIT_LOADCNT_soft 0 + ; GFX12-NEXT: GLOBAL_INV 8, implicit $exec + ; GFX12-NEXT: S_ENDPGM 0 + ; + ; GFX1250-LABEL: name: wg_ld_seq_cst_multi + ; GFX1250: bb.0 (%ir-block.0): + ; GFX1250-NEXT: liveins: $sgpr4_sgpr5 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode + ; GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX1250-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0 + ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX1250-NEXT: S_WAIT_DSCNT_soft 0 + ; GFX1250-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 1) + ; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0 + ; GFX1250-NEXT: S_ENDPGM 0 + %v = load atomic i32, ptr addrspace(1) %p syncscope("workgroup") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @wg_ld_acquire_single64(ptr addrspace(1) %p) #1 { + ; GFX9-LABEL: name: wg_ld_acquire_single64 + ; GFX9: bb.0 (%ir-block.0): + ; GFX9-NEXT: liveins: $sgpr4_sgpr5 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX9-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX9-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") acquire (s32) from %ir.p.load, addrspace 1) + ; GFX9-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: 
name: wg_ld_acquire_single64 + ; GFX942: bb.0 (%ir-block.0): + ; GFX942-NEXT: liveins: $sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX942-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX942-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") acquire (s32) from %ir.p.load, addrspace 1) + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX10-W32-LABEL: name: wg_ld_acquire_single64 + ; GFX10-W32: bb.0 (%ir-block.0): + ; GFX10-W32-NEXT: liveins: $sgpr4_sgpr5 + ; GFX10-W32-NEXT: {{ $}} + ; GFX10-W32-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX10-W32-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX10-W32-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 1, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") acquire (s32) from %ir.p.load, addrspace 1) + ; GFX10-W32-NEXT: S_WAITCNT_soft 16240 + ; GFX10-W32-NEXT: BUFFER_GL0_INV implicit $exec + ; GFX10-W32-NEXT: S_ENDPGM 0 + ; + ; GFX10-W64-LABEL: name: wg_ld_acquire_single64 + ; GFX10-W64: bb.0 (%ir-block.0): + ; GFX10-W64-NEXT: liveins: $sgpr4_sgpr5 + ; GFX10-W64-NEXT: {{ $}} + ; GFX10-W64-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX10-W64-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX10-W64-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, 
killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") acquire (s32) from %ir.p.load, addrspace 1) + ; GFX10-W64-NEXT: S_ENDPGM 0 + ; + ; GFX12-W32-LABEL: name: wg_ld_acquire_single64 + ; GFX12-W32: bb.0 (%ir-block.0): + ; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5 + ; GFX12-W32-NEXT: {{ $}} + ; GFX12-W32-NEXT: renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX12-W32-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX12-W32-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 8, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") acquire (s32) from %ir.p.load, addrspace 1) + ; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0 + ; GFX12-W32-NEXT: GLOBAL_INV 8, implicit $exec + ; GFX12-W32-NEXT: S_ENDPGM 0 + ; + ; GFX12-W64-LABEL: name: wg_ld_acquire_single64 + ; GFX12-W64: bb.0 (%ir-block.0): + ; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5 + ; GFX12-W64-NEXT: {{ $}} + ; GFX12-W64-NEXT: renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX12-W64-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX12-W64-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") acquire (s32) from %ir.p.load, addrspace 1) + ; GFX12-W64-NEXT: S_ENDPGM 0 + ; + ; GFX1250-LABEL: name: wg_ld_acquire_single64 + ; GFX1250: bb.0 (%ir-block.0): + ; GFX1250-NEXT: liveins: $sgpr4_sgpr5 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode + ; GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: 
(dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX1250-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX1250-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") acquire (s32) from %ir.p.load, addrspace 1) + ; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0 + ; GFX1250-NEXT: S_ENDPGM 0 + %v = load atomic i32, ptr addrspace(1) %p syncscope("workgroup") acquire, align 4 + ret void +} + +define amdgpu_kernel void @wg_ld_monotonic_single64(ptr addrspace(1) %p) #1 { + ; GFX9-LABEL: name: wg_ld_monotonic_single64 + ; GFX9: bb.0 (%ir-block.0): + ; GFX9-NEXT: liveins: $sgpr4_sgpr5 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX9-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX9-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") monotonic (s32) from %ir.p.load, addrspace 1) + ; GFX9-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: wg_ld_monotonic_single64 + ; GFX942: bb.0 (%ir-block.0): + ; GFX942-NEXT: liveins: $sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX942-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX942-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 1, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") monotonic (s32) from %ir.p.load, addrspace 1) + ; GFX942-NEXT: S_ENDPGM 0 + 
; + ; GFX10-LABEL: name: wg_ld_monotonic_single64 + ; GFX10: bb.0 (%ir-block.0): + ; GFX10-NEXT: liveins: $sgpr4_sgpr5 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX10-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX10-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 1, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") monotonic (s32) from %ir.p.load, addrspace 1) + ; GFX10-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: wg_ld_monotonic_single64 + ; GFX12: bb.0 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr4_sgpr5 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX12-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX12-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 8, implicit $exec :: ("amdgpu-noclobber" load syncscope("workgroup") monotonic (s32) from %ir.p.load, addrspace 1) + ; GFX12-NEXT: S_ENDPGM 0 + ; + ; GFX1250-LABEL: name: wg_ld_monotonic_single64 + ; GFX1250: bb.0 (%ir-block.0): + ; GFX1250-NEXT: liveins: $sgpr4_sgpr5 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode + ; GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX1250-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX1250-NEXT: dead renamable $vgpr0 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: 
("amdgpu-noclobber" load syncscope("workgroup") monotonic (s32) from %ir.p.load, addrspace 1) + ; GFX1250-NEXT: S_ENDPGM 0 + %v = load atomic i32, ptr addrspace(1) %p syncscope("workgroup") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @wg_st_seq_cst_single32(ptr addrspace(1) %p, i32 %x) #0 { + ; GFX9-LABEL: name: wg_st_seq_cst_single32 + ; GFX9: bb.0 (%ir-block.0): + ; GFX9-NEXT: liveins: $sgpr4_sgpr5 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4) + ; GFX9-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX9-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX9-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX9-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1) + ; GFX9-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: wg_st_seq_cst_single32 + ; GFX942: bb.0 (%ir-block.0): + ; GFX942-NEXT: liveins: $sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4) + ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX942-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX942-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 
0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1) + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX10-LABEL: name: wg_st_seq_cst_single32 + ; GFX10: bb.0 (%ir-block.0): + ; GFX10-NEXT: liveins: $sgpr4_sgpr5 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4) + ; GFX10-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX10-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1) + ; GFX10-NEXT: S_ENDPGM 0 + ; + ; GFX12-W32-LABEL: name: wg_st_seq_cst_single32 + ; GFX12-W32: bb.0 (%ir-block.0): + ; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5 + ; GFX12-W32-NEXT: {{ $}} + ; GFX12-W32-NEXT: renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX12-W32-NEXT: renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec + ; GFX12-W32-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1) + ; GFX12-W32-NEXT: S_ENDPGM 0 + ; + ; GFX12-W64-LABEL: name: wg_st_seq_cst_single32 + ; GFX12-W64: bb.0 (%ir-block.0): + ; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5 + ; GFX12-W64-NEXT: {{ $}} + ; GFX12-W64-NEXT: renamable 
$sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX12-W64-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX12-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX12-W64-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1) + ; GFX12-W64-NEXT: S_ENDPGM 0 + ; + ; GFX1250-LABEL: name: wg_st_seq_cst_single32 + ; GFX1250: bb.0 (%ir-block.0): + ; GFX1250-NEXT: liveins: $sgpr4_sgpr5 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode + ; GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX1250-NEXT: renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec + ; GFX1250-NEXT: S_WAIT_XCNT_soft 0 + ; GFX1250-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1) + ; GFX1250-NEXT: S_ENDPGM 0 + store atomic i32 %x, ptr addrspace(1) %p syncscope("workgroup") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @wg_st_seq_cst_single64(ptr addrspace(1) %p, i32 %x) #1 { + ; GFX9-LABEL: name: wg_st_seq_cst_single64 + ; GFX9: bb.0 (%ir-block.0): + ; GFX9-NEXT: liveins: $sgpr4_sgpr5 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4) + ; GFX9-NEXT: early-clobber renamable 
$sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX9-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX9-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX9-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1) + ; GFX9-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: wg_st_seq_cst_single64 + ; GFX942: bb.0 (%ir-block.0): + ; GFX942-NEXT: liveins: $sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4) + ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX942-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX942-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1) + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX10-W32-LABEL: name: wg_st_seq_cst_single64 + ; GFX10-W32: bb.0 (%ir-block.0): + ; GFX10-W32-NEXT: liveins: $sgpr4_sgpr5 + ; GFX10-W32-NEXT: {{ $}} + ; GFX10-W32-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4) + ; GFX10-W32-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, 
addrspace 4) + ; GFX10-W32-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX10-W32-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-W32-NEXT: S_WAITCNT_soft 112 + ; GFX10-W32-NEXT: S_WAITCNT_lds_direct + ; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 + ; GFX10-W32-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1) + ; GFX10-W32-NEXT: S_ENDPGM 0 + ; + ; GFX10-W64-LABEL: name: wg_st_seq_cst_single64 + ; GFX10-W64: bb.0 (%ir-block.0): + ; GFX10-W64-NEXT: liveins: $sgpr4_sgpr5 + ; GFX10-W64-NEXT: {{ $}} + ; GFX10-W64-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4) + ; GFX10-W64-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX10-W64-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX10-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-W64-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1) + ; GFX10-W64-NEXT: S_ENDPGM 0 + ; + ; GFX12-W32-LABEL: name: wg_st_seq_cst_single64 + ; GFX12-W32: bb.0 (%ir-block.0): + ; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5 + ; GFX12-W32-NEXT: {{ $}} + ; GFX12-W32-NEXT: renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX12-W32-NEXT: renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 0, killed $sgpr2, implicit $exec, 
implicit $exec, implicit $exec, implicit $exec + ; GFX12-W32-NEXT: S_WAIT_BVHCNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_SAMPLECNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0 + ; GFX12-W32-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 8, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1) + ; GFX12-W32-NEXT: S_ENDPGM 0 + ; + ; GFX12-W64-LABEL: name: wg_st_seq_cst_single64 + ; GFX12-W64: bb.0 (%ir-block.0): + ; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5 + ; GFX12-W64-NEXT: {{ $}} + ; GFX12-W64-NEXT: renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX12-W64-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX12-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX12-W64-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1) + ; GFX12-W64-NEXT: S_ENDPGM 0 + ; + ; GFX1250-LABEL: name: wg_st_seq_cst_single64 + ; GFX1250: bb.0 (%ir-block.0): + ; GFX1250-NEXT: liveins: $sgpr4_sgpr5 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode + ; GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX1250-NEXT: renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec + ; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0 + ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX1250-NEXT: 
S_WAIT_DSCNT_soft 0 + ; GFX1250-NEXT: S_WAIT_XCNT_soft 0 + ; GFX1250-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1) + ; GFX1250-NEXT: S_ENDPGM 0 + store atomic i32 %x, ptr addrspace(1) %p syncscope("workgroup") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @wg_st_seq_cst_multi(ptr addrspace(1) %p, i32 %x) #2 { + ; GFX9-LABEL: name: wg_st_seq_cst_multi + ; GFX9: bb.0 (%ir-block.0): + ; GFX9-NEXT: liveins: $sgpr4_sgpr5 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4) + ; GFX9-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX9-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX9-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX9-NEXT: S_WAITCNT_soft 49279 + ; GFX9-NEXT: S_WAITCNT_lds_direct + ; GFX9-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1) + ; GFX9-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: wg_st_seq_cst_multi + ; GFX942: bb.0 (%ir-block.0): + ; GFX942-NEXT: liveins: $sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4) + ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX942-NEXT: renamable $vgpr0 = 
V_MOV_B32_e32 0, implicit $exec + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX942-NEXT: S_WAITCNT_soft 49279 + ; GFX942-NEXT: S_WAITCNT_lds_direct + ; GFX942-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 1, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1) + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX10-LABEL: name: wg_st_seq_cst_multi + ; GFX10: bb.0 (%ir-block.0): + ; GFX10-NEXT: liveins: $sgpr4_sgpr5 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4) + ; GFX10-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX10-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-NEXT: S_WAITCNT_soft 112 + ; GFX10-NEXT: S_WAITCNT_lds_direct + ; GFX10-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 + ; GFX10-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1) + ; GFX10-NEXT: S_ENDPGM 0 + ; + ; GFX12-W32-LABEL: name: wg_st_seq_cst_multi + ; GFX12-W32: bb.0 (%ir-block.0): + ; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5 + ; GFX12-W32-NEXT: {{ $}} + ; GFX12-W32-NEXT: renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX12-W32-NEXT: renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec + ; 
GFX12-W32-NEXT: S_WAIT_BVHCNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_SAMPLECNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0 + ; GFX12-W32-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 8, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1) + ; GFX12-W32-NEXT: S_ENDPGM 0 + ; + ; GFX12-W64-LABEL: name: wg_st_seq_cst_multi + ; GFX12-W64: bb.0 (%ir-block.0): + ; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5 + ; GFX12-W64-NEXT: {{ $}} + ; GFX12-W64-NEXT: renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX12-W64-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX12-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX12-W64-NEXT: S_WAIT_BVHCNT_soft 0 + ; GFX12-W64-NEXT: S_WAIT_SAMPLECNT_soft 0 + ; GFX12-W64-NEXT: S_WAIT_LOADCNT_soft 0 + ; GFX12-W64-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX12-W64-NEXT: S_WAIT_DSCNT_soft 0 + ; GFX12-W64-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 8, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1) + ; GFX12-W64-NEXT: S_ENDPGM 0 + ; + ; GFX1250-LABEL: name: wg_st_seq_cst_multi + ; GFX1250: bb.0 (%ir-block.0): + ; GFX1250-NEXT: liveins: $sgpr4_sgpr5 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode + ; GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX1250-NEXT: renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 0, killed $sgpr2, implicit $exec, 
implicit $exec, implicit $exec, implicit $exec + ; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0 + ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX1250-NEXT: S_WAIT_DSCNT_soft 0 + ; GFX1250-NEXT: S_WAIT_XCNT_soft 0 + ; GFX1250-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load, addrspace 1) + ; GFX1250-NEXT: S_ENDPGM 0 + store atomic i32 %x, ptr addrspace(1) %p syncscope("workgroup") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @wg_st_release_single64(ptr addrspace(1) %p, i32 %x) #1 { + ; GFX9-LABEL: name: wg_st_release_single64 + ; GFX9: bb.0 (%ir-block.0): + ; GFX9-NEXT: liveins: $sgpr4_sgpr5 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4) + ; GFX9-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX9-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX9-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX9-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.p.load, addrspace 1) + ; GFX9-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: wg_st_release_single64 + ; GFX942: bb.0 (%ir-block.0): + ; GFX942-NEXT: liveins: $sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4) + ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) 
from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX942-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX942-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.p.load, addrspace 1) + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX10-W32-LABEL: name: wg_st_release_single64 + ; GFX10-W32: bb.0 (%ir-block.0): + ; GFX10-W32-NEXT: liveins: $sgpr4_sgpr5 + ; GFX10-W32-NEXT: {{ $}} + ; GFX10-W32-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4) + ; GFX10-W32-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX10-W32-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX10-W32-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-W32-NEXT: S_WAITCNT_soft 112 + ; GFX10-W32-NEXT: S_WAITCNT_lds_direct + ; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 + ; GFX10-W32-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.p.load, addrspace 1) + ; GFX10-W32-NEXT: S_ENDPGM 0 + ; + ; GFX10-W64-LABEL: name: wg_st_release_single64 + ; GFX10-W64: bb.0 (%ir-block.0): + ; GFX10-W64-NEXT: liveins: $sgpr4_sgpr5 + ; GFX10-W64-NEXT: {{ $}} + ; GFX10-W64-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4) + ; GFX10-W64-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: 
(dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX10-W64-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX10-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-W64-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.p.load, addrspace 1) + ; GFX10-W64-NEXT: S_ENDPGM 0 + ; + ; GFX12-W32-LABEL: name: wg_st_release_single64 + ; GFX12-W32: bb.0 (%ir-block.0): + ; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5 + ; GFX12-W32-NEXT: {{ $}} + ; GFX12-W32-NEXT: renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX12-W32-NEXT: renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec + ; GFX12-W32-NEXT: S_WAIT_BVHCNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_SAMPLECNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0 + ; GFX12-W32-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 8, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.p.load, addrspace 1) + ; GFX12-W32-NEXT: S_ENDPGM 0 + ; + ; GFX12-W64-LABEL: name: wg_st_release_single64 + ; GFX12-W64: bb.0 (%ir-block.0): + ; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5 + ; GFX12-W64-NEXT: {{ $}} + ; GFX12-W64-NEXT: renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX12-W64-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX12-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit 
$exec + ; GFX12-W64-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.p.load, addrspace 1) + ; GFX12-W64-NEXT: S_ENDPGM 0 + ; + ; GFX1250-LABEL: name: wg_st_release_single64 + ; GFX1250: bb.0 (%ir-block.0): + ; GFX1250-NEXT: liveins: $sgpr4_sgpr5 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode + ; GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX1250-NEXT: renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec + ; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0 + ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX1250-NEXT: S_WAIT_DSCNT_soft 0 + ; GFX1250-NEXT: S_WAIT_XCNT_soft 0 + ; GFX1250-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.p.load, addrspace 1) + ; GFX1250-NEXT: S_ENDPGM 0 + store atomic i32 %x, ptr addrspace(1) %p syncscope("workgroup") release, align 4 + ret void +} + +define amdgpu_kernel void @wg_rmw_add_seq_cst_single32(ptr addrspace(1) %p) #0 { + ; GFX9-LABEL: name: wg_rmw_add_seq_cst_single32 + ; GFX9: bb.0 (%ir-block.0): + ; GFX9-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GFX9-NEXT: liveins: $sgpr4_sgpr5 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec + ; GFX9-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec + ; GFX9-NEXT: renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec + ; GFX9-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec + ; GFX9-NEXT: $sgpr2_sgpr3 
= S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX9-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: bb.1 (%ir-block.11): + ; GFX9-NEXT: successors: %bb.2(0x80000000) + ; GFX9-NEXT: liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX9-NEXT: renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc + ; GFX9-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7 + ; GFX9-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX9-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX9-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1) + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: bb.2 (%ir-block.16): + ; GFX9-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: wg_rmw_add_seq_cst_single32 + ; GFX942: bb.0 (%ir-block.0): + ; GFX942-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GFX942-NEXT: liveins: $sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec + ; GFX942-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec + ; GFX942-NEXT: renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec + ; GFX942-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec + ; GFX942-NEXT: $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX942-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: bb.1 (%ir-block.11): + ; GFX942-NEXT: successors: %bb.2(0x80000000) + ; 
GFX942-NEXT: liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX942-NEXT: renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc + ; GFX942-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7 + ; GFX942-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX942-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1) + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: bb.2 (%ir-block.16): + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX10-W32-LABEL: name: wg_rmw_add_seq_cst_single32 + ; GFX10-W32: bb.0 (%ir-block.0): + ; GFX10-W32-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GFX10-W32-NEXT: liveins: $sgpr4_sgpr5 + ; GFX10-W32-NEXT: {{ $}} + ; GFX10-W32-NEXT: $sgpr0 = S_MOV_B32 $exec_lo + ; GFX10-W32-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec + ; GFX10-W32-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec + ; GFX10-W32-NEXT: $sgpr1 = S_AND_SAVEEXEC_B32 killed $vcc_lo, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX10-W32-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; GFX10-W32-NEXT: {{ $}} + ; GFX10-W32-NEXT: bb.1 (%ir-block.7): + ; GFX10-W32-NEXT: successors: %bb.2(0x80000000) + ; GFX10-W32-NEXT: liveins: $sgpr0, $sgpr4_sgpr5 + ; GFX10-W32-NEXT: {{ $}} + ; GFX10-W32-NEXT: early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; 
GFX10-W32-NEXT: renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc + ; GFX10-W32-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX10-W32-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7 + ; GFX10-W32-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-W32-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1) + ; GFX10-W32-NEXT: {{ $}} + ; GFX10-W32-NEXT: bb.2 (%ir-block.11): + ; GFX10-W32-NEXT: S_ENDPGM 0 + ; + ; GFX10-W64-LABEL: name: wg_rmw_add_seq_cst_single32 + ; GFX10-W64: bb.0 (%ir-block.0): + ; GFX10-W64-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GFX10-W64-NEXT: liveins: $sgpr4_sgpr5 + ; GFX10-W64-NEXT: {{ $}} + ; GFX10-W64-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec + ; GFX10-W64-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec + ; GFX10-W64-NEXT: renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec + ; GFX10-W64-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec + ; GFX10-W64-NEXT: $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX10-W64-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; GFX10-W64-NEXT: {{ $}} + ; GFX10-W64-NEXT: bb.1 (%ir-block.11): + ; GFX10-W64-NEXT: successors: %bb.2(0x80000000) + ; GFX10-W64-NEXT: liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5 + ; GFX10-W64-NEXT: {{ $}} + ; GFX10-W64-NEXT: early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX10-W64-NEXT: renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc + ; GFX10-W64-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, 
implicit $exec + ; GFX10-W64-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7 + ; GFX10-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-W64-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1) + ; GFX10-W64-NEXT: {{ $}} + ; GFX10-W64-NEXT: bb.2 (%ir-block.16): + ; GFX10-W64-NEXT: S_ENDPGM 0 + ; + ; GFX12-W32-LABEL: name: wg_rmw_add_seq_cst_single32 + ; GFX12-W32: bb.0 (%ir-block.0): + ; GFX12-W32-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5 + ; GFX12-W32-NEXT: {{ $}} + ; GFX12-W32-NEXT: $sgpr0 = S_MOV_B32 $exec_lo + ; GFX12-W32-NEXT: $sgpr1 = S_MOV_B32 $exec_lo + ; GFX12-W32-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec + ; GFX12-W32-NEXT: V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec + ; GFX12-W32-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; GFX12-W32-NEXT: {{ $}} + ; GFX12-W32-NEXT: bb.1 (%ir-block.7): + ; GFX12-W32-NEXT: successors: %bb.2(0x80000000) + ; GFX12-W32-NEXT: liveins: $sgpr0, $sgpr4_sgpr5 + ; GFX12-W32-NEXT: {{ $}} + ; GFX12-W32-NEXT: renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX12-W32-NEXT: renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc + ; GFX12-W32-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7 + ; GFX12-W32-NEXT: renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 0, killed $sgpr0, implicit $exec, implicit $exec, implicit $exec, implicit $exec + ; GFX12-W32-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst 
(s32) on %ir.p.load, addrspace 1) + ; GFX12-W32-NEXT: {{ $}} + ; GFX12-W32-NEXT: bb.2 (%ir-block.11): + ; GFX12-W32-NEXT: S_ENDPGM 0 + ; + ; GFX12-W64-LABEL: name: wg_rmw_add_seq_cst_single32 + ; GFX12-W64: bb.0 (%ir-block.0): + ; GFX12-W64-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5 + ; GFX12-W64-NEXT: {{ $}} + ; GFX12-W64-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec + ; GFX12-W64-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $exec + ; GFX12-W64-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec + ; GFX12-W64-NEXT: renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec + ; GFX12-W64-NEXT: V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec + ; GFX12-W64-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; GFX12-W64-NEXT: {{ $}} + ; GFX12-W64-NEXT: bb.1 (%ir-block.11): + ; GFX12-W64-NEXT: successors: %bb.2(0x80000000) + ; GFX12-W64-NEXT: liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5 + ; GFX12-W64-NEXT: {{ $}} + ; GFX12-W64-NEXT: renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX12-W64-NEXT: renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc + ; GFX12-W64-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX12-W64-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7 + ; GFX12-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX12-W64-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1) + ; GFX12-W64-NEXT: {{ $}} + ; GFX12-W64-NEXT: bb.2 (%ir-block.16): + ; GFX12-W64-NEXT: S_ENDPGM 0 + ; + ; GFX1250-LABEL: name: wg_rmw_add_seq_cst_single32 + ; GFX1250: bb.0 (%ir-block.0): + ; GFX1250-NEXT: 
successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GFX1250-NEXT: liveins: $sgpr4_sgpr5 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode + ; GFX1250-NEXT: $sgpr0 = S_MOV_B32 $exec_lo + ; GFX1250-NEXT: $sgpr1 = S_MOV_B32 $exec_lo + ; GFX1250-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec + ; GFX1250-NEXT: V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec + ; GFX1250-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: bb.1 (%ir-block.7): + ; GFX1250-NEXT: successors: %bb.2(0x80000000) + ; GFX1250-NEXT: liveins: $sgpr0, $sgpr4_sgpr5 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX1250-NEXT: renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc + ; GFX1250-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7 + ; GFX1250-NEXT: renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 0, killed $sgpr0, implicit $exec, implicit $exec, implicit $exec, implicit $exec + ; GFX1250-NEXT: S_WAIT_XCNT_soft 0 + ; GFX1250-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1) + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: bb.2 (%ir-block.11): + ; GFX1250-NEXT: S_ENDPGM 0 + %r = atomicrmw add ptr addrspace(1) %p, i32 7 syncscope("workgroup") seq_cst + ret void +} + +define amdgpu_kernel void @wg_rmw_add_seq_cst_single64(ptr addrspace(1) %p) #1 { + ; GFX9-LABEL: name: wg_rmw_add_seq_cst_single64 + ; GFX9: bb.0 (%ir-block.0): + ; GFX9-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GFX9-NEXT: liveins: $sgpr4_sgpr5 + ; GFX9-NEXT: {{ $}} + ; 
GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec + ; GFX9-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec + ; GFX9-NEXT: renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec + ; GFX9-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec + ; GFX9-NEXT: $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX9-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: bb.1 (%ir-block.11): + ; GFX9-NEXT: successors: %bb.2(0x80000000) + ; GFX9-NEXT: liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX9-NEXT: renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc + ; GFX9-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7 + ; GFX9-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX9-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX9-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1) + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: bb.2 (%ir-block.16): + ; GFX9-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: wg_rmw_add_seq_cst_single64 + ; GFX942: bb.0 (%ir-block.0): + ; GFX942-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GFX942-NEXT: liveins: $sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec + ; GFX942-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec + ; GFX942-NEXT: renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec + ; GFX942-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, 
implicit-def $vcc, implicit $exec + ; GFX942-NEXT: $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX942-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: bb.1 (%ir-block.11): + ; GFX942-NEXT: successors: %bb.2(0x80000000) + ; GFX942-NEXT: liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX942-NEXT: renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc + ; GFX942-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7 + ; GFX942-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX942-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1) + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: bb.2 (%ir-block.16): + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX10-W32-LABEL: name: wg_rmw_add_seq_cst_single64 + ; GFX10-W32: bb.0 (%ir-block.0): + ; GFX10-W32-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GFX10-W32-NEXT: liveins: $sgpr4_sgpr5 + ; GFX10-W32-NEXT: {{ $}} + ; GFX10-W32-NEXT: $sgpr0 = S_MOV_B32 $exec_lo + ; GFX10-W32-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec + ; GFX10-W32-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec + ; GFX10-W32-NEXT: $sgpr1 = S_AND_SAVEEXEC_B32 killed $vcc_lo, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX10-W32-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; GFX10-W32-NEXT: {{ $}} + ; GFX10-W32-NEXT: bb.1 (%ir-block.7): + ; GFX10-W32-NEXT: successors: 
%bb.2(0x80000000) + ; GFX10-W32-NEXT: liveins: $sgpr0, $sgpr4_sgpr5 + ; GFX10-W32-NEXT: {{ $}} + ; GFX10-W32-NEXT: early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX10-W32-NEXT: renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc + ; GFX10-W32-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX10-W32-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7 + ; GFX10-W32-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-W32-NEXT: S_WAITCNT_soft 112 + ; GFX10-W32-NEXT: S_WAITCNT_lds_direct + ; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 + ; GFX10-W32-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1) + ; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 + ; GFX10-W32-NEXT: BUFFER_GL0_INV implicit $exec + ; GFX10-W32-NEXT: {{ $}} + ; GFX10-W32-NEXT: bb.2 (%ir-block.11): + ; GFX10-W32-NEXT: S_ENDPGM 0 + ; + ; GFX10-W64-LABEL: name: wg_rmw_add_seq_cst_single64 + ; GFX10-W64: bb.0 (%ir-block.0): + ; GFX10-W64-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GFX10-W64-NEXT: liveins: $sgpr4_sgpr5 + ; GFX10-W64-NEXT: {{ $}} + ; GFX10-W64-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec + ; GFX10-W64-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec + ; GFX10-W64-NEXT: renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec + ; GFX10-W64-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec + ; GFX10-W64-NEXT: $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX10-W64-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; GFX10-W64-NEXT: {{ $}} + ; 
GFX10-W64-NEXT: bb.1 (%ir-block.11): + ; GFX10-W64-NEXT: successors: %bb.2(0x80000000) + ; GFX10-W64-NEXT: liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5 + ; GFX10-W64-NEXT: {{ $}} + ; GFX10-W64-NEXT: early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX10-W64-NEXT: renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc + ; GFX10-W64-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX10-W64-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7 + ; GFX10-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-W64-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1) + ; GFX10-W64-NEXT: {{ $}} + ; GFX10-W64-NEXT: bb.2 (%ir-block.16): + ; GFX10-W64-NEXT: S_ENDPGM 0 + ; + ; GFX12-W32-LABEL: name: wg_rmw_add_seq_cst_single64 + ; GFX12-W32: bb.0 (%ir-block.0): + ; GFX12-W32-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5 + ; GFX12-W32-NEXT: {{ $}} + ; GFX12-W32-NEXT: $sgpr0 = S_MOV_B32 $exec_lo + ; GFX12-W32-NEXT: $sgpr1 = S_MOV_B32 $exec_lo + ; GFX12-W32-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec + ; GFX12-W32-NEXT: V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec + ; GFX12-W32-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; GFX12-W32-NEXT: {{ $}} + ; GFX12-W32-NEXT: bb.1 (%ir-block.7): + ; GFX12-W32-NEXT: successors: %bb.2(0x80000000) + ; GFX12-W32-NEXT: liveins: $sgpr0, $sgpr4_sgpr5 + ; GFX12-W32-NEXT: {{ $}} + ; GFX12-W32-NEXT: renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from 
%ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX12-W32-NEXT: renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc + ; GFX12-W32-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7 + ; GFX12-W32-NEXT: renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 0, killed $sgpr0, implicit $exec, implicit $exec, implicit $exec, implicit $exec + ; GFX12-W32-NEXT: S_WAIT_BVHCNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_SAMPLECNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0 + ; GFX12-W32-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 8, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1) + ; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX12-W32-NEXT: GLOBAL_INV 8, implicit $exec + ; GFX12-W32-NEXT: {{ $}} + ; GFX12-W32-NEXT: bb.2 (%ir-block.11): + ; GFX12-W32-NEXT: S_ENDPGM 0 + ; + ; GFX12-W64-LABEL: name: wg_rmw_add_seq_cst_single64 + ; GFX12-W64: bb.0 (%ir-block.0): + ; GFX12-W64-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5 + ; GFX12-W64-NEXT: {{ $}} + ; GFX12-W64-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec + ; GFX12-W64-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $exec + ; GFX12-W64-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec + ; GFX12-W64-NEXT: renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec + ; GFX12-W64-NEXT: V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec + ; GFX12-W64-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; GFX12-W64-NEXT: {{ $}} + ; GFX12-W64-NEXT: bb.1 (%ir-block.11): + ; GFX12-W64-NEXT: successors: %bb.2(0x80000000) + ; GFX12-W64-NEXT: liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5 + ; GFX12-W64-NEXT: {{ $}} + ; GFX12-W64-NEXT: renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM killed renamable 
$sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX12-W64-NEXT: renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc + ; GFX12-W64-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX12-W64-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7 + ; GFX12-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX12-W64-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1) + ; GFX12-W64-NEXT: {{ $}} + ; GFX12-W64-NEXT: bb.2 (%ir-block.16): + ; GFX12-W64-NEXT: S_ENDPGM 0 + ; + ; GFX1250-LABEL: name: wg_rmw_add_seq_cst_single64 + ; GFX1250: bb.0 (%ir-block.0): + ; GFX1250-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GFX1250-NEXT: liveins: $sgpr4_sgpr5 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode + ; GFX1250-NEXT: $sgpr0 = S_MOV_B32 $exec_lo + ; GFX1250-NEXT: $sgpr1 = S_MOV_B32 $exec_lo + ; GFX1250-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec + ; GFX1250-NEXT: V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec + ; GFX1250-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: bb.1 (%ir-block.7): + ; GFX1250-NEXT: successors: %bb.2(0x80000000) + ; GFX1250-NEXT: liveins: $sgpr0, $sgpr4_sgpr5 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX1250-NEXT: renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc + ; GFX1250-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7 + ; GFX1250-NEXT: 
renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 0, killed $sgpr0, implicit $exec, implicit $exec, implicit $exec, implicit $exec + ; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0 + ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX1250-NEXT: S_WAIT_DSCNT_soft 0 + ; GFX1250-NEXT: S_WAIT_XCNT_soft 0 + ; GFX1250-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1) + ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: bb.2 (%ir-block.11): + ; GFX1250-NEXT: S_ENDPGM 0 + %r = atomicrmw add ptr addrspace(1) %p, i32 7 syncscope("workgroup") seq_cst + ret void +} + +define amdgpu_kernel void @wg_rmw_add_seq_cst_multi(ptr addrspace(1) %p) #2 { + ; GFX9-LABEL: name: wg_rmw_add_seq_cst_multi + ; GFX9: bb.0 (%ir-block.0): + ; GFX9-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GFX9-NEXT: liveins: $sgpr4_sgpr5 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec + ; GFX9-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec + ; GFX9-NEXT: renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec + ; GFX9-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec + ; GFX9-NEXT: $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX9-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: bb.1 (%ir-block.11): + ; GFX9-NEXT: successors: %bb.2(0x80000000) + ; GFX9-NEXT: liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX9-NEXT: renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, 
implicit-def dead $scc + ; GFX9-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7 + ; GFX9-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX9-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX9-NEXT: S_WAITCNT_soft 49279 + ; GFX9-NEXT: S_WAITCNT_lds_direct + ; GFX9-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1) + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: bb.2 (%ir-block.16): + ; GFX9-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: wg_rmw_add_seq_cst_multi + ; GFX942: bb.0 (%ir-block.0): + ; GFX942-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GFX942-NEXT: liveins: $sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec + ; GFX942-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec + ; GFX942-NEXT: renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec + ; GFX942-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec + ; GFX942-NEXT: $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX942-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: bb.1 (%ir-block.11): + ; GFX942-NEXT: successors: %bb.2(0x80000000) + ; GFX942-NEXT: liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX942-NEXT: renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc + ; GFX942-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7 + ; GFX942-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX942-NEXT: 
$vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX942-NEXT: S_WAITCNT_soft 49279 + ; GFX942-NEXT: S_WAITCNT_lds_direct + ; GFX942-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1) + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: bb.2 (%ir-block.16): + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX10-W32-LABEL: name: wg_rmw_add_seq_cst_multi + ; GFX10-W32: bb.0 (%ir-block.0): + ; GFX10-W32-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GFX10-W32-NEXT: liveins: $sgpr4_sgpr5 + ; GFX10-W32-NEXT: {{ $}} + ; GFX10-W32-NEXT: $sgpr0 = S_MOV_B32 $exec_lo + ; GFX10-W32-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec + ; GFX10-W32-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec + ; GFX10-W32-NEXT: $sgpr1 = S_AND_SAVEEXEC_B32 killed $vcc_lo, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX10-W32-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; GFX10-W32-NEXT: {{ $}} + ; GFX10-W32-NEXT: bb.1 (%ir-block.7): + ; GFX10-W32-NEXT: successors: %bb.2(0x80000000) + ; GFX10-W32-NEXT: liveins: $sgpr0, $sgpr4_sgpr5 + ; GFX10-W32-NEXT: {{ $}} + ; GFX10-W32-NEXT: early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX10-W32-NEXT: renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc + ; GFX10-W32-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX10-W32-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7 + ; GFX10-W32-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-W32-NEXT: S_WAITCNT_soft 112 + ; GFX10-W32-NEXT: S_WAITCNT_lds_direct + ; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 + ; GFX10-W32-NEXT: 
GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1) + ; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 + ; GFX10-W32-NEXT: BUFFER_GL0_INV implicit $exec + ; GFX10-W32-NEXT: {{ $}} + ; GFX10-W32-NEXT: bb.2 (%ir-block.11): + ; GFX10-W32-NEXT: S_ENDPGM 0 + ; + ; GFX10-W64-LABEL: name: wg_rmw_add_seq_cst_multi + ; GFX10-W64: bb.0 (%ir-block.0): + ; GFX10-W64-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GFX10-W64-NEXT: liveins: $sgpr4_sgpr5 + ; GFX10-W64-NEXT: {{ $}} + ; GFX10-W64-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec + ; GFX10-W64-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec + ; GFX10-W64-NEXT: renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec + ; GFX10-W64-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec + ; GFX10-W64-NEXT: $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX10-W64-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; GFX10-W64-NEXT: {{ $}} + ; GFX10-W64-NEXT: bb.1 (%ir-block.11): + ; GFX10-W64-NEXT: successors: %bb.2(0x80000000) + ; GFX10-W64-NEXT: liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5 + ; GFX10-W64-NEXT: {{ $}} + ; GFX10-W64-NEXT: early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX10-W64-NEXT: renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc + ; GFX10-W64-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX10-W64-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7 + ; GFX10-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-W64-NEXT: S_WAITCNT_soft 112 + ; GFX10-W64-NEXT: 
S_WAITCNT_lds_direct + ; GFX10-W64-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 + ; GFX10-W64-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1) + ; GFX10-W64-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 + ; GFX10-W64-NEXT: BUFFER_GL0_INV implicit $exec + ; GFX10-W64-NEXT: {{ $}} + ; GFX10-W64-NEXT: bb.2 (%ir-block.16): + ; GFX10-W64-NEXT: S_ENDPGM 0 + ; + ; GFX12-W32-LABEL: name: wg_rmw_add_seq_cst_multi + ; GFX12-W32: bb.0 (%ir-block.0): + ; GFX12-W32-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5 + ; GFX12-W32-NEXT: {{ $}} + ; GFX12-W32-NEXT: $sgpr0 = S_MOV_B32 $exec_lo + ; GFX12-W32-NEXT: $sgpr1 = S_MOV_B32 $exec_lo + ; GFX12-W32-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec + ; GFX12-W32-NEXT: V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec + ; GFX12-W32-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; GFX12-W32-NEXT: {{ $}} + ; GFX12-W32-NEXT: bb.1 (%ir-block.7): + ; GFX12-W32-NEXT: successors: %bb.2(0x80000000) + ; GFX12-W32-NEXT: liveins: $sgpr0, $sgpr4_sgpr5 + ; GFX12-W32-NEXT: {{ $}} + ; GFX12-W32-NEXT: renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX12-W32-NEXT: renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc + ; GFX12-W32-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7 + ; GFX12-W32-NEXT: renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 0, killed $sgpr0, implicit $exec, implicit $exec, implicit $exec, implicit $exec + ; GFX12-W32-NEXT: S_WAIT_BVHCNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_SAMPLECNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0 + ; 
GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0 + ; GFX12-W32-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 8, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1) + ; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX12-W32-NEXT: GLOBAL_INV 8, implicit $exec + ; GFX12-W32-NEXT: {{ $}} + ; GFX12-W32-NEXT: bb.2 (%ir-block.11): + ; GFX12-W32-NEXT: S_ENDPGM 0 + ; + ; GFX12-W64-LABEL: name: wg_rmw_add_seq_cst_multi + ; GFX12-W64: bb.0 (%ir-block.0): + ; GFX12-W64-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5 + ; GFX12-W64-NEXT: {{ $}} + ; GFX12-W64-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec + ; GFX12-W64-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $exec + ; GFX12-W64-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec + ; GFX12-W64-NEXT: renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec + ; GFX12-W64-NEXT: V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec + ; GFX12-W64-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; GFX12-W64-NEXT: {{ $}} + ; GFX12-W64-NEXT: bb.1 (%ir-block.11): + ; GFX12-W64-NEXT: successors: %bb.2(0x80000000) + ; GFX12-W64-NEXT: liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5 + ; GFX12-W64-NEXT: {{ $}} + ; GFX12-W64-NEXT: renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX12-W64-NEXT: renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc + ; GFX12-W64-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX12-W64-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7 + ; GFX12-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX12-W64-NEXT: S_WAIT_BVHCNT_soft 0 + ; GFX12-W64-NEXT: S_WAIT_SAMPLECNT_soft 0 + ; GFX12-W64-NEXT: 
S_WAIT_LOADCNT_soft 0 + ; GFX12-W64-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX12-W64-NEXT: S_WAIT_DSCNT_soft 0 + ; GFX12-W64-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 8, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1) + ; GFX12-W64-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX12-W64-NEXT: GLOBAL_INV 8, implicit $exec + ; GFX12-W64-NEXT: {{ $}} + ; GFX12-W64-NEXT: bb.2 (%ir-block.16): + ; GFX12-W64-NEXT: S_ENDPGM 0 + ; + ; GFX1250-LABEL: name: wg_rmw_add_seq_cst_multi + ; GFX1250: bb.0 (%ir-block.0): + ; GFX1250-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GFX1250-NEXT: liveins: $sgpr4_sgpr5 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode + ; GFX1250-NEXT: $sgpr0 = S_MOV_B32 $exec_lo + ; GFX1250-NEXT: $sgpr1 = S_MOV_B32 $exec_lo + ; GFX1250-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec + ; GFX1250-NEXT: V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec + ; GFX1250-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: bb.1 (%ir-block.7): + ; GFX1250-NEXT: successors: %bb.2(0x80000000) + ; GFX1250-NEXT: liveins: $sgpr0, $sgpr4_sgpr5 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: early-clobber renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX1250-NEXT: renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc + ; GFX1250-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 7 + ; GFX1250-NEXT: renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 0, killed $sgpr0, implicit $exec, implicit $exec, implicit $exec, implicit $exec + ; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0 + ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX1250-NEXT: 
S_WAIT_DSCNT_soft 0 + ; GFX1250-NEXT: S_WAIT_XCNT_soft 0 + ; GFX1250-NEXT: GLOBAL_ATOMIC_ADD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst (s32) on %ir.p.load, addrspace 1) + ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: bb.2 (%ir-block.11): + ; GFX1250-NEXT: S_ENDPGM 0 + %r = atomicrmw add ptr addrspace(1) %p, i32 7 syncscope("workgroup") seq_cst + ret void +} + +define amdgpu_kernel void @wg_rmw_xchg_acq_rel_single64(ptr addrspace(1) %p, i32 %x) #1 { + ; GFX9-LABEL: name: wg_rmw_xchg_acq_rel_single64 + ; GFX9: bb.0 (%ir-block.0): + ; GFX9-NEXT: liveins: $sgpr4_sgpr5 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4) + ; GFX9-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX9-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX9-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX9-NEXT: GLOBAL_ATOMIC_SWAP_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 1) + ; GFX9-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: wg_rmw_xchg_acq_rel_single64 + ; GFX942: bb.0 (%ir-block.0): + ; GFX942-NEXT: liveins: $sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4) + ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from 
%ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX942-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX942-NEXT: GLOBAL_ATOMIC_SWAP_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 1) + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX10-W32-LABEL: name: wg_rmw_xchg_acq_rel_single64 + ; GFX10-W32: bb.0 (%ir-block.0): + ; GFX10-W32-NEXT: liveins: $sgpr4_sgpr5 + ; GFX10-W32-NEXT: {{ $}} + ; GFX10-W32-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4) + ; GFX10-W32-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX10-W32-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX10-W32-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-W32-NEXT: S_WAITCNT_soft 112 + ; GFX10-W32-NEXT: S_WAITCNT_lds_direct + ; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 + ; GFX10-W32-NEXT: GLOBAL_ATOMIC_SWAP_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 1) + ; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 + ; GFX10-W32-NEXT: BUFFER_GL0_INV implicit $exec + ; GFX10-W32-NEXT: S_ENDPGM 0 + ; + ; GFX10-W64-LABEL: name: wg_rmw_xchg_acq_rel_single64 + ; GFX10-W64: bb.0 (%ir-block.0): + ; GFX10-W64-NEXT: liveins: $sgpr4_sgpr5 + ; GFX10-W64-NEXT: {{ $}} + ; GFX10-W64-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4) + ; 
GFX10-W64-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX10-W64-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX10-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-W64-NEXT: GLOBAL_ATOMIC_SWAP_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 1) + ; GFX10-W64-NEXT: S_ENDPGM 0 + ; + ; GFX12-W32-LABEL: name: wg_rmw_xchg_acq_rel_single64 + ; GFX12-W32: bb.0 (%ir-block.0): + ; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5 + ; GFX12-W32-NEXT: {{ $}} + ; GFX12-W32-NEXT: renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX12-W32-NEXT: renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec + ; GFX12-W32-NEXT: S_WAIT_BVHCNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_SAMPLECNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0 + ; GFX12-W32-NEXT: GLOBAL_ATOMIC_SWAP_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 8, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 1) + ; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX12-W32-NEXT: GLOBAL_INV 8, implicit $exec + ; GFX12-W32-NEXT: S_ENDPGM 0 + ; + ; GFX12-W64-LABEL: name: wg_rmw_xchg_acq_rel_single64 + ; GFX12-W64: bb.0 (%ir-block.0): + ; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5 + ; GFX12-W64-NEXT: {{ $}} + ; GFX12-W64-NEXT: renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: 
(dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX12-W64-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX12-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX12-W64-NEXT: GLOBAL_ATOMIC_SWAP_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 1) + ; GFX12-W64-NEXT: S_ENDPGM 0 + ; + ; GFX1250-LABEL: name: wg_rmw_xchg_acq_rel_single64 + ; GFX1250: bb.0 (%ir-block.0): + ; GFX1250-NEXT: liveins: $sgpr4_sgpr5 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode + ; GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX1250-NEXT: renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec + ; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0 + ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX1250-NEXT: S_WAIT_DSCNT_soft 0 + ; GFX1250-NEXT: S_WAIT_XCNT_soft 0 + ; GFX1250-NEXT: GLOBAL_ATOMIC_SWAP_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 1) + ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX1250-NEXT: S_ENDPGM 0 + %r = atomicrmw xchg ptr addrspace(1) %p, i32 %x syncscope("workgroup") acq_rel + ret void +} + +define amdgpu_kernel void @wg_cmpxchg_acq_rel_monotonic_single64(ptr addrspace(1) %p, i32 %cmp, i32 %new) #1 { + ; GFX9-LABEL: name: wg_cmpxchg_acq_rel_monotonic_single64 + ; GFX9: bb.0 (%ir-block.0): + ; GFX9-NEXT: liveins: $sgpr4_sgpr5 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = 
S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX9-NEXT: renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec + ; GFX9-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec + ; GFX9-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX9-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.p.load, addrspace 1) + ; GFX9-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: wg_cmpxchg_acq_rel_monotonic_single64 + ; GFX942: bb.0 (%ir-block.0): + ; GFX942-NEXT: liveins: $sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX942-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec + ; GFX942-NEXT: $vgpr3 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX942-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr0, killed renamable $vgpr2_vgpr3, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.p.load, addrspace 1) + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX10-W32-LABEL: name: wg_cmpxchg_acq_rel_monotonic_single64 + ; GFX10-W32: bb.0 (%ir-block.0): + ; GFX10-W32-NEXT: liveins: $sgpr4_sgpr5 + ; GFX10-W32-NEXT: {{ $}} + ; GFX10-W32-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX10-W32-NEXT: renamable $vgpr2 = V_MOV_B32_e32 0, implicit 
$exec + ; GFX10-W32-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec + ; GFX10-W32-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-W32-NEXT: S_WAITCNT_soft 112 + ; GFX10-W32-NEXT: S_WAITCNT_lds_direct + ; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 + ; GFX10-W32-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.p.load, addrspace 1) + ; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 + ; GFX10-W32-NEXT: BUFFER_GL0_INV implicit $exec + ; GFX10-W32-NEXT: S_ENDPGM 0 + ; + ; GFX10-W64-LABEL: name: wg_cmpxchg_acq_rel_monotonic_single64 + ; GFX10-W64: bb.0 (%ir-block.0): + ; GFX10-W64-NEXT: liveins: $sgpr4_sgpr5 + ; GFX10-W64-NEXT: {{ $}} + ; GFX10-W64-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX10-W64-NEXT: renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec + ; GFX10-W64-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec + ; GFX10-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-W64-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.p.load, addrspace 1) + ; GFX10-W64-NEXT: S_ENDPGM 0 + ; + ; GFX12-W32-LABEL: name: wg_cmpxchg_acq_rel_monotonic_single64 + ; GFX12-W32: bb.0 (%ir-block.0): + ; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5 + ; GFX12-W32-NEXT: {{ $}} + ; GFX12-W32-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, 
addrspace 4) + ; GFX12-W32-NEXT: renamable $vgpr2, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec + ; GFX12-W32-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec + ; GFX12-W32-NEXT: S_WAIT_BVHCNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_SAMPLECNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0 + ; GFX12-W32-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 8, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.p.load, addrspace 1) + ; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX12-W32-NEXT: GLOBAL_INV 8, implicit $exec + ; GFX12-W32-NEXT: S_ENDPGM 0 + ; + ; GFX12-W64-LABEL: name: wg_cmpxchg_acq_rel_monotonic_single64 + ; GFX12-W64: bb.0 (%ir-block.0): + ; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5 + ; GFX12-W64-NEXT: {{ $}} + ; GFX12-W64-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX12-W64-NEXT: renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec + ; GFX12-W64-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec + ; GFX12-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX12-W64-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.p.load, addrspace 1) + ; GFX12-W64-NEXT: S_ENDPGM 0 + ; + ; GFX1250-LABEL: name: wg_cmpxchg_acq_rel_monotonic_single64 + ; GFX1250: bb.0 (%ir-block.0): + ; GFX1250-NEXT: liveins: $sgpr4_sgpr5 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode + 
; GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX1250-NEXT: renamable $vgpr2, $vgpr0 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_e96_gfx1250 0, killed $sgpr3, implicit $exec, implicit $exec, implicit $exec, implicit $exec + ; GFX1250-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0 + ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX1250-NEXT: S_WAIT_DSCNT_soft 0 + ; GFX1250-NEXT: S_WAIT_XCNT_soft 0 + ; GFX1250-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.p.load, addrspace 1) + ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX1250-NEXT: S_ENDPGM 0 + %r = cmpxchg ptr addrspace(1) %p, i32 %cmp, i32 %new syncscope("workgroup") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @wg_cmpxchg_seq_cst_seq_cst_multi(ptr addrspace(1) %p, i32 %cmp, i32 %new) #2 { + ; GFX9-LABEL: name: wg_cmpxchg_seq_cst_seq_cst_multi + ; GFX9: bb.0 (%ir-block.0): + ; GFX9-NEXT: liveins: $sgpr4_sgpr5 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX9-NEXT: renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec + ; GFX9-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec + ; GFX9-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX9-NEXT: S_WAITCNT_soft 49279 + ; GFX9-NEXT: S_WAITCNT_lds_direct + ; GFX9-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store 
syncscope("workgroup") seq_cst seq_cst (s32) on %ir.p.load, addrspace 1) + ; GFX9-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: wg_cmpxchg_seq_cst_seq_cst_multi + ; GFX942: bb.0 (%ir-block.0): + ; GFX942-NEXT: liveins: $sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX942-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec + ; GFX942-NEXT: $vgpr3 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX942-NEXT: S_WAITCNT_soft 49279 + ; GFX942-NEXT: S_WAITCNT_lds_direct + ; GFX942-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr0, killed renamable $vgpr2_vgpr3, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst seq_cst (s32) on %ir.p.load, addrspace 1) + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX10-LABEL: name: wg_cmpxchg_seq_cst_seq_cst_multi + ; GFX10: bb.0 (%ir-block.0): + ; GFX10-NEXT: liveins: $sgpr4_sgpr5 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX10-NEXT: renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec + ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec + ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-NEXT: S_WAITCNT_soft 112 + ; GFX10-NEXT: S_WAITCNT_lds_direct + ; GFX10-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 + ; GFX10-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst seq_cst 
(s32) on %ir.p.load, addrspace 1) + ; GFX10-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 + ; GFX10-NEXT: BUFFER_GL0_INV implicit $exec + ; GFX10-NEXT: S_ENDPGM 0 + ; + ; GFX12-W32-LABEL: name: wg_cmpxchg_seq_cst_seq_cst_multi + ; GFX12-W32: bb.0 (%ir-block.0): + ; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5 + ; GFX12-W32-NEXT: {{ $}} + ; GFX12-W32-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX12-W32-NEXT: renamable $vgpr2, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec + ; GFX12-W32-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec + ; GFX12-W32-NEXT: S_WAIT_BVHCNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_SAMPLECNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0 + ; GFX12-W32-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 8, implicit $exec :: (load store syncscope("workgroup") seq_cst seq_cst (s32) on %ir.p.load, addrspace 1) + ; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX12-W32-NEXT: GLOBAL_INV 8, implicit $exec + ; GFX12-W32-NEXT: S_ENDPGM 0 + ; + ; GFX12-W64-LABEL: name: wg_cmpxchg_seq_cst_seq_cst_multi + ; GFX12-W64: bb.0 (%ir-block.0): + ; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5 + ; GFX12-W64-NEXT: {{ $}} + ; GFX12-W64-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX12-W64-NEXT: renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec + ; GFX12-W64-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec + ; GFX12-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; 
GFX12-W64-NEXT: S_WAIT_BVHCNT_soft 0 + ; GFX12-W64-NEXT: S_WAIT_SAMPLECNT_soft 0 + ; GFX12-W64-NEXT: S_WAIT_LOADCNT_soft 0 + ; GFX12-W64-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX12-W64-NEXT: S_WAIT_DSCNT_soft 0 + ; GFX12-W64-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 8, implicit $exec :: (load store syncscope("workgroup") seq_cst seq_cst (s32) on %ir.p.load, addrspace 1) + ; GFX12-W64-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX12-W64-NEXT: GLOBAL_INV 8, implicit $exec + ; GFX12-W64-NEXT: S_ENDPGM 0 + ; + ; GFX1250-LABEL: name: wg_cmpxchg_seq_cst_seq_cst_multi + ; GFX1250: bb.0 (%ir-block.0): + ; GFX1250-NEXT: liveins: $sgpr4_sgpr5 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode + ; GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX1250-NEXT: renamable $vgpr2, $vgpr0 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_e96_gfx1250 0, killed $sgpr3, implicit $exec, implicit $exec, implicit $exec, implicit $exec + ; GFX1250-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0 + ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX1250-NEXT: S_WAIT_DSCNT_soft 0 + ; GFX1250-NEXT: S_WAIT_XCNT_soft 0 + ; GFX1250-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") seq_cst seq_cst (s32) on %ir.p.load, addrspace 1) + ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX1250-NEXT: S_ENDPGM 0 + %r = cmpxchg ptr addrspace(1) %p, i32 %cmp, i32 %new syncscope("workgroup") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @wg_cmpxchg_acquire_acquire_single64(ptr addrspace(1) %p, i32 %cmp, i32 %new) #1 { + ; GFX9-LABEL: 
name: wg_cmpxchg_acquire_acquire_single64 + ; GFX9: bb.0 (%ir-block.0): + ; GFX9-NEXT: liveins: $sgpr4_sgpr5 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX9-NEXT: renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec + ; GFX9-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec + ; GFX9-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX9-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acquire acquire (s32) on %ir.p.load, addrspace 1) + ; GFX9-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: wg_cmpxchg_acquire_acquire_single64 + ; GFX942: bb.0 (%ir-block.0): + ; GFX942-NEXT: liveins: $sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX942-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec + ; GFX942-NEXT: $vgpr3 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX942-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr0, killed renamable $vgpr2_vgpr3, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acquire acquire (s32) on %ir.p.load, addrspace 1) + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX10-W32-LABEL: name: wg_cmpxchg_acquire_acquire_single64 + ; GFX10-W32: bb.0 (%ir-block.0): + ; GFX10-W32-NEXT: liveins: $sgpr4_sgpr5 + ; GFX10-W32-NEXT: {{ $}} + ; GFX10-W32-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec 
killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX10-W32-NEXT: renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec + ; GFX10-W32-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec + ; GFX10-W32-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-W32-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acquire acquire (s32) on %ir.p.load, addrspace 1) + ; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 + ; GFX10-W32-NEXT: BUFFER_GL0_INV implicit $exec + ; GFX10-W32-NEXT: S_ENDPGM 0 + ; + ; GFX10-W64-LABEL: name: wg_cmpxchg_acquire_acquire_single64 + ; GFX10-W64: bb.0 (%ir-block.0): + ; GFX10-W64-NEXT: liveins: $sgpr4_sgpr5 + ; GFX10-W64-NEXT: {{ $}} + ; GFX10-W64-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX10-W64-NEXT: renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec + ; GFX10-W64-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec + ; GFX10-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-W64-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acquire acquire (s32) on %ir.p.load, addrspace 1) + ; GFX10-W64-NEXT: S_ENDPGM 0 + ; + ; GFX12-W32-LABEL: name: wg_cmpxchg_acquire_acquire_single64 + ; GFX12-W32: bb.0 (%ir-block.0): + ; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5 + ; GFX12-W32-NEXT: {{ $}} + ; GFX12-W32-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load 
(s128) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX12-W32-NEXT: renamable $vgpr2, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec + ; GFX12-W32-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec + ; GFX12-W32-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 8, implicit $exec :: (load store syncscope("workgroup") acquire acquire (s32) on %ir.p.load, addrspace 1) + ; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX12-W32-NEXT: GLOBAL_INV 8, implicit $exec + ; GFX12-W32-NEXT: S_ENDPGM 0 + ; + ; GFX12-W64-LABEL: name: wg_cmpxchg_acquire_acquire_single64 + ; GFX12-W64: bb.0 (%ir-block.0): + ; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5 + ; GFX12-W64-NEXT: {{ $}} + ; GFX12-W64-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX12-W64-NEXT: renamable $vgpr2 = V_MOV_B32_e32 0, implicit $exec + ; GFX12-W64-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec + ; GFX12-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX12-W64-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acquire acquire (s32) on %ir.p.load, addrspace 1) + ; GFX12-W64-NEXT: S_ENDPGM 0 + ; + ; GFX1250-LABEL: name: wg_cmpxchg_acquire_acquire_single64 + ; GFX1250: bb.0 (%ir-block.0): + ; GFX1250-NEXT: liveins: $sgpr4_sgpr5 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode + ; GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s128) from 
%ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX1250-NEXT: renamable $vgpr2, $vgpr0 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_e96_gfx1250 0, killed $sgpr3, implicit $exec, implicit $exec, implicit $exec, implicit $exec + ; GFX1250-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX1250-NEXT: S_WAIT_XCNT_soft 0 + ; GFX1250-NEXT: GLOBAL_ATOMIC_CMPSWAP_SADDR killed renamable $vgpr2, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acquire acquire (s32) on %ir.p.load, addrspace 1) + ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX1250-NEXT: S_ENDPGM 0 + %r = cmpxchg ptr addrspace(1) %p, i32 %cmp, i32 %new syncscope("workgroup") acquire acquire + ret void +} + +define amdgpu_kernel void @lds_wg_ld_seq_cst_single32(ptr addrspace(3) %p) #0 { + ; GFX9-LABEL: name: lds_wg_ld_seq_cst_single32 + ; GFX9: bb.0 (%ir-block.0): + ; GFX9-NEXT: liveins: $sgpr4_sgpr5 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4) + ; GFX9-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX9-NEXT: dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3) + ; GFX9-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: lds_wg_ld_seq_cst_single32 + ; GFX942: bb.0 (%ir-block.0): + ; GFX942-NEXT: liveins: $sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4) + ; GFX942-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX942-NEXT: dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from 
%ir.p.load, addrspace 3) + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX10-LABEL: name: lds_wg_ld_seq_cst_single32 + ; GFX10: bb.0 (%ir-block.0): + ; GFX10-NEXT: liveins: $sgpr4_sgpr5 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4) + ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-NEXT: dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3) + ; GFX10-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: lds_wg_ld_seq_cst_single32 + ; GFX12: bb.0 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr4_sgpr5 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4) + ; GFX12-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX12-NEXT: dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3) + ; GFX12-NEXT: S_ENDPGM 0 + ; + ; GFX1250-LABEL: name: lds_wg_ld_seq_cst_single32 + ; GFX1250: bb.0 (%ir-block.0): + ; GFX1250-NEXT: liveins: $sgpr4_sgpr5 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode + ; GFX1250-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4) + ; GFX1250-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX1250-NEXT: dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3) + ; GFX1250-NEXT: S_ENDPGM 0 + %v = load atomic i32, ptr 
addrspace(3) %p syncscope("workgroup") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @lds_wg_ld_seq_cst_single64(ptr addrspace(3) %p) #1 { + ; GFX9-LABEL: name: lds_wg_ld_seq_cst_single64 + ; GFX9: bb.0 (%ir-block.0): + ; GFX9-NEXT: liveins: $sgpr4_sgpr5 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4) + ; GFX9-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX9-NEXT: dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3) + ; GFX9-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: lds_wg_ld_seq_cst_single64 + ; GFX942: bb.0 (%ir-block.0): + ; GFX942-NEXT: liveins: $sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4) + ; GFX942-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX942-NEXT: dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3) + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX10-W32-LABEL: name: lds_wg_ld_seq_cst_single64 + ; GFX10-W32: bb.0 (%ir-block.0): + ; GFX10-W32-NEXT: liveins: $sgpr4_sgpr5 + ; GFX10-W32-NEXT: {{ $}} + ; GFX10-W32-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4) + ; GFX10-W32-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-W32-NEXT: S_WAITCNT_soft 112 + ; GFX10-W32-NEXT: S_WAITCNT_lds_direct + ; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 + ; GFX10-W32-NEXT: dead renamable $vgpr0 = DS_READ_B32_gfx9 killed 
renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3) + ; GFX10-W32-NEXT: S_WAITCNT_soft 49279 + ; GFX10-W32-NEXT: S_WAITCNT_lds_direct + ; GFX10-W32-NEXT: BUFFER_GL0_INV implicit $exec + ; GFX10-W32-NEXT: S_ENDPGM 0 + ; + ; GFX10-W64-LABEL: name: lds_wg_ld_seq_cst_single64 + ; GFX10-W64: bb.0 (%ir-block.0): + ; GFX10-W64-NEXT: liveins: $sgpr4_sgpr5 + ; GFX10-W64-NEXT: {{ $}} + ; GFX10-W64-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4) + ; GFX10-W64-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-W64-NEXT: dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3) + ; GFX10-W64-NEXT: S_ENDPGM 0 + ; + ; GFX12-W32-LABEL: name: lds_wg_ld_seq_cst_single64 + ; GFX12-W32: bb.0 (%ir-block.0): + ; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5 + ; GFX12-W32-NEXT: {{ $}} + ; GFX12-W32-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4) + ; GFX12-W32-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX12-W32-NEXT: S_WAIT_BVHCNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_SAMPLECNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0 + ; GFX12-W32-NEXT: dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3) + ; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0 + ; GFX12-W32-NEXT: GLOBAL_INV 8, implicit $exec + ; GFX12-W32-NEXT: S_ENDPGM 0 + ; + ; GFX12-W64-LABEL: name: lds_wg_ld_seq_cst_single64 + ; GFX12-W64: bb.0 (%ir-block.0): + ; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5 + ; 
GFX12-W64-NEXT: {{ $}} + ; GFX12-W64-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4) + ; GFX12-W64-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX12-W64-NEXT: dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3) + ; GFX12-W64-NEXT: S_ENDPGM 0 + ; + ; GFX1250-LABEL: name: lds_wg_ld_seq_cst_single64 + ; GFX1250: bb.0 (%ir-block.0): + ; GFX1250-NEXT: liveins: $sgpr4_sgpr5 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode + ; GFX1250-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4) + ; GFX1250-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0 + ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX1250-NEXT: S_WAIT_DSCNT_soft 0 + ; GFX1250-NEXT: dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3) + ; GFX1250-NEXT: S_WAIT_DSCNT_soft 0 + ; GFX1250-NEXT: S_ENDPGM 0 + %v = load atomic i32, ptr addrspace(3) %p syncscope("workgroup") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @lds_wg_ld_seq_cst_multi(ptr addrspace(3) %p) #2 { + ; GFX9-LABEL: name: lds_wg_ld_seq_cst_multi + ; GFX9: bb.0 (%ir-block.0): + ; GFX9-NEXT: liveins: $sgpr4_sgpr5 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4) + ; GFX9-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX9-NEXT: S_WAITCNT_soft 49279 + ; GFX9-NEXT: S_WAITCNT_lds_direct 
+ ; GFX9-NEXT: dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3) + ; GFX9-NEXT: S_WAITCNT_soft 49279 + ; GFX9-NEXT: S_WAITCNT_lds_direct + ; GFX9-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: lds_wg_ld_seq_cst_multi + ; GFX942: bb.0 (%ir-block.0): + ; GFX942-NEXT: liveins: $sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4) + ; GFX942-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX942-NEXT: S_WAITCNT_soft 49279 + ; GFX942-NEXT: S_WAITCNT_lds_direct + ; GFX942-NEXT: dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3) + ; GFX942-NEXT: S_WAITCNT_soft 49279 + ; GFX942-NEXT: S_WAITCNT_lds_direct + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX10-LABEL: name: lds_wg_ld_seq_cst_multi + ; GFX10: bb.0 (%ir-block.0): + ; GFX10-NEXT: liveins: $sgpr4_sgpr5 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4) + ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-NEXT: S_WAITCNT_soft 112 + ; GFX10-NEXT: S_WAITCNT_lds_direct + ; GFX10-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 + ; GFX10-NEXT: dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3) + ; GFX10-NEXT: S_WAITCNT_soft 49279 + ; GFX10-NEXT: S_WAITCNT_lds_direct + ; GFX10-NEXT: BUFFER_GL0_INV implicit $exec + ; GFX10-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: lds_wg_ld_seq_cst_multi + ; GFX12: bb.0 (%ir-block.0): + ; GFX12-NEXT: liveins: 
$sgpr4_sgpr5 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4) + ; GFX12-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX12-NEXT: S_WAIT_BVHCNT_soft 0 + ; GFX12-NEXT: S_WAIT_SAMPLECNT_soft 0 + ; GFX12-NEXT: S_WAIT_LOADCNT_soft 0 + ; GFX12-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX12-NEXT: S_WAIT_DSCNT_soft 0 + ; GFX12-NEXT: dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3) + ; GFX12-NEXT: S_WAIT_DSCNT_soft 0 + ; GFX12-NEXT: GLOBAL_INV 8, implicit $exec + ; GFX12-NEXT: S_ENDPGM 0 + ; + ; GFX1250-LABEL: name: lds_wg_ld_seq_cst_multi + ; GFX1250: bb.0 (%ir-block.0): + ; GFX1250-NEXT: liveins: $sgpr4_sgpr5 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode + ; GFX1250-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4) + ; GFX1250-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0 + ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX1250-NEXT: S_WAIT_DSCNT_soft 0 + ; GFX1250-NEXT: dead renamable $vgpr0 = DS_READ_B32_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load syncscope("workgroup") seq_cst (s32) from %ir.p.load, addrspace 3) + ; GFX1250-NEXT: S_WAIT_DSCNT_soft 0 + ; GFX1250-NEXT: S_ENDPGM 0 + %v = load atomic i32, ptr addrspace(3) %p syncscope("workgroup") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @lds_wg_st_release_single64(ptr addrspace(3) %p, i32 %x) #1 { + ; GFX9-LABEL: name: lds_wg_st_release_single64 + ; GFX9: bb.0 (%ir-block.0): + ; GFX9-NEXT: liveins: $sgpr4_sgpr5 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: early-clobber renamable 
$sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX9-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX9-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec + ; GFX9-NEXT: DS_WRITE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.2, addrspace 3) + ; GFX9-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: lds_wg_st_release_single64 + ; GFX942: bb.0 (%ir-block.0): + ; GFX942-NEXT: liveins: $sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX942-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec + ; GFX942-NEXT: DS_WRITE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.2, addrspace 3) + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX10-W32-LABEL: name: lds_wg_st_release_single64 + ; GFX10-W32: bb.0 (%ir-block.0): + ; GFX10-W32-NEXT: liveins: $sgpr4_sgpr5 + ; GFX10-W32-NEXT: {{ $}} + ; GFX10-W32-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX10-W32-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-W32-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec + ; GFX10-W32-NEXT: S_WAITCNT_soft 112 + ; GFX10-W32-NEXT: S_WAITCNT_lds_direct + ; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 + ; GFX10-W32-NEXT: DS_WRITE_B32_gfx9 
killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.2, addrspace 3) + ; GFX10-W32-NEXT: S_ENDPGM 0 + ; + ; GFX10-W64-LABEL: name: lds_wg_st_release_single64 + ; GFX10-W64: bb.0 (%ir-block.0): + ; GFX10-W64-NEXT: liveins: $sgpr4_sgpr5 + ; GFX10-W64-NEXT: {{ $}} + ; GFX10-W64-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX10-W64-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec + ; GFX10-W64-NEXT: DS_WRITE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.2, addrspace 3) + ; GFX10-W64-NEXT: S_ENDPGM 0 + ; + ; GFX12-W32-LABEL: name: lds_wg_st_release_single64 + ; GFX12-W32: bb.0 (%ir-block.0): + ; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5 + ; GFX12-W32-NEXT: {{ $}} + ; GFX12-W32-NEXT: renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX12-W32-NEXT: $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 killed $sgpr0, killed $sgpr1, implicit $exec, implicit $exec, implicit $exec, implicit $exec, implicit $exec + ; GFX12-W32-NEXT: S_WAIT_BVHCNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_SAMPLECNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0 + ; GFX12-W32-NEXT: DS_WRITE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.2, addrspace 3) + ; GFX12-W32-NEXT: S_ENDPGM 0 + ; + ; GFX12-W64-LABEL: name: lds_wg_st_release_single64 + ; GFX12-W64: bb.0 (%ir-block.0): + ; 
GFX12-W64-NEXT: liveins: $sgpr4_sgpr5 + ; GFX12-W64-NEXT: {{ $}} + ; GFX12-W64-NEXT: renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX12-W64-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX12-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec + ; GFX12-W64-NEXT: DS_WRITE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.2, addrspace 3) + ; GFX12-W64-NEXT: S_ENDPGM 0 + ; + ; GFX1250-LABEL: name: lds_wg_st_release_single64 + ; GFX1250: bb.0 (%ir-block.0): + ; GFX1250-NEXT: liveins: $sgpr4_sgpr5 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode + ; GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX1250-NEXT: $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 killed $sgpr0, killed $sgpr1, implicit $exec, implicit $exec, implicit $exec, implicit $exec, implicit $exec + ; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0 + ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX1250-NEXT: S_WAIT_DSCNT_soft 0 + ; GFX1250-NEXT: DS_WRITE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (store syncscope("workgroup") release (s32) into %ir.2, addrspace 3) + ; GFX1250-NEXT: S_ENDPGM 0 + store atomic i32 %x, ptr addrspace(3) %p syncscope("workgroup") release, align 4 + ret void +} + +define amdgpu_kernel void @lds_wg_rmw_add_acq_rel_single64(ptr addrspace(3) %p) #1 { + ; GFX9-LABEL: name: lds_wg_rmw_add_acq_rel_single64 + ; GFX9: bb.0 (%ir-block.0): + ; GFX9-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GFX9-NEXT: liveins: $sgpr4_sgpr5 + ; GFX9-NEXT: {{ $}} + ; 
GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec + ; GFX9-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec + ; GFX9-NEXT: renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec + ; GFX9-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec + ; GFX9-NEXT: $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX9-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: bb.1 (%ir-block.11): + ; GFX9-NEXT: successors: %bb.2(0x80000000) + ; GFX9-NEXT: liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4) + ; GFX9-NEXT: renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc + ; GFX9-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 3 + ; GFX9-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX9-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX9-NEXT: DS_ADD_U32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 3) + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: bb.2 (%ir-block.16): + ; GFX9-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: lds_wg_rmw_add_acq_rel_single64 + ; GFX942: bb.0 (%ir-block.0): + ; GFX942-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GFX942-NEXT: liveins: $sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec + ; GFX942-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec + ; GFX942-NEXT: renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec + ; GFX942-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec + ; GFX942-NEXT: 
$sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX942-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: bb.1 (%ir-block.11): + ; GFX942-NEXT: successors: %bb.2(0x80000000) + ; GFX942-NEXT: liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4) + ; GFX942-NEXT: renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc + ; GFX942-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 3 + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX942-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX942-NEXT: DS_ADD_U32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 3) + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: bb.2 (%ir-block.16): + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX10-W32-LABEL: name: lds_wg_rmw_add_acq_rel_single64 + ; GFX10-W32: bb.0 (%ir-block.0): + ; GFX10-W32-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GFX10-W32-NEXT: liveins: $sgpr4_sgpr5 + ; GFX10-W32-NEXT: {{ $}} + ; GFX10-W32-NEXT: $sgpr0 = S_MOV_B32 $exec_lo + ; GFX10-W32-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec + ; GFX10-W32-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec + ; GFX10-W32-NEXT: $sgpr1 = S_AND_SAVEEXEC_B32 killed $vcc_lo, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX10-W32-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; GFX10-W32-NEXT: {{ $}} + ; GFX10-W32-NEXT: bb.1 (%ir-block.7): + ; GFX10-W32-NEXT: successors: %bb.2(0x80000000) + ; GFX10-W32-NEXT: liveins: $sgpr0, $sgpr4_sgpr5 + ; GFX10-W32-NEXT: {{ $}} + ; 
GFX10-W32-NEXT: renamable $sgpr1 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4) + ; GFX10-W32-NEXT: renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc + ; GFX10-W32-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 3 + ; GFX10-W32-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-W32-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec + ; GFX10-W32-NEXT: S_WAITCNT_soft 112 + ; GFX10-W32-NEXT: S_WAITCNT_lds_direct + ; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 + ; GFX10-W32-NEXT: DS_ADD_U32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 3) + ; GFX10-W32-NEXT: S_WAITCNT_soft 49279 + ; GFX10-W32-NEXT: S_WAITCNT_lds_direct + ; GFX10-W32-NEXT: BUFFER_GL0_INV implicit $exec + ; GFX10-W32-NEXT: {{ $}} + ; GFX10-W32-NEXT: bb.2 (%ir-block.11): + ; GFX10-W32-NEXT: S_ENDPGM 0 + ; + ; GFX10-W64-LABEL: name: lds_wg_rmw_add_acq_rel_single64 + ; GFX10-W64: bb.0 (%ir-block.0): + ; GFX10-W64-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GFX10-W64-NEXT: liveins: $sgpr4_sgpr5 + ; GFX10-W64-NEXT: {{ $}} + ; GFX10-W64-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec + ; GFX10-W64-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec + ; GFX10-W64-NEXT: renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec + ; GFX10-W64-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec + ; GFX10-W64-NEXT: $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX10-W64-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; GFX10-W64-NEXT: {{ $}} + ; GFX10-W64-NEXT: bb.1 (%ir-block.11): + ; GFX10-W64-NEXT: successors: %bb.2(0x80000000) + ; GFX10-W64-NEXT: liveins: 
$sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5 + ; GFX10-W64-NEXT: {{ $}} + ; GFX10-W64-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4) + ; GFX10-W64-NEXT: renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc + ; GFX10-W64-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 3 + ; GFX10-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-W64-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-W64-NEXT: DS_ADD_U32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 3) + ; GFX10-W64-NEXT: {{ $}} + ; GFX10-W64-NEXT: bb.2 (%ir-block.16): + ; GFX10-W64-NEXT: S_ENDPGM 0 + ; + ; GFX12-W32-LABEL: name: lds_wg_rmw_add_acq_rel_single64 + ; GFX12-W32: bb.0 (%ir-block.0): + ; GFX12-W32-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5 + ; GFX12-W32-NEXT: {{ $}} + ; GFX12-W32-NEXT: $sgpr0 = S_MOV_B32 $exec_lo + ; GFX12-W32-NEXT: $sgpr1 = S_MOV_B32 $exec_lo + ; GFX12-W32-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec + ; GFX12-W32-NEXT: V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec + ; GFX12-W32-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; GFX12-W32-NEXT: {{ $}} + ; GFX12-W32-NEXT: bb.1 (%ir-block.7): + ; GFX12-W32-NEXT: successors: %bb.2(0x80000000) + ; GFX12-W32-NEXT: liveins: $sgpr0, $sgpr4_sgpr5 + ; GFX12-W32-NEXT: {{ $}} + ; GFX12-W32-NEXT: renamable $sgpr1 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4) + ; GFX12-W32-NEXT: renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc + ; GFX12-W32-NEXT: renamable $sgpr0 = S_MUL_I32 killed 
renamable $sgpr0, 3 + ; GFX12-W32-NEXT: $vgpr1, $vgpr0 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 killed $sgpr0, killed $sgpr1, implicit $exec, implicit $exec, implicit $exec, implicit $exec, implicit $exec + ; GFX12-W32-NEXT: S_WAIT_BVHCNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_SAMPLECNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0 + ; GFX12-W32-NEXT: DS_ADD_U32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 3) + ; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0 + ; GFX12-W32-NEXT: GLOBAL_INV 8, implicit $exec + ; GFX12-W32-NEXT: {{ $}} + ; GFX12-W32-NEXT: bb.2 (%ir-block.11): + ; GFX12-W32-NEXT: S_ENDPGM 0 + ; + ; GFX12-W64-LABEL: name: lds_wg_rmw_add_acq_rel_single64 + ; GFX12-W64: bb.0 (%ir-block.0): + ; GFX12-W64-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5 + ; GFX12-W64-NEXT: {{ $}} + ; GFX12-W64-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec + ; GFX12-W64-NEXT: $sgpr2_sgpr3 = S_MOV_B64 $exec + ; GFX12-W64-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec + ; GFX12-W64-NEXT: renamable $vgpr0 = V_MBCNT_HI_U32_B32_e64 $sgpr1, killed $vgpr0, implicit $exec + ; GFX12-W64-NEXT: V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec + ; GFX12-W64-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; GFX12-W64-NEXT: {{ $}} + ; GFX12-W64-NEXT: bb.1 (%ir-block.11): + ; GFX12-W64-NEXT: successors: %bb.2(0x80000000) + ; GFX12-W64-NEXT: liveins: $sgpr0_sgpr1:0x000000000000000F, $sgpr4_sgpr5 + ; GFX12-W64-NEXT: {{ $}} + ; GFX12-W64-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4) + ; GFX12-W64-NEXT: renamable $sgpr0 = S_BCNT1_I32_B64 killed renamable $sgpr0_sgpr1, implicit-def dead $scc + ; GFX12-W64-NEXT: 
renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 3 + ; GFX12-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX12-W64-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX12-W64-NEXT: DS_ADD_U32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 3) + ; GFX12-W64-NEXT: {{ $}} + ; GFX12-W64-NEXT: bb.2 (%ir-block.16): + ; GFX12-W64-NEXT: S_ENDPGM 0 + ; + ; GFX1250-LABEL: name: lds_wg_rmw_add_acq_rel_single64 + ; GFX1250: bb.0 (%ir-block.0): + ; GFX1250-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GFX1250-NEXT: liveins: $sgpr4_sgpr5 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode + ; GFX1250-NEXT: $sgpr0 = S_MOV_B32 $exec_lo + ; GFX1250-NEXT: $sgpr1 = S_MOV_B32 $exec_lo + ; GFX1250-NEXT: renamable $vgpr0 = V_MBCNT_LO_U32_B32_e64 $sgpr0, 0, implicit $exec + ; GFX1250-NEXT: V_CMPX_EQ_U32_nosdst_e32 0, killed $vgpr0, implicit-def $exec, implicit $exec + ; GFX1250-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: bb.1 (%ir-block.7): + ; GFX1250-NEXT: successors: %bb.2(0x80000000) + ; GFX1250-NEXT: liveins: $sgpr0, $sgpr4_sgpr5 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: renamable $sgpr1 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s32) from %ir.p.kernarg.offset, addrspace 4) + ; GFX1250-NEXT: renamable $sgpr0 = S_BCNT1_I32_B32 killed renamable $sgpr0, implicit-def dead $scc + ; GFX1250-NEXT: renamable $sgpr0 = S_MUL_I32 killed renamable $sgpr0, 3 + ; GFX1250-NEXT: $vgpr1, $vgpr0 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 killed $sgpr0, killed $sgpr1, implicit $exec, implicit $exec, implicit $exec, implicit $exec, implicit $exec + ; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0 + ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX1250-NEXT: S_WAIT_DSCNT_soft 0 + ; 
GFX1250-NEXT: DS_ADD_U32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel (s32) on %ir.p.load, addrspace 3) + ; GFX1250-NEXT: S_WAIT_DSCNT_soft 0 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: bb.2 (%ir-block.11): + ; GFX1250-NEXT: S_ENDPGM 0 + %r = atomicrmw add ptr addrspace(3) %p, i32 3 syncscope("workgroup") acq_rel + ret void +} + +define amdgpu_kernel void @lds_wg_cmpxchg_acq_rel_monotonic_single64(ptr addrspace(3) %p, i32 %cmp, i32 %new) #1 { + ; GFX9-LABEL: name: lds_wg_cmpxchg_acq_rel_monotonic_single64 + ; GFX9: bb.0 (%ir-block.0): + ; GFX9-NEXT: liveins: $sgpr4_sgpr5 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX9-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX9-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec + ; GFX9-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX9-NEXT: DS_CMPST_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.2, addrspace 3) + ; GFX9-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: lds_wg_cmpxchg_acq_rel_monotonic_single64 + ; GFX942: bb.0 (%ir-block.0): + ; GFX942-NEXT: liveins: $sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX942-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec + ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, 
implicit $exec, implicit $exec + ; GFX942-NEXT: DS_CMPST_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.2, addrspace 3) + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX10-W32-LABEL: name: lds_wg_cmpxchg_acq_rel_monotonic_single64 + ; GFX10-W32: bb.0 (%ir-block.0): + ; GFX10-W32-NEXT: liveins: $sgpr4_sgpr5 + ; GFX10-W32-NEXT: {{ $}} + ; GFX10-W32-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX10-W32-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-W32-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec + ; GFX10-W32-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-W32-NEXT: S_WAITCNT_soft 112 + ; GFX10-W32-NEXT: S_WAITCNT_lds_direct + ; GFX10-W32-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 + ; GFX10-W32-NEXT: DS_CMPST_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.2, addrspace 3) + ; GFX10-W32-NEXT: S_WAITCNT_soft 49279 + ; GFX10-W32-NEXT: S_WAITCNT_lds_direct + ; GFX10-W32-NEXT: BUFFER_GL0_INV implicit $exec + ; GFX10-W32-NEXT: S_ENDPGM 0 + ; + ; GFX10-W64-LABEL: name: lds_wg_cmpxchg_acq_rel_monotonic_single64 + ; GFX10-W64: bb.0 (%ir-block.0): + ; GFX10-W64-NEXT: liveins: $sgpr4_sgpr5 + ; GFX10-W64-NEXT: {{ $}} + ; GFX10-W64-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX10-W64-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed 
$sgpr1, implicit $exec, implicit $exec + ; GFX10-W64-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-W64-NEXT: DS_CMPST_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.2, addrspace 3) + ; GFX10-W64-NEXT: S_ENDPGM 0 + ; + ; GFX12-W32-LABEL: name: lds_wg_cmpxchg_acq_rel_monotonic_single64 + ; GFX12-W32: bb.0 (%ir-block.0): + ; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5 + ; GFX12-W32-NEXT: {{ $}} + ; GFX12-W32-NEXT: renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX12-W32-NEXT: $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 killed $sgpr0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec, implicit $exec + ; GFX12-W32-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec + ; GFX12-W32-NEXT: S_WAIT_BVHCNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_SAMPLECNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0 + ; GFX12-W32-NEXT: DS_CMPSTORE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.2, addrspace 3) + ; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0 + ; GFX12-W32-NEXT: GLOBAL_INV 8, implicit $exec + ; GFX12-W32-NEXT: S_ENDPGM 0 + ; + ; GFX12-W64-LABEL: name: lds_wg_cmpxchg_acq_rel_monotonic_single64 + ; GFX12-W64: bb.0 (%ir-block.0): + ; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5 + ; GFX12-W64-NEXT: {{ $}} + ; GFX12-W64-NEXT: renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX12-W64-NEXT: $vgpr0 = V_MOV_B32_e32 
killed $sgpr0, implicit $exec, implicit $exec + ; GFX12-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX12-W64-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec + ; GFX12-W64-NEXT: DS_CMPSTORE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.2, addrspace 3) + ; GFX12-W64-NEXT: S_ENDPGM 0 + ; + ; GFX1250-LABEL: name: lds_wg_cmpxchg_acq_rel_monotonic_single64 + ; GFX1250: bb.0 (%ir-block.0): + ; GFX1250-NEXT: liveins: $sgpr4_sgpr5 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode + ; GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX1250-NEXT: $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 killed $sgpr0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec, implicit $exec + ; GFX1250-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec + ; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0 + ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX1250-NEXT: S_WAIT_DSCNT_soft 0 + ; GFX1250-NEXT: DS_CMPSTORE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") acq_rel monotonic (s32) on %ir.2, addrspace 3) + ; GFX1250-NEXT: S_WAIT_DSCNT_soft 0 + ; GFX1250-NEXT: S_ENDPGM 0 + %r = cmpxchg ptr addrspace(3) %p, i32 %cmp, i32 %new syncscope("workgroup") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @lds_wg_ld_unordered_single64(ptr addrspace(3) %p) #1 { + ; GFX9-LABEL: name: lds_wg_ld_unordered_single64 + ; GFX9: bb.0 (%ir-block.0): + ; GFX9-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: lds_wg_ld_unordered_single64 + ; GFX942: bb.0 
(%ir-block.0): + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX10-LABEL: name: lds_wg_ld_unordered_single64 + ; GFX10: bb.0 (%ir-block.0): + ; GFX10-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: lds_wg_ld_unordered_single64 + ; GFX12: bb.0 (%ir-block.0): + ; GFX12-NEXT: S_ENDPGM 0 + ; + ; GFX1250-LABEL: name: lds_wg_ld_unordered_single64 + ; GFX1250: bb.0 (%ir-block.0): + ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode + ; GFX1250-NEXT: S_ENDPGM 0 + %v = load atomic i32, ptr addrspace(3) %p syncscope("workgroup") unordered, align 4 + ret void +} + +define amdgpu_kernel void @lds_wg_cmpxchg_monotonic_acquire_single64(ptr addrspace(3) %p, i32 %cmp, i32 %new) #1 { + ; GFX9-LABEL: name: lds_wg_cmpxchg_monotonic_acquire_single64 + ; GFX9: bb.0 (%ir-block.0): + ; GFX9-NEXT: liveins: $sgpr4_sgpr5 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX9-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX9-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec + ; GFX9-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX9-NEXT: DS_CMPST_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") monotonic acquire (s32) on %ir.2, addrspace 3) + ; GFX9-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: lds_wg_cmpxchg_monotonic_acquire_single64 + ; GFX942: bb.0 (%ir-block.0): + ; GFX942-NEXT: liveins: $sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX942-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit 
$exec, implicit $exec + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec + ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX942-NEXT: DS_CMPST_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") monotonic acquire (s32) on %ir.2, addrspace 3) + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX10-W32-LABEL: name: lds_wg_cmpxchg_monotonic_acquire_single64 + ; GFX10-W32: bb.0 (%ir-block.0): + ; GFX10-W32-NEXT: liveins: $sgpr4_sgpr5 + ; GFX10-W32-NEXT: {{ $}} + ; GFX10-W32-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX10-W32-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-W32-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec + ; GFX10-W32-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-W32-NEXT: DS_CMPST_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") monotonic acquire (s32) on %ir.2, addrspace 3) + ; GFX10-W32-NEXT: S_WAITCNT_soft 49279 + ; GFX10-W32-NEXT: BUFFER_GL0_INV implicit $exec + ; GFX10-W32-NEXT: S_ENDPGM 0 + ; + ; GFX10-W64-LABEL: name: lds_wg_cmpxchg_monotonic_acquire_single64 + ; GFX10-W64: bb.0 (%ir-block.0): + ; GFX10-W64-NEXT: liveins: $sgpr4_sgpr5 + ; GFX10-W64-NEXT: {{ $}} + ; GFX10-W64-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s128) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX10-W64-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit 
$exec, implicit $exec + ; GFX10-W64-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-W64-NEXT: DS_CMPST_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") monotonic acquire (s32) on %ir.2, addrspace 3) + ; GFX10-W64-NEXT: S_ENDPGM 0 + ; + ; GFX12-W32-LABEL: name: lds_wg_cmpxchg_monotonic_acquire_single64 + ; GFX12-W32: bb.0 (%ir-block.0): + ; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5 + ; GFX12-W32-NEXT: {{ $}} + ; GFX12-W32-NEXT: renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX12-W32-NEXT: $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 killed $sgpr0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec, implicit $exec + ; GFX12-W32-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec + ; GFX12-W32-NEXT: DS_CMPSTORE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") monotonic acquire (s32) on %ir.2, addrspace 3) + ; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0 + ; GFX12-W32-NEXT: GLOBAL_INV 8, implicit $exec + ; GFX12-W32-NEXT: S_ENDPGM 0 + ; + ; GFX12-W64-LABEL: name: lds_wg_cmpxchg_monotonic_acquire_single64 + ; GFX12-W64: bb.0 (%ir-block.0): + ; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5 + ; GFX12-W64-NEXT: {{ $}} + ; GFX12-W64-NEXT: renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX12-W64-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX12-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX12-W64-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec + ; 
GFX12-W64-NEXT: DS_CMPSTORE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") monotonic acquire (s32) on %ir.2, addrspace 3) + ; GFX12-W64-NEXT: S_ENDPGM 0 + ; + ; GFX1250-LABEL: name: lds_wg_cmpxchg_monotonic_acquire_single64 + ; GFX1250: bb.0 (%ir-block.0): + ; GFX1250-NEXT: liveins: $sgpr4_sgpr5 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode + ; GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX1250-NEXT: $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 killed $sgpr0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec, implicit $exec + ; GFX1250-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec + ; GFX1250-NEXT: DS_CMPSTORE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec :: (load store syncscope("workgroup") monotonic acquire (s32) on %ir.2, addrspace 3) + ; GFX1250-NEXT: S_WAIT_DSCNT_soft 0 + ; GFX1250-NEXT: S_ENDPGM 0 + %r = cmpxchg ptr addrspace(3) %p, i32 %cmp, i32 %new syncscope("workgroup") monotonic acquire + ret void +} + +define amdgpu_kernel void @flat_wg_ld_acquire_single64(ptr addrspace(0) %p) #1 { + ; GFX9-LABEL: name: flat_wg_ld_acquire_single64 + ; GFX9: bb.0 (%ir-block.0): + ; GFX9-NEXT: liveins: $sgpr4_sgpr5 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX9-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX9-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit 
$sgpr0_sgpr1, implicit $exec + ; GFX9-NEXT: dead renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load syncscope("workgroup") acquire (s32) from %ir.p.load) + ; GFX9-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: flat_wg_ld_acquire_single64 + ; GFX942: bb.0 (%ir-block.0): + ; GFX942-NEXT: liveins: $sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 killed $sgpr0_sgpr1, implicit $exec, implicit $exec + ; GFX942-NEXT: dead renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load syncscope("workgroup") acquire (s32) from %ir.p.load) + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX10-W32-LABEL: name: flat_wg_ld_acquire_single64 + ; GFX10-W32: bb.0 (%ir-block.0): + ; GFX10-W32-NEXT: liveins: $sgpr4_sgpr5 + ; GFX10-W32-NEXT: {{ $}} + ; GFX10-W32-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX10-W32-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-W32-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-W32-NEXT: dead renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 1, implicit $exec, implicit $flat_scr :: (load syncscope("workgroup") acquire (s32) from %ir.p.load) + ; GFX10-W32-NEXT: S_WAITCNT_soft 112 + ; GFX10-W32-NEXT: BUFFER_GL0_INV implicit $exec + ; GFX10-W32-NEXT: S_ENDPGM 0 + ; + ; GFX10-W64-LABEL: name: flat_wg_ld_acquire_single64 + ; GFX10-W64: bb.0 (%ir-block.0): + ; GFX10-W64-NEXT: liveins: $sgpr4_sgpr5 + ; GFX10-W64-NEXT: {{ $}} + ; 
GFX10-W64-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX10-W64-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-W64-NEXT: dead renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load syncscope("workgroup") acquire (s32) from %ir.p.load) + ; GFX10-W64-NEXT: S_ENDPGM 0 + ; + ; GFX12-W32-LABEL: name: flat_wg_ld_acquire_single64 + ; GFX12-W32: bb.0 (%ir-block.0): + ; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5 + ; GFX12-W32-NEXT: {{ $}} + ; GFX12-W32-NEXT: renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX12-W32-NEXT: $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 $sgpr0, killed $sgpr1, implicit $exec, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX12-W32-NEXT: dead renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 8, implicit $exec, implicit $flat_scr :: (load syncscope("workgroup") acquire (s32) from %ir.p.load) + ; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0 + ; GFX12-W32-NEXT: GLOBAL_INV 8, implicit $exec + ; GFX12-W32-NEXT: S_ENDPGM 0 + ; + ; GFX12-W64-LABEL: name: flat_wg_ld_acquire_single64 + ; GFX12-W64: bb.0 (%ir-block.0): + ; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5 + ; GFX12-W64-NEXT: {{ $}} + ; GFX12-W64-NEXT: renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX12-W64-NEXT: $vgpr0 = 
V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX12-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX12-W64-NEXT: dead renamable $vgpr0 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (load syncscope("workgroup") acquire (s32) from %ir.p.load) + ; GFX12-W64-NEXT: S_ENDPGM 0 + ; + ; GFX1250-LABEL: name: flat_wg_ld_acquire_single64 + ; GFX1250: bb.0 (%ir-block.0): + ; GFX1250-NEXT: liveins: $sgpr4_sgpr5 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode + ; GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX1250-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX1250-NEXT: dead renamable $vgpr0 = FLAT_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 0, 0, implicit $exec, implicit $flat_scr :: (load syncscope("workgroup") acquire (s32) from %ir.p.load) + ; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0 + ; GFX1250-NEXT: S_WAIT_DSCNT_soft 0 + ; GFX1250-NEXT: S_ENDPGM 0 + %v = load atomic i32, ptr addrspace(0) %p syncscope("workgroup") acquire, align 4 + ret void +} + +define amdgpu_kernel void @flat_wg_st_seq_cst_multi(ptr addrspace(0) %p, i32 %x) #2 { + ; GFX9-LABEL: name: flat_wg_st_seq_cst_multi + ; GFX9: bb.0 (%ir-block.0): + ; GFX9-NEXT: liveins: $sgpr4_sgpr5 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX9-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4) + ; GFX9-NEXT: $vgpr0 = V_MOV_B32_e32 
$sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX9-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX9-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX9-NEXT: S_WAITCNT_soft 49279 + ; GFX9-NEXT: S_WAITCNT_lds_direct + ; GFX9-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load) + ; GFX9-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: flat_wg_st_seq_cst_multi + ; GFX942: bb.0 (%ir-block.0): + ; GFX942-NEXT: liveins: $sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX942-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 44, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4) + ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 killed $sgpr0_sgpr1, implicit $exec, implicit $exec + ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX942-NEXT: S_WAITCNT_soft 49279 + ; GFX942-NEXT: S_WAITCNT_lds_direct + ; GFX942-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 1, implicit $exec, implicit $flat_scr :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load) + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX10-LABEL: name: flat_wg_st_seq_cst_multi + ; GFX10: bb.0 (%ir-block.0): + ; GFX10-NEXT: liveins: $sgpr4_sgpr5 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX10-NEXT: renamable $sgpr2 = S_LOAD_DWORD_IMM killed renamable $sgpr4_sgpr5, 44, 0 :: 
(dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4) + ; GFX10-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-NEXT: S_WAITCNT_soft 112 + ; GFX10-NEXT: S_WAITCNT_lds_direct + ; GFX10-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 + ; GFX10-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load) + ; GFX10-NEXT: S_ENDPGM 0 + ; + ; GFX12-W32-LABEL: name: flat_wg_st_seq_cst_multi + ; GFX12-W32: bb.0 (%ir-block.0): + ; GFX12-W32-NEXT: liveins: $sgpr4_sgpr5 + ; GFX12-W32-NEXT: {{ $}} + ; GFX12-W32-NEXT: renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX12-W32-NEXT: $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 $sgpr0, killed $sgpr1, implicit $exec, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX12-W32-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX12-W32-NEXT: S_WAIT_BVHCNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_SAMPLECNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_LOADCNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX12-W32-NEXT: S_WAIT_DSCNT_soft 0 + ; GFX12-W32-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 8, implicit $exec, implicit $flat_scr :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load) + ; GFX12-W32-NEXT: S_ENDPGM 0 + ; + ; GFX12-W64-LABEL: name: flat_wg_st_seq_cst_multi + ; GFX12-W64: bb.0 (%ir-block.0): + ; GFX12-W64-NEXT: liveins: $sgpr4_sgpr5 + ; 
GFX12-W64-NEXT: {{ $}} + ; GFX12-W64-NEXT: renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX12-W64-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX12-W64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX12-W64-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX12-W64-NEXT: S_WAIT_BVHCNT_soft 0 + ; GFX12-W64-NEXT: S_WAIT_SAMPLECNT_soft 0 + ; GFX12-W64-NEXT: S_WAIT_LOADCNT_soft 0 + ; GFX12-W64-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX12-W64-NEXT: S_WAIT_DSCNT_soft 0 + ; GFX12-W64-NEXT: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 8, implicit $exec, implicit $flat_scr :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load) + ; GFX12-W64-NEXT: S_ENDPGM 0 + ; + ; GFX1250-LABEL: name: flat_wg_st_seq_cst_multi + ; GFX1250: bb.0 (%ir-block.0): + ; GFX1250-NEXT: liveins: $sgpr4_sgpr5 + ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: S_SETREG_IMM32_B32 1, 1601, implicit-def $mode, implicit $mode + ; GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2 = S_LOAD_DWORDX3_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s96) from %ir.p.kernarg.offset, align 4, addrspace 4) + ; GFX1250-NEXT: renamable $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 0, killed $sgpr2, implicit $exec, implicit $exec, implicit $exec, implicit $exec + ; GFX1250-NEXT: S_WAIT_LOADCNT_soft 0 + ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX1250-NEXT: S_WAIT_DSCNT_soft 0 + ; GFX1250-NEXT: S_WAIT_XCNT_soft 0 + ; GFX1250-NEXT: FLAT_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (store syncscope("workgroup") seq_cst (s32) into %ir.p.load) + ; GFX1250-NEXT: S_ENDPGM 
0 + store atomic i32 %x, ptr addrspace(0) %p syncscope("workgroup") seq_cst, align 4 + ret void +} + +attributes #0 = { "amdgpu-flat-work-group-size"="32,32" } +attributes #1 = { "amdgpu-flat-work-group-size"="64,64" } +attributes #2 = { "amdgpu-flat-work-group-size"="64,256" }