diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index cb27f474d78f3..e567176e658b3 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1871,9 +1871,12 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   /// \returns true if the subtarget supports clusters of workgroups.
   bool hasClusters() const { return HasClusters; }
 
-  /// \returns true if the subtarget requires a wait for xcnt before atomic
-  /// flat/global stores & rmw.
-  bool requiresWaitXCntBeforeAtomicStores() const { return GFX1250Insts; }
+  /// \returns true if the subtarget requires a wait for xcnt before VMEM
+  /// accesses that must never be repeated in the event of a page fault/retry.
+  /// Atomic stores/rmw and all volatile accesses fall into this category.
+  bool requiresWaitXCntForSingleAccessInstructions() const {
+    return GFX1250Insts;
+  }
 
   /// \returns the number of significant bits in the immediate field of the
   /// S_NOP instruction.
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 66e1873319553..78d4bd56dfc24 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -9934,6 +9934,11 @@ void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const {
   }
 }
 
+bool SIInstrInfo::isVBUFFER(const MachineInstr &MI) const {
+  return (ST.getGeneration() == GCNSubtarget::GFX12) &&
+         (MI.getDesc().TSFlags & (SIInstrFlags::MUBUF | SIInstrFlags::MTBUF));
+}
+
 bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
   if (!isSMRD(MI))
     return false;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index c66985a19685b..9bbeccf4c8be9 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -585,6 +585,9 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
     return get(Opcode).TSFlags & SIInstrFlags::MTBUF;
   }
 
+  /// \returns true if \p MI is a GFX12 VBUFFER instruction.
+  bool isVBUFFER(const MachineInstr &MI) const;
+
   static bool isSMRD(const MachineInstr &MI) {
     return MI.getDesc().TSFlags & SIInstrFlags::SMRD;
   }
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index bf04c7fa132c0..9ea41ead2c7b5 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -776,6 +776,13 @@ std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
     }
   }
 
+  // FIXME: The MMO of buffer atomic instructions does not always have an atomic
+  // ordering. We only need to handle VBUFFER atomics on GFX12+ so we can fix it
+  // here, but the lowering should really be cleaned up at some point.
+  if (ST.getInstrInfo()->isVBUFFER(*MI) && SIInstrInfo::isAtomic(*MI) &&
+      Ordering == AtomicOrdering::NotAtomic)
+    Ordering = AtomicOrdering::Monotonic;
+
   SIAtomicScope Scope = SIAtomicScope::NONE;
   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
   bool IsCrossAddressSpaceOrdering = false;
@@ -2059,6 +2066,13 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
   if (IsVolatile) {
     Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
 
+    if (ST.requiresWaitXCntForSingleAccessInstructions() &&
+        (TII->isFLAT(*MI) || TII->isVBUFFER(*MI))) {
+      MachineBasicBlock &MBB = *MI->getParent();
+      BuildMI(MBB, MI, MI->getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0);
+      Changed = true;
+    }
+
     // Ensure operation has completed at system scope to cause all volatile
     // operations to be visible outside the program in a global order. Do not
     // request cross address space as only the global address space can be
@@ -2077,9 +2091,8 @@ bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
   const bool IsRMW = (MI.mayLoad() && MI.mayStore());
   bool Changed = false;
 
-  // GFX12.5 only: xcnt wait is needed before flat and global atomics
-  // stores/rmw.
-  if (Atomic && ST.requiresWaitXCntBeforeAtomicStores() && TII->isFLAT(MI)) {
+  if (Atomic && ST.requiresWaitXCntForSingleAccessInstructions() &&
+      (TII->isFLAT(MI) || TII->isVBUFFER(MI))) {
     MachineBasicBlock &MBB = *MI.getParent();
     BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0);
     Changed = true;
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 28d7e6916e519..453f98370efbc 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -5136,6 +5136,7 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX1250-NEXT:    s_swap_pc_i64 s[30:31], s[0:1]
 ; GFX1250-NEXT:    scratch_store_b16 v4, v1, off offset:4 scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 v4, v0, off scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    v_readlane_b32 s31, v5, 1
@@ -6215,6 +6216,7 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
 ; GFX1250-NEXT:    s_swap_pc_i64 s[30:31], s[0:1]
 ; GFX1250-NEXT:    scratch_store_b128 v8, v[4:7], off offset:16 scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    scratch_store_b128 v8, v[0:3], off scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    v_readlane_b32 s31, v9, 1
diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
index 5e2cec504c6a9..8d0131ccc6f7b 100644
--- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
+++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
@@ -7409,8 +7409,10 @@ define i32 @v_multi_use_mul_chain_add_other_use_all(i32 %arg, i32 %arg1, i32 %ar
 ; GFX1250-SDAG-NEXT:    v_mul_lo_u32 v3, v1, v0
 ; GFX1250-SDAG-NEXT:    global_store_b32 v[4:5], v2, off scope:SCOPE_SYS
 ; GFX1250-SDAG-NEXT:    s_wait_storecnt 0x0
+; GFX1250-SDAG-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-SDAG-NEXT:    global_store_b32 v[4:5], v1, off scope:SCOPE_SYS
 ; GFX1250-SDAG-NEXT:    s_wait_storecnt 0x0
+; GFX1250-SDAG-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-SDAG-NEXT:    global_store_b32 v[4:5], v3, off scope:SCOPE_SYS
 ; GFX1250-SDAG-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-SDAG-NEXT:    v_add_nc_u32_e32 v0, v3, v0
@@ -7431,8 +7433,10 @@ define i32 @v_multi_use_mul_chain_add_other_use_all(i32 %arg, i32 %arg1, i32 %ar
 ;
GFX1250-GISEL-NEXT: v_mul_lo_u32 v5, v1, v0 ; GFX1250-GISEL-NEXT: global_store_b32 v[2:3], v4, off scope:SCOPE_SYS ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: global_store_b32 v[2:3], v1, off scope:SCOPE_SYS ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: global_store_b32 v[2:3], v5, off scope:SCOPE_SYS ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: v_add_nc_u32_e32 v0, v5, v0 @@ -7686,6 +7690,7 @@ define i32 @v_multi_use_mul_chain_add_other_use_some(i32 %arg, i32 %arg1, i32 %a ; GFX1250-SDAG-NEXT: v_mul_lo_u32 v3, v0, v1 ; GFX1250-SDAG-NEXT: global_store_b32 v[4:5], v2, off scope:SCOPE_SYS ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: global_store_b32 v[4:5], v3, off scope:SCOPE_SYS ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: v_add_nc_u32_e32 v0, v3, v1 @@ -7706,6 +7711,7 @@ define i32 @v_multi_use_mul_chain_add_other_use_some(i32 %arg, i32 %arg1, i32 %a ; GFX1250-GISEL-NEXT: v_mul_lo_u32 v5, v0, v1 ; GFX1250-GISEL-NEXT: global_store_b32 v[2:3], v4, off scope:SCOPE_SYS ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: global_store_b32 v[2:3], v5, off scope:SCOPE_SYS ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: v_add_nc_u32_e32 v0, v5, v1 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-buffer-atomics.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-buffer-atomics.ll new file mode 100644 index 0000000000000..b7971a024cc38 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-buffer-atomics.ll @@ -0,0 +1,435 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s + +define void @buffer_fat_ptr_agent_atomic_add_noret_i32(ptr addrspace(7) inreg %ptr, i32 %val) { +; GFX1250-LABEL: buffer_fat_ptr_agent_atomic_add_noret_i32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s6, s2 +; GFX1250-NEXT: s_mov_b32 s4, s0 +; GFX1250-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX1250-NEXT: s_mov_b32 s7, s3 +; GFX1250-NEXT: s_mov_b32 s8, s7 +; GFX1250-NEXT: s_mov_b32 s9, s6 +; GFX1250-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX1250-NEXT: s_mov_b32 s5, s1 +; GFX1250-NEXT: s_mov_b32 s10, s5 +; GFX1250-NEXT: s_mov_b32 s0, s4 +; GFX1250-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX1250-NEXT: s_mov_b32 s1, s10 +; GFX1250-NEXT: s_mov_b32 s2, s9 +; GFX1250-NEXT: s_mov_b32 s3, s8 +; GFX1250-NEXT: ; kill: def $sgpr8 killed $sgpr16 +; GFX1250-NEXT: v_mov_b32_e32 v1, s16 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: buffer_atomic_add_u32 v0, v1, s[0:3], null offen +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %unused = atomicrmw add ptr addrspace(7) %ptr, i32 %val syncscope("agent") monotonic + ret void +} + +define i32 @buffer_fat_ptr_agent_atomic_add_ret_i32(ptr addrspace(7) inreg %ptr, i32 %val) { +; GFX1250-LABEL: buffer_fat_ptr_agent_atomic_add_ret_i32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s6, s2 +; GFX1250-NEXT: s_mov_b32 s4, s0 +; GFX1250-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX1250-NEXT: s_mov_b32 s7, s3 +; GFX1250-NEXT: s_mov_b32 s8, s7 
+; GFX1250-NEXT: s_mov_b32 s9, s6 +; GFX1250-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX1250-NEXT: s_mov_b32 s5, s1 +; GFX1250-NEXT: s_mov_b32 s10, s5 +; GFX1250-NEXT: s_mov_b32 s0, s4 +; GFX1250-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX1250-NEXT: s_mov_b32 s1, s10 +; GFX1250-NEXT: s_mov_b32 s2, s9 +; GFX1250-NEXT: s_mov_b32 s3, s8 +; GFX1250-NEXT: ; kill: def $sgpr8 killed $sgpr16 +; GFX1250-NEXT: v_mov_b32_e32 v1, s16 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: buffer_atomic_add_u32 v0, v1, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %ret = atomicrmw add ptr addrspace(7) %ptr, i32 %val syncscope("agent") monotonic + ret i32 %ret +} + +define void @raw_buffer_atomic_add_v2f16_noret(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +; GFX1250-LABEL: raw_buffer_atomic_add_v2f16_noret: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s4, s3 +; GFX1250-NEXT: s_mov_b32 s5, s2 +; GFX1250-NEXT: s_mov_b32 s6, s1 +; GFX1250-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX1250-NEXT: s_mov_b32 s1, s6 +; GFX1250-NEXT: s_mov_b32 s2, s5 +; GFX1250-NEXT: s_mov_b32 s3, s4 +; GFX1250-NEXT: ; kill: def $sgpr4_sgpr5_sgpr6_sgpr7 killed $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s16 offen +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + ret void +} + +define <2 x half> @raw_buffer_atomic_add_v2f16_ret(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +; GFX1250-LABEL: raw_buffer_atomic_add_v2f16_ret: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s4, s3 +; GFX1250-NEXT: s_mov_b32 s5, s2 +; GFX1250-NEXT: s_mov_b32 s6, s1 +; GFX1250-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX1250-NEXT: s_mov_b32 s1, s6 +; GFX1250-NEXT: s_mov_b32 s2, s5 +; GFX1250-NEXT: s_mov_b32 s3, s4 +; GFX1250-NEXT: ; kill: def $sgpr4_sgpr5_sgpr6_sgpr7 killed $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s16 offen th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + ret <2 x half> %ret +} + +define float @struct_buffer_atomic_add_v2f16_ret(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +; GFX1250-LABEL: struct_buffer_atomic_add_v2f16_ret: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: scratch_store_b32 off, v2, s32 ; 4-byte Folded Spill +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: scratch_load_b32 v1, off, s32 ; 4-byte Folded Reload +; GFX1250-NEXT: s_mov_b32 s4, s3 +; GFX1250-NEXT: s_mov_b32 s5, s2 +; GFX1250-NEXT: s_mov_b32 s6, s1 +; GFX1250-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX1250-NEXT: s_mov_b32 s1, s6 +; GFX1250-NEXT: s_mov_b32 s2, s5 +; GFX1250-NEXT: s_mov_b32 s3, s4 +; GFX1250-NEXT: ; kill: def $sgpr4_sgpr5_sgpr6_sgpr7 
killed $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: buffer_atomic_pk_add_f16 v0, v[2:3], s[0:3], s16 idxen offen th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %orig = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %r = bitcast <2 x half> %orig to float + ret float %r +} + +define void @struct_buffer_atomic_add_v2f16_noret(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +; GFX1250-LABEL: struct_buffer_atomic_add_v2f16_noret: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: scratch_store_b32 off, v2, s32 ; 4-byte Folded Spill +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: scratch_load_b32 v1, off, s32 ; 4-byte Folded Reload +; GFX1250-NEXT: s_mov_b32 s4, s3 +; GFX1250-NEXT: s_mov_b32 s5, s2 +; GFX1250-NEXT: s_mov_b32 s6, s1 +; GFX1250-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX1250-NEXT: s_mov_b32 s1, s6 +; GFX1250-NEXT: s_mov_b32 s2, s5 +; GFX1250-NEXT: s_mov_b32 s3, s4 +; GFX1250-NEXT: ; kill: def $sgpr4_sgpr5_sgpr6_sgpr7 killed $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: buffer_atomic_pk_add_f16 v0, v[2:3], s[0:3], s16 idxen offen +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %orig = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + ret void +} + +define void @raw_buffer_store_i32(<4 x i32> inreg %rsrc, i32 %v) { +; GFX1250-LABEL: raw_buffer_store_i32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s4, s3 +; GFX1250-NEXT: s_mov_b32 s5, s2 +; GFX1250-NEXT: s_mov_b32 s6, s1 +; GFX1250-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX1250-NEXT: s_mov_b32 s1, s6 +; GFX1250-NEXT: s_mov_b32 s2, s5 +; GFX1250-NEXT: s_mov_b32 s3, s4 +; GFX1250-NEXT: ; kill: def $sgpr4_sgpr5_sgpr6_sgpr7 killed $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + call void @llvm.amdgcn.raw.buffer.store.i32(i32 %v, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret void +} + +define void @raw_buffer_store_i32_volatile(<4 x i32> inreg %rsrc, i32 %v) { +; GFX1250-LABEL: raw_buffer_store_i32_volatile: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s4, s3 +; GFX1250-NEXT: s_mov_b32 s5, s2 +; GFX1250-NEXT: s_mov_b32 s6, s1 +; GFX1250-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX1250-NEXT: s_mov_b32 s1, s6 +; GFX1250-NEXT: s_mov_b32 s2, s5 +; GFX1250-NEXT: s_mov_b32 s3, s4 +; GFX1250-NEXT: ; kill: def $sgpr4_sgpr5_sgpr6_sgpr7 killed $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + call void @llvm.amdgcn.raw.buffer.store.i32(i32 
%v, <4 x i32> %rsrc, i32 0, i32 0, i32 2147483648) + ret void +} + +define void @struct_buffer_store_i32(<4 x i32> inreg %rsrc, i32 %v) { +; GFX1250-LABEL: struct_buffer_store_i32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s4, s3 +; GFX1250-NEXT: s_mov_b32 s5, s2 +; GFX1250-NEXT: s_mov_b32 s6, s1 +; GFX1250-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX1250-NEXT: s_mov_b32 s1, s6 +; GFX1250-NEXT: s_mov_b32 s2, s5 +; GFX1250-NEXT: s_mov_b32 s3, s4 +; GFX1250-NEXT: ; kill: def $sgpr4_sgpr5_sgpr6_sgpr7 killed $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: buffer_store_b32 v0, v1, s[0:3], null idxen +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + call void @llvm.amdgcn.struct.buffer.store.i32(i32 %v, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret void +} + +define void @struct_buffer_store_i32_volatile(<4 x i32> inreg %rsrc, i32 %v) { +; GFX1250-LABEL: struct_buffer_store_i32_volatile: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s4, s3 +; GFX1250-NEXT: s_mov_b32 s5, s2 +; GFX1250-NEXT: s_mov_b32 s6, s1 +; GFX1250-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX1250-NEXT: s_mov_b32 s1, s6 +; GFX1250-NEXT: s_mov_b32 s2, s5 +; GFX1250-NEXT: s_mov_b32 s3, s4 +; GFX1250-NEXT: ; kill: def $sgpr4_sgpr5_sgpr6_sgpr7 killed $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: buffer_store_b32 v0, v1, s[0:3], null idxen scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + call void @llvm.amdgcn.struct.buffer.store.i32(i32 %v, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 2147483648) + ret void +} + +define i32 @raw_buffer_load_i32(<4 x i32> inreg %rsrc, i32 %v) { +; GFX1250-LABEL: raw_buffer_load_i32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s4, s3 +; GFX1250-NEXT: s_mov_b32 s5, s2 +; GFX1250-NEXT: s_mov_b32 s6, s1 +; GFX1250-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX1250-NEXT: s_mov_b32 s1, s6 +; GFX1250-NEXT: s_mov_b32 s2, s5 +; GFX1250-NEXT: s_mov_b32 s3, s4 +; GFX1250-NEXT: ; kill: def $sgpr4_sgpr5_sgpr6_sgpr7 killed $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX1250-NEXT: buffer_load_b32 v0, off, s[0:3], null +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %val = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret i32 %val +} + +; FIXME?: Compiler strips volatile bit during lowering, we cannot emit volatile buffer loads this way. 
+define i32 @raw_buffer_load_i32_volatile(<4 x i32> inreg %rsrc, i32 %v) {
+; GFX1250-LABEL: raw_buffer_load_i32_volatile:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_mov_b32 s4, s3
+; GFX1250-NEXT:    s_mov_b32 s5, s2
+; GFX1250-NEXT:    s_mov_b32 s6, s1
+; GFX1250-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX1250-NEXT:    s_mov_b32 s1, s6
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s3, s4
+; GFX1250-NEXT:    ; kill: def $sgpr4_sgpr5_sgpr6_sgpr7 killed $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX1250-NEXT:    buffer_load_b32 v0, off, s[0:3], null
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %val = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %rsrc, i32 0, i32 0, i32 2147483648)
+  ret i32 %val
+}
+
+define i32 @struct_buffer_load_i32(<4 x i32> inreg %rsrc, i32 %v) {
+; GFX1250-LABEL: struct_buffer_load_i32:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_mov_b32 s4, s3
+; GFX1250-NEXT:    s_mov_b32 s5, s2
+; GFX1250-NEXT:    s_mov_b32 s6, s1
+; GFX1250-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX1250-NEXT:    s_mov_b32 s1, s6
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s3, s4
+; GFX1250-NEXT:    ; kill: def $sgpr4_sgpr5_sgpr6_sgpr7 killed $sgpr0_sgpr1_sgpr2_sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    buffer_load_b32 v0, v0, s[0:3], null idxen
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %val = call i32 @llvm.amdgcn.struct.buffer.load.i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
+  ret i32 %val
+}
+
+; FIXME?: The compiler strips the volatile bit during lowering, so we cannot emit volatile buffer loads this way.
+define i32 @struct_buffer_load_i32_volatile(<4 x i32> inreg %rsrc, i32 %v) { +; GFX1250-LABEL: struct_buffer_load_i32_volatile: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s4, s3 +; GFX1250-NEXT: s_mov_b32 s5, s2 +; GFX1250-NEXT: s_mov_b32 s6, s1 +; GFX1250-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX1250-NEXT: s_mov_b32 s1, s6 +; GFX1250-NEXT: s_mov_b32 s2, s5 +; GFX1250-NEXT: s_mov_b32 s3, s4 +; GFX1250-NEXT: ; kill: def $sgpr4_sgpr5_sgpr6_sgpr7 killed $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: buffer_load_b32 v0, v0, s[0:3], null idxen +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %val = call i32 @llvm.amdgcn.struct.buffer.load.i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 21474836480) + ret i32 %val +} + +define void @buffer_fat_ptr_store_i32(ptr addrspace(7) inreg %ptr, i32 %val) { +; GFX1250-LABEL: buffer_fat_ptr_store_i32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s6, s2 +; GFX1250-NEXT: s_mov_b32 s4, s0 +; GFX1250-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX1250-NEXT: s_mov_b32 s7, s3 +; GFX1250-NEXT: s_mov_b32 s8, s7 +; GFX1250-NEXT: s_mov_b32 s9, s6 +; GFX1250-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX1250-NEXT: s_mov_b32 s5, s1 +; GFX1250-NEXT: s_mov_b32 s10, s5 +; GFX1250-NEXT: s_mov_b32 s0, s4 +; GFX1250-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX1250-NEXT: s_mov_b32 s1, s10 +; GFX1250-NEXT: s_mov_b32 s2, s9 +; GFX1250-NEXT: s_mov_b32 s3, s8 +; GFX1250-NEXT: ; kill: def $sgpr8 killed $sgpr16 +; GFX1250-NEXT: v_mov_b32_e32 v1, s16 +; GFX1250-NEXT: buffer_store_b32 v0, v1, s[0:3], null offen +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + store i32 %val, ptr addrspace(7) %ptr + ret void +} + +define void @buffer_fat_ptr_store_i32_volatile(ptr addrspace(7) inreg %ptr, i32 %val) { +; GFX1250-LABEL: buffer_fat_ptr_store_i32_volatile: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s6, s2 +; GFX1250-NEXT: s_mov_b32 s4, s0 +; GFX1250-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX1250-NEXT: s_mov_b32 s7, s3 +; GFX1250-NEXT: s_mov_b32 s8, s7 +; GFX1250-NEXT: s_mov_b32 s9, s6 +; GFX1250-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX1250-NEXT: s_mov_b32 s5, s1 +; GFX1250-NEXT: s_mov_b32 s10, s5 +; GFX1250-NEXT: s_mov_b32 s0, s4 +; GFX1250-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX1250-NEXT: s_mov_b32 s1, s10 +; GFX1250-NEXT: s_mov_b32 s2, s9 +; GFX1250-NEXT: s_mov_b32 s3, s8 +; GFX1250-NEXT: ; kill: def $sgpr8 killed $sgpr16 +; GFX1250-NEXT: v_mov_b32_e32 v1, s16 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: buffer_store_b32 v0, v1, s[0:3], null offen scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + store volatile i32 %val, ptr addrspace(7) %ptr + ret void +} + +define i32 @buffer_fat_ptr_load_i32(ptr addrspace(7) inreg %ptr) { +; GFX1250-LABEL: buffer_fat_ptr_load_i32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s6, s2 +; GFX1250-NEXT: s_mov_b32 s4, s0 +; GFX1250-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX1250-NEXT: s_mov_b32 s7, s3 +; GFX1250-NEXT: s_mov_b32 s8, s7 +; GFX1250-NEXT: 
s_mov_b32 s9, s6 +; GFX1250-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX1250-NEXT: s_mov_b32 s5, s1 +; GFX1250-NEXT: s_mov_b32 s10, s5 +; GFX1250-NEXT: s_mov_b32 s0, s4 +; GFX1250-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX1250-NEXT: s_mov_b32 s1, s10 +; GFX1250-NEXT: s_mov_b32 s2, s9 +; GFX1250-NEXT: s_mov_b32 s3, s8 +; GFX1250-NEXT: ; kill: def $sgpr8 killed $sgpr16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s16 +; GFX1250-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %val = load i32, ptr addrspace(7) %ptr + ret i32 %val +} + +define i32 @buffer_fat_ptr_load_i32_volatile(ptr addrspace(7) inreg %ptr) { +; GFX1250-LABEL: buffer_fat_ptr_load_i32_volatile: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s6, s2 +; GFX1250-NEXT: s_mov_b32 s4, s0 +; GFX1250-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 +; GFX1250-NEXT: s_mov_b32 s7, s3 +; GFX1250-NEXT: s_mov_b32 s8, s7 +; GFX1250-NEXT: s_mov_b32 s9, s6 +; GFX1250-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 +; GFX1250-NEXT: s_mov_b32 s5, s1 +; GFX1250-NEXT: s_mov_b32 s10, s5 +; GFX1250-NEXT: s_mov_b32 s0, s4 +; GFX1250-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 +; GFX1250-NEXT: s_mov_b32 s1, s10 +; GFX1250-NEXT: s_mov_b32 s2, s9 +; GFX1250-NEXT: s_mov_b32 s3, s8 +; GFX1250-NEXT: ; kill: def $sgpr8 killed $sgpr16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s16 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %val = load volatile i32, ptr addrspace(7) %ptr + ret i32 %val +} diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll index bdde7c0975425..bde7e7e38d8fa 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll @@ -111,6 +111,7 @@ define amdgpu_kernel void @flat_last_use_and_volatile_load(ptr %in, ptr %out) { ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_BYPASS scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll index a2a8ce75d7fb4..2a866e8c625b4 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll @@ -1303,6 +1303,7 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll index c59b0ee83e955..7447ac68e7fc5 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll @@ -150,6 
+150,7 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -421,6 +422,7 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b32 s4, 0x3ff ; GFX1250-NEXT: v_and_b32_e64 v1, v1, s4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_load_b32 v1, v1, s[2:3] scale_offset scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -582,6 +584,7 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -849,6 +852,7 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b32 s2, 0x3ff ; GFX1250-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scale_offset scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll index ca7802d295e0b..36673c99ae056 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll @@ -92,6 +92,7 @@ define amdgpu_kernel void @global_last_use_and_volatile_load(ptr addrspace(1) %i ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_BYPASS scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll index 72cbbc0283545..8b04261ce37fb 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll @@ -1104,6 +1104,7 @@ define amdgpu_kernel void @global_nontemporal_volatile_load( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll index 2a40ee532be98..d32f961976325 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll @@ -153,6 +153,7 @@ define amdgpu_kernel void @global_volatile_load_0( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: 
s_wait_loadcnt 0x0 @@ -361,6 +362,7 @@ define amdgpu_kernel void @global_volatile_load_1( ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_mov_b32 s4, 0x3ff ; GFX1250-NEXT: v_and_b32_e64 v1, v1, s4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_load_b32 v1, v1, s[2:3] scale_offset scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 @@ -532,6 +534,7 @@ define amdgpu_kernel void @global_volatile_store_0( ; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -733,6 +736,7 @@ define amdgpu_kernel void @global_volatile_store_1( ; GFX1250-NEXT: v_and_b32_e64 v0, v0, s3 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll index 80ea48be0b893..7ed4491d0df66 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll @@ -89,6 +89,7 @@ define amdgpu_kernel void @private_last_use_and_volatile_load(ptr addrspace(5) % ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_load_b32 v1, off, s2 th:TH_LOAD_BYPASS scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll index 6c19722ad6e33..e5dba27b0acc9 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll @@ -1080,6 +1080,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_load_b32 v1, off, s2 th:TH_LOAD_NT scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll index 7c23b76cec3e9..811a10cbe3ba5 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll @@ -162,6 +162,7 @@ define amdgpu_kernel void @private_volatile_load_0( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_load_b32 v1, off, s2 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 @@ -357,6 +358,7 @@ define amdgpu_kernel void @private_volatile_load_1( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_mov_b32 s3, 0x3ff ; GFX1250-NEXT: v_and_b32_e64 v1, v1, s3 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: scratch_load_b32 v1, v1, s2 scale_offset scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 @@ -521,6 
+523,7 @@ define amdgpu_kernel void @private_volatile_store_0( ; GFX1250-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -706,6 +709,7 @@ define amdgpu_kernel void @private_volatile_store_1( ; GFX1250-NEXT: v_and_b32_e64 v1, v0, s2 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: scratch_store_b32 v1, v0, s0 scale_offset scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll index c1764c94ea2de..4a0ce202e124a 100644 --- a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll @@ -352,6 +352,7 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) inreg %out, ptr ad ; GFX1250-NEXT: v_mov_b32_e32 v2, s5 ; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v2, s[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -410,6 +411,7 @@ define amdgpu_kernel void @byref_staggered_preload_arg(ptr addrspace(1) inreg %o ; GFX1250-NEXT: v_mov_b32_e32 v2, s5 ; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v2, s[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/spillv16.ll b/llvm/test/CodeGen/AMDGPU/spillv16.ll index 9686c9d30b97c..c16793675f6a2 100644 --- a/llvm/test/CodeGen/AMDGPU/spillv16.ll +++ b/llvm/test/CodeGen/AMDGPU/spillv16.ll @@ -71,6 +71,7 @@ define void @spill_i16_alu() { ; GFX1250-TRUE16-NEXT: scratch_load_u16 v1, off, s32 offset:2 th:TH_LOAD_LU ; 2-byte Folded Reload ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-TRUE16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS ; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] @@ -87,6 +88,7 @@ define void @spill_i16_alu() { ; GFX1250-FAKE16-NEXT: ;;#ASMSTART ; GFX1250-FAKE16-NEXT: ;;#ASMEND ; GFX1250-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:4 th:TH_LOAD_LU ; 4-byte Folded Reload +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS ; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 @@ -215,8 +217,10 @@ define void @spill_i16_alu_two_vals() { ; GFX1250-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-TRUE16-NEXT: scratch_store_d16_hi_b16 off, v0, s32 scope:SCOPE_SYS ; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:4 scope:SCOPE_SYS ; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] @@ -236,9 +240,11 @@ define void @spill_i16_alu_two_vals() { ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: scratch_load_b32 v1, off, s32 offset:8 th:TH_LOAD_LU ; 4-byte Folded 
Reload ; GFX1250-FAKE16-NEXT: v_add_nc_u16 v0, 0x7b, v0 +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: scratch_store_b16 off, v1, s32 scope:SCOPE_SYS ; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-FAKE16-NEXT: scratch_store_b16 off, v0, s32 offset:4 scope:SCOPE_SYS ; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] @@ -325,6 +331,7 @@ define void @spill_i16() { ; GFX1250-NEXT: ;;#ASMSTART ; GFX1250-NEXT: ;;#ASMEND ; GFX1250-NEXT: scratch_load_b32 v0, off, s32 offset:4 th:TH_LOAD_LU ; 4-byte Folded Reload +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -403,6 +410,7 @@ define void @spill_half() { ; GFX1250-NEXT: ;;#ASMSTART ; GFX1250-NEXT: ;;#ASMEND ; GFX1250-NEXT: scratch_load_b32 v0, off, s32 offset:4 th:TH_LOAD_LU ; 4-byte Folded Reload +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -481,6 +489,7 @@ define void @spill_i16_from_v2i16() { ; GFX1250-NEXT: ;;#ASMSTART ; GFX1250-NEXT: ;;#ASMEND ; GFX1250-NEXT: scratch_load_b32 v0, off, s32 offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: scratch_store_b16 off, v0, s32 offset:2 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -584,6 +593,7 @@ define void @spill_2xi16_from_v2i16() { ; GFX1250-TRUE16-NEXT: ;;#ASMSTART ; GFX1250-TRUE16-NEXT: ;;#ASMEND ; GFX1250-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:12 th:TH_LOAD_LU ; 4-byte Folded Reload +; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 scope:SCOPE_SYS ; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 @@ -608,6 +618,7 @@ define void @spill_2xi16_from_v2i16() { ; GFX1250-FAKE16-NEXT: ;;#ASMSTART ; GFX1250-FAKE16-NEXT: ;;#ASMEND ; GFX1250-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 scope:SCOPE_SYS ; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 @@ -707,6 +718,7 @@ define void @spill_2xi16_from_v2i16_one_free_reg() { ; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-TRUE16-NEXT: scratch_load_u16 v7, off, s32 offset:2 scope:SCOPE_SYS ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-TRUE16-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill @@ -728,6 +740,7 @@ define void @spill_2xi16_from_v2i16_one_free_reg() { ; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-FAKE16-NEXT: scratch_load_u16 v7, off, s32 offset:2 scope:SCOPE_SYS ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-FAKE16-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill @@ -806,6 +819,7 @@ define void @spill_v2i16() { ; GFX1250-NEXT: ;;#ASMSTART ; GFX1250-NEXT: ;;#ASMEND ; GFX1250-NEXT: scratch_load_b32 v0, off, s32 offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload 
+; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s32 offset:4 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.mir b/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.mir index 1e9d4dda61bf3..1ce2166f9f470 100644 --- a/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.mir +++ b/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.mir @@ -49,6 +49,7 @@ body: | ; GFX1250-LABEL: name: generic_store_volatile ; GFX1250: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX1250-NEXT: {{ $}} + ; GFX1250-NEXT: S_WAIT_XCNT_soft 0 ; GFX1250-NEXT: GLOBAL_STORE_DWORD killed renamable $vgpr2_vgpr3, killed renamable $vgpr0, 0, 24, implicit $exec :: (volatile store (s32), addrspace 1) ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0 ; GFX1250-NEXT: S_ENDPGM 0