diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index b008354cfd462..3d0a986fec138 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -895,6 +895,12 @@ def FeatureCvtFP8VOP1Bug : SubtargetFeature<"cvt-fp8-vop1-bug", [FeatureFP8ConversionInsts] >; +def FeatureWriteCombiningMissesHazards : SubtargetFeature<"write-combining-misses-hazards", + "HasWriteCombiningMissesHazards", + "true", + "Write combining misses hazards that require s_wait_xcnt(0) before every atomic operation" +>; + def FeaturePkFmacF16Inst : SubtargetFeature<"pk-fmac-f16-inst", "HasPkFmacF16Inst", "true", @@ -2145,6 +2151,7 @@ def FeatureISAVersion12_50 : FeatureSet< FeatureXNACK, FeatureClusters, FeatureD16Writes32BitVgpr, + FeatureWriteCombiningMissesHazards, ]>; def FeatureISAVersion12_51 : FeatureSet< @@ -2945,6 +2952,8 @@ def HasGWS : Predicate<"Subtarget->hasGWS()">; def HasCvtFP8VOP1Bug : Predicate<"Subtarget->hasCvtFP8VOP1Bug()">; def HasNoCvtFP8VOP1Bug : Predicate<"!Subtarget->hasCvtFP8VOP1Bug()">; +def HasWriteCombiningMissesHazards : Predicate<"Subtarget->hasWriteCombiningMissesHazards()">; + def HasAtomicCSubNoRtnInsts : Predicate<"Subtarget->hasAtomicCSubNoRtnInsts()">; def HasScalarDwordx3Loads : Predicate<"Subtarget->hasScalarDwordx3Loads()">; diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index f377b8aaf1333..52ca334f71bd4 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -282,7 +282,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool HasPointSampleAccel = false; bool HasLdsBarrierArriveAtomic = false; bool HasSetPrioIncWgInst = false; - + bool HasWriteCombiningMissesHazards = false; bool RequiresCOV6 = false; bool UseBlockVGPROpsForCSR = false; bool HasGloballyAddressableScratch = false; @@ -1834,6 +1834,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, return getGeneration() == GFX12; } + bool 
hasWriteCombiningMissesHazards() const { + return HasWriteCombiningMissesHazards; + } + // Requires s_wait_alu(0) after s102/s103 write and src_flat_scratch_base // read. bool hasScratchBaseForwardingHazard() const { diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 306d59d0867cd..25280eef6adb9 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -2086,6 +2086,12 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, // Verify that the wait is actually needed. ScoreBrackets.simplifyWaitcnt(Wait); + // Force s_wait_xcnt(0) before every atomic VMEM store/RMW, after the wait was + // simplified above, to work around the write combining misses hazard. + if (ST->hasWriteCombiningMissesHazards() && SIInstrInfo::isAtomic(MI) && + SIInstrInfo::isVMEM(MI)) + Wait.XCnt = 0; + // When forcing emit, we need to skip terminators because that would break the // terminators of the MBB if we emit a waitcnt between terminators. 
if (ForceEmitZeroFlag && !MI.isTerminator()) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll index 7e297f46a780e..45843444143dc 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll @@ -1764,6 +1764,7 @@ define double @global_atomic_fadd_f64_rtn_pat(ptr addrspace(1) %ptr, double %dat ; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -1802,6 +1803,7 @@ define double @global_atomic_fadd_f64_rtn_pat_agent(ptr addrspace(1) %ptr, doubl ; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -1842,6 +1844,7 @@ define double @global_atomic_fadd_f64_rtn_pat_system(ptr addrspace(1) %ptr, doub ; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -2088,6 +2091,7 @@ define double @flat_atomic_fadd_f64_rtn_pat(ptr %ptr) #1 { ; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; 
GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -2126,6 +2130,7 @@ define double @flat_atomic_fadd_f64_rtn_pat_agent(ptr %ptr) #1 { ; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -2168,6 +2173,7 @@ define double @flat_atomic_fadd_f64_rtn_pat_system(ptr %ptr) #1 { ; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS diff --git a/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll b/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll index 54871a622189b..d159746726442 100644 --- a/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll +++ b/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll @@ -6,6 +6,7 @@ define float @global_system_atomic_fadd_f32(ptr addrspace(1) %ptr, float %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -18,6 +19,7 @@ define float @global_one_as_atomic_fadd_f32(ptr addrspace(1) %ptr, float %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -30,6 +32,7 @@ define double 
@global_system_atomic_fadd_f64(ptr addrspace(1) %ptr, double %val) ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -42,6 +45,7 @@ define double @global_one_as_atomic_fadd_f64(ptr addrspace(1) %ptr, double %val) ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -54,6 +58,7 @@ define float @global_system_atomic_fmin_f32(ptr addrspace(1) %ptr, float %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -66,6 +71,7 @@ define float @global_one_as_atomic_fmin_f32(ptr addrspace(1) %ptr, float %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -78,6 +84,7 @@ define double @global_system_atomic_fmin_f64(ptr addrspace(1) %ptr, double %val) ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_min_num_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -90,6 +97,7 @@ define 
double @global_one_as_atomic_fmin_f64(ptr addrspace(1) %ptr, double %val) ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_min_num_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -102,6 +110,7 @@ define float @global_system_atomic_fmax_f32(ptr addrspace(1) %ptr, float %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -114,6 +123,7 @@ define float @global_one_as_atomic_fmax_f32(ptr addrspace(1) %ptr, float %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -126,6 +136,7 @@ define double @global_system_atomic_fmax_f64(ptr addrspace(1) %ptr, double %val) ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_max_num_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -138,6 +149,7 @@ define double @global_one_as_atomic_fmax_f64(ptr addrspace(1) %ptr, double %val) ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_max_num_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ 
-150,6 +162,7 @@ define i32 @global_one_as_atomic_min_i32(ptr addrspace(1) %ptr, i32 %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_min_i32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -162,6 +175,7 @@ define i32 @global_system_atomic_min_i32(ptr addrspace(1) %ptr, i32 %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_min_i32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -174,6 +188,7 @@ define i32 @global_one_as_atomic_max_i32(ptr addrspace(1) %ptr, i32 %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_max_i32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -186,6 +201,7 @@ define i32 @global_system_atomic_max_i32(ptr addrspace(1) %ptr, i32 %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_max_i32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -198,6 +214,7 @@ define i32 @global_one_as_atomic_umin_i32(ptr addrspace(1) %ptr, i32 %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_min_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -210,6 +227,7 @@ define i32 
@global_system_atomic_umin_i32(ptr addrspace(1) %ptr, i32 %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_min_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -222,6 +240,7 @@ define i32 @global_one_as_atomic_umax_i32(ptr addrspace(1) %ptr, i32 %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_max_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -234,6 +253,7 @@ define i32 @global_system_atomic_umax_i32(ptr addrspace(1) %ptr, i32 %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_max_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -246,6 +266,7 @@ define i64 @global_one_as_atomic_min_i64(ptr addrspace(1) %ptr, i64 %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_min_i64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -258,6 +279,7 @@ define i64 @global_system_atomic_min_i64(ptr addrspace(1) %ptr, i64 %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_min_i64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -270,6 +292,7 @@ define i64 
@global_one_as_atomic_max_i64(ptr addrspace(1) %ptr, i64 %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_max_i64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -282,6 +305,7 @@ define i64 @global_system_atomic_max_i64(ptr addrspace(1) %ptr, i64 %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_max_i64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -294,6 +318,7 @@ define i64 @global_one_as_atomic_umin_i64(ptr addrspace(1) %ptr, i64 %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_min_u64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -306,6 +331,7 @@ define i64 @global_system_atomic_umin_i64(ptr addrspace(1) %ptr, i64 %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_min_u64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -318,6 +344,7 @@ define i64 @global_one_as_atomic_umax_i64(ptr addrspace(1) %ptr, i64 %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_max_u64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -330,6 +357,7 @@ define i64 
@global_system_atomic_umax_i64(ptr addrspace(1) %ptr, i64 %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_max_u64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -514,6 +542,7 @@ define float @flat_system_atomic_fadd_f32(ptr %ptr, float %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -526,6 +555,7 @@ define float @flat_one_as_atomic_fadd_f32(ptr %ptr, float %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -562,6 +592,7 @@ define double @flat_system_atomic_fadd_f64(ptr %ptr, double %val) { ; GFX1250-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX1250-NEXT: s_cbranch_execz .LBB34_5 ; GFX1250-NEXT: ; %bb.4: ; %atomicrmw.global +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[4:5], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -627,6 +658,7 @@ define double @flat_one_as_atomic_fadd_f64(ptr %ptr, double %val) { ; GFX1250-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX1250-NEXT: s_cbranch_execz .LBB35_5 ; GFX1250-NEXT: ; %bb.4: ; %atomicrmw.global +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[4:5], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-NEXT: ; implicit-def: 
$vgpr2_vgpr3 @@ -668,6 +700,7 @@ define float @flat_system_atomic_fmin_f32(ptr %ptr, float %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -680,6 +713,7 @@ define float @flat_one_as_atomic_fmin_f32(ptr %ptr, float %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -709,6 +743,7 @@ define double @flat_system_atomic_fmin_f64(ptr %ptr, double %val) { ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; GFX1250-NEXT: .LBB38_3: ; %atomicrmw.global +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_min_num_f64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -755,6 +790,7 @@ define double @flat_one_as_atomic_fmin_f64(ptr %ptr, double %val) { ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; GFX1250-NEXT: .LBB39_3: ; %atomicrmw.global +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_min_num_f64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -784,6 +820,7 @@ define float @flat_system_atomic_fmax_f32(ptr %ptr, float %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; 
GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -796,6 +833,7 @@ define float @flat_one_as_atomic_fmax_f32(ptr %ptr, float %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -825,6 +863,7 @@ define double @flat_system_atomic_fmax_f64(ptr %ptr, double %val) { ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; GFX1250-NEXT: .LBB42_3: ; %atomicrmw.global +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_max_num_f64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -871,6 +910,7 @@ define double @flat_one_as_atomic_fmax_f64(ptr %ptr, double %val) { ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; GFX1250-NEXT: .LBB43_3: ; %atomicrmw.global +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_max_num_f64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -900,6 +940,7 @@ define i32 @flat_one_as_atomic_min_i32(ptr %ptr, i32 %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_min_i32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -912,6 +953,7 @@ define i32 @flat_system_atomic_min_i32(ptr %ptr, i32 %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_min_i32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 
; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -924,6 +966,7 @@ define i32 @flat_one_as_atomic_max_i32(ptr %ptr, i32 %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_max_i32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -936,6 +979,7 @@ define i32 @flat_system_atomic_max_i32(ptr %ptr, i32 %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_max_i32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -948,6 +992,7 @@ define i32 @flat_one_as_atomic_umin_i32(ptr %ptr, i32 %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_min_u32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -960,6 +1005,7 @@ define i32 @flat_system_atomic_umin_i32(ptr %ptr, i32 %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_min_u32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -972,6 +1018,7 @@ define i32 @flat_one_as_atomic_umax_i32(ptr %ptr, i32 %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_max_u32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -984,6 +1031,7 @@ define i32 @flat_system_atomic_umax_i32(ptr %ptr, i32 %val) { ; 
GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_max_u32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -1004,6 +1052,7 @@ define i64 @flat_one_as_atomic_min_i64(ptr %ptr, i64 %val) { ; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB52_2 ; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_min_i64 v[4:5], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -1044,6 +1093,7 @@ define i64 @flat_system_atomic_min_i64(ptr %ptr, i64 %val) { ; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB53_2 ; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_min_i64 v[4:5], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -1084,6 +1134,7 @@ define i64 @flat_one_as_atomic_max_i64(ptr %ptr, i64 %val) { ; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB54_2 ; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_max_i64 v[4:5], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -1124,6 +1175,7 @@ define i64 @flat_system_atomic_max_i64(ptr %ptr, i64 %val) { ; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB55_2 ; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_max_i64 v[4:5], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-NEXT: ; implicit-def: 
$vgpr2_vgpr3 @@ -1164,6 +1216,7 @@ define i64 @flat_one_as_atomic_umin_i64(ptr %ptr, i64 %val) { ; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB56_2 ; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_min_u64 v[4:5], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -1204,6 +1257,7 @@ define i64 @flat_system_atomic_umin_i64(ptr %ptr, i64 %val) { ; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB57_2 ; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_min_u64 v[4:5], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -1244,6 +1298,7 @@ define i64 @flat_one_as_atomic_umax_i64(ptr %ptr, i64 %val) { ; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB58_2 ; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_max_u64 v[4:5], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -1284,6 +1339,7 @@ define i64 @flat_system_atomic_umax_i64(ptr %ptr, i64 %val) { ; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB59_2 ; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_max_u64 v[4:5], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll index 357234080235a..fcad89f01ee92 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll @@ -12,6 
+12,7 @@ define amdgpu_ps void @flat_xchg_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250: ; %bb.0: ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -51,6 +52,7 @@ define amdgpu_ps void @flat_xchg_saddr_i32_nortn_offset_2047(ptr inreg %sbase, i ; GFX1250: ; %bb.0: ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[2:3] offset:2047 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -91,6 +93,7 @@ define amdgpu_ps void @flat_xchg_saddr_i32_nortn_offset_neg2048(ptr inreg %sbase ; GFX1250: ; %bb.0: ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[2:3] offset:-2048 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -136,6 +139,7 @@ define amdgpu_ps float @flat_xchg_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, ; GFX1250: ; %bb.0: ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -176,6 +180,7 @@ define amdgpu_ps float @flat_xchg_saddr_i32_rtn_2048(ptr inreg %sbase, i32 %voff ; GFX1250: ; %bb.0: ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v0, v1, s[2:3] offset:2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ 
-217,6 +222,7 @@ define amdgpu_ps float @flat_xchg_saddr_i32_rtn_neg2048(ptr inreg %sbase, i32 %v ; GFX1250: ; %bb.0: ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v0, v1, s[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -276,6 +282,7 @@ define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn(i32 %voffset, i ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v3 ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_swap_b32 v0, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -292,6 +299,7 @@ define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn(i32 %voffset, i ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_swap_b32 v0, v[2:3], v1 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -343,6 +351,7 @@ define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset(i32 % ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v3 ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_swap_b32 v0, v0, v1, s[0:1] offset:42 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -359,6 +368,7 @@ define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset(i32 % ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 
0, v3, vcc_lo ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_swap_b32 v0, v[2:3], v1 offset:42 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -411,6 +421,7 @@ define amdgpu_ps void @flat_xchg_saddr_uniform_ptr_in_vgprs_nortn(i32 %voffset, ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v3 ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -426,6 +437,7 @@ define amdgpu_ps void @flat_xchg_saddr_uniform_ptr_in_vgprs_nortn(i32 %voffset, ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_swap_b32 v[2:3], v1 scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -475,6 +487,7 @@ define amdgpu_ps void @flat_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset(i32 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v3 ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] offset:42 scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -490,6 +503,7 @@ define amdgpu_ps void @flat_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset(i32 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; 
GFX1250-GISEL-NEXT: flat_atomic_swap_b32 v[2:3], v1 offset:42 scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -561,6 +575,7 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-NEXT: .LBB10_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_swap_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -607,6 +622,7 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: .LBB10_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_swap_b64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -741,6 +757,7 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-SDAG-NEXT: .LBB11_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_swap_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -790,6 +807,7 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: .LBB11_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_swap_b64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN 
scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -925,6 +943,7 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: .LBB12_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -962,6 +981,7 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: .LBB12_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_swap_b64 v0, v[4:5], s[2:3] scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -1071,6 +1091,7 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-SDAG-NEXT: .LBB13_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -1111,6 +1132,7 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL-NEXT: .LBB13_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_swap_b64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -1214,6 +1236,7 @@ define amdgpu_ps float 
@flat_add_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i ; GFX1250: ; %bb.0: ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_add_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -1254,6 +1277,7 @@ define amdgpu_ps float @flat_add_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250: ; %bb.0: ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_add_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -1301,6 +1325,7 @@ define amdgpu_ps void @flat_add_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250: ; %bb.0: ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_add_u32 v0, v1, s[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -1339,6 +1364,7 @@ define amdgpu_ps void @flat_add_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250: ; %bb.0: ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_add_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -1403,6 +1429,7 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: .LBB18_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_add_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; 
GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -1449,6 +1476,7 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: .LBB18_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_add_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -1587,6 +1615,7 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: .LBB19_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_add_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -1636,6 +1665,7 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: .LBB19_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_add_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -1775,6 +1805,7 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: .LBB20_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -1815,6 +1846,7 @@ define amdgpu_ps void 
@flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: .LBB20_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_add_u64 v0, v[4:5], s[2:3] scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -1935,6 +1967,7 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: .LBB21_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -1978,6 +2011,7 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: .LBB21_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_add_u64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -2092,6 +2126,7 @@ define amdgpu_ps float @flat_sub_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i ; GFX1250: ; %bb.0: ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_sub_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -2132,6 +2167,7 @@ define amdgpu_ps float @flat_sub_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250: ; %bb.0: ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: 
flat_atomic_sub_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -2179,6 +2215,7 @@ define amdgpu_ps void @flat_sub_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250: ; %bb.0: ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_sub_u32 v0, v1, s[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -2217,6 +2254,7 @@ define amdgpu_ps void @flat_sub_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250: ; %bb.0: ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_sub_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -2281,6 +2319,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: .LBB26_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_sub_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -2327,6 +2366,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: .LBB26_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_sub_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -2467,6 +2507,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr 
inreg %sbase, i3 ; GFX1250-SDAG-NEXT: .LBB27_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_sub_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -2516,6 +2557,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: .LBB27_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_sub_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -2657,6 +2699,7 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: .LBB28_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -2697,6 +2740,7 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: .LBB28_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_sub_u64 v0, v[4:5], s[2:3] scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -2819,6 +2863,7 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: .LBB29_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV ; 
GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -2862,6 +2907,7 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: .LBB29_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_sub_u64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -2978,6 +3024,7 @@ define amdgpu_ps float @flat_and_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i ; GFX1250: ; %bb.0: ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_and_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -3018,6 +3065,7 @@ define amdgpu_ps float @flat_and_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250: ; %bb.0: ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_and_b32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -3065,6 +3113,7 @@ define amdgpu_ps void @flat_and_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250: ; %bb.0: ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_and_b32 v0, v1, s[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -3103,6 +3152,7 @@ define amdgpu_ps void 
@flat_and_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250: ; %bb.0: ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_and_b32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -3167,6 +3217,7 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: .LBB34_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_and_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -3214,6 +3265,7 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: .LBB34_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_and_b64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -3353,6 +3405,7 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: .LBB35_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_and_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -3403,6 +3456,7 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: .LBB35_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; 
GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_and_b64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -3543,6 +3597,7 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: .LBB36_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -3584,6 +3639,7 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: .LBB36_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_and_b64 v0, v[4:5], s[2:3] scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -3705,6 +3761,7 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: .LBB37_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -3749,6 +3806,7 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: .LBB37_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_and_b64 v0, v[4:5], s[2:3] offset:-128 
scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -3864,6 +3922,7 @@ define amdgpu_ps float @flat_or_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i3 ; GFX1250: ; %bb.0: ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_or_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -3904,6 +3963,7 @@ define amdgpu_ps float @flat_or_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voff ; GFX1250: ; %bb.0: ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_or_b32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -3951,6 +4011,7 @@ define amdgpu_ps void @flat_or_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i ; GFX1250: ; %bb.0: ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_or_b32 v0, v1, s[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -3989,6 +4050,7 @@ define amdgpu_ps void @flat_or_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250: ; %bb.0: ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_or_b32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -4053,6 +4115,7 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs ; GFX1250-SDAG-NEXT: .LBB42_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 
0x0 ; GFX1250-SDAG-NEXT: flat_atomic_or_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -4100,6 +4163,7 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs ; GFX1250-GISEL-NEXT: .LBB42_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_or_b64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -4239,6 +4303,7 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 ; GFX1250-SDAG-NEXT: .LBB43_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_or_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -4289,6 +4354,7 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 ; GFX1250-GISEL-NEXT: .LBB43_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_or_b64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -4429,6 +4495,7 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i ; GFX1250-SDAG-NEXT: .LBB44_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] 
scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -4470,6 +4537,7 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i ; GFX1250-GISEL-NEXT: .LBB44_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_or_b64 v0, v[4:5], s[2:3] scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -4591,6 +4659,7 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-NEXT: .LBB45_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -4635,6 +4704,7 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: .LBB45_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_or_b64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -4750,6 +4820,7 @@ define amdgpu_ps float @flat_xor_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i ; GFX1250: ; %bb.0: ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_xor_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -4790,6 +4861,7 @@ define amdgpu_ps float @flat_xor_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof ; 
GFX1250: ; %bb.0: ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_xor_b32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -4837,6 +4909,7 @@ define amdgpu_ps void @flat_xor_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250: ; %bb.0: ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_xor_b32 v0, v1, s[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -4875,6 +4948,7 @@ define amdgpu_ps void @flat_xor_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250: ; %bb.0: ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_xor_b32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -4939,6 +5013,7 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: .LBB50_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_xor_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -4986,6 +5061,7 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: .LBB50_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_xor_b64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: 
s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -5125,6 +5201,7 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: .LBB51_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_xor_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -5175,6 +5252,7 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: .LBB51_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_xor_b64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -5315,6 +5393,7 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: .LBB52_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -5356,6 +5435,7 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: .LBB52_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_xor_b64 v0, v[4:5], s[2:3] scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -5477,6 +5557,7 @@ define amdgpu_ps void 
@flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: .LBB53_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -5521,6 +5602,7 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: .LBB53_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_xor_b64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -5634,6 +5716,7 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo define amdgpu_ps float @flat_max_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_max_saddr_i32_rtn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_max_i32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -5666,6 +5749,7 @@ define amdgpu_ps float @flat_max_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i define amdgpu_ps float @flat_max_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_max_saddr_i32_rtn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_max_i32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -5705,6 +5789,7 @@ define amdgpu_ps float @flat_max_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof define amdgpu_ps void @flat_max_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { 
; GFX1250-LABEL: flat_max_saddr_i32_nortn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_max_i32 v0, v1, s[2:3] ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -5736,6 +5821,7 @@ define amdgpu_ps void @flat_max_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, define amdgpu_ps void @flat_max_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_max_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_max_i32 v0, v1, s[2:3] offset:-128 ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -5792,6 +5878,7 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_branch .LBB58_5 ; GFX1250-SDAG-NEXT: .LBB58_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_max_i64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -5834,6 +5921,7 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB58_5 ; GFX1250-GISEL-NEXT: .LBB58_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_max_i64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 @@ -5970,6 +6058,7 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_branch .LBB59_5 ; GFX1250-SDAG-NEXT: .LBB59_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_max_i64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-SDAG-NEXT: ; implicit-def: 
$vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -6015,6 +6104,7 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB59_5 ; GFX1250-GISEL-NEXT: .LBB59_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_max_i64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 @@ -6153,6 +6243,7 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: .LBB60_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB60_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -6190,6 +6281,7 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: .LBB60_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB60_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_max_i64 v0, v[4:5], s[2:3] ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 @@ -6307,6 +6399,7 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: .LBB61_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB61_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -6347,6 +6440,7 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: .LBB61_2: ; %atomicrmw.phi ; 
GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB61_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_max_i64 v0, v[4:5], s[2:3] offset:-128 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 @@ -6458,6 +6552,7 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo define amdgpu_ps float @flat_min_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_min_saddr_i32_rtn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_min_i32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -6490,6 +6585,7 @@ define amdgpu_ps float @flat_min_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i define amdgpu_ps float @flat_min_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_min_saddr_i32_rtn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_min_i32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -6529,6 +6625,7 @@ define amdgpu_ps float @flat_min_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof define amdgpu_ps void @flat_min_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_min_saddr_i32_nortn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_min_i32 v0, v1, s[2:3] ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -6560,6 +6657,7 @@ define amdgpu_ps void @flat_min_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, define amdgpu_ps void @flat_min_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_min_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_min_i32 v0, v1, s[2:3] offset:-128 ; 
GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -6616,6 +6714,7 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_branch .LBB66_5 ; GFX1250-SDAG-NEXT: .LBB66_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_min_i64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -6658,6 +6757,7 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB66_5 ; GFX1250-GISEL-NEXT: .LBB66_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_min_i64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 @@ -6794,6 +6894,7 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_branch .LBB67_5 ; GFX1250-SDAG-NEXT: .LBB67_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_min_i64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -6839,6 +6940,7 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB67_5 ; GFX1250-GISEL-NEXT: .LBB67_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_min_i64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 @@ -6977,6 +7079,7 @@ define amdgpu_ps void 
@flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: .LBB68_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB68_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -7014,6 +7117,7 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: .LBB68_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB68_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_min_i64 v0, v[4:5], s[2:3] ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 @@ -7131,6 +7235,7 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: .LBB69_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB69_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -7171,6 +7276,7 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: .LBB69_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB69_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_min_i64 v0, v[4:5], s[2:3] offset:-128 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 @@ -7282,6 +7388,7 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo define amdgpu_ps float @flat_umax_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_umax_saddr_i32_rtn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: 
flat_atomic_max_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -7314,6 +7421,7 @@ define amdgpu_ps float @flat_umax_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, define amdgpu_ps float @flat_umax_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_umax_saddr_i32_rtn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_max_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -7353,6 +7461,7 @@ define amdgpu_ps float @flat_umax_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vo define amdgpu_ps void @flat_umax_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_umax_saddr_i32_nortn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_max_u32 v0, v1, s[2:3] ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -7384,6 +7493,7 @@ define amdgpu_ps void @flat_umax_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, define amdgpu_ps void @flat_umax_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_umax_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_max_u32 v0, v1, s[2:3] offset:-128 ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -7440,6 +7550,7 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_branch .LBB74_5 ; GFX1250-SDAG-NEXT: .LBB74_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_max_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -7482,6 +7593,7 @@ define amdgpu_ps <2 x float> 
@flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB74_5 ; GFX1250-GISEL-NEXT: .LBB74_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_max_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 @@ -7618,6 +7730,7 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_branch .LBB75_5 ; GFX1250-SDAG-NEXT: .LBB75_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_max_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -7663,6 +7776,7 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB75_5 ; GFX1250-GISEL-NEXT: .LBB75_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_max_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 @@ -7801,6 +7915,7 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: .LBB76_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB76_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -7838,6 +7953,7 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: .LBB76_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: 
.LBB76_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_max_u64 v0, v[4:5], s[2:3] ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 @@ -7955,6 +8071,7 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-SDAG-NEXT: .LBB77_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB77_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -7995,6 +8112,7 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL-NEXT: .LBB77_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB77_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_max_u64 v0, v[4:5], s[2:3] offset:-128 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 @@ -8106,6 +8224,7 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v define amdgpu_ps float @flat_umin_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_umin_saddr_i32_rtn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_min_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -8138,6 +8257,7 @@ define amdgpu_ps float @flat_umin_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, define amdgpu_ps float @flat_umin_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_umin_saddr_i32_rtn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_min_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; 
return to shader part epilog @@ -8177,6 +8297,7 @@ define amdgpu_ps float @flat_umin_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vo define amdgpu_ps void @flat_umin_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_umin_saddr_i32_nortn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_min_u32 v0, v1, s[2:3] ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -8208,6 +8329,7 @@ define amdgpu_ps void @flat_umin_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, define amdgpu_ps void @flat_umin_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_umin_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_min_u32 v0, v1, s[2:3] offset:-128 ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -8264,6 +8386,7 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_branch .LBB82_5 ; GFX1250-SDAG-NEXT: .LBB82_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_min_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -8306,6 +8429,7 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB82_5 ; GFX1250-GISEL-NEXT: .LBB82_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_min_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 @@ -8442,6 +8566,7 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_branch 
.LBB83_5 ; GFX1250-SDAG-NEXT: .LBB83_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_min_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -8487,6 +8612,7 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB83_5 ; GFX1250-GISEL-NEXT: .LBB83_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_min_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 @@ -8625,6 +8751,7 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: .LBB84_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB84_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_min_u64 v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -8662,6 +8789,7 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: .LBB84_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB84_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_min_u64 v0, v[4:5], s[2:3] ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 @@ -8779,6 +8907,7 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-SDAG-NEXT: .LBB85_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB85_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_min_u64 v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; 
GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -8819,6 +8948,7 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL-NEXT: .LBB85_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB85_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_min_u64 v0, v[4:5], s[2:3] offset:-128 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 @@ -8933,6 +9063,7 @@ define amdgpu_ps float @flat_cmpxchg_saddr_i32_rtn(ptr inreg %sbase, i32 %voffse ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v0, v[2:3], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -8976,6 +9107,7 @@ define amdgpu_ps float @flat_cmpxchg_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v0, v[2:3], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -9026,6 +9158,7 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i32_nortn(ptr inreg %sbase, i32 %voffs ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -9066,6 +9199,7 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 
+; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[2:3] offset:-128 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -9132,6 +9266,7 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 % ; GFX1250-SDAG-NEXT: .LBB90_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[2:3], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_SYS @@ -9180,6 +9315,7 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 % ; GFX1250-GISEL-NEXT: .LBB90_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_cmpswap_b64 v[0:1], v5, v[6:9], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_SYS @@ -9329,6 +9465,7 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase ; GFX1250-SDAG-NEXT: .LBB91_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[2:3], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_SYS @@ -9380,6 +9517,7 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase ; GFX1250-GISEL-NEXT: .LBB91_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_cmpswap_b64 v[0:1], v5, v[6:9], 
s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_SYS @@ -9530,6 +9668,7 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs ; GFX1250-SDAG-NEXT: .LBB92_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:7] scope:SCOPE_SYS ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_SYS @@ -9572,6 +9711,7 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs ; GFX1250-GISEL-NEXT: .LBB92_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_cmpswap_b64 v0, v[6:9], s[2:3] scope:SCOPE_SYS ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_SYS @@ -9702,6 +9842,7 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 ; GFX1250-SDAG-NEXT: .LBB93_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:7] scope:SCOPE_SYS ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_SYS @@ -9747,6 +9888,7 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 ; GFX1250-GISEL-NEXT: .LBB93_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_cmpswap_b64 v0, v[6:9], s[2:3] offset:-128 scope:SCOPE_SYS ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_SYS @@ -9868,6 
+10010,7 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 define amdgpu_ps float @flat_inc_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_inc_saddr_i32_rtn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_inc_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -9900,6 +10043,7 @@ define amdgpu_ps float @flat_inc_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i define amdgpu_ps float @flat_inc_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_inc_saddr_i32_rtn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_inc_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -9939,6 +10083,7 @@ define amdgpu_ps float @flat_inc_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof define amdgpu_ps void @flat_inc_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_inc_saddr_i32_nortn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_inc_u32 v0, v1, s[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm ; @@ -9967,6 +10112,7 @@ define amdgpu_ps void @flat_inc_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, define amdgpu_ps void @flat_inc_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_inc_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_inc_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm ; @@ -10021,6 +10167,7 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB98_5 ; GFX1250-SDAG-NEXT: .LBB98_3: ; %atomicrmw.global +; 
GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_inc_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -10067,6 +10214,7 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB98_5 ; GFX1250-GISEL-NEXT: .LBB98_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_inc_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 @@ -10213,6 +10361,7 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB99_5 ; GFX1250-SDAG-NEXT: .LBB99_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_inc_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -10262,6 +10411,7 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB99_5 ; GFX1250-GISEL-NEXT: .LBB99_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_inc_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 @@ -10409,6 +10559,7 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: .LBB100_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB100_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: 
flat_atomic_inc_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -10447,6 +10598,7 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: .LBB100_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB100_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_inc_u64 v0, v[4:5], s[2:3] scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 @@ -10569,6 +10721,7 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: .LBB101_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB101_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -10610,6 +10763,7 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: .LBB101_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB101_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_inc_u64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 @@ -10727,6 +10881,7 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo define amdgpu_ps float @flat_dec_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_dec_saddr_i32_rtn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_dec_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -10759,6 
+10914,7 @@ define amdgpu_ps float @flat_dec_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i define amdgpu_ps float @flat_dec_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_dec_saddr_i32_rtn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_dec_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog @@ -10798,6 +10954,7 @@ define amdgpu_ps float @flat_dec_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof define amdgpu_ps void @flat_dec_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_dec_saddr_i32_nortn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_dec_u32 v0, v1, s[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm ; @@ -10826,6 +10983,7 @@ define amdgpu_ps void @flat_dec_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, define amdgpu_ps void @flat_dec_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_dec_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_dec_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm ; @@ -10880,6 +11038,7 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB106_5 ; GFX1250-SDAG-NEXT: .LBB106_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_dec_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -10929,6 +11088,7 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB106_5 ; GFX1250-GISEL-NEXT: .LBB106_3: ; 
%atomicrmw.global +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_dec_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 @@ -11078,6 +11238,7 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB107_5 ; GFX1250-SDAG-NEXT: .LBB107_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_dec_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -11130,6 +11291,7 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB107_5 ; GFX1250-GISEL-NEXT: .LBB107_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_dec_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 @@ -11280,6 +11442,7 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: .LBB108_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB108_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -11321,6 +11484,7 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: .LBB108_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB108_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: 
flat_atomic_dec_u64 v0, v[4:5], s[2:3] scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 @@ -11446,6 +11610,7 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: .LBB109_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB109_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 @@ -11490,6 +11655,7 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: .LBB109_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB109_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_dec_u64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 @@ -11624,6 +11790,7 @@ define double @flat_atomic_fadd_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: global_atomic_add_f64 v[2:3], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB110_5 @@ -11678,6 +11845,7 @@ define double @flat_atomic_fadd_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-GISEL-NEXT: s_mov_b32 s2, 0 ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: global_atomic_add_f64 v[2:3], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: .LBB110_3: ; %Flow @@ -11845,6 +12013,7 @@ define void 
@flat_atomic_fadd_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-SDAG-NEXT: s_mov_b32 s2, 0 ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: .LBB111_5: ; %Flow @@ -11891,6 +12060,7 @@ define void @flat_atomic_fadd_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-GISEL-NEXT: s_mov_b32 s2, 0 ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: .LBB111_3: ; %Flow @@ -12038,6 +12208,7 @@ define double @flat_atomic_fmax_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_max_num_f64 v[2:3], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB112_3 @@ -12076,6 +12247,7 @@ define double @flat_atomic_fmax_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-GISEL-NEXT: s_mov_b32 s4, 0 ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_max_num_f64 v[2:3], v2, v[0:1], s[0:1] offset:80 th:TH_ATOMIC_RETURN ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: .LBB112_2: ; %Flow @@ -12192,6 +12364,7 @@ define void @flat_atomic_fmax_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: .LBB113_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: 
flat_atomic_max_num_f64 v2, v[0:1], s[0:1] ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB113_2 @@ -12224,6 +12397,7 @@ define void @flat_atomic_fmax_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-GISEL-NEXT: s_mov_b32 s4, 0 ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_max_num_f64 v2, v[0:1], s[0:1] offset:80 ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: .LBB113_2: ; %Flow @@ -12331,6 +12505,7 @@ define double @flat_atomic_fmin_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_min_num_f64 v[2:3], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB114_3 @@ -12369,6 +12544,7 @@ define double @flat_atomic_fmin_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-GISEL-NEXT: s_mov_b32 s4, 0 ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_min_num_f64 v[2:3], v2, v[0:1], s[0:1] offset:80 th:TH_ATOMIC_RETURN ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: .LBB114_2: ; %Flow @@ -12485,6 +12661,7 @@ define void @flat_atomic_fmin_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: .LBB115_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_min_num_f64 v2, v[0:1], s[0:1] ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB115_2 @@ -12517,6 +12694,7 @@ define void @flat_atomic_fmin_f64_saddr_nortn(ptr inreg %ptr, 
double %data) { ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-GISEL-NEXT: s_mov_b32 s4, 0 ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_min_num_f64 v2, v[0:1], s[0:1] offset:80 ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: .LBB115_2: ; %Flow @@ -12615,6 +12793,7 @@ define float @flat_atomic_fadd_f32_saddr_rtn(ptr inreg %ptr, float %data) { ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f32 v0, v1, v0, s[0:1] offset:40 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -12647,6 +12826,7 @@ define void @flat_atomic_fadd_f32_saddr_nortn(ptr inreg %ptr, float %data) { ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f32 v1, v0, s[0:1] offset:40 ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -12679,6 +12859,7 @@ define float @flat_atomic_fmax_f32_saddr_rtn(ptr inreg %ptr, float %data) { ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_max_num_f32 v0, v1, v0, s[0:1] offset:40 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -12742,6 +12923,7 @@ define void @flat_atomic_fmax_f32_saddr_nortn(ptr inreg %ptr, float %data) { ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_max_num_f32 v1, v0, s[0:1] offset:40 ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -12804,6 +12986,7 @@ define float @flat_atomic_fmin_f32_saddr_rtn(ptr inreg %ptr, 
float %data) { ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_min_num_f32 v0, v1, v0, s[0:1] offset:40 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -12867,6 +13050,7 @@ define void @flat_atomic_fmin_f32_saddr_nortn(ptr inreg %ptr, float %data) { ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_min_num_f32 v1, v0, s[0:1] offset:40 ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -12929,6 +13113,7 @@ define <2 x half> @flat_atomic_fadd_v2f16_saddr_rtn(ptr inreg %ptr, <2 x half> % ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_pk_add_f16 v0, v1, v0, s[0:1] offset:40 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -12961,6 +13146,7 @@ define void @flat_atomic_fadd_v2f16_saddr_nortn(ptr inreg %ptr, <2 x half> %data ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_pk_add_f16 v1, v0, s[0:1] offset:40 ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -13325,6 +13511,7 @@ define <2 x bfloat> @flat_atomic_fadd_v2bf16_saddr_rtn(ptr inreg %ptr, <2 x bflo ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_pk_add_bf16 v0, v1, v0, s[0:1] offset:40 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -13349,6 +13536,7 @@ define void @flat_atomic_fadd_v2bf16_saddr_nortn(ptr inreg 
%ptr, <2 x bfloat> %d ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_pk_add_bf16 v1, v0, s[0:1] offset:40 ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll index ba761bedb905c..033d4b8936472 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll @@ -123,6 +123,7 @@ define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) { ; GFX1250-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll index 6484c2f82ff94..6555b32e4244a 100644 --- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll @@ -1645,6 +1645,7 @@ define double @global_atomic_fadd_f64_rtn_pat(ptr addrspace(1) %ptr, double %dat ; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -1683,6 +1684,7 @@ define double @global_atomic_fadd_f64_rtn_pat_agent(ptr addrspace(1) %ptr, doubl ; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off 
th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -1723,6 +1725,7 @@ define double @global_atomic_fadd_f64_rtn_pat_system(ptr addrspace(1) %ptr, doub ; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -1950,6 +1953,7 @@ define double @flat_atomic_fadd_f64_rtn_pat(ptr %ptr) #1 { ; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -1988,6 +1992,7 @@ define double @flat_atomic_fadd_f64_rtn_pat_agent(ptr %ptr) #1 { ; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -2030,6 +2035,7 @@ define double @flat_atomic_fadd_f64_rtn_pat_system(ptr %ptr) #1 { ; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll index a50791e10f5a2..37e0ff2f71226 100644 --- 
a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll @@ -24,6 +24,7 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -221,6 +222,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -420,6 +422,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -629,6 +632,7 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -820,6 +824,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; 
GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -1014,6 +1019,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -1216,6 +1222,7 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -1418,6 +1425,7 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -1615,6 +1623,7 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote(ptr addrspace(1) %pt ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -1830,6 +1839,7 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denor ; GFX1250-NEXT: 
s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -2045,6 +2055,7 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_deno ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -2253,6 +2264,7 @@ define float @global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory(p ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -2452,6 +2464,7 @@ define float @global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__ ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -2637,6 +2650,7 @@ define float @global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode(ptr a ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv 
scope:SCOPE_DEV @@ -2852,6 +2866,7 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote(ptr addrspace(1) %p ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -3060,6 +3075,7 @@ define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory( ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -3254,6 +3270,7 @@ define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory_ ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -3422,6 +3439,7 @@ define void @global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -3630,6 +3648,7 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; 
GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -3843,6 +3862,7 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr add ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -4048,6 +4068,7 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -4261,6 +4282,7 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -4466,6 +4488,7 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -4663,6 +4686,7 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f32 v[0:1], 
v2, off scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -4858,6 +4882,7 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -5041,6 +5066,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -5226,6 +5252,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -5421,6 +5448,7 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -5586,6 +5614,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; 
GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -5754,6 +5783,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -5930,6 +5960,7 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -6118,6 +6149,7 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -6289,6 +6321,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_f ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -6474,6 +6507,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_ ; GFX1250-NEXT: 
s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -6642,6 +6676,7 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory(ptr ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -6855,6 +6890,7 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory(pt ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -7060,6 +7096,7 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -7243,6 +7280,7 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -7412,6 +7450,7 @@ define double 
@global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -7646,6 +7685,7 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -7881,6 +7921,7 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -8120,6 +8161,7 @@ define void @global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -8336,6 +8378,7 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off offset:2040 
scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -8555,6 +8598,7 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off offset:-2048 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -19426,6 +19470,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -19669,6 +19714,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -19914,6 +19960,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -20163,6 +20210,7 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; 
GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -20384,6 +20432,7 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -20608,6 +20657,7 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -20840,6 +20890,7 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -21088,6 +21139,7 @@ define void @global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -21315,6 +21367,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(p 
; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -21572,6 +21625,7 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr a ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -21819,6 +21873,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -22062,6 +22117,7 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -22283,6 +22339,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__maybe_remote(ptr addrspac ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -22540,6 +22597,7 @@ define 
void @global_agent_atomic_fadd_noret_v2f16__maybe_remote(ptr addrspace(1) ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -22791,6 +22849,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -23179,6 +23238,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -23569,6 +23629,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -23963,6 +24024,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off scope:SCOPE_DEV ; 
GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -24339,6 +24401,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -24718,6 +24781,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -25105,6 +25169,7 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -25498,6 +25563,7 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -25880,6 +25946,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: 
s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -26268,6 +26335,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -26644,6 +26712,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -27032,6 +27101,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -27408,6 +27478,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -27796,6 +27867,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: 
global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV diff --git a/llvm/test/CodeGen/AMDGPU/literal64.ll b/llvm/test/CodeGen/AMDGPU/literal64.ll index 20b876836082e..e39efcd30f7cc 100644 --- a/llvm/test/CodeGen/AMDGPU/literal64.ll +++ b/llvm/test/CodeGen/AMDGPU/literal64.ll @@ -68,6 +68,7 @@ define void @v_mov_b64_double(ptr addrspace(1) %ptr) { ; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: v_mov_b64_e32 v[2:3], 0x4063233333333333 +; GCN-NEXT: s_wait_xcnt 0x0 ; GCN-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off scope:SCOPE_SYS ; GCN-NEXT: s_set_pc_i64 s[30:31] %result = atomicrmw fadd ptr addrspace(1) %ptr, double 153.1 monotonic @@ -80,6 +81,7 @@ define void @v_mov_b64_int(ptr addrspace(1) %ptr) { ; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: v_mov_b64_e32 v[2:3], 0xf12345678 +; GCN-NEXT: s_wait_xcnt 0x0 ; GCN-NEXT: global_atomic_add_u64 v[0:1], v[2:3], off scope:SCOPE_SYS ; GCN-NEXT: s_set_pc_i64 s[30:31] %result = atomicrmw add ptr addrspace(1) %ptr, i64 64729929336 monotonic diff --git a/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll index a18847b56a330..55dd20f6f4720 100644 --- a/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll +++ b/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll @@ -311,6 +311,7 @@ define amdgpu_ps void @flat_atomicrmw_b32_idxprom(ptr align 4 inreg %p, i32 %idx ; GCN-LABEL: flat_atomicrmw_b32_idxprom: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: v_mov_b32_e32 v1, 1 +; GCN-NEXT: s_wait_xcnt 0x0 ; GCN-NEXT: flat_atomic_add_u32 v0, v1, s[0:1] scale_offset scope:SCOPE_SYS ; GCN-NEXT: s_endpgm entry: @@ -342,6 +343,7 @@ define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg % ; SDAG-NEXT: s_branch .LBB21_5 ; SDAG-NEXT: .LBB21_3: ; 
%atomicrmw.global ; SDAG-NEXT: v_mov_b64_e32 v[0:1], 1 +; SDAG-NEXT: s_wait_xcnt 0x0 ; SDAG-NEXT: flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; SDAG-NEXT: s_wait_xcnt 0x0 @@ -386,6 +388,7 @@ define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg % ; GISEL-NEXT: .LBB21_3: ; %atomicrmw.global ; GISEL-NEXT: v_mov_b64_e32 v[0:1], 1 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GISEL-NEXT: s_wait_xcnt 0x0 ; GISEL-NEXT: flat_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] scale_offset th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GISEL-NEXT: s_wait_xcnt 0x0 ; GISEL-NEXT: s_and_not1_saveexec_b32 s0, s2 diff --git a/llvm/test/CodeGen/AMDGPU/scale-offset-global.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-global.ll index a0fca0e2bdc72..9d49c2539eb26 100644 --- a/llvm/test/CodeGen/AMDGPU/scale-offset-global.ll +++ b/llvm/test/CodeGen/AMDGPU/scale-offset-global.ll @@ -311,6 +311,7 @@ define amdgpu_ps void @global_atomicrmw_b32_idxprom(ptr addrspace(1) align 4 inr ; GCN-LABEL: global_atomicrmw_b32_idxprom: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: v_mov_b32_e32 v1, 1 +; GCN-NEXT: s_wait_xcnt 0x0 ; GCN-NEXT: global_atomic_add_u32 v0, v1, s[0:1] scale_offset scope:SCOPE_SYS ; GCN-NEXT: s_endpgm entry: @@ -324,6 +325,7 @@ define amdgpu_ps <2 x float> @global_atomicrmw_b64_rtn_idxprom(ptr addrspace(1) ; GCN-LABEL: global_atomicrmw_b64_rtn_idxprom: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: v_mov_b64_e32 v[2:3], 1 +; GCN-NEXT: s_wait_xcnt 0x0 ; GCN-NEXT: global_atomic_add_u64 v[0:1], v0, v[2:3], s[0:1] scale_offset th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GCN-NEXT: s_wait_loadcnt 0x0 ; GCN-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir index fe16f0d44dd1c..8479f96f73672 100644 --- a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir +++ b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir @@ -486,7 +486,6 @@ body: | ; GCN-NEXT: $vgpr2 = 
BUFFER_ATOMIC_ADD_ADDR64_RTN $vgpr2, $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 1, implicit $exec :: (load store (s32), addrspace 1) ; GCN-NEXT: GLOBAL_ATOMIC_ADD_F32 $vgpr4_vgpr5, killed renamable $vgpr3, 0, 0, implicit $exec :: (load store syncscope("agent-one-as") monotonic monotonic (s32), addrspace 1) ; GCN-NEXT: $vgpr6 = SCRATCH_LOAD_DWORD $vgpr0, 0, 0, implicit $exec, implicit $flat_scr - ; GCN-NEXT: S_WAIT_XCNT 2 ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec ; GCN-NEXT: S_WAIT_XCNT 1 ; GCN-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec