diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 6dcbced010a5a..5731de82f462e 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -646,6 +646,8 @@ class WaitcntBrackets { void applyWaitcnt(const AMDGPU::Waitcnt &Wait); void applyWaitcnt(InstCounterType T, unsigned Count); + bool hasRedundantXCntWithKmCnt(const AMDGPU::Waitcnt &Wait); + bool canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait); void applyXcnt(const AMDGPU::Waitcnt &Wait); void updateByEvent(WaitEventType E, MachineInstr &MI); @@ -1287,20 +1289,26 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) { } } -void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) { +bool WaitcntBrackets::hasRedundantXCntWithKmCnt(const AMDGPU::Waitcnt &Wait) { // Wait on XCNT is redundant if we are already waiting for a load to complete. // SMEM can return out of order, so only omit XCNT wait if we are waiting till // zero. - if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP)) - return applyWaitcnt(X_CNT, 0); + return Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP); +} +bool WaitcntBrackets::canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait) { // If we have pending store we cannot optimize XCnt because we do not wait for // stores. VMEM loads retun in order, so if we only have loads XCnt is // decremented to the same number as LOADCnt. - if (Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) && - !hasPendingEvent(STORE_CNT)) - return applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt)); + return Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) && + !hasPendingEvent(STORE_CNT); +} +void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) { + if (hasRedundantXCntWithKmCnt(Wait)) + return applyWaitcnt(X_CNT, 0); + if (canOptimizeXCntWithLoadCnt(Wait)) + return applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt)); applyWaitcnt(X_CNT, Wait.XCnt); } @@ -1636,6 +1644,8 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt( } } + // Save the pre combine waitcnt in order to make xcnt checks. + AMDGPU::Waitcnt PreCombine = Wait; if (CombinedLoadDsCntInstr) { // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need // to be waited for. Otherwise, let the instruction be deleted so @@ -1726,6 +1736,12 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt( } for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) { + if ((CT == KM_CNT && ScoreBrackets.hasRedundantXCntWithKmCnt(PreCombine)) || + (CT == LOAD_CNT && + ScoreBrackets.canOptimizeXCntWithLoadCnt(PreCombine))) + // Xcnt may need to be updated depending on a pre-existing KM/LOAD_CNT + // due to taking the backedge of a block. + ScoreBrackets.applyXcnt(PreCombine); if (!WaitInstrs[CT]) continue; diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll index b5b2655246c3f..365cdc20cb3f9 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll @@ -2115,7 +2115,7 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv(ptr inreg %arg) { ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0 ; GFX1250-SDAG-NEXT: .LBB116_1: ; %bb3 ; GFX1250-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1] ; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4 ; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 @@ -2134,7 +2134,6 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv(ptr inreg %arg) { ; GFX1250-GISEL-NEXT: .LBB116_1: ; %bb3 ; GFX1250-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo @@ -2170,7 +2169,7 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv_multiload(ptr inreg %arg, ptr inre ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0 ; GFX1250-SDAG-NEXT: .LBB117_1: ; %bb3 ; GFX1250-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1] ; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4 ; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 @@ -2191,7 +2190,6 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv_multiload(ptr inreg %arg, ptr inre ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX1250-GISEL-NEXT: .LBB117_1: ; %bb3 ; GFX1250-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/fmin3.ll b/llvm/test/CodeGen/AMDGPU/fmin3.ll index 6a6f232c55e24..2756472652bc9 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin3.ll @@ -1233,7 +1233,6 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: buffer_load_b64 v[2:3], off, s[16:19], null scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x1 ; GFX1250-NEXT: s_mov_b32 s4, s14 ; GFX1250-NEXT: s_mov_b32 s5, s15 ; GFX1250-NEXT: s_mov_b32 s0, s8 @@ -1443,7 +1442,6 @@ define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrs ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: buffer_load_b64 v[2:3], off, s[16:19], null scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x1 ; GFX1250-NEXT: s_mov_b32 s4, s14 ; GFX1250-NEXT: s_mov_b32 s5, s15 ; GFX1250-NEXT: s_mov_b32 s0, s8 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll index a50791e10f5a2..ed565ca43f9a3 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll @@ -8814,7 +8814,7 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 ; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB44_1 ; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end @@ -8857,7 +8857,7 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 ; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB44_1 ; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9322,7 +9322,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 ; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB45_1 ; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9365,7 +9365,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 ; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB45_1 ; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9844,7 +9844,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 ; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB46_1 ; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9888,7 +9888,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 ; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB46_1 ; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end @@ -10365,7 +10365,6 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 -; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v4 ; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -10407,7 +10406,6 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 -; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v5, v4 ; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -10857,7 +10855,6 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 -; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v4 ; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -10899,7 +10896,6 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 -; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v5, v4 ; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -11363,7 +11359,6 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 -; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v4 ; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -11406,7 +11401,6 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 -; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v5, v4 ; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -11861,7 +11855,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5 ; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB50_1 ; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11893,7 +11887,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5 ; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB50_1 ; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12245,7 +12239,6 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5 -; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v3 ; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -12276,7 +12269,6 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5 -; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v5, v3 ; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -12631,7 +12623,7 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 ; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB52_1 ; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12674,7 +12666,7 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 ; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB52_1 ; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13154,7 +13146,6 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 -; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v4 ; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -13196,7 +13187,6 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 -; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v5, v4 ; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -13676,7 +13666,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 ; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB54_1 ; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13722,7 +13712,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 ; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB54_1 ; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14273,7 +14263,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 ; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB55_1 ; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14319,7 +14309,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 ; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB55_1 ; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14888,7 +14878,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 ; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB56_1 ; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end @@ -14936,7 +14926,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 ; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB56_1 ; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end @@ -15502,7 +15492,6 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 -; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v4 ; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -15547,7 +15536,6 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 -; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v5, v4 ; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -16081,7 +16069,6 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 -; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v4 ; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -16126,7 +16113,6 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -16678,7 +16664,6 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 -; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v4 ; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -16725,7 +16710,6 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -17269,7 +17253,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5 ; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB60_1 ; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17305,7 +17289,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5 ; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB60_1 ; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17753,7 +17737,6 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5 -; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v3 ; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -17788,7 +17771,6 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -18238,7 +18220,7 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 ; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB62_1 ; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18284,7 +18266,7 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 ; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB62_1 ; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end @@ -18854,7 +18836,6 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 -; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v4 ; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -18899,7 +18880,6 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 ; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll index 355d0026091d9..7aecae901becf 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll @@ -101,7 +101,6 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) { ; GFX1250-NEXT: global_load_b64 v[0:1], v0, s[2:3] scale_offset scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v1 -; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX1250-NEXT: global_store_b32 v[0:1], v0, off ; GFX1250-NEXT: s_endpgm