Skip to content

Commit 8ad5479

Browse files
committed
[AMDGPU][SIInsertWaitCnts] Refactor xcnt optimization
Refactor the XCnt optimization checks into standalone predicates so that they can also be evaluated when applying a pre-existing waitcnt. This removes unnecessary xcnt waits when taking a loop backedge.
1 parent d10af9d commit 8ad5479

File tree

5 files changed

+42
-53
lines changed

5 files changed

+42
-53
lines changed

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -646,6 +646,8 @@ class WaitcntBrackets {
646646

647647
void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
648648
void applyWaitcnt(InstCounterType T, unsigned Count);
649+
bool hasRedundantXCntWithKmCnt(const AMDGPU::Waitcnt &Wait);
650+
bool canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait);
649651
void applyXcnt(const AMDGPU::Waitcnt &Wait);
650652
void updateByEvent(WaitEventType E, MachineInstr &MI);
651653

@@ -1287,20 +1289,25 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
12871289
}
12881290
}
12891291

1290-
void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
1292+
bool WaitcntBrackets::hasRedundantXCntWithKmCnt(const AMDGPU::Waitcnt &Wait) {
12911293
// Wait on XCNT is redundant if we are already waiting for a load to complete.
12921294
// SMEM can return out of order, so only omit XCNT wait if we are waiting till
12931295
// zero.
1294-
if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP))
1295-
return applyWaitcnt(X_CNT, 0);
1296+
return Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP);
1297+
}
12961298

1299+
bool WaitcntBrackets::canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait) {
12971300
// If we have pending store we cannot optimize XCnt because we do not wait for
12981301
// stores. VMEM loads return in order, so if we only have loads XCnt is
12991302
// decremented to the same number as LOADCnt.
1300-
if (Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
1301-
!hasPendingEvent(STORE_CNT))
1302-
return applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));
1303+
return Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) && !hasPendingEvent(STORE_CNT);
1304+
}
13031305

1306+
void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
1307+
if (hasRedundantXCntWithKmCnt(Wait))
1308+
return applyWaitcnt(X_CNT, 0);
1309+
if (canOptimizeXCntWithLoadCnt(Wait))
1310+
return applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));
13041311
applyWaitcnt(X_CNT, Wait.XCnt);
13051312
}
13061313

@@ -1729,6 +1736,12 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
17291736
if (!WaitInstrs[CT])
17301737
continue;
17311738

1739+
if ((CT == KM_CNT && ScoreBrackets.hasRedundantXCntWithKmCnt(Wait)) ||
1740+
(CT == LOAD_CNT && ScoreBrackets.canOptimizeXCntWithLoadCnt(Wait)))
1741+
// Xcnt may need to be updated depending on a pre-existing KM/LOAD_CNT
1742+
// due to taking the backedge of a block.
1743+
ScoreBrackets.applyXcnt(Wait);
1744+
17321745
unsigned NewCnt = getWait(Wait, CT);
17331746
if (NewCnt != ~0u) {
17341747
Modified |= updateOperandIfDifferent(*WaitInstrs[CT],

llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2115,7 +2115,7 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv(ptr inreg %arg) {
21152115
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0
21162116
; GFX1250-SDAG-NEXT: .LBB116_1: ; %bb3
21172117
; GFX1250-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
2118-
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
2118+
; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
21192119
; GFX1250-SDAG-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1]
21202120
; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4
21212121
; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
@@ -2134,7 +2134,6 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv(ptr inreg %arg) {
21342134
; GFX1250-GISEL-NEXT: .LBB116_1: ; %bb3
21352135
; GFX1250-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
21362136
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
2137-
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
21382137
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
21392138
; GFX1250-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
21402139
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo
@@ -2170,7 +2169,7 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv_multiload(ptr inreg %arg, ptr inre
21702169
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0
21712170
; GFX1250-SDAG-NEXT: .LBB117_1: ; %bb3
21722171
; GFX1250-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
2173-
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
2172+
; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
21742173
; GFX1250-SDAG-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1]
21752174
; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4
21762175
; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
@@ -2191,7 +2190,7 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv_multiload(ptr inreg %arg, ptr inre
21912190
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
21922191
; GFX1250-GISEL-NEXT: .LBB117_1: ; %bb3
21932192
; GFX1250-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
2194-
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
2193+
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
21952194
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
21962195
; GFX1250-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
21972196
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo

llvm/test/CodeGen/AMDGPU/fmin3.ll

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1233,7 +1233,6 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs
12331233
; GFX1250-NEXT: s_wait_loadcnt 0x0
12341234
; GFX1250-NEXT: buffer_load_b64 v[2:3], off, s[16:19], null scope:SCOPE_SYS
12351235
; GFX1250-NEXT: s_wait_loadcnt 0x0
1236-
; GFX1250-NEXT: s_wait_xcnt 0x1
12371236
; GFX1250-NEXT: s_mov_b32 s4, s14
12381237
; GFX1250-NEXT: s_mov_b32 s5, s15
12391238
; GFX1250-NEXT: s_mov_b32 s0, s8
@@ -1443,7 +1442,6 @@ define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrs
14431442
; GFX1250-NEXT: s_wait_loadcnt 0x0
14441443
; GFX1250-NEXT: buffer_load_b64 v[2:3], off, s[16:19], null scope:SCOPE_SYS
14451444
; GFX1250-NEXT: s_wait_loadcnt 0x0
1446-
; GFX1250-NEXT: s_wait_xcnt 0x1
14471445
; GFX1250-NEXT: s_mov_b32 s4, s14
14481446
; GFX1250-NEXT: s_mov_b32 s5, s15
14491447
; GFX1250-NEXT: s_mov_b32 s0, s8

0 commit comments

Comments
 (0)