Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 47 additions & 47 deletions llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -633,8 +633,11 @@ class WaitcntBrackets {
const MachineOperand &Op) const;

bool counterOutOfOrder(InstCounterType T) const;
void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
void simplifyWaitcnt(AMDGPU::Waitcnt &Wait);
void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
bool hasRedundantXCntWithKmCnt(const AMDGPU::Waitcnt &Wait);
bool canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait);
void simplifyXcnt(AMDGPU::Waitcnt &Wait);

void determineWait(InstCounterType T, RegInterval Interval,
AMDGPU::Waitcnt &Wait) const;
Expand All @@ -646,7 +649,6 @@ class WaitcntBrackets {

void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
void applyWaitcnt(InstCounterType T, unsigned Count);
void applyXcnt(const AMDGPU::Waitcnt &Wait);
void updateByEvent(WaitEventType E, MachineInstr &MI);

unsigned hasPendingEvent() const { return PendingEvents; }
Expand Down Expand Up @@ -1192,15 +1194,15 @@ void WaitcntBrackets::print(raw_ostream &OS) const {

/// Simplify the waitcnt in place, in the sense of removing redundant counts.
/// Nothing is returned; the simplified counts are written back to \p Wait.
void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) {
simplifyWaitcnt(LOAD_CNT, Wait.LoadCnt);
simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
simplifyWaitcnt(DS_CNT, Wait.DsCnt);
simplifyWaitcnt(STORE_CNT, Wait.StoreCnt);
simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
simplifyWaitcnt(BVH_CNT, Wait.BvhCnt);
simplifyWaitcnt(KM_CNT, Wait.KmCnt);
simplifyWaitcnt(X_CNT, Wait.XCnt);
simplifyXcnt(Wait);
}

void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
Expand Down Expand Up @@ -1270,7 +1272,7 @@ void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
applyWaitcnt(BVH_CNT, Wait.BvhCnt);
applyWaitcnt(KM_CNT, Wait.KmCnt);
applyXcnt(Wait);
applyWaitcnt(X_CNT, Wait.XCnt);
}

void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
Expand All @@ -1287,41 +1289,38 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
}
}

void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
// On entry to a block with multiple predecessors, there may
// be pending SMEM and VMEM events active at the same time.
// In such cases, only clear one active event at a time.
auto applyPendingXcntGroup = [this](unsigned E) {
unsigned LowerBound = getScoreLB(X_CNT);
applyWaitcnt(X_CNT, 0);
PendingEvents |= (1 << E);
setScoreLB(X_CNT, LowerBound);
};

bool WaitcntBrackets::hasRedundantXCntWithKmCnt(const AMDGPU::Waitcnt &Wait) {
// Wait on XCNT is redundant if we are already waiting for a load to complete.
// SMEM can return out of order, so only omit XCNT wait if we are waiting till
// zero.
if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP)) {
if (hasPendingEvent(VMEM_GROUP))
applyPendingXcntGroup(VMEM_GROUP);
else
applyWaitcnt(X_CNT, 0);
return;
}
return Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP);
}

bool WaitcntBrackets::canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait) {
// If we have pending store we cannot optimize XCnt because we do not wait for
// stores. VMEM loads return in order, so if we only have loads XCnt is
// decremented to the same number as LOADCnt.
if (Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
!hasPendingEvent(STORE_CNT)) {
if (hasPendingEvent(SMEM_GROUP))
applyPendingXcntGroup(SMEM_GROUP);
else
applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));
return;
}
return Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
!hasPendingEvent(STORE_CNT) && !hasPendingEvent(SMEM_GROUP);
}

applyWaitcnt(X_CNT, Wait.XCnt);
/// Tighten the XCNT component of \p Wait using the KMCNT and LOADCNT waits
/// requested alongside it, then run the generic per-counter simplification on
/// XCNT.
///
/// SMEM and VMEM events may both be pending at the same time (for example on
/// entry to a block with multiple predecessors). In that situation only one
/// active event is cleared per call.
void WaitcntBrackets::simplifyXcnt(AMDGPU::Waitcnt &Wait) {
  const bool KmCntCoversXcnt = hasRedundantXCntWithKmCnt(Wait);

  if (KmCntCoversXcnt && hasPendingEvent(VMEM_GROUP)) {
    // Retire only the SMEM_GROUP event here; VMEM_GROUP could still require
    // handling.
    PendingEvents &= ~(1 << SMEM_GROUP);
  } else if (KmCntCoversXcnt) {
    // No VMEM event outstanding, so XCNT can be treated as fully waited on.
    applyWaitcnt(X_CNT, 0);
  } else if (canOptimizeXCntWithLoadCnt(Wait)) {
    // Apply the smaller of the requested XCNT and LOADCNT values to XCNT.
    applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));
  }

  simplifyWaitcnt(X_CNT, Wait.XCnt);
}

// Where there are multiple types of event in the bracket of a counter,
Expand Down Expand Up @@ -1656,6 +1655,8 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
}
}

// Save the pre-combine waitcnt in order to make xcnt checks.
AMDGPU::Waitcnt PreCombine = Wait;
if (CombinedLoadDsCntInstr) {
// Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
// to be waited for. Otherwise, let the instruction be deleted so
Expand Down Expand Up @@ -1746,6 +1747,13 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
}

for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
if ((CT == KM_CNT && ScoreBrackets.hasRedundantXCntWithKmCnt(PreCombine)) ||
(CT == LOAD_CNT &&
ScoreBrackets.canOptimizeXCntWithLoadCnt(PreCombine))) {
// Xcnt may need to be updated depending on a pre-existing KM/LOAD_CNT
// due to taking the backedge of a block.
ScoreBrackets.simplifyXcnt(PreCombine);
}
if (!WaitInstrs[CT])
continue;

Expand Down Expand Up @@ -2089,6 +2097,13 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
Wait.DsCnt = 0;
}

// Since the translation for VMEM addresses occurs in order, we can skip the
// XCnt if the current instruction is of VMEM type and has a memory
// dependency with another VMEM instruction in flight.
if (isVmemAccess(MI)) {
Wait.XCnt = ~0u;
}

// Verify that the wait is actually needed.
ScoreBrackets.simplifyWaitcnt(Wait);

Expand Down Expand Up @@ -2160,21 +2175,6 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
<< "Update Instr: " << *It);
}

// XCnt may be already consumed by a load wait.
if (Wait.XCnt != ~0u) {
if (Wait.KmCnt == 0 && !ScoreBrackets.hasPendingEvent(SMEM_GROUP))
Wait.XCnt = ~0u;

if (Wait.LoadCnt == 0 && !ScoreBrackets.hasPendingEvent(VMEM_GROUP))
Wait.XCnt = ~0u;

// Since the translation for VMEM addresses occurs in order, we can skip the
// XCnt if the current instruction is of VMEM type and has a memory
// dependency with another VMEM instruction in flight.
if (isVmemAccess(*It))
Wait.XCnt = ~0u;
}

if (WCG->createNewWaitcnt(Block, It, Wait))
Modified = true;

Expand Down
6 changes: 2 additions & 4 deletions llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2107,7 +2107,7 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv(ptr inreg %arg) {
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0
; GFX1250-SDAG-NEXT: .LBB116_1: ; %bb3
; GFX1250-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-SDAG-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1]
; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4
; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
Expand All @@ -2126,7 +2126,6 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv(ptr inreg %arg) {
; GFX1250-GISEL-NEXT: .LBB116_1: ; %bb3
; GFX1250-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo
Expand Down Expand Up @@ -2162,7 +2161,7 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv_multiload(ptr inreg %arg, ptr inre
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0
; GFX1250-SDAG-NEXT: .LBB117_1: ; %bb3
; GFX1250-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-SDAG-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1]
; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4
; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
Expand All @@ -2183,7 +2182,6 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv_multiload(ptr inreg %arg, ptr inre
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX1250-GISEL-NEXT: .LBB117_1: ; %bb3
; GFX1250-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo
Expand Down
2 changes: 0 additions & 2 deletions llvm/test/CodeGen/AMDGPU/fmin3.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1233,7 +1233,6 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: buffer_load_b64 v[2:3], off, s[16:19], null scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x1
; GFX1250-NEXT: s_mov_b32 s4, s14
; GFX1250-NEXT: s_mov_b32 s5, s15
; GFX1250-NEXT: s_mov_b32 s0, s8
Expand Down Expand Up @@ -1443,7 +1442,6 @@ define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrs
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: buffer_load_b64 v[2:3], off, s[16:19], null scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_xcnt 0x1
; GFX1250-NEXT: s_mov_b32 s4, s14
; GFX1250-NEXT: s_mov_b32 s5, s15
; GFX1250-NEXT: s_mov_b32 s0, s8
Expand Down
Loading