Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 24 additions & 4 deletions llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1288,18 +1288,38 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
}

void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
// On entry to a block with multiple predecessors, there may
// be pending SMEM and VMEM events active at the same time.
// In such cases, only clear one active event at a time.
auto applyPendingXcntGroup = [this](unsigned E) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: this really should not have "group" in the name. Xcnt insertion has nothing to do with groups. This goes back to #145566 which added the bad names "SMEM_GROUP" and "VMEM_GROUP".

unsigned LowerBound = getScoreLB(X_CNT);
applyWaitcnt(X_CNT, 0);
PendingEvents |= (1 << E);
setScoreLB(X_CNT, LowerBound);
Comment on lines +1295 to +1298
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems like a very complicated way to write PendingEvents &= ~(1 << E) (where E is the other SMEM/VMEM event type). Can you simplify and inline this?

};

// Wait on XCNT is redundant if we are already waiting for a load to complete.
// SMEM can return out of order, so only omit XCNT wait if we are waiting till
// zero.
if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP))
return applyWaitcnt(X_CNT, 0);
if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP)) {
if (hasPendingEvent(VMEM_GROUP))
applyPendingXcntGroup(VMEM_GROUP);
else
applyWaitcnt(X_CNT, 0);
return;
}

// If we have pending store we cannot optimize XCnt because we do not wait for
// stores. VMEM loads return in order, so if we only have loads XCnt is
// decremented to the same number as LOADCnt.
if (Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
!hasPendingEvent(STORE_CNT))
return applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));
!hasPendingEvent(STORE_CNT)) {
if (hasPendingEvent(SMEM_GROUP))
applyPendingXcntGroup(SMEM_GROUP);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not safe. See #166154.

else
applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));
return;
}

applyWaitcnt(X_CNT, Wait.XCnt);
}
Expand Down
176 changes: 175 additions & 1 deletion llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
Original file line number Diff line number Diff line change
Expand Up @@ -945,7 +945,6 @@ body: |
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
...

# FIXME: Missing S_WAIT_XCNT before overwriting vgpr0.
---
name: wait_kmcnt_with_outstanding_vmem_2
tracksRegLiveness: true
Expand All @@ -971,6 +970,7 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: S_WAIT_KMCNT 0
; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2
; GCN-NEXT: S_WAIT_XCNT 0
; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
bb.0:
liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
Expand All @@ -985,6 +985,180 @@ body: |
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
...

---
name: wait_kmcnt_and_wait_loadcnt
tracksRegLiveness: true
machineFunctionInfo:
isEntryFunction: true
body: |
; GCN-LABEL: name: wait_kmcnt_and_wait_loadcnt
; GCN: bb.0:
; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
; GCN-NEXT: {{ $}}
; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.1:
; GCN-NEXT: successors: %bb.2(0x80000000)
; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr2
; GCN-NEXT: {{ $}}
; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.2:
; GCN-NEXT: liveins: $sgpr2
; GCN-NEXT: {{ $}}
; GCN-NEXT: S_WAIT_KMCNT 0
; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2
; GCN-NEXT: S_WAIT_LOADCNT 0
; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec
bb.0:
liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
$sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
S_CBRANCH_SCC1 %bb.2, implicit $scc
bb.1:
liveins: $vgpr0_vgpr1, $sgpr2
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
bb.2:
liveins: $sgpr2
$sgpr2 = S_MOV_B32 $sgpr2
$vgpr2 = V_MOV_B32_e32 0, implicit $exec
...

---
name: implicit_handling_of_pending_vmem_group
tracksRegLiveness: true
machineFunctionInfo:
isEntryFunction: true
body: |
; GCN-LABEL: name: implicit_handling_of_pending_vmem_group
; GCN: bb.0:
; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
; GCN-NEXT: {{ $}}
; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.1:
; GCN-NEXT: successors: %bb.2(0x80000000)
; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr2
; GCN-NEXT: {{ $}}
; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.2:
; GCN-NEXT: liveins: $sgpr0_sgpr1, $sgpr2
; GCN-NEXT: {{ $}}
; GCN-NEXT: S_WAIT_KMCNT 0
; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2
; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GCN-NEXT: S_WAIT_XCNT 0
; GCN-NEXT: $sgpr0 = S_MOV_B32 $sgpr0
bb.0:
liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
$sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
S_CBRANCH_SCC1 %bb.2, implicit $scc
bb.1:
liveins: $vgpr0_vgpr1, $sgpr2
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
bb.2:
liveins: $sgpr0_sgpr1, $sgpr2
$sgpr2 = S_MOV_B32 $sgpr2
$sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
$sgpr0 = S_MOV_B32 $sgpr0
...

---
name: pending_vmem_event_between_block
tracksRegLiveness: true
machineFunctionInfo:
isEntryFunction: true
body: |
; GCN-LABEL: name: pending_vmem_event_between_block
; GCN: bb.0:
; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
; GCN-NEXT: {{ $}}
; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.1:
; GCN-NEXT: successors: %bb.2(0x80000000)
; GCN-NEXT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr2
; GCN-NEXT: {{ $}}
; GCN-NEXT: $vgpr4 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
; GCN-NEXT: $vgpr5 = GLOBAL_LOAD_DWORD $vgpr2_vgpr3, 0, 0, implicit $exec
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.2:
; GCN-NEXT: liveins: $sgpr0_sgpr1, $sgpr2, $vgpr2
; GCN-NEXT: {{ $}}
; GCN-NEXT: S_WAIT_KMCNT 0
; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2
; GCN-NEXT: S_WAIT_XCNT 1
; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec
; GCN-NEXT: S_WAIT_XCNT 0
; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec
; GCN-NEXT: $sgpr0 = S_MOV_B32 $sgpr0
bb.0:
liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
$sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
S_CBRANCH_SCC1 %bb.2, implicit $scc
bb.1:
liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr2
$vgpr4 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
$vgpr5 = GLOBAL_LOAD_DWORD $vgpr2_vgpr3, 0, 0, implicit $exec
bb.2:
liveins: $sgpr0_sgpr1, $sgpr2, $vgpr2
$sgpr2 = S_MOV_B32 $sgpr2
$vgpr1 = V_MOV_B32_e32 0, implicit $exec
$vgpr2 = V_MOV_B32_e32 0, implicit $exec
$sgpr0 = S_MOV_B32 $sgpr0
...

---
name: flushing_vmem_cnt_on_block_entry
tracksRegLiveness: true
machineFunctionInfo:
isEntryFunction: true
body: |
; GCN-LABEL: name: flushing_vmem_cnt_on_block_entry
; GCN: bb.0:
; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
; GCN-NEXT: {{ $}}
; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.1:
; GCN-NEXT: successors: %bb.2(0x80000000)
; GCN-NEXT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr2
; GCN-NEXT: {{ $}}
; GCN-NEXT: $vgpr4 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
; GCN-NEXT: $vgpr5 = GLOBAL_LOAD_DWORD $vgpr2_vgpr3, 0, 0, implicit $exec
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.2:
; GCN-NEXT: liveins: $sgpr0_sgpr1, $sgpr2, $vgpr2
; GCN-NEXT: {{ $}}
; GCN-NEXT: S_WAIT_XCNT 0
; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec
; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec
; GCN-NEXT: $sgpr0 = S_MOV_B32 $sgpr0
bb.0:
liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
$sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
S_CBRANCH_SCC1 %bb.2, implicit $scc
bb.1:
liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr2
$vgpr4 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
$vgpr5 = GLOBAL_LOAD_DWORD $vgpr2_vgpr3, 0, 0, implicit $exec
bb.2:
liveins: $sgpr0_sgpr1, $sgpr2, $vgpr2
$vgpr1 = V_MOV_B32_e32 0, implicit $exec
$vgpr2 = V_MOV_B32_e32 0, implicit $exec
$sgpr0 = S_MOV_B32 $sgpr0
...

---
name: wait_loadcnt_with_outstanding_smem
tracksRegLiveness: true
Expand Down
Loading