Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1004,6 +1004,15 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
}
}
} else if (T == X_CNT) {
WaitEventType OtherEvent = E == SMEM_GROUP ? VMEM_GROUP : SMEM_GROUP;
if (PendingEvents & (1 << OtherEvent)) {
// Hardware inserts an implicit xcnt between interleaved
// SMEM and VMEM operations. So there will never be
// outstanding address translations for both SMEM and
// VMEM at the same time.
setScoreLB(T, CurrScore - 1);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a bit unorthodox. You would normally write setScoreLB(T, getScoreUB(T)) like in applyWaitcnt. In fact maybe you can use applyWaitcnt(T, 0) here, since it can update PendingEvents for you too?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I want to clear either the SMEM_GROUP or the VMEM_GROUP, not both of them together.
I think getScoreUB() will be cleaner, I'll modify it.
Thanks!

PendingEvents &= ~(1 << OtherEvent);
}
for (const MachineOperand &Op : Inst.all_uses())
setScoreByOperand(&Inst, TRI, MRI, Op, T, CurrScore);
} else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
Expand Down Expand Up @@ -2257,6 +2266,8 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
// Now look at the instruction opcode. If it is a memory access
// instruction, update the upper-bound of the appropriate counter's
// bracket and the destination operand scores.
// For architectures with X_CNT, mark the source address operands
// with the appropriate counter values.
// TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere.

bool IsVMEMAccess = false;
Expand Down
1 change: 0 additions & 1 deletion llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,6 @@ define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr add
; GCN-NEXT: s_wait_storecnt 0x0
; GCN-NEXT: .LBB5_3: ; %bb4
; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GCN-NEXT: s_wait_xcnt 0x0
; GCN-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 63
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
Expand Down
9 changes: 3 additions & 6 deletions llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
Original file line number Diff line number Diff line change
Expand Up @@ -520,6 +520,7 @@ body: |
; GCN-NEXT: GLOBAL_STORE_DWORDX2 $vgpr0_vgpr1, $vgpr4_vgpr5, 16, 0, implicit $exec
; GCN-NEXT: S_WAIT_KMCNT 0
; GCN-NEXT: $sgpr2 = S_ADD_I32 $sgpr0, 100, implicit-def $scc
; GCN-NEXT: S_WAIT_XCNT 0
; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 20, implicit $exec
$sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 0, 0 :: (load (s64), addrspace 4)
$vgpr0 = V_MOV_B32_e32 1, implicit $exec
Expand Down Expand Up @@ -921,7 +922,6 @@ body: |
$vgpr2 = V_MOV_B32_e32 1, implicit $exec
...

# FIXME: Missing S_WAIT_XCNT before overwriting vgpr0.
---
name: wait_kmcnt_with_outstanding_vmem
tracksRegLiveness: true
Expand All @@ -937,14 +937,14 @@ body: |
; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
; GCN-NEXT: S_WAIT_KMCNT 0
; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2
; GCN-NEXT: S_WAIT_XCNT 0
; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
$sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
$sgpr2 = S_MOV_B32 $sgpr2
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
...

# FIXME: Missing S_WAIT_XCNT before overwriting sgpr0.
---
name: wait_loadcnt_with_outstanding_smem
tracksRegLiveness: true
Expand All @@ -960,14 +960,14 @@ body: |
; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
; GCN-NEXT: S_WAIT_LOADCNT 0
; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr2, implicit $exec
; GCN-NEXT: S_WAIT_XCNT 0
; GCN-NEXT: $sgpr0 = S_MOV_B32 0
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
$sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
$vgpr2 = V_MOV_B32_e32 $vgpr2, implicit $exec
$sgpr0 = S_MOV_B32 0
...

# TODO: Unnecessary wait before overwriting vgpr0.
---
name: overwrite_vgpr_after_smem
tracksRegLiveness: true
Expand All @@ -981,14 +981,12 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
; GCN-NEXT: S_WAIT_XCNT 0
; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
$sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
...

# TODO: Unnecessary wait before overwriting sgpr0.
---
name: overwrite_sgpr_after_vmem
tracksRegLiveness: true
Expand All @@ -1002,7 +1000,6 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
; GCN-NEXT: S_WAIT_XCNT 0
; GCN-NEXT: $sgpr0 = S_MOV_B32 0
$sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
Expand Down