Skip to content

Commit 19cd5bd

Browse files
authored
[AMDGPU] Account for implicit XCNT insertion (#160812)
Hardware inserts an implicit `S_WAIT_XCNT 0` between alternating SMEM and VMEM instructions, so there are never outstanding address translations for both SMEM and VMEM at the same time.
1 parent a1db40f commit 19cd5bd

File tree

3 files changed

+14
-7
lines changed

3 files changed

+14
-7
lines changed

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1013,6 +1013,15 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
10131013
}
10141014
}
10151015
} else if (T == X_CNT) {
1016+
WaitEventType OtherEvent = E == SMEM_GROUP ? VMEM_GROUP : SMEM_GROUP;
1017+
if (PendingEvents & (1 << OtherEvent)) {
1018+
// Hardware inserts an implicit xcnt between interleaved
1019+
// SMEM and VMEM operations. So there will never be
1020+
// outstanding address translations for both SMEM and
1021+
// VMEM at the same time.
1022+
setScoreLB(T, CurrScore - 1);
1023+
PendingEvents &= ~(1 << OtherEvent);
1024+
}
10161025
for (const MachineOperand &Op : Inst.all_uses())
10171026
setScoreByOperand(&Inst, Op, T, CurrScore);
10181027
} else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
@@ -2220,6 +2229,8 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
22202229
// Now look at the instruction opcode. If it is a memory access
22212230
// instruction, update the upper-bound of the appropriate counter's
22222231
// bracket and the destination operand scores.
2232+
// For architectures with X_CNT, mark the source address operands
2233+
// with the appropriate counter values.
22232234
// TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere.
22242235

22252236
bool IsVMEMAccess = false;

llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -256,7 +256,6 @@ define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr add
256256
; GCN-NEXT: s_wait_storecnt 0x0
257257
; GCN-NEXT: .LBB5_3: ; %bb4
258258
; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
259-
; GCN-NEXT: s_wait_xcnt 0x0
260259
; GCN-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 63
261260
; GCN-NEXT: s_wait_kmcnt 0x0
262261
; GCN-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS

llvm/test/CodeGen/AMDGPU/wait-xcnt.mir

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -520,6 +520,7 @@ body: |
520520
; GCN-NEXT: GLOBAL_STORE_DWORDX2 $vgpr0_vgpr1, $vgpr4_vgpr5, 16, 0, implicit $exec
521521
; GCN-NEXT: S_WAIT_KMCNT 0
522522
; GCN-NEXT: $sgpr2 = S_ADD_I32 $sgpr0, 100, implicit-def $scc
523+
; GCN-NEXT: S_WAIT_XCNT 0
523524
; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 20, implicit $exec
524525
$sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 0, 0 :: (load (s64), addrspace 4)
525526
$vgpr0 = V_MOV_B32_e32 1, implicit $exec
@@ -921,7 +922,6 @@ body: |
921922
$vgpr2 = V_MOV_B32_e32 1, implicit $exec
922923
...
923924

924-
# FIXME: Missing S_WAIT_XCNT before overwriting vgpr0.
925925
---
926926
name: wait_kmcnt_with_outstanding_vmem
927927
tracksRegLiveness: true
@@ -937,14 +937,14 @@ body: |
937937
; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
938938
; GCN-NEXT: S_WAIT_KMCNT 0
939939
; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2
940+
; GCN-NEXT: S_WAIT_XCNT 0
940941
; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
941942
$sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
942943
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
943944
$sgpr2 = S_MOV_B32 $sgpr2
944945
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
945946
...
946947

947-
# FIXME: Missing S_WAIT_XCNT before overwriting sgpr0.
948948
---
949949
name: wait_loadcnt_with_outstanding_smem
950950
tracksRegLiveness: true
@@ -960,14 +960,14 @@ body: |
960960
; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
961961
; GCN-NEXT: S_WAIT_LOADCNT 0
962962
; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr2, implicit $exec
963+
; GCN-NEXT: S_WAIT_XCNT 0
963964
; GCN-NEXT: $sgpr0 = S_MOV_B32 0
964965
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
965966
$sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
966967
$vgpr2 = V_MOV_B32_e32 $vgpr2, implicit $exec
967968
$sgpr0 = S_MOV_B32 0
968969
...
969970

970-
# TODO: Unnecessary wait before overwriting vgpr0.
971971
---
972972
name: overwrite_vgpr_after_smem
973973
tracksRegLiveness: true
@@ -981,14 +981,12 @@ body: |
981981
; GCN-NEXT: {{ $}}
982982
; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
983983
; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
984-
; GCN-NEXT: S_WAIT_XCNT 0
985984
; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
986985
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
987986
$sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
988987
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
989988
...
990989

991-
# TODO: Unnecessary wait before overwriting sgpr0.
992990
---
993991
name: overwrite_sgpr_after_vmem
994992
tracksRegLiveness: true
@@ -1002,7 +1000,6 @@ body: |
10021000
; GCN-NEXT: {{ $}}
10031001
; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
10041002
; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
1005-
; GCN-NEXT: S_WAIT_XCNT 0
10061003
; GCN-NEXT: $sgpr0 = S_MOV_B32 0
10071004
$sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
10081005
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec

0 commit comments

Comments
 (0)