From 438429872ed15cbe3661028fb1024fc127c75789 Mon Sep 17 00:00:00 2001 From: Aaditya Date: Fri, 26 Sep 2025 11:26:22 +0530 Subject: [PATCH] [AMDGPU] Fix broken XCNT test cases Hardware inserts an implicit `S_WAIT_XCNT 0` between alternate SMEM and VMEM events, so there are never outstanding address translations for both SMEM and VMEM at the same time. --- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 11 +++++++++++ llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll | 1 - llvm/test/CodeGen/AMDGPU/wait-xcnt.mir | 9 +++------ 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index ae75fb529dade..ebf059a7bff2d 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1004,6 +1004,15 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, } } } else if (T == X_CNT) { + WaitEventType OtherEvent = E == SMEM_GROUP ? VMEM_GROUP : SMEM_GROUP; + if (PendingEvents & (1 << OtherEvent)) { + // Hardware inserts an implicit xcnt between interleaved + // SMEM and VMEM operations. So there will never be + // outstanding address translations for both SMEM and + // VMEM at the same time. + setScoreLB(T, CurrScore - 1); + PendingEvents &= ~(1 << OtherEvent); + } for (const MachineOperand &Op : Inst.all_uses()) setScoreByOperand(&Inst, TRI, MRI, Op, T, CurrScore); } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ { @@ -2257,6 +2266,8 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, // Now look at the instruction opcode. If it is a memory access // instruction, update the upper-bound of the appropriate counter's // bracket and the destination operand scores. + // For architectures with X_CNT, mark the source address operands + // with the appropriate counter values. // TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere. bool IsVMEMAccess = false; diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll index 243f0ed3a8d0d..f8655a702180e 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll @@ -256,7 +256,6 @@ define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr add ; GCN-NEXT: s_wait_storecnt 0x0 ; GCN-NEXT: .LBB5_3: ; %bb4 ; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GCN-NEXT: s_wait_xcnt 0x0 ; GCN-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 63 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS diff --git a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir index af8b9e7c8cd82..6fe99d895c7bb 100644 --- a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir +++ b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir @@ -520,6 +520,7 @@ body: | ; GCN-NEXT: GLOBAL_STORE_DWORDX2 $vgpr0_vgpr1, $vgpr4_vgpr5, 16, 0, implicit $exec ; GCN-NEXT: S_WAIT_KMCNT 0 ; GCN-NEXT: $sgpr2 = S_ADD_I32 $sgpr0, 100, implicit-def $scc + ; GCN-NEXT: S_WAIT_XCNT 0 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 20, implicit $exec $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 0, 0 :: (load (s64), addrspace 4) $vgpr0 = V_MOV_B32_e32 1, implicit $exec @@ -921,7 +922,6 @@ body: | $vgpr2 = V_MOV_B32_e32 1, implicit $exec ... -# FIXME: Missing S_WAIT_XCNT before overwriting vgpr0. --- name: wait_kmcnt_with_outstanding_vmem tracksRegLiveness: true @@ -937,6 +937,7 @@ body: | ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec ; GCN-NEXT: S_WAIT_KMCNT 0 ; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2 + ; GCN-NEXT: S_WAIT_XCNT 0 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec @@ -944,7 +945,6 @@ body: | $vgpr0 = V_MOV_B32_e32 0, implicit $exec ... -# FIXME: Missing S_WAIT_XCNT before overwriting sgpr0. --- name: wait_loadcnt_with_outstanding_smem tracksRegLiveness: true @@ -960,6 +960,7 @@ body: | ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 ; GCN-NEXT: S_WAIT_LOADCNT 0 ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr2, implicit $exec + ; GCN-NEXT: S_WAIT_XCNT 0 ; GCN-NEXT: $sgpr0 = S_MOV_B32 0 $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 @@ -967,7 +968,6 @@ body: | $sgpr0 = S_MOV_B32 0 ... -# TODO: Unnecessary wait before overwriting vgpr0. --- name: overwrite_vgpr_after_smem tracksRegLiveness: true @@ -981,14 +981,12 @@ body: | ; GCN-NEXT: {{ $}} ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 - ; GCN-NEXT: S_WAIT_XCNT 0 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 $vgpr0 = V_MOV_B32_e32 0, implicit $exec ... -# TODO: Unnecessary wait before overwriting sgpr0. --- name: overwrite_sgpr_after_vmem tracksRegLiveness: true @@ -1002,7 +1000,6 @@ body: | ; GCN-NEXT: {{ $}} ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec - ; GCN-NEXT: S_WAIT_XCNT 0 ; GCN-NEXT: $sgpr0 = S_MOV_B32 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec