diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 4ebb0f746f464..e3a2efdd3856f 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -2014,11 +2014,19 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, } } - // The subtarget may have an implicit S_WAITCNT 0 before barriers. If it does - // not, we need to ensure the subtarget is capable of backing off barrier - // instructions in case there are any outstanding memory operations that may - // cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here. - if (TII->isBarrierStart(MI.getOpcode()) && + // Ensure safety against exceptions from outstanding memory operations while + // waiting for a barrier: + // + // * Some subtargets safely handle backing off the barrier in hardware + // when an exception occurs. + // * Some subtargets have an implicit S_WAITCNT 0 before barriers, so that + // there can be no outstanding memory operations during the wait. + // * Subtargets with split barriers don't need to back off the barrier; it + // is up to the trap handler to preserve the user barrier state correctly. + // + // In all other cases, ensure safety by ensuring that there are no outstanding + // memory operations. + if (MI.getOpcode() == AMDGPU::S_BARRIER && !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) { Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true)); } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index f7c7bb509c9ef..fdbd9ce4a66bf 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -983,19 +983,13 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { return MI.getDesc().TSFlags & SIInstrFlags::IsNeverUniform; } - // Check to see if opcode is for a barrier start. Pre gfx12 this is just the - // S_BARRIER, but after support for S_BARRIER_SIGNAL* / S_BARRIER_WAIT we want - // to check for the barrier start (S_BARRIER_SIGNAL*) - bool isBarrierStart(unsigned Opcode) const { + bool isBarrier(unsigned Opcode) const { return Opcode == AMDGPU::S_BARRIER || Opcode == AMDGPU::S_BARRIER_SIGNAL_M0 || Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0 || Opcode == AMDGPU::S_BARRIER_SIGNAL_IMM || - Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM; - } - - bool isBarrier(unsigned Opcode) const { - return isBarrierStart(Opcode) || Opcode == AMDGPU::S_BARRIER_WAIT || + Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM || + Opcode == AMDGPU::S_BARRIER_WAIT || Opcode == AMDGPU::S_BARRIER_INIT_M0 || Opcode == AMDGPU::S_BARRIER_INIT_IMM || Opcode == AMDGPU::S_BARRIER_JOIN_IMM || diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll index 90e150c89955b..9003251253740 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll @@ -98,7 +98,6 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 { ; VARIANT4-NEXT: s_wait_kmcnt 0x0 ; VARIANT4-NEXT: v_xad_u32 v0, v2, -1, s2 ; VARIANT4-NEXT: global_store_b32 v3, v2, s[0:1] -; VARIANT4-NEXT: s_wait_storecnt 0x0 ; VARIANT4-NEXT: s_barrier_signal -1 ; VARIANT4-NEXT: s_barrier_wait -1 ; VARIANT4-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -145,7 +144,6 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 { ; VARIANT6-NEXT: v_sub_nc_u32_e32 v0, s2, v4 ; VARIANT6-NEXT: global_store_b32 v5, v4, s[0:1] ; VARIANT6-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VARIANT6-NEXT: s_wait_storecnt 0x0 ; VARIANT6-NEXT: s_barrier_signal -1 ; VARIANT6-NEXT: s_barrier_wait -1 ; VARIANT6-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.ll index 651d204f65b6c..248e0c716b975 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.ll @@ -11,7 +11,6 @@ define i1 @func1() { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_cmp_eq_u32 0, 0 -; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX12-SDAG-NEXT: s_barrier_signal_isfirst -1 ; GFX12-SDAG-NEXT: s_cselect_b32 s0, -1, 0 ; GFX12-SDAG-NEXT: s_wait_alu 0xfffe @@ -27,7 +26,6 @@ define i1 @func1() { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_cmp_eq_u32 0, 0 -; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_barrier_signal_isfirst -1 ; GFX12-GISEL-NEXT: s_cselect_b32 s0, 1, 0 ; GFX12-GISEL-NEXT: s_wait_alu 0xfffe diff --git a/llvm/test/CodeGen/AMDGPU/s-barrier.ll b/llvm/test/CodeGen/AMDGPU/s-barrier.ll index 1821bd45dc1cc..a4fa8e4b3c8e2 100644 --- a/llvm/test/CodeGen/AMDGPU/s-barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/s-barrier.ll @@ -14,11 +14,10 @@ define void @func1() { ; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_mov_b32 m0, 0x70003 -; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 -; GFX12-SDAG-NEXT: s_barrier_signal m0 ; GFX12-SDAG-NEXT: s_mov_b32 m0, 3 ; GFX12-SDAG-NEXT: s_barrier_join m0 +; GFX12-SDAG-NEXT: s_mov_b32 m0, 0x70003 +; GFX12-SDAG-NEXT: s_barrier_signal m0 ; GFX12-SDAG-NEXT: s_barrier_wait 1 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -30,13 +29,12 @@ define void @func1() { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_mov_b32 m0, 0x70003 -; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: s_barrier_signal m0 ; GFX12-GISEL-NEXT: s_barrier_join 3 +; GFX12-GISEL-NEXT: s_barrier_signal m0 ; GFX12-GISEL-NEXT: s_barrier_wait 1 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] - call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7) call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3) + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7) call void @llvm.amdgcn.s.barrier.wait(i16 1) ret void } @@ -49,11 +47,10 @@ define void @func2() { ; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_mov_b32 m0, 0x70001 -; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 -; GFX12-SDAG-NEXT: s_barrier_signal m0 ; GFX12-SDAG-NEXT: s_mov_b32 m0, 1 ; GFX12-SDAG-NEXT: s_barrier_join m0 +; GFX12-SDAG-NEXT: s_mov_b32 m0, 0x70001 +; GFX12-SDAG-NEXT: s_barrier_signal m0 ; GFX12-SDAG-NEXT: s_barrier_wait 1 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -65,13 +62,12 @@ define void @func2() { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_mov_b32 m0, 0x70001 -; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: s_barrier_signal m0 ; GFX12-GISEL-NEXT: s_barrier_join 1 +; GFX12-GISEL-NEXT: s_barrier_signal m0 ; GFX12-GISEL-NEXT: s_barrier_wait 1 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] - call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7) call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2) + call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7) call void @llvm.amdgcn.s.barrier.wait(i16 1) ret void } @@ -102,9 +98,9 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in) ; GFX12-SDAG-NEXT: s_barrier_signal m0 ; GFX12-SDAG-NEXT: s_mov_b32 m0, s2 ; GFX12-SDAG-NEXT: s_barrier_signal -1 -; GFX12-SDAG-NEXT: s_barrier_signal_isfirst -1 ; GFX12-SDAG-NEXT: s_barrier_join m0 ; GFX12-SDAG-NEXT: s_mov_b32 m0, 2 +; GFX12-SDAG-NEXT: s_barrier_signal_isfirst -1 ; GFX12-SDAG-NEXT: s_barrier_wait 1 ; GFX12-SDAG-NEXT: s_barrier_leave ; GFX12-SDAG-NEXT: s_get_barrier_state s3, m0 @@ -155,11 +151,11 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in) ; GFX12-GISEL-NEXT: s_barrier_signal m0 ; GFX12-GISEL-NEXT: s_mov_b32 m0, s1 ; GFX12-GISEL-NEXT: s_barrier_signal m0 +; GFX12-GISEL-NEXT: s_mov_b32 m0, s0 ; GFX12-GISEL-NEXT: s_barrier_signal -1 +; GFX12-GISEL-NEXT: s_barrier_join m0 ; GFX12-GISEL-NEXT: s_barrier_signal_isfirst -1 -; GFX12-GISEL-NEXT: s_mov_b32 m0, s0 ; GFX12-GISEL-NEXT: s_add_co_u32 s8, s12, 48 -; GFX12-GISEL-NEXT: s_barrier_join m0 ; GFX12-GISEL-NEXT: s_barrier_wait 1 ; GFX12-GISEL-NEXT: s_barrier_leave ; GFX12-GISEL-NEXT: s_get_barrier_state s0, 2 @@ -194,8 +190,8 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in) call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 12) call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) %in, i32 9) call void @llvm.amdgcn.s.barrier.signal(i32 -1) - %isfirst = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1) call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) %in) + %isfirst = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1) call void @llvm.amdgcn.s.barrier.wait(i16 1) call void @llvm.amdgcn.s.barrier.leave(i16 1) %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar) @@ -219,7 +215,6 @@ define amdgpu_kernel void @kernel2(ptr addrspace(1) %out, ptr addrspace(3) %in) ; GFX12-SDAG-NEXT: s_load_b64 s[12:13], s[6:7], 0x0 ; GFX12-SDAG-NEXT: s_mov_b32 m0, 0x70002 ; GFX12-SDAG-NEXT: s_add_nc_u64 s[8:9], s[4:5], 48 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_barrier_signal m0 ; GFX12-SDAG-NEXT: s_mov_b32 m0, 2 ; GFX12-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1] @@ -227,6 +222,7 @@ define amdgpu_kernel void @kernel2(ptr addrspace(1) %out, ptr addrspace(3) %in) ; GFX12-SDAG-NEXT: s_mov_b32 s32, 0 ; GFX12-SDAG-NEXT: s_barrier_join m0 ; GFX12-SDAG-NEXT: s_barrier_wait 1 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[12:13] ; GFX12-SDAG-NEXT: s_endpgm ; @@ -245,10 +241,10 @@ define amdgpu_kernel void @kernel2(ptr addrspace(1) %out, ptr addrspace(3) %in) ; GFX12-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX12-GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX12-GISEL-NEXT: s_mov_b32 s32, 0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_barrier_signal m0 ; GFX12-GISEL-NEXT: s_barrier_join 2 ; GFX12-GISEL-NEXT: s_barrier_wait 1 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[12:13] ; GFX12-GISEL-NEXT: s_endpgm call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 7)