diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 15a1db51c6d78..072c1ab0c79d0 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1857,7 +1857,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, // not, we need to ensure the subtarget is capable of backing off barrier // instructions in case there are any outstanding memory operations that may // cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here. - if (MI.getOpcode() == AMDGPU::S_BARRIER && + if (TII->isBarrierStart(MI.getOpcode()) && !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) { Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true)); } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index b314b9b2fb513..84bb73cc9a796 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -925,6 +925,17 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { return MI.getDesc().TSFlags & SIInstrFlags::IsNeverUniform; } + // Check to see if opcode is for a barrier start. Pre gfx12 this is just the + // S_BARRIER, but after support for S_BARRIER_SIGNAL* / S_BARRIER_WAIT we want + // to check for the barrier start (S_BARRIER_SIGNAL*) + bool isBarrierStart(unsigned Opcode) const { + return Opcode == AMDGPU::S_BARRIER || + Opcode == AMDGPU::S_BARRIER_SIGNAL_M0 || + Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0 || + Opcode == AMDGPU::S_BARRIER_SIGNAL_IMM || + Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM; + } + static bool doesNotReadTiedSource(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::TiedSourceNotRead; } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll index a7d3115af29bf..47c021769aa56 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll @@ -96,6 +96,7 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 { ; VARIANT4-NEXT: s_wait_kmcnt 0x0 ; VARIANT4-NEXT: v_xad_u32 v1, v0, -1, s2 ; VARIANT4-NEXT: global_store_b32 v3, v0, s[0:1] +; VARIANT4-NEXT: s_wait_storecnt 0x0 ; VARIANT4-NEXT: s_barrier_signal -1 ; VARIANT4-NEXT: s_barrier_wait -1 ; VARIANT4-NEXT: v_ashrrev_i32_e32 v2, 31, v1 @@ -142,6 +143,7 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 { ; VARIANT6-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0 ; VARIANT6-NEXT: v_sub_nc_u32_e32 v1, s2, v0 ; VARIANT6-NEXT: global_store_b32 v5, v0, s[0:1] +; VARIANT6-NEXT: s_wait_storecnt 0x0 ; VARIANT6-NEXT: s_barrier_signal -1 ; VARIANT6-NEXT: s_barrier_wait -1 ; VARIANT6-NEXT: v_ashrrev_i32_e32 v2, 31, v1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll index 4ab5e97964a85..38a34ec6daf73 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll @@ -12,6 +12,7 @@ define amdgpu_kernel void @test1_s_barrier_signal(ptr addrspace(1) %out) #0 { ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v3, v2, s[0:1] +; GCN-NEXT: s_wait_storecnt 0x0 ; GCN-NEXT: s_barrier_signal -1 ; GCN-NEXT: s_barrier_wait -1 ; GCN-NEXT: global_store_b32 v3, v0, s[0:1] @@ -28,6 +29,7 @@ define amdgpu_kernel void @test1_s_barrier_signal(ptr addrspace(1) %out) #0 { ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] +; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0 ; GLOBAL-ISEL-NEXT: s_barrier_signal -1 ; GLOBAL-ISEL-NEXT: s_barrier_wait -1 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] @@ -56,6 +58,7 @@ define amdgpu_kernel void @test2_s_barrier_signal(ptr addrspace(1) %out) #0 { ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v3, v2, s[0:1] +; GCN-NEXT: s_wait_storecnt 0x0 ; GCN-NEXT: s_barrier_signal 1 ; GCN-NEXT: s_barrier_wait 1 ; GCN-NEXT: global_store_b32 v3, v0, s[0:1] @@ -72,6 +75,7 @@ define amdgpu_kernel void @test2_s_barrier_signal(ptr addrspace(1) %out) #0 { ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] +; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0 ; GLOBAL-ISEL-NEXT: s_barrier_signal 1 ; GLOBAL-ISEL-NEXT: s_barrier_wait 1 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] @@ -100,6 +104,7 @@ define amdgpu_kernel void @test3_s_barrier_signal(ptr addrspace(1) %out) #0 { ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v3, v2, s[0:1] +; GCN-NEXT: s_wait_storecnt 0x0 ; GCN-NEXT: s_barrier_signal 0 ; GCN-NEXT: s_barrier_wait 0 ; GCN-NEXT: global_store_b32 v3, v0, s[0:1] @@ -116,6 +121,7 @@ define amdgpu_kernel void @test3_s_barrier_signal(ptr addrspace(1) %out) #0 { ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] +; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0 ; GLOBAL-ISEL-NEXT: s_barrier_signal 0 ; GLOBAL-ISEL-NEXT: s_barrier_wait 0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] @@ -146,6 +152,7 @@ define amdgpu_kernel void @test1_s_barrier_signal_var(ptr addrspace(1) %out) #0 ; GCN-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v3, v1, s[0:1] +; GCN-NEXT: s_wait_storecnt 0x0 ; GCN-NEXT: s_barrier_signal m0 ; GCN-NEXT: s_barrier_wait 1 ; GCN-NEXT: global_store_b32 v3, v0, s[0:1] @@ -163,6 +170,7 @@ define amdgpu_kernel void @test1_s_barrier_signal_var(ptr addrspace(1) %out) #0 ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] +; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0 ; GLOBAL-ISEL-NEXT: s_barrier_signal m0 ; GLOBAL-ISEL-NEXT: s_barrier_wait 1 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] @@ -192,6 +200,7 @@ define void @test2_s_barrier_signal_var(i32 %arg) { ; GCN-NEXT: v_readfirstlane_b32 s0, v0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-NEXT: s_mov_b32 m0, s0 +; GCN-NEXT: s_wait_storecnt 0x0 ; GCN-NEXT: s_barrier_signal m0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; @@ -203,6 +212,7 @@ define void @test2_s_barrier_signal_var(i32 %arg) { ; GLOBAL-ISEL-NEXT: s_wait_bvhcnt 0x0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: v_readfirstlane_b32 m0, v0 +; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0 ; GLOBAL-ISEL-NEXT: s_barrier_signal m0 ; GLOBAL-ISEL-NEXT: s_setpc_b64 s[30:31] call void @llvm.amdgcn.s.barrier.signal.var(i32 %arg) @@ -216,6 +226,7 @@ define amdgpu_kernel void @test1_s_barrier_signal_isfirst(ptr addrspace(1) %a, p ; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v0, v1, s[6:7] +; GCN-NEXT: s_wait_storecnt 0x0 ; GCN-NEXT: s_barrier_signal_isfirst -1 ; GCN-NEXT: s_cselect_b32 s3, s3, s5 ; GCN-NEXT: s_cselect_b32 s2, s2, s4 @@ -235,6 +246,7 @@ define amdgpu_kernel void @test1_s_barrier_signal_isfirst(ptr addrspace(1) %a, p ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] +; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0 ; GLOBAL-ISEL-NEXT: s_barrier_signal_isfirst -1 ; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0 ; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -270,6 +282,7 @@ define amdgpu_kernel void @test2_s_barrier_signal_isfirst(ptr addrspace(1) %a, p ; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v0, v1, s[6:7] +; GCN-NEXT: s_wait_storecnt 0x0 ; GCN-NEXT: s_barrier_signal_isfirst 1 ; GCN-NEXT: s_cselect_b32 s3, s3, s5 ; GCN-NEXT: s_cselect_b32 s2, s2, s4 @@ -289,6 +302,7 @@ define amdgpu_kernel void @test2_s_barrier_signal_isfirst(ptr addrspace(1) %a, p ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] +; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0 ; GLOBAL-ISEL-NEXT: s_barrier_signal_isfirst 1 ; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0 ; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -324,6 +338,7 @@ define amdgpu_kernel void @test3_s_barrier_signal_isfirst(ptr addrspace(1) %a, p ; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v0, v1, s[6:7] +; GCN-NEXT: s_wait_storecnt 0x0 ; GCN-NEXT: s_barrier_signal_isfirst 1 ; GCN-NEXT: s_cselect_b32 s3, s3, s5 ; GCN-NEXT: s_cselect_b32 s2, s2, s4 @@ -343,6 +358,7 @@ define amdgpu_kernel void @test3_s_barrier_signal_isfirst(ptr addrspace(1) %a, p ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] +; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0 ; GLOBAL-ISEL-NEXT: s_barrier_signal_isfirst 1 ; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0 ; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -379,6 +395,7 @@ define amdgpu_kernel void @test1_s_barrier_signal_isfirst_var(ptr addrspace(1) % ; GCN-NEXT: s_mov_b32 m0, 1 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v0, v1, s[6:7] +; GCN-NEXT: s_wait_storecnt 0x0 ; GCN-NEXT: s_barrier_signal_isfirst m0 ; GCN-NEXT: s_cselect_b32 s3, s3, s5 ; GCN-NEXT: s_cselect_b32 s2, s2, s4 @@ -399,6 +416,7 @@ define amdgpu_kernel void @test1_s_barrier_signal_isfirst_var(ptr addrspace(1) % ; GLOBAL-ISEL-NEXT: s_mov_b32 m0, 1 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] +; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0 ; GLOBAL-ISEL-NEXT: s_barrier_signal_isfirst m0 ; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0 ; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -444,6 +462,7 @@ define void @test2_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspa ; GCN-NEXT: v_add_co_u32 v7, vcc_lo, v7, v9 ; GCN-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v8, vcc_lo ; GCN-NEXT: global_store_b32 v[7:8], v10, off +; GCN-NEXT: s_wait_storecnt 0x0 ; GCN-NEXT: s_barrier_signal_isfirst m0 ; GCN-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GCN-NEXT: v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v3, v5, v3 @@ -470,6 +489,7 @@ define void @test2_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspa ; GLOBAL-ISEL-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v8, vcc_lo ; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v9, 0 ; GLOBAL-ISEL-NEXT: global_store_b32 v[7:8], v9, off +; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0 ; GLOBAL-ISEL-NEXT: s_barrier_signal_isfirst m0 ; GLOBAL-ISEL-NEXT: s_cselect_b32 s0, 1, 0 ; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -1339,6 +1359,7 @@ define amdgpu_kernel void @test_barrier_convert(ptr addrspace(1) %out) #0 { ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v3, v2, s[0:1] +; GCN-NEXT: s_wait_storecnt 0x0 ; GCN-NEXT: s_barrier_signal -1 ; GCN-NEXT: s_barrier_wait -1 ; GCN-NEXT: global_store_b32 v3, v0, s[0:1] @@ -1355,6 +1376,7 @@ define amdgpu_kernel void @test_barrier_convert(ptr addrspace(1) %out) #0 { ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] +; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0 ; GLOBAL-ISEL-NEXT: s_barrier_signal -1 ; GLOBAL-ISEL-NEXT: s_barrier_wait -1 ; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]