diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index ede4841b8a5fd..c9b095456971f 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1027,6 +1027,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, Wait.VmCnt = 0; } + CallingConv::ID CC = MI.getMF()->getFunction().getCallingConv(); + // All waits must be resolved at call return. // NOTE: this could be improved with knowledge of all call sites or // with knowledge of the called routines. @@ -1039,10 +1041,12 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, // Identify S_ENDPGM instructions which may have to wait for outstanding VMEM // stores. In this case it can be useful to send a message to explicitly // release all VGPRs before the stores have completed, but it is only safe to - // do this if there are no outstanding scratch stores. + // do this if there are no outstanding scratch stores (either from the current + // function or potentially from a caller or callee). else if (MI.getOpcode() == AMDGPU::S_ENDPGM || MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) { - if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && !OptNone && + if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && + AMDGPU::isEntryFunctionCC(CC) && !OptNone && ScoreBrackets.getScoreRange(VS_CNT) != 0 && !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS)) ReleaseVGPRInsts.insert(&MI); diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll index 6fd6d6e2e31a1..65b70587fa0ac 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll @@ -17,8 +17,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg(ptr addrspace(1) %out, i32 % ; GFX11-NEXT: v_mov_b32_e32 v0, v10 ; GFX11-NEXT: s_not_b32 exec_lo, exec_lo ; GFX11-NEXT: global_store_b32 v[8:9], v0, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX10-LABEL: set_inactive_chain_arg: @@ -39,8 +37,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg(ptr addrspace(1) %out, i32 % ; GFX11_W64-NEXT: v_mov_b32_e32 v0, v10 ; GFX11_W64-NEXT: s_not_b64 exec, exec ; GFX11_W64-NEXT: global_store_b32 v[8:9], v0, off -; GFX11_W64-NEXT: s_nop 0 -; GFX11_W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11_W64-NEXT: s_endpgm ; ; GFX10_W64-LABEL: set_inactive_chain_arg: @@ -68,8 +64,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_64(ptr addrspace(1) %out, i6 ; GFX11-NEXT: v_mov_b32_e32 v1, v11 ; GFX11-NEXT: s_not_b32 exec_lo, exec_lo ; GFX11-NEXT: global_store_b64 v[8:9], v[0:1], off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX10-LABEL: set_inactive_chain_arg_64: @@ -94,8 +88,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_64(ptr addrspace(1) %out, i6 ; GFX11_W64-NEXT: v_mov_b32_e32 v1, v11 ; GFX11_W64-NEXT: s_not_b64 exec, exec ; GFX11_W64-NEXT: global_store_b64 v[8:9], v[0:1], off -; GFX11_W64-NEXT: s_nop 0 -; GFX11_W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11_W64-NEXT: s_endpgm ; ; GFX10_W64-LABEL: set_inactive_chain_arg_64: @@ -133,8 +125,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_dpp(ptr addrspace(1) %out, i ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v2, v1 ; GFX11-NEXT: global_store_b32 v[8:9], v2, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX10-LABEL: set_inactive_chain_arg_dpp: @@ -174,8 +164,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_dpp(ptr addrspace(1) %out, i ; GFX11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11_W64-NEXT: v_mov_b32_e32 v2, v1 ; GFX11_W64-NEXT: global_store_b32 v[8:9], v2, off -; GFX11_W64-NEXT: s_nop 0 -; GFX11_W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11_W64-NEXT: s_endpgm ; ; GFX10_W64-LABEL: set_inactive_chain_arg_dpp: @@ -233,8 +221,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out, ; GISEL11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL11-NEXT: v_mov_b32_e32 v0, v12 ; GISEL11-NEXT: global_store_b32 v[41:42], v0, off -; GISEL11-NEXT: s_nop 0 -; GISEL11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL11-NEXT: s_endpgm ; ; DAGISEL11-LABEL: set_inactive_chain_arg_call: @@ -265,8 +251,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out, ; DAGISEL11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; DAGISEL11-NEXT: v_mov_b32_e32 v0, v12 ; DAGISEL11-NEXT: global_store_b32 v[41:42], v0, off -; DAGISEL11-NEXT: s_nop 0 -; DAGISEL11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; DAGISEL11-NEXT: s_endpgm ; ; GISEL10-LABEL: set_inactive_chain_arg_call: @@ -380,8 +364,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out, ; GISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL11_W64-NEXT: v_mov_b32_e32 v0, v12 ; GISEL11_W64-NEXT: global_store_b32 v[41:42], v0, off -; GISEL11_W64-NEXT: s_nop 0 -; GISEL11_W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL11_W64-NEXT: s_endpgm ; ; DAGISEL11_W64-LABEL: set_inactive_chain_arg_call: @@ -419,8 +401,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out, ; DAGISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) ; DAGISEL11_W64-NEXT: v_mov_b32_e32 v0, v12 ; DAGISEL11_W64-NEXT: global_store_b32 v[41:42], v0, off -; DAGISEL11_W64-NEXT: s_nop 0 -; DAGISEL11_W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; DAGISEL11_W64-NEXT: s_endpgm ; ; GISEL10_W64-LABEL: set_inactive_chain_arg_call: @@ -538,8 +518,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) % ; GISEL11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL11-NEXT: v_mov_b32_e32 v0, v12 ; GISEL11-NEXT: global_store_b32 v[41:42], v0, off -; GISEL11-NEXT: s_nop 0 -; GISEL11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL11-NEXT: s_endpgm ; ; DAGISEL11-LABEL: set_inactive_chain_arg_last_vgpr: @@ -570,8 +548,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) % ; DAGISEL11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; DAGISEL11-NEXT: v_mov_b32_e32 v0, v12 ; DAGISEL11-NEXT: global_store_b32 v[41:42], v0, off -; DAGISEL11-NEXT: s_nop 0 -; DAGISEL11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; DAGISEL11-NEXT: s_endpgm ; ; GISEL10-LABEL: set_inactive_chain_arg_last_vgpr: @@ -685,8 +661,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) % ; GISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL11_W64-NEXT: v_mov_b32_e32 v0, v12 ; GISEL11_W64-NEXT: global_store_b32 v[41:42], v0, off -; GISEL11_W64-NEXT: s_nop 0 -; GISEL11_W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL11_W64-NEXT: s_endpgm ; ; DAGISEL11_W64-LABEL: set_inactive_chain_arg_last_vgpr: @@ -724,8 +698,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) % ; DAGISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) ; DAGISEL11_W64-NEXT: v_mov_b32_e32 v0, v12 ; DAGISEL11_W64-NEXT: global_store_b32 v[41:42], v0, off -; DAGISEL11_W64-NEXT: s_nop 0 -; DAGISEL11_W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; DAGISEL11_W64-NEXT: s_endpgm ; ; GISEL10_W64-LABEL: set_inactive_chain_arg_last_vgpr: diff --git a/llvm/test/CodeGen/AMDGPU/release-vgprs.mir b/llvm/test/CodeGen/AMDGPU/release-vgprs.mir index 3a879e818af79..6366485874c11 100644 --- a/llvm/test/CodeGen/AMDGPU/release-vgprs.mir +++ b/llvm/test/CodeGen/AMDGPU/release-vgprs.mir @@ -22,6 +22,8 @@ define amdgpu_ps void @global_atomic() { ret void } define amdgpu_ps void @image_atomic() { ret void } define amdgpu_ps void @global_store_optnone() noinline optnone { ret void } + define amdgpu_gfx void @gfx_function() { ret void } + define void @ccc_function() { ret void } ... --- @@ -556,3 +558,25 @@ body: | S_WAITCNT_VSCNT undef $sgpr_null, 0 S_ENDPGM 0 ... + +--- +name: gfx_function +body: | + bb.0: + ; CHECK-LABEL: name: gfx_function + ; CHECK-NOT: S_SENDMSG 3 + ; CHECK: S_ENDPGM 0 + GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec + S_ENDPGM 0 +... + +--- +name: ccc_function +body: | + bb.0: + ; CHECK-LABEL: name: ccc_function + ; CHECK-NOT: S_SENDMSG 3 + ; CHECK: S_ENDPGM 0 + GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec + S_ENDPGM 0 +...