From 48b3801ed3a1d44e36aed980a053e67f274fd853 Mon Sep 17 00:00:00 2001 From: Diana Picus Date: Wed, 8 Nov 2023 08:57:45 +0100 Subject: [PATCH 1/3] [AMDGPU] Don't DEALLOC_VGPRS from callable functions Callable functions should not send the DEALLOC_VGPRS message, because that might release the VGPRs and scratch allocation before the caller is done with them. --- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 9 +++++-- .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 4 +++ llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 5 ++++ llvm/test/CodeGen/AMDGPU/release-vgprs.mir | 26 +++++++++++++++++++ 4 files changed, 42 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index ede4841b8a5fd..d862b37443aec 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1027,6 +1027,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, Wait.VmCnt = 0; } + CallingConv::ID CC = MI.getMF()->getFunction().getCallingConv(); + // All waits must be resolved at call return. // NOTE: this could be improved with knowledge of all call sites or // with knowledge of the called routines. @@ -1039,10 +1041,13 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, // Identify S_ENDPGM instructions which may have to wait for outstanding VMEM // stores. In this case it can be useful to send a message to explicitly // release all VGPRs before the stores have completed, but it is only safe to - // do this if there are no outstanding scratch stores. + // do this if: + // * there are no outstanding scratch stores + // * this is not a callable function else if (MI.getOpcode() == AMDGPU::S_ENDPGM || MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) { - if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && !OptNone && + if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && + !AMDGPU::isCallableCC(CC) && !OptNone && ScoreBrackets.getScoreRange(VS_CNT) != 0 && !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS)) ReleaseVGPRInsts.insert(&MI); diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index a09abc639d759..5ea135c7b90dd 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -1924,6 +1924,10 @@ bool isChainCC(CallingConv::ID CC) { } } +bool isCallableCC(CallingConv::ID CC) { + return !isEntryFunctionCC(CC) && !isChainCC(CC); +} + bool isKernelCC(const Function *Func) { return AMDGPU::isModuleEntryFunctionCC(Func->getCallingConv()); } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 1e0994d0862cf..9654140192634 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -1114,6 +1114,11 @@ bool isModuleEntryFunctionCC(CallingConv::ID CC); LLVM_READNONE bool isChainCC(CallingConv::ID CC); +// Functions that are called via the 'call' instruction, rather than launched +// by the hardware or via the 'llvm.amdgcn.cs.chain' intrinsic. +LLVM_READNONE +bool isCallableCC(CallingConv::ID CC); + bool isKernelCC(const Function *Func); // FIXME: Remove this when calling conventions cleaned up diff --git a/llvm/test/CodeGen/AMDGPU/release-vgprs.mir b/llvm/test/CodeGen/AMDGPU/release-vgprs.mir index 3a879e818af79..39ced04253b57 100644 --- a/llvm/test/CodeGen/AMDGPU/release-vgprs.mir +++ b/llvm/test/CodeGen/AMDGPU/release-vgprs.mir @@ -22,6 +22,8 @@ define amdgpu_ps void @global_atomic() { ret void } define amdgpu_ps void @image_atomic() { ret void } define amdgpu_ps void @global_store_optnone() noinline optnone { ret void } + define amdgpu_gfx void @gfx_function() { ret void } + define void @ccc_function() { ret void } ... --- @@ -556,3 +558,27 @@ body: | S_WAITCNT_VSCNT undef $sgpr_null, 0 S_ENDPGM 0 ... + +--- +name: gfx_function +body: | + bb.0: + ; CHECK-LABEL: name: gfx_function + ; CHECK-NOT: S_SENDMSG 3 + ; CHECK: S_ENDPGM 0 + GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec + S_WAITCNT_VSCNT undef $sgpr_null, 0 + S_ENDPGM 0 +... + +--- +name: ccc_function +body: | + bb.0: + ; CHECK-LABEL: name: ccc_function + ; CHECK-NOT: S_SENDMSG 3 + ; CHECK: S_ENDPGM 0 + GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec + S_WAITCNT_VSCNT undef $sgpr_null, 0 + S_ENDPGM 0 +... From 39f5b846aebc7a5ef44fab4797c469a7b60f11ef Mon Sep 17 00:00:00 2001 From: Diana Picus Date: Wed, 8 Nov 2023 08:57:45 +0100 Subject: [PATCH 2/3] Address comments + minor cleanup --- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 7 ++--- .../llvm.amdgcn.set.inactive.chain.arg.ll | 28 ------------------- llvm/test/CodeGen/AMDGPU/release-vgprs.mir | 2 -- 3 files changed, 3 insertions(+), 34 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index d862b37443aec..c9b095456971f 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1041,13 +1041,12 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, // Identify S_ENDPGM instructions which may have to wait for outstanding VMEM // stores. In this case it can be useful to send a message to explicitly // release all VGPRs before the stores have completed, but it is only safe to - // do this if: - // * there are no outstanding scratch stores - // * this is not a callable function + // do this if there are no outstanding scratch stores (either from the current + // function or potentially from a caller or callee). else if (MI.getOpcode() == AMDGPU::S_ENDPGM || MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) { if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && - !AMDGPU::isCallableCC(CC) && !OptNone && + AMDGPU::isEntryFunctionCC(CC) && !OptNone && ScoreBrackets.getScoreRange(VS_CNT) != 0 && !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS)) ReleaseVGPRInsts.insert(&MI); diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll index 6fd6d6e2e31a1..65b70587fa0ac 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll @@ -17,8 +17,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg(ptr addrspace(1) %out, i32 % ; GFX11-NEXT: v_mov_b32_e32 v0, v10 ; GFX11-NEXT: s_not_b32 exec_lo, exec_lo ; GFX11-NEXT: global_store_b32 v[8:9], v0, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX10-LABEL: set_inactive_chain_arg: @@ -39,8 +37,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg(ptr addrspace(1) %out, i32 % ; GFX11_W64-NEXT: v_mov_b32_e32 v0, v10 ; GFX11_W64-NEXT: s_not_b64 exec, exec ; GFX11_W64-NEXT: global_store_b32 v[8:9], v0, off -; GFX11_W64-NEXT: s_nop 0 -; GFX11_W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11_W64-NEXT: s_endpgm ; ; GFX10_W64-LABEL: set_inactive_chain_arg: @@ -68,8 +64,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_64(ptr addrspace(1) %out, i6 ; GFX11-NEXT: v_mov_b32_e32 v1, v11 ; GFX11-NEXT: s_not_b32 exec_lo, exec_lo ; GFX11-NEXT: global_store_b64 v[8:9], v[0:1], off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX10-LABEL: set_inactive_chain_arg_64: @@ -94,8 +88,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_64(ptr addrspace(1) %out, i6 ; GFX11_W64-NEXT: v_mov_b32_e32 v1, v11 ; GFX11_W64-NEXT: s_not_b64 exec, exec ; GFX11_W64-NEXT: global_store_b64 v[8:9], v[0:1], off -; GFX11_W64-NEXT: s_nop 0 -; GFX11_W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11_W64-NEXT: s_endpgm ; ; GFX10_W64-LABEL: set_inactive_chain_arg_64: @@ -133,8 +125,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_dpp(ptr addrspace(1) %out, i ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v2, v1 ; GFX11-NEXT: global_store_b32 v[8:9], v2, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX10-LABEL: set_inactive_chain_arg_dpp: @@ -174,8 +164,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_dpp(ptr addrspace(1) %out, i ; GFX11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11_W64-NEXT: v_mov_b32_e32 v2, v1 ; GFX11_W64-NEXT: global_store_b32 v[8:9], v2, off -; GFX11_W64-NEXT: s_nop 0 -; GFX11_W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11_W64-NEXT: s_endpgm ; ; GFX10_W64-LABEL: set_inactive_chain_arg_dpp: @@ -233,8 +221,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out, ; GISEL11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL11-NEXT: v_mov_b32_e32 v0, v12 ; GISEL11-NEXT: global_store_b32 v[41:42], v0, off -; GISEL11-NEXT: s_nop 0 -; GISEL11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL11-NEXT: s_endpgm ; ; DAGISEL11-LABEL: set_inactive_chain_arg_call: @@ -265,8 +251,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out, ; DAGISEL11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; DAGISEL11-NEXT: v_mov_b32_e32 v0, v12 ; DAGISEL11-NEXT: global_store_b32 v[41:42], v0, off -; DAGISEL11-NEXT: s_nop 0 -; DAGISEL11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; DAGISEL11-NEXT: s_endpgm ; ; GISEL10-LABEL: set_inactive_chain_arg_call: @@ -380,8 +364,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out, ; GISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL11_W64-NEXT: v_mov_b32_e32 v0, v12 ; GISEL11_W64-NEXT: global_store_b32 v[41:42], v0, off -; GISEL11_W64-NEXT: s_nop 0 -; GISEL11_W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL11_W64-NEXT: s_endpgm ; ; DAGISEL11_W64-LABEL: set_inactive_chain_arg_call: @@ -419,8 +401,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out, ; DAGISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) ; DAGISEL11_W64-NEXT: v_mov_b32_e32 v0, v12 ; DAGISEL11_W64-NEXT: global_store_b32 v[41:42], v0, off -; DAGISEL11_W64-NEXT: s_nop 0 -; DAGISEL11_W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; DAGISEL11_W64-NEXT: s_endpgm ; ; GISEL10_W64-LABEL: set_inactive_chain_arg_call: @@ -538,8 +518,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) % ; GISEL11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL11-NEXT: v_mov_b32_e32 v0, v12 ; GISEL11-NEXT: global_store_b32 v[41:42], v0, off -; GISEL11-NEXT: s_nop 0 -; GISEL11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL11-NEXT: s_endpgm ; ; DAGISEL11-LABEL: set_inactive_chain_arg_last_vgpr: @@ -570,8 +548,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) % ; DAGISEL11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; DAGISEL11-NEXT: v_mov_b32_e32 v0, v12 ; DAGISEL11-NEXT: global_store_b32 v[41:42], v0, off -; DAGISEL11-NEXT: s_nop 0 -; DAGISEL11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; DAGISEL11-NEXT: s_endpgm ; ; GISEL10-LABEL: set_inactive_chain_arg_last_vgpr: @@ -685,8 +661,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) % ; GISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL11_W64-NEXT: v_mov_b32_e32 v0, v12 ; GISEL11_W64-NEXT: global_store_b32 v[41:42], v0, off -; GISEL11_W64-NEXT: s_nop 0 -; GISEL11_W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL11_W64-NEXT: s_endpgm ; ; DAGISEL11_W64-LABEL: set_inactive_chain_arg_last_vgpr: @@ -724,8 +698,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) % ; DAGISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) ; DAGISEL11_W64-NEXT: v_mov_b32_e32 v0, v12 ; DAGISEL11_W64-NEXT: global_store_b32 v[41:42], v0, off -; DAGISEL11_W64-NEXT: s_nop 0 -; DAGISEL11_W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; DAGISEL11_W64-NEXT: s_endpgm ; ; GISEL10_W64-LABEL: set_inactive_chain_arg_last_vgpr: diff --git a/llvm/test/CodeGen/AMDGPU/release-vgprs.mir b/llvm/test/CodeGen/AMDGPU/release-vgprs.mir index 39ced04253b57..6366485874c11 100644 --- a/llvm/test/CodeGen/AMDGPU/release-vgprs.mir +++ b/llvm/test/CodeGen/AMDGPU/release-vgprs.mir @@ -567,7 +567,6 @@ body: | ; CHECK-NOT: S_SENDMSG 3 ; CHECK: S_ENDPGM 0 GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec - S_WAITCNT_VSCNT undef $sgpr_null, 0 S_ENDPGM 0 ... @@ -579,6 +578,5 @@ body: | ; CHECK-NOT: S_SENDMSG 3 ; CHECK: S_ENDPGM 0 GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec - S_WAITCNT_VSCNT undef $sgpr_null, 0 S_ENDPGM 0 ... From a8249d20c1a49f557546da53e758106f8d481217 Mon Sep 17 00:00:00 2001 From: Diana Picus Date: Fri, 17 Nov 2023 09:56:34 +0100 Subject: [PATCH 3/3] Remove unused function --- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 4 ---- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 5 ----- 2 files changed, 9 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 5ea135c7b90dd..a09abc639d759 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -1924,10 +1924,6 @@ bool isChainCC(CallingConv::ID CC) { } } -bool isCallableCC(CallingConv::ID CC) { - return !isEntryFunctionCC(CC) && !isChainCC(CC); -} - bool isKernelCC(const Function *Func) { return AMDGPU::isModuleEntryFunctionCC(Func->getCallingConv()); } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 9654140192634..1e0994d0862cf 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -1114,11 +1114,6 @@ bool isModuleEntryFunctionCC(CallingConv::ID CC); LLVM_READNONE bool isChainCC(CallingConv::ID CC); -// Functions that are called via the 'call' instruction, rather than launched -// by the hardware or via the 'llvm.amdgcn.cs.chain' intrinsic. -LLVM_READNONE -bool isCallableCC(CallingConv::ID CC); - bool isKernelCC(const Function *Func); // FIXME: Remove this when calling conventions cleaned up