diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index aa5ea77f17291..57bb06077b92e 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -1408,11 +1408,12 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
       LiveUnits.addReg(FramePtrRegScratchCopy);
     }
 
-    emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits, FramePtrReg,
+    emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits,
+                         FuncInfo->isChainFunction() ? Register() : FramePtrReg,
                          FramePtrRegScratchCopy);
   }
 
-  if (FPSaved) {
+  if (FPSaved && !FuncInfo->isChainFunction()) {
     // Insert the copy to restore FP.
     Register SrcReg = SGPRForFPSaveRestoreCopy ? SGPRForFPSaveRestoreCopy
                                                : FramePtrRegScratchCopy;
@@ -2170,13 +2171,13 @@ bool SIFrameLowering::hasFPImpl(const MachineFunction &MF) const {
     return MFI.getStackSize() != 0;
   }
 
-  return (frameTriviallyRequiresSP(MFI) &&
-          !MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) ||
-         MFI.isFrameAddressTaken() ||
-         MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment(
-             MF) ||
-         mayReserveScratchForCWSR(MF) ||
-         MF.getTarget().Options.DisableFramePointerElim(MF);
+  return !MF.getInfo<SIMachineFunctionInfo>()->isChainFunction() &&
+         (frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() ||
+          MF.getSubtarget<GCNSubtarget>()
+              .getRegisterInfo()
+              ->hasStackRealignment(MF) ||
+          mayReserveScratchForCWSR(MF) ||
+          MF.getTarget().Options.DisableFramePointerElim(MF));
 }
 
 bool SIFrameLowering::mayReserveScratchForCWSR(
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll
index 06150e4277e9a..f079630c4bffc 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll
@@ -517,3 +517,28 @@ define amdgpu_cs_chain void @test_call_and_alloca_var(i32 %count) {
   store i32 0, ptr addrspace(5) %v, align 4
   ret void
 }
+
+define amdgpu_cs_chain void @test_fp_all() #0 {
+; GFX12-LABEL: test_fp_all:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    scratch_store_b32 off, v0, off
+; GFX12-NEXT:    s_endpgm
+;
+; GFX942-LABEL: test_fp_all:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    scratch_store_dword off, v0, off
+; GFX942-NEXT:    s_endpgm
+  %v = alloca i32, align 4, addrspace(5)
+  store i32 0, ptr addrspace(5) %v, align 4
+  ret void
+}
+
+attributes #0 = { "frame-pointer"="all" }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-frame-pointer.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-frame-pointer.ll
new file mode 100644
index 0000000000000..833c81683ab6c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-frame-pointer.ll
@@ -0,0 +1,69 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 -O0 -verify-machineinstrs < %s | FileCheck %s
+
+define amdgpu_cs_chain void @recurse(ptr %callee) {
+; CHECK-LABEL: recurse:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_mov_b32 s32, 16
+; CHECK-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; CHECK-NEXT:    scratch_store_dword off, v40, off ; 4-byte Folded Spill
+; CHECK-NEXT:    scratch_store_dword off, v41, off offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT:    s_mov_b64 exec, s[0:1]
+; CHECK-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; CHECK-NEXT:    ; implicit-def: $sgpr10_sgpr11
+; CHECK-NEXT:    ; implicit-def: $sgpr0
+; CHECK-NEXT:    s_mov_b32 s1, 0
+; CHECK-NEXT:    ; implicit-def: $vgpr40 : SGPR spill to VGPR lane
+; CHECK-NEXT:    v_writelane_b32 v40, s1, 0
+; CHECK-NEXT:    v_mov_b32_e32 v0, s1
+; CHECK-NEXT:    v_mov_b32_e32 v1, s1
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; CHECK-NEXT:    s_mov_b64 s[8:9], 36
+; CHECK-NEXT:    s_mov_b32 s12, s0
+; CHECK-NEXT:    s_mov_b32 s13, s0
+; CHECK-NEXT:    s_mov_b32 s14, s0
+; CHECK-NEXT:    s_mov_b32 s15, s0
+; CHECK-NEXT:    v_mov_b32_e32 v31, s0
+; CHECK-NEXT:    s_getpc_b64 s[0:1]
+; CHECK-NEXT:    s_add_u32 s0, s0, recurse@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s1, s1, recurse@gotpcrel32@hi+12
+; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; CHECK-NEXT:    v_readlane_b32 s3, v40, 0
+; CHECK-NEXT:    s_nop 1
+; CHECK-NEXT:    v_mov_b32_e32 v0, s3
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
+; CHECK-NEXT:    v_mov_b32_e32 v0, s3
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    v_readfirstlane_b32 s1, v0
+; CHECK-NEXT:    v_mov_b32_e32 v0, s3
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    v_readfirstlane_b32 s2, v0
+; CHECK-NEXT:    v_mov_b32_e32 v8, s3
+; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:    v_mov_b32_e32 v9, s4
+; CHECK-NEXT:    v_mov_b32_e32 v10, s3
+; CHECK-NEXT:    v_mov_b32_e32 v11, s3
+; CHECK-NEXT:    s_mov_b64 s[4:5], 0
+; CHECK-NEXT:    v_readlane_b32 s3, v41, 0
+; CHECK-NEXT:    s_xor_saveexec_b64 s[6:7], -1
+; CHECK-NEXT:    scratch_load_dword v40, off, off ; 4-byte Folded Reload
+; CHECK-NEXT:    scratch_load_dword v41, off, off offset:4 ; 4-byte Folded Reload
+; CHECK-NEXT:    s_mov_b64 exec, s[6:7]
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_readlane_b32 s3, v41, 0
+; CHECK-NEXT:    s_xor_saveexec_b64 s[8:9], -1
+; CHECK-NEXT:    scratch_load_dword v40, off, off ; 4-byte Folded Reload
+; CHECK-NEXT:    scratch_load_dword v41, off, off offset:4 ; 4-byte Folded Reload
+; CHECK-NEXT:    s_mov_b64 exec, s[8:9]
+; CHECK-NEXT:    s_mov_b64 exec, 0
+; CHECK-NEXT:    s_setpc_b64 s[4:5]
+  call void @recurse(ptr null)
+  call void (ptr, i64, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain.p0.i64.v3i32.sl_i32p5i32i32s(ptr null, i64 0, <3 x i32> inreg zeroinitializer, { i32, ptr addrspace(5), i32, i32 } zeroinitializer, i32 0)
+  unreachable
+}
+
+declare void @llvm.amdgcn.cs.chain.p0.i64.v3i32.sl_i32p5i32i32s(ptr, i64, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32 immarg, ...)