From 4c9ae7c657210e7875bfcda4f96cdd6ffd93613c Mon Sep 17 00:00:00 2001 From: jofrn Date: Wed, 29 Oct 2025 02:51:36 -0400 Subject: [PATCH 1/4] [AMDGPU] Initialize FrameOffsetReg for amdgpu_cs_chain functions Functions with the amdgpu_cs_chain calling convention were not initializing FrameOffsetReg, leaving it as FP_REG. This caused machine code verification failures as SCRATCH_STORE_DWORD_SADDR instructions require the saddr operand to be in the SReg_32_XEXEC_HI register class. This LLVM defect was identified via the AMD Fuzzing project. --- .../Target/AMDGPU/SIMachineFunctionInfo.cpp | 1 + .../AMDGPU/amdgpu-cs-chain-frame-pointer.ll | 64 +++++++++++++++++++ 2 files changed, 65 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-frame-pointer.ll diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index b398db4f7caff..a9f1e9a2ad31d 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -98,6 +98,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F, // set one up. For now, we can use s32 to match what amdgpu_gfx functions // would use if called, but this can be revisited. // FIXME: Only reserve this if we actually need it. + FrameOffsetReg = AMDGPU::SGPR33; StackPtrOffsetReg = AMDGPU::SGPR32; ScratchRSrcReg = AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51; diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-frame-pointer.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-frame-pointer.ll new file mode 100644 index 0000000000000..3e2ecad67fbec --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-frame-pointer.ll @@ -0,0 +1,64 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 -O0 -verify-machineinstrs < %s | FileCheck %s + +define amdgpu_cs_chain void @indirect(ptr %callee) { +; CHECK-LABEL: indirect: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s32, 16 +; CHECK-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; CHECK-NEXT: scratch_store_dword off, v40, off ; 4-byte Folded Spill +; CHECK-NEXT: scratch_store_dword off, v41, off offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[0:1] +; CHECK-NEXT: ; implicit-def: $sgpr6_sgpr7 +; CHECK-NEXT: ; implicit-def: $sgpr10_sgpr11 +; CHECK-NEXT: ; implicit-def: $sgpr0 +; CHECK-NEXT: s_mov_b32 s1, 0 +; CHECK-NEXT: ; implicit-def: $vgpr40 : SGPR spill to VGPR lane +; CHECK-NEXT: v_writelane_b32 v40, s1, 0 +; CHECK-NEXT: v_mov_b32_e32 v0, s1 +; CHECK-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-NEXT: s_mov_b64 s[4:5], s[6:7] +; CHECK-NEXT: s_mov_b64 s[8:9], 36 +; CHECK-NEXT: s_mov_b32 s12, s0 +; CHECK-NEXT: s_mov_b32 s13, s0 +; CHECK-NEXT: s_mov_b32 s14, s0 +; CHECK-NEXT: s_mov_b32 s15, s0 +; CHECK-NEXT: v_mov_b32_e32 v31, s0 +; CHECK-NEXT: s_getpc_b64 s[0:1] +; CHECK-NEXT: s_add_u32 s0, s0, indirect@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s1, s1, indirect@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_swappc_b64 s[30:31], s[0:1] +; CHECK-NEXT: v_readlane_b32 s3, v40, 0 +; CHECK-NEXT: s_nop 1 +; CHECK-NEXT: v_mov_b32_e32 v0, s3 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: v_mov_b32_e32 v0, s3 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: v_readfirstlane_b32 s1, v0 +; CHECK-NEXT: v_mov_b32_e32 v0, s3 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: v_readfirstlane_b32 s2, v0 +; CHECK-NEXT: v_mov_b32_e32 v8, s3 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: v_mov_b32_e32 v9, s4 +; CHECK-NEXT: v_mov_b32_e32 v10, s3 +; CHECK-NEXT: v_mov_b32_e32 v11, s3 +; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: v_readlane_b32 s3, v41, 0 +; CHECK-NEXT: s_xor_saveexec_b64 s[6:7], -1 +; CHECK-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; CHECK-NEXT: scratch_load_dword v41, off, s33 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[6:7] +; CHECK-NEXT: s_mov_b32 s33, s3 +; CHECK-NEXT: s_mov_b64 exec, 0 +; CHECK-NEXT: s_setpc_b64 s[4:5] + call void @indirect(ptr null) + call void (ptr, i64, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain.p0.i64.v3i32.sl_i32p5i32i32s(ptr null, i64 0, <3 x i32> inreg zeroinitializer, { i32, ptr addrspace(5), i32, i32 } zeroinitializer, i32 0) + unreachable +} + +declare void @llvm.amdgcn.cs.chain.p0.i64.v3i32.sl_i32p5i32i32s(ptr, i64, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32 immarg, ...) From d4eecd03899caa1217f1f894eaee9cf9a15bed47 Mon Sep 17 00:00:00 2001 From: jofrn Date: Mon, 10 Nov 2025 06:11:59 -0500 Subject: [PATCH 2/4] For emitEpilogue, emitCSRSpillRestores' argument in FPSaved case to handle chain functions in same way as non-FPSaved case, an offset. Remove prior hardcode to s33 and do not restore FP for chain functions. --- llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 5 +++-- llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 1 - .../CodeGen/AMDGPU/amdgpu-cs-chain-frame-pointer.ll | 11 ++++++++--- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index aa5ea77f17291..ae5e25406b4cb 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -1408,11 +1408,12 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF, LiveUnits.addReg(FramePtrRegScratchCopy); } - emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits, FramePtrReg, + emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits, + FuncInfo->isChainFunction() ? Register() : FramePtrReg, FramePtrRegScratchCopy); } - if (FPSaved) { + if (FPSaved && !FuncInfo->isChainFunction()) { // Insert the copy to restore FP. Register SrcReg = SGPRForFPSaveRestoreCopy ? SGPRForFPSaveRestoreCopy : FramePtrRegScratchCopy; diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index a9f1e9a2ad31d..b398db4f7caff 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -98,7 +98,6 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F, // set one up. For now, we can use s32 to match what amdgpu_gfx functions // would use if called, but this can be revisited. // FIXME: Only reserve this if we actually need it. - FrameOffsetReg = AMDGPU::SGPR33; StackPtrOffsetReg = AMDGPU::SGPR32; ScratchRSrcReg = AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51; diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-frame-pointer.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-frame-pointer.ll index 3e2ecad67fbec..6c62a65943b41 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-frame-pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-frame-pointer.ll @@ -50,10 +50,15 @@ define amdgpu_cs_chain void @indirect(ptr %callee) { ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: v_readlane_b32 s3, v41, 0 ; CHECK-NEXT: s_xor_saveexec_b64 s[6:7], -1 -; CHECK-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload -; CHECK-NEXT: scratch_load_dword v41, off, s33 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: scratch_load_dword v40, off, off ; 4-byte Folded Reload +; CHECK-NEXT: scratch_load_dword v41, off, off offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] -; CHECK-NEXT: s_mov_b32 s33, s3 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_readlane_b32 s3, v41, 0 +; CHECK-NEXT: s_xor_saveexec_b64 s[8:9], -1 +; CHECK-NEXT: scratch_load_dword v40, off, off ; 4-byte Folded Reload +; CHECK-NEXT: scratch_load_dword v41, off, off offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[8:9] ; CHECK-NEXT: s_mov_b64 exec, 0 ; CHECK-NEXT: s_setpc_b64 s[4:5] call void @indirect(ptr null) From 820b9c3b03403a2f2f3b931648c9923cefdf8ff7 Mon Sep 17 00:00:00 2001 From: jofrn Date: Mon, 10 Nov 2025 08:07:55 -0500 Subject: [PATCH 3/4] Added test case to verify no FP setup occurs with "frame-pointer"="all". --- llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 14 +++++++------- .../AMDGPU/amdgpu-cs-chain-frame-pointer.ll | 15 +++++++++++++++ 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index ae5e25406b4cb..57bb06077b92e 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -2171,13 +2171,13 @@ bool SIFrameLowering::hasFPImpl(const MachineFunction &MF) const { return MFI.getStackSize() != 0; } - return (frameTriviallyRequiresSP(MFI) && - !MF.getInfo()->isChainFunction()) || - MFI.isFrameAddressTaken() || - MF.getSubtarget().getRegisterInfo()->hasStackRealignment( - MF) || - mayReserveScratchForCWSR(MF) || - MF.getTarget().Options.DisableFramePointerElim(MF); + return !MF.getInfo()->isChainFunction() && + (frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() || + MF.getSubtarget() + .getRegisterInfo() + ->hasStackRealignment(MF) || + mayReserveScratchForCWSR(MF) || + MF.getTarget().Options.DisableFramePointerElim(MF)); } bool SIFrameLowering::mayReserveScratchForCWSR( diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-frame-pointer.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-frame-pointer.ll index 6c62a65943b41..85675042aed9d 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-frame-pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-frame-pointer.ll @@ -66,4 +66,19 @@ define amdgpu_cs_chain void @indirect(ptr %callee) { unreachable } +define amdgpu_cs_chain void @fp_all() #0 { +; CHECK-LABEL: fp_all: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s0, 0 +; CHECK-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-NEXT: scratch_store_dword off, v0, off +; CHECK-NEXT: s_endpgm + %v = alloca i32, align 4, addrspace(5) + store i32 0, ptr addrspace(5) %v, align 4 + ret void +} + +attributes #0 = { "frame-pointer"="all" } + declare void @llvm.amdgcn.cs.chain.p0.i64.v3i32.sl_i32p5i32i32s(ptr, i64, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32 immarg, ...) From 506b411d359500e1f54388723d97b3fac80ac740 Mon Sep 17 00:00:00 2001 From: jofrn Date: Mon, 10 Nov 2025 10:51:05 -0500 Subject: [PATCH 4/4] Rename indirect to recurse and keep in original file -- invalid for gfx1200. Move fp_all test to fp-nosave since it compiles on both gfx942 and gfx1200. --- .../AMDGPU/amdgpu-cs-chain-fp-nosave.ll | 25 +++++++++++++++++++ .../AMDGPU/amdgpu-cs-chain-frame-pointer.ll | 25 ++++--------------- 2 files changed, 30 insertions(+), 20 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll index 06150e4277e9a..f079630c4bffc 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll @@ -517,3 +517,28 @@ define amdgpu_cs_chain void @test_call_and_alloca_var(i32 %count) { store i32 0, ptr addrspace(5) %v, align 4 ret void } + +define amdgpu_cs_chain void @test_fp_all() #0 { +; GFX12-LABEL: test_fp_all: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: scratch_store_b32 off, v0, off +; GFX12-NEXT: s_endpgm +; +; GFX942-LABEL: test_fp_all: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: scratch_store_dword off, v0, off +; GFX942-NEXT: s_endpgm + %v = alloca i32, align 4, addrspace(5) + store i32 0, ptr addrspace(5) %v, align 4 + ret void +} + +attributes #0 = { "frame-pointer"="all" } diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-frame-pointer.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-frame-pointer.ll index 85675042aed9d..833c81683ab6c 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-frame-pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-frame-pointer.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 -O0 -verify-machineinstrs < %s | FileCheck %s -define amdgpu_cs_chain void @indirect(ptr %callee) { -; CHECK-LABEL: indirect: +define amdgpu_cs_chain void @recurse(ptr %callee) { +; CHECK-LABEL: recurse: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_mov_b32 s32, 16 @@ -26,8 +26,8 @@ define amdgpu_cs_chain void @indirect(ptr %callee) { ; CHECK-NEXT: s_mov_b32 s15, s0 ; CHECK-NEXT: v_mov_b32_e32 v31, s0 ; CHECK-NEXT: s_getpc_b64 s[0:1] -; CHECK-NEXT: s_add_u32 s0, s0, indirect@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s1, s1, indirect@gotpcrel32@hi+12 +; CHECK-NEXT: s_add_u32 s0, s0, recurse@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s1, s1, recurse@gotpcrel32@hi+12 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -61,24 +61,9 @@ define amdgpu_cs_chain void @indirect(ptr %callee) { ; CHECK-NEXT: s_mov_b64 exec, s[8:9] ; CHECK-NEXT: s_mov_b64 exec, 0 ; CHECK-NEXT: s_setpc_b64 s[4:5] - call void @indirect(ptr null) + call void @recurse(ptr null) call void (ptr, i64, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain.p0.i64.v3i32.sl_i32p5i32i32s(ptr null, i64 0, <3 x i32> inreg zeroinitializer, { i32, ptr addrspace(5), i32, i32 } zeroinitializer, i32 0) unreachable } -define amdgpu_cs_chain void @fp_all() #0 { -; CHECK-LABEL: fp_all: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s0, 0 -; CHECK-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-NEXT: scratch_store_dword off, v0, off -; CHECK-NEXT: s_endpgm - %v = alloca i32, align 4, addrspace(5) - store i32 0, ptr addrspace(5) %v, align 4 - ret void -} - -attributes #0 = { "frame-pointer"="all" } - declare void @llvm.amdgcn.cs.chain.p0.i64.v3i32.sl_i32p5i32i32s(ptr, i64, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32 immarg, ...)