diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td index 4d7090942142f..443bb45a23514 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td @@ -176,6 +176,10 @@ def CSR_AMDGPU_SI_Gfx_GFX90AInsts : CalleeSavedRegs< (add CSR_AMDGPU_SI_Gfx, CSR_AMDGPU_AGPRs) >; +def CSR_AMDGPU_CS_ChainPreserve : CalleeSavedRegs< + (sequence "VGPR%u", 8, 255) +>; + def CSR_AMDGPU_NoRegs : CalleeSavedRegs<(add)>; // Calling convention for leaf functions diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 350e2f1e3b987..d21a4f779487c 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -1322,7 +1322,11 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( SIMachineFunctionInfo *FuncInfo = MF.getInfo(); // Allocate spill slots for WWM reserved VGPRs. - if (!FuncInfo->isEntryFunction()) { + // For chain functions, we only need to do this if we have calls to + // llvm.amdgcn.cs.chain. + bool IsChainWithoutCalls = + FuncInfo->isChainFunction() && !MF.getFrameInfo().hasTailCall(); + if (!FuncInfo->isEntryFunction() && !IsChainWithoutCalls) { for (Register Reg : FuncInfo->getWWMReservedRegs()) { const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg); FuncInfo->allocateWWMSpill(MF, Reg, TRI->getSpillSize(*RC), @@ -1535,8 +1539,15 @@ void SIFrameLowering::determinePrologEpilogSGPRSaves( void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, BitVector &SavedVGPRs, RegScavenger *RS) const { - TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS); SIMachineFunctionInfo *MFI = MF.getInfo(); + + // If this is a function with the amdgpu_cs_chain[_preserve] calling + // convention and it doesn't contain any calls to llvm.amdgcn.cs.chain, then + // we don't need to save and restore anything. + if (MFI->isChainFunction() && !MF.getFrameInfo().hasTailCall()) + return; + + TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS); if (MFI->isEntryFunction()) return; @@ -1563,7 +1574,9 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, else if (TII->isWWMRegSpillOpcode(MI.getOpcode())) NeedExecCopyReservedReg = true; else if (MI.getOpcode() == AMDGPU::SI_RETURN || - MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) { + MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG || + (MFI->isChainFunction() && + TII->isChainCallOpcode(MI.getOpcode()))) { // We expect all return to be the same size. assert(!ReturnMI || (count_if(MI.operands(), [](auto Op) { return Op.isReg(); }) == diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 1fd72e6376e2d..29f549fc29a3c 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -687,6 +687,11 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { Opcode == AMDGPU::SI_SPILL_WWM_AV32_RESTORE; } + static bool isChainCallOpcode(uint64_t Opcode) { + return Opcode == AMDGPU::SI_CS_CHAIN_TC_W32 || + Opcode == AMDGPU::SI_CS_CHAIN_TC_W64; + } + static bool isDPP(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::DPP; } diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 7ab5ccde4faf4..8e083505e3d31 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -276,6 +276,14 @@ void SIMachineFunctionInfo::allocateWWMSpill(MachineFunction &MF, Register VGPR, if (isEntryFunction() || WWMSpills.count(VGPR)) return; + // Skip if this is a function with the amdgpu_cs_chain or + // amdgpu_cs_chain_preserve calling convention and this is a scratch register. + // We never need to allocate a spill for these because we don't even need to + // restore the inactive lanes for them (they're scratchier than the usual + // scratch registers). + if (isChainFunction() && SIRegisterInfo::isChainScratchRegister(VGPR)) + return; + WWMSpills.insert(std::make_pair( VGPR, MF.getFrameInfo().CreateSpillStackObject(Size, Alignment))); } diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 95ca7cffc24ee..bb02f47914f4e 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -397,6 +397,8 @@ const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs( case CallingConv::AMDGPU_Gfx: return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_SaveList : CSR_AMDGPU_SI_Gfx_SaveList; + case CallingConv::AMDGPU_CS_ChainPreserve: + return CSR_AMDGPU_CS_ChainPreserve_SaveList; default: { // Dummy to not crash RegisterClassInfo. static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister; @@ -435,6 +437,10 @@ const uint32_t *SIRegisterInfo::getNoPreservedMask() const { return CSR_AMDGPU_NoRegs_RegMask; } +bool SIRegisterInfo::isChainScratchRegister(Register VGPR) { + return VGPR >= AMDGPU::VGPR0 && VGPR < AMDGPU::VGPR8; +} + const TargetRegisterClass * SIRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, const MachineFunction &MF) const { diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index e45a0fae5d6c4..14fa91a715e19 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -90,6 +90,11 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo { CallingConv::ID) const override; const uint32_t *getNoPreservedMask() const override; + // Functions with the amdgpu_cs_chain or amdgpu_cs_chain_preserve calling + // conventions are free to use certain VGPRs without saving and restoring any + // lanes (not even inactive ones). + static bool isChainScratchRegister(Register VGPR); + // Stack access is very expensive. CSRs are also the high registers, and we // want to minimize the number of used registers. unsigned getCSRFirstUseCost() const override { diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll index ce8b3b3ede027..967bc985754ee 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll @@ -505,6 +505,92 @@ define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %a, <3 x i32> %b) { unreachable } +define amdgpu_cs_chain void @chain_to_chain_wwm(<3 x i32> inreg %a, <3 x i32> %b) { +; GISEL-GFX11-LABEL: chain_to_chain_wwm: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: s_mov_b32 s3, s0 +; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 3 +; GISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo +; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 4 +; GISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX11-NEXT: v_mov_b32_e32 v2, v1 +; GISEL-GFX11-NEXT: ;;#ASMSTART +; GISEL-GFX11-NEXT: s_nop +; GISEL-GFX11-NEXT: ;;#ASMEND +; GISEL-GFX11-NEXT: s_mov_b32 s4, chain_callee@abs32@lo +; GISEL-GFX11-NEXT: s_mov_b32 s5, chain_callee@abs32@hi +; GISEL-GFX11-NEXT: s_mov_b32 s0, s3 +; GISEL-GFX11-NEXT: v_mov_b32_e32 v8, v2 +; GISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-GFX11-NEXT: s_setpc_b64 s[4:5] +; +; GISEL-GFX10-LABEL: chain_to_chain_wwm: +; GISEL-GFX10: ; %bb.0: +; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX10-NEXT: s_mov_b32 s3, s0 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 3 +; GISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo +; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 4 +; GISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo +; GISEL-GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GISEL-GFX10-NEXT: ;;#ASMSTART +; GISEL-GFX10-NEXT: s_nop +; GISEL-GFX10-NEXT: ;;#ASMEND +; GISEL-GFX10-NEXT: s_mov_b32 s4, chain_callee@abs32@lo +; GISEL-GFX10-NEXT: s_mov_b32 s5, chain_callee@abs32@hi +; GISEL-GFX10-NEXT: s_mov_b32 s0, s3 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v8, v2 +; GISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-GFX10-NEXT: s_setpc_b64 s[4:5] +; +; DAGISEL-GFX11-LABEL: chain_to_chain_wwm: +; DAGISEL-GFX11: ; %bb.0: +; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL-GFX11-NEXT: s_mov_b32 s3, s0 +; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, 3 +; DAGISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo +; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, 4 +; DAGISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo +; DAGISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v2, v1 +; DAGISEL-GFX11-NEXT: ;;#ASMSTART +; DAGISEL-GFX11-NEXT: s_nop +; DAGISEL-GFX11-NEXT: ;;#ASMEND +; DAGISEL-GFX11-NEXT: s_mov_b32 s5, chain_callee@abs32@hi +; DAGISEL-GFX11-NEXT: s_mov_b32 s4, chain_callee@abs32@lo +; DAGISEL-GFX11-NEXT: s_mov_b32 s0, s3 +; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v8, v2 +; DAGISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-GFX11-NEXT: s_setpc_b64 s[4:5] +; +; DAGISEL-GFX10-LABEL: chain_to_chain_wwm: +; DAGISEL-GFX10: ; %bb.0: +; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL-GFX10-NEXT: s_mov_b32 s3, s0 +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, 3 +; DAGISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, 4 +; DAGISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v2, v1 +; DAGISEL-GFX10-NEXT: ;;#ASMSTART +; DAGISEL-GFX10-NEXT: s_nop +; DAGISEL-GFX10-NEXT: ;;#ASMEND +; DAGISEL-GFX10-NEXT: s_mov_b32 s5, chain_callee@abs32@hi +; DAGISEL-GFX10-NEXT: s_mov_b32 s4, chain_callee@abs32@lo +; DAGISEL-GFX10-NEXT: s_mov_b32 s0, s3 +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v8, v2 +; DAGISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-GFX10-NEXT: s_setpc_b64 s[4:5] + %i = call i32 @llvm.amdgcn.set.inactive(i32 3, i32 4) + call void asm "s_nop", "~{v0},~{v8},~{v16},~{s0}"() + %w = call i32 @llvm.amdgcn.wwm(i32 %i) + %c = insertelement <3 x i32> %b, i32 %w, i32 0 + call void(ptr, i32, <3 x i32>, <3 x i32>, i32, ...) @llvm.amdgcn.cs.chain.v3i32(ptr @chain_callee, i32 -1, <3 x i32> inreg %a, <3 x i32> %c, i32 0) + unreachable +} + define amdgpu_cs_chain void @chain_to_chain_use_all_v0_v7(<3 x i32> inreg %a, <3 x i32> %b) { ; GISEL-GFX11-LABEL: chain_to_chain_use_all_v0_v7: ; GISEL-GFX11: ; %bb.0: @@ -716,3 +802,5 @@ declare void @llvm.amdgcn.cs.chain.v4i32(ptr, i32, <4 x i32>, <4 x i32>, i32, .. declare amdgpu_cs_chain void @chain_callee_2(<2 x i32> inreg, <2 x i32>) declare amdgpu_cs_chain void @chain_callee(<3 x i32> inreg, <3 x i32>) declare amdgpu_cs_chain void @chain_callee_4(<4 x i32> inreg, <4 x i32>) +declare i32 @llvm.amdgcn.set.inactive(i32, i32) +declare i32 @llvm.amdgcn.wwm(i32) diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll index aa9f0a99e2e64..11d7faf8218dc 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll @@ -177,18 +177,19 @@ define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %a, <3 x i3 unreachable } -; FIXME: Preserve things (i.e. v16)! ; FIXME: Setup s32. define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve(<3 x i32> inreg %a, <3 x i32> %b) { ; GISEL-GFX11-LABEL: chain_preserve_to_chain_preserve: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, v8 +; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, s32 ; 4-byte Folded Spill ; GISEL-GFX11-NEXT: s_mov_b32 s3, s0 +; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, v8 ; GISEL-GFX11-NEXT: ;;#ASMSTART ; GISEL-GFX11-NEXT: s_nop ; GISEL-GFX11-NEXT: ;;#ASMEND +; GISEL-GFX11-NEXT: scratch_load_b32 v16, off, s32 ; 4-byte Folded Reload ; GISEL-GFX11-NEXT: s_mov_b32 s4, chain_preserve_callee@abs32@lo ; GISEL-GFX11-NEXT: s_mov_b32 s5, chain_preserve_callee@abs32@hi ; GISEL-GFX11-NEXT: v_mov_b32_e32 v8, v1 @@ -199,11 +200,13 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve(<3 x i32> ; GISEL-GFX10-LABEL: chain_preserve_to_chain_preserve: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, v8 +; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], s32 ; 4-byte Folded Spill ; GISEL-GFX10-NEXT: s_mov_b32 s3, s0 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, v8 ; GISEL-GFX10-NEXT: ;;#ASMSTART ; GISEL-GFX10-NEXT: s_nop ; GISEL-GFX10-NEXT: ;;#ASMEND +; GISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], s32 ; 4-byte Folded Reload ; GISEL-GFX10-NEXT: s_mov_b32 s4, chain_preserve_callee@abs32@lo ; GISEL-GFX10-NEXT: s_mov_b32 s5, chain_preserve_callee@abs32@hi ; GISEL-GFX10-NEXT: v_mov_b32_e32 v8, v1 @@ -214,14 +217,16 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve(<3 x i32> ; DAGISEL-GFX11-LABEL: chain_preserve_to_chain_preserve: ; DAGISEL-GFX11: ; %bb.0: ; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, s32 ; 4-byte Folded Spill ; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, v8 ; DAGISEL-GFX11-NEXT: s_mov_b32 s3, s0 ; DAGISEL-GFX11-NEXT: ;;#ASMSTART ; DAGISEL-GFX11-NEXT: s_nop ; DAGISEL-GFX11-NEXT: ;;#ASMEND +; DAGISEL-GFX11-NEXT: scratch_load_b32 v16, off, s32 ; 4-byte Folded Reload ; DAGISEL-GFX11-NEXT: s_mov_b32 s5, chain_preserve_callee@abs32@hi -; DAGISEL-GFX11-NEXT: s_mov_b32 s4, chain_preserve_callee@abs32@lo ; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v8, v1 +; DAGISEL-GFX11-NEXT: s_mov_b32 s4, chain_preserve_callee@abs32@lo ; DAGISEL-GFX11-NEXT: s_mov_b32 s0, s3 ; DAGISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1 ; DAGISEL-GFX11-NEXT: s_setpc_b64 s[4:5] @@ -229,14 +234,16 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve(<3 x i32> ; DAGISEL-GFX10-LABEL: chain_preserve_to_chain_preserve: ; DAGISEL-GFX10: ; %bb.0: ; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], s32 ; 4-byte Folded Spill ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, v8 ; DAGISEL-GFX10-NEXT: s_mov_b32 s3, s0 ; DAGISEL-GFX10-NEXT: ;;#ASMSTART ; DAGISEL-GFX10-NEXT: s_nop ; DAGISEL-GFX10-NEXT: ;;#ASMEND +; DAGISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], s32 ; 4-byte Folded Reload ; DAGISEL-GFX10-NEXT: s_mov_b32 s5, chain_preserve_callee@abs32@hi -; DAGISEL-GFX10-NEXT: s_mov_b32 s4, chain_preserve_callee@abs32@lo ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v8, v1 +; DAGISEL-GFX10-NEXT: s_mov_b32 s4, chain_preserve_callee@abs32@lo ; DAGISEL-GFX10-NEXT: s_mov_b32 s0, s3 ; DAGISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 ; DAGISEL-GFX10-NEXT: s_setpc_b64 s[4:5] @@ -249,11 +256,13 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain(<3 x i32> inreg %a ; GISEL-GFX11-LABEL: chain_preserve_to_chain: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, v8 +; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, s32 ; 4-byte Folded Spill ; GISEL-GFX11-NEXT: s_mov_b32 s3, s0 +; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, v8 ; GISEL-GFX11-NEXT: ;;#ASMSTART ; GISEL-GFX11-NEXT: s_nop ; GISEL-GFX11-NEXT: ;;#ASMEND +; GISEL-GFX11-NEXT: scratch_load_b32 v16, off, s32 ; 4-byte Folded Reload ; GISEL-GFX11-NEXT: s_mov_b32 s4, chain_callee@abs32@lo ; GISEL-GFX11-NEXT: s_mov_b32 s5, chain_callee@abs32@hi ; GISEL-GFX11-NEXT: v_mov_b32_e32 v8, v1 @@ -264,11 +273,13 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain(<3 x i32> inreg %a ; GISEL-GFX10-LABEL: chain_preserve_to_chain: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, v8 +; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], s32 ; 4-byte Folded Spill ; GISEL-GFX10-NEXT: s_mov_b32 s3, s0 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, v8 ; GISEL-GFX10-NEXT: ;;#ASMSTART ; GISEL-GFX10-NEXT: s_nop ; GISEL-GFX10-NEXT: ;;#ASMEND +; GISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], s32 ; 4-byte Folded Reload ; GISEL-GFX10-NEXT: s_mov_b32 s4, chain_callee@abs32@lo ; GISEL-GFX10-NEXT: s_mov_b32 s5, chain_callee@abs32@hi ; GISEL-GFX10-NEXT: v_mov_b32_e32 v8, v1 @@ -279,14 +290,16 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain(<3 x i32> inreg %a ; DAGISEL-GFX11-LABEL: chain_preserve_to_chain: ; DAGISEL-GFX11: ; %bb.0: ; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, s32 ; 4-byte Folded Spill ; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, v8 ; DAGISEL-GFX11-NEXT: s_mov_b32 s3, s0 ; DAGISEL-GFX11-NEXT: ;;#ASMSTART ; DAGISEL-GFX11-NEXT: s_nop ; DAGISEL-GFX11-NEXT: ;;#ASMEND +; DAGISEL-GFX11-NEXT: scratch_load_b32 v16, off, s32 ; 4-byte Folded Reload ; DAGISEL-GFX11-NEXT: s_mov_b32 s5, chain_callee@abs32@hi -; DAGISEL-GFX11-NEXT: s_mov_b32 s4, chain_callee@abs32@lo ; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v8, v1 +; DAGISEL-GFX11-NEXT: s_mov_b32 s4, chain_callee@abs32@lo ; DAGISEL-GFX11-NEXT: s_mov_b32 s0, s3 ; DAGISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1 ; DAGISEL-GFX11-NEXT: s_setpc_b64 s[4:5] @@ -294,14 +307,16 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain(<3 x i32> inreg %a ; DAGISEL-GFX10-LABEL: chain_preserve_to_chain: ; DAGISEL-GFX10: ; %bb.0: ; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], s32 ; 4-byte Folded Spill ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, v8 ; DAGISEL-GFX10-NEXT: s_mov_b32 s3, s0 ; DAGISEL-GFX10-NEXT: ;;#ASMSTART ; DAGISEL-GFX10-NEXT: s_nop ; DAGISEL-GFX10-NEXT: ;;#ASMEND +; DAGISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], s32 ; 4-byte Folded Reload ; DAGISEL-GFX10-NEXT: s_mov_b32 s5, chain_callee@abs32@hi -; DAGISEL-GFX10-NEXT: s_mov_b32 s4, chain_callee@abs32@lo ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v8, v1 +; DAGISEL-GFX10-NEXT: s_mov_b32 s4, chain_callee@abs32@lo ; DAGISEL-GFX10-NEXT: s_mov_b32 s0, s3 ; DAGISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 ; DAGISEL-GFX10-NEXT: s_setpc_b64 s[4:5] @@ -310,10 +325,107 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain(<3 x i32> inreg %a unreachable } +define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_wwm(<3 x i32> inreg %a, <3 x i32> %b) { +; GISEL-GFX11-LABEL: chain_preserve_to_chain_wwm: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, s32 ; 4-byte Folded Spill +; GISEL-GFX11-NEXT: s_mov_b32 s3, s0 +; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 3 +; GISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo +; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 4 +; GISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo +; GISEL-GFX11-NEXT: ;;#ASMSTART +; GISEL-GFX11-NEXT: s_nop +; GISEL-GFX11-NEXT: ;;#ASMEND +; GISEL-GFX11-NEXT: scratch_load_b32 v16, off, s32 ; 4-byte Folded Reload +; GISEL-GFX11-NEXT: v_mov_b32_e32 v2, v1 +; GISEL-GFX11-NEXT: s_mov_b32 s4, chain_callee@abs32@lo +; GISEL-GFX11-NEXT: s_mov_b32 s5, chain_callee@abs32@hi +; GISEL-GFX11-NEXT: s_mov_b32 s0, s3 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX11-NEXT: v_mov_b32_e32 v8, v2 +; GISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-GFX11-NEXT: s_setpc_b64 s[4:5] +; +; GISEL-GFX10-LABEL: chain_preserve_to_chain_wwm: +; GISEL-GFX10: ; %bb.0: +; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], s32 ; 4-byte Folded Spill +; GISEL-GFX10-NEXT: s_mov_b32 s3, s0 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 3 +; GISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo +; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 4 +; GISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo +; GISEL-GFX10-NEXT: ;;#ASMSTART +; GISEL-GFX10-NEXT: s_nop +; GISEL-GFX10-NEXT: ;;#ASMEND +; GISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], s32 ; 4-byte Folded Reload +; GISEL-GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GISEL-GFX10-NEXT: s_mov_b32 s4, chain_callee@abs32@lo +; GISEL-GFX10-NEXT: s_mov_b32 s5, chain_callee@abs32@hi +; GISEL-GFX10-NEXT: s_mov_b32 s0, s3 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v8, v2 +; GISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-GFX10-NEXT: s_setpc_b64 s[4:5] +; +; DAGISEL-GFX11-LABEL: chain_preserve_to_chain_wwm: +; DAGISEL-GFX11: ; %bb.0: +; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, s32 ; 4-byte Folded Spill +; DAGISEL-GFX11-NEXT: s_mov_b32 s3, s0 +; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, 3 +; DAGISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo +; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, 4 +; DAGISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo +; DAGISEL-GFX11-NEXT: ;;#ASMSTART +; DAGISEL-GFX11-NEXT: s_nop +; DAGISEL-GFX11-NEXT: ;;#ASMEND +; DAGISEL-GFX11-NEXT: scratch_load_b32 v16, off, s32 ; 4-byte Folded Reload +; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v2, v1 +; DAGISEL-GFX11-NEXT: s_mov_b32 s5, chain_callee@abs32@hi +; DAGISEL-GFX11-NEXT: s_mov_b32 s4, chain_callee@abs32@lo +; DAGISEL-GFX11-NEXT: s_mov_b32 s0, s3 +; DAGISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v8, v2 +; DAGISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-GFX11-NEXT: s_setpc_b64 s[4:5] +; +; DAGISEL-GFX10-LABEL: chain_preserve_to_chain_wwm: +; DAGISEL-GFX10: ; %bb.0: +; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], s32 ; 4-byte Folded Spill +; DAGISEL-GFX10-NEXT: s_mov_b32 s3, s0 +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, 3 +; DAGISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, 4 +; DAGISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo +; DAGISEL-GFX10-NEXT: ;;#ASMSTART +; DAGISEL-GFX10-NEXT: s_nop +; DAGISEL-GFX10-NEXT: ;;#ASMEND +; DAGISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], s32 ; 4-byte Folded Reload +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v2, v1 +; DAGISEL-GFX10-NEXT: s_mov_b32 s5, chain_callee@abs32@hi +; DAGISEL-GFX10-NEXT: s_mov_b32 s4, chain_callee@abs32@lo +; DAGISEL-GFX10-NEXT: s_mov_b32 s0, s3 +; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v8, v2 +; DAGISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-GFX10-NEXT: s_setpc_b64 s[4:5] + %i = call i32 @llvm.amdgcn.set.inactive(i32 3, i32 4) + call void asm "s_nop", "~{v0},~{v8},~{v16},~{s0}"() + %w = call i32 @llvm.amdgcn.wwm(i32 %i) + %c = insertelement <3 x i32> %b, i32 %w, i32 0 + call void(ptr, i32, <3 x i32>, <3 x i32>, i32, ...) @llvm.amdgcn.cs.chain.v3i32(ptr @chain_callee, i32 -1, <3 x i32> inreg %a, <3 x i32> %c, i32 0) + unreachable +} + define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_use_all_v0_v7(<3 x i32> inreg %a, <3 x i32> %b) { ; GISEL-GFX11-LABEL: chain_preserve_to_chain_use_all_v0_v7: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: s_clause 0x1 +; GISEL-GFX11-NEXT: scratch_store_b32 off, v11, s32 offset:4 +; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, s32 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v11, v8 ; GISEL-GFX11-NEXT: s_mov_b32 s3, s0 ; GISEL-GFX11-NEXT: ;;#ASMSTART @@ -322,6 +434,9 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_use_all_v0_v7(<3 x ; GISEL-GFX11-NEXT: s_mov_b32 s4, chain_callee@abs32@lo ; GISEL-GFX11-NEXT: s_mov_b32 s5, chain_callee@abs32@hi ; GISEL-GFX11-NEXT: v_mov_b32_e32 v8, v11 +; GISEL-GFX11-NEXT: s_clause 0x1 +; GISEL-GFX11-NEXT: scratch_load_b32 v16, off, s32 +; GISEL-GFX11-NEXT: scratch_load_b32 v11, off, s32 offset:4 ; GISEL-GFX11-NEXT: s_mov_b32 s0, s3 ; GISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1 ; GISEL-GFX11-NEXT: s_setpc_b64 s[4:5] @@ -329,6 +444,8 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_use_all_v0_v7(<3 x ; GISEL-GFX10-LABEL: chain_preserve_to_chain_use_all_v0_v7: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX10-NEXT: buffer_store_dword v11, off, s[48:51], s32 offset:4 ; 4-byte Folded Spill +; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], s32 ; 4-byte Folded Spill ; GISEL-GFX10-NEXT: v_mov_b32_e32 v11, v8 ; GISEL-GFX10-NEXT: s_mov_b32 s3, s0 ; GISEL-GFX10-NEXT: ;;#ASMSTART @@ -337,6 +454,9 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_use_all_v0_v7(<3 x ; GISEL-GFX10-NEXT: s_mov_b32 s4, chain_callee@abs32@lo ; GISEL-GFX10-NEXT: s_mov_b32 s5, chain_callee@abs32@hi ; GISEL-GFX10-NEXT: v_mov_b32_e32 v8, v11 +; GISEL-GFX10-NEXT: s_clause 0x1 +; GISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], s32 +; GISEL-GFX10-NEXT: buffer_load_dword v11, off, s[48:51], s32 offset:4 ; GISEL-GFX10-NEXT: s_mov_b32 s0, s3 ; GISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 ; GISEL-GFX10-NEXT: s_setpc_b64 s[4:5] @@ -344,6 +464,9 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_use_all_v0_v7(<3 x ; DAGISEL-GFX11-LABEL: chain_preserve_to_chain_use_all_v0_v7: ; DAGISEL-GFX11: ; %bb.0: ; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL-GFX11-NEXT: s_clause 0x1 +; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v11, s32 offset:4 +; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, s32 ; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v11, v8 ; DAGISEL-GFX11-NEXT: s_mov_b32 s3, s0 ; DAGISEL-GFX11-NEXT: ;;#ASMSTART @@ -352,6 +475,9 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_use_all_v0_v7(<3 x ; DAGISEL-GFX11-NEXT: s_mov_b32 s5, chain_callee@abs32@hi ; DAGISEL-GFX11-NEXT: s_mov_b32 s4, chain_callee@abs32@lo ; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v8, v11 +; DAGISEL-GFX11-NEXT: s_clause 0x1 +; DAGISEL-GFX11-NEXT: scratch_load_b32 v16, off, s32 +; DAGISEL-GFX11-NEXT: scratch_load_b32 v11, off, s32 offset:4 ; DAGISEL-GFX11-NEXT: s_mov_b32 s0, s3 ; DAGISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1 ; DAGISEL-GFX11-NEXT: s_setpc_b64 s[4:5] @@ -359,6 +485,8 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_use_all_v0_v7(<3 x ; DAGISEL-GFX10-LABEL: chain_preserve_to_chain_use_all_v0_v7: ; DAGISEL-GFX10: ; %bb.0: ; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL-GFX10-NEXT: buffer_store_dword v11, off, s[48:51], s32 offset:4 ; 4-byte Folded Spill +; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], s32 ; 4-byte Folded Spill ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v11, v8 ; DAGISEL-GFX10-NEXT: s_mov_b32 s3, s0 ; DAGISEL-GFX10-NEXT: ;;#ASMSTART @@ -367,6 +495,9 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_use_all_v0_v7(<3 x ; DAGISEL-GFX10-NEXT: s_mov_b32 s5, chain_callee@abs32@hi ; DAGISEL-GFX10-NEXT: s_mov_b32 s4, chain_callee@abs32@lo ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v8, v11 +; DAGISEL-GFX10-NEXT: s_clause 0x1 +; DAGISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], s32 +; DAGISEL-GFX10-NEXT: buffer_load_dword v11, off, s[48:51], s32 offset:4 ; DAGISEL-GFX10-NEXT: s_mov_b32 s0, s3 ; DAGISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 ; DAGISEL-GFX10-NEXT: s_setpc_b64 s[4:5] @@ -379,11 +510,13 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve_fewer_arg ; GISEL-GFX11-LABEL: chain_preserve_to_chain_preserve_fewer_args: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, v8 +; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, s32 ; 4-byte Folded Spill ; GISEL-GFX11-NEXT: s_mov_b32 s2, s0 +; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, v8 ; GISEL-GFX11-NEXT: ;;#ASMSTART ; GISEL-GFX11-NEXT: s_nop ; GISEL-GFX11-NEXT: ;;#ASMEND +; GISEL-GFX11-NEXT: scratch_load_b32 v16, off, s32 ; 4-byte Folded Reload ; GISEL-GFX11-NEXT: s_mov_b32 s4, chain_preserve_callee_2@abs32@lo ; GISEL-GFX11-NEXT: s_mov_b32 s5, chain_preserve_callee_2@abs32@hi ; GISEL-GFX11-NEXT: v_mov_b32_e32 v8, v1 @@ -394,11 +527,13 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve_fewer_arg ; GISEL-GFX10-LABEL: chain_preserve_to_chain_preserve_fewer_args: ; GISEL-GFX10: ; %bb.0: ; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, v8 +; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], s32 ; 4-byte Folded Spill ; GISEL-GFX10-NEXT: s_mov_b32 s2, s0 +; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, v8 ; GISEL-GFX10-NEXT: ;;#ASMSTART ; GISEL-GFX10-NEXT: s_nop ; GISEL-GFX10-NEXT: ;;#ASMEND +; GISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], s32 ; 4-byte Folded Reload ; GISEL-GFX10-NEXT: s_mov_b32 s4, chain_preserve_callee_2@abs32@lo ; GISEL-GFX10-NEXT: s_mov_b32 s5, chain_preserve_callee_2@abs32@hi ; GISEL-GFX10-NEXT: v_mov_b32_e32 v8, v1 @@ -409,14 +544,16 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve_fewer_arg ; DAGISEL-GFX11-LABEL: chain_preserve_to_chain_preserve_fewer_args: ; DAGISEL-GFX11: ; %bb.0: ; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, s32 ; 4-byte Folded Spill ; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, v8 ; DAGISEL-GFX11-NEXT: s_mov_b32 s2, s0 ; DAGISEL-GFX11-NEXT: ;;#ASMSTART ; DAGISEL-GFX11-NEXT: s_nop ; DAGISEL-GFX11-NEXT: ;;#ASMEND +; DAGISEL-GFX11-NEXT: scratch_load_b32 v16, off, s32 ; 4-byte Folded Reload ; DAGISEL-GFX11-NEXT: s_mov_b32 s5, chain_preserve_callee_2@abs32@hi -; DAGISEL-GFX11-NEXT: s_mov_b32 s4, chain_preserve_callee_2@abs32@lo ; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v8, v1 +; DAGISEL-GFX11-NEXT: s_mov_b32 s4, chain_preserve_callee_2@abs32@lo ; DAGISEL-GFX11-NEXT: s_mov_b32 s0, s2 ; DAGISEL-GFX11-NEXT: s_mov_b32 exec_lo, -1 ; DAGISEL-GFX11-NEXT: s_setpc_b64 s[4:5] @@ -424,14 +561,16 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve_fewer_arg ; DAGISEL-GFX10-LABEL: chain_preserve_to_chain_preserve_fewer_args: ; DAGISEL-GFX10: ; %bb.0: ; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], s32 ; 4-byte Folded Spill ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, v8 ; DAGISEL-GFX10-NEXT: s_mov_b32 s2, s0 ; DAGISEL-GFX10-NEXT: ;;#ASMSTART ; DAGISEL-GFX10-NEXT: s_nop ; DAGISEL-GFX10-NEXT: ;;#ASMEND +; DAGISEL-GFX10-NEXT: buffer_load_dword v16, off, s[48:51], s32 ; 4-byte Folded Reload ; DAGISEL-GFX10-NEXT: s_mov_b32 s5, chain_preserve_callee_2@abs32@hi -; DAGISEL-GFX10-NEXT: s_mov_b32 s4, chain_preserve_callee_2@abs32@lo ; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v8, v1 +; DAGISEL-GFX10-NEXT: s_mov_b32 s4, chain_preserve_callee_2@abs32@lo ; DAGISEL-GFX10-NEXT: s_mov_b32 s0, s2 ; DAGISEL-GFX10-NEXT: s_mov_b32 exec_lo, -1 ; DAGISEL-GFX10-NEXT: s_setpc_b64 s[4:5] @@ -451,3 +590,6 @@ declare amdgpu_cs_chain void @chain_callee(<3 x i32> inreg, <3 x i32>) declare void @llvm.amdgcn.cs.chain.v2i32(ptr, i32, <2 x i32>, <2 x i32>, i32, ...) declare amdgpu_cs_chain_preserve void @chain_preserve_callee_2(<2 x i32> inreg, <2 x i32>) + +declare i32 @llvm.amdgcn.set.inactive(i32, i32) +declare i32 @llvm.amdgcn.wwm(i32) diff --git a/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain-preserve.mir b/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain-preserve.mir new file mode 100644 index 0000000000000..dc57165af9935 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain-preserve.mir @@ -0,0 +1,270 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 +# RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass=prologepilog -o - %s | FileCheck -check-prefix=GCN %s + +--- | + declare amdgpu_cs_chain void @callee() + declare amdgpu_gfx void @gfx_callee() + + define amdgpu_cs_chain_preserve void @preserve_active_lanes_above_args() {ret void} + define amdgpu_cs_chain_preserve void @preserve_all_lanes_wwm_above_args() {ret void} + define amdgpu_cs_chain_preserve void @dont_preserve_args() {ret void} + define amdgpu_cs_chain_preserve void @preserve_inactive_lanes_wwm_args() {ret void} + define amdgpu_cs_chain_preserve void @dont_preserve_if_no_chain_calls() {ret void} + define amdgpu_cs_chain_preserve void @dont_preserve_v0_v7() {ret void} + define amdgpu_cs_chain_preserve void @dont_preserve_sgpr() {ret void} +... +--- + +# NOTE: Since we don't know what the args are, we rely on the fact that we can't +# call llvm.amdgcn.cs.chain with more parameters than we received - so anything +# that is used by the SI_CS_CHAIN_TC_W32 is assumed to have been an arg and therefore +# not preserved. + +--- +name: preserve_active_lanes_above_args +tracksRegLiveness: true +frameInfo: + hasTailCall: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' + isChainFunction: true + returnsVoid: true +body: | + bb.0: + liveins: $sgpr0, $vgpr8, $vgpr9 + + ; GCN-LABEL: name: preserve_active_lanes_above_args + ; GCN: liveins: $sgpr0, $vgpr8, $vgpr9, $vgpr10 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr10, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + ; GCN-NEXT: renamable $vgpr10 = V_MOV_B32_e32 10, implicit $exec + ; GCN-NEXT: $vgpr8 = COPY killed renamable $vgpr10 + ; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + ; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + ; GCN-NEXT: $vgpr10 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9 + renamable $vgpr10 = V_MOV_B32_e32 10, implicit $exec + $vgpr8 = COPY renamable killed $vgpr10 + renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9 + +... + +--- +name: preserve_all_lanes_wwm_above_args +tracksRegLiveness: true +frameInfo: + hasTailCall: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' + isChainFunction: true + returnsVoid: true + wwmReservedRegs: + - '$vgpr11' +body: | + bb.0: + liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9, $vgpr10 + + ; GCN-LABEL: name: preserve_all_lanes_wwm_above_args + ; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9, $vgpr10 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr1 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr10, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr11, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) + ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1 + ; GCN-NEXT: renamable $vgpr10 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr10 + ; GCN-NEXT: $sgpr35 = S_MOV_B32 5 + ; GCN-NEXT: $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr10, 0 + ; GCN-NEXT: renamable $vgpr10 = V_MOV_B32_e32 10, implicit $exec + ; GCN-NEXT: $vgpr8 = COPY killed renamable $vgpr10 + ; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + ; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + ; GCN-NEXT: $sgpr1 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: $vgpr10 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: $vgpr11 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5) + ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1 + ; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9 + renamable $vgpr10 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr10 + $sgpr35 = S_MOV_B32 5 + $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr10, 0 + renamable $vgpr10 = V_MOV_B32_e32 10, implicit $exec + $vgpr8 = COPY renamable killed $vgpr10 + renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9 + +... + +--- +name: dont_preserve_args +tracksRegLiveness: true +frameInfo: + hasTailCall: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' + isChainFunction: true + returnsVoid: true +body: | + bb.0: + liveins: $sgpr0, $vgpr8, $vgpr9 + + ; GCN-LABEL: name: dont_preserve_args + ; GCN: liveins: $sgpr0, $vgpr8, $vgpr9 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: renamable $vgpr8 = V_MOV_B32_e32 10, implicit $exec + ; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + ; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + ; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9 + renamable $vgpr8 = V_MOV_B32_e32 10, implicit $exec + renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9 + +... + +--- +name: preserve_inactive_lanes_wwm_args +tracksRegLiveness: true +frameInfo: + hasTailCall: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' + isChainFunction: true + returnsVoid: true + wwmReservedRegs: + - '$vgpr9' +body: | + bb.0: + liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9, $vgpr10 + + ; GCN-LABEL: name: preserve_inactive_lanes_wwm_args + ; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9, $vgpr10 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr1 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr8, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr9, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) + ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1 + ; GCN-NEXT: renamable $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8 + ; GCN-NEXT: $sgpr35 = S_MOV_B32 5 + ; GCN-NEXT: $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr8, 0 + ; GCN-NEXT: renamable $vgpr8 = V_MOV_B32_e32 10, implicit $exec + ; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + ; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + ; GCN-NEXT: $sgpr1 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: $vgpr8 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr8(tied-def 0) :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: $vgpr9 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr, implicit $vgpr9(tied-def 0) :: (load (s32) from %stack.1, addrspace 5) + ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1 + ; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9 + renamable $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8 + $sgpr35 = S_MOV_B32 5 + $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr8, 0 + renamable $vgpr8 = V_MOV_B32_e32 10, implicit $exec + renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9 + +... + +--- +name: dont_preserve_if_no_chain_calls +tracksRegLiveness: true +frameInfo: + hasTailCall: false +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' + isChainFunction: true + returnsVoid: true + wwmReservedRegs: + - '$vgpr9' +body: | + bb.0: + liveins: $sgpr0, $sgpr35, $vgpr0, $vgpr8, $vgpr9 + + ; GCN-LABEL: name: dont_preserve_if_no_chain_calls + ; GCN: liveins: $sgpr0, $sgpr35, $vgpr0, $vgpr8, $vgpr9 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: renamable $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8 + ; GCN-NEXT: $sgpr35 = S_MOV_B32 5 + ; GCN-NEXT: $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr8, 0 + ; GCN-NEXT: renamable $vgpr8 = V_MOV_B32_e32 10, implicit $exec + ; GCN-NEXT: $vgpr9 = V_MOV_B32_e32 20, implicit $exec + ; GCN-NEXT: $vgpr10 = V_MOV_B32_e32 30, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + renamable $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8 + $sgpr35 = S_MOV_B32 5 + $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr8, 0 + renamable $vgpr8 = V_MOV_B32_e32 10, implicit $exec + $vgpr9 = V_MOV_B32_e32 20, implicit $exec + $vgpr10 = V_MOV_B32_e32 30, implicit $exec + S_ENDPGM 0 +... + +--- +name: dont_preserve_v0_v7 +tracksRegLiveness: true +frameInfo: + hasTailCall: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' + isChainFunction: true + returnsVoid: true + wwmReservedRegs: + - '$vgpr1' +body: | + bb.0: + liveins: $sgpr0, $sgpr35, $vgpr0, $vgpr8 + + ; GCN-LABEL: name: dont_preserve_v0_v7 + ; GCN: liveins: $sgpr0, $sgpr35, $vgpr0, $vgpr8 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: renamable $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr0 + ; GCN-NEXT: $sgpr35 = S_MOV_B32 5 + ; GCN-NEXT: $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0 + ; GCN-NEXT: renamable $vgpr0 = V_MOV_B32_e32 10, implicit $exec + ; GCN-NEXT: renamable $vgpr7 = V_MOV_B32_e32 16, implicit $exec + ; GCN-NEXT: renamable $vgpr8 = COPY killed renamable $vgpr0 + ; GCN-NEXT: renamable $vgpr9 = COPY killed renamable $vgpr7 + ; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + ; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + ; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9 + renamable $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr0 + $sgpr35 = S_MOV_B32 5 + $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0 + renamable $vgpr0 = V_MOV_B32_e32 10, implicit $exec + renamable $vgpr7 = V_MOV_B32_e32 16, implicit $exec + renamable $vgpr8 = COPY killed renamable $vgpr0 + renamable $vgpr9 = COPY killed renamable $vgpr7 + renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9 +... + +--- +name: dont_preserve_sgpr +tracksRegLiveness: true +frameInfo: + hasTailCall: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' + isChainFunction: true + returnsVoid: true +body: | + bb.0: + liveins: $sgpr0 + + ; GCN-LABEL: name: dont_preserve_sgpr + ; GCN: liveins: $sgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: renamable $sgpr1 = S_ADD_I32 killed renamable $sgpr0, renamable $sgpr0, implicit-def dead $scc + ; GCN-NEXT: $sgpr0 = COPY killed renamable $sgpr1 + ; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + ; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + ; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0 + renamable $sgpr1 = S_ADD_I32 killed renamable $sgpr0, renamable $sgpr0, implicit-def dead $scc + $sgpr0 = COPY killed renamable $sgpr1 + renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0 + +... diff --git a/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain.mir b/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain.mir new file mode 100644 index 0000000000000..354f65d3235c7 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain.mir @@ -0,0 +1,232 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 +# RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass=prologepilog -o - %s | FileCheck -check-prefix=GCN %s + +# We're keeping the IR around for the callees and the CCs + +--- | + declare amdgpu_cs_chain void @callee() + declare amdgpu_gfx void @gfx_callee() + + define amdgpu_cs_chain void @preserve_inactive_wwm() {ret void} + define amdgpu_cs_chain void @preserve_inactive_detected_wwm() {ret void} + define amdgpu_cs_chain void @dont_preserve_wwm_if_no_chain_calls() {ret void} + define amdgpu_cs_chain void @dont_preserve_non_wwm() {ret void} + define amdgpu_cs_chain void @dont_preserve_v0_v7() {ret void} + define amdgpu_cs_chain void @dont_preserve_sgpr() {ret void} +... +--- + +# Check that we preserve the inactive lanes of registers v8+ received in the +# MachineFunctionInfo as wwmReservedRegs. + +--- +name: preserve_inactive_wwm +tracksRegLiveness: true +frameInfo: + hasTailCall: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' + returnsVoid: true + wwmReservedRegs: + - '$vgpr8' + - '$vgpr9' +body: | + bb.0: + liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9 + + ; GCN-LABEL: name: preserve_inactive_wwm + ; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr8, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr9, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) + ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1 + ; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + ; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + ; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: $vgpr8 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr8(tied-def 0) :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: $vgpr9 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5) + ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1 + ; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8 + renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8 + +... + +# Check that it also works for SGPR to VGPR spills. + +--- +name: preserve_inactive_detected_wwm +tracksRegLiveness: true +frameInfo: + hasTailCall: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' + returnsVoid: true +body: | + bb.0: + liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9 + + ; GCN-LABEL: name: preserve_inactive_detected_wwm + ; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr8, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr9, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) + ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1 + ; GCN-NEXT: renamable $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8 + ; GCN-NEXT: $sgpr35 = S_MOV_B32 5 + ; GCN-NEXT: $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr8, 0 + ; GCN-NEXT: renamable $vgpr8 = V_MOV_B32_e32 10, implicit $exec + ; GCN-NEXT: renamable $vgpr9 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr9 + ; GCN-NEXT: $sgpr35 = S_MOV_B32 5 + ; GCN-NEXT: $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr9, 0 + ; GCN-NEXT: renamable $vgpr9 = V_MOV_B32_e32 10, implicit $exec + ; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + ; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + ; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: $vgpr8 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr8(tied-def 0) :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: $vgpr9 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5) + ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1 + ; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8 + renamable $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8 + $sgpr35 = S_MOV_B32 5 + $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr8, 0 + renamable $vgpr8 = V_MOV_B32_e32 10, implicit $exec + renamable $vgpr9 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr9 + $sgpr35 = S_MOV_B32 5 + $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr9, 0 + renamable $vgpr9 = V_MOV_B32_e32 10, implicit $exec + renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8 + +... + +--- +name: dont_preserve_wwm_if_no_chain_calls +tracksRegLiveness: true +frameInfo: + hasTailCall: false +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' + returnsVoid: true + wwmReservedRegs: + - '$vgpr9' +body: | + bb.0: + liveins: $sgpr35, $vgpr8 + + ; GCN-LABEL: name: dont_preserve_wwm_if_no_chain_calls + ; GCN: liveins: $sgpr35, $vgpr8 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: renamable $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8 + ; GCN-NEXT: $sgpr35 = S_MOV_B32 5 + ; GCN-NEXT: $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr8, 0 + ; GCN-NEXT: renamable $vgpr8 = V_MOV_B32_e32 10, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + renamable $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8 + $sgpr35 = S_MOV_B32 5 + $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr8, 0 + renamable $vgpr8 = V_MOV_B32_e32 10, implicit $exec + S_ENDPGM 0 +... + +--- +name: dont_preserve_non_wwm +tracksRegLiveness: true +frameInfo: + hasTailCall: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' + isChainFunction: true + returnsVoid: true +body: | + bb.0: + liveins: $sgpr0, $sgpr35, $vgpr0, $vgpr8, $vgpr16 + + ; GCN-LABEL: name: dont_preserve_non_wwm + ; GCN: liveins: $sgpr0, $sgpr35, $vgpr0, $vgpr8, $vgpr16 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: renamable $vgpr16 = V_MOV_B32_e32 16, implicit $exec + ; GCN-NEXT: renamable $vgpr8 = V_MOV_B32_e32 8, implicit $exec + ; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + ; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + ; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8 + renamable $vgpr16 = V_MOV_B32_e32 16, implicit $exec + renamable $vgpr8 = V_MOV_B32_e32 8, implicit $exec + renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8 + +... + +--- +name: dont_preserve_v0_v7 +tracksRegLiveness: true +frameInfo: + hasTailCall: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' + isChainFunction: true + returnsVoid: true + wwmReservedRegs: + - '$vgpr1' +body: | + bb.0: + liveins: $sgpr0, $sgpr35, $vgpr0, $vgpr7, $vgpr8, $vgpr9 + + ; GCN-LABEL: name: dont_preserve_v0_v7 + ; GCN: liveins: $sgpr0, $sgpr35, $vgpr0, $vgpr7, $vgpr8, $vgpr9 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: renamable $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr0 + ; GCN-NEXT: $sgpr35 = S_MOV_B32 5 + ; GCN-NEXT: $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0 + ; GCN-NEXT: renamable $vgpr0 = V_MOV_B32_e32 10, implicit $exec + ; GCN-NEXT: renamable $vgpr7 = V_MOV_B32_e32 16, implicit $exec + ; GCN-NEXT: renamable $vgpr8 = COPY killed renamable $vgpr0 + ; GCN-NEXT: renamable $vgpr9 = COPY killed renamable $vgpr7 + ; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + ; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + ; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9 + renamable $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr0 + $sgpr35 = S_MOV_B32 5 + $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0 + renamable $vgpr0 = V_MOV_B32_e32 10, implicit $exec + renamable $vgpr7 = V_MOV_B32_e32 16, implicit $exec + renamable $vgpr8 = COPY killed renamable $vgpr0 + renamable $vgpr9 = COPY killed renamable $vgpr7 + renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9 + +... + +--- +name: dont_preserve_sgpr +tracksRegLiveness: true +frameInfo: + hasTailCall: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' + returnsVoid: true +body: | + bb.0 (%ir-block.0): + liveins: $sgpr0 + + ; GCN-LABEL: name: dont_preserve_sgpr + ; GCN: liveins: $sgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: renamable $sgpr1 = S_ADD_I32 killed renamable $sgpr0, renamable $sgpr0, implicit-def dead $scc + ; GCN-NEXT: $sgpr0 = COPY killed renamable $sgpr1 + ; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + ; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + ; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0 + renamable $sgpr1 = S_ADD_I32 killed renamable $sgpr0, renamable $sgpr0, implicit-def dead $scc + $sgpr0 = COPY killed renamable $sgpr1 + renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc + renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) + SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0 + +...