From 5fefdddc8f4f1cc0167e1034ae3750f574825496 Mon Sep 17 00:00:00 2001
From: WANG Rui
Date: Mon, 17 Nov 2025 17:55:22 +0800
Subject: [PATCH] [LoongArch] Enable tail calls for sret functions and relax
 argument matching

Allow tail-calling functions that return via sret when the caller has an
incoming sret pointer that can be forwarded.

Remove the overly strict requirement that tail-call argument values must
exactly match the caller's incoming arguments; the real constraint is only
that the callee uses no more argument stack space than the caller.

This fixes musttail codegen and enables significantly more tail-call
optimizations.
---
 .../LoongArch/LoongArchISelLowering.cpp       |  75 +++-
 .../Target/LoongArch/LoongArchISelLowering.h  |   6 +
 .../LoongArch/LoongArchMachineFunctionInfo.h  |   7 +
 llvm/test/CodeGen/LoongArch/musttail.ll       | 397 ++++++++++++++++++
 llvm/test/CodeGen/LoongArch/tail-calls.ll     |  13 +-
 5 files changed, 479 insertions(+), 19 deletions(-)
 create mode 100644 llvm/test/CodeGen/LoongArch/musttail.ll

diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index cf4ffc82f6009..2a55558e00e78 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -8069,6 +8069,7 @@ SDValue LoongArchTargetLowering::LowerFormalArguments(
     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
   MachineFunction &MF = DAG.getMachineFunction();
+  auto *LoongArchFI = MF.getInfo<LoongArchMachineFunctionInfo>();
 
   switch (CallConv) {
   default:
@@ -8140,7 +8141,6 @@ SDValue LoongArchTargetLowering::LowerFormalArguments(
     const TargetRegisterClass *RC = &LoongArch::GPRRegClass;
     MachineFrameInfo &MFI = MF.getFrameInfo();
     MachineRegisterInfo &RegInfo = MF.getRegInfo();
-    auto *LoongArchFI = MF.getInfo<LoongArchMachineFunctionInfo>();
 
     // Offset of the first variable argument from stack pointer, and size of
     // the vararg save area. For now, the varargs save area is either zero or
@@ -8190,6 +8190,8 @@ SDValue LoongArchTargetLowering::LowerFormalArguments(
     LoongArchFI->setVarArgsSaveSize(VarArgsSaveSize);
   }
 
+  LoongArchFI->setArgumentStackSize(CCInfo.getStackSize());
+
   // All stores are grouped in one node to allow the matching between
   // the size of Ins and InVals. This only happens for vararg functions.
   if (!OutChains.empty()) {
@@ -8246,9 +8248,11 @@ bool LoongArchTargetLowering::isEligibleForTailCallOptimization(
   auto &Outs = CLI.Outs;
   auto &Caller = MF.getFunction();
   auto CallerCC = Caller.getCallingConv();
+  auto *LoongArchFI = MF.getInfo<LoongArchMachineFunctionInfo>();
 
-  // Do not tail call opt if the stack is used to pass parameters.
-  if (CCInfo.getStackSize() != 0)
+  // If the stack arguments for this call do not fit into the caller's own
+  // incoming argument save area, the call cannot be made a tail call.
+  if (CCInfo.getStackSize() > LoongArchFI->getArgumentStackSize())
     return false;
 
   // Do not tail call opt if any parameters need to be passed indirectly.
@@ -8260,7 +8264,7 @@ bool LoongArchTargetLowering::isEligibleForTailCallOptimization(
   // semantics.
   auto IsCallerStructRet = Caller.hasStructRetAttr();
   auto IsCalleeStructRet = Outs.empty() ? false : Outs[0].Flags.isSRet();
-  if (IsCallerStructRet || IsCalleeStructRet)
+  if (IsCallerStructRet != IsCalleeStructRet)
     return false;
 
   // Do not tail call opt if either the callee or caller has a byval argument.
@@ -8276,9 +8280,47 @@ bool LoongArchTargetLowering::isEligibleForTailCallOptimization(
     if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
       return false;
   }
+
+  // Outgoing arguments passed in callee-saved registers must match the
+  // caller's incoming values in those registers.
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  const SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
+  if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
+    return false;
+
   return true;
 }
 
+SDValue LoongArchTargetLowering::addTokenForArgument(SDValue Chain,
+                                                     SelectionDAG &DAG,
+                                                     MachineFrameInfo &MFI,
+                                                     int ClobberedFI) const {
+  SmallVector<SDValue, 8> ArgChains;
+  int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
+  int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
+
+  // Include the original chain at the beginning of the list. When this is
+  // used by target LowerCall hooks, this helps legalize find the
+  // CALLSEQ_BEGIN node.
+  ArgChains.push_back(Chain);
+
+  // Add a chain value for each incoming stack-argument load that overlaps
+  // the clobbered object.
+  for (SDNode *U : DAG.getEntryNode().getNode()->users())
+    if (LoadSDNode *L = dyn_cast<LoadSDNode>(U))
+      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
+        if (FI->getIndex() < 0) {
+          int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
+          int64_t InLastByte = InFirstByte;
+          InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
+
+          if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
+              (FirstByte <= InFirstByte && InFirstByte <= LastByte))
+            ArgChains.push_back(SDValue(L, 1));
+        }
+
+  // Build a tokenfactor for all the chains.
+  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
+}
+
 static Align getPrefTypeAlign(EVT VT, SelectionDAG &DAG) {
   return DAG.getDataLayout().getPrefTypeAlign(
       VT.getTypeForEVT(*DAG.getContext()));
@@ -8454,19 +8496,32 @@ LoongArchTargetLowering::LowerCall(CallLoweringInfo &CLI,
       RegsToPass.push_back(std::make_pair(VA.getLocReg(), ArgValue));
     } else {
       assert(VA.isMemLoc() && "Argument not register or memory");
-      assert(!IsTailCall && "Tail call not allowed if stack is used "
-                            "for passing parameters");
+      SDValue DstAddr;
+      MachinePointerInfo DstInfo;
+      int32_t Offset = VA.getLocMemOffset();
 
       // Work out the address of the stack slot.
       if (!StackPtr.getNode())
         StackPtr = DAG.getCopyFromReg(Chain, DL, LoongArch::R3, PtrVT);
-      SDValue Address =
-          DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr,
-                      DAG.getIntPtrConstant(VA.getLocMemOffset(), DL));
+
+      if (IsTailCall) {
+        unsigned OpSize = (VA.getValVT().getSizeInBits() + 7) / 8;
+        int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
+        DstAddr = DAG.getFrameIndex(FI, PtrVT);
+        DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
+        // Make sure any stack arguments overlapping with where we're storing
+        // are loaded before this eventual operation. Otherwise they'll be
+        // clobbered.
+        Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
+      } else {
+        SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
+        DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
+        DstInfo = MachinePointerInfo::getStack(MF, Offset);
+      }
 
       // Emit the store.
       MemOpChains.push_back(
-          DAG.getStore(Chain, DL, ArgValue, Address, MachinePointerInfo()));
+          DAG.getStore(Chain, DL, ArgValue, DstAddr, DstInfo));
     }
   }
 
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
index 8a4d7748467c7..e95f70f06cc7b 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
@@ -438,6 +438,12 @@ class LoongArchTargetLowering : public TargetLowering {
       CCState &CCInfo, CallLoweringInfo &CLI, MachineFunction &MF,
       const SmallVectorImpl<CCValAssign> &ArgLocs) const;
 
+  /// Finds the incoming stack arguments which overlap the given fixed stack
+  /// object and incorporates their load into the current chain. This prevents
+  /// an upcoming store from clobbering the stack argument before it's used.
+  SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG,
+                              MachineFrameInfo &MFI, int ClobberedFI) const;
+
   bool softPromoteHalfType() const override { return true; }
 
   bool
diff --git a/llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h b/llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h
index 904985c189dba..cf0837cbf09c7 100644
--- a/llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h
+++ b/llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h
@@ -32,6 +32,10 @@ class LoongArchMachineFunctionInfo : public MachineFunctionInfo {
   /// Size of stack frame to save callee saved registers
   unsigned CalleeSavedStackSize = 0;
 
+  /// ArgumentStackSize - number of bytes of stack consumed by the arguments
+  /// being passed on the stack
+  unsigned ArgumentStackSize = 0;
+
   /// FrameIndex of the spill slot when there is no scavenged register in
   /// insertIndirectBranch.
   int BranchRelaxationSpillFrameIndex = -1;
@@ -63,6 +67,9 @@ class LoongArchMachineFunctionInfo : public MachineFunctionInfo {
   unsigned getCalleeSavedStackSize() const { return CalleeSavedStackSize; }
   void setCalleeSavedStackSize(unsigned Size) { CalleeSavedStackSize = Size; }
 
+  unsigned getArgumentStackSize() const { return ArgumentStackSize; }
+  void setArgumentStackSize(unsigned Size) { ArgumentStackSize = Size; }
+
   int getBranchRelaxationSpillFrameIndex() {
     return BranchRelaxationSpillFrameIndex;
   }
diff --git a/llvm/test/CodeGen/LoongArch/musttail.ll b/llvm/test/CodeGen/LoongArch/musttail.ll
new file mode 100644
index 0000000000000..cf436e0505ad4
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/musttail.ll
@@ -0,0 +1,397 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=loongarch32 %s -o - | FileCheck %s --check-prefix=LA32
+; RUN: llc -mtriple=loongarch64 %s -o - | FileCheck %s --check-prefix=LA64
+
+declare i32 @many_args_callee(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9)
+
+define i32 @many_args_tail(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9) {
+; LA32-LABEL: many_args_tail:
+; LA32:       # %bb.0:
+; LA32-NEXT:    ori $a0, $zero, 9
+; LA32-NEXT:    st.w $a0, $sp, 4
+; LA32-NEXT:    ori $a0, $zero, 8
+; LA32-NEXT:    ori $a1, $zero, 1
+; LA32-NEXT:    ori $a2, $zero, 2
+; LA32-NEXT:    ori $a3, $zero, 3
+; LA32-NEXT:    ori $a4, $zero, 4
+; LA32-NEXT:    ori $a5, $zero, 5
+; LA32-NEXT:    ori $a6, $zero, 6
+; LA32-NEXT:    ori $a7, $zero, 7
+; LA32-NEXT:    st.w $a0, $sp, 0
+; LA32-NEXT:    move $a0, $zero
+; LA32-NEXT:    b many_args_callee
+;
+; LA64-LABEL: many_args_tail:
+; LA64:       # %bb.0:
+; LA64-NEXT:    ori $a0, $zero, 9
+; LA64-NEXT:    st.d $a0, $sp, 8
+; LA64-NEXT:    ori $a0, $zero, 8
+; LA64-NEXT:    ori $a1, $zero, 1
+; LA64-NEXT:    ori $a2, $zero, 2
+; LA64-NEXT:    ori $a3, $zero, 3
+; LA64-NEXT:    ori $a4, $zero, 4
+; LA64-NEXT:    ori $a5, $zero, 5
+; LA64-NEXT:    ori $a6, $zero, 6
+; LA64-NEXT:    ori $a7, $zero, 7
+; LA64-NEXT:    st.d $a0, $sp, 0
+; LA64-NEXT:    move $a0, $zero
+; LA64-NEXT:    pcaddu18i $t8, %call36(many_args_callee)
+; LA64-NEXT:    jr $t8
+  %ret = tail call i32 @many_args_callee(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9)
+  ret i32 %ret
+}
+
+define i32 @many_args_musttail(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9) {
+; LA32-LABEL: many_args_musttail:
+; LA32:       # %bb.0:
+; LA32-NEXT:    ori $a0, $zero, 9
+; LA32-NEXT:    st.w $a0, $sp, 4
+; LA32-NEXT:    ori $a0, $zero, 8
+; LA32-NEXT:    ori $a1, $zero, 1
+; LA32-NEXT:    ori $a2, $zero, 2
+; LA32-NEXT:    ori $a3, $zero, 3
+; LA32-NEXT:    ori $a4, $zero, 4
+; LA32-NEXT:    ori $a5, $zero, 5
+; LA32-NEXT:    ori $a6, $zero, 6
+; LA32-NEXT:    ori $a7, $zero, 7
+; LA32-NEXT:    st.w $a0, $sp, 0
+; LA32-NEXT:    move $a0, $zero
+; LA32-NEXT:    b many_args_callee
+;
+; LA64-LABEL: many_args_musttail:
+; LA64:       # %bb.0:
+; LA64-NEXT:    ori $a0, $zero, 9
+; LA64-NEXT:    st.d $a0, $sp, 8
+; LA64-NEXT:    ori $a0, $zero, 8
+; LA64-NEXT:    ori $a1, $zero, 1
+; LA64-NEXT:    ori $a2, $zero, 2
+; LA64-NEXT:    ori $a3, $zero, 3
+; LA64-NEXT:    ori $a4, $zero, 4
+; LA64-NEXT:    ori $a5, $zero, 5
+; LA64-NEXT:    ori $a6, $zero, 6
+; LA64-NEXT:    ori $a7, $zero, 7
+; LA64-NEXT:    st.d $a0, $sp, 0
+; LA64-NEXT:    move $a0, $zero
+; LA64-NEXT:    pcaddu18i $t8, %call36(many_args_callee)
+; LA64-NEXT:    jr $t8
+  %ret = musttail call i32 @many_args_callee(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9)
+  ret i32 %ret
+}
+
+; This function has more arguments than its tail-callee. This isn't valid for
+; the musttail attribute, but can still be tail-called as a non-guaranteed
+; optimisation, because the outgoing arguments to @many_args_callee fit in the
+; stack space allocated by the caller of @more_args_tail.
+define i32 @more_args_tail(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9) {
+; LA32-LABEL: more_args_tail:
+; LA32:       # %bb.0:
+; LA32-NEXT:    ori $a0, $zero, 9
+; LA32-NEXT:    st.w $a0, $sp, 4
+; LA32-NEXT:    ori $a0, $zero, 8
+; LA32-NEXT:    ori $a1, $zero, 1
+; LA32-NEXT:    ori $a2, $zero, 2
+; LA32-NEXT:    ori $a3, $zero, 3
+; LA32-NEXT:    ori $a4, $zero, 4
+; LA32-NEXT:    ori $a5, $zero, 5
+; LA32-NEXT:    ori $a6, $zero, 6
+; LA32-NEXT:    ori $a7, $zero, 7
+; LA32-NEXT:    st.w $a0, $sp, 0
+; LA32-NEXT:    move $a0, $zero
+; LA32-NEXT:    b many_args_callee
+;
+; LA64-LABEL: more_args_tail:
+; LA64:       # %bb.0:
+; LA64-NEXT:    ori $a0, $zero, 9
+; LA64-NEXT:    st.d $a0, $sp, 8
+; LA64-NEXT:    ori $a0, $zero, 8
+; LA64-NEXT:    ori $a1, $zero, 1
+; LA64-NEXT:    ori $a2, $zero, 2
+; LA64-NEXT:    ori $a3, $zero, 3
+; LA64-NEXT:    ori $a4, $zero, 4
+; LA64-NEXT:    ori $a5, $zero, 5
+; LA64-NEXT:    ori $a6, $zero, 6
+; LA64-NEXT:    ori $a7, $zero, 7
+; LA64-NEXT:    st.d $a0, $sp, 0
+; LA64-NEXT:    move $a0, $zero
+; LA64-NEXT:    pcaddu18i $t8, %call36(many_args_callee)
+; LA64-NEXT:    jr $t8
+  %ret = tail call i32 @many_args_callee(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9)
+  ret i32 %ret
+}
+
+; Again, this isn't valid for musttail, but can be tail-called in practice
+; because the stack size is the same.
+define i32 @different_args_tail_32bit(i64 %0, i64 %1, i64 %2, i64 %3, i64 %4) {
+; LA32-LABEL: different_args_tail_32bit:
+; LA32:       # %bb.0:
+; LA32-NEXT:    ori $a0, $zero, 9
+; LA32-NEXT:    st.w $a0, $sp, 4
+; LA32-NEXT:    ori $a0, $zero, 8
+; LA32-NEXT:    ori $a1, $zero, 1
+; LA32-NEXT:    ori $a2, $zero, 2
+; LA32-NEXT:    ori $a3, $zero, 3
+; LA32-NEXT:    ori $a4, $zero, 4
+; LA32-NEXT:    ori $a5, $zero, 5
+; LA32-NEXT:    ori $a6, $zero, 6
+; LA32-NEXT:    ori $a7, $zero, 7
+; LA32-NEXT:    st.w $a0, $sp, 0
+; LA32-NEXT:    move $a0, $zero
+; LA32-NEXT:    b many_args_callee
+;
+; LA64-LABEL: different_args_tail_32bit:
+; LA64:       # %bb.0:
+; LA64-NEXT:    addi.d $sp, $sp, -32
+; LA64-NEXT:    .cfi_def_cfa_offset 32
+; LA64-NEXT:    st.d $ra, $sp, 24 # 8-byte Folded Spill
+; LA64-NEXT:    .cfi_offset 1, -8
+; LA64-NEXT:    ori $a0, $zero, 9
+; LA64-NEXT:    st.d $a0, $sp, 8
+; LA64-NEXT:    ori $a0, $zero, 8
+; LA64-NEXT:    ori $a1, $zero, 1
+; LA64-NEXT:    ori $a2, $zero, 2
+; LA64-NEXT:    ori $a3, $zero, 3
+; LA64-NEXT:    ori $a4, $zero, 4
+; LA64-NEXT:    ori $a5, $zero, 5
+; LA64-NEXT:    ori $a6, $zero, 6
+; LA64-NEXT:    ori $a7, $zero, 7
+; LA64-NEXT:    st.d $a0, $sp, 0
+; LA64-NEXT:    move $a0, $zero
+; LA64-NEXT:    pcaddu18i $ra, %call36(many_args_callee)
+; LA64-NEXT:    jirl $ra, $ra, 0
+; LA64-NEXT:    ld.d $ra, $sp, 24 # 8-byte Folded Reload
+; LA64-NEXT:    addi.d $sp, $sp, 32
+; LA64-NEXT:    ret
+  %ret = tail call i32 @many_args_callee(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9)
+  ret i32 %ret
+}
+
+define i32 @different_args_tail_64bit(i128 %0, i128 %1, i128 %2, i128 %3, i128 %4) {
+; LA32-LABEL: different_args_tail_64bit:
+; LA32:       # %bb.0:
+; LA32-NEXT:    addi.w $sp, $sp, -16
+; LA32-NEXT:    .cfi_def_cfa_offset 16
+; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32-NEXT:    .cfi_offset 1, -4
+; LA32-NEXT:    ori $a0, $zero, 9
+; LA32-NEXT:    st.w $a0, $sp, 4
+; LA32-NEXT:    ori $a0, $zero, 8
+; LA32-NEXT:    ori $a1, $zero, 1
+; LA32-NEXT:    ori $a2, $zero, 2
+; LA32-NEXT:    ori $a3, $zero, 3
+; LA32-NEXT:    ori $a4, $zero, 4
+; LA32-NEXT:    ori $a5, $zero, 5
+; LA32-NEXT:    ori $a6, $zero, 6
+; LA32-NEXT:    ori $a7, $zero, 7
+; LA32-NEXT:    st.w $a0, $sp, 0
+; LA32-NEXT:    move $a0, $zero
+; LA32-NEXT:    bl many_args_callee
+; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32-NEXT:    addi.w $sp, $sp, 16
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: different_args_tail_64bit:
+; LA64:       # %bb.0:
+; LA64-NEXT:    ori $a0, $zero, 9
+; LA64-NEXT:    st.d $a0, $sp, 8
+; LA64-NEXT:    ori $a0, $zero, 8
+; LA64-NEXT:    ori $a1, $zero, 1
+; LA64-NEXT:    ori $a2, $zero, 2
+; LA64-NEXT:    ori $a3, $zero, 3
+; LA64-NEXT:    ori $a4, $zero, 4
+; LA64-NEXT:    ori $a5, $zero, 5
+; LA64-NEXT:    ori $a6, $zero, 6
+; LA64-NEXT:    ori $a7, $zero, 7
+; LA64-NEXT:    st.d $a0, $sp, 0
+; LA64-NEXT:    move $a0, $zero
+; LA64-NEXT:    pcaddu18i $t8, %call36(many_args_callee)
+; LA64-NEXT:    jr $t8
+  %ret = tail call i32 @many_args_callee(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9)
+  ret i32 %ret
+}
+
+; Here, the caller requires less stack space for its arguments than the
+; callee, so it would not be valid to do a tail-call.
+define i32 @fewer_args_tail(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4) { +; LA32-LABEL: fewer_args_tail: +; LA32: # %bb.0: +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: .cfi_def_cfa_offset 16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: .cfi_offset 1, -4 +; LA32-NEXT: ori $a0, $zero, 9 +; LA32-NEXT: st.w $a0, $sp, 4 +; LA32-NEXT: ori $a0, $zero, 8 +; LA32-NEXT: ori $a1, $zero, 1 +; LA32-NEXT: ori $a2, $zero, 2 +; LA32-NEXT: ori $a3, $zero, 3 +; LA32-NEXT: ori $a4, $zero, 4 +; LA32-NEXT: ori $a5, $zero, 5 +; LA32-NEXT: ori $a6, $zero, 6 +; LA32-NEXT: ori $a7, $zero, 7 +; LA32-NEXT: st.w $a0, $sp, 0 +; LA32-NEXT: move $a0, $zero +; LA32-NEXT: bl many_args_callee +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: fewer_args_tail: +; LA64: # %bb.0: +; LA64-NEXT: addi.d $sp, $sp, -32 +; LA64-NEXT: .cfi_def_cfa_offset 32 +; LA64-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill +; LA64-NEXT: .cfi_offset 1, -8 +; LA64-NEXT: ori $a0, $zero, 9 +; LA64-NEXT: st.d $a0, $sp, 8 +; LA64-NEXT: ori $a0, $zero, 8 +; LA64-NEXT: ori $a1, $zero, 1 +; LA64-NEXT: ori $a2, $zero, 2 +; LA64-NEXT: ori $a3, $zero, 3 +; LA64-NEXT: ori $a4, $zero, 4 +; LA64-NEXT: ori $a5, $zero, 5 +; LA64-NEXT: ori $a6, $zero, 6 +; LA64-NEXT: ori $a7, $zero, 7 +; LA64-NEXT: st.d $a0, $sp, 0 +; LA64-NEXT: move $a0, $zero +; LA64-NEXT: pcaddu18i $ra, %call36(many_args_callee) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 32 +; LA64-NEXT: ret + %ret = tail call i32 @many_args_callee(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9) + ret i32 %ret +} + +declare void @foo(i32, i32, i32, i32, i32, i32, i32, i32, i32) + +define void @bar(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8) nounwind { +; LA32-LABEL: bar: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -48 +; LA32-NEXT: st.w $ra, $sp, 44 # 4-byte Folded Spill +; LA32-NEXT: st.w $fp, $sp, 40 # 4-byte Folded Spill +; LA32-NEXT: st.w $s0, $sp, 36 # 4-byte Folded Spill +; LA32-NEXT: st.w $s1, $sp, 32 # 4-byte Folded Spill +; LA32-NEXT: st.w $s2, $sp, 28 # 4-byte Folded Spill +; LA32-NEXT: st.w $s3, $sp, 24 # 4-byte Folded Spill +; LA32-NEXT: st.w $s4, $sp, 20 # 4-byte Folded Spill +; LA32-NEXT: st.w $s5, $sp, 16 # 4-byte Folded Spill +; LA32-NEXT: st.w $s6, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: move $fp, $a7 +; LA32-NEXT: move $s0, $a6 +; LA32-NEXT: move $s1, $a5 +; LA32-NEXT: move $s2, $a4 +; LA32-NEXT: move $s3, $a3 +; LA32-NEXT: move $s4, $a2 +; LA32-NEXT: move $s5, $a1 +; LA32-NEXT: move $s6, $a0 +; LA32-NEXT: ori $a0, $zero, 1 +; LA32-NEXT: st.w $a0, $sp, 0 +; LA32-NEXT: move $a0, $s6 +; LA32-NEXT: bl foo +; LA32-NEXT: ori $a0, $zero, 2 +; LA32-NEXT: st.w $a0, $sp, 48 +; LA32-NEXT: move $a0, $s6 +; LA32-NEXT: move $a1, $s5 +; LA32-NEXT: move $a2, $s4 +; LA32-NEXT: move $a3, $s3 +; LA32-NEXT: move $a4, $s2 +; LA32-NEXT: move $a5, $s1 +; LA32-NEXT: move $a6, $s0 +; LA32-NEXT: move $a7, $fp +; LA32-NEXT: ld.w $s6, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s5, $sp, 16 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s4, $sp, 20 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s3, $sp, 24 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s2, $sp, 28 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s1, $sp, 32 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s0, $sp, 36 # 4-byte Folded Reload +; LA32-NEXT: ld.w $fp, $sp, 40 # 4-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 44 # 4-byte 
Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 48 +; LA32-NEXT: b foo +; +; LA64-LABEL: bar: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -96 +; LA64-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 80 # 8-byte Folded Spill +; LA64-NEXT: st.d $s0, $sp, 72 # 8-byte Folded Spill +; LA64-NEXT: st.d $s1, $sp, 64 # 8-byte Folded Spill +; LA64-NEXT: st.d $s2, $sp, 56 # 8-byte Folded Spill +; LA64-NEXT: st.d $s3, $sp, 48 # 8-byte Folded Spill +; LA64-NEXT: st.d $s4, $sp, 40 # 8-byte Folded Spill +; LA64-NEXT: st.d $s5, $sp, 32 # 8-byte Folded Spill +; LA64-NEXT: st.d $s6, $sp, 24 # 8-byte Folded Spill +; LA64-NEXT: move $fp, $a7 +; LA64-NEXT: move $s0, $a6 +; LA64-NEXT: move $s1, $a5 +; LA64-NEXT: move $s2, $a4 +; LA64-NEXT: move $s3, $a3 +; LA64-NEXT: move $s4, $a2 +; LA64-NEXT: move $s5, $a1 +; LA64-NEXT: move $s6, $a0 +; LA64-NEXT: ori $a0, $zero, 1 +; LA64-NEXT: st.d $a0, $sp, 0 +; LA64-NEXT: move $a0, $s6 +; LA64-NEXT: pcaddu18i $ra, %call36(foo) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: ori $a0, $zero, 2 +; LA64-NEXT: st.d $a0, $sp, 96 +; LA64-NEXT: move $a0, $s6 +; LA64-NEXT: move $a1, $s5 +; LA64-NEXT: move $a2, $s4 +; LA64-NEXT: move $a3, $s3 +; LA64-NEXT: move $a4, $s2 +; LA64-NEXT: move $a5, $s1 +; LA64-NEXT: move $a6, $s0 +; LA64-NEXT: move $a7, $fp +; LA64-NEXT: ld.d $s6, $sp, 24 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s5, $sp, 32 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s4, $sp, 40 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s3, $sp, 48 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s2, $sp, 56 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s1, $sp, 64 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s0, $sp, 72 # 8-byte Folded Reload +; LA64-NEXT: ld.d $fp, $sp, 80 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 96 +; LA64-NEXT: pcaddu18i $t8, %call36(foo) +; LA64-NEXT: jr $t8 +entry: + call void @foo(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 1) + musttail call void @foo(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 2) + ret void +} + +declare void @sret_callee(ptr sret({ double, double }) align 8) + +; Functions which return by sret can be tail-called because the incoming sret +; pointer gets passed through to the callee. +define void @sret_caller_tail(ptr sret({ double, double }) align 8 %result) { +; LA32-LABEL: sret_caller_tail: +; LA32: # %bb.0: # %entry +; LA32-NEXT: b sret_callee +; +; LA64-LABEL: sret_caller_tail: +; LA64: # %bb.0: # %entry +; LA64-NEXT: pcaddu18i $t8, %call36(sret_callee) +; LA64-NEXT: jr $t8 +entry: + tail call void @sret_callee(ptr sret({ double, double }) align 8 %result) + ret void +} + +define void @sret_caller_musttail(ptr sret({ double, double }) align 8 %result) { +; LA32-LABEL: sret_caller_musttail: +; LA32: # %bb.0: # %entry +; LA32-NEXT: b sret_callee +; +; LA64-LABEL: sret_caller_musttail: +; LA64: # %bb.0: # %entry +; LA64-NEXT: pcaddu18i $t8, %call36(sret_callee) +; LA64-NEXT: jr $t8 +entry: + musttail call void @sret_callee(ptr sret({ double, double }) align 8 %result) + ret void +} diff --git a/llvm/test/CodeGen/LoongArch/tail-calls.ll b/llvm/test/CodeGen/LoongArch/tail-calls.ll index 533761c8a1c70..e14fbc2302cce 100644 --- a/llvm/test/CodeGen/LoongArch/tail-calls.ll +++ b/llvm/test/CodeGen/LoongArch/tail-calls.ll @@ -80,20 +80,15 @@ entry: ret void } -;; Do not tail call optimize if stack is used to pass parameters. 
+;; Do tail call optimization if the callee uses no more argument stack space than the caller.
 declare i32 @callee_args(i32 %a, i32 %b, i32 %c, i32 %dd, i32 %e, i32 %ff, i32 %g, i32 %h, i32 %i)
 
 define i32 @caller_args(i32 %a, i32 %b, i32 %c, i32 %dd, i32 %e, i32 %ff, i32 %g, i32 %h, i32 %i) nounwind {
 ; CHECK-LABEL: caller_args:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi.d $sp, $sp, -16
-; CHECK-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
-; CHECK-NEXT:    ld.d $t0, $sp, 16
+; CHECK-NEXT:    ld.d $t0, $sp, 0
 ; CHECK-NEXT:    st.d $t0, $sp, 0
-; CHECK-NEXT:    pcaddu18i $ra, %call36(callee_args)
-; CHECK-NEXT:    jirl $ra, $ra, 0
-; CHECK-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
-; CHECK-NEXT:    addi.d $sp, $sp, 16
-; CHECK-NEXT:    ret
+; CHECK-NEXT:    pcaddu18i $t8, %call36(callee_args)
+; CHECK-NEXT:    jr $t8
 entry:
   %r = tail call i32 @callee_args(i32 %a, i32 %b, i32 %c, i32 %dd, i32 %e, i32 %ff, i32 %g, i32 %h, i32 %i)
   ret i32 %r
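
As a C++-level illustration of the sret tests above (a hypothetical sketch,
not part of the patch itself; all names are illustrative): on LoongArch an
aggregate larger than two GRLEN-sized registers, such as Big below, is
returned through a hidden sret pointer, and with this change a caller that
simply forwards its own sret slot can be lowered to a tail call.

  // Hypothetical example, not from the patch. Big (32 bytes) is returned
  // indirectly through a hidden sret pointer on both LA32 and LA64.
  struct Big {
    double V[4];
  };

  Big makeBig(); // assumed to be defined elsewhere

  // forwardBig's incoming sret pointer is forwarded to makeBig, so the call
  // below can now lower to a tail call (a plain b / jr $t8) instead of a
  // call-and-return sequence.
  Big forwardBig() { return makeBig(); }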