diff --git a/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp b/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp
index d99b7757166ca..bad39dc3a14fe 100644
--- a/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp
@@ -77,7 +77,8 @@ class LoongArchPreRAExpandPseudo : public MachineFunctionPass {
                               MachineBasicBlock::iterator &NextMBBI);
   bool expandFunctionCALL(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator MBBI,
-                          MachineBasicBlock::iterator &NextMBBI);
+                          MachineBasicBlock::iterator &NextMBBI,
+                          bool IsTailCall);
 };
 
 char LoongArchPreRAExpandPseudo::ID = 0;
@@ -121,7 +122,9 @@ bool LoongArchPreRAExpandPseudo::expandMI(
   case LoongArch::PseudoLA_TLS_GD:
     return expandLoadAddressTLSGD(MBB, MBBI, NextMBBI);
   case LoongArch::PseudoCALL:
-    return expandFunctionCALL(MBB, MBBI, NextMBBI);
+    return expandFunctionCALL(MBB, MBBI, NextMBBI, /*IsTailCall=*/false);
+  case LoongArch::PseudoTAIL:
+    return expandFunctionCALL(MBB, MBBI, NextMBBI, /*IsTailCall=*/true);
   }
   return false;
 }
@@ -247,27 +250,43 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSGD(
 
 bool LoongArchPreRAExpandPseudo::expandFunctionCALL(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
-    MachineBasicBlock::iterator &NextMBBI) {
+    MachineBasicBlock::iterator &NextMBBI, bool IsTailCall) {
   MachineFunction *MF = MBB.getParent();
   MachineInstr &MI = *MBBI;
   DebugLoc DL = MI.getDebugLoc();
   const MachineOperand &Func = MI.getOperand(0);
   MachineInstrBuilder CALL;
+  unsigned Opcode;
 
   switch (MF->getTarget().getCodeModel()) {
   default:
     report_fatal_error("Unsupported code model");
     break;
-  case CodeModel::Small: // Default CodeModel.
-    CALL = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::BL)).add(Func);
+  case CodeModel::Small: {
+    // CALL:
+    // bl func
+    // TAIL:
+    // b func
+    Opcode = IsTailCall ? LoongArch::PseudoB_TAIL : LoongArch::BL;
+    CALL = BuildMI(MBB, MBBI, DL, TII->get(Opcode)).add(Func);
     break;
+  }
   case CodeModel::Medium: {
+    // CALL:
     // pcalau12i $ra, %pc_hi20(func)
     // jirl $ra, $ra, %pc_lo12(func)
+    // TAIL:
+    // pcalau12i $scratch, %pc_hi20(func)
+    // jirl $r0, $scratch, %pc_lo12(func)
+    Opcode =
+        IsTailCall ? LoongArch::PseudoJIRL_TAIL : LoongArch::PseudoJIRL_CALL;
+    Register ScratchReg =
+        IsTailCall
+            ? MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass)
+            : LoongArch::R1;
     MachineInstrBuilder MIB =
-        BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCALAU12I), LoongArch::R1);
-    CALL = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PseudoJIRL_CALL))
-               .addReg(LoongArch::R1);
+        BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCALAU12I), ScratchReg);
+    CALL = BuildMI(MBB, MBBI, DL, TII->get(Opcode)).addReg(ScratchReg);
     if (Func.isSymbol()) {
       const char *FnName = Func.getSymbolName();
       MIB.addExternalSymbol(FnName, LoongArchII::MO_PCREL_HI);
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index f0386949f611d..eec32fd490542 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -31,6 +31,8 @@ using namespace llvm;
 
 #define DEBUG_TYPE "loongarch-isel-lowering"
 
+STATISTIC(NumTailCalls, "Number of tail calls");
+
 static cl::opt<bool> ZeroDivCheck(
     "loongarch-check-zero-division", cl::Hidden,
     cl::desc("Trap on integer division by zero."),
@@ -1334,6 +1336,7 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const {
     // TODO: Add more target-dependent nodes later.
     NODE_NAME_CASE(CALL)
     NODE_NAME_CASE(RET)
+    NODE_NAME_CASE(TAIL)
     NODE_NAME_CASE(SLL_W)
     NODE_NAME_CASE(SRA_W)
     NODE_NAME_CASE(SRL_W)
@@ -1808,6 +1811,48 @@ SDValue LoongArchTargetLowering::LowerFormalArguments(
   return Chain;
 }
 
+// Check whether the call is eligible for tail call optimization.
+bool LoongArchTargetLowering::isEligibleForTailCallOptimization(
+    CCState &CCInfo, CallLoweringInfo &CLI, MachineFunction &MF,
+    const SmallVectorImpl<CCValAssign> &ArgLocs) const {
+
+  auto CalleeCC = CLI.CallConv;
+  auto &Outs = CLI.Outs;
+  auto &Caller = MF.getFunction();
+  auto CallerCC = Caller.getCallingConv();
+
+  // Do not tail call opt if the stack is used to pass parameters.
+  if (CCInfo.getNextStackOffset() != 0)
+    return false;
+
+  // Do not tail call opt if any parameters need to be passed indirectly.
+  for (auto &VA : ArgLocs)
+    if (VA.getLocInfo() == CCValAssign::Indirect)
+      return false;
+
+  // Do not tail call opt if either caller or callee uses struct return
+  // semantics.
+  auto IsCallerStructRet = Caller.hasStructRetAttr();
+  auto IsCalleeStructRet = Outs.empty() ? false : Outs[0].Flags.isSRet();
+  if (IsCallerStructRet || IsCalleeStructRet)
+    return false;
+
+  // Do not tail call opt if either the callee or caller has a byval argument.
+  for (auto &Arg : Outs)
+    if (Arg.Flags.isByVal())
+      return false;
+
+  // The callee has to preserve all registers the caller needs to preserve.
+  const LoongArchRegisterInfo *TRI = Subtarget.getRegisterInfo();
+  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
+  if (CalleeCC != CallerCC) {
+    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
+    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
+      return false;
+  }
+  return true;
+}
+
 static Align getPrefTypeAlign(EVT VT, SelectionDAG &DAG) {
   return DAG.getDataLayout().getPrefTypeAlign(
       VT.getTypeForEVT(*DAG.getContext()));
@@ -1829,7 +1874,7 @@ LoongArchTargetLowering::LowerCall(CallLoweringInfo &CLI,
   bool IsVarArg = CLI.IsVarArg;
   EVT PtrVT = getPointerTy(DAG.getDataLayout());
   MVT GRLenVT = Subtarget.getGRLenVT();
-  CLI.IsTailCall = false;
+  bool &IsTailCall = CLI.IsTailCall;
 
   MachineFunction &MF = DAG.getMachineFunction();
 
@@ -1839,6 +1884,16 @@ LoongArchTargetLowering::LowerCall(CallLoweringInfo &CLI,
 
   analyzeOutputArgs(MF, ArgCCInfo, Outs, /*IsRet=*/false, &CLI, CC_LoongArch);
 
+  // Check if it's really possible to do a tail call.
+  if (IsTailCall)
+    IsTailCall = isEligibleForTailCallOptimization(ArgCCInfo, CLI, MF, ArgLocs);
+
+  if (IsTailCall)
+    ++NumTailCalls;
+  else if (CLI.CB && CLI.CB->isMustTailCall())
+    report_fatal_error("failed to perform tail call elimination on a call "
+                       "site marked musttail");
+
   // Get a count of how many bytes are to be pushed on the stack.
   unsigned NumBytes = ArgCCInfo.getNextStackOffset();
 
@@ -1860,12 +1915,13 @@ LoongArchTargetLowering::LowerCall(CallLoweringInfo &CLI,
 
       Chain = DAG.getMemcpy(Chain, DL, FIPtr, Arg, SizeNode, Alignment,
                             /*IsVolatile=*/false,
-                            /*AlwaysInline=*/false, /*isTailCall=*/false,
+                            /*AlwaysInline=*/false, /*isTailCall=*/IsTailCall,
                             MachinePointerInfo(), MachinePointerInfo());
       ByValArgs.push_back(FIPtr);
     }
 
-  Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, CLI.DL);
+  if (!IsTailCall)
+    Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, CLI.DL);
 
   // Copy argument values to their designated locations.
   SmallVector<std::pair<Register, SDValue>> RegsToPass;
@@ -1932,6 +1988,8 @@ LoongArchTargetLowering::LowerCall(CallLoweringInfo &CLI,
       RegsToPass.push_back(std::make_pair(VA.getLocReg(), ArgValue));
     } else {
       assert(VA.isMemLoc() && "Argument not register or memory");
+      assert(!IsTailCall && "Tail call not allowed if stack is used "
+                            "for passing parameters");
 
       // Work out the address of the stack slot.
       if (!StackPtr.getNode())
@@ -1986,11 +2044,13 @@ LoongArchTargetLowering::LowerCall(CallLoweringInfo &CLI,
   for (auto &Reg : RegsToPass)
     Ops.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
 
-  // Add a register mask operand representing the call-preserved registers.
-  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
-  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
-  assert(Mask && "Missing call preserved mask for calling convention");
-  Ops.push_back(DAG.getRegisterMask(Mask));
+  if (!IsTailCall) {
+    // Add a register mask operand representing the call-preserved registers.
+    const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+    const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
+    assert(Mask && "Missing call preserved mask for calling convention");
+    Ops.push_back(DAG.getRegisterMask(Mask));
+  }
 
   // Glue the call to the argument copies, if any.
   if (Glue.getNode())
@@ -1999,6 +2059,11 @@
 
   // Emit the call.
   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+  if (IsTailCall) {
+    MF.getFrameInfo().setHasTailCall();
+    return DAG.getNode(LoongArchISD::TAIL, DL, NodeTys, Ops);
+  }
+
   Chain = DAG.getNode(LoongArchISD::CALL, DL, NodeTys, Ops);
   DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
   Glue = Chain.getValue(1);
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
index 4b7bf9d9c6994..e181c104b1e19 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
@@ -29,6 +29,8 @@ enum NodeType : unsigned {
   // TODO: add more LoongArchISDs
   CALL,
   RET,
+  TAIL,
+
   // 32-bit shifts, directly matching the semantics of the named LoongArch
   // instructions.
   SLL_W,
@@ -204,6 +206,10 @@ class LoongArchTargetLowering : public TargetLowering {
   void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
                                     std::vector<SDValue> &Ops,
                                     SelectionDAG &DAG) const override;
+
+  bool isEligibleForTailCallOptimization(
+      CCState &CCInfo, CallLoweringInfo &CLI, MachineFunction &MF,
+      const SmallVectorImpl<CCValAssign> &ArgLocs) const;
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
index 885c4d75f0b91..84b9f2c29e5a7 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
@@ -50,6 +50,9 @@ def loongarch_call : SDNode<"LoongArchISD::CALL", SDT_LoongArchCall,
                              SDNPVariadic]>;
 def loongarch_ret : SDNode<"LoongArchISD::RET", SDTNone,
                            [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+def loongarch_tail : SDNode<"LoongArchISD::TAIL", SDT_LoongArchCall,
+                            [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                             SDNPVariadic]>;
 def loongarch_sll_w : SDNode<"LoongArchISD::SLL_W", SDT_LoongArchIntBinOpW>;
 def loongarch_sra_w : SDNode<"LoongArchISD::SRA_W", SDT_LoongArchIntBinOpW>;
 def loongarch_srl_w : SDNode<"LoongArchISD::SRL_W", SDT_LoongArchIntBinOpW>;
@@ -232,8 +235,8 @@ def SImm26OperandBL: AsmOperandClass {
   let ParserMethod = "parseSImm26Operand";
 }
 
-// A symbol or an imm used in BL/PseudoCALL.
-def simm26_bl : Operand<GRLenVT> {
+// A symbol or an imm used in BL/PseudoCALL/PseudoTAIL.
+def simm26_symbol : Operand<GRLenVT> {
   let ParserMatchClass = SImm26OperandBL;
   let EncoderMethod = "getImmOpValueAsr2";
   let DecoderMethod = "decodeSImmOperand<26, 2>";
@@ -455,7 +458,7 @@ def BNEZ : BrCCZ_1RI21<0b010001, "bnez">;
 def B : Br_I26<0b010100, "b">;
 
 let isCall = 1, Defs=[R1] in
-def BL : FmtI26<0b010101, (outs), (ins simm26_bl:$imm26), "bl", "$imm26">;
+def BL : FmtI26<0b010101, (outs), (ins simm26_symbol:$imm26), "bl", "$imm26">;
 def JIRL : Fmt2RI16<0b010011, (outs GPR:$rd),
                     (ins GPR:$rj, simm16_lsl2:$imm16), "jirl",
                     "$rd, $rj, $imm16">;
@@ -934,7 +937,7 @@ def : Pat<(brind (add GPR:$rj, simm16_lsl2:$imm16)),
           (PseudoBRIND GPR:$rj, simm16_lsl2:$imm16)>;
 
 let isCall = 1, Defs = [R1] in
-def PseudoCALL : Pseudo<(outs), (ins simm26_bl:$func)>;
+def PseudoCALL : Pseudo<(outs), (ins simm26_symbol:$func)>;
 
 def : Pat<(loongarch_call tglobaladdr:$func), (PseudoCALL tglobaladdr:$func)>;
 def : Pat<(loongarch_call texternalsym:$func), (PseudoCALL texternalsym:$func)>;
@@ -953,6 +956,28 @@ let isBarrier = 1, isReturn = 1, isTerminator = 1 in
 def PseudoRET : Pseudo<(outs), (ins), [(loongarch_ret)]>,
                 PseudoInstExpansion<(JIRL R0, R1, 0)>;
 
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [R3] in
+def PseudoTAIL : Pseudo<(outs), (ins simm26_symbol:$dst)>;
+
+def : Pat<(loongarch_tail (iPTR tglobaladdr:$dst)),
+          (PseudoTAIL tglobaladdr:$dst)>;
+def : Pat<(loongarch_tail (iPTR texternalsym:$dst)),
+          (PseudoTAIL texternalsym:$dst)>;
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [R3] in
+def PseudoTAILIndirect : Pseudo<(outs), (ins GPRT:$rj),
+                                [(loongarch_tail GPRT:$rj)]>,
+                         PseudoInstExpansion<(JIRL R0, GPR:$rj, 0)>;
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [R3] in
+def PseudoB_TAIL : Pseudo<(outs), (ins simm26_b:$imm26)>,
+                   PseudoInstExpansion<(B simm26_b:$imm26)>;
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [R3] in
+def PseudoJIRL_TAIL : Pseudo<(outs), (ins GPR:$rj, simm16_lsl2:$imm16)>,
+                      PseudoInstExpansion<(JIRL R0, GPR:$rj,
+                                           simm16_lsl2:$imm16)>;
+
 let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
 def PseudoLA_PCREL : Pseudo<(outs GPR:$dst), (ins grlenimm:$src)>;
diff --git a/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.td b/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.td
index 2a46c6e57a493..ff914f805e5b2 100644
--- a/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.td
@@ -98,6 +98,16 @@ def GPR : RegisterClass<"LoongArch", [GRLenVT], 32, (add
   let RegInfos = GRLenRI;
 }
 
+// GPR for indirect tail calls. We can't use callee-saved registers, as they
+// are restored to the saved value before the tail call, which would clobber
+// the call address.
+def GPRT : RegisterClass<"LoongArch", [GRLenVT], 32, (add
+    // a0...a7, t0...t8
+    (sequence "R%u", 4, 20)
+  )> {
+  let RegInfos = GRLenRI;
+}
+
 // Floating point registers
 
 let RegAltNameIndices = [RegAliasName] in {
diff --git a/llvm/test/CodeGen/LoongArch/codemodel-medium.ll b/llvm/test/CodeGen/LoongArch/codemodel-medium.ll
index aad38bb81952d..d4d97e7df804d 100644
--- a/llvm/test/CodeGen/LoongArch/codemodel-medium.ll
+++ b/llvm/test/CodeGen/LoongArch/codemodel-medium.ll
@@ -61,3 +61,19 @@ entry:
   call void @llvm.memset.p0.i64(ptr %dst, i8 0, i64 1000, i1 false)
   ret void
 }
+
+;; Tail call with different code models.
+declare i32 @callee_tail(i32 %i)
+define i32 @caller_tail(i32 %i) nounwind {
+; SMALL-LABEL: caller_tail:
+; SMALL:       # %bb.0: # %entry
+; SMALL-NEXT:    b %plt(callee_tail)
+;
+; MEDIUM-LABEL: caller_tail:
+; MEDIUM:       # %bb.0: # %entry
+; MEDIUM-NEXT:    pcalau12i $a1, %pc_hi20(callee_tail)
+; MEDIUM-NEXT:    jirl $zero, $a1, %pc_lo12(callee_tail)
+entry:
+  %r = tail call i32 @callee_tail(i32 %i)
+  ret i32 %r
+}
diff --git a/llvm/test/CodeGen/LoongArch/nomerge.ll b/llvm/test/CodeGen/LoongArch/nomerge.ll
index 6c69f0d15675a..e4aecd79993ea 100644
--- a/llvm/test/CodeGen/LoongArch/nomerge.ll
+++ b/llvm/test/CodeGen/LoongArch/nomerge.ll
@@ -32,4 +32,4 @@ attributes #0 = { nomerge }
 ; CHECK: .LBB0_3: # %if.then2
 ; CHECK-NEXT: bl %plt(bar)
 ; CHECK: .LBB0_4: # %if.end3
-; CHECK: bl %plt(bar)
+; CHECK: b %plt(bar)
diff --git a/llvm/test/CodeGen/LoongArch/tail-calls.ll b/llvm/test/CodeGen/LoongArch/tail-calls.ll
new file mode 100644
index 0000000000000..f09b49688263e
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/tail-calls.ll
@@ -0,0 +1,187 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc --mtriple=loongarch64 < %s | FileCheck %s
+
+;; Perform tail call optimization for global address.
+declare i32 @callee_tail(i32 %i)
+define i32 @caller_tail(i32 %i) nounwind {
+; CHECK-LABEL: caller_tail:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    b %plt(callee_tail)
+entry:
+  %r = tail call i32 @callee_tail(i32 %i)
+  ret i32 %r
+}
+
+;; Perform tail call optimization for external symbol.
+@dest = global [2 x i8] zeroinitializer
+declare void @llvm.memcpy.p0i8.p0i8.i32(ptr, ptr, i32, i1)
+define void @caller_extern(ptr %src) optsize {
+; CHECK-LABEL: caller_extern:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    move $a1, $a0
+; CHECK-NEXT:    pcalau12i $a0, %got_pc_hi20(dest)
+; CHECK-NEXT:    ld.d $a0, $a0, %got_pc_lo12(dest)
+; CHECK-NEXT:    ori $a2, $zero, 7
+; CHECK-NEXT:    b %plt(memcpy)
+entry:
+  tail call void @llvm.memcpy.p0i8.p0i8.i32(ptr getelementptr inbounds ([2 x i8], ptr @dest, i32 0, i32 0), ptr %src, i32 7, i1 false)
+  ret void
+}
+
+;; Perform indirect tail call optimization (for function pointer call).
+declare void @callee_indirect1()
+declare void @callee_indirect2()
+define void @caller_indirect_tail(i32 %a) nounwind {
+; CHECK-LABEL: caller_indirect_tail:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    bstrpick.d $a0, $a0, 31, 0
+; CHECK-NEXT:    sltui $a0, $a0, 1
+; CHECK-NEXT:    pcalau12i $a1, %got_pc_hi20(callee_indirect2)
+; CHECK-NEXT:    ld.d $a1, $a1, %got_pc_lo12(callee_indirect2)
+; CHECK-NEXT:    masknez $a1, $a1, $a0
+; CHECK-NEXT:    pcalau12i $a2, %got_pc_hi20(callee_indirect1)
+; CHECK-NEXT:    ld.d $a2, $a2, %got_pc_lo12(callee_indirect1)
+; CHECK-NEXT:    maskeqz $a0, $a2, $a0
+; CHECK-NEXT:    or $a0, $a0, $a1
+; CHECK-NEXT:    jr $a0
+entry:
+  %tobool = icmp eq i32 %a, 0
+  %callee = select i1 %tobool, ptr @callee_indirect1, ptr @callee_indirect2
+  tail call void %callee()
+  ret void
+}
+
+;; Do not tail call optimize functions with varargs passed by stack.
+declare i32 @callee_varargs(i32, ...)
+define void @caller_varargs(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: caller_varargs:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    addi.d $sp, $sp, -16
+; CHECK-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $a0, $sp, 0
+; CHECK-NEXT:    move $a2, $a1
+; CHECK-NEXT:    move $a3, $a0
+; CHECK-NEXT:    move $a4, $a0
+; CHECK-NEXT:    move $a5, $a1
+; CHECK-NEXT:    move $a6, $a1
+; CHECK-NEXT:    move $a7, $a0
+; CHECK-NEXT:    bl %plt(callee_varargs)
+; CHECK-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 16
+; CHECK-NEXT:    ret
+entry:
+  %call = tail call i32 (i32, ...) @callee_varargs(i32 %a, i32 %b, i32 %b, i32 %a, i32 %a, i32 %b, i32 %b, i32 %a, i32 %a)
+  ret void
+}
+
+;; Do not tail call optimize if stack is used to pass parameters.
+declare i32 @callee_args(i32 %a, i32 %b, i32 %c, i32 %dd, i32 %e, i32 %ff, i32 %g, i32 %h, i32 %i)
+define i32 @caller_args(i32 %a, i32 %b, i32 %c, i32 %dd, i32 %e, i32 %ff, i32 %g, i32 %h, i32 %i) nounwind {
+; CHECK-LABEL: caller_args:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    addi.d $sp, $sp, -16
+; CHECK-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
+; CHECK-NEXT:    ld.d $t0, $sp, 16
+; CHECK-NEXT:    st.d $t0, $sp, 0
+; CHECK-NEXT:    bl %plt(callee_args)
+; CHECK-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 16
+; CHECK-NEXT:    ret
+entry:
+  %r = tail call i32 @callee_args(i32 %a, i32 %b, i32 %c, i32 %dd, i32 %e, i32 %ff, i32 %g, i32 %h, i32 %i)
+  ret i32 %r
+}
+
+;; Do not tail call optimize if parameters need to be passed indirectly.
+declare i32 @callee_indirect_args(i256 %a)
+define void @caller_indirect_args() nounwind {
+; CHECK-LABEL: caller_indirect_args:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    addi.d $sp, $sp, -48
+; CHECK-NEXT:    st.d $ra, $sp, 40 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $zero, $sp, 24
+; CHECK-NEXT:    st.d $zero, $sp, 16
+; CHECK-NEXT:    st.d $zero, $sp, 8
+; CHECK-NEXT:    ori $a0, $zero, 1
+; CHECK-NEXT:    st.d $a0, $sp, 0
+; CHECK-NEXT:    addi.d $a0, $sp, 0
+; CHECK-NEXT:    bl %plt(callee_indirect_args)
+; CHECK-NEXT:    ld.d $ra, $sp, 40 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 48
+; CHECK-NEXT:    ret
+entry:
+  %call = tail call i32 @callee_indirect_args(i256 1)
+  ret void
+}
+
+;; Do not tail call optimize if byval parameters need to be passed.
+declare i32 @callee_byval(ptr byval(ptr) %a)
+define i32 @caller_byval() nounwind {
+; CHECK-LABEL: caller_byval:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    addi.d $sp, $sp, -32
+; CHECK-NEXT:    st.d $ra, $sp, 24 # 8-byte Folded Spill
+; CHECK-NEXT:    ld.d $a0, $sp, 16
+; CHECK-NEXT:    st.d $a0, $sp, 8
+; CHECK-NEXT:    addi.d $a0, $sp, 8
+; CHECK-NEXT:    bl %plt(callee_byval)
+; CHECK-NEXT:    ld.d $ra, $sp, 24 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 32
+; CHECK-NEXT:    ret
+entry:
+  %a = alloca ptr
+  %r = tail call i32 @callee_byval(ptr byval(ptr) %a)
+  ret i32 %r
+}
+
+;; Do not tail call optimize if callee uses structret semantics.
+%struct.A = type { i32 }
+@a = global %struct.A zeroinitializer
+
+declare void @callee_struct(ptr sret(%struct.A) %a)
+define void @caller_nostruct() nounwind {
+; CHECK-LABEL: caller_nostruct:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    addi.d $sp, $sp, -16
+; CHECK-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
+; CHECK-NEXT:    pcalau12i $a0, %got_pc_hi20(a)
+; CHECK-NEXT:    ld.d $a0, $a0, %got_pc_lo12(a)
+; CHECK-NEXT:    bl %plt(callee_struct)
+; CHECK-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 16
+; CHECK-NEXT:    ret
+entry:
+  tail call void @callee_struct(ptr sret(%struct.A) @a)
+  ret void
+}
+
+;; Do not tail call optimize if caller uses structret semantics.
+declare void @callee_nostruct()
+define void @caller_struct(ptr sret(%struct.A) %a) nounwind {
+; CHECK-LABEL: caller_struct:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    addi.d $sp, $sp, -16
+; CHECK-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
+; CHECK-NEXT:    bl %plt(callee_nostruct)
+; CHECK-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 16
+; CHECK-NEXT:    ret
+entry:
+  tail call void @callee_nostruct()
+  ret void
+}
+
+;; Do not tail call optimize if disabled.
+define i32 @disable_tail_calls(i32 %i) nounwind "disable-tail-calls"="true" {
+; CHECK-LABEL: disable_tail_calls:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    addi.d $sp, $sp, -16
+; CHECK-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
+; CHECK-NEXT:    bl %plt(callee_tail)
+; CHECK-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 16
+; CHECK-NEXT:    ret
+entry:
+  %rv = tail call i32 @callee_tail(i32 %i)
+  ret i32 %rv
+}
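
Note on a path the added tests do not exercise: with this change, a musttail call that fails isEligibleForTailCallOptimization no longer degrades to a normal call; LowerCall now aborts via report_fatal_error. A minimal sketch of a test for that path (hypothetical, not part of this patch; it follows the usual `not --crash` plus FileCheck idiom used elsewhere in the LLVM test suite, and the ninth i32 argument forces stack passing, which the eligibility check rejects):

; RUN: not --crash llc --mtriple=loongarch64 < %s 2>&1 | FileCheck %s
; CHECK: LLVM ERROR: failed to perform tail call elimination on a call site marked musttail

declare i32 @callee_musttail(i32, i32, i32, i32, i32, i32, i32, i32, i32)
define i32 @caller_musttail(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f,
                            i32 %g, i32 %h, i32 %i) nounwind {
entry:
  ; The ninth integer argument does not fit in $a0-$a7, so it is passed on
  ; the stack and the tail call cannot be performed as required by musttail.
  %r = musttail call i32 @callee_musttail(i32 %a, i32 %b, i32 %c, i32 %d,
                                          i32 %e, i32 %f, i32 %g, i32 %h,
                                          i32 %i)
  ret i32 %r
}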