diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index e28b9c11a04cd..157642e14f68d 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1067,6 +1067,19 @@ namespace llvm {
   //===--------------------------------------------------------------------===//
   // X86 Implementation of the TargetLowering interface
   class X86TargetLowering final : public TargetLowering {
+    // Copying needed for an outgoing byval argument.
+    enum ByValCopyKind {
+      // Argument is already in the correct location, no copy needed.
+      NoCopy,
+      // Argument value is currently in the local stack frame, needs copying to
+      // outgoing argument area.
+      CopyOnce,
+      // Argument value is currently in the outgoing argument area, but not at
+      // the correct offset, so needs copying via a temporary in local stack
+      // space.
+      CopyViaTemp,
+    };
+
   public:
     explicit X86TargetLowering(const X86TargetMachine &TM,
                                const X86Subtarget &STI);
@@ -1775,6 +1788,9 @@ namespace llvm {
     SDValue LowerADDROFRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const;
+    ByValCopyKind ByValNeedsCopyForTailCall(SelectionDAG &DAG, SDValue Src,
+                                            SDValue Dst,
+                                            ISD::ArgFlagsTy Flags) const;
     SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const;
     SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
     SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
index 37d77728882b1..a281a02077991 100644
--- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
@@ -2018,6 +2018,146 @@ SDValue X86TargetLowering::getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
   return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
 }
 
+/// Return true if the given stack call argument is already available in the
+/// same position (relatively) of the caller's incoming argument stack.
+static bool MatchingStackOffset(SDValue Arg, unsigned Offset,
+                                ISD::ArgFlagsTy Flags, MachineFrameInfo &MFI,
+                                const MachineRegisterInfo *MRI,
+                                const X86InstrInfo *TII,
+                                const CCValAssign &VA) {
+  unsigned Bytes = Arg.getValueSizeInBits() / 8;
+
+  for (;;) {
+    // Look through nodes that don't alter the bits of the incoming value.
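+    // (zero- and any-extends, bitcasts, AssertZext, and truncates that only
+    // undo an AssertZext of the same type).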
+    unsigned Op = Arg.getOpcode();
+    if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST ||
+        Op == ISD::AssertZext) {
+      Arg = Arg.getOperand(0);
+      continue;
+    }
+    if (Op == ISD::TRUNCATE) {
+      const SDValue &TruncInput = Arg.getOperand(0);
+      if (TruncInput.getOpcode() == ISD::AssertZext &&
+          cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
+              Arg.getValueType()) {
+        Arg = TruncInput.getOperand(0);
+        continue;
+      }
+    }
+    break;
+  }
+
+  int FI = INT_MAX;
+  if (Arg.getOpcode() == ISD::CopyFromReg) {
+    Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
+    if (!VR.isVirtual())
+      return false;
+    MachineInstr *Def = MRI->getVRegDef(VR);
+    if (!Def)
+      return false;
+    if (!Flags.isByVal()) {
+      if (!TII->isLoadFromStackSlot(*Def, FI))
+        return false;
+    } else {
+      unsigned Opcode = Def->getOpcode();
+      if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
+           Opcode == X86::LEA64_32r) &&
+          Def->getOperand(1).isFI()) {
+        FI = Def->getOperand(1).getIndex();
+        Bytes = Flags.getByValSize();
+      } else
+        return false;
+    }
+  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
+    if (Flags.isByVal())
+      // ByVal argument is passed in as a pointer but it's now being
+      // dereferenced. e.g.
+      // define @foo(%struct.X* %A) {
+      //   tail call @bar(%struct.X* byval %A)
+      // }
+      return false;
+    SDValue Ptr = Ld->getBasePtr();
+    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
+    if (!FINode)
+      return false;
+    FI = FINode->getIndex();
+  } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
+    FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
+    FI = FINode->getIndex();
+    Bytes = Flags.getByValSize();
+  } else
+    return false;
+
+  assert(FI != INT_MAX);
+  if (!MFI.isFixedObjectIndex(FI))
+    return false;
+
+  if (Offset != MFI.getObjectOffset(FI))
+    return false;
+
+  // If this is not byval, check that the argument stack object is immutable.
+  // inalloca and argument copy elision can create mutable argument stack
+  // objects. Byval objects can be mutated, but a byval call intends to pass the
+  // mutated memory.
+  if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
+    return false;
+
+  if (VA.getLocVT().getFixedSizeInBits() >
+      Arg.getValueSizeInBits().getFixedValue()) {
+    // If the argument location is wider than the argument type, check that any
+    // extension flags match.
+    if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
+        Flags.isSExt() != MFI.isObjectSExt(FI)) {
+      return false;
+    }
+  }
+
+  return Bytes == MFI.getObjectSize(FI);
+}
+
+// Returns the type of copying which is required to set up a byval argument to
+// a tail-called function. This isn't needed for non-tail calls, because they
+// always need the equivalent of CopyOnce, but tail-calls sometimes need two
+// copies to avoid clobbering another argument (CopyViaTemp), and sometimes can
+// be optimised to zero copies when forwarding an argument from the caller's
+// caller (NoCopy).
+X86TargetLowering::ByValCopyKind X86TargetLowering::ByValNeedsCopyForTailCall(
+    SelectionDAG &DAG, SDValue Src, SDValue Dst, ISD::ArgFlagsTy Flags) const {
+  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+
+  // Globals are always safe to copy from.
+  if (isa<GlobalAddressSDNode>(Src) || isa<ExternalSymbolSDNode>(Src))
+    return CopyOnce;
+
+  // Can only analyse frame index nodes, conservatively assume we need a
+  // temporary.
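+  // (for example an address produced by pointer arithmetic or loaded from
+  // memory, which we cannot prove does not alias the outgoing argument area).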
+  auto *SrcFrameIdxNode = dyn_cast<FrameIndexSDNode>(Src);
+  auto *DstFrameIdxNode = dyn_cast<FrameIndexSDNode>(Dst);
+  if (!SrcFrameIdxNode || !DstFrameIdxNode)
+    return CopyViaTemp;
+
+  int SrcFI = SrcFrameIdxNode->getIndex();
+  int DstFI = DstFrameIdxNode->getIndex();
+  assert(MFI.isFixedObjectIndex(DstFI) &&
+         "byval passed in non-fixed stack slot");
+
+  int64_t SrcOffset = MFI.getObjectOffset(SrcFI);
+  int64_t DstOffset = MFI.getObjectOffset(DstFI);
+
+  // If the source is in the local frame, then the copy to the argument
+  // memory is always valid.
+  bool FixedSrc = MFI.isFixedObjectIndex(SrcFI);
+  if (!FixedSrc || (FixedSrc && SrcOffset < 0))
+    return CopyOnce;
+
+  // If the value is already in the correct location, then no copying is
+  // needed. If not, then we need to copy via a temporary.
+  if (SrcOffset == DstOffset)
+    return NoCopy;
+  else
+    return CopyViaTemp;
+}
+
 SDValue
 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                              SmallVectorImpl<SDValue> &InVals) const {
@@ -2098,21 +2238,20 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
       isTailCall = false;
   }
 
-  if (isTailCall && !IsMustTail) {
+  if (isTailCall) {
     // Check if it's really possible to do a tail call.
-    isTailCall = IsEligibleForTailCallOptimization(CLI, CCInfo, ArgLocs,
-                                                   IsCalleePopSRet);
+    IsSibcall = IsEligibleForTailCallOptimization(CLI, CCInfo, ArgLocs,
+                                                  IsCalleePopSRet);
 
-    // Sibcalls are automatically detected tailcalls which do not require
-    // ABI changes.
-    if (!IsGuaranteeTCO && isTailCall)
+    if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt &&
+        CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail)
       IsSibcall = true;
 
     if (isTailCall)
       ++NumTailCalls;
   }
 
-  if (IsMustTail && !isTailCall)
+  if (!isTailCall && CLI.CB && CLI.CB->isMustTailCall())
     report_fatal_error("failed to perform tail call elimination on a call "
                        "site marked musttail");
 
@@ -2128,8 +2267,9 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv))
     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
 
+  // A sibcall is ABI-compatible and does not need to adjust the stack pointer.
   int FPDiff = 0;
-  if (isTailCall &&
+  if (isTailCall && !IsSibcall &&
       shouldGuaranteeTCO(CallConv,
                          MF.getTarget().Options.GuaranteedTailCallOpt)) {
     // Lower arguments at fp - stackoffset + fpdiff.
@@ -2146,6 +2286,75 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   unsigned NumBytesToPush = NumBytes;
   unsigned NumBytesToPop = NumBytes;
 
+  SDValue StackPtr;
+  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+
+  // If we are doing a tail-call, any byval arguments will be written to stack
+  // space which was used for incoming arguments. If any of the values being
+  // used are incoming byval arguments to this function, then they might be
+  // overwritten by the stores of the outgoing arguments. To avoid this, we
+  // need to make a temporary copy of them in local stack space, then copy back
+  // to the argument area.
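+  // For example, when a musttail call forwards the caller's two incoming
+  // byval arguments in swapped order (as @swap_byvals in the new test does):
+  //   musttail call void @callee(ptr byval(%T) %b, ptr byval(%T) %a)
+  // writing %b into the first outgoing slot would clobber the %a bytes that
+  // are still needed for the second slot, so both values are staged through
+  // local temporaries first (the CopyViaTemp case).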
+ DenseMap ByValTemporaries; + SDValue ByValTempChain; + if (isTailCall) { + SmallVector ByValCopyChains; + for (const CCValAssign &VA : ArgLocs) { + unsigned ArgIdx = VA.getValNo(); + SDValue Src = OutVals[ArgIdx]; + ISD::ArgFlagsTy Flags = Outs[ArgIdx].Flags; + + if (!Flags.isByVal()) + continue; + + auto PtrVT = getPointerTy(DAG.getDataLayout()); + + if (!StackPtr.getNode()) + StackPtr = + DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), PtrVT); + + // Destination: where this byval should live in the callee’s frame + // after the tail call. + int32_t Offset = VA.getLocMemOffset() + FPDiff; + int Size = VA.getLocVT().getFixedSizeInBits() / 8; + int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true); + SDValue Dst = DAG.getFrameIndex(FI, PtrVT); + + ByValCopyKind Copy = ByValNeedsCopyForTailCall(DAG, Src, Dst, Flags); + + if (Copy == NoCopy) { + // If the argument is already at the correct offset on the stack + // (because we are forwarding a byval argument from our caller), we + // don't need any copying. + continue; + } else if (Copy == CopyOnce) { + // If the argument is in our local stack frame, no other argument + // preparation can clobber it, so we can copy it to the final location + // later. + ByValTemporaries[ArgIdx] = Src; + } else { + assert(Copy == CopyViaTemp && "unexpected enum value"); + // If we might be copying this argument from the outgoing argument + // stack area, we need to copy via a temporary in the local stack + // frame. + MachineFrameInfo &MFI = MF.getFrameInfo(); + int TempFrameIdx = MFI.CreateStackObject(Flags.getByValSize(), + Flags.getNonZeroByValAlign(), + /*isSS=*/false); + SDValue Temp = + DAG.getFrameIndex(TempFrameIdx, getPointerTy(DAG.getDataLayout())); + + SDValue CopyChain = + CreateCopyOfByValArgument(Src, Temp, Chain, Flags, DAG, dl); + ByValCopyChains.push_back(CopyChain); + } + } + + if (!ByValCopyChains.empty()) + ByValTempChain = + DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ByValCopyChains); + } + // If we have an inalloca argument, all stack space has already been allocated // for us and be right at the top of the stack. We don't support multiple // arguments passed in memory when using inalloca. @@ -2186,7 +2395,6 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVector, 8> RegsToPass; SmallVector MemOpChains; - SDValue StackPtr; // The next loop assumes that the locations are in the same order of the // input arguments. @@ -2195,7 +2403,6 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Walk the register/memloc assignments, inserting copies/loads. In the case // of tail call optimization arguments are handle later. - const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E; ++I, ++OutIndex) { assert(OutIndex < Outs.size() && "Invalid Out index"); @@ -2285,7 +2492,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (ShadowReg) RegsToPass.push_back(std::make_pair(ShadowReg, Arg)); } - } else if (!IsSibcall && (!isTailCall || isByVal)) { + } else if (!IsSibcall && (!isTailCall || (isByVal && !IsMustTail))) { assert(VA.isMemLoc()); if (!StackPtr.getNode()) StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), @@ -2362,7 +2569,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // For tail calls lower the arguments to the 'real' stack slots. 
Sibcalls // don't need this because the eligibility check rejects calls that require // shuffling arguments passed in memory. - if (!IsSibcall && isTailCall) { + if (isTailCall) { // Force all the incoming stack arguments to be loaded from the stack // before any new outgoing arguments or the return address are stored to the // stack, because the outgoing stack slots may alias the incoming argument @@ -2372,6 +2579,10 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // would clobber. Chain = DAG.getStackArgumentTokenFactor(Chain); + if (ByValTempChain) + Chain = + DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chain, ByValTempChain); + SmallVector MemOpChains2; SDValue FIN; int FI = 0; @@ -2404,21 +2615,40 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); if (Flags.isByVal()) { - // Copy relative to framepointer. - SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl); - if (!StackPtr.getNode()) - StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), - getPointerTy(DAG.getDataLayout())); - Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), - StackPtr, Source); - - MemOpChains2.push_back( - CreateCopyOfByValArgument(Source, FIN, Chain, Flags, DAG, dl)); + SDValue ByValSrc; + bool NeedsStackCopy; + if (auto It = ByValTemporaries.find(OutsIndex); + It != ByValTemporaries.end()) { + ByValSrc = It->second; + NeedsStackCopy = true; + } else { + ByValSrc = Arg; + NeedsStackCopy = !isTailCall; + } + + if (NeedsStackCopy) { + + auto PtrVT = getPointerTy(DAG.getDataLayout()); + SDValue DstAddr = DAG.getFrameIndex(FI, PtrVT); + + // Copy the struct contents from ByValSrc to DstAddr. + MemOpChains2.push_back(CreateCopyOfByValArgument( + ByValSrc, DstAddr, Chain, Flags, DAG, dl)); + } } else { - // Store relative to framepointer. - MemOpChains2.push_back(DAG.getStore( - Chain, dl, Arg, FIN, - MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI))); + // Check if the arguments are already laid out in the right way as + // the caller's fixed stack objects. + MachineFrameInfo &MFI = MF.getFrameInfo(); + const MachineRegisterInfo *MRI = &MF.getRegInfo(); + const X86InstrInfo *TII = Subtarget.getInstrInfo(); + + if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, MFI, MRI, + TII, VA)) { + // Store relative to framepointer. + MemOpChains2.push_back(DAG.getStore( + Chain, dl, Arg, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI))); + } } } @@ -2684,102 +2914,6 @@ X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize, return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize; } -/// Return true if the given stack call argument is already available in the -/// same position (relatively) of the caller's incoming argument stack. -static -bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, - MachineFrameInfo &MFI, const MachineRegisterInfo *MRI, - const X86InstrInfo *TII, const CCValAssign &VA) { - unsigned Bytes = Arg.getValueSizeInBits() / 8; - - for (;;) { - // Look through nodes that don't alter the bits of the incoming value. 
- unsigned Op = Arg.getOpcode(); - if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST || - Op == ISD::AssertZext) { - Arg = Arg.getOperand(0); - continue; - } - if (Op == ISD::TRUNCATE) { - const SDValue &TruncInput = Arg.getOperand(0); - if (TruncInput.getOpcode() == ISD::AssertZext && - cast(TruncInput.getOperand(1))->getVT() == - Arg.getValueType()) { - Arg = TruncInput.getOperand(0); - continue; - } - } - break; - } - - int FI = INT_MAX; - if (Arg.getOpcode() == ISD::CopyFromReg) { - Register VR = cast(Arg.getOperand(1))->getReg(); - if (!VR.isVirtual()) - return false; - MachineInstr *Def = MRI->getVRegDef(VR); - if (!Def) - return false; - if (!Flags.isByVal()) { - if (!TII->isLoadFromStackSlot(*Def, FI)) - return false; - } else { - unsigned Opcode = Def->getOpcode(); - if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r || - Opcode == X86::LEA64_32r) && - Def->getOperand(1).isFI()) { - FI = Def->getOperand(1).getIndex(); - Bytes = Flags.getByValSize(); - } else - return false; - } - } else if (LoadSDNode *Ld = dyn_cast(Arg)) { - if (Flags.isByVal()) - // ByVal argument is passed in as a pointer but it's now being - // dereferenced. e.g. - // define @foo(%struct.X* %A) { - // tail call @bar(%struct.X* byval %A) - // } - return false; - SDValue Ptr = Ld->getBasePtr(); - FrameIndexSDNode *FINode = dyn_cast(Ptr); - if (!FINode) - return false; - FI = FINode->getIndex(); - } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) { - FrameIndexSDNode *FINode = cast(Arg); - FI = FINode->getIndex(); - Bytes = Flags.getByValSize(); - } else - return false; - - assert(FI != INT_MAX); - if (!MFI.isFixedObjectIndex(FI)) - return false; - - if (Offset != MFI.getObjectOffset(FI)) - return false; - - // If this is not byval, check that the argument stack object is immutable. - // inalloca and argument copy elision can create mutable argument stack - // objects. Byval objects can be mutated, but a byval call intends to pass the - // mutated memory. - if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI)) - return false; - - if (VA.getLocVT().getFixedSizeInBits() > - Arg.getValueSizeInBits().getFixedValue()) { - // If the argument location is wider than the argument type, check that any - // extension flags match. - if (Flags.isZExt() != MFI.isObjectZExt(FI) || - Flags.isSExt() != MFI.isObjectSExt(FI)) { - return false; - } - } - - return Bytes == MFI.getObjectSize(FI); -} - static bool mayBeSRetTailCallCompatible(const TargetLowering::CallLoweringInfo &CLI, Register CallerSRetReg) { @@ -2814,9 +2948,10 @@ mayBeSRetTailCallCompatible(const TargetLowering::CallLoweringInfo &CLI, /// Check whether the call is eligible for tail call optimization. Targets /// that want to do tail call optimization should implement this function. -/// Note that the x86 backend does not check musttail calls for eligibility! The -/// rest of x86 tail call lowering must be prepared to forward arguments of any -/// type. +/// +/// Note that this function also processes musttail calls, so when this +/// function returns false on a valid musttail call, a fatal backend error +/// occurs. bool X86TargetLowering::IsEligibleForTailCallOptimization( TargetLowering::CallLoweringInfo &CLI, CCState &CCInfo, SmallVectorImpl &ArgLocs, bool IsCalleePopSRet) const { @@ -2943,26 +3078,6 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization( // If the callee takes no arguments then go on to check the results of the // call. 
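+  // Note that the per-argument MatchingStackOffset check which used to be done
+  // here has moved into LowerCall, which now simply skips the store for any
+  // stack argument that is already in its final slot instead of rejecting the
+  // tail call.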
if (!Outs.empty()) { - if (StackArgsSize > 0) { - // Check if the arguments are already laid out in the right way as - // the caller's fixed stack objects. - MachineFrameInfo &MFI = MF.getFrameInfo(); - const MachineRegisterInfo *MRI = &MF.getRegInfo(); - const X86InstrInfo *TII = Subtarget.getInstrInfo(); - for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) { - const CCValAssign &VA = ArgLocs[I]; - SDValue Arg = OutVals[I]; - ISD::ArgFlagsTy Flags = Outs[I].Flags; - if (VA.getLocInfo() == CCValAssign::Indirect) - return false; - if (!VA.isRegLoc()) { - if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, MFI, MRI, - TII, VA)) - return false; - } - } - } - bool PositionIndependent = isPositionIndependent(); // If the tailcall address may be in a register, then make sure it's // possible to register allocate for it. In 32-bit, the call address can @@ -3000,6 +3115,11 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization( X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg, MF.getTarget().Options.GuaranteedTailCallOpt); + // If the stack arguments for this call do not fit into our own save area then + // the call cannot be made tail. + if (CCInfo.getStackSize() > FuncInfo->getArgumentStackSize()) + return false; + if (unsigned BytesToPop = FuncInfo->getBytesToPopOnReturn()) { // If we have bytes to pop, the callee must pop them. bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize; diff --git a/llvm/test/CodeGen/X86/musttail-struct.ll b/llvm/test/CodeGen/X86/musttail-struct.ll new file mode 100644 index 0000000000000..62ca16589b3ee --- /dev/null +++ b/llvm/test/CodeGen/X86/musttail-struct.ll @@ -0,0 +1,187 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -x86-asm-syntax=intel | FileCheck %s + +; Test correct handling of a musttail call with a byval struct argument. + +%struct.1xi32 = type { [1 x i32] } +%struct.3xi32 = type { [3 x i32] } +%struct.5xi32 = type { [5 x i32] } + +declare dso_local i32 @Func1(ptr byval(%struct.1xi32) %0) +declare dso_local i32 @Func3(ptr byval(%struct.3xi32) %0) +declare dso_local i32 @Func5(ptr byval(%struct.5xi32) %0) +declare dso_local i32 @FuncManyArgs(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i8 %6, ptr byval(%struct.5xi32) %7) + +define dso_local i32 @test1(ptr byval(%struct.1xi32) %0) { +; CHECK-LABEL: test1: +; CHECK: # %bb.0: +; CHECK-NEXT: jmp Func1 # TAILCALL + %r = musttail call i32 @Func1(ptr byval(%struct.1xi32) %0) + ret i32 %r +} + +define dso_local i32 @test3(ptr byval(%struct.3xi32) %0) { +; CHECK-LABEL: test3: +; CHECK: # %bb.0: +; CHECK-NEXT: jmp Func3 # TAILCALL + %r = musttail call i32 @Func3(ptr byval(%struct.3xi32) %0) + ret i32 %r +} + +; sizeof(%struct.5xi32) > 16, in x64 this is passed on stack. +define dso_local i32 @test5(ptr byval(%struct.5xi32) %0) { +; CHECK-LABEL: test5: +; CHECK: # %bb.0: +; CHECK-NEXT: jmp Func5 # TAILCALL + %r = musttail call i32 @Func5(ptr byval(%struct.5xi32) %0) + ret i32 %r +} + +; Test passing multiple arguments with different sizes on stack. In x64 Linux +; the first 6 are passed by register. 
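+; With the first six integer arguments in registers, the i8 and the byval
+; struct are the only stack arguments, and forwarding them unchanged leaves
+; them at the same offsets, so no copies should be needed for the tail call.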
+define dso_local i32 @testManyArgs(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i8 %6, ptr byval(%struct.5xi32) %7) { +; CHECK-LABEL: testManyArgs: +; CHECK: # %bb.0: +; CHECK-NEXT: jmp FuncManyArgs # TAILCALL + %r = musttail call i32 @FuncManyArgs(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i8 %6, ptr byval(%struct.5xi32) %7) + ret i32 %r +} + +define dso_local i32 @testRecursion(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i8 %6, ptr byval(%struct.5xi32) %7) { +; CHECK-LABEL: testRecursion: +; CHECK: # %bb.0: +; CHECK-NEXT: jmp testRecursion # TAILCALL + %r = musttail call i32 @testRecursion(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i8 %6, ptr byval(%struct.5xi32) %7) + ret i32 %r +} + +define dso_local i32 @swap(ptr byval(%struct.1xi32) %0, ptr byval(%struct.1xi32) %1) noinline { +; CHECK-LABEL: swap: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: mov eax, dword ptr [rsp + 8] +; CHECK-NEXT: add eax, dword ptr [rsp + 16] +; CHECK-NEXT: ret +entry: + %a.ptr = getelementptr inbounds %struct.1xi32, ptr %0, i32 0, i32 0, i32 0 + %a = load i32, ptr %a.ptr, align 4 + %b.ptr = getelementptr inbounds %struct.1xi32, ptr %1, i32 0, i32 0, i32 0 + %b = load i32, ptr %b.ptr, align 4 + %sum = add i32 %a, %b + ret i32 %sum +} + +define dso_local i32 @swapByValArguments(ptr byval(%struct.1xi32) %0, ptr byval(%struct.1xi32) %1) { +; CHECK-LABEL: swapByValArguments: +; CHECK: # %bb.0: +; CHECK-NEXT: mov eax, dword ptr [rsp + 8] +; CHECK-NEXT: mov dword ptr [rsp - 16], eax +; CHECK-NEXT: mov eax, dword ptr [rsp + 16] +; CHECK-NEXT: mov dword ptr [rsp - 8], eax +; CHECK-NEXT: jmp swap # TAILCALL + + + %r = musttail call i32 @swap(ptr byval(%struct.1xi32) %1, ptr byval(%struct.1xi32) %0) + ret i32 %r +} + +; Clang only uses byval for arguments of 65 bytes or larger, but e.g. rustc +; does use byval for smaller types. Here we use a 20 byte struct to keep +; the tests more readable. +%twenty_bytes = type { [5 x i32] } +declare void @large_callee(%twenty_bytes* byval(%twenty_bytes) align 4) + +; Functions with byval parameters can be tail-called, because the value is +; actually passed in registers and the stack in the same way for the caller and +; callee. On x86 byval arguments are never (partially) passed via registers. +define void @large_caller(%twenty_bytes* byval(%twenty_bytes) align 4 %a) { +; CHECK-LABEL: large_caller: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: jmp large_callee@PLT # TAILCALL +entry: + musttail call void @large_callee(%twenty_bytes* byval(%twenty_bytes) align 4 %a) + ret void +} + +; The IR for this one looks dodgy, because it has an alloca passed to a +; musttail function, but it is passed as a byval argument, so will be copied +; into the stack space allocated by @large_caller_new_value's caller, so is +; valid. 
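+; Since the source here is a local alloca, this is the CopyOnce case: the
+; struct is built in the local frame and then copied directly into the fixed
+; argument slots that the tail call reuses.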
+define void @large_caller_new_value(%twenty_bytes* byval(%twenty_bytes) align 4 %a) { +; CHECK-LABEL: large_caller_new_value: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movabs rax, 4294967296 +; CHECK-NEXT: mov qword ptr [rsp - 20], rax +; CHECK-NEXT: movabs rcx, 12884901890 +; CHECK-NEXT: mov qword ptr [rsp - 12], rcx +; CHECK-NEXT: mov dword ptr [rsp - 4], 4 +; CHECK-NEXT: mov qword ptr [rsp + 8], rax +; CHECK-NEXT: mov qword ptr [rsp + 16], rcx +; CHECK-NEXT: mov dword ptr [rsp + 24], 4 +; CHECK-NEXT: jmp large_callee@PLT # TAILCALL +entry: + %y = alloca %twenty_bytes, align 4 + store i32 0, ptr %y, align 4 + %0 = getelementptr inbounds i8, ptr %y, i32 4 + store i32 1, ptr %0, align 4 + %1 = getelementptr inbounds i8, ptr %y, i32 8 + store i32 2, ptr %1, align 4 + %2 = getelementptr inbounds i8, ptr %y, i32 12 + store i32 3, ptr %2, align 4 + %3 = getelementptr inbounds i8, ptr %y, i32 16 + store i32 4, ptr %3, align 4 + musttail call void @large_callee(%twenty_bytes* byval(%twenty_bytes) align 4 %y) + ret void +} + +declare void @two_byvals_callee(%twenty_bytes* byval(%twenty_bytes) align 4, %twenty_bytes* byval(%twenty_bytes) align 4) +define void @swap_byvals(%twenty_bytes* byval(%twenty_bytes) align 4 %a, %twenty_bytes* byval(%twenty_bytes) align 4 %b) { +; CHECK-LABEL: swap_byvals: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: mov eax, dword ptr [rsp + 24] +; CHECK-NEXT: mov dword ptr [rsp - 8], eax +; CHECK-NEXT: movaps xmm0, xmmword ptr [rsp + 8] +; CHECK-NEXT: movaps xmmword ptr [rsp - 24], xmm0 +; CHECK-NEXT: mov eax, dword ptr [rsp + 48] +; CHECK-NEXT: mov dword ptr [rsp - 32], eax +; CHECK-NEXT: mov rax, qword ptr [rsp + 32] +; CHECK-NEXT: mov rcx, qword ptr [rsp + 40] +; CHECK-NEXT: mov qword ptr [rsp - 40], rcx +; CHECK-NEXT: mov qword ptr [rsp - 48], rax +; CHECK-NEXT: jmp two_byvals_callee@PLT # TAILCALL +entry: + musttail call void @two_byvals_callee(%twenty_bytes* byval(%twenty_bytes) align 4 %b, %twenty_bytes* byval(%twenty_bytes) align 4 %a) + ret void +} + +; A forwarded byval arg, but at a different argument position. Because +; x86 does not (partially) pass byval arguments in registers, the byval +; arg is in the correct position already, so this is not a sibcall but +; can be tail-call optimized. +declare void @shift_byval_callee(%twenty_bytes* byval(%twenty_bytes) align 4) +define void @shift_byval(i32 %a, %twenty_bytes* byval(%twenty_bytes) align 4 %b) { +; CHECK-LABEL: shift_byval: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: jmp shift_byval_callee@PLT # TAILCALL +entry: + tail call void @shift_byval_callee(%twenty_bytes* byval(%twenty_bytes) align 4 %b) + ret void +} + +; A global object passed to a byval argument, so it must be copied, but doesn't +; need a stack temporary. 
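+; ByValNeedsCopyForTailCall treats globals as always safe to copy from, so
+; this is also a single copy (CopyOnce) straight into the incoming argument
+; area before the jump.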
+@large_global = external global %twenty_bytes +define void @large_caller_from_global(%twenty_bytes* byval(%twenty_bytes) align 4 %a) { +; CHECK-LABEL: large_caller_from_global: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: mov rax, qword ptr [rip + large_global@GOTPCREL] +; CHECK-NEXT: mov ecx, dword ptr [rax + 16] +; CHECK-NEXT: mov dword ptr [rsp + 24], ecx +; CHECK-NEXT: mov rcx, qword ptr [rax] +; CHECK-NEXT: mov rax, qword ptr [rax + 8] +; CHECK-NEXT: mov qword ptr [rsp + 16], rax +; CHECK-NEXT: mov qword ptr [rsp + 8], rcx +; CHECK-NEXT: jmp large_callee@PLT # TAILCALL +entry: + musttail call void @large_callee(%twenty_bytes* byval(%twenty_bytes) align 4 @large_global) + ret void +} diff --git a/llvm/test/CodeGen/X86/musttail-varargs.ll b/llvm/test/CodeGen/X86/musttail-varargs.ll index 65cd1edd92e31..c1ec7ccbde177 100644 --- a/llvm/test/CodeGen/X86/musttail-varargs.ll +++ b/llvm/test/CodeGen/X86/musttail-varargs.ll @@ -243,18 +243,15 @@ define void @f_thunk(ptr %this, ...) { ; X86-NOSSE: # %bb.0: ; X86-NOSSE-NEXT: pushl %ebp ; X86-NOSSE-NEXT: movl %esp, %ebp -; X86-NOSSE-NEXT: pushl %esi ; X86-NOSSE-NEXT: andl $-16, %esp ; X86-NOSSE-NEXT: subl $32, %esp -; X86-NOSSE-NEXT: movl 8(%ebp), %esi -; X86-NOSSE-NEXT: leal 12(%ebp), %eax -; X86-NOSSE-NEXT: movl %eax, (%esp) -; X86-NOSSE-NEXT: pushl %esi +; X86-NOSSE-NEXT: movl 8(%ebp), %eax +; X86-NOSSE-NEXT: leal 12(%ebp), %ecx +; X86-NOSSE-NEXT: movl %ecx, (%esp) +; X86-NOSSE-NEXT: pushl %eax ; X86-NOSSE-NEXT: calll _get_f ; X86-NOSSE-NEXT: addl $4, %esp -; X86-NOSSE-NEXT: movl %esi, 8(%ebp) -; X86-NOSSE-NEXT: leal -4(%ebp), %esp -; X86-NOSSE-NEXT: popl %esi +; X86-NOSSE-NEXT: movl %ebp, %esp ; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: jmpl *%eax # TAILCALL ; @@ -262,24 +259,21 @@ define void @f_thunk(ptr %this, ...) { ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: pushl %ebp ; X86-SSE-NEXT: movl %esp, %ebp -; X86-SSE-NEXT: pushl %esi ; X86-SSE-NEXT: andl $-16, %esp ; X86-SSE-NEXT: subl $80, %esp ; X86-SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; X86-SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; X86-SSE-NEXT: movaps %xmm0, (%esp) # 16-byte Spill -; X86-SSE-NEXT: movl 8(%ebp), %esi -; X86-SSE-NEXT: leal 12(%ebp), %eax -; X86-SSE-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: movl 8(%ebp), %eax +; X86-SSE-NEXT: leal 12(%ebp), %ecx +; X86-SSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-SSE-NEXT: pushl %eax ; X86-SSE-NEXT: calll _get_f ; X86-SSE-NEXT: addl $4, %esp -; X86-SSE-NEXT: movl %esi, 8(%ebp) ; X86-SSE-NEXT: movaps (%esp), %xmm0 # 16-byte Reload ; X86-SSE-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload ; X86-SSE-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload -; X86-SSE-NEXT: leal -4(%ebp), %esp -; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: movl %ebp, %esp ; X86-SSE-NEXT: popl %ebp ; X86-SSE-NEXT: jmpl *%eax # TAILCALL %ap = alloca [4 x ptr], align 16 @@ -310,11 +304,14 @@ define void @g_thunk(ptr %fptr_i8, ...) { ; WINDOWS: # %bb.0: ; WINDOWS-NEXT: rex64 jmpq *%rcx # TAILCALL ; -; X86-LABEL: g_thunk: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: jmpl *%eax # TAILCALL +; X86-NOSSE-LABEL: g_thunk: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: jmpl *{{[0-9]+}}(%esp) # TAILCALL +; +; X86-SSE-LABEL: g_thunk: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: jmpl *%eax # TAILCALL musttail call void (ptr, ...) %fptr_i8(ptr %fptr_i8, ...) 
ret void } @@ -374,10 +371,9 @@ define void @h_thunk(ptr %this, ...) { ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: jmpl *%ecx # TAILCALL ; X86-NEXT: LBB2_2: # %else -; X86-NEXT: movl 8(%eax), %ecx +; X86-NEXT: movl 8(%eax), %eax ; X86-NEXT: movl $42, _g -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: jmpl *%ecx # TAILCALL +; X86-NEXT: jmpl *%eax # TAILCALL %cond = load i1, ptr %this br i1 %cond, label %then, label %else diff --git a/llvm/test/CodeGen/X86/musttail.ll b/llvm/test/CodeGen/X86/musttail.ll index 9e02585a3ffdc..57c47ad683127 100644 --- a/llvm/test/CodeGen/X86/musttail.ll +++ b/llvm/test/CodeGen/X86/musttail.ll @@ -46,7 +46,7 @@ define i32 @t4(ptr %fn, i32 %n, i32 %r) { ; CHECK: decl %[[n:.*]] ; CHECK-DAG: movl %[[r]], {{[0-9]+}}(%esp) ; CHECK-DAG: movl %[[n]], {{[0-9]+}}(%esp) -; CHECK: jmpl *%{{.*}} +; CHECK: jmpl *{{.*}} # TAILCALL entry: %r1 = add i32 %r, 1 @@ -74,7 +74,7 @@ define i32 @t5(ptr %fn, i32 %n, i32 %r) alignstack(32) { ; CHECK: leal {{[-0-9]+}}(%ebp), %esp ; CHECK: popl %esi ; CHECK: popl %ebp -; CHECK: jmpl *%{{.*}} +; CHECK: jmpl *{{.*}} # TAILCALL entry: %a = alloca i8, i32 %n diff --git a/llvm/test/CodeGen/X86/sibcall.ll b/llvm/test/CodeGen/X86/sibcall.ll index 2759a9883975e..d1137cac7d365 100644 --- a/llvm/test/CodeGen/X86/sibcall.ll +++ b/llvm/test/CodeGen/X86/sibcall.ll @@ -295,10 +295,15 @@ declare dso_local i32 @foo5(i32, i32, i32, i32, i32) define dso_local i32 @t12(i32 %x, i32 %y, ptr byval(%struct.t) align 4 %z) nounwind ssp { ; X86-LABEL: t12: ; X86: # %bb.0: # %entry +; X86-NEXT: subl $20, %esp ; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; X86-NEXT: jne foo6 # TAILCALL -; X86-NEXT: # %bb.1: # %bb2 +; X86-NEXT: je .LBB12_1 +; X86-NEXT: # %bb.2: # %bb +; X86-NEXT: addl $20, %esp +; X86-NEXT: jmp foo6 # TAILCALL +; X86-NEXT: .LBB12_1: # %bb2 ; X86-NEXT: xorl %eax, %eax +; X86-NEXT: addl $20, %esp ; X86-NEXT: retl ; ; X64-LABEL: t12: diff --git a/llvm/test/CodeGen/X86/swifttailcc-store-ret-address-aliasing-stack-slot.ll b/llvm/test/CodeGen/X86/swifttailcc-store-ret-address-aliasing-stack-slot.ll index cd669768705e5..b901d22f66392 100644 --- a/llvm/test/CodeGen/X86/swifttailcc-store-ret-address-aliasing-stack-slot.ll +++ b/llvm/test/CodeGen/X86/swifttailcc-store-ret-address-aliasing-stack-slot.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 ; RUN: llc %s -o - | FileCheck %s target triple = "x86_64-apple-macosx" @@ -24,9 +25,7 @@ define swifttailcc void @test(ptr %0, ptr swiftasync %1, i64 %2, i64 %3, ptr %4, ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r15 ; CHECK-NEXT: callq _foo ; CHECK-NEXT: movq %r14, (%rax) -; CHECK-NEXT: movl [[OFF:[0-9]+]](%rsp), %edx -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; CHECK-NEXT: movq %rcx, [[OFF]](%rsp) +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %edx ; CHECK-NEXT: movq %rax, %r14 ; CHECK-NEXT: movq %r13, %rdi ; CHECK-NEXT: movq %r15, %rsi @@ -34,7 +33,6 @@ define swifttailcc void @test(ptr %0, ptr swiftasync %1, i64 %2, i64 %3, ptr %4, ; CHECK-NEXT: addq $8, %rsp ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r15 -; CHECK-NEXT: addq $16, %rsp ; CHECK-NEXT: jmp _tc_fn ## TAILCALL entry: %res = tail call ptr @foo()