diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index e28b9c11a04cd..157642e14f68d 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1067,6 +1067,19 @@ namespace llvm {
   //===--------------------------------------------------------------------===//
   // X86 Implementation of the TargetLowering interface
   class X86TargetLowering final : public TargetLowering {
+    // Copying needed for an outgoing byval argument.
+    enum ByValCopyKind {
+      // Argument is already in the correct location, no copy needed.
+      NoCopy,
+      // Argument value is currently in the local stack frame, needs copying to
+      // outgoing argument area.
+      CopyOnce,
+      // Argument value is currently in the outgoing argument area, but not at
+      // the correct offset, so needs copying via a temporary in local stack
+      // space.
+      CopyViaTemp,
+    };
+
   public:
     explicit X86TargetLowering(const X86TargetMachine &TM,
                                const X86Subtarget &STI);
@@ -1775,6 +1788,9 @@ namespace llvm {
     SDValue LowerADDROFRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const;
+    ByValCopyKind ByValNeedsCopyForTailCall(SelectionDAG &DAG, SDValue Src,
+                                            SDValue Dst,
+                                            ISD::ArgFlagsTy Flags) const;
     SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const;
     SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
     SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
index 37d77728882b1..a281a02077991 100644
--- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
@@ -2018,6 +2018,146 @@ SDValue X86TargetLowering::getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
   return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
 }
 
+/// Return true if the given stack call argument is already available in the
+/// same position (relatively) of the caller's incoming argument stack.
+static bool MatchingStackOffset(SDValue Arg, unsigned Offset,
+                                ISD::ArgFlagsTy Flags, MachineFrameInfo &MFI,
+                                const MachineRegisterInfo *MRI,
+                                const X86InstrInfo *TII,
+                                const CCValAssign &VA) {
+  unsigned Bytes = Arg.getValueSizeInBits() / 8;
+
+  for (;;) {
+    // Look through nodes that don't alter the bits of the incoming value.
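+    // (zero- and any-extends, bitcasts, AssertZext, and truncates that only
+    // undo an AssertZext of the same type).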
+    unsigned Op = Arg.getOpcode();
+    if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST ||
+        Op == ISD::AssertZext) {
+      Arg = Arg.getOperand(0);
+      continue;
+    }
+    if (Op == ISD::TRUNCATE) {
+      const SDValue &TruncInput = Arg.getOperand(0);
+      if (TruncInput.getOpcode() == ISD::AssertZext &&
+          cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
+              Arg.getValueType()) {
+        Arg = TruncInput.getOperand(0);
+        continue;
+      }
+    }
+    break;
+  }
+
+  int FI = INT_MAX;
+  if (Arg.getOpcode() == ISD::CopyFromReg) {
+    Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
+    if (!VR.isVirtual())
+      return false;
+    MachineInstr *Def = MRI->getVRegDef(VR);
+    if (!Def)
+      return false;
+    if (!Flags.isByVal()) {
+      if (!TII->isLoadFromStackSlot(*Def, FI))
+        return false;
+    } else {
+      unsigned Opcode = Def->getOpcode();
+      if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
+           Opcode == X86::LEA64_32r) &&
+          Def->getOperand(1).isFI()) {
+        FI = Def->getOperand(1).getIndex();
+        Bytes = Flags.getByValSize();
+      } else
+        return false;
+    }
+  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
+    if (Flags.isByVal())
+      // ByVal argument is passed in as a pointer but it's now being
+      // dereferenced. e.g.
+      // define @foo(%struct.X* %A) {
+      //   tail call @bar(%struct.X* byval %A)
+      // }
+      return false;
+    SDValue Ptr = Ld->getBasePtr();
+    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
+    if (!FINode)
+      return false;
+    FI = FINode->getIndex();
+  } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
+    FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
+    FI = FINode->getIndex();
+    Bytes = Flags.getByValSize();
+  } else
+    return false;
+
+  assert(FI != INT_MAX);
+  if (!MFI.isFixedObjectIndex(FI))
+    return false;
+
+  if (Offset != MFI.getObjectOffset(FI))
+    return false;
+
+  // If this is not byval, check that the argument stack object is immutable.
+  // inalloca and argument copy elision can create mutable argument stack
+  // objects. Byval objects can be mutated, but a byval call intends to pass the
+  // mutated memory.
+  if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
+    return false;
+
+  if (VA.getLocVT().getFixedSizeInBits() >
+      Arg.getValueSizeInBits().getFixedValue()) {
+    // If the argument location is wider than the argument type, check that any
+    // extension flags match.
+    if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
+        Flags.isSExt() != MFI.isObjectSExt(FI)) {
+      return false;
+    }
+  }
+
+  return Bytes == MFI.getObjectSize(FI);
+}
+
+// Returns the type of copying which is required to set up a byval argument to
+// a tail-called function. This isn't needed for non-tail calls, because they
+// always need the equivalent of CopyOnce, but tail-calls sometimes need two
+// copies to avoid clobbering another argument (CopyViaTemp), and sometimes can
+// be optimised to zero copies when forwarding an argument from the caller's
+// caller (NoCopy).
+X86TargetLowering::ByValCopyKind X86TargetLowering::ByValNeedsCopyForTailCall(
+    SelectionDAG &DAG, SDValue Src, SDValue Dst, ISD::ArgFlagsTy Flags) const {
+  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+
+  // Globals are always safe to copy from.
+  if (isa<GlobalAddressSDNode>(Src) || isa<ExternalSymbolSDNode>(Src))
+    return CopyOnce;
+
+  // Can only analyse frame index nodes, conservatively assume we need a
+  // temporary.
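+  // (for example an address produced by pointer arithmetic or loaded from
+  // memory, which we cannot prove does not alias the outgoing argument area).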
+  auto *SrcFrameIdxNode = dyn_cast<FrameIndexSDNode>(Src);
+  auto *DstFrameIdxNode = dyn_cast<FrameIndexSDNode>(Dst);
+  if (!SrcFrameIdxNode || !DstFrameIdxNode)
+    return CopyViaTemp;
+
+  int SrcFI = SrcFrameIdxNode->getIndex();
+  int DstFI = DstFrameIdxNode->getIndex();
+  assert(MFI.isFixedObjectIndex(DstFI) &&
+         "byval passed in non-fixed stack slot");
+
+  int64_t SrcOffset = MFI.getObjectOffset(SrcFI);
+  int64_t DstOffset = MFI.getObjectOffset(DstFI);
+
+  // If the source is in the local frame, then the copy to the argument
+  // memory is always valid.
+  bool FixedSrc = MFI.isFixedObjectIndex(SrcFI);
+  if (!FixedSrc || (FixedSrc && SrcOffset < 0))
+    return CopyOnce;
+
+  // If the value is already in the correct location, then no copying is
+  // needed. If not, then we need to copy via a temporary.
+  if (SrcOffset == DstOffset)
+    return NoCopy;
+  else
+    return CopyViaTemp;
+}
+
 SDValue
 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                              SmallVectorImpl<SDValue> &InVals) const {
@@ -2098,21 +2238,20 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
       isTailCall = false;
   }
 
-  if (isTailCall && !IsMustTail) {
+  if (isTailCall) {
     // Check if it's really possible to do a tail call.
-    isTailCall = IsEligibleForTailCallOptimization(CLI, CCInfo, ArgLocs,
-                                                   IsCalleePopSRet);
+    IsSibcall = IsEligibleForTailCallOptimization(CLI, CCInfo, ArgLocs,
+                                                  IsCalleePopSRet);
 
-    // Sibcalls are automatically detected tailcalls which do not require
-    // ABI changes.
-    if (!IsGuaranteeTCO && isTailCall)
+    if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt &&
+        CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail)
       IsSibcall = true;
 
     if (isTailCall)
       ++NumTailCalls;
   }
 
-  if (IsMustTail && !isTailCall)
+  if (!isTailCall && CLI.CB && CLI.CB->isMustTailCall())
     report_fatal_error("failed to perform tail call elimination on a call "
                        "site marked musttail");
 
@@ -2128,8 +2267,9 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv))
     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
 
+  // A sibcall is ABI-compatible and does not need to adjust the stack pointer.
   int FPDiff = 0;
-  if (isTailCall &&
+  if (isTailCall && !IsSibcall &&
       shouldGuaranteeTCO(CallConv,
                          MF.getTarget().Options.GuaranteedTailCallOpt)) {
     // Lower arguments at fp - stackoffset + fpdiff.
@@ -2146,6 +2286,75 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   unsigned NumBytesToPush = NumBytes;
   unsigned NumBytesToPop = NumBytes;
 
+  SDValue StackPtr;
+  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+
+  // If we are doing a tail-call, any byval arguments will be written to stack
+  // space which was used for incoming arguments. If any of the values being
+  // used are incoming byval arguments to this function, then they might be
+  // overwritten by the stores of the outgoing arguments. To avoid this, we
+  // need to make a temporary copy of them in local stack space, then copy back
+  // to the argument area.
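+  // For example, when a musttail call forwards the caller's two incoming
+  // byval arguments in swapped order (as @swap_byvals in the new test does):
+  //   musttail call void @callee(ptr byval(%T) %b, ptr byval(%T) %a)
+  // writing %b into the first outgoing slot would clobber the %a bytes that
+  // are still needed for the second slot, so both values are staged through
+  // local temporaries first (the CopyViaTemp case).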
+ DenseMap ByValTemporaries; + SDValue ByValTempChain; + if (isTailCall) { + SmallVector ByValCopyChains; + for (const CCValAssign &VA : ArgLocs) { + unsigned ArgIdx = VA.getValNo(); + SDValue Src = OutVals[ArgIdx]; + ISD::ArgFlagsTy Flags = Outs[ArgIdx].Flags; + + if (!Flags.isByVal()) + continue; + + auto PtrVT = getPointerTy(DAG.getDataLayout()); + + if (!StackPtr.getNode()) + StackPtr = + DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), PtrVT); + + // Destination: where this byval should live in the callee’s frame + // after the tail call. + int32_t Offset = VA.getLocMemOffset() + FPDiff; + int Size = VA.getLocVT().getFixedSizeInBits() / 8; + int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true); + SDValue Dst = DAG.getFrameIndex(FI, PtrVT); + + ByValCopyKind Copy = ByValNeedsCopyForTailCall(DAG, Src, Dst, Flags); + + if (Copy == NoCopy) { + // If the argument is already at the correct offset on the stack + // (because we are forwarding a byval argument from our caller), we + // don't need any copying. + continue; + } else if (Copy == CopyOnce) { + // If the argument is in our local stack frame, no other argument + // preparation can clobber it, so we can copy it to the final location + // later. + ByValTemporaries[ArgIdx] = Src; + } else { + assert(Copy == CopyViaTemp && "unexpected enum value"); + // If we might be copying this argument from the outgoing argument + // stack area, we need to copy via a temporary in the local stack + // frame. + MachineFrameInfo &MFI = MF.getFrameInfo(); + int TempFrameIdx = MFI.CreateStackObject(Flags.getByValSize(), + Flags.getNonZeroByValAlign(), + /*isSS=*/false); + SDValue Temp = + DAG.getFrameIndex(TempFrameIdx, getPointerTy(DAG.getDataLayout())); + + SDValue CopyChain = + CreateCopyOfByValArgument(Src, Temp, Chain, Flags, DAG, dl); + ByValCopyChains.push_back(CopyChain); + } + } + + if (!ByValCopyChains.empty()) + ByValTempChain = + DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ByValCopyChains); + } + // If we have an inalloca argument, all stack space has already been allocated // for us and be right at the top of the stack. We don't support multiple // arguments passed in memory when using inalloca. @@ -2186,7 +2395,6 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVector, 8> RegsToPass; SmallVector MemOpChains; - SDValue StackPtr; // The next loop assumes that the locations are in the same order of the // input arguments. @@ -2195,7 +2403,6 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Walk the register/memloc assignments, inserting copies/loads. In the case // of tail call optimization arguments are handle later. - const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E; ++I, ++OutIndex) { assert(OutIndex < Outs.size() && "Invalid Out index"); @@ -2285,7 +2492,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (ShadowReg) RegsToPass.push_back(std::make_pair(ShadowReg, Arg)); } - } else if (!IsSibcall && (!isTailCall || isByVal)) { + } else if (!IsSibcall && (!isTailCall || (isByVal && !IsMustTail))) { assert(VA.isMemLoc()); if (!StackPtr.getNode()) StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), @@ -2362,7 +2569,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // For tail calls lower the arguments to the 'real' stack slots. 
Sibcalls // don't need this because the eligibility check rejects calls that require // shuffling arguments passed in memory. - if (!IsSibcall && isTailCall) { + if (isTailCall) { // Force all the incoming stack arguments to be loaded from the stack // before any new outgoing arguments or the return address are stored to the // stack, because the outgoing stack slots may alias the incoming argument @@ -2372,6 +2579,10 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // would clobber. Chain = DAG.getStackArgumentTokenFactor(Chain); + if (ByValTempChain) + Chain = + DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chain, ByValTempChain); + SmallVector MemOpChains2; SDValue FIN; int FI = 0; @@ -2404,21 +2615,40 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); if (Flags.isByVal()) { - // Copy relative to framepointer. - SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl); - if (!StackPtr.getNode()) - StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), - getPointerTy(DAG.getDataLayout())); - Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), - StackPtr, Source); - - MemOpChains2.push_back( - CreateCopyOfByValArgument(Source, FIN, Chain, Flags, DAG, dl)); + SDValue ByValSrc; + bool NeedsStackCopy; + if (auto It = ByValTemporaries.find(OutsIndex); + It != ByValTemporaries.end()) { + ByValSrc = It->second; + NeedsStackCopy = true; + } else { + ByValSrc = Arg; + NeedsStackCopy = !isTailCall; + } + + if (NeedsStackCopy) { + + auto PtrVT = getPointerTy(DAG.getDataLayout()); + SDValue DstAddr = DAG.getFrameIndex(FI, PtrVT); + + // Copy the struct contents from ByValSrc to DstAddr. + MemOpChains2.push_back(CreateCopyOfByValArgument( + ByValSrc, DstAddr, Chain, Flags, DAG, dl)); + } } else { - // Store relative to framepointer. - MemOpChains2.push_back(DAG.getStore( - Chain, dl, Arg, FIN, - MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI))); + // Check if the arguments are already laid out in the right way as + // the caller's fixed stack objects. + MachineFrameInfo &MFI = MF.getFrameInfo(); + const MachineRegisterInfo *MRI = &MF.getRegInfo(); + const X86InstrInfo *TII = Subtarget.getInstrInfo(); + + if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, MFI, MRI, + TII, VA)) { + // Store relative to framepointer. + MemOpChains2.push_back(DAG.getStore( + Chain, dl, Arg, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI))); + } } } @@ -2684,102 +2914,6 @@ X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize, return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize; } -/// Return true if the given stack call argument is already available in the -/// same position (relatively) of the caller's incoming argument stack. -static -bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, - MachineFrameInfo &MFI, const MachineRegisterInfo *MRI, - const X86InstrInfo *TII, const CCValAssign &VA) { - unsigned Bytes = Arg.getValueSizeInBits() / 8; - - for (;;) { - // Look through nodes that don't alter the bits of the incoming value. 
- unsigned Op = Arg.getOpcode(); - if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST || - Op == ISD::AssertZext) { - Arg = Arg.getOperand(0); - continue; - } - if (Op == ISD::TRUNCATE) { - const SDValue &TruncInput = Arg.getOperand(0); - if (TruncInput.getOpcode() == ISD::AssertZext && - cast(TruncInput.getOperand(1))->getVT() == - Arg.getValueType()) { - Arg = TruncInput.getOperand(0); - continue; - } - } - break; - } - - int FI = INT_MAX; - if (Arg.getOpcode() == ISD::CopyFromReg) { - Register VR = cast(Arg.getOperand(1))->getReg(); - if (!VR.isVirtual()) - return false; - MachineInstr *Def = MRI->getVRegDef(VR); - if (!Def) - return false; - if (!Flags.isByVal()) { - if (!TII->isLoadFromStackSlot(*Def, FI)) - return false; - } else { - unsigned Opcode = Def->getOpcode(); - if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r || - Opcode == X86::LEA64_32r) && - Def->getOperand(1).isFI()) { - FI = Def->getOperand(1).getIndex(); - Bytes = Flags.getByValSize(); - } else - return false; - } - } else if (LoadSDNode *Ld = dyn_cast(Arg)) { - if (Flags.isByVal()) - // ByVal argument is passed in as a pointer but it's now being - // dereferenced. e.g. - // define @foo(%struct.X* %A) { - // tail call @bar(%struct.X* byval %A) - // } - return false; - SDValue Ptr = Ld->getBasePtr(); - FrameIndexSDNode *FINode = dyn_cast(Ptr); - if (!FINode) - return false; - FI = FINode->getIndex(); - } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) { - FrameIndexSDNode *FINode = cast(Arg); - FI = FINode->getIndex(); - Bytes = Flags.getByValSize(); - } else - return false; - - assert(FI != INT_MAX); - if (!MFI.isFixedObjectIndex(FI)) - return false; - - if (Offset != MFI.getObjectOffset(FI)) - return false; - - // If this is not byval, check that the argument stack object is immutable. - // inalloca and argument copy elision can create mutable argument stack - // objects. Byval objects can be mutated, but a byval call intends to pass the - // mutated memory. - if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI)) - return false; - - if (VA.getLocVT().getFixedSizeInBits() > - Arg.getValueSizeInBits().getFixedValue()) { - // If the argument location is wider than the argument type, check that any - // extension flags match. - if (Flags.isZExt() != MFI.isObjectZExt(FI) || - Flags.isSExt() != MFI.isObjectSExt(FI)) { - return false; - } - } - - return Bytes == MFI.getObjectSize(FI); -} - static bool mayBeSRetTailCallCompatible(const TargetLowering::CallLoweringInfo &CLI, Register CallerSRetReg) { @@ -2814,9 +2948,10 @@ mayBeSRetTailCallCompatible(const TargetLowering::CallLoweringInfo &CLI, /// Check whether the call is eligible for tail call optimization. Targets /// that want to do tail call optimization should implement this function. -/// Note that the x86 backend does not check musttail calls for eligibility! The -/// rest of x86 tail call lowering must be prepared to forward arguments of any -/// type. +/// +/// Note that this function also processes musttail calls, so when this +/// function returns false on a valid musttail call, a fatal backend error +/// occurs. bool X86TargetLowering::IsEligibleForTailCallOptimization( TargetLowering::CallLoweringInfo &CLI, CCState &CCInfo, SmallVectorImpl &ArgLocs, bool IsCalleePopSRet) const { @@ -2943,26 +3078,6 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization( // If the callee takes no arguments then go on to check the results of the // call. 
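+  // Note that the per-argument MatchingStackOffset check which used to be done
+  // here has moved into LowerCall, which now simply skips the store for any
+  // stack argument that is already in its final slot instead of rejecting the
+  // tail call.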
if (!Outs.empty()) { - if (StackArgsSize > 0) { - // Check if the arguments are already laid out in the right way as - // the caller's fixed stack objects. - MachineFrameInfo &MFI = MF.getFrameInfo(); - const MachineRegisterInfo *MRI = &MF.getRegInfo(); - const X86InstrInfo *TII = Subtarget.getInstrInfo(); - for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) { - const CCValAssign &VA = ArgLocs[I]; - SDValue Arg = OutVals[I]; - ISD::ArgFlagsTy Flags = Outs[I].Flags; - if (VA.getLocInfo() == CCValAssign::Indirect) - return false; - if (!VA.isRegLoc()) { - if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, MFI, MRI, - TII, VA)) - return false; - } - } - } - bool PositionIndependent = isPositionIndependent(); // If the tailcall address may be in a register, then make sure it's // possible to register allocate for it. In 32-bit, the call address can @@ -3000,6 +3115,11 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization( X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg, MF.getTarget().Options.GuaranteedTailCallOpt); + // If the stack arguments for this call do not fit into our own save area then + // the call cannot be made tail. + if (CCInfo.getStackSize() > FuncInfo->getArgumentStackSize()) + return false; + if (unsigned BytesToPop = FuncInfo->getBytesToPopOnReturn()) { // If we have bytes to pop, the callee must pop them. bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize; diff --git a/llvm/test/CodeGen/X86/musttail-struct.ll b/llvm/test/CodeGen/X86/musttail-struct.ll new file mode 100644 index 0000000000000..62ca16589b3ee --- /dev/null +++ b/llvm/test/CodeGen/X86/musttail-struct.ll @@ -0,0 +1,187 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -x86-asm-syntax=intel | FileCheck %s + +; Test correct handling of a musttail call with a byval struct argument. + +%struct.1xi32 = type { [1 x i32] } +%struct.3xi32 = type { [3 x i32] } +%struct.5xi32 = type { [5 x i32] } + +declare dso_local i32 @Func1(ptr byval(%struct.1xi32) %0) +declare dso_local i32 @Func3(ptr byval(%struct.3xi32) %0) +declare dso_local i32 @Func5(ptr byval(%struct.5xi32) %0) +declare dso_local i32 @FuncManyArgs(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i8 %6, ptr byval(%struct.5xi32) %7) + +define dso_local i32 @test1(ptr byval(%struct.1xi32) %0) { +; CHECK-LABEL: test1: +; CHECK: # %bb.0: +; CHECK-NEXT: jmp Func1 # TAILCALL + %r = musttail call i32 @Func1(ptr byval(%struct.1xi32) %0) + ret i32 %r +} + +define dso_local i32 @test3(ptr byval(%struct.3xi32) %0) { +; CHECK-LABEL: test3: +; CHECK: # %bb.0: +; CHECK-NEXT: jmp Func3 # TAILCALL + %r = musttail call i32 @Func3(ptr byval(%struct.3xi32) %0) + ret i32 %r +} + +; sizeof(%struct.5xi32) > 16, in x64 this is passed on stack. +define dso_local i32 @test5(ptr byval(%struct.5xi32) %0) { +; CHECK-LABEL: test5: +; CHECK: # %bb.0: +; CHECK-NEXT: jmp Func5 # TAILCALL + %r = musttail call i32 @Func5(ptr byval(%struct.5xi32) %0) + ret i32 %r +} + +; Test passing multiple arguments with different sizes on stack. In x64 Linux +; the first 6 are passed by register. 
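+; With the first six integer arguments in registers, the i8 and the byval
+; struct are the only stack arguments, and forwarding them unchanged leaves
+; them at the same offsets, so no copies should be needed for the tail call.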
+define dso_local i32 @testManyArgs(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i8 %6, ptr byval(%struct.5xi32) %7) { +; CHECK-LABEL: testManyArgs: +; CHECK: # %bb.0: +; CHECK-NEXT: jmp FuncManyArgs # TAILCALL + %r = musttail call i32 @FuncManyArgs(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i8 %6, ptr byval(%struct.5xi32) %7) + ret i32 %r +} + +define dso_local i32 @testRecursion(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i8 %6, ptr byval(%struct.5xi32) %7) { +; CHECK-LABEL: testRecursion: +; CHECK: # %bb.0: +; CHECK-NEXT: jmp testRecursion # TAILCALL + %r = musttail call i32 @testRecursion(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i8 %6, ptr byval(%struct.5xi32) %7) + ret i32 %r +} + +define dso_local i32 @swap(ptr byval(%struct.1xi32) %0, ptr byval(%struct.1xi32) %1) noinline { +; CHECK-LABEL: swap: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: mov eax, dword ptr [rsp + 8] +; CHECK-NEXT: add eax, dword ptr [rsp + 16] +; CHECK-NEXT: ret +entry: + %a.ptr = getelementptr inbounds %struct.1xi32, ptr %0, i32 0, i32 0, i32 0 + %a = load i32, ptr %a.ptr, align 4 + %b.ptr = getelementptr inbounds %struct.1xi32, ptr %1, i32 0, i32 0, i32 0 + %b = load i32, ptr %b.ptr, align 4 + %sum = add i32 %a, %b + ret i32 %sum +} + +define dso_local i32 @swapByValArguments(ptr byval(%struct.1xi32) %0, ptr byval(%struct.1xi32) %1) { +; CHECK-LABEL: swapByValArguments: +; CHECK: # %bb.0: +; CHECK-NEXT: mov eax, dword ptr [rsp + 8] +; CHECK-NEXT: mov dword ptr [rsp - 16], eax +; CHECK-NEXT: mov eax, dword ptr [rsp + 16] +; CHECK-NEXT: mov dword ptr [rsp - 8], eax +; CHECK-NEXT: jmp swap # TAILCALL + + + %r = musttail call i32 @swap(ptr byval(%struct.1xi32) %1, ptr byval(%struct.1xi32) %0) + ret i32 %r +} + +; Clang only uses byval for arguments of 65 bytes or larger, but e.g. rustc +; does use byval for smaller types. Here we use a 20 byte struct to keep +; the tests more readable. +%twenty_bytes = type { [5 x i32] } +declare void @large_callee(%twenty_bytes* byval(%twenty_bytes) align 4) + +; Functions with byval parameters can be tail-called, because the value is +; actually passed in registers and the stack in the same way for the caller and +; callee. On x86 byval arguments are never (partially) passed via registers. +define void @large_caller(%twenty_bytes* byval(%twenty_bytes) align 4 %a) { +; CHECK-LABEL: large_caller: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: jmp large_callee@PLT # TAILCALL +entry: + musttail call void @large_callee(%twenty_bytes* byval(%twenty_bytes) align 4 %a) + ret void +} + +; The IR for this one looks dodgy, because it has an alloca passed to a +; musttail function, but it is passed as a byval argument, so will be copied +; into the stack space allocated by @large_caller_new_value's caller, so is +; valid. 
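+; Since the source here is a local alloca, this is the CopyOnce case: the
+; struct is built in the local frame and then copied directly into the fixed
+; argument slots that the tail call reuses.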
+define void @large_caller_new_value(%twenty_bytes* byval(%twenty_bytes) align 4 %a) { +; CHECK-LABEL: large_caller_new_value: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movabs rax, 4294967296 +; CHECK-NEXT: mov qword ptr [rsp - 20], rax +; CHECK-NEXT: movabs rcx, 12884901890 +; CHECK-NEXT: mov qword ptr [rsp - 12], rcx +; CHECK-NEXT: mov dword ptr [rsp - 4], 4 +; CHECK-NEXT: mov qword ptr [rsp + 8], rax +; CHECK-NEXT: mov qword ptr [rsp + 16], rcx +; CHECK-NEXT: mov dword ptr [rsp + 24], 4 +; CHECK-NEXT: jmp large_callee@PLT # TAILCALL +entry: + %y = alloca %twenty_bytes, align 4 + store i32 0, ptr %y, align 4 + %0 = getelementptr inbounds i8, ptr %y, i32 4 + store i32 1, ptr %0, align 4 + %1 = getelementptr inbounds i8, ptr %y, i32 8 + store i32 2, ptr %1, align 4 + %2 = getelementptr inbounds i8, ptr %y, i32 12 + store i32 3, ptr %2, align 4 + %3 = getelementptr inbounds i8, ptr %y, i32 16 + store i32 4, ptr %3, align 4 + musttail call void @large_callee(%twenty_bytes* byval(%twenty_bytes) align 4 %y) + ret void +} + +declare void @two_byvals_callee(%twenty_bytes* byval(%twenty_bytes) align 4, %twenty_bytes* byval(%twenty_bytes) align 4) +define void @swap_byvals(%twenty_bytes* byval(%twenty_bytes) align 4 %a, %twenty_bytes* byval(%twenty_bytes) align 4 %b) { +; CHECK-LABEL: swap_byvals: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: mov eax, dword ptr [rsp + 24] +; CHECK-NEXT: mov dword ptr [rsp - 8], eax +; CHECK-NEXT: movaps xmm0, xmmword ptr [rsp + 8] +; CHECK-NEXT: movaps xmmword ptr [rsp - 24], xmm0 +; CHECK-NEXT: mov eax, dword ptr [rsp + 48] +; CHECK-NEXT: mov dword ptr [rsp - 32], eax +; CHECK-NEXT: mov rax, qword ptr [rsp + 32] +; CHECK-NEXT: mov rcx, qword ptr [rsp + 40] +; CHECK-NEXT: mov qword ptr [rsp - 40], rcx +; CHECK-NEXT: mov qword ptr [rsp - 48], rax +; CHECK-NEXT: jmp two_byvals_callee@PLT # TAILCALL +entry: + musttail call void @two_byvals_callee(%twenty_bytes* byval(%twenty_bytes) align 4 %b, %twenty_bytes* byval(%twenty_bytes) align 4 %a) + ret void +} + +; A forwarded byval arg, but at a different argument position. Because +; x86 does not (partially) pass byval arguments in registers, the byval +; arg is in the correct position already, so this is not a sibcall but +; can be tail-call optimized. +declare void @shift_byval_callee(%twenty_bytes* byval(%twenty_bytes) align 4) +define void @shift_byval(i32 %a, %twenty_bytes* byval(%twenty_bytes) align 4 %b) { +; CHECK-LABEL: shift_byval: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: jmp shift_byval_callee@PLT # TAILCALL +entry: + tail call void @shift_byval_callee(%twenty_bytes* byval(%twenty_bytes) align 4 %b) + ret void +} + +; A global object passed to a byval argument, so it must be copied, but doesn't +; need a stack temporary. 
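+; ByValNeedsCopyForTailCall treats globals as always safe to copy from, so
+; this is also a single copy (CopyOnce) straight into the incoming argument
+; area before the jump.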
+@large_global = external global %twenty_bytes +define void @large_caller_from_global(%twenty_bytes* byval(%twenty_bytes) align 4 %a) { +; CHECK-LABEL: large_caller_from_global: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: mov rax, qword ptr [rip + large_global@GOTPCREL] +; CHECK-NEXT: mov ecx, dword ptr [rax + 16] +; CHECK-NEXT: mov dword ptr [rsp + 24], ecx +; CHECK-NEXT: mov rcx, qword ptr [rax] +; CHECK-NEXT: mov rax, qword ptr [rax + 8] +; CHECK-NEXT: mov qword ptr [rsp + 16], rax +; CHECK-NEXT: mov qword ptr [rsp + 8], rcx +; CHECK-NEXT: jmp large_callee@PLT # TAILCALL +entry: + musttail call void @large_callee(%twenty_bytes* byval(%twenty_bytes) align 4 @large_global) + ret void +} diff --git a/llvm/test/CodeGen/X86/musttail-varargs.ll b/llvm/test/CodeGen/X86/musttail-varargs.ll index 65cd1edd92e31..c1ec7ccbde177 100644 --- a/llvm/test/CodeGen/X86/musttail-varargs.ll +++ b/llvm/test/CodeGen/X86/musttail-varargs.ll @@ -243,18 +243,15 @@ define void @f_thunk(ptr %this, ...) { ; X86-NOSSE: # %bb.0: ; X86-NOSSE-NEXT: pushl %ebp ; X86-NOSSE-NEXT: movl %esp, %ebp -; X86-NOSSE-NEXT: pushl %esi ; X86-NOSSE-NEXT: andl $-16, %esp ; X86-NOSSE-NEXT: subl $32, %esp -; X86-NOSSE-NEXT: movl 8(%ebp), %esi -; X86-NOSSE-NEXT: leal 12(%ebp), %eax -; X86-NOSSE-NEXT: movl %eax, (%esp) -; X86-NOSSE-NEXT: pushl %esi +; X86-NOSSE-NEXT: movl 8(%ebp), %eax +; X86-NOSSE-NEXT: leal 12(%ebp), %ecx +; X86-NOSSE-NEXT: movl %ecx, (%esp) +; X86-NOSSE-NEXT: pushl %eax ; X86-NOSSE-NEXT: calll _get_f ; X86-NOSSE-NEXT: addl $4, %esp -; X86-NOSSE-NEXT: movl %esi, 8(%ebp) -; X86-NOSSE-NEXT: leal -4(%ebp), %esp -; X86-NOSSE-NEXT: popl %esi +; X86-NOSSE-NEXT: movl %ebp, %esp ; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: jmpl *%eax # TAILCALL ; @@ -262,24 +259,21 @@ define void @f_thunk(ptr %this, ...) { ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: pushl %ebp ; X86-SSE-NEXT: movl %esp, %ebp -; X86-SSE-NEXT: pushl %esi ; X86-SSE-NEXT: andl $-16, %esp ; X86-SSE-NEXT: subl $80, %esp ; X86-SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; X86-SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; X86-SSE-NEXT: movaps %xmm0, (%esp) # 16-byte Spill -; X86-SSE-NEXT: movl 8(%ebp), %esi -; X86-SSE-NEXT: leal 12(%ebp), %eax -; X86-SSE-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: movl 8(%ebp), %eax +; X86-SSE-NEXT: leal 12(%ebp), %ecx +; X86-SSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-SSE-NEXT: pushl %eax ; X86-SSE-NEXT: calll _get_f ; X86-SSE-NEXT: addl $4, %esp -; X86-SSE-NEXT: movl %esi, 8(%ebp) ; X86-SSE-NEXT: movaps (%esp), %xmm0 # 16-byte Reload ; X86-SSE-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload ; X86-SSE-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload -; X86-SSE-NEXT: leal -4(%ebp), %esp -; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: movl %ebp, %esp ; X86-SSE-NEXT: popl %ebp ; X86-SSE-NEXT: jmpl *%eax # TAILCALL %ap = alloca [4 x ptr], align 16 @@ -310,11 +304,14 @@ define void @g_thunk(ptr %fptr_i8, ...) { ; WINDOWS: # %bb.0: ; WINDOWS-NEXT: rex64 jmpq *%rcx # TAILCALL ; -; X86-LABEL: g_thunk: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: jmpl *%eax # TAILCALL +; X86-NOSSE-LABEL: g_thunk: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: jmpl *{{[0-9]+}}(%esp) # TAILCALL +; +; X86-SSE-LABEL: g_thunk: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: jmpl *%eax # TAILCALL musttail call void (ptr, ...) %fptr_i8(ptr %fptr_i8, ...) 
ret void } @@ -374,10 +371,9 @@ define void @h_thunk(ptr %this, ...) { ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: jmpl *%ecx # TAILCALL ; X86-NEXT: LBB2_2: # %else -; X86-NEXT: movl 8(%eax), %ecx +; X86-NEXT: movl 8(%eax), %eax ; X86-NEXT: movl $42, _g -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: jmpl *%ecx # TAILCALL +; X86-NEXT: jmpl *%eax # TAILCALL %cond = load i1, ptr %this br i1 %cond, label %then, label %else diff --git a/llvm/test/CodeGen/X86/musttail.ll b/llvm/test/CodeGen/X86/musttail.ll index 9e02585a3ffdc..57c47ad683127 100644 --- a/llvm/test/CodeGen/X86/musttail.ll +++ b/llvm/test/CodeGen/X86/musttail.ll @@ -46,7 +46,7 @@ define i32 @t4(ptr %fn, i32 %n, i32 %r) { ; CHECK: decl %[[n:.*]] ; CHECK-DAG: movl %[[r]], {{[0-9]+}}(%esp) ; CHECK-DAG: movl %[[n]], {{[0-9]+}}(%esp) -; CHECK: jmpl *%{{.*}} +; CHECK: jmpl *{{.*}} # TAILCALL entry: %r1 = add i32 %r, 1 @@ -74,7 +74,7 @@ define i32 @t5(ptr %fn, i32 %n, i32 %r) alignstack(32) { ; CHECK: leal {{[-0-9]+}}(%ebp), %esp ; CHECK: popl %esi ; CHECK: popl %ebp -; CHECK: jmpl *%{{.*}} +; CHECK: jmpl *{{.*}} # TAILCALL entry: %a = alloca i8, i32 %n diff --git a/llvm/test/CodeGen/X86/sibcall.ll b/llvm/test/CodeGen/X86/sibcall.ll index 2759a9883975e..d1137cac7d365 100644 --- a/llvm/test/CodeGen/X86/sibcall.ll +++ b/llvm/test/CodeGen/X86/sibcall.ll @@ -295,10 +295,15 @@ declare dso_local i32 @foo5(i32, i32, i32, i32, i32) define dso_local i32 @t12(i32 %x, i32 %y, ptr byval(%struct.t) align 4 %z) nounwind ssp { ; X86-LABEL: t12: ; X86: # %bb.0: # %entry +; X86-NEXT: subl $20, %esp ; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; X86-NEXT: jne foo6 # TAILCALL -; X86-NEXT: # %bb.1: # %bb2 +; X86-NEXT: je .LBB12_1 +; X86-NEXT: # %bb.2: # %bb +; X86-NEXT: addl $20, %esp +; X86-NEXT: jmp foo6 # TAILCALL +; X86-NEXT: .LBB12_1: # %bb2 ; X86-NEXT: xorl %eax, %eax +; X86-NEXT: addl $20, %esp ; X86-NEXT: retl ; ; X64-LABEL: t12: diff --git a/llvm/test/CodeGen/X86/swifttailcc-store-ret-address-aliasing-stack-slot.ll b/llvm/test/CodeGen/X86/swifttailcc-store-ret-address-aliasing-stack-slot.ll index cd669768705e5..b901d22f66392 100644 --- a/llvm/test/CodeGen/X86/swifttailcc-store-ret-address-aliasing-stack-slot.ll +++ b/llvm/test/CodeGen/X86/swifttailcc-store-ret-address-aliasing-stack-slot.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 ; RUN: llc %s -o - | FileCheck %s target triple = "x86_64-apple-macosx" @@ -24,9 +25,7 @@ define swifttailcc void @test(ptr %0, ptr swiftasync %1, i64 %2, i64 %3, ptr %4, ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r15 ; CHECK-NEXT: callq _foo ; CHECK-NEXT: movq %r14, (%rax) -; CHECK-NEXT: movl [[OFF:[0-9]+]](%rsp), %edx -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; CHECK-NEXT: movq %rcx, [[OFF]](%rsp) +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %edx ; CHECK-NEXT: movq %rax, %r14 ; CHECK-NEXT: movq %r13, %rdi ; CHECK-NEXT: movq %r15, %rsi @@ -34,7 +33,6 @@ define swifttailcc void @test(ptr %0, ptr swiftasync %1, i64 %2, i64 %3, ptr %4, ; CHECK-NEXT: addq $8, %rsp ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r15 -; CHECK-NEXT: addq $16, %rsp ; CHECK-NEXT: jmp _tc_fn ## TAILCALL entry: %res = tail call ptr @foo()