diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp
index 0658ecc93d88d..9d683f27a0141 100644
--- a/clang/lib/CodeGen/CGStmt.cpp
+++ b/clang/lib/CodeGen/CGStmt.cpp
@@ -2892,13 +2892,21 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) {
     if (!Constraints.empty())
       Constraints += ',';
 
-    // If this is a register output, then make the inline asm return it
-    // by-value.  If this is a memory result, return the value by-reference.
+    // - If this is a register output, then make the inline asm return it
+    //   by-value.
+    // - If this is an "rm" constraint, then treat it like a register output.
+    //   (We'll correct this before ISel if using the fast register allocator.)
+    // - If this is a memory result, return the value by-reference.
     QualType QTy = OutExpr->getType();
     const bool IsScalarOrAggregate = hasScalarEvaluationKind(QTy) ||
                                      hasAggregateEvaluationKind(QTy);
-    if (!Info.allowsMemory() && IsScalarOrAggregate) {
+    // FIXME: Expand this to handle other constraints that include both 'r'
+    // and 'm', such as "g" (which expands to "imr").
+    const bool RegisterMemoryConstraints =
+        OutputConstraint == "rm" || OutputConstraint == "mr";
 
+    if (IsScalarOrAggregate &&
+        (!Info.allowsMemory() || RegisterMemoryConstraints)) {
       Constraints += "=" + OutputConstraint;
       ResultRegQualTys.push_back(QTy);
       ResultRegDests.push_back(Dest);
diff --git a/clang/test/CodeGen/asm.c b/clang/test/CodeGen/asm.c
index 9687c993e6464..66a7142ee7fca 100644
--- a/clang/test/CodeGen/asm.c
+++ b/clang/test/CodeGen/asm.c
@@ -259,7 +259,7 @@ void t31(int len) {
   __asm__ volatile(""
                    : "+%%rm"(len), "+rm"(len));
   // CHECK: @t31
-  // CHECK: call void asm sideeffect "", "=*%rm,=*rm,0,1,~{dirflag},~{fpsr},~{flags}"
+  // CHECK: call i32 asm sideeffect "", "=*%rm,=rm,0,1,~{dirflag},~{fpsr},~{flags}"
 }
 
 // CHECK: @t32
diff --git a/llvm/include/llvm/CodeGen/InlineAsmPrepare.h b/llvm/include/llvm/CodeGen/InlineAsmPrepare.h
index a400a78390dff..187e7a8130d37 100644
--- a/llvm/include/llvm/CodeGen/InlineAsmPrepare.h
+++ b/llvm/include/llvm/CodeGen/InlineAsmPrepare.h
@@ -13,9 +13,16 @@
 
 namespace llvm {
 
+class TargetMachine;
+
 class InlineAsmPreparePass : public PassInfoMixin<InlineAsmPreparePass> {
+  const TargetMachine *TM;
+
 public:
-  PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
+  explicit InlineAsmPreparePass(const TargetMachine &TM) : TM(&TM) {}
+  LLVM_ABI PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
+
+  static bool isRequired() { return true; }
 };
 
 } // namespace llvm
diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h
index 6f7c3bd177cb0..63dd290c1fa28 100644
--- a/llvm/include/llvm/CodeGen/Passes.h
+++ b/llvm/include/llvm/CodeGen/Passes.h
@@ -622,6 +622,7 @@ LLVM_ABI ModulePass *createJMCInstrumenterPass();
 /// This pass converts conditional moves to conditional jumps when profitable.
 LLVM_ABI FunctionPass *createSelectOptimizePass();
 
+/// Process inline assembly calls to prepare for code generation.
 LLVM_ABI FunctionPass *createInlineAsmPreparePass();
 
 /// Creates Windows Secure Hot Patch pass. \see WindowsSecureHotPatching.cpp
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index f6d5578412d1e..0ed06fe7eb6fd 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -5250,6 +5250,11 @@ class LLVM_ABI TargetLowering : public TargetLoweringBase {
     /// The ValueType for the operand value.
     MVT ConstraintVT = MVT::Other;
 
+    /// The register may be folded. This is used if the constraint is "rm",
+    /// where we prefer using a register, but can fall back to a memory slot
+    /// under register pressure.
+    bool MayFoldRegister = false;
+
     /// Copy constructor for copying from a ConstraintInfo.
     AsmOperandInfo(InlineAsm::ConstraintInfo Info)
         : InlineAsm::ConstraintInfo(std::move(Info)) {}
diff --git a/llvm/include/llvm/IR/InlineAsm.h b/llvm/include/llvm/IR/InlineAsm.h
index fed0ccc5818f8..5f9e77b321708 100644
--- a/llvm/include/llvm/IR/InlineAsm.h
+++ b/llvm/include/llvm/IR/InlineAsm.h
@@ -181,6 +181,14 @@ class InlineAsm final : public Value {
     bool hasArg() const {
       return Type == isInput || (Type == isOutput && isIndirect);
     }
+
+    /// hasRegMemConstraints - Returns true if and only if the constraint
+    /// codes are exactly "r" and "m", in either order. This is useful when
+    /// converting an operand from its register form to a memory form.
+    bool hasRegMemConstraints() const {
+      return Codes.size() == 2 && is_contained(Codes, "r") &&
+             is_contained(Codes, "m");
+    }
   };
 
   /// ParseConstraints - Split up the constraint string into the specific
diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
index 8130737ae4c20..a8752d6367a68 100644
--- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h
+++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
@@ -842,7 +842,8 @@ void CodeGenPassBuilder<Derived, TargetMachineT>::addISelPrepare(
   if (getOptLevel() != CodeGenOptLevel::None)
     addFunctionPass(ObjCARCContractPass(), PMW);
 
-  addFunctionPass(InlineAsmPreparePass(), PMW);
+  addFunctionPass(InlineAsmPreparePass(TM), PMW);
+
   // Add both the safe stack and the stack protection passes: each of them will
   // only protect functions that have corresponding attributes.
   addFunctionPass(SafeStackPass(TM), PMW);
diff --git a/llvm/lib/CodeGen/InlineAsmPrepare.cpp b/llvm/lib/CodeGen/InlineAsmPrepare.cpp
index 9cf54f54cd752..e5974c7dd36bb 100644
--- a/llvm/lib/CodeGen/InlineAsmPrepare.cpp
+++ b/llvm/lib/CodeGen/InlineAsmPrepare.cpp
@@ -1,4 +1,4 @@
-//===-- InlineAsmPrepare - Prepare inline asm for code gen ----------------===//
+//===-- InlineAsmPrepare - Prepare inline asm for code generation ---------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,47 +6,58 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This pass lowers callbrs in LLVM IR in order to to assist SelectionDAG's
-// codegen.
+// This pass lowers callbrs and inline asm in LLVM IR in order to assist
+// SelectionDAG's codegen.
 //
-// In particular, this pass assists in inserting register copies for the output
-// values of a callbr along the edges leading to the indirect target blocks.
-// Though the output SSA value is defined by the callbr instruction itself in
-// the IR representation, the value cannot be copied to the appropriate virtual
-// registers prior to jumping to an indirect label, since the jump occurs
-// within the user-provided assembly blob.
+// CallBrInst:
 //
-// Instead, those copies must occur separately at the beginning of each
-// indirect target. That requires that we create a separate SSA definition in
-// each of them (via llvm.callbr.landingpad), and may require splitting
-// critical edges so we have a location to place the intrinsic. Finally, we
-// remap users of the original callbr output SSA value to instead point to the
-// appropriate llvm.callbr.landingpad value.
+//   - Assists in inserting register copies for the output values of a callbr
+//     along the edges leading to the indirect target blocks. Though the output
+//     SSA value is defined by the callbr instruction itself in the IR
+//     representation, the value cannot be copied to the appropriate virtual
+//     registers prior to jumping to an indirect label, since the jump occurs
+//     within the user-provided assembly blob.
 //
-// Ideally, this could be done inside SelectionDAG, or in the
-// MachineInstruction representation, without the use of an IR-level intrinsic.
-// But, within the current framework, it’s simpler to implement as an IR pass.
-// (If support for callbr in GlobalISel is implemented, it’s worth considering
-// whether this is still required.)
+//     Instead, those copies must occur separately at the beginning of each
+//     indirect target. That requires that we create a separate SSA definition
+//     in each of them (via llvm.callbr.landingpad), and may require splitting
+//     critical edges so we have a location to place the intrinsic. Finally, we
+//     remap users of the original callbr output SSA value to instead point to
+//     the appropriate llvm.callbr.landingpad value.
+//
+//     Ideally, this could be done inside SelectionDAG, or in the
+//     MachineInstruction representation, without the use of an IR-level
+//     intrinsic.  But, within the current framework, it’s simpler to implement
+//     as an IR pass.  (If support for callbr in GlobalISel is implemented,
+//     it’s worth considering whether this is still required.)
+//
+// InlineAsm:
+//
+//   - Prepares inline assembly for code generation with the fast register
+//     allocator. In particular, it defaults "rm" (register-or-memory) to
+//     prefer the "m" constraints (the front-end opts for the "r" constraint),
+//     simplifying register allocation by forcing operands to memory locations.
+//     The other register allocators are equipped to handle folding registers
+//     already, so don't need to change the default.
 //
 //===----------------------------------------------------------------------===//
 
 #include "llvm/CodeGen/InlineAsmPrepare.h"
 #include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/iterator.h"
 #include "llvm/Analysis/CFG.h"
 #include "llvm/CodeGen/Passes.h"
-#include "llvm/IR/BasicBlock.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
+#include "llvm/Target/TargetMachine.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/SSAUpdater.h"
 
@@ -54,118 +65,361 @@ using namespace llvm;
 
 #define DEBUG_TYPE "inline-asm-prepare"
 
-static bool SplitCriticalEdges(ArrayRef<CallBrInst *> CBRs, DominatorTree &DT);
-static bool InsertIntrinsicCalls(ArrayRef<CallBrInst *> CBRs,
-                                 DominatorTree &DT);
-static void UpdateSSA(DominatorTree &DT, CallBrInst *CBR, CallInst *Intrinsic,
-                      SSAUpdater &SSAUpdate);
-static SmallVector<CallBrInst *, 2> FindCallBrs(Function &F);
-
 namespace {
 
 class InlineAsmPrepare : public FunctionPass {
 public:
   InlineAsmPrepare() : FunctionPass(ID) {}
-  void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetPassConfig>();
+    AU.addPreserved<DominatorTreeWrapperPass>();
+  }
   bool runOnFunction(Function &F) override;
+
   static char ID;
 };
 
-} // end anonymous namespace
+char InlineAsmPrepare::ID = 0;
 
-PreservedAnalyses InlineAsmPreparePass::run(Function &F,
-                                            FunctionAnalysisManager &FAM) {
-  bool Changed = false;
-  SmallVector<CallBrInst *, 2> CBRs = FindCallBrs(F);
+} // end anonymous namespace
 
-  if (CBRs.empty())
-    return PreservedAnalyses::all();
+INITIALIZE_PASS_BEGIN(InlineAsmPrepare, DEBUG_TYPE, "Prepare inline asm insts",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(InlineAsmPrepare, DEBUG_TYPE, "Prepare inline asm insts",
+                    false, false)
 
-  auto &DT = FAM.getResult<DominatorTreeAnalysis>(F);
+FunctionPass *llvm::createInlineAsmPreparePass() {
+  return new InlineAsmPrepare();
+}
 
-  Changed |= SplitCriticalEdges(CBRs, DT);
-  Changed |= InsertIntrinsicCalls(CBRs, DT);
+//===----------------------------------------------------------------------===//
+//                     Process InlineAsm instructions
+//===----------------------------------------------------------------------===//
 
-  if (!Changed)
-    return PreservedAnalyses::all();
-  PreservedAnalyses PA;
-  PA.preserve<DominatorTreeAnalysis>();
-  return PA;
+/// True if the constraint is exactly "rm" or "mr" (register or memory).
+static bool IsRegMemConstraint(StringRef Constraint) {
+  return Constraint == "rm" || Constraint == "mr";
 }
 
-char InlineAsmPrepare::ID = 0;
-INITIALIZE_PASS_BEGIN(InlineAsmPrepare, "inline-asm-prepare",
-                      "Prepare inline asm insts", false, false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_END(InlineAsmPrepare, "inline-asm-prepare",
-                    "Prepare inline asm insts", false, false)
+/// Tag direct "rm" output constraints with '*' so that they default to a
+/// memory location. Also reports whether any "rm" constraint was seen.
+static std::pair<std::string, bool>
+ConvertConstraintsToMemory(StringRef ConstraintStr) {
+  auto I = ConstraintStr.begin(), E = ConstraintStr.end();
+  std::string Out;
+  raw_string_ostream O(Out);
+  bool HasRegMem = false;
+
+  while (I != E) {
+    bool IsOutput = false;
+    bool HasIndirect = false;
+    if (*I == '=') {
+      O << *I;
+      IsOutput = true;
+      ++I;
+      if (I == E)
+        return {};
+    }
+    if (*I == '*') {
+      O << '*';
+      HasIndirect = true;
+      ++I;
+      if (I == E)
+        return {};
+    }
+    if (*I == '+') {
+      O << '+';
+      IsOutput = true;
+      ++I;
+      if (I == E)
+        return {};
+    }
+
-FunctionPass *llvm::createInlineAsmPreparePass() {
-  return new InlineAsmPrepare();
+    // Ignore the '&' and '%' modifiers when matching, so that this agrees with
+    // ConstraintInfo::hasRegMemConstraints(), which only inspects the codes.
+    auto Comma = std::find(I, E, ',');
+    std::string Sub(I, Comma);
+    if (IsRegMemConstraint(StringRef(Sub).ltrim("&%"))) {
+      HasRegMem = true;
+      if (IsOutput && !HasIndirect)
+        O << '*';
+    }
+    O << Sub;
+    if (Comma == E)
+      break;
+
+    O << ',';
+    I = Comma + 1;
+  }
+
+  return {Out, HasRegMem};
 }
 
-void InlineAsmPrepare::getAnalysisUsage(AnalysisUsage &AU) const {
-  AU.addPreserved<DominatorTreeWrapperPass>();
+/// Record which constraints are tied to one another. Afterwards,
+/// TiedOutput[i] == j means constraint i is an input tied to output j.
+static void
+BuildTiedConstraintMap(const InlineAsm::ConstraintInfoVector &Constraints,
+                       SmallVectorImpl<int> &TiedOutput) {
+  const int NumConstraints = static_cast<int>(Constraints.size());
+  auto IsValidIndex = [NumConstraints](int Idx) {
+    return Idx >= 0 && Idx < NumConstraints;
+  };
+  for (int Idx = 0; Idx != NumConstraints; ++Idx) {
+    const InlineAsm::ConstraintInfo &Info = Constraints[Idx];
+    if (!Info.hasMatchingInput() || !IsValidIndex(Info.MatchingInput))
+      continue;
+
+    if (Info.Type == InlineAsm::isOutput)
+      TiedOutput[Info.MatchingInput] = Idx;
+    else if (Info.Type == InlineAsm::isInput)
+      TiedOutput[Idx] = Info.MatchingInput;
+  }
 }
 
-SmallVector<CallBrInst *, 2> FindCallBrs(Function &F) {
-  SmallVector<CallBrInst *, 2> CBRs;
-  for (BasicBlock &BB : F)
-    if (auto *CBR = dyn_cast<CallBrInst>(BB.getTerminator()))
-      if (!CBR->getType()->isVoidTy() && !CBR->use_empty())
-        CBRs.push_back(CBR);
-  return CBRs;
+/// Handle a direct output: "rm" becomes a stack slot, the rest stay returns.
+static void ProcessOutputConstraint(
+    const InlineAsm::ConstraintInfo &C, Type *RetTy, unsigned OutputIdx,
+    IRBuilder<> &EntryBuilder, SmallVectorImpl<Value *> &NewArgs,
+    SmallVectorImpl<Type *> &NewArgTypes, SmallVectorImpl<Type *> &NewRetTypes,
+    SmallVectorImpl<std::pair<unsigned, Type *>> &ElementTypeAttrs,
+    SmallVectorImpl<AllocaInst *> &OutputAllocas, unsigned ConstraintIdx) {
+  Type *SlotTy = RetTy;
+  if (StructType *ST = dyn_cast<StructType>(RetTy))
+    SlotTy = ST->getElementType(OutputIdx);
+
+  if (C.hasRegMemConstraints()) {
+    // Converted to memory constraint. Create the alloca via EntryBuilder and
+    // pass its address as an argument, recording the slot's element type.
+    AllocaInst *Slot = EntryBuilder.CreateAlloca(SlotTy, nullptr, "asm_mem");
+    NewArgs.push_back(Slot);
+    NewArgTypes.push_back(Slot->getType());
+    ElementTypeAttrs.push_back({NewArgs.size() - 1, SlotTy});
+    OutputAllocas[ConstraintIdx] = Slot;
+    // No return value for this output since it's now an out-parameter.
+  } else {
+    // Unchanged, still an output return value.
+    NewRetTypes.push_back(SlotTy);
+  }
 }
 
-bool SplitCriticalEdges(ArrayRef<CallBrInst *> CBRs, DominatorTree &DT) {
-  bool Changed = false;
-  CriticalEdgeSplittingOptions Options(&DT);
-  Options.setMergeIdenticalEdges();
+/// Process one input constraint: reuse the slot of a tied output that was
+/// converted to memory, or spill a direct "rm" input to a fresh stack slot.
+static void ProcessInputConstraint(const InlineAsm::ConstraintInfo &C,
+                                   Value *ArgVal, ArrayRef<int> TiedOutput,
+                                   ArrayRef<AllocaInst *> OutputAllocas,
+                                   unsigned ConstraintIdx, IRBuilder<> &Builder,
+                                   IRBuilder<> &EntryBuilder,
+                                   SmallVectorImpl<Value *> &NewArgs,
+                                   SmallVectorImpl<Type *> &NewArgTypes) {
+  Type *ArgTy = ArgVal->getType();
+
+  if (int MatchIdx = TiedOutput[ConstraintIdx]; MatchIdx != -1) {
+    if (AllocaInst *Slot = OutputAllocas[MatchIdx]) {
+      // The matched output was converted to memory. Store this input into the
+      // alloca and pass the alloca pointer as the argument, instead of ArgVal.
+      // This ensures the tied "0" constraint matches the "*m" output.
+      Builder.CreateStore(ArgVal, Slot);
+      NewArgs.push_back(Slot);
+      NewArgTypes.push_back(Slot->getType());
+      return;
+    }
+  }
 
-  // The indirect destination might be duplicated between another parameter...
-  //   %0 = callbr ... [label %x, label %x]
-  // ...hence MergeIdenticalEdges and AllowIndentical edges, but we don't need
-  // to split the default destination if it's duplicated between an indirect
-  // destination...
-  //   %1 = callbr ... to label %x [label %x]
-  // ...hence starting at 1 and checking against successor 0 (aka the default
-  // destination).
-  for (CallBrInst *CBR : CBRs)
-    for (unsigned i = 1, e = CBR->getNumSuccessors(); i != e; ++i)
-      if (CBR->getSuccessor(i) == CBR->getSuccessor(0) ||
-          isCriticalEdge(CBR, i, /*AllowIdenticalEdges*/ true))
-        if (SplitKnownCriticalEdge(CBR, i, Options))
-          Changed = true;
-  return Changed;
+  // Spill only direct "rm" inputs: an indirect operand is already an address,
+  // and wrapping it again would hand the asm a pointer to that pointer.
+  // NOTE(review): converted inputs get no '*' in the new constraint string --
+  // confirm the slot pointer is what ISel expects for them.
+  if (C.hasRegMemConstraints() && !C.isIndirect) {
+    AllocaInst *Slot = EntryBuilder.CreateAlloca(ArgTy, nullptr, "asm_mem");
+    Builder.CreateStore(ArgVal, Slot);
+    NewArgs.push_back(Slot);
+    NewArgTypes.push_back(Slot->getType());
+  } else {
+    // Unchanged
+    NewArgs.push_back(ArgVal);
+    NewArgTypes.push_back(ArgTy);
+  }
 }
 
-bool InsertIntrinsicCalls(ArrayRef<CallBrInst *> CBRs, DominatorTree &DT) {
-  bool Changed = false;
-  SmallPtrSet<const BasicBlock *, 4> Visited;
-  IRBuilder<> Builder(CBRs[0]->getContext());
-  for (CallBrInst *CBR : CBRs) {
-    if (!CBR->getNumIndirectDests())
-      continue;
+/// Choose the result type of the rewritten asm call: void when no value is
+/// returned, the type itself for one value, otherwise a struct of them all.
+static Type *BuildReturnType(ArrayRef<Type *> NewRetTypes,
+                             LLVMContext &Context) {
+  const size_t NumRets = NewRetTypes.size();
 
-    SSAUpdater SSAUpdate;
-    SSAUpdate.Initialize(CBR->getType(), CBR->getName());
-    SSAUpdate.AddAvailableValue(CBR->getParent(), CBR);
-    SSAUpdate.AddAvailableValue(CBR->getDefaultDest(), CBR);
+  if (NumRets > 1)
+    return StructType::get(Context, NewRetTypes);
+
+  return NumRets == 1 ? NewRetTypes.front() : Type::getVoidTy(Context);
+}
 
-    for (BasicBlock *IndDest : CBR->getIndirectDests()) {
-      if (!Visited.insert(IndDest).second)
+/// Create the new inline assembly call with converted constraints.
+static CallInst *CreateNewInlineAsm(
+    InlineAsm *IA, const std::string &NewConstraintStr, Type *NewRetTy,
+    const SmallVectorImpl<Type *> &NewArgTypes,
+    const SmallVectorImpl<Value *> &NewArgs,
+    const SmallVectorImpl<std::pair<unsigned, Type *>> &ElementTypeAttrs,
+    CallBase *CB, IRBuilder<> &Builder, LLVMContext &Context) {
+  FunctionType *NewFTy = FunctionType::get(NewRetTy, NewArgTypes, false);
+  InlineAsm *NewIA = InlineAsm::get(
+      NewFTy, IA->getAsmString(), NewConstraintStr, IA->hasSideEffects(),
+      IA->isAlignStack(), IA->getDialect(), IA->canThrow());
+
+  CallInst *NewCall = Builder.CreateCall(NewFTy, NewIA, NewArgs);
+  NewCall->setCallingConv(CB->getCallingConv());
+  // NOTE(review): param attrs keep their old indices though args may shift.
+  NewCall->setAttributes(CB->getAttributes());
+  // Copy all metadata (incl. !srcloc for asm diagnostics), not just DebugLoc.
+  NewCall->copyMetadata(*CB);
+  for (const auto &[Index, Ty] : ElementTypeAttrs)
+    NewCall->addParamAttr(Index,
+                          Attribute::get(Context, Attribute::ElementType, Ty));
+  return NewCall;
+}
+
+/// Rebuild the original return value from the new call's result and the stack
+/// slots of converted outputs. Indirect outputs are never part of that value.
+static Value *
+ReconstructReturnValue(Type *RetTy, CallInst *NewCall,
+                       const InlineAsm::ConstraintInfoVector &Constraints,
+                       const SmallVectorImpl<AllocaInst *> &OutputAllocas,
+                       const SmallVectorImpl<Type *> &NewRetTypes,
+                       IRBuilder<> &Builder) {
+  if (RetTy->isVoidTy())
+    return nullptr;
+
+  if (isa<StructType>(RetTy)) {
+    // Multiple outputs. Reconstruct the struct.
+    Value *Res = PoisonValue::get(RetTy);
+    unsigned NewRetIdx = 0;
+    unsigned OriginalOutIdx = 0;
+
+    for (unsigned I = 0, E = Constraints.size(); I != E; ++I) {
+      const InlineAsm::ConstraintInfo &C = Constraints[I];
+      if (C.Type != InlineAsm::isOutput || C.isIndirect)
         continue;
-      Builder.SetInsertPoint(&*IndDest->begin());
-      CallInst *Intrinsic = Builder.CreateIntrinsic(
-          CBR->getType(), Intrinsic::callbr_landingpad, {CBR});
-      SSAUpdate.AddAvailableValue(IndDest, Intrinsic);
-      UpdateSSA(DT, CBR, Intrinsic, SSAUpdate);
-      Changed = true;
+
+      Value *Val = nullptr;
+      if (AllocaInst *Slot = OutputAllocas[I]) {
+        // Converted to memory. Load from alloca.
+        Val = Builder.CreateLoad(Slot->getAllocatedType(), Slot);
+      } else if (NewRetTypes.size() == 1) {
+        // Not converted; it is the only value the new call returns.
+        Val = NewCall;
+      } else {
+        // Not converted. Extract from the new call's struct result.
+        Val = Builder.CreateExtractValue(NewCall, NewRetIdx++);
+      }
+
+      Res = Builder.CreateInsertValue(Res, Val, OriginalOutIdx++);
+    }
+
+    return Res;
+  }
+
+  // Single output: locate the direct output constraint. Indirect outputs
+  // are pointer arguments and never contribute to the return value.
+  unsigned OutConstraintIdx = 0;
+  for (unsigned I = 0; I < Constraints.size(); ++I) {
+    const InlineAsm::ConstraintInfo &C = Constraints[I];
+    if (C.Type == InlineAsm::isOutput && !C.isIndirect) {
+      OutConstraintIdx = I;
+      break;
     }
   }
-  return Changed;
+
+  if (AllocaInst *Slot = OutputAllocas[OutConstraintIdx])
+    return Builder.CreateLoad(Slot->getAllocatedType(), Slot);
+  return NewCall;
+}
+
+/// Rewrite one inline asm call so that its "rm" operands are passed via stack
+/// slots. Returns true if the call was replaced (\p CB is then erased).
+static bool ProcessInlineAsm(Function &F, CallBase *CB) {
+  InlineAsm *IA = cast<InlineAsm>(CB->getCalledOperand());
+  const auto &[NewConstraintStr, HasRegMem] =
+      ConvertConstraintsToMemory(IA->getConstraintString());
+  // Bail out before parsing if there is no "rm", or if CB is not a plain call:
+  // the rewrite emits a CallInst and must not replace a terminator (invoke).
+  if (!HasRegMem || !isa<CallInst>(CB))
+    return false;
+
+  const InlineAsm::ConstraintInfoVector &Constraints = IA->ParseConstraints();
+  IRBuilder<> Builder(CB);
+  IRBuilder<> EntryBuilder(&F.getEntryBlock(), F.getEntryBlock().begin());
+
+  // Collect new arguments and return types.
+  SmallVector<Value *, 8> NewArgs;
+  SmallVector<Type *, 8> NewArgTypes;
+  SmallVector<Type *, 2> NewRetTypes;
+  SmallVector<std::pair<unsigned, Type *>, 8> ElementTypeAttrs;
+
+  // Track allocas created for converted outputs. Indexed by position in the
+  // flat Constraints list (not by output index), so that both
+  // ProcessOutputConstraint and ReconstructReturnValue can look up entries
+  // using the same constraint index.
+  SmallVector<AllocaInst *, 8> OutputAllocas(Constraints.size(), nullptr);
+
+  // Build tied constraint map.
+  SmallVector<int, 8> TiedOutput(Constraints.size(), -1);
+  BuildTiedConstraintMap(Constraints, TiedOutput);
+
+  unsigned ArgNo = 0;
+  unsigned OutputIdx = 0;
+  for (unsigned I = 0, E = Constraints.size(); I != E; ++I) {
+    const InlineAsm::ConstraintInfo &C = Constraints[I];
+
+    if (C.Type == InlineAsm::isOutput) {
+      if (C.isIndirect) {
+        // Indirect output takes a pointer argument from the original call.
+        // Pass it through to the new call.
+        Value *ArgVal = CB->getArgOperand(ArgNo);
+        NewArgs.push_back(ArgVal);
+        NewArgTypes.push_back(ArgVal->getType());
+        // Preserve element type attribute if present.
+        if (auto *Ty = CB->getParamElementType(ArgNo))
+          ElementTypeAttrs.push_back({NewArgs.size() - 1, Ty});
+        ArgNo++;
+      } else {
+        ProcessOutputConstraint(C, CB->getType(), OutputIdx, EntryBuilder,
+                                NewArgs, NewArgTypes, NewRetTypes,
+                                ElementTypeAttrs, OutputAllocas, I);
+        OutputIdx++;
+      }
+    } else if (C.Type == InlineAsm::isInput) {
+      Value *ArgVal = CB->getArgOperand(ArgNo);
+      ProcessInputConstraint(C, ArgVal, TiedOutput, OutputAllocas, I, Builder,
+                             EntryBuilder, NewArgs, NewArgTypes);
+      ArgNo++;
+    }
+  }
+
+  Type *NewRetTy = BuildReturnType(NewRetTypes, F.getContext());
+
+  CallInst *NewCall =
+      CreateNewInlineAsm(IA, NewConstraintStr, NewRetTy, NewArgTypes, NewArgs,
+                         ElementTypeAttrs, CB, Builder, F.getContext());
+
+  // Reconstruct the return value and update users.
+  if (!CB->use_empty()) {
+    if (Value *Replacement =
+            ReconstructReturnValue(CB->getType(), NewCall, Constraints,
+                                   OutputAllocas, NewRetTypes, Builder))
+      CB->replaceAllUsesWith(Replacement);
+  }
+
+  CB->eraseFromParent();
+  return true;
 }
 
+//===----------------------------------------------------------------------===//
+//                           Process CallBrInsts
+//===----------------------------------------------------------------------===//
+
+/// The Use is in the same BasicBlock as the intrinsic call.
 static bool IsInSameBasicBlock(const Use &U, const BasicBlock *BB) {
   const auto *I = dyn_cast<Instruction>(U.getUser());
   return I && I->getParent() == BB;
@@ -174,24 +428,23 @@ static bool IsInSameBasicBlock(const Use &U, const BasicBlock *BB) {
 #ifndef NDEBUG
 static void PrintDebugDomInfo(const DominatorTree &DT, const Use &U,
                               const BasicBlock *BB, bool IsDefaultDest) {
-  if (!isa<Instruction>(U.getUser()))
-    return;
-  LLVM_DEBUG(dbgs() << "Use: " << *U.getUser() << ", in block "
-                    << cast<Instruction>(U.getUser())->getParent()->getName()
-                    << ", is " << (DT.dominates(BB, U) ? "" : "NOT ")
-                    << "dominated by " << BB->getName() << " ("
-                    << (IsDefaultDest ? "in" : "") << "direct)\n");
+  if (isa<Instruction>(U.getUser()))
+    LLVM_DEBUG(dbgs() << "Use: " << *U.getUser() << ", in block "
+                      << cast<Instruction>(U.getUser())->getParent()->getName()
+                      << ", is " << (DT.dominates(BB, U) ? "" : "NOT ")
+                      << "dominated by " << BB->getName() << " ("
+                      << (IsDefaultDest ? "in" : "") << "direct)\n");
 }
 #endif
 
-void UpdateSSA(DominatorTree &DT, CallBrInst *CBR, CallInst *Intrinsic,
-               SSAUpdater &SSAUpdate) {
-
+static void UpdateSSA(DominatorTree &DT, CallBrInst *CBR, CallInst *Intrinsic,
+                      SSAUpdater &SSAUpdate) {
   SmallPtrSet<Use *, 4> Visited;
+
   BasicBlock *DefaultDest = CBR->getDefaultDest();
   BasicBlock *LandingPad = Intrinsic->getParent();
-
   SmallVector<Use *, 4> Uses(make_pointer_range(CBR->uses()));
+
   for (Use *U : Uses) {
     if (!Visited.insert(U).second)
       continue;
@@ -221,12 +474,119 @@ void UpdateSSA(DominatorTree &DT, CallBrInst *CBR, CallInst *Intrinsic,
   }
 }
 
-bool InlineAsmPrepare::runOnFunction(Function &F) {
+/// Split every indirect-destination edge of \p CBR that is critical or shares
+/// its target with the default destination. Returns true if anything changed.
+static bool SplitCriticalEdges(CallBrInst *CBR, DominatorTree *DT) {
   bool Changed = false;
-  SmallVector<CallBrInst *, 2> CBRs = FindCallBrs(F);
 
-  if (CBRs.empty())
-    return Changed;
+  CriticalEdgeSplittingOptions Options(DT);
+  Options.setMergeIdenticalEdges();
+
+  // The indirect destination might be duplicated between another parameter...
+  //
+  //   %0 = callbr ... [label %x, label %x]
+  //
+  // ...hence MergeIdenticalEdges and AllowIdenticalEdges, but we don't need
+  // to split the default destination if it's duplicated between an indirect
+  // destination...
+  //   %1 = callbr ... to label %x [label %x]
+  // ...hence starting at 1 and checking against successor 0 (aka the default
+  // destination).
+  for (unsigned I = 1, E = CBR->getNumSuccessors(); I != E; ++I)
+    if (CBR->getSuccessor(I) == CBR->getSuccessor(0) ||
+        isCriticalEdge(CBR, I, /*AllowIdenticalEdges*/ true))
+      if (SplitKnownCriticalEdge(CBR, I, Options))
+        Changed = true;
+
+  return Changed;
+}
+
+/// Give every indirect destination of \p CBR its own definition of the callbr
+/// result by inserting a call to llvm.callbr.landingpad at the top of the
+/// block, then redirect users of the original result to the appropriate
+/// definition. This relies on critical edges having been split so that there
+/// is a location to place the intrinsic.
+static bool InsertIntrinsicCalls(CallBrInst *CBR, DominatorTree &DT) {
+  if (CBR->getNumIndirectDests() == 0)
+    return false;
+
+  SSAUpdater Updater;
+  Updater.Initialize(CBR->getType(), CBR->getName());
+  Updater.AddAvailableValue(CBR->getParent(), CBR);
+  Updater.AddAvailableValue(CBR->getDefaultDest(), CBR);
+
+  // A destination may be listed more than once; handle each block only once.
+  SmallPtrSet<const BasicBlock *, 4> Seen;
+  IRBuilder<> Builder(CBR->getContext());
+  bool InsertedAny = false;
+
+  for (BasicBlock *Dest : CBR->getIndirectDests()) {
+    if (!Seen.insert(Dest).second)
+      continue;
+
+    Builder.SetInsertPoint(&*Dest->begin());
+    CallInst *LandingPad = Builder.CreateIntrinsic(
+        CBR->getType(), Intrinsic::callbr_landingpad, {CBR});
+    Updater.AddAvailableValue(Dest, LandingPad);
+    UpdateSSA(DT, CBR, LandingPad, Updater);
+    InsertedAny = true;
+  }
+
+  return InsertedAny;
+}
+
+static bool ProcessCallBrInst(Function &F, CallBrInst *CBR, DominatorTree *DT) {
+  // Split edges first, then place the landing pad intrinsics. Both steps must
+  // always run, so their results are combined without short-circuiting.
+  const bool DidSplit = SplitCriticalEdges(CBR, DT);
+  const bool DidInsert = InsertIntrinsicCalls(CBR, *DT);
+
+  return DidSplit || DidInsert;
+}
+
+static bool runImpl(Function &F, ArrayRef<CallBase *> IAs, DominatorTree *DT) {
+  bool Changed = false;
+
+  // A callbr gets its landing pads set up; every other candidate is a plain
+  // inline asm call that goes through the "rm" rewrite.
+  for (CallBase *CB : IAs) {
+    auto *CBR = dyn_cast<CallBrInst>(CB);
+    Changed |= CBR ? ProcessCallBrInst(F, CBR, DT) : ProcessInlineAsm(F, CB);
+  }
+  return Changed;
+}
+
+/// Find all inline assembly calls that need preparation. This always collects
+/// CallBrInsts whose result is used (they need SSA fixups), and at -O0 also
+/// collects regular inline asm calls (which need "rm" to "m" constraint
+/// conversion for the fast register allocator).
+static SmallVector<CallBase *, 4>
+FindInlineAsmCandidates(Function &F, const TargetMachine *TM) {
+  const bool IsOptLevelNone = TM->getOptLevel() == CodeGenOptLevel::None;
+  SmallVector<CallBase *, 4> InlineAsms;
+
+  for (BasicBlock &BB : F) {
+    // Only inline assembly compiled at '-O0' (i.e. uses the fast register
+    // allocator) needs to be processed. Restricting the scan to CallInsts
+    // keeps terminators (callbr, invoke) out, while still visiting the calls
+    // of a block that happens to end in a callbr.
+    if (IsOptLevelNone)
+      for (Instruction &I : BB)
+        if (auto *CI = dyn_cast<CallInst>(&I); CI && CI->isInlineAsm())
+          InlineAsms.push_back(CI);
+
+    if (auto *CBR = dyn_cast<CallBrInst>(BB.getTerminator()))
+      if (!CBR->getType()->isVoidTy() && !CBR->use_empty())
+        InlineAsms.push_back(CBR);
+  }
+
+  return InlineAsms;
+}
+
+bool InlineAsmPrepare::runOnFunction(Function &F) {
+  const auto *TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
+  SmallVector<CallBase *, 4> IAs = FindInlineAsmCandidates(F, TM);
+  if (IAs.empty())
+    return false;
 
   // It's highly likely that most programs do not contain CallBrInsts. Follow a
   // similar pattern from SafeStackLegacyPass::runOnFunction to reuse previous
@@ -244,11 +604,22 @@ bool InlineAsmPrepare::runOnFunction(Function &F) {
     DT = &*LazilyComputedDomTree;
   }
 
-  if (SplitCriticalEdges(CBRs, *DT))
-    Changed = true;
+  return runImpl(F, IAs, DT);
+}
 
-  if (InsertIntrinsicCalls(CBRs, *DT))
-    Changed = true;
+PreservedAnalyses InlineAsmPreparePass::run(Function &F,
+                                            FunctionAnalysisManager &FAM) {
+  SmallVector<CallBase *, 4> Candidates = FindInlineAsmCandidates(F, TM);
+  if (Candidates.empty())
+    return PreservedAnalyses::all();
 
-  return Changed;
+  DominatorTree &DomTree = FAM.getResult<DominatorTreeAnalysis>(F);
+
+  if (!runImpl(F, Candidates, &DomTree))
+    return PreservedAnalyses::all();
+
+  // The function was modified; only the dominator tree is marked preserved.
+  PreservedAnalyses PA;
+  PA.preserve<DominatorTreeAnalysis>();
+  return PA;
 }
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 7c762ed6d91ce..0d14cc771dab6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -1032,7 +1032,8 @@ void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG,
 }
 
 void RegsForValue::AddInlineAsmOperands(InlineAsm::Kind Code, bool HasMatching,
-                                        unsigned MatchingIdx, const SDLoc &dl,
+                                        unsigned MatchingIdx,
+                                        bool MayFoldRegister, const SDLoc &dl,
                                         SelectionDAG &DAG,
                                         std::vector<SDValue> &Ops) const {
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -1049,6 +1050,7 @@ void RegsForValue::AddInlineAsmOperands(InlineAsm::Kind Code, bool HasMatching,
     const MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
     const TargetRegisterClass *RC = MRI.getRegClass(Regs.front());
     Flag.setRegClass(RC->getID());
+    Flag.setRegMayBeFolded(MayFoldRegister);
   }
 
   SDValue Res = DAG.getTargetConstant(Flag, dl, MVT::i32);
@@ -10349,7 +10351,8 @@ void SelectionDAGBuilder::visitInlineAsm(const CallBase &Call,
         OpInfo.AssignedRegs.AddInlineAsmOperands(
             OpInfo.isEarlyClobber ? InlineAsm::Kind::RegDefEarlyClobber
                                   : InlineAsm::Kind::RegDef,
-            false, 0, getCurSDLoc(), DAG, AsmNodeOperands);
+            false, 0, OpInfo.MayFoldRegister, getCurSDLoc(), DAG,
+            AsmNodeOperands);
       }
       break;
 
@@ -10391,9 +10394,9 @@ void SelectionDAGBuilder::visitInlineAsm(const CallBase &Call,
           SDLoc dl = getCurSDLoc();
           // Use the produced MatchedRegs object to
           MatchedRegs.getCopyToRegs(InOperandVal, DAG, dl, Chain, &Glue, &Call);
-          MatchedRegs.AddInlineAsmOperands(InlineAsm::Kind::RegUse, true,
-                                           OpInfo.getMatchedOperand(), dl, DAG,
-                                           AsmNodeOperands);
+          MatchedRegs.AddInlineAsmOperands(
+              InlineAsm::Kind::RegUse, true, OpInfo.getMatchedOperand(),
+              OpInfo.MayFoldRegister, dl, DAG, AsmNodeOperands);
           break;
         }
 
@@ -10525,7 +10528,8 @@ void SelectionDAGBuilder::visitInlineAsm(const CallBase &Call,
                                         &Call);
 
       OpInfo.AssignedRegs.AddInlineAsmOperands(InlineAsm::Kind::RegUse, false,
-                                               0, dl, DAG, AsmNodeOperands);
+                                               0, OpInfo.MayFoldRegister, dl,
+                                               DAG, AsmNodeOperands);
       break;
     }
     case InlineAsm::isClobber:
@@ -10533,8 +10537,8 @@ void SelectionDAGBuilder::visitInlineAsm(const CallBase &Call,
       // allocator is aware that the physreg got clobbered.
       if (!OpInfo.AssignedRegs.Regs.empty())
         OpInfo.AssignedRegs.AddInlineAsmOperands(InlineAsm::Kind::Clobber,
-                                                 false, 0, getCurSDLoc(), DAG,
-                                                 AsmNodeOperands);
+                                                 false, 0, false, getCurSDLoc(),
+                                                 DAG, AsmNodeOperands);
       break;
     }
   }
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index f8aecea25b3d6..845d06f829730 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -822,8 +822,9 @@ struct RegsForValue {
   /// code marker, matching input operand index (if applicable), and includes
   /// the number of values added into it.
   void AddInlineAsmOperands(InlineAsm::Kind Code, bool HasMatching,
-                            unsigned MatchingIdx, const SDLoc &dl,
-                            SelectionDAG &DAG, std::vector<SDValue> &Ops) const;
+                            unsigned MatchingIdx, bool MayFoldRegister,
+                            const SDLoc &dl, SelectionDAG &DAG,
+                            std::vector<SDValue> &Ops) const;
 
   /// Check if the total RegCount is greater than one.
   bool occupiesMultipleRegs() const {
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index e4b4d80896fa7..d8d55dc4ff68c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -5989,6 +5989,16 @@ TargetLowering::ParseConstraints(const DataLayout &DL,
 
     OpInfo.ConstraintVT = MVT::Other;
 
+    // Special treatment for all platforms that can fold a register into a
+    // spill. This is used for the "rm" constraint, where we would vastly
+    // prefer to use 'r' over 'm'. The non-fast register allocators are able to
+    // handle the 'r' default by folding. The fast register allocator needs
+    // special handling to convert the instruction to use 'm' instead.
+    if (!OpInfo.hasMatchingInput() && OpInfo.Codes.size() == 2 &&
+        llvm::is_contained(OpInfo.Codes, "r") &&
+        llvm::is_contained(OpInfo.Codes, "m"))
+      OpInfo.MayFoldRegister = true;
+
     // Compute the value type for each operand.
     switch (OpInfo.Type) {
     case InlineAsm::isOutput: {
@@ -6269,7 +6279,12 @@ TargetLowering::ConstraintWeight
 ///  1) If there is an 'other' constraint, and if the operand is valid for
 ///     that constraint, use it.  This makes us take advantage of 'i'
 ///     constraints when available.
-///  2) Otherwise, pick the most general constraint present.  This prefers
+///  2) Special processing is done for the "rm" constraint. If specified, we
+///     opt for the 'r' constraint, but mark the operand as being "foldable."
+///     In the face of register exhaustion, the register allocator is free to
+///     choose to use a stack slot. The fast register allocator is handled
+///     separately via the InlineAsmPrepare pass.
+///  3) Otherwise, pick the most general constraint present.  This prefers
 ///     'm' over 'r', for example.
 ///
 TargetLowering::ConstraintGroup TargetLowering::getConstraintPreferences(
@@ -6277,6 +6292,16 @@ TargetLowering::ConstraintGroup TargetLowering::getConstraintPreferences(
   ConstraintGroup Ret;
 
   Ret.reserve(OpInfo.Codes.size());
+
+  // If we can fold the register (i.e. it has an "rm" constraint), opt for the
+  // 'r' constraint, and allow the register allocator to spill if need be.
+  const TargetMachine &TM = getTargetMachine();
+  if (TM.getOptLevel() != CodeGenOptLevel::None && OpInfo.MayFoldRegister) {
+    Ret.emplace_back(ConstraintPair("r", getConstraintType("r")));
+    Ret.emplace_back(ConstraintPair("m", getConstraintType("m")));
+    return Ret;
+  }
+
   for (StringRef Code : OpInfo.Codes) {
     TargetLowering::ConstraintType CType = getConstraintType(Code);
 
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 879713f4d6e57..4695452915920 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -450,7 +450,7 @@ FUNCTION_PASS("indirectbr-expand", IndirectBrExpandPass(*TM))
 FUNCTION_PASS("infer-address-spaces", InferAddressSpacesPass())
 FUNCTION_PASS("infer-alignment", InferAlignmentPass())
 FUNCTION_PASS("inject-tli-mappings", InjectTLIMappings())
-FUNCTION_PASS("inline-asm-prepare", InlineAsmPreparePass())
+FUNCTION_PASS("inline-asm-prepare", InlineAsmPreparePass(*TM))
 FUNCTION_PASS("instcount", InstCountPass())
 FUNCTION_PASS("instnamer", InstructionNamerPass())
 FUNCTION_PASS("instsimplify", InstSimplifyPass())
diff --git a/llvm/test/CodeGen/AArch64/inline-asm-prepare.ll b/llvm/test/CodeGen/AArch64/inline-asm-prepare.ll
index 13ed24692b35e..2baaf0bef4713 100644
--- a/llvm/test/CodeGen/AArch64/inline-asm-prepare.ll
+++ b/llvm/test/CodeGen/AArch64/inline-asm-prepare.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt %s -inline-asm-prepare -S -o - | FileCheck %s
-; RUN: opt %s -passes=inline-asm-prepare -S -o - | FileCheck %s
+; RUN: opt %s -mtriple=aarch64-unknown-linux-gnu -inline-asm-prepare -S -o - | FileCheck %s
+; RUN: opt %s -mtriple=aarch64-unknown-linux-gnu -passes=inline-asm-prepare -S -o - | FileCheck %s
 
 define i32 @test0() {
 ; CHECK-LABEL: @test0(
diff --git a/llvm/test/CodeGen/X86/asm-constraints-rm.ll b/llvm/test/CodeGen/X86/asm-constraints-rm.ll
new file mode 100644
index 0000000000000..086b430bccac8
--- /dev/null
+++ b/llvm/test/CodeGen/X86/asm-constraints-rm.ll
@@ -0,0 +1,1307 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --filter "^\t(mov|call|#)" --version 4
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -O2 < %s | FileCheck --check-prefix=O2 %s
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -O0 < %s | FileCheck --check-prefix=O0 %s
+
+; The non-fast register allocators should use registers when there isn't
+; register pressure.
+
+define dso_local i32 @test1(ptr noundef readonly captures(none) %foo) local_unnamed_addr {
+; O2-LABEL: test1:
+; O2:    movl (%rdi), %eax
+; O2:    movl 4(%rdi), %ecx
+; O2:    movl 8(%rdi), %edx
+; O2:    movl 12(%rdi), %esi
+; O2:    movl 16(%rdi), %r8d
+; O2:    #APP
+; O2:    # rm input: no pressure
+; O2:    # %eax %ecx %edx %esi %r8d
+; O2:    #NO_APP
+; O2:    movl (%rdi), %eax
+;
+; O0-LABEL: test1:
+; O0:    movl (%rdi), %r8d
+; O0:    movl 4(%rdi), %esi
+; O0:    movl 8(%rdi), %edx
+; O0:    movl 12(%rdi), %ecx
+; O0:    movl 16(%rdi), %eax
+; O0:    movl %r8d, -{{[0-9]+}}(%rsp)
+; O0:    movl %esi, -{{[0-9]+}}(%rsp)
+; O0:    movl %edx, -{{[0-9]+}}(%rsp)
+; O0:    movl %ecx, -{{[0-9]+}}(%rsp)
+; O0:    movl %eax, -{{[0-9]+}}(%rsp)
+; O0:    movq %rax, -{{[0-9]+}}(%rsp)
+; O0:    movq %rax, -{{[0-9]+}}(%rsp)
+; O0:    movq %rax, -{{[0-9]+}}(%rsp)
+; O0:    movq %rax, -{{[0-9]+}}(%rsp)
+; O0:    movq %rax, -{{[0-9]+}}(%rsp)
+; O0:    #APP
+; O0:    # rm input: no pressure
+; O0:    # -{{[0-9]+}}(%rsp) -{{[0-9]+}}(%rsp) -{{[0-9]+}}(%rsp) -{{[0-9]+}}(%rsp) -{{[0-9]+}}(%rsp)
+; O0:    #NO_APP
+; O0:    movl (%rdi), %eax
+entry:
+  %0 = load i32, ptr %foo, align 4
+  %b = getelementptr inbounds nuw i8, ptr %foo, i64 4
+  %1 = load i32, ptr %b, align 4
+  %c = getelementptr inbounds nuw i8, ptr %foo, i64 8
+  %2 = load i32, ptr %c, align 4
+  %d = getelementptr inbounds nuw i8, ptr %foo, i64 12
+  %3 = load i32, ptr %d, align 4
+  %e = getelementptr inbounds nuw i8, ptr %foo, i64 16
+  %4 = load i32, ptr %e, align 4
+  tail call void asm sideeffect "# rm input: no pressure\0A\09# $0 $1 $2 $3 $4", "rm,rm,rm,rm,rm,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4)
+  %5 = load i32, ptr %foo, align 4
+  ret i32 %5
+}
+
+define dso_local i32 @test2(ptr noundef readonly captures(none) %foo) local_unnamed_addr {
+; O2-LABEL: test2:
+; O2:    movq %rdi, (%rsp) # 8-byte Spill
+; O2:    #APP
+; O2:    movq $0, %rax
+; O2:    movq $1, %rcx
+; O2:    movq $2, %rdx
+; O2:    movq $3, %rsi
+; O2:    movq $4, %rdi
+; O2:    movq $5, %rbx
+; O2:    movq $6, %rbp
+; O2:    movq $7, %r8
+; O2:    movq $8, %r9
+; O2:    movq $9, %r10
+; O2:    movq $10, %r11
+; O2:    movq $11, %r12
+; O2:    movq $12, %r13
+; O2:    movq $13, %r14
+; O2:    movq $14, %r15
+; O2:    #NO_APP
+; O2:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %r9, %rbp
+; O2:    movq (%rsp), %rbx # 8-byte Reload
+; O2:    movl (%rbx), %esi
+; O2:    movl 4(%rbx), %edi
+; O2:    movl 8(%rbx), %r8d
+; O2:    movl 12(%rbx), %r9d
+; O2:    movl 16(%rbx), %eax
+; O2:    #APP
+; O2:    # rm input: pressure
+; O2:    # %esi %edi %r8d %r9d %eax
+; O2:    #NO_APP
+; O2:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O2:    movq %rcx, %rsi
+; O2:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; O2:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; O2:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; O2:    callq g@PLT
+; O2:    movl (%rbx), %eax
+;
+; O0-LABEL: test2:
+; O0:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    #APP
+; O0:    movq $0, %rax
+; O0:    movq $1, %rcx
+; O0:    movq $2, %rdx
+; O0:    movq $3, %rsi
+; O0:    movq $4, %rdi
+; O0:    movq $5, %rbx
+; O0:    movq $6, %rbp
+; O0:    movq $7, %r8
+; O0:    movq $8, %r9
+; O0:    movq $9, %r10
+; O0:    movq $10, %r11
+; O0:    movq $11, %r12
+; O0:    movq $12, %r13
+; O0:    movq $13, %r14
+; O0:    movq $14, %r15
+; O0:    #NO_APP
+; O0:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rsi, %rcx
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; O0:    movq %rdi, %rax
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O0:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %r8, %rbx
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; O0:    movq %r9, %rax
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; O0:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0:    movl (%rax), %eax
+; O0:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0:    movl 4(%rax), %eax
+; O0:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0:    movl 8(%rax), %eax
+; O0:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0:    movl 12(%rax), %eax
+; O0:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0:    movl 16(%rax), %eax
+; O0:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; O0:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; O0:    movl %eax, {{[0-9]+}}(%rsp)
+; O0:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; O0:    movl %eax, {{[0-9]+}}(%rsp)
+; O0:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; O0:    movl %eax, {{[0-9]+}}(%rsp)
+; O0:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; O0:    movl %eax, {{[0-9]+}}(%rsp)
+; O0:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; O0:    movl %eax, {{[0-9]+}}(%rsp)
+; O0:    movq %rax, {{[0-9]+}}(%rsp)
+; O0:    movq %rax, {{[0-9]+}}(%rsp)
+; O0:    movq %rax, {{[0-9]+}}(%rsp)
+; O0:    movq %rax, {{[0-9]+}}(%rsp)
+; O0:    movq %rax, {{[0-9]+}}(%rsp)
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0:    #APP
+; O0:    # rm input: pressure
+; O0:    # {{[0-9]+}}(%rsp) {{[0-9]+}}(%rsp) {{[0-9]+}}(%rsp) {{[0-9]+}}(%rsp) {{[0-9]+}}(%rsp)
+; O0:    #NO_APP
+; O0:    movq %rbp, (%rsp)
+; O0:    movq %rbx, {{[0-9]+}}(%rsp)
+; O0:    movq %rax, {{[0-9]+}}(%rsp)
+; O0:    movq %r10, {{[0-9]+}}(%rsp)
+; O0:    movq %r11, {{[0-9]+}}(%rsp)
+; O0:    movq %r12, {{[0-9]+}}(%rsp)
+; O0:    movq %r13, {{[0-9]+}}(%rsp)
+; O0:    movq %r14, {{[0-9]+}}(%rsp)
+; O0:    movq %r15, {{[0-9]+}}(%rsp)
+; O0:    callq g@PLT
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O0:    movl (%rdi), %eax
+entry:
+  %0 = tail call { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } asm sideeffect "mov $$0, $0\0A\09mov $$1, $1\0A\09mov $$2, $2\0A\09mov $$3, $3\0A\09mov $$4, $4\0A\09mov $$5, $5\0A\09mov $$6, $6\0A\09mov $$7, $7\0A\09mov $$8, $8\0A\09mov $$9, $9\0A\09mov $$10, $10\0A\09mov $$11, $11\0A\09mov $$12, $12\0A\09mov $$13, $13\0A\09mov $$14, $14", "={rax},={rcx},={rdx},={rsi},={rdi},={rbx},={rbp},={r8},={r9},={r10},={r11},={r12},={r13},={r14},={r15},~{dirflag},~{fpsr},~{flags}"()
+  %asmresult = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 0
+  %asmresult1 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 1
+  %asmresult2 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 2
+  %asmresult3 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 3
+  %asmresult4 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 4
+  %asmresult5 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 5
+  %asmresult6 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 6
+  %asmresult7 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 7
+  %asmresult8 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 8
+  %asmresult9 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 9
+  %asmresult10 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 10
+  %asmresult11 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 11
+  %asmresult12 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 12
+  %asmresult13 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 13
+  %asmresult14 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 14
+  %1 = load i32, ptr %foo, align 4
+  %b = getelementptr inbounds nuw i8, ptr %foo, i64 4
+  %2 = load i32, ptr %b, align 4
+  %c = getelementptr inbounds nuw i8, ptr %foo, i64 8
+  %3 = load i32, ptr %c, align 4
+  %d = getelementptr inbounds nuw i8, ptr %foo, i64 12
+  %4 = load i32, ptr %d, align 4
+  %e = getelementptr inbounds nuw i8, ptr %foo, i64 16
+  %5 = load i32, ptr %e, align 4
+  tail call void asm sideeffect "# rm input: pressure\0A\09# $0 $1 $2 $3 $4", "rm,rm,rm,rm,rm,~{dirflag},~{fpsr},~{flags}"(i32 %1, i32 %2, i32 %3, i32 %4, i32 %5)
+  tail call void @g(i64 noundef %asmresult, i64 noundef %asmresult1, i64 noundef %asmresult2, i64 noundef %asmresult3, i64 noundef %asmresult4, i64 noundef %asmresult5, i64 noundef %asmresult6, i64 noundef %asmresult7, i64 noundef %asmresult8, i64 noundef %asmresult9, i64 noundef %asmresult10, i64 noundef %asmresult11, i64 noundef %asmresult12, i64 noundef %asmresult13, i64 noundef %asmresult14)
+  %6 = load i32, ptr %foo, align 4
+  ret i32 %6
+}
+
+define dso_local i32 @test3(ptr noundef writeonly captures(none) initializes((0, 20)) %foo) local_unnamed_addr {
+; O2-LABEL: test3:
+; O2:    #APP
+; O2:    # rm output: no pressure
+; O2:    # %eax %ecx %edx %esi %r8d
+; O2:    #NO_APP
+; O2:    movl %eax, (%rdi)
+; O2:    movl %ecx, 4(%rdi)
+; O2:    movl %edx, 8(%rdi)
+; O2:    movl %esi, 12(%rdi)
+; O2:    movl %r8d, 16(%rdi)
+;
+; O0-LABEL: test3:
+; O0:    #APP
+; O0:    # rm output: no pressure
+; O0:    # -{{[0-9]+}}(%rsp) -{{[0-9]+}}(%rsp) -{{[0-9]+}}(%rsp) -{{[0-9]+}}(%rsp) -{{[0-9]+}}(%rsp)
+; O0:    #NO_APP
+; O0:    movl -{{[0-9]+}}(%rsp), %eax
+; O0:    movl -{{[0-9]+}}(%rsp), %r8d
+; O0:    movl -{{[0-9]+}}(%rsp), %esi
+; O0:    movl -{{[0-9]+}}(%rsp), %edx
+; O0:    movl -{{[0-9]+}}(%rsp), %ecx
+; O0:    movl %eax, (%rdi)
+; O0:    movl %r8d, 4(%rdi)
+; O0:    movl %esi, 8(%rdi)
+; O0:    movl %edx, 12(%rdi)
+; O0:    movl %ecx, 16(%rdi)
+entry:
+  %b = getelementptr inbounds nuw i8, ptr %foo, i64 4
+  %c = getelementptr inbounds nuw i8, ptr %foo, i64 8
+  %d = getelementptr inbounds nuw i8, ptr %foo, i64 12
+  %e = getelementptr inbounds nuw i8, ptr %foo, i64 16
+  %0 = tail call { i32, i32, i32, i32, i32 } asm sideeffect "# rm output: no pressure\0A\09# $0 $1 $2 $3 $4", "=rm,=rm,=rm,=rm,=rm,~{dirflag},~{fpsr},~{flags}"()
+  %asmresult = extractvalue { i32, i32, i32, i32, i32 } %0, 0
+  %asmresult1 = extractvalue { i32, i32, i32, i32, i32 } %0, 1
+  %asmresult2 = extractvalue { i32, i32, i32, i32, i32 } %0, 2
+  %asmresult3 = extractvalue { i32, i32, i32, i32, i32 } %0, 3
+  %asmresult4 = extractvalue { i32, i32, i32, i32, i32 } %0, 4
+  store i32 %asmresult, ptr %foo, align 4
+  store i32 %asmresult1, ptr %b, align 4
+  store i32 %asmresult2, ptr %c, align 4
+  store i32 %asmresult3, ptr %d, align 4
+  store i32 %asmresult4, ptr %e, align 4
+  ret i32 %asmresult
+}
+
+define dso_local i32 @test4(ptr noundef writeonly captures(none) initializes((0, 20)) %foo) local_unnamed_addr {
+; O2-LABEL: test4:
+; O2:    movq %rdi, (%rsp) # 8-byte Spill
+; O2:    #APP
+; O2:    movq $0, %rax
+; O2:    movq $1, %rcx
+; O2:    movq $2, %rdx
+; O2:    movq $3, %rsi
+; O2:    movq $4, %rdi
+; O2:    movq $5, %rbx
+; O2:    movq $6, %rbp
+; O2:    movq $7, %r8
+; O2:    movq $8, %r9
+; O2:    movq $9, %r10
+; O2:    movq $10, %r11
+; O2:    movq $11, %r12
+; O2:    movq $12, %r13
+; O2:    movq $13, %r14
+; O2:    movq $14, %r15
+; O2:    #NO_APP
+; O2:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %r9, %rbp
+; O2:    #APP
+; O2:    # rm output: pressure
+; O2:    # %esi %edi %r8d %r9d %eax
+; O2:    #NO_APP
+; O2:    movq (%rsp), %rbx # 8-byte Reload
+; O2:    movl %esi, (%rbx)
+; O2:    movl %edi, 4(%rbx)
+; O2:    movl %r8d, 8(%rbx)
+; O2:    movl %r9d, 12(%rbx)
+; O2:    movl %eax, 16(%rbx)
+; O2:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O2:    movq %rcx, %rsi
+; O2:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; O2:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; O2:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; O2:    callq g@PLT
+; O2:    movl (%rbx), %eax
+;
+; O0-LABEL: test4:
+; O0:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    #APP
+; O0:    movq $0, %rax
+; O0:    movq $1, %rcx
+; O0:    movq $2, %rdx
+; O0:    movq $3, %rsi
+; O0:    movq $4, %rdi
+; O0:    movq $5, %rbx
+; O0:    movq $6, %rbp
+; O0:    movq $7, %r8
+; O0:    movq $8, %r9
+; O0:    movq $9, %r10
+; O0:    movq $10, %r11
+; O0:    movq $11, %r12
+; O0:    movq $12, %r13
+; O0:    movq $13, %r14
+; O0:    movq $14, %r15
+; O0:    #NO_APP
+; O0:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rsi, %rcx
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; O0:    movq %rdi, %rax
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O0:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %r8, %rbx
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; O0:    movq %r9, %rax
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; O0:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    #APP
+; O0:    # rm output: pressure
+; O0:    # {{[0-9]+}}(%rsp) {{[0-9]+}}(%rsp) {{[0-9]+}}(%rsp) {{[0-9]+}}(%rsp) {{[0-9]+}}(%rsp)
+; O0:    #NO_APP
+; O0:    movl {{[0-9]+}}(%rsp), %eax
+; O0:    movl {{[0-9]+}}(%rsp), %edi
+; O0:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; O0:    movl {{[0-9]+}}(%rsp), %edi
+; O0:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; O0:    movl {{[0-9]+}}(%rsp), %edi
+; O0:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; O0:    movl {{[0-9]+}}(%rsp), %edi
+; O0:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O0:    movl %eax, (%rdi)
+; O0:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; O0:    movl %eax, 4(%rdi)
+; O0:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; O0:    movl %eax, 8(%rdi)
+; O0:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; O0:    movl %eax, 12(%rdi)
+; O0:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; O0:    movl %eax, 16(%rdi)
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O0:    movq %rbp, (%rsp)
+; O0:    movq %rbx, {{[0-9]+}}(%rsp)
+; O0:    movq %rax, {{[0-9]+}}(%rsp)
+; O0:    movq %r10, {{[0-9]+}}(%rsp)
+; O0:    movq %r11, {{[0-9]+}}(%rsp)
+; O0:    movq %r12, {{[0-9]+}}(%rsp)
+; O0:    movq %r13, {{[0-9]+}}(%rsp)
+; O0:    movq %r14, {{[0-9]+}}(%rsp)
+; O0:    movq %r15, {{[0-9]+}}(%rsp)
+; O0:    callq g@PLT
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O0:    movl (%rdi), %eax
+entry:
+  %0 = tail call { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } asm sideeffect "mov $$0, $0\0A\09mov $$1, $1\0A\09mov $$2, $2\0A\09mov $$3, $3\0A\09mov $$4, $4\0A\09mov $$5, $5\0A\09mov $$6, $6\0A\09mov $$7, $7\0A\09mov $$8, $8\0A\09mov $$9, $9\0A\09mov $$10, $10\0A\09mov $$11, $11\0A\09mov $$12, $12\0A\09mov $$13, $13\0A\09mov $$14, $14", "={rax},={rcx},={rdx},={rsi},={rdi},={rbx},={rbp},={r8},={r9},={r10},={r11},={r12},={r13},={r14},={r15},~{dirflag},~{fpsr},~{flags}"()
+  %asmresult = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 0
+  %asmresult1 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 1
+  %asmresult2 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 2
+  %asmresult3 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 3
+  %asmresult4 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 4
+  %asmresult5 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 5
+  %asmresult6 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 6
+  %asmresult7 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 7
+  %asmresult8 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 8
+  %asmresult9 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 9
+  %asmresult10 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 10
+  %asmresult11 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 11
+  %asmresult12 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 12
+  %asmresult13 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 13
+  %asmresult14 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 14
+  %b = getelementptr inbounds nuw i8, ptr %foo, i64 4
+  %c = getelementptr inbounds nuw i8, ptr %foo, i64 8
+  %d = getelementptr inbounds nuw i8, ptr %foo, i64 12
+  %e = getelementptr inbounds nuw i8, ptr %foo, i64 16
+  %1 = tail call { i32, i32, i32, i32, i32 } asm sideeffect "# rm output: pressure\0A\09# $0 $1 $2 $3 $4", "=rm,=rm,=rm,=rm,=rm,~{dirflag},~{fpsr},~{flags}"()
+  %asmresult15 = extractvalue { i32, i32, i32, i32, i32 } %1, 0
+  %asmresult16 = extractvalue { i32, i32, i32, i32, i32 } %1, 1
+  %asmresult17 = extractvalue { i32, i32, i32, i32, i32 } %1, 2
+  %asmresult18 = extractvalue { i32, i32, i32, i32, i32 } %1, 3
+  %asmresult19 = extractvalue { i32, i32, i32, i32, i32 } %1, 4
+  store i32 %asmresult15, ptr %foo, align 4
+  store i32 %asmresult16, ptr %b, align 4
+  store i32 %asmresult17, ptr %c, align 4
+  store i32 %asmresult18, ptr %d, align 4
+  store i32 %asmresult19, ptr %e, align 4
+  tail call void @g(i64 noundef %asmresult, i64 noundef %asmresult1, i64 noundef %asmresult2, i64 noundef %asmresult3, i64 noundef %asmresult4, i64 noundef %asmresult5, i64 noundef %asmresult6, i64 noundef %asmresult7, i64 noundef %asmresult8, i64 noundef %asmresult9, i64 noundef %asmresult10, i64 noundef %asmresult11, i64 noundef %asmresult12, i64 noundef %asmresult13, i64 noundef %asmresult14)
+  %2 = load i32, ptr %foo, align 4
+  ret i32 %2
+}
+
+define dso_local i32 @test5(ptr noundef captures(none) %foo) local_unnamed_addr {
+; O2-LABEL: test5:
+; O2:    movl (%rdi), %eax
+; O2:    movl 4(%rdi), %ecx
+; O2:    movl 8(%rdi), %edx
+; O2:    movl 12(%rdi), %esi
+; O2:    movl 16(%rdi), %r8d
+; O2:    #APP
+; O2:    # rm tied output: no pressure
+; O2:    # %eax %ecx %edx %esi %r8d
+; O2:    #NO_APP
+; O2:    movl %eax, (%rdi)
+; O2:    movl %ecx, 4(%rdi)
+; O2:    movl %edx, 8(%rdi)
+; O2:    movl %esi, 12(%rdi)
+; O2:    movl %r8d, 16(%rdi)
+;
+; O0-LABEL: test5:
+; O0:    movl (%rdi), %r8d
+; O0:    movl 4(%rdi), %esi
+; O0:    movl 8(%rdi), %edx
+; O0:    movl 12(%rdi), %ecx
+; O0:    movl 16(%rdi), %eax
+; O0:    movl %r8d, -{{[0-9]+}}(%rsp)
+; O0:    movl %esi, -{{[0-9]+}}(%rsp)
+; O0:    movl %edx, -{{[0-9]+}}(%rsp)
+; O0:    movl %ecx, -{{[0-9]+}}(%rsp)
+; O0:    movl %eax, -{{[0-9]+}}(%rsp)
+; O0:    #APP
+; O0:    # rm tied output: no pressure
+; O0:    # %eax %ecx %edx %esi %r8d
+; O0:    #NO_APP
+; O0:    movl %r8d, -{{[0-9]+}}(%rsp)
+; O0:    movl %esi, -{{[0-9]+}}(%rsp)
+; O0:    movl %edx, -{{[0-9]+}}(%rsp)
+; O0:    movl %ecx, -{{[0-9]+}}(%rsp)
+; O0:    movl %eax, -{{[0-9]+}}(%rsp)
+; O0:    movl -{{[0-9]+}}(%rsp), %eax
+; O0:    movl -{{[0-9]+}}(%rsp), %r8d
+; O0:    movl -{{[0-9]+}}(%rsp), %esi
+; O0:    movl -{{[0-9]+}}(%rsp), %edx
+; O0:    movl -{{[0-9]+}}(%rsp), %ecx
+; O0:    movl %eax, (%rdi)
+; O0:    movl %r8d, 4(%rdi)
+; O0:    movl %esi, 8(%rdi)
+; O0:    movl %edx, 12(%rdi)
+; O0:    movl %ecx, 16(%rdi)
+entry:
+  %0 = load i32, ptr %foo, align 4
+  %b = getelementptr inbounds nuw i8, ptr %foo, i64 4
+  %1 = load i32, ptr %b, align 4
+  %c = getelementptr inbounds nuw i8, ptr %foo, i64 8
+  %2 = load i32, ptr %c, align 4
+  %d = getelementptr inbounds nuw i8, ptr %foo, i64 12
+  %3 = load i32, ptr %d, align 4
+  %e = getelementptr inbounds nuw i8, ptr %foo, i64 16
+  %4 = load i32, ptr %e, align 4
+  %5 = tail call { i32, i32, i32, i32, i32 } asm sideeffect "# rm tied output: no pressure\0A\09# $0 $1 $2 $3 $4", "=rm,=rm,=rm,=rm,=rm,0,1,2,3,4,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4)
+  %asmresult = extractvalue { i32, i32, i32, i32, i32 } %5, 0
+  %asmresult1 = extractvalue { i32, i32, i32, i32, i32 } %5, 1
+  %asmresult2 = extractvalue { i32, i32, i32, i32, i32 } %5, 2
+  %asmresult3 = extractvalue { i32, i32, i32, i32, i32 } %5, 3
+  %asmresult4 = extractvalue { i32, i32, i32, i32, i32 } %5, 4
+  store i32 %asmresult, ptr %foo, align 4
+  store i32 %asmresult1, ptr %b, align 4
+  store i32 %asmresult2, ptr %c, align 4
+  store i32 %asmresult3, ptr %d, align 4
+  store i32 %asmresult4, ptr %e, align 4
+  ret i32 %asmresult
+}
+
+define dso_local i32 @test6(ptr noundef captures(none) %foo) local_unnamed_addr {
+; O2-LABEL: test6:
+; O2:    movq %rdi, (%rsp) # 8-byte Spill
+; O2:    #APP
+; O2:    movq $0, %rax
+; O2:    movq $1, %rcx
+; O2:    movq $2, %rdx
+; O2:    movq $3, %rsi
+; O2:    movq $4, %rdi
+; O2:    movq $5, %rbx
+; O2:    movq $6, %rbp
+; O2:    movq $7, %r8
+; O2:    movq $8, %r9
+; O2:    movq $9, %r10
+; O2:    movq $10, %r11
+; O2:    movq $11, %r12
+; O2:    movq $12, %r13
+; O2:    movq $13, %r14
+; O2:    movq $14, %r15
+; O2:    #NO_APP
+; O2:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %r9, %rbp
+; O2:    movq (%rsp), %rbx # 8-byte Reload
+; O2:    movl (%rbx), %esi
+; O2:    movl 4(%rbx), %edi
+; O2:    movl 8(%rbx), %r8d
+; O2:    movl 12(%rbx), %r9d
+; O2:    movl 16(%rbx), %eax
+; O2:    #APP
+; O2:    # rm tied output: pressure
+; O2:    # %esi %edi %r8d %r9d %eax
+; O2:    #NO_APP
+; O2:    movl %esi, (%rbx)
+; O2:    movl %edi, 4(%rbx)
+; O2:    movl %r8d, 8(%rbx)
+; O2:    movl %r9d, 12(%rbx)
+; O2:    movl %eax, 16(%rbx)
+; O2:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O2:    movq %rcx, %rsi
+; O2:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; O2:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; O2:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; O2:    callq g@PLT
+; O2:    movl (%rbx), %eax
+; Pressure variant: the first asm pins 15 GPRs whose values stay live until the call to g. O0 checks below store each tied "rm" value to a stack slot before the second asm, though its text still prints registers as in the O2 checks above.
+; O0-LABEL: test6:
+; O0:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    #APP
+; O0:    movq $0, %rax
+; O0:    movq $1, %rcx
+; O0:    movq $2, %rdx
+; O0:    movq $3, %rsi
+; O0:    movq $4, %rdi
+; O0:    movq $5, %rbx
+; O0:    movq $6, %rbp
+; O0:    movq $7, %r8
+; O0:    movq $8, %r9
+; O0:    movq $9, %r10
+; O0:    movq $10, %r11
+; O0:    movq $11, %r12
+; O0:    movq $12, %r13
+; O0:    movq $13, %r14
+; O0:    movq $14, %r15
+; O0:    #NO_APP
+; O0:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rdi, %rcx
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O0:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %r8, %rbx
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; O0:    movq %r9, %rcx
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; O0:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movl (%rax), %edi
+; O0:    movl 4(%rax), %esi
+; O0:    movl 8(%rax), %edx
+; O0:    movl 12(%rax), %ecx
+; O0:    movl 16(%rax), %eax
+; O0:    movl %edi, {{[0-9]+}}(%rsp)
+; O0:    movl %esi, {{[0-9]+}}(%rsp)
+; O0:    movl %edx, {{[0-9]+}}(%rsp)
+; O0:    movl %ecx, {{[0-9]+}}(%rsp)
+; O0:    movl %eax, {{[0-9]+}}(%rsp)
+; O0:    movl %ecx, %edi
+; O0:    #APP
+; O0:    # rm tied output: pressure
+; O0:    # %eax %edi %ecx %edx %esi
+; O0:    #NO_APP
+; O0:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; O0:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; O0:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; O0:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; O0:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; O0:    movl %eax, {{[0-9]+}}(%rsp)
+; O0:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; O0:    movl %eax, {{[0-9]+}}(%rsp)
+; O0:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; O0:    movl %eax, {{[0-9]+}}(%rsp)
+; O0:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; O0:    movl %edi, {{[0-9]+}}(%rsp)
+; O0:    movl %eax, {{[0-9]+}}(%rsp)
+; O0:    movl {{[0-9]+}}(%rsp), %eax
+; O0:    movl {{[0-9]+}}(%rsp), %edi
+; O0:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; O0:    movl {{[0-9]+}}(%rsp), %edi
+; O0:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; O0:    movl {{[0-9]+}}(%rsp), %edi
+; O0:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; O0:    movl {{[0-9]+}}(%rsp), %edi
+; O0:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O0:    movl %eax, (%rdi)
+; O0:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; O0:    movl %eax, 4(%rdi)
+; O0:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; O0:    movl %eax, 8(%rdi)
+; O0:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; O0:    movl %eax, 12(%rdi)
+; O0:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; O0:    movl %eax, 16(%rdi)
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O0:    movq %rbp, (%rsp)
+; O0:    movq %rbx, {{[0-9]+}}(%rsp)
+; O0:    movq %rax, {{[0-9]+}}(%rsp)
+; O0:    movq %r10, {{[0-9]+}}(%rsp)
+; O0:    movq %r11, {{[0-9]+}}(%rsp)
+; O0:    movq %r12, {{[0-9]+}}(%rsp)
+; O0:    movq %r13, {{[0-9]+}}(%rsp)
+; O0:    movq %r14, {{[0-9]+}}(%rsp)
+; O0:    movq %r15, {{[0-9]+}}(%rsp)
+; O0:    callq g@PLT
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O0:    movl (%rdi), %eax
+entry:
+  %0 = tail call { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } asm sideeffect "mov $$0, $0\0A\09mov $$1, $1\0A\09mov $$2, $2\0A\09mov $$3, $3\0A\09mov $$4, $4\0A\09mov $$5, $5\0A\09mov $$6, $6\0A\09mov $$7, $7\0A\09mov $$8, $8\0A\09mov $$9, $9\0A\09mov $$10, $10\0A\09mov $$11, $11\0A\09mov $$12, $12\0A\09mov $$13, $13\0A\09mov $$14, $14", "={rax},={rcx},={rdx},={rsi},={rdi},={rbx},={rbp},={r8},={r9},={r10},={r11},={r12},={r13},={r14},={r15},~{dirflag},~{fpsr},~{flags}"()
+  %asmresult = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 0
+  %asmresult1 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 1
+  %asmresult2 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 2
+  %asmresult3 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 3
+  %asmresult4 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 4
+  %asmresult5 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 5
+  %asmresult6 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 6
+  %asmresult7 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 7
+  %asmresult8 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 8
+  %asmresult9 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 9
+  %asmresult10 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 10
+  %asmresult11 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 11
+  %asmresult12 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 12
+  %asmresult13 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 13
+  %asmresult14 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 14
+  %1 = load i32, ptr %foo, align 4
+  %b = getelementptr inbounds nuw i8, ptr %foo, i64 4
+  %2 = load i32, ptr %b, align 4
+  %c = getelementptr inbounds nuw i8, ptr %foo, i64 8
+  %3 = load i32, ptr %c, align 4
+  %d = getelementptr inbounds nuw i8, ptr %foo, i64 12
+  %4 = load i32, ptr %d, align 4
+  %e = getelementptr inbounds nuw i8, ptr %foo, i64 16
+  %5 = load i32, ptr %e, align 4
+  %6 = tail call { i32, i32, i32, i32, i32 } asm sideeffect "# rm tied output: pressure\0A\09# $0 $1 $2 $3 $4", "=rm,=rm,=rm,=rm,=rm,0,1,2,3,4,~{dirflag},~{fpsr},~{flags}"(i32 %1, i32 %2, i32 %3, i32 %4, i32 %5)
+  %asmresult15 = extractvalue { i32, i32, i32, i32, i32 } %6, 0
+  %asmresult16 = extractvalue { i32, i32, i32, i32, i32 } %6, 1
+  %asmresult17 = extractvalue { i32, i32, i32, i32, i32 } %6, 2
+  %asmresult18 = extractvalue { i32, i32, i32, i32, i32 } %6, 3
+  %asmresult19 = extractvalue { i32, i32, i32, i32, i32 } %6, 4
+  store i32 %asmresult15, ptr %foo, align 4
+  store i32 %asmresult16, ptr %b, align 4
+  store i32 %asmresult17, ptr %c, align 4
+  store i32 %asmresult18, ptr %d, align 4
+  store i32 %asmresult19, ptr %e, align 4
+  tail call void @g(i64 noundef %asmresult, i64 noundef %asmresult1, i64 noundef %asmresult2, i64 noundef %asmresult3, i64 noundef %asmresult4, i64 noundef %asmresult5, i64 noundef %asmresult6, i64 noundef %asmresult7, i64 noundef %asmresult8, i64 noundef %asmresult9, i64 noundef %asmresult10, i64 noundef %asmresult11, i64 noundef %asmresult12, i64 noundef %asmresult13, i64 noundef %asmresult14)
+  %7 = load i32, ptr %foo, align 4
+  ret i32 %7
+}
+
+define dso_local i32 @test7(ptr noundef captures(none) initializes((0, 4)) %foo) local_unnamed_addr {
+; O2-LABEL: test7:
+; O2:    movl 4(%rdi), %eax
+; O2:    #APP
+; O2:    # rm output, r input: no pressure
+; O2:    # %eax %eax
+; O2:    #NO_APP
+; O2:    movl %eax, (%rdi)
+; O0 checks below: the "rm" output ($0) is printed as a stack slot and reloaded afterwards, whereas the O2 checks above print a register for it.
+; O0-LABEL: test7:
+; O0:    movl 4(%rdi), %eax
+; O0:    #APP
+; O0:    # rm output, r input: no pressure
+; O0:    # -{{[0-9]+}}(%rsp) %eax
+; O0:    #NO_APP
+; O0:    movl -{{[0-9]+}}(%rsp), %eax
+; O0:    movl %eax, (%rdi)
+entry:
+  %b = getelementptr inbounds nuw i8, ptr %foo, i64 4
+  %0 = load i32, ptr %b, align 4
+  %1 = tail call i32 asm sideeffect "# rm output, r input: no pressure\0A\09# $0 $1", "=rm,r,~{dirflag},~{fpsr},~{flags}"(i32 %0)
+  store i32 %1, ptr %foo, align 4
+  ret i32 %1
+}
+
+define dso_local i32 @test8(ptr noundef captures(none) initializes((0, 4)) %foo) local_unnamed_addr {
+; O2-LABEL: test8:
+; O2:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    #APP
+; O2:    movq $0, %rax
+; O2:    movq $1, %rcx
+; O2:    movq $2, %rdx
+; O2:    movq $3, %rsi
+; O2:    movq $4, %rdi
+; O2:    movq $5, %rbx
+; O2:    movq $6, %rbp
+; O2:    movq $7, %r8
+; O2:    movq $8, %r9
+; O2:    movq $9, %r10
+; O2:    movq $10, %r11
+; O2:    movq $11, %r12
+; O2:    movq $12, %r13
+; O2:    movq $13, %r14
+; O2:    movq $14, %r15
+; O2:    #NO_APP
+; O2:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; O2:    movl 4(%rbp), %esi
+; O2:    #APP
+; O2:    # rm output, r input: pressure
+; O2:    # %esi %esi
+; O2:    #NO_APP
+; O2:    movl %esi, (%rbp)
+; O2:    movq %rax, %rdi
+; O2:    movq %rcx, %rsi
+; O2:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; O2:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; O2:    movq %rbx, %r9
+; O2:    callq g@PLT
+; O2:    movl (%rbp), %eax
+; Pressure variant of test7: 15 GPRs defined by the first asm stay live until the call to g. O0 checks below still print a stack slot for the "rm" output; O2 checks above print a register.
+; O0-LABEL: test8:
+; O0:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    #APP
+; O0:    movq $0, %rax
+; O0:    movq $1, %rcx
+; O0:    movq $2, %rdx
+; O0:    movq $3, %rsi
+; O0:    movq $4, %rdi
+; O0:    movq $5, %rbx
+; O0:    movq $6, %rbp
+; O0:    movq $7, %r8
+; O0:    movq $8, %r9
+; O0:    movq $9, %r10
+; O0:    movq $10, %r11
+; O0:    movq $11, %r12
+; O0:    movq $12, %r13
+; O0:    movq $13, %r14
+; O0:    movq $14, %r15
+; O0:    #NO_APP
+; O0:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rsi, %rcx
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; O0:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %r8, %rbx
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; O0:    movq %r9, %rdi
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; O0:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O0:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movl 4(%rdi), %eax
+; O0:    #APP
+; O0:    # rm output, r input: pressure
+; O0:    # {{[0-9]+}}(%rsp) %eax
+; O0:    #NO_APP
+; O0:    movl {{[0-9]+}}(%rsp), %eax
+; O0:    movl %eax, (%rdi)
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O0:    movq %rbp, (%rsp)
+; O0:    movq %rbx, {{[0-9]+}}(%rsp)
+; O0:    movq %rax, {{[0-9]+}}(%rsp)
+; O0:    movq %r10, {{[0-9]+}}(%rsp)
+; O0:    movq %r11, {{[0-9]+}}(%rsp)
+; O0:    movq %r12, {{[0-9]+}}(%rsp)
+; O0:    movq %r13, {{[0-9]+}}(%rsp)
+; O0:    movq %r14, {{[0-9]+}}(%rsp)
+; O0:    movq %r15, {{[0-9]+}}(%rsp)
+; O0:    callq g@PLT
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O0:    movl (%rdi), %eax
+entry:
+  %0 = tail call { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } asm sideeffect "mov $$0, $0\0A\09mov $$1, $1\0A\09mov $$2, $2\0A\09mov $$3, $3\0A\09mov $$4, $4\0A\09mov $$5, $5\0A\09mov $$6, $6\0A\09mov $$7, $7\0A\09mov $$8, $8\0A\09mov $$9, $9\0A\09mov $$10, $10\0A\09mov $$11, $11\0A\09mov $$12, $12\0A\09mov $$13, $13\0A\09mov $$14, $14", "={rax},={rcx},={rdx},={rsi},={rdi},={rbx},={rbp},={r8},={r9},={r10},={r11},={r12},={r13},={r14},={r15},~{dirflag},~{fpsr},~{flags}"()
+  %asmresult = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 0
+  %asmresult1 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 1
+  %asmresult2 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 2
+  %asmresult3 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 3
+  %asmresult4 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 4
+  %asmresult5 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 5
+  %asmresult6 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 6
+  %asmresult7 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 7
+  %asmresult8 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 8
+  %asmresult9 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 9
+  %asmresult10 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 10
+  %asmresult11 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 11
+  %asmresult12 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 12
+  %asmresult13 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 13
+  %asmresult14 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 14
+  %b = getelementptr inbounds nuw i8, ptr %foo, i64 4
+  %1 = load i32, ptr %b, align 4
+  %2 = tail call i32 asm sideeffect "# rm output, r input: pressure\0A\09# $0 $1", "=rm,r,~{dirflag},~{fpsr},~{flags}"(i32 %1)
+  store i32 %2, ptr %foo, align 4
+  tail call void @g(i64 noundef %asmresult, i64 noundef %asmresult1, i64 noundef %asmresult2, i64 noundef %asmresult3, i64 noundef %asmresult4, i64 noundef %asmresult5, i64 noundef %asmresult6, i64 noundef %asmresult7, i64 noundef %asmresult8, i64 noundef %asmresult9, i64 noundef %asmresult10, i64 noundef %asmresult11, i64 noundef %asmresult12, i64 noundef %asmresult13, i64 noundef %asmresult14)
+  %3 = load i32, ptr %foo, align 4
+  ret i32 %3
+}
+
+define dso_local i32 @test9(ptr noundef %foo) local_unnamed_addr {
+; O2-LABEL: test9:
+; O2:    movl 4(%rdi), %eax
+; O2:    #APP
+; O2:    # m output, rm input: no pressure
+; O2:    # (%rdi) %eax
+; O2:    #NO_APP
+; O2:    movl (%rdi), %eax
+; "=*m" output with an "rm" input: O0 checks below print a stack slot for the input ($1), O2 checks above print a register; the output is (%rdi) in both.
+; O0-LABEL: test9:
+; O0:    movl 4(%rdi), %eax
+; O0:    movl %eax, -{{[0-9]+}}(%rsp)
+; O0:    movq %rax, -{{[0-9]+}}(%rsp)
+; O0:    #APP
+; O0:    # m output, rm input: no pressure
+; O0:    # (%rdi) -{{[0-9]+}}(%rsp)
+; O0:    #NO_APP
+; O0:    movl (%rdi), %eax
+entry:
+  %b = getelementptr inbounds nuw i8, ptr %foo, i64 4
+  %0 = load i32, ptr %b, align 4
+  tail call void asm sideeffect "# m output, rm input: no pressure\0A\09# $0 $1", "=*m,rm,~{dirflag},~{fpsr},~{flags}"(ptr elementtype(i32) %foo, i32 %0)
+  %1 = load i32, ptr %foo, align 4
+  ret i32 %1
+}
+
+define dso_local i32 @test10(ptr noundef %foo) local_unnamed_addr {
+; O2-LABEL: test10:
+; O2:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    #APP
+; O2:    movq $0, %rax
+; O2:    movq $1, %rcx
+; O2:    movq $2, %rdx
+; O2:    movq $3, %rsi
+; O2:    movq $4, %rdi
+; O2:    movq $5, %rbx
+; O2:    movq $6, %rbp
+; O2:    movq $7, %r8
+; O2:    movq $8, %r9
+; O2:    movq $9, %r10
+; O2:    movq $10, %r11
+; O2:    movq $11, %r12
+; O2:    movq $12, %r13
+; O2:    movq $13, %r14
+; O2:    movq $14, %r15
+; O2:    #NO_APP
+; O2:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; O2:    movl 4(%rbp), %esi
+; O2:    #APP
+; O2:    # m output, rm input: pressure
+; O2:    # (%rbp) %esi
+; O2:    #NO_APP
+; O2:    movq %rax, %rdi
+; O2:    movq %rcx, %rsi
+; O2:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; O2:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; O2:    movq %rbx, %r9
+; O2:    callq g@PLT
+; O2:    movl (%rbp), %eax
+; Pressure variant of test9: 15 GPRs defined by the first asm stay live until the call to g. O0 checks below print a stack slot for the "rm" input; O2 checks above print a register.
+; O0-LABEL: test10:
+; O0:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    #APP
+; O0:    movq $0, %rax
+; O0:    movq $1, %rcx
+; O0:    movq $2, %rdx
+; O0:    movq $3, %rsi
+; O0:    movq $4, %rdi
+; O0:    movq $5, %rbx
+; O0:    movq $6, %rbp
+; O0:    movq $7, %r8
+; O0:    movq $8, %r9
+; O0:    movq $9, %r10
+; O0:    movq $10, %r11
+; O0:    movq $11, %r12
+; O0:    movq $12, %r13
+; O0:    movq $13, %r14
+; O0:    movq $14, %r15
+; O0:    #NO_APP
+; O0:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rsi, %rcx
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; O0:    movq %rdi, %rax
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O0:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %r8, %rbx
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; O0:    movq %r9, %rax
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; O0:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O0:    movl 4(%rdi), %edi
+; O0:    movl %edi, {{[0-9]+}}(%rsp)
+; O0:    movq %rdi, {{[0-9]+}}(%rsp)
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O0:    #APP
+; O0:    # m output, rm input: pressure
+; O0:    # (%rdi) {{[0-9]+}}(%rsp)
+; O0:    #NO_APP
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O0:    movq %rbp, (%rsp)
+; O0:    movq %rbx, {{[0-9]+}}(%rsp)
+; O0:    movq %rax, {{[0-9]+}}(%rsp)
+; O0:    movq %r10, {{[0-9]+}}(%rsp)
+; O0:    movq %r11, {{[0-9]+}}(%rsp)
+; O0:    movq %r12, {{[0-9]+}}(%rsp)
+; O0:    movq %r13, {{[0-9]+}}(%rsp)
+; O0:    movq %r14, {{[0-9]+}}(%rsp)
+; O0:    movq %r15, {{[0-9]+}}(%rsp)
+; O0:    callq g@PLT
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O0:    movl (%rdi), %eax
+entry:
+  %0 = tail call { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } asm sideeffect "mov $$0, $0\0A\09mov $$1, $1\0A\09mov $$2, $2\0A\09mov $$3, $3\0A\09mov $$4, $4\0A\09mov $$5, $5\0A\09mov $$6, $6\0A\09mov $$7, $7\0A\09mov $$8, $8\0A\09mov $$9, $9\0A\09mov $$10, $10\0A\09mov $$11, $11\0A\09mov $$12, $12\0A\09mov $$13, $13\0A\09mov $$14, $14", "={rax},={rcx},={rdx},={rsi},={rdi},={rbx},={rbp},={r8},={r9},={r10},={r11},={r12},={r13},={r14},={r15},~{dirflag},~{fpsr},~{flags}"()
+  %asmresult = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 0
+  %asmresult1 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 1
+  %asmresult2 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 2
+  %asmresult3 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 3
+  %asmresult4 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 4
+  %asmresult5 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 5
+  %asmresult6 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 6
+  %asmresult7 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 7
+  %asmresult8 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 8
+  %asmresult9 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 9
+  %asmresult10 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 10
+  %asmresult11 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 11
+  %asmresult12 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 12
+  %asmresult13 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 13
+  %asmresult14 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 14
+  %b = getelementptr inbounds nuw i8, ptr %foo, i64 4
+  %1 = load i32, ptr %b, align 4
+  tail call void asm sideeffect "# m output, rm input: pressure\0A\09# $0 $1", "=*m,rm,~{dirflag},~{fpsr},~{flags}"(ptr elementtype(i32) %foo, i32 %1)
+  tail call void @g(i64 noundef %asmresult, i64 noundef %asmresult1, i64 noundef %asmresult2, i64 noundef %asmresult3, i64 noundef %asmresult4, i64 noundef %asmresult5, i64 noundef %asmresult6, i64 noundef %asmresult7, i64 noundef %asmresult8, i64 noundef %asmresult9, i64 noundef %asmresult10, i64 noundef %asmresult11, i64 noundef %asmresult12, i64 noundef %asmresult13, i64 noundef %asmresult14)
+  %2 = load i32, ptr %foo, align 4
+  ret i32 %2
+}
+
+define dso_local i32 @test11(ptr noundef %foo) local_unnamed_addr {
+; O2-LABEL: test11:
+; O2:    movl (%rdi), %eax
+; O2:    movl 4(%rdi), %ecx
+; O2:    #APP
+; O2:    # multiple m output, rm input: no pressure
+; O2:    # (%rdi) 4(%rdi) 8(%rdi) 12(%rdi) 16(%rdi) %eax %ecx
+; O2:    #NO_APP
+; O2:    movl (%rdi), %eax
+; Five "=*m" outputs and two "rm" inputs: O0 checks below print stack slots for both inputs and a separate base register per output; O2 checks above print registers for the inputs and offsets from %rdi for the outputs.
+; O0-LABEL: test11:
+; O0:    movq %rdi, %rax
+; O0:    movq %rdi, %rcx
+; O0:    movq %rdi, %rdx
+; O0:    movq %rdi, %rsi
+; O0:    movl (%rdi), %r9d
+; O0:    movl 4(%rdi), %r8d
+; O0:    movl %r9d, -{{[0-9]+}}(%rsp)
+; O0:    movl %r8d, -{{[0-9]+}}(%rsp)
+; O0:    movq %r8, -{{[0-9]+}}(%rsp)
+; O0:    movq %r8, -{{[0-9]+}}(%rsp)
+; O0:    #APP
+; O0:    # multiple m output, rm input: no pressure
+; O0:    # (%rdi) (%rax) (%rcx) (%rdx) (%rsi) -{{[0-9]+}}(%rsp) -{{[0-9]+}}(%rsp)
+; O0:    #NO_APP
+; O0:    movl (%rdi), %eax
+entry:
+  %b = getelementptr inbounds nuw i8, ptr %foo, i64 4
+  %c = getelementptr inbounds nuw i8, ptr %foo, i64 8
+  %d = getelementptr inbounds nuw i8, ptr %foo, i64 12
+  %e = getelementptr inbounds nuw i8, ptr %foo, i64 16
+  %0 = load i32, ptr %foo, align 4
+  %1 = load i32, ptr %b, align 4
+  tail call void asm sideeffect "# multiple m output, rm input: no pressure\0A\09# $0 $1 $2 $3 $4 $5 $6", "=*m,=*m,=*m,=*m,=*m,rm,rm,~{dirflag},~{fpsr},~{flags}"(ptr nonnull elementtype(i32) %foo, ptr nonnull elementtype(i32) %b, ptr nonnull elementtype(i32) %c, ptr nonnull elementtype(i32) %d, ptr nonnull elementtype(i32) %e, i32 %0, i32 %1)
+  %2 = load i32, ptr %foo, align 4
+  ret i32 %2
+}
+
+define dso_local i32 @test12(ptr noundef %foo) local_unnamed_addr {
+; O2-LABEL: test12:
+; O2:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    #APP
+; O2:    movq $0, %rax
+; O2:    movq $1, %rcx
+; O2:    movq $2, %rdx
+; O2:    movq $3, %rsi
+; O2:    movq $4, %rdi
+; O2:    movq $5, %rbx
+; O2:    movq $6, %rbp
+; O2:    movq $7, %r8
+; O2:    movq $8, %r9
+; O2:    movq $9, %r10
+; O2:    movq $10, %r11
+; O2:    movq $11, %r12
+; O2:    movq $12, %r13
+; O2:    movq $13, %r14
+; O2:    movq $14, %r15
+; O2:    #NO_APP
+; O2:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; O2:    movl (%rbp), %esi
+; O2:    movl 4(%rbp), %edi
+; O2:    #APP
+; O2:    # multiple m output, rm input: pressure
+; O2:    # (%rbp) 4(%rbp) 8(%rbp) 12(%rbp) 16(%rbp) %esi %edi
+; O2:    #NO_APP
+; O2:    movq %rax, %rdi
+; O2:    movq %rcx, %rsi
+; O2:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; O2:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; O2:    movq %rbx, %r9
+; O2:    callq g@PLT
+; O2:    movl (%rbp), %eax
+; Pressure variant of test11: 15 GPRs defined by the first asm stay live until the call to g. O0 checks below print stack slots for both "rm" inputs; O2 checks above print registers.
+; O0-LABEL: test12:
+; O0:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    #APP
+; O0:    movq $0, %rax
+; O0:    movq $1, %rcx
+; O0:    movq $2, %rdx
+; O0:    movq $3, %rsi
+; O0:    movq $4, %rdi
+; O0:    movq $5, %rbx
+; O0:    movq $6, %rbp
+; O0:    movq $7, %r8
+; O0:    movq $8, %r9
+; O0:    movq $9, %r10
+; O0:    movq $10, %r11
+; O0:    movq $11, %r12
+; O0:    movq $12, %r13
+; O0:    movq $13, %r14
+; O0:    movq $14, %r15
+; O0:    #NO_APP
+; O0:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %r8, %rbx
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; O0:    movq %r9, %rdi
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; O0:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O0:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rdi, %rax
+; O0:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rdi, %rcx
+; O0:    movq %rdi, %rdx
+; O0:    movq %rdi, %rsi
+; O0:    movl (%rdi), %eax
+; O0:    movl 4(%rdi), %edi
+; O0:    movl %eax, {{[0-9]+}}(%rsp)
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0:    movl %edi, {{[0-9]+}}(%rsp)
+; O0:    movq %rdi, {{[0-9]+}}(%rsp)
+; O0:    movq %rdi, {{[0-9]+}}(%rsp)
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O0:    #APP
+; O0:    # multiple m output, rm input: pressure
+; O0:    # (%rdi) (%rax) (%rcx) (%rdx) (%rsi) {{[0-9]+}}(%rsp) {{[0-9]+}}(%rsp)
+; O0:    #NO_APP
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O0:    movq %rbp, (%rsp)
+; O0:    movq %rbx, {{[0-9]+}}(%rsp)
+; O0:    movq %rax, {{[0-9]+}}(%rsp)
+; O0:    movq %r10, {{[0-9]+}}(%rsp)
+; O0:    movq %r11, {{[0-9]+}}(%rsp)
+; O0:    movq %r12, {{[0-9]+}}(%rsp)
+; O0:    movq %r13, {{[0-9]+}}(%rsp)
+; O0:    movq %r14, {{[0-9]+}}(%rsp)
+; O0:    movq %r15, {{[0-9]+}}(%rsp)
+; O0:    callq g@PLT
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O0:    movl (%rdi), %eax
+entry:
+  %0 = tail call { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } asm sideeffect "mov $$0, $0\0A\09mov $$1, $1\0A\09mov $$2, $2\0A\09mov $$3, $3\0A\09mov $$4, $4\0A\09mov $$5, $5\0A\09mov $$6, $6\0A\09mov $$7, $7\0A\09mov $$8, $8\0A\09mov $$9, $9\0A\09mov $$10, $10\0A\09mov $$11, $11\0A\09mov $$12, $12\0A\09mov $$13, $13\0A\09mov $$14, $14", "={rax},={rcx},={rdx},={rsi},={rdi},={rbx},={rbp},={r8},={r9},={r10},={r11},={r12},={r13},={r14},={r15},~{dirflag},~{fpsr},~{flags}"()
+  %asmresult = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 0
+  %asmresult1 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 1
+  %asmresult2 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 2
+  %asmresult3 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 3
+  %asmresult4 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 4
+  %asmresult5 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 5
+  %asmresult6 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 6
+  %asmresult7 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 7
+  %asmresult8 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 8
+  %asmresult9 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 9
+  %asmresult10 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 10
+  %asmresult11 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 11
+  %asmresult12 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 12
+  %asmresult13 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 13
+  %asmresult14 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 14
+  %b = getelementptr inbounds nuw i8, ptr %foo, i64 4
+  %c = getelementptr inbounds nuw i8, ptr %foo, i64 8
+  %d = getelementptr inbounds nuw i8, ptr %foo, i64 12
+  %e = getelementptr inbounds nuw i8, ptr %foo, i64 16
+  %1 = load i32, ptr %foo, align 4
+  %2 = load i32, ptr %b, align 4
+  tail call void asm sideeffect "# multiple m output, rm input: pressure\0A\09# $0 $1 $2 $3 $4 $5 $6", "=*m,=*m,=*m,=*m,=*m,rm,rm,~{dirflag},~{fpsr},~{flags}"(ptr nonnull elementtype(i32) %foo, ptr nonnull elementtype(i32) %b, ptr nonnull elementtype(i32) %c, ptr nonnull elementtype(i32) %d, ptr nonnull elementtype(i32) %e, i32 %1, i32 %2)
+  tail call void @g(i64 noundef %asmresult, i64 noundef %asmresult1, i64 noundef %asmresult2, i64 noundef %asmresult3, i64 noundef %asmresult4, i64 noundef %asmresult5, i64 noundef %asmresult6, i64 noundef %asmresult7, i64 noundef %asmresult8, i64 noundef %asmresult9, i64 noundef %asmresult10, i64 noundef %asmresult11, i64 noundef %asmresult12, i64 noundef %asmresult13, i64 noundef %asmresult14)
+  %3 = load i32, ptr %foo, align 4
+  ret i32 %3
+}
+
+define dso_local i32 @test13(ptr noundef %foo) local_unnamed_addr {
+; O2-LABEL: test13:
+; O2:    movl (%rdi), %ecx
+; O2:    movl 4(%rdi), %edx
+; O2:    #APP
+; O2:    # multiple m output, rm input: no pressure
+; O2:    # %eax %esi %r8d %r9d %r10d %ecx %edx
+; O2:    #NO_APP
+; O2:    movl %eax, (%rdi)
+; O2:    movl %esi, 4(%rdi)
+; O2:    movl %r8d, 8(%rdi)
+; O2:    movl %r9d, 12(%rdi)
+; O2:    movl %r10d, 16(%rdi)
+;
+; O0-LABEL: test13:
+; O0:    movq %rdi, %rax
+; O0:    movq %rdi, %rcx
+; O0:    movq %rdi, %rdx
+; O0:    movq %rdi, %rsi
+; O0:    movl (%rdi), %r9d
+; O0:    movl 4(%rdi), %r8d
+; O0:    movl %r9d, -{{[0-9]+}}(%rsp)
+; O0:    movl %r8d, -{{[0-9]+}}(%rsp)
+; O0:    movq %r8, -{{[0-9]+}}(%rsp)
+; O0:    movq %r8, -{{[0-9]+}}(%rsp)
+; O0:    #APP
+; O0:    # multiple m output, rm input: no pressure
+; O0:    # (%rdi) (%rax) (%rcx) (%rdx) (%rsi) -{{[0-9]+}}(%rsp) -{{[0-9]+}}(%rsp)
+; O0:    #NO_APP
+; O0:    movl (%rdi), %eax
+entry: ; Five indirect early-clobber "=*&rm" outputs (i32 slots of %foo at byte offsets 0, 4, 8, 12, 16) plus two "rm" inputs, with no other values live. The O2 checks expect every operand in a register followed by stores to %foo; the O0 checks expect memory operands.
+  %b = getelementptr inbounds nuw i8, ptr %foo, i64 4
+  %c = getelementptr inbounds nuw i8, ptr %foo, i64 8
+  %d = getelementptr inbounds nuw i8, ptr %foo, i64 12
+  %e = getelementptr inbounds nuw i8, ptr %foo, i64 16
+  %0 = load i32, ptr %foo, align 4
+  %1 = load i32, ptr %b, align 4
+  tail call void asm sideeffect "# multiple m output, rm input: no pressure\0A\09# $0 $1 $2 $3 $4 $5 $6", "=*&rm,=*&rm,=*&rm,=*&rm,=*&rm,rm,rm,~{dirflag},~{fpsr},~{flags}"(ptr nonnull elementtype(i32) %foo, ptr nonnull elementtype(i32) %b, ptr nonnull elementtype(i32) %c, ptr nonnull elementtype(i32) %d, ptr nonnull elementtype(i32) %e, i32 %0, i32 %1) ; NOTE(review): the label text says "m output" but the output constraints here are "=*&rm" - confirm the label (or the constraints) is what was intended.
+  %2 = load i32, ptr %foo, align 4 ; reload the first slot after the asm and return it
+  ret i32 %2
+}
+
+define dso_local i32 @test14(ptr noundef %foo) local_unnamed_addr {
+; O2-LABEL: test14:
+; O2:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    #APP
+; O2:    movq $0, %rax
+; O2:    movq $1, %rcx
+; O2:    movq $2, %rdx
+; O2:    movq $3, %rsi
+; O2:    movq $4, %rdi
+; O2:    movq $5, %rbx
+; O2:    movq $6, %rbp
+; O2:    movq $7, %r8
+; O2:    movq $8, %r9
+; O2:    movq $9, %r10
+; O2:    movq $10, %r11
+; O2:    movq $11, %r12
+; O2:    movq $12, %r13
+; O2:    movq $13, %r14
+; O2:    movq $14, %r15
+; O2:    #NO_APP
+; O2:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O2:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; O2:    movl (%rbp), %esi
+; O2:    movl 4(%rbp), %edi
+; O2:    #APP
+; O2:    # multiple m output, rm input: pressure
+; O2:    # (%rbp) 4(%rbp) 8(%rbp) 12(%rbp) 16(%rbp) %esi %edi
+; O2:    #NO_APP
+; O2:    movq %rax, %rdi
+; O2:    movq %rcx, %rsi
+; O2:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; O2:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; O2:    movq %rbx, %r9
+; O2:    callq g@PLT
+; O2:    movl (%rbp), %eax
+;
+; O0-LABEL: test14:
+; O0:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    #APP
+; O0:    movq $0, %rax
+; O0:    movq $1, %rcx
+; O0:    movq $2, %rdx
+; O0:    movq $3, %rsi
+; O0:    movq $4, %rdi
+; O0:    movq $5, %rbx
+; O0:    movq $6, %rbp
+; O0:    movq $7, %r8
+; O0:    movq $8, %r9
+; O0:    movq $9, %r10
+; O0:    movq $10, %r11
+; O0:    movq $11, %r12
+; O0:    movq $12, %r13
+; O0:    movq $13, %r14
+; O0:    movq $14, %r15
+; O0:    #NO_APP
+; O0:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %r8, %rbx
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; O0:    movq %r9, %rdi
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; O0:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O0:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rdi, %rax
+; O0:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; O0:    movq %rdi, %rcx
+; O0:    movq %rdi, %rdx
+; O0:    movq %rdi, %rsi
+; O0:    movl (%rdi), %eax
+; O0:    movl 4(%rdi), %edi
+; O0:    movl %eax, {{[0-9]+}}(%rsp)
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0:    movl %edi, {{[0-9]+}}(%rsp)
+; O0:    movq %rdi, {{[0-9]+}}(%rsp)
+; O0:    movq %rdi, {{[0-9]+}}(%rsp)
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O0:    #APP
+; O0:    # multiple m output, rm input: pressure
+; O0:    # (%rdi) (%rax) (%rcx) (%rdx) (%rsi) {{[0-9]+}}(%rsp) {{[0-9]+}}(%rsp)
+; O0:    #NO_APP
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O0:    movq %rbp, (%rsp)
+; O0:    movq %rbx, {{[0-9]+}}(%rsp)
+; O0:    movq %rax, {{[0-9]+}}(%rsp)
+; O0:    movq %r10, {{[0-9]+}}(%rsp)
+; O0:    movq %r11, {{[0-9]+}}(%rsp)
+; O0:    movq %r12, {{[0-9]+}}(%rsp)
+; O0:    movq %r13, {{[0-9]+}}(%rsp)
+; O0:    movq %r14, {{[0-9]+}}(%rsp)
+; O0:    movq %r15, {{[0-9]+}}(%rsp)
+; O0:    callq g@PLT
+; O0:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; O0:    movl (%rdi), %eax
+entry: ; The first asm defines 15 i64 values in fixed registers, and all 15 are passed to @g only after the second asm, so they stay live across it (the "pressure" case). The second asm has five indirect "=*m" outputs and two "rm" inputs. The O2 checks expect the outputs addressed off %rbp with the inputs in %esi/%edi; the O0 checks expect the inputs in stack slots.
+  %0 = tail call { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } asm sideeffect "mov $$0, $0\0A\09mov $$1, $1\0A\09mov $$2, $2\0A\09mov $$3, $3\0A\09mov $$4, $4\0A\09mov $$5, $5\0A\09mov $$6, $6\0A\09mov $$7, $7\0A\09mov $$8, $8\0A\09mov $$9, $9\0A\09mov $$10, $10\0A\09mov $$11, $11\0A\09mov $$12, $12\0A\09mov $$13, $13\0A\09mov $$14, $14", "={rax},={rcx},={rdx},={rsi},={rdi},={rbx},={rbp},={r8},={r9},={r10},={r11},={r12},={r13},={r14},={r15},~{dirflag},~{fpsr},~{flags}"()
+  %asmresult = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 0
+  %asmresult1 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 1
+  %asmresult2 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 2
+  %asmresult3 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 3
+  %asmresult4 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 4
+  %asmresult5 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 5
+  %asmresult6 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 6
+  %asmresult7 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 7
+  %asmresult8 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 8
+  %asmresult9 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 9
+  %asmresult10 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 10
+  %asmresult11 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 11
+  %asmresult12 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 12
+  %asmresult13 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 13
+  %asmresult14 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 } %0, 14
+  %b = getelementptr inbounds nuw i8, ptr %foo, i64 4
+  %c = getelementptr inbounds nuw i8, ptr %foo, i64 8
+  %d = getelementptr inbounds nuw i8, ptr %foo, i64 12
+  %e = getelementptr inbounds nuw i8, ptr %foo, i64 16
+  %1 = load i32, ptr %foo, align 4
+  %2 = load i32, ptr %b, align 4
+  tail call void asm sideeffect "# multiple m output, rm input: pressure\0A\09# $0 $1 $2 $3 $4 $5 $6", "=*m,=*m,=*m,=*m,=*m,rm,rm,~{dirflag},~{fpsr},~{flags}"(ptr nonnull elementtype(i32) %foo, ptr nonnull elementtype(i32) %b, ptr nonnull elementtype(i32) %c, ptr nonnull elementtype(i32) %d, ptr nonnull elementtype(i32) %e, i32 %1, i32 %2)
+  tail call void @g(i64 noundef %asmresult, i64 noundef %asmresult1, i64 noundef %asmresult2, i64 noundef %asmresult3, i64 noundef %asmresult4, i64 noundef %asmresult5, i64 noundef %asmresult6, i64 noundef %asmresult7, i64 noundef %asmresult8, i64 noundef %asmresult9, i64 noundef %asmresult10, i64 noundef %asmresult11, i64 noundef %asmresult12, i64 noundef %asmresult13, i64 noundef %asmresult14) ; consuming all 15 results here is what keeps them live across the second asm
+  %3 = load i32, ptr %foo, align 4
+  ret i32 %3
+}
+
+declare void @g(i64 noundef, i64 noundef, i64 noundef, i64 noundef, i64 noundef, i64 noundef, i64 noundef, i64 noundef, i64 noundef, i64 noundef, i64 noundef, i64 noundef, i64 noundef, i64 noundef, i64 noundef) ; External callee with 15 i64 parameters; test14 passes it the 15 fixed-register asm results.
diff --git a/llvm/test/CodeGen/X86/inline-asm-prepare-memory.ll b/llvm/test/CodeGen/X86/inline-asm-prepare-memory.ll
new file mode 100644
index 0000000000000..3cd664ab08754
--- /dev/null
+++ b/llvm/test/CodeGen/X86/inline-asm-prepare-memory.ll
@@ -0,0 +1,51 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=x86_64-unknown-linux-gnu -inline-asm-prepare < %s | FileCheck %s
+
+define void @test1(i32 %x) {
+; CHECK-LABEL: define void @test1(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[ASM_MEM:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 [[X]], ptr [[ASM_MEM]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 asm sideeffect "mov $1, $0", "=r,rm,~{dirflag},~{fpsr},~{flags}"(ptr [[ASM_MEM]])
+; CHECK-NEXT:    ret void
+;
+entry: ; Register-or-memory ("rm") input operand with a plain "=r" output. The expectations above show %x being stored to a fresh i32 alloca and that pointer being passed to the asm in place of the value.
+  %0 = call i32 asm sideeffect "mov $1, $0", "=r,rm,~{dirflag},~{fpsr},~{flags}"(i32 %x)
+  ret void
+}
+
+define void @test2(ptr %p) {
+; CHECK-LABEL: define void @test2(
+; CHECK-SAME: ptr [[P:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[ASM_MEM:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    call void asm sideeffect "mov $1, $0", "=*rm,~{dirflag},~{fpsr},~{flags}"(ptr elementtype(i32) [[ASM_MEM]])
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ASM_MEM]], align 4
+; CHECK-NEXT:    store i32 [[TMP0]], ptr [[P]], align 4
+; CHECK-NEXT:    ret void
+;
+entry: ; Register-or-memory ("=rm") output operand returned by value. The expectations above show the output becoming indirect ("=*rm") through a fresh i32 alloca, whose contents are then loaded and stored to %p.
+  %0 = call i32 asm sideeffect "mov $1, $0", "=rm,~{dirflag},~{fpsr},~{flags}"()
+  store i32 %0, ptr %p
+  ret void
+}
+
+define void @test3(ptr %x_ptr) {
+; CHECK-LABEL: define void @test3(
+; CHECK-SAME: ptr [[X_PTR:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[ASM_MEM:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[X:%.*]] = load i32, ptr [[X_PTR]], align 4
+; CHECK-NEXT:    store i32 [[X]], ptr [[ASM_MEM]], align 4
+; CHECK-NEXT:    call void asm sideeffect "inc $0", "=*rm,0,~{dirflag},~{fpsr},~{flags}"(ptr elementtype(i32) [[ASM_MEM]], ptr [[ASM_MEM]])
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ASM_MEM]], align 4
+; CHECK-NEXT:    store i32 [[TMP0]], ptr [[X_PTR]], align 4
+; CHECK-NEXT:    ret void
+;
+entry: ; "=rm" output with an input tied to it ("0"), i.e. a read-modify-write operand. The expectations above show %x being stored to a fresh i32 alloca, that alloca being passed for both the output and the tied input, and the result being reloaded and stored back to %x_ptr.
+  %x = load i32, ptr %x_ptr
+  %0 = call i32 asm sideeffect "inc $0", "=rm,0,~{dirflag},~{fpsr},~{flags}"(i32 %x)
+  store i32 %0, ptr %x_ptr
+  ret void
+}
diff --git a/llvm/test/CodeGen/X86/inlineasm-sched-bug.ll b/llvm/test/CodeGen/X86/inlineasm-sched-bug.ll
index be4d1c29332f7..a322bd3003a58 100644
--- a/llvm/test/CodeGen/X86/inlineasm-sched-bug.ll
+++ b/llvm/test/CodeGen/X86/inlineasm-sched-bug.ll
@@ -6,16 +6,13 @@
 define i32 @foo(i32 %treemap) nounwind {
 ; CHECK-LABEL: foo:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pushl %eax
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movl %eax, %ecx
 ; CHECK-NEXT:    negl %ecx
 ; CHECK-NEXT:    andl %eax, %ecx
-; CHECK-NEXT:    movl %ecx, (%esp)
 ; CHECK-NEXT:    #APP
-; CHECK-NEXT:    bsfl (%esp), %eax
+; CHECK-NEXT:    bsfl %ecx, %eax
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    popl %ecx
 ; CHECK-NEXT:    retl
 entry:
   %sub = sub i32 0, %treemap