diff --git a/llvm/include/llvm/CodeGen/MachinePipeliner.h b/llvm/include/llvm/CodeGen/MachinePipeliner.h
index 170f20182001fa..4559f7a9bde787 100644
--- a/llvm/include/llvm/CodeGen/MachinePipeliner.h
+++ b/llvm/include/llvm/CodeGen/MachinePipeliner.h
@@ -84,6 +84,8 @@ class MachinePipeliner : public MachineFunctionPass {
     SmallVector<MachineOperand, 4> BrCond;
     MachineInstr *LoopInductionVar = nullptr;
     MachineInstr *LoopCompare = nullptr;
+    std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> LoopPipelinerInfo =
+        nullptr;
   };
   LoopInfo LI;
 
@@ -119,6 +121,7 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
   LiveIntervals &LIS;
   const RegisterClassInfo &RegClassInfo;
   unsigned II_setByPragma = 0;
+  TargetInstrInfo::PipelinerLoopInfo *LoopPipelinerInfo = nullptr;
 
   /// A toplogical ordering of the SUnits, which is needed for changing
   /// dependences and iterating over the SUnits.
@@ -196,9 +199,11 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
 
 public:
   SwingSchedulerDAG(MachinePipeliner &P, MachineLoop &L, LiveIntervals &lis,
-                    const RegisterClassInfo &rci, unsigned II)
+                    const RegisterClassInfo &rci, unsigned II,
+                    TargetInstrInfo::PipelinerLoopInfo *PLI)
       : ScheduleDAGInstrs(*P.MF, P.MLI, false), Pass(P), Loop(L), LIS(lis),
-        RegClassInfo(rci), II_setByPragma(II), Topo(SUnits, &ExitSU) {
+        RegClassInfo(rci), II_setByPragma(II), LoopPipelinerInfo(PLI),
+        Topo(SUnits, &ExitSU) {
     P.MF->getSubtarget().getSMSMutations(Mutations);
     if (SwpEnableCopyToPhi)
       Mutations.push_back(std::make_unique<CopyToPhiMutation>());
@@ -589,6 +594,13 @@ class SMSchedule {
     return ScheduledInstrs[cycle];
   }
 
+  SmallSet<SUnit *, 8>
+  computeUnpipelineableNodes(SwingSchedulerDAG *SSD,
+                             TargetInstrInfo::PipelinerLoopInfo *PLI);
+
+  bool
+  normalizeNonPipelinedInstructions(SwingSchedulerDAG *SSD,
+                                    TargetInstrInfo::PipelinerLoopInfo *PLI);
   bool isValidSchedule(SwingSchedulerDAG *SSD);
   void finalizeSchedule(SwingSchedulerDAG *SSD);
   void orderDependence(SwingSchedulerDAG *SSD, SUnit *SU,
diff --git a/llvm/include/llvm/CodeGen/ModuloSchedule.h b/llvm/include/llvm/CodeGen/ModuloSchedule.h
index 4045df807cdb37..c515101e80fdfa 100644
--- a/llvm/include/llvm/CodeGen/ModuloSchedule.h
+++ b/llvm/include/llvm/CodeGen/ModuloSchedule.h
@@ -191,8 +191,8 @@ class ModuloScheduleExpander {
   void generateProlog(unsigned LastStage, MachineBasicBlock *KernelBB,
                       ValueMapTy *VRMap, MBBVectorTy &PrologBBs);
   void generateEpilog(unsigned LastStage, MachineBasicBlock *KernelBB,
-                      ValueMapTy *VRMap, MBBVectorTy &EpilogBBs,
-                      MBBVectorTy &PrologBBs);
+                      MachineBasicBlock *OrigBB, ValueMapTy *VRMap,
+                      MBBVectorTy &EpilogBBs, MBBVectorTy &PrologBBs);
   void generateExistingPhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1,
                             MachineBasicBlock *BB2, MachineBasicBlock *KernelBB,
                             ValueMapTy *VRMap, InstrMapTy &InstrMap,
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index 0bffa9154fc4b3..9ea6e9b981721c 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -255,6 +255,7 @@ bool MachinePipeliner::scheduleLoop(MachineLoop &L) {
              << "Failed to pipeline loop";
     });
 
+    LI.LoopPipelinerInfo.reset();
     return Changed;
   }
 
@@ -262,6 +263,7 @@ bool MachinePipeliner::scheduleLoop(MachineLoop &L) {
 
   Changed = swingModuloScheduler(L);
 
+  LI.LoopPipelinerInfo.reset();
   return Changed;
 }
 
@@ -354,7 +356,8 @@ bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {
   LI.LoopInductionVar = nullptr;
   LI.LoopCompare = nullptr;
-  if (!TII->analyzeLoopForPipelining(L.getTopBlock())) {
+  LI.LoopPipelinerInfo = TII->analyzeLoopForPipelining(L.getTopBlock());
+  if (!LI.LoopPipelinerInfo) {
     LLVM_DEBUG(dbgs() << "Unable to analyzeLoop, can NOT pipeline Loop\n");
     NumFailLoop++;
     ORE->emit([&]() {
@@ -419,7 +422,7 @@ bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) {
   assert(L.getBlocks().size() == 1 && "SMS works on single blocks only.");
 
   SwingSchedulerDAG SMS(*this, L, getAnalysis<LiveIntervals>(), RegClassInfo,
-                        II_setByPragma);
+                        II_setByPragma, LI.LoopPipelinerInfo.get());
 
   MachineBasicBlock *MBB = L.getHeader();
   // The kernel should not include any terminator instructions.  These
@@ -1422,7 +1425,7 @@ void SwingSchedulerDAG::CopyToPhiMutation::apply(ScheduleDAGInstrs *DAG) {
 /// We ignore the back-edge recurrence in order to avoid unbounded recursion
 /// in the calculation of the ASAP, ALAP, etc functions.
 static bool ignoreDependence(const SDep &D, bool isPred) {
-  if (D.isArtificial())
+  if (D.isArtificial() || D.getSUnit()->isBoundaryNode())
     return true;
   return D.getKind() == SDep::Anti && isPred;
 }
@@ -1471,6 +1474,8 @@ void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) {
     SUnit *SU = &SUnits[I];
     for (const SDep &S : SU->Succs) {
       SUnit *succ = S.getSUnit();
+      if (succ->isBoundaryNode())
+        continue;
       if (S.getLatency() == 0)
         zeroLatencyHeight =
             std::max(zeroLatencyHeight, getZeroLatencyHeight(succ) + 1);
@@ -1788,7 +1793,8 @@ void SwingSchedulerDAG::addConnectedNodes(SUnit *SU, NodeSet &NewSet,
   NodesAdded.insert(SU);
   for (auto &SI : SU->Succs) {
     SUnit *Successor = SI.getSUnit();
-    if (!SI.isArtificial() && NodesAdded.count(Successor) == 0)
+    if (!SI.isArtificial() && !Successor->isBoundaryNode() &&
+        NodesAdded.count(Successor) == 0)
       addConnectedNodes(Successor, NewSet, NodesAdded);
   }
   for (auto &PI : SU->Preds) {
@@ -2080,6 +2086,11 @@ bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
     });
   } while (++NI != NE && scheduleFound);
 
+  // If a schedule is found, ensure non-pipelined instructions are in stage 0
+  if (scheduleFound)
+    scheduleFound =
+        Schedule.normalizeNonPipelinedInstructions(this, LoopPipelinerInfo);
+
   // If a schedule is found, check if it is a valid schedule too.
   if (scheduleFound)
     scheduleFound = Schedule.isValidSchedule(this);
 
@@ -2263,7 +2274,7 @@ MachineInstr *SwingSchedulerDAG::findDefInLoop(Register Reg) {
 bool SwingSchedulerDAG::isLoopCarriedDep(SUnit *Source, const SDep &Dep,
                                          bool isSucc) {
   if ((Dep.getKind() != SDep::Order && Dep.getKind() != SDep::Output) ||
-      Dep.isArtificial())
+      Dep.isArtificial() || Dep.getSUnit()->isBoundaryNode())
     return false;
 
   if (!SwpPruneLoopCarried)
@@ -2430,7 +2441,7 @@ int SMSchedule::latestCycleInChain(const SDep &Dep) {
   while (!Worklist.empty()) {
     const SDep &Cur = Worklist.pop_back_val();
     SUnit *SuccSU = Cur.getSUnit();
-    if (Visited.count(SuccSU))
+    if (Visited.count(SuccSU) || SuccSU->isBoundaryNode())
       continue;
     std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SuccSU);
     if (it == InstrToCycle.end())
@@ -2697,21 +2708,91 @@ bool SMSchedule::isLoopCarriedDefOfUse(SwingSchedulerDAG *SSD,
   return false;
 }
 
+/// Determine transitive dependences of unpipelineable instructions
+SmallSet<SUnit *, 8> SMSchedule::computeUnpipelineableNodes(
+    SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI) {
+  SmallSet<SUnit *, 8> DoNotPipeline;
+  SmallVector<SUnit *, 8> Worklist;
+
+  for (auto &SU : SSD->SUnits)
+    if (SU.isInstr() && PLI->shouldIgnoreForPipelining(SU.getInstr()))
+      Worklist.push_back(&SU);
+
+  while (!Worklist.empty()) {
+    auto SU = Worklist.pop_back_val();
+    if (DoNotPipeline.count(SU))
+      continue;
+    LLVM_DEBUG(dbgs() << "Do not pipeline SU(" << SU->NodeNum << ")\n");
+    DoNotPipeline.insert(SU);
+    for (auto &Dep : SU->Preds)
+      Worklist.push_back(Dep.getSUnit());
+    if (SU->getInstr()->isPHI())
+      for (auto &Dep : SU->Succs)
+        if (Dep.getKind() == SDep::Anti)
+          Worklist.push_back(Dep.getSUnit());
+  }
+  return DoNotPipeline;
+}
+
+// Determine all instructions upon which any unpipelineable instruction depends
+// and ensure that they are in stage 0. If unable to do so, return false.
+bool SMSchedule::normalizeNonPipelinedInstructions(
+    SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI) {
+  SmallSet<SUnit *, 8> DNP = computeUnpipelineableNodes(SSD, PLI);
+
+  int NewLastCycle = INT_MIN;
+  for (SUnit &SU : SSD->SUnits) {
+    if (!SU.isInstr())
+      continue;
+    if (!DNP.contains(&SU) || stageScheduled(&SU) == 0) {
+      NewLastCycle = std::max(NewLastCycle, InstrToCycle[&SU]);
+      continue;
+    }
+
+    // Put the non-pipelined instruction as early as possible in the schedule
+    int NewCycle = getFirstCycle();
+    for (auto &Dep : SU.Preds)
+      NewCycle = std::max(InstrToCycle[Dep.getSUnit()], NewCycle);
+
+    int OldCycle = InstrToCycle[&SU];
+    if (OldCycle != NewCycle) {
+      InstrToCycle[&SU] = NewCycle;
+      auto &OldS = getInstructions(OldCycle);
+      OldS.erase(std::remove(OldS.begin(), OldS.end(), &SU), OldS.end());
+      getInstructions(NewCycle).emplace_back(&SU);
+      LLVM_DEBUG(dbgs() << "SU(" << SU.NodeNum
+                        << ") is not pipelined; moving from cycle " << OldCycle
+                        << " to " << NewCycle << " Instr:" << *SU.getInstr());
+    }
+    NewLastCycle = std::max(NewLastCycle, NewCycle);
+  }
+  LastCycle = NewLastCycle;
+  return true;
+}
+
 // Check if the generated schedule is valid. This function checks if
 // an instruction that uses a physical register is scheduled in a
 // different stage than the definition. The pipeliner does not handle
 // physical register values that may cross a basic block boundary.
+// Furthermore, if a physical def/use pair is assigned to the same
+// cycle, orderDependence does not guarantee def/use ordering, so that
+// case should be considered invalid. (The test checks for both
+// earlier and same-cycle use to be more robust.)
 bool SMSchedule::isValidSchedule(SwingSchedulerDAG *SSD) {
   for (SUnit &SU : SSD->SUnits) {
     if (!SU.hasPhysRegDefs)
       continue;
     int StageDef = stageScheduled(&SU);
+    int CycleDef = InstrToCycle[&SU];
     assert(StageDef != -1 && "Instruction should have been scheduled.");
     for (auto &SI : SU.Succs)
-      if (SI.isAssignedRegDep())
-        if (Register::isPhysicalRegister(SI.getReg()))
+      if (SI.isAssignedRegDep() && !SI.getSUnit()->isBoundaryNode())
+        if (Register::isPhysicalRegister(SI.getReg())) {
           if (stageScheduled(SI.getSUnit()) != StageDef)
             return false;
+          if (InstrToCycle[SI.getSUnit()] <= CycleDef)
+            return false;
+        }
   }
   return true;
 }
diff --git a/llvm/lib/CodeGen/ModuloSchedule.cpp b/llvm/lib/CodeGen/ModuloSchedule.cpp
index b974fa9846f250..20aecdf222e26b 100644
--- a/llvm/lib/CodeGen/ModuloSchedule.cpp
+++ b/llvm/lib/CodeGen/ModuloSchedule.cpp
@@ -158,7 +158,7 @@ void ModuloScheduleExpander::generatePipelinedLoop() {
   SmallVector<MachineBasicBlock *, 4> EpilogBBs;
 
   // Generate the epilog instructions to complete the pipeline.
-  generateEpilog(MaxStageCount, KernelBB, VRMap, EpilogBBs, PrologBBs);
+  generateEpilog(MaxStageCount, KernelBB, BB, VRMap, EpilogBBs, PrologBBs);
 
   // We need this step because the register allocation doesn't handle some
   // situations well, so we insert copies to help out.
@@ -240,11 +240,9 @@ void ModuloScheduleExpander::generateProlog(unsigned LastStage,
 /// Generate the pipeline epilog code. The epilog code finishes the iterations
 /// that were started in either the prolog or the kernel. We create a basic
 /// block for each stage that needs to complete.
-void ModuloScheduleExpander::generateEpilog(unsigned LastStage,
-                                            MachineBasicBlock *KernelBB,
-                                            ValueMapTy *VRMap,
-                                            MBBVectorTy &EpilogBBs,
-                                            MBBVectorTy &PrologBBs) {
+void ModuloScheduleExpander::generateEpilog(
+    unsigned LastStage, MachineBasicBlock *KernelBB, MachineBasicBlock *OrigBB,
+    ValueMapTy *VRMap, MBBVectorTy &EpilogBBs, MBBVectorTy &PrologBBs) {
   // We need to change the branch from the kernel to the first epilog block, so
   // this call to analyze branch uses the kernel rather than the original BB.
   MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
@@ -314,7 +312,12 @@ void ModuloScheduleExpander::generateEpilog(unsigned LastStage,
   // Create a branch to the new epilog from the kernel.
   // Remove the original branch and add a new branch to the epilog.
   TII->removeBranch(*KernelBB);
-  TII->insertBranch(*KernelBB, KernelBB, EpilogStart, Cond, DebugLoc());
+  assert((OrigBB == TBB || OrigBB == FBB) &&
+         "Unable to determine looping branch direction");
+  if (OrigBB != TBB)
+    TII->insertBranch(*KernelBB, EpilogStart, KernelBB, Cond, DebugLoc());
+  else
+    TII->insertBranch(*KernelBB, KernelBB, EpilogStart, Cond, DebugLoc());
   // Add a branch to the loop exit.
   if (EpilogBBs.size() > 0) {
     MachineBasicBlock *LastEpilogBB = EpilogBBs.back();
diff --git a/llvm/lib/Target/ARM/ARM.td b/llvm/lib/Target/ARM/ARM.td
index d9bc2827f7d299..4bcba38efe02d5 100644
--- a/llvm/lib/Target/ARM/ARM.td
+++ b/llvm/lib/Target/ARM/ARM.td
@@ -494,6 +494,10 @@ def FeatureNoNegativeImmediates
 def FeatureUseMISched: SubtargetFeature<"use-misched", "UseMISched", "true",
                                         "Use the MachineScheduler">;
 
+// Use the MachinePipeliner for instruction scheduling for the subtarget.
+def FeatureUseMIPipeliner: SubtargetFeature<"use-mipipeliner", "UseMIPipeliner", "true",
+                                            "Use the MachinePipeliner">;
+
 // False if scheduling should happen again after register allocation.
 def FeatureNoPostRASched : SubtargetFeature<"disable-postra-scheduler",
     "DisablePostRAScheduler", "true",
@@ -1395,6 +1399,7 @@ def : ProcessorModel<"cortex-m4", CortexM4Model, [ARMv7em,
 def : ProcessorModel<"cortex-m7", CortexM7Model,  [ARMv7em,
                                                    ProcM7,
                                                    FeatureFPARMv8_D16,
+                                                   FeatureUseMIPipeliner,
                                                    FeatureUseMISched]>;
 
 def : ProcNoItin<"cortex-m23",                          [ARMv8mBaseline,
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 9bb9df536b2eef..9a7960268a7589 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -6721,3 +6721,77 @@ unsigned llvm::getBLXpredOpcode(const MachineFunction &MF) {
   return (MF.getSubtarget<ARMSubtarget>().hardenSlsBlr()) ? ARM::BLX_pred_noip
                                                           : ARM::BLX_pred;
 }
+
+namespace {
+class ARMPipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
+  MachineInstr *EndLoop, *LoopCount;
+  MachineFunction *MF;
+  const TargetInstrInfo *TII;
+
+  // Meanings of the members for each supported loop type:
+  // t2Bcc:
+  //   Loop = null -- there is no setup.
+  //   EndLoop = branch at end of original BB that will become a kernel
+  //   LoopCount = CC setter live into branch
+public:
+  ARMPipelinerLoopInfo(MachineInstr *EndLoop, MachineInstr *LoopCount)
+      : EndLoop(EndLoop), LoopCount(LoopCount),
+        MF(EndLoop->getParent()->getParent()),
+        TII(MF->getSubtarget().getInstrInfo()) {}
+
+  bool shouldIgnoreForPipelining(const MachineInstr *MI) const override {
+    // Ignore the branch and the instruction that sets the condition flags it
+    // tests.
+    return MI == EndLoop || MI == LoopCount;
+  }
+
+  Optional<bool> createTripCountGreaterCondition(
+      int TC, MachineBasicBlock &MBB,
+      SmallVectorImpl<MachineOperand> &Cond) override {
+
+    if (isCondBranchOpcode(EndLoop->getOpcode())) {
+      Cond.push_back(EndLoop->getOperand(1));
+      Cond.push_back(EndLoop->getOperand(2));
+      if (EndLoop->getOperand(0).getMBB() == EndLoop->getParent()) {
+        TII->reverseBranchCondition(Cond);
+      }
+      return {};
+    } else
+      llvm_unreachable("Unknown EndLoop");
+  }
+
+  void setPreheader(MachineBasicBlock *NewPreheader) override {}
+
+  void adjustTripCount(int TripCountAdjust) override {}
+
+  void disposed() override {}
+};
+} // namespace
+
+std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
+ARMBaseInstrInfo::analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const {
+  MachineBasicBlock::iterator I = LoopBB->getFirstTerminator();
+  MachineBasicBlock *Preheader = *LoopBB->pred_begin();
+  if (Preheader == LoopBB)
+    Preheader = *std::next(LoopBB->pred_begin());
+
+  if (I != LoopBB->end() && I->getOpcode() == ARM::t2Bcc) {
+    // If the branch is a Bcc, then the CPSR should be set somewhere within the
+    // block. We need to determine the reaching definition of CPSR so that
+    // it can be marked as non-pipelineable, allowing the pipeliner to force
+    // it into stage 0 or give up if it cannot or will not do so.
+    MachineInstr *CCSetter = nullptr;
+    for (auto &L : LoopBB->instrs()) {
+      if (L.isCall())
+        return nullptr;
+      if (isCPSRDefined(L))
+        CCSetter = &L;
+    }
+    if (CCSetter)
+      return std::make_unique<ARMPipelinerLoopInfo>(&*I, CCSetter);
+    else
+      return nullptr; // Unable to find the CC setter, so unable to guarantee
+                      // that pipelining will work
+  }
+
+  return nullptr;
+}
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
index ab9643592724db..40acb27d1eb1ff 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -372,6 +372,11 @@ class ARMBaseInstrInfo : public ARMGenInstrInfo {
            MI->getOpcode() == ARM::t2WhileLoopStartTP;
   }
 
+  /// Analyze loop L, which must be a single-basic-block loop, and if the
+  /// conditions can be understood enough produce a PipelinerLoopInfo object.
+  std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
+  analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const override;
+
 private:
   /// Returns an unused general-purpose register which can be used for
   /// constructing an outlined call if one exists. Returns 0 otherwise.
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.cpp b/llvm/lib/Target/ARM/ARMSubtarget.cpp
index b62f447e8d5818..89e5b8762d80fc 100644
--- a/llvm/lib/Target/ARM/ARMSubtarget.cpp
+++ b/llvm/lib/Target/ARM/ARMSubtarget.cpp
@@ -393,6 +393,14 @@ bool ARMSubtarget::enableSubRegLiveness() const {
   return hasMVEIntegerOps();
 }
 
+bool ARMSubtarget::enableMachinePipeliner() const {
+  // Enable the MachinePipeliner before register allocation for subtargets
+  // with the use-mipipeliner feature.
+  return getSchedModel().hasInstrSchedModel() && useMachinePipeliner();
+}
+
+bool ARMSubtarget::useDFAforSMS() const { return false; }
+
 // This overrides the PostRAScheduler bit in the SchedModel for any CPU.
 bool ARMSubtarget::enablePostRAScheduler() const {
   if (enableMachineScheduler())
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h
index d426157c545362..8d56c70e80949a 100644
--- a/llvm/lib/Target/ARM/ARMSubtarget.h
+++ b/llvm/lib/Target/ARM/ARMSubtarget.h
@@ -417,6 +417,7 @@ class ARMSubtarget : public ARMGenSubtargetInfo {
   bool isRWPI() const;
 
   bool useMachineScheduler() const { return UseMISched; }
+  bool useMachinePipeliner() const { return UseMIPipeliner; }
   bool hasMinSize() const { return OptMinSize; }
   bool isThumb1Only() const { return isThumb() && !hasThumb2(); }
   bool isThumb2() const { return isThumb() && hasThumb2(); }
@@ -465,6 +466,10 @@ class ARMSubtarget : public ARMGenSubtargetInfo {
   /// Returns true if machine scheduler should be enabled.
   bool enableMachineScheduler() const override;
 
+  /// Returns true if machine pipeliner should be enabled.
+  bool enableMachinePipeliner() const override;
+  bool useDFAforSMS() const override;
+
   /// True for some subtargets at > -O0.
   bool enablePostRAScheduler() const override;
diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
index 401a00841747d6..dadf7a557238cc 100644
--- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
@@ -509,6 +509,9 @@ bool ARMPassConfig::addGlobalInstructionSelect() {
 
 void ARMPassConfig::addPreRegAlloc() {
   if (getOptLevel() != CodeGenOpt::None) {
+    if (getOptLevel() == CodeGenOpt::Aggressive)
+      addPass(&MachinePipelinerID);
+
     addPass(createMVETPAndVPTOptimisationsPass());
 
     addPass(createMLxExpansionPass());
diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll
index 10c56a3c495b53..6e55134581113d 100644
--- a/llvm/test/CodeGen/ARM/O3-pipeline.ll
+++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll
@@ -95,6 +95,13 @@
 ; CHECK-NEXT:      Peephole Optimizations
 ; CHECK-NEXT:      Remove dead machine instructions
 ; CHECK-NEXT:      MachineDominator Tree Construction
+; CHECK-NEXT:      Slot index numbering
+; CHECK-NEXT:      Live Interval Analysis
+; CHECK-NEXT:      Lazy Machine Block Frequency Analysis
+; CHECK-NEXT:      Machine Optimization Remark Emitter
+; CHECK-NEXT:      Modulo Software Pipelining
+; CHECK-NEXT:      MachineDominator Tree Construction
+; CHECK-NEXT:      Machine Natural Loop Construction
 ; CHECK-NEXT:      MVE TailPred and VPT Optimisation Pass
 ; CHECK-NEXT:      ARM MLA / MLS expansion pass
 ; CHECK-NEXT:      MachineDominator Tree Construction
diff --git a/llvm/test/CodeGen/Thumb2/swp-exitbranchdir.mir b/llvm/test/CodeGen/Thumb2/swp-exitbranchdir.mir
new file mode 100644
index 00000000000000..3f3ff5e4bd290a
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/swp-exitbranchdir.mir
@@ -0,0 +1,203 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=thumbv7m-none-eabi -mcpu=cortex-m7 -run-pass=pipeliner -o - %s | FileCheck %s --check-prefix=CHECK
+
+--- |
+  define hidden float @dot(float* nocapture noundef readonly %a, float* nocapture noundef readonly %b, i32 noundef %sz) local_unnamed_addr #0 {
+  entry:
+    %cmp8 = icmp sgt i32 %sz, 0
+    br i1 %cmp8, label %for.body.preheader, label %for.end
+
+  for.body.preheader:                               ; preds = %entry
+    %scevgep = getelementptr float, float* %b, i32 -1
+    %scevgep4 = getelementptr float, float* %a, i32 -1
+    br label %for.body
+
+  for.body:                                         ; preds = %for.body.preheader, %for.body
+    %lsr.iv5 = phi float* [ %scevgep4, %for.body.preheader ], [ %scevgep6, %for.body ]
+    %lsr.iv1 = phi float* [ %scevgep, %for.body.preheader ], [ %scevgep2, %for.body ]
+    %lsr.iv = phi i32 [ %sz, %for.body.preheader ], [ %lsr.iv.next, %for.body ]
+    %sum.010 = phi float [ %add, %for.body ], [ 0.000000e+00, %for.body.preheader ]
+    %scevgep7 = getelementptr float, float* %lsr.iv5, i32 1
+    %0 = load float, float* %scevgep7, align 4
+    %scevgep3 = getelementptr float, float* %lsr.iv1, i32 1
+    %1 = load float, float* %scevgep3, align 4
+    %mul = fmul fast float %1, %0
+    %add = fadd fast float %mul, %sum.010
+    %lsr.iv.next = add i32 %lsr.iv, -1
+    %scevgep2 = getelementptr float, float* %lsr.iv1, i32 1
+    %scevgep6 = getelementptr float, float* %lsr.iv5, i32 1
+    %exitcond.not = icmp ne i32 %lsr.iv.next, 0
+    br i1 %exitcond.not, label %for.body, label %for.end, !llvm.loop !0
+
+  for.end:                                          ; preds = %for.body, %entry
+    %sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
+    ret float %sum.0.lcssa
+  }
+
+  !0 = distinct !{!0, !1, !2, !3}
+  !1 = !{!"llvm.loop.mustprogress"}
+  !2 = !{!"llvm.loop.unroll.disable"}
+  !3 = !{!"llvm.loop.pipeline.initiationinterval", i32 3}
+
+...
+--- +name: dot +alignment: 2 +tracksRegLiveness: true +constants: + - id: 0 + value: 'float 0.000000e+00' + alignment: 4 + isTargetSpecific: false +body: | + ; CHECK-LABEL: name: dot + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.2(0x50000000), %bb.1(0x30000000) + ; CHECK-NEXT: liveins: $r0, $r1, $r2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnopc = COPY $r2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gprnopc = COPY $r1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gprnopc = COPY $r0 + ; CHECK-NEXT: t2CMPri [[COPY]], 1, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK-NEXT: t2Bcc %bb.2, 10 /* CC::ge */, $cpsr + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[VLDRS:%[0-9]+]]:spr = VLDRS %const.0, 0, 14 /* CC::al */, $noreg :: (load (s32) from constant-pool) + ; CHECK-NEXT: t2B %bb.4, 14 /* CC::al */, $noreg + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2.for.body.preheader: + ; CHECK-NEXT: successors: %bb.5(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[t2SUBri:%[0-9]+]]:rgpr = t2SUBri [[COPY1]], 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gprnopc = COPY [[t2SUBri]] + ; CHECK-NEXT: [[t2SUBri1:%[0-9]+]]:rgpr = t2SUBri [[COPY2]], 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK-NEXT: [[VLDRS1:%[0-9]+]]:spr = VLDRS %const.0, 0, 14 /* CC::al */, $noreg :: (load (s32) from constant-pool) + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gprnopc = COPY [[t2SUBri1]] + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.5.for.body: + ; CHECK-NEXT: successors: %bb.6(0x40000000), %bb.9(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[t2ADDri:%[0-9]+]]:rgpr = t2ADDri [[COPY4]], 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK-NEXT: [[VLDRS2:%[0-9]+]]:spr = VLDRS [[COPY4]], 1, 14 /* CC::al */, $noreg :: (load (s32) from %ir.scevgep7) + ; CHECK-NEXT: [[t2ADDri1:%[0-9]+]]:rgpr = t2ADDri [[COPY3]], 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK-NEXT: [[VLDRS3:%[0-9]+]]:spr = VLDRS [[COPY3]], 1, 14 /* CC::al */, $noreg :: (load (s32) from %ir.scevgep3) + ; CHECK-NEXT: [[VMULS:%[0-9]+]]:spr = nnan ninf nsz arcp contract afn reassoc VMULS [[VLDRS3]], [[VLDRS2]], 14 /* CC::al */, $noreg + ; CHECK-NEXT: [[t2SUBri2:%[0-9]+]]:rgpr = t2SUBri [[COPY]], 1, 14 /* CC::al */, $noreg, def $cpsr + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gprnopc = COPY [[t2SUBri2]] + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gprnopc = COPY [[t2ADDri1]] + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gprnopc = COPY [[t2ADDri]] + ; CHECK-NEXT: t2Bcc %bb.9, 0 /* CC::eq */, $cpsr + ; CHECK-NEXT: t2B %bb.6, 14 /* CC::al */, $noreg + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.6.for.body: + ; CHECK-NEXT: successors: %bb.7(0x80000000), %bb.8(0x00000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[t2ADDri2:%[0-9]+]]:rgpr = t2ADDri [[COPY7]], 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK-NEXT: [[VLDRS4:%[0-9]+]]:spr = VLDRS [[COPY7]], 1, 14 /* CC::al */, $noreg :: (load unknown-size from %ir.scevgep7, align 4) + ; CHECK-NEXT: [[t2ADDri3:%[0-9]+]]:rgpr = t2ADDri [[COPY6]], 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK-NEXT: [[VLDRS5:%[0-9]+]]:spr = VLDRS [[COPY6]], 1, 14 /* CC::al */, $noreg :: (load unknown-size from %ir.scevgep3, align 4) + ; CHECK-NEXT: [[VMULS1:%[0-9]+]]:spr = nnan ninf nsz arcp contract afn reassoc VMULS [[VLDRS5]], [[VLDRS4]], 14 /* CC::al */, $noreg + ; CHECK-NEXT: [[t2SUBri3:%[0-9]+]]:rgpr = t2SUBri [[COPY5]], 1, 14 /* CC::al */, $noreg, def $cpsr + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:gpr = COPY [[t2SUBri3]] + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:gpr = COPY [[t2ADDri3]] + ; CHECK-NEXT: 
[[COPY10:%[0-9]+]]:gpr = COPY [[t2ADDri2]] + ; CHECK-NEXT: t2Bcc %bb.8, 0 /* CC::eq */, $cpsr + ; CHECK-NEXT: t2B %bb.7, 14 /* CC::al */, $noreg + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.7.for.body: + ; CHECK-NEXT: successors: %bb.8(0x04000000), %bb.7(0x7c000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:gprnopc = PHI [[COPY10]], %bb.6, %49, %bb.7 + ; CHECK-NEXT: [[PHI1:%[0-9]+]]:gprnopc = PHI [[COPY9]], %bb.6, %50, %bb.7 + ; CHECK-NEXT: [[PHI2:%[0-9]+]]:gprnopc = PHI [[COPY8]], %bb.6, %51, %bb.7 + ; CHECK-NEXT: [[PHI3:%[0-9]+]]:spr = PHI [[VLDRS1]], %bb.6, %43, %bb.7 + ; CHECK-NEXT: [[PHI4:%[0-9]+]]:spr = PHI [[VMULS1]], %bb.6, %52, %bb.7 + ; CHECK-NEXT: [[PHI5:%[0-9]+]]:spr = PHI [[VMULS]], %bb.6, [[PHI4]], %bb.7 + ; CHECK-NEXT: [[VADDS:%[0-9]+]]:spr = nnan ninf nsz arcp contract afn reassoc VADDS [[PHI5]], [[PHI3]], 14 /* CC::al */, $noreg + ; CHECK-NEXT: [[t2SUBri4:%[0-9]+]]:rgpr = t2SUBri [[PHI2]], 1, 14 /* CC::al */, $noreg, def $cpsr + ; CHECK-NEXT: [[VLDRS6:%[0-9]+]]:spr = VLDRS [[PHI1]], 1, 14 /* CC::al */, $noreg :: (load unknown-size from %ir.scevgep3, align 4) + ; CHECK-NEXT: [[VLDRS7:%[0-9]+]]:spr = VLDRS [[PHI]], 1, 14 /* CC::al */, $noreg :: (load unknown-size from %ir.scevgep7, align 4) + ; CHECK-NEXT: [[t2ADDri4:%[0-9]+]]:rgpr = t2ADDri [[PHI]], 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK-NEXT: [[t2ADDri5:%[0-9]+]]:rgpr = t2ADDri [[PHI1]], 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:gpr = COPY [[t2ADDri4]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:gpr = COPY [[t2ADDri5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:gpr = COPY [[t2SUBri4]] + ; CHECK-NEXT: [[VMULS2:%[0-9]+]]:spr = nnan ninf nsz arcp contract afn reassoc VMULS [[VLDRS6]], [[VLDRS7]], 14 /* CC::al */, $noreg + ; CHECK-NEXT: t2Bcc %bb.8, 0 /* CC::eq */, $cpsr + ; CHECK-NEXT: t2B %bb.7, 14 /* CC::al */, $noreg + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.8: + ; CHECK-NEXT: successors: %bb.9(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI6:%[0-9]+]]:spr = PHI [[VLDRS1]], %bb.6, [[VADDS]], %bb.7 + ; CHECK-NEXT: [[PHI7:%[0-9]+]]:spr = PHI [[VMULS1]], %bb.6, [[VMULS2]], %bb.7 + ; CHECK-NEXT: [[PHI8:%[0-9]+]]:spr = PHI [[VMULS]], %bb.6, [[PHI4]], %bb.7 + ; CHECK-NEXT: [[VADDS1:%[0-9]+]]:spr = nnan ninf nsz arcp contract afn reassoc VADDS [[PHI8]], [[PHI6]], 14 /* CC::al */, $noreg + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.9: + ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI9:%[0-9]+]]:spr = PHI [[VLDRS1]], %bb.5, [[VADDS1]], %bb.8 + ; CHECK-NEXT: [[PHI10:%[0-9]+]]:spr = PHI [[VMULS]], %bb.5, [[PHI7]], %bb.8 + ; CHECK-NEXT: [[VADDS2:%[0-9]+]]:spr = nnan ninf nsz arcp contract afn reassoc VADDS [[PHI10]], [[PHI9]], 14 /* CC::al */, $noreg + ; CHECK-NEXT: t2B %bb.4, 14 /* CC::al */, $noreg + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4.for.end: + ; CHECK-NEXT: [[PHI11:%[0-9]+]]:spr = PHI [[VLDRS]], %bb.1, [[VADDS2]], %bb.9 + ; CHECK-NEXT: [[VMOVRS:%[0-9]+]]:gpr = VMOVRS [[PHI11]], 14 /* CC::al */, $noreg + ; CHECK-NEXT: $r0 = COPY [[VMOVRS]] + ; CHECK-NEXT: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 + bb.0.entry: + successors: %bb.1(0x50000000), %bb.4(0x30000000) + liveins: $r0, $r1, $r2 + + %13:gprnopc = COPY $r2 + %12:gprnopc = COPY $r1 + %11:gprnopc = COPY $r0 + t2CMPri %13, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr + t2Bcc %bb.1, 10 /* CC::ge */, $cpsr + + bb.4: + successors: %bb.3(0x80000000) + + %14:spr = VLDRS %const.0, 0, 14 /* CC::al */, $noreg :: (load (s32) from constant-pool) + t2B %bb.3, 14 /* CC::al */, $noreg + + 
bb.1.for.body.preheader: + successors: %bb.2(0x80000000) + + %16:rgpr = t2SUBri %12, 4, 14 /* CC::al */, $noreg, $noreg + %0:gpr = COPY %16 + %17:rgpr = t2SUBri %11, 4, 14 /* CC::al */, $noreg, $noreg + %15:spr = VLDRS %const.0, 0, 14 /* CC::al */, $noreg :: (load (s32) from constant-pool) + %1:gpr = COPY %17 + + bb.2.for.body: + successors: %bb.3(0x04000000), %bb.2(0x7c000000) + + %2:gprnopc = PHI %1, %bb.1, %9, %bb.2 + %3:gprnopc = PHI %0, %bb.1, %8, %bb.2 + %4:gprnopc = PHI %13, %bb.1, %7, %bb.2 + %5:spr = PHI %15, %bb.1, %6, %bb.2 + %18:rgpr = t2ADDri %2, 4, 14 /* CC::al */, $noreg, $noreg + %19:spr = VLDRS %2, 1, 14 /* CC::al */, $noreg :: (load (s32) from %ir.scevgep7) + %20:rgpr = t2ADDri %3, 4, 14 /* CC::al */, $noreg, $noreg + %21:spr = VLDRS %3, 1, 14 /* CC::al */, $noreg :: (load (s32) from %ir.scevgep3) + %22:spr = nnan ninf nsz arcp contract afn reassoc VMULS killed %21, killed %19, 14 /* CC::al */, $noreg + %6:spr = nnan ninf nsz arcp contract afn reassoc VADDS killed %22, %5, 14 /* CC::al */, $noreg + %23:rgpr = t2SUBri %4, 1, 14 /* CC::al */, $noreg, def $cpsr + %7:gpr = COPY %23 + %8:gpr = COPY %20 + %9:gpr = COPY %18 + t2Bcc %bb.3, 0 /* CC::eq */, $cpsr + t2B %bb.2, 14 /* CC::al */, $noreg + + bb.3.for.end: + %10:spr = PHI %14, %bb.4, %6, %bb.2 + %24:gpr = VMOVRS %10, 14 /* CC::al */, $noreg + $r0 = COPY %24 + tBX_RET 14 /* CC::al */, $noreg, implicit $r0 + +... diff --git a/llvm/test/CodeGen/Thumb2/swp-fixedii.mir b/llvm/test/CodeGen/Thumb2/swp-fixedii.mir new file mode 100644 index 00000000000000..579123c48a1f6d --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/swp-fixedii.mir @@ -0,0 +1,203 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=thumbv7m-none-eabi -mcpu=cortex-m7 -run-pass=pipeliner -o - %s | FileCheck %s --check-prefix=CHECK + +--- | + define hidden float @dot(float* nocapture noundef readonly %a, float* nocapture noundef readonly %b, i32 noundef %sz) local_unnamed_addr #0 { + entry: + %cmp8 = icmp sgt i32 %sz, 0 + br i1 %cmp8, label %for.body.preheader, label %for.end + + for.body.preheader: ; preds = %entry + %scevgep = getelementptr float, float* %b, i32 -1 + %scevgep4 = getelementptr float, float* %a, i32 -1 + br label %for.body + + for.body: ; preds = %for.body.preheader, %for.body + %lsr.iv5 = phi float* [ %scevgep4, %for.body.preheader ], [ %scevgep6, %for.body ] + %lsr.iv1 = phi float* [ %scevgep, %for.body.preheader ], [ %scevgep2, %for.body ] + %lsr.iv = phi i32 [ %sz, %for.body.preheader ], [ %lsr.iv.next, %for.body ] + %sum.010 = phi float [ %add, %for.body ], [ 0.000000e+00, %for.body.preheader ] + %scevgep7 = getelementptr float, float* %lsr.iv5, i32 1 + %0 = load float, float* %scevgep7, align 4 + %scevgep3 = getelementptr float, float* %lsr.iv1, i32 1 + %1 = load float, float* %scevgep3, align 4 + %mul = fmul fast float %1, %0 + %add = fadd fast float %mul, %sum.010 + %lsr.iv.next = add i32 %lsr.iv, -1 + %scevgep2 = getelementptr float, float* %lsr.iv1, i32 1 + %scevgep6 = getelementptr float, float* %lsr.iv5, i32 1 + %exitcond.not = icmp eq i32 %lsr.iv.next, 0 + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + + for.end: ; preds = %for.body, %entry + %sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ] + ret float %sum.0.lcssa + } + + !0 = distinct !{!0, !1, !2, !3} + !1 = !{!"llvm.loop.mustprogress"} + !2 = !{!"llvm.loop.unroll.disable"} + !3 = !{!"llvm.loop.pipeline.initiationinterval", i32 3} + +... 
+--- +name: dot +alignment: 2 +tracksRegLiveness: true +constants: + - id: 0 + value: 'float 0.000000e+00' + alignment: 4 + isTargetSpecific: false +body: | + ; CHECK-LABEL: name: dot + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.2(0x50000000), %bb.1(0x30000000) + ; CHECK-NEXT: liveins: $r0, $r1, $r2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnopc = COPY $r2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gprnopc = COPY $r1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gprnopc = COPY $r0 + ; CHECK-NEXT: t2CMPri [[COPY]], 1, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK-NEXT: t2Bcc %bb.2, 10 /* CC::ge */, $cpsr + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[VLDRS:%[0-9]+]]:spr = VLDRS %const.0, 0, 14 /* CC::al */, $noreg :: (load (s32) from constant-pool) + ; CHECK-NEXT: t2B %bb.4, 14 /* CC::al */, $noreg + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2.for.body.preheader: + ; CHECK-NEXT: successors: %bb.5(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[t2SUBri:%[0-9]+]]:rgpr = t2SUBri [[COPY1]], 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gprnopc = COPY [[t2SUBri]] + ; CHECK-NEXT: [[t2SUBri1:%[0-9]+]]:rgpr = t2SUBri [[COPY2]], 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK-NEXT: [[VLDRS1:%[0-9]+]]:spr = VLDRS %const.0, 0, 14 /* CC::al */, $noreg :: (load (s32) from constant-pool) + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gprnopc = COPY [[t2SUBri1]] + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.5.for.body: + ; CHECK-NEXT: successors: %bb.6(0x40000000), %bb.9(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[t2ADDri:%[0-9]+]]:rgpr = t2ADDri [[COPY4]], 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK-NEXT: [[VLDRS2:%[0-9]+]]:spr = VLDRS [[COPY4]], 1, 14 /* CC::al */, $noreg :: (load (s32) from %ir.scevgep7) + ; CHECK-NEXT: [[t2ADDri1:%[0-9]+]]:rgpr = t2ADDri [[COPY3]], 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK-NEXT: [[VLDRS3:%[0-9]+]]:spr = VLDRS [[COPY3]], 1, 14 /* CC::al */, $noreg :: (load (s32) from %ir.scevgep3) + ; CHECK-NEXT: [[VMULS:%[0-9]+]]:spr = nnan ninf nsz arcp contract afn reassoc VMULS [[VLDRS3]], [[VLDRS2]], 14 /* CC::al */, $noreg + ; CHECK-NEXT: [[t2SUBri2:%[0-9]+]]:rgpr = t2SUBri [[COPY]], 1, 14 /* CC::al */, $noreg, def $cpsr + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gprnopc = COPY [[t2SUBri2]] + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gprnopc = COPY [[t2ADDri1]] + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gprnopc = COPY [[t2ADDri]] + ; CHECK-NEXT: t2Bcc %bb.9, 0 /* CC::eq */, $cpsr + ; CHECK-NEXT: t2B %bb.6, 14 /* CC::al */, $noreg + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.6.for.body: + ; CHECK-NEXT: successors: %bb.7(0x80000000), %bb.8(0x00000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[t2ADDri2:%[0-9]+]]:rgpr = t2ADDri [[COPY7]], 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK-NEXT: [[VLDRS4:%[0-9]+]]:spr = VLDRS [[COPY7]], 1, 14 /* CC::al */, $noreg :: (load unknown-size from %ir.scevgep7, align 4) + ; CHECK-NEXT: [[t2ADDri3:%[0-9]+]]:rgpr = t2ADDri [[COPY6]], 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK-NEXT: [[VLDRS5:%[0-9]+]]:spr = VLDRS [[COPY6]], 1, 14 /* CC::al */, $noreg :: (load unknown-size from %ir.scevgep3, align 4) + ; CHECK-NEXT: [[VMULS1:%[0-9]+]]:spr = nnan ninf nsz arcp contract afn reassoc VMULS [[VLDRS5]], [[VLDRS4]], 14 /* CC::al */, $noreg + ; CHECK-NEXT: [[t2SUBri3:%[0-9]+]]:rgpr = t2SUBri [[COPY5]], 1, 14 /* CC::al */, $noreg, def $cpsr + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:gpr = COPY [[t2SUBri3]] + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:gpr = COPY [[t2ADDri3]] + ; CHECK-NEXT: 
[[COPY10:%[0-9]+]]:gpr = COPY [[t2ADDri2]] + ; CHECK-NEXT: t2Bcc %bb.8, 0 /* CC::eq */, $cpsr + ; CHECK-NEXT: t2B %bb.7, 14 /* CC::al */, $noreg + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.7.for.body: + ; CHECK-NEXT: successors: %bb.8(0x04000000), %bb.7(0x7c000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:gprnopc = PHI [[COPY10]], %bb.6, %49, %bb.7 + ; CHECK-NEXT: [[PHI1:%[0-9]+]]:gprnopc = PHI [[COPY9]], %bb.6, %50, %bb.7 + ; CHECK-NEXT: [[PHI2:%[0-9]+]]:gprnopc = PHI [[COPY8]], %bb.6, %51, %bb.7 + ; CHECK-NEXT: [[PHI3:%[0-9]+]]:spr = PHI [[VLDRS1]], %bb.6, %43, %bb.7 + ; CHECK-NEXT: [[PHI4:%[0-9]+]]:spr = PHI [[VMULS1]], %bb.6, %52, %bb.7 + ; CHECK-NEXT: [[PHI5:%[0-9]+]]:spr = PHI [[VMULS]], %bb.6, [[PHI4]], %bb.7 + ; CHECK-NEXT: [[VADDS:%[0-9]+]]:spr = nnan ninf nsz arcp contract afn reassoc VADDS [[PHI5]], [[PHI3]], 14 /* CC::al */, $noreg + ; CHECK-NEXT: [[t2SUBri4:%[0-9]+]]:rgpr = t2SUBri [[PHI2]], 1, 14 /* CC::al */, $noreg, def $cpsr + ; CHECK-NEXT: [[VLDRS6:%[0-9]+]]:spr = VLDRS [[PHI1]], 1, 14 /* CC::al */, $noreg :: (load unknown-size from %ir.scevgep3, align 4) + ; CHECK-NEXT: [[VLDRS7:%[0-9]+]]:spr = VLDRS [[PHI]], 1, 14 /* CC::al */, $noreg :: (load unknown-size from %ir.scevgep7, align 4) + ; CHECK-NEXT: [[t2ADDri4:%[0-9]+]]:rgpr = t2ADDri [[PHI]], 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK-NEXT: [[t2ADDri5:%[0-9]+]]:rgpr = t2ADDri [[PHI1]], 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:gpr = COPY [[t2ADDri4]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:gpr = COPY [[t2ADDri5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:gpr = COPY [[t2SUBri4]] + ; CHECK-NEXT: [[VMULS2:%[0-9]+]]:spr = nnan ninf nsz arcp contract afn reassoc VMULS [[VLDRS6]], [[VLDRS7]], 14 /* CC::al */, $noreg + ; CHECK-NEXT: t2Bcc %bb.7, 1 /* CC::ne */, $cpsr + ; CHECK-NEXT: t2B %bb.8, 14 /* CC::al */, $noreg + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.8: + ; CHECK-NEXT: successors: %bb.9(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI6:%[0-9]+]]:spr = PHI [[VLDRS1]], %bb.6, [[VADDS]], %bb.7 + ; CHECK-NEXT: [[PHI7:%[0-9]+]]:spr = PHI [[VMULS1]], %bb.6, [[VMULS2]], %bb.7 + ; CHECK-NEXT: [[PHI8:%[0-9]+]]:spr = PHI [[VMULS]], %bb.6, [[PHI4]], %bb.7 + ; CHECK-NEXT: [[VADDS1:%[0-9]+]]:spr = nnan ninf nsz arcp contract afn reassoc VADDS [[PHI8]], [[PHI6]], 14 /* CC::al */, $noreg + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.9: + ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI9:%[0-9]+]]:spr = PHI [[VLDRS1]], %bb.5, [[VADDS1]], %bb.8 + ; CHECK-NEXT: [[PHI10:%[0-9]+]]:spr = PHI [[VMULS]], %bb.5, [[PHI7]], %bb.8 + ; CHECK-NEXT: [[VADDS2:%[0-9]+]]:spr = nnan ninf nsz arcp contract afn reassoc VADDS [[PHI10]], [[PHI9]], 14 /* CC::al */, $noreg + ; CHECK-NEXT: t2B %bb.4, 14 /* CC::al */, $noreg + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4.for.end: + ; CHECK-NEXT: [[PHI11:%[0-9]+]]:spr = PHI [[VLDRS]], %bb.1, [[VADDS2]], %bb.9 + ; CHECK-NEXT: [[VMOVRS:%[0-9]+]]:gpr = VMOVRS [[PHI11]], 14 /* CC::al */, $noreg + ; CHECK-NEXT: $r0 = COPY [[VMOVRS]] + ; CHECK-NEXT: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 + bb.0.entry: + successors: %bb.1(0x50000000), %bb.4(0x30000000) + liveins: $r0, $r1, $r2 + + %13:gprnopc = COPY $r2 + %12:gprnopc = COPY $r1 + %11:gprnopc = COPY $r0 + t2CMPri %13, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr + t2Bcc %bb.1, 10 /* CC::ge */, $cpsr + + bb.4: + successors: %bb.3(0x80000000) + + %14:spr = VLDRS %const.0, 0, 14 /* CC::al */, $noreg :: (load (s32) from constant-pool) + t2B %bb.3, 14 /* CC::al */, $noreg + + 
bb.1.for.body.preheader: + successors: %bb.2(0x80000000) + + %16:rgpr = t2SUBri %12, 4, 14 /* CC::al */, $noreg, $noreg + %0:gpr = COPY %16 + %17:rgpr = t2SUBri %11, 4, 14 /* CC::al */, $noreg, $noreg + %15:spr = VLDRS %const.0, 0, 14 /* CC::al */, $noreg :: (load (s32) from constant-pool) + %1:gpr = COPY %17 + + bb.2.for.body: + successors: %bb.3(0x04000000), %bb.2(0x7c000000) + + %2:gprnopc = PHI %1, %bb.1, %9, %bb.2 + %3:gprnopc = PHI %0, %bb.1, %8, %bb.2 + %4:gprnopc = PHI %13, %bb.1, %7, %bb.2 + %5:spr = PHI %15, %bb.1, %6, %bb.2 + %18:rgpr = t2ADDri %2, 4, 14 /* CC::al */, $noreg, $noreg + %19:spr = VLDRS %2, 1, 14 /* CC::al */, $noreg :: (load (s32) from %ir.scevgep7) + %20:rgpr = t2ADDri %3, 4, 14 /* CC::al */, $noreg, $noreg + %21:spr = VLDRS %3, 1, 14 /* CC::al */, $noreg :: (load (s32) from %ir.scevgep3) + %22:spr = nnan ninf nsz arcp contract afn reassoc VMULS killed %21, killed %19, 14 /* CC::al */, $noreg + %6:spr = nnan ninf nsz arcp contract afn reassoc VADDS killed %22, %5, 14 /* CC::al */, $noreg + %23:rgpr = t2SUBri %4, 1, 14 /* CC::al */, $noreg, def $cpsr + %7:gpr = COPY %23 + %8:gpr = COPY %20 + %9:gpr = COPY %18 + t2Bcc %bb.2, 1 /* CC::ne */, $cpsr + t2B %bb.3, 14 /* CC::al */, $noreg + + bb.3.for.end: + %10:spr = PHI %14, %bb.4, %6, %bb.2 + %24:gpr = VMOVRS %10, 14 /* CC::al */, $noreg + $r0 = COPY %24 + tBX_RET 14 /* CC::al */, $noreg, implicit $r0 + +...
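
The stage-0 normalization added in MachinePipeliner.cpp hinges on computeUnpipelineableNodes: it seeds a worklist with every instruction the target's shouldIgnoreForPipelining hook reports (for the Cortex-M7 t2Bcc loops above, the backwards branch and the CPSR-setting subtract) and then pulls in all transitive predecessors so the whole chain can be kept in stage 0. A minimal standalone sketch of that closure, using a hypothetical Node type rather than LLVM's SUnit and omitting the extra anti-dependence walk through PHIs, is:

#include <cstdio>
#include <set>
#include <vector>

// Hypothetical stand-in for an SUnit: each node records the indices of its
// predecessors and whether the target asked to ignore it for pipelining.
struct Node {
  std::vector<int> Preds;
  bool IgnoreForPipelining = false;
};

// Mirrors the worklist closure in SMSchedule::computeUnpipelineableNodes:
// seed with the ignored nodes, then add every transitive predecessor.
std::set<int> computeUnpipelineable(const std::vector<Node> &Nodes) {
  std::set<int> DoNotPipeline;
  std::vector<int> Worklist;
  for (int I = 0, E = static_cast<int>(Nodes.size()); I != E; ++I)
    if (Nodes[I].IgnoreForPipelining)
      Worklist.push_back(I);
  while (!Worklist.empty()) {
    int N = Worklist.back();
    Worklist.pop_back();
    if (!DoNotPipeline.insert(N).second)
      continue; // already handled
    for (int P : Nodes[N].Preds)
      Worklist.push_back(P);
  }
  return DoNotPipeline;
}

int main() {
  // Tiny chain 0 -> 1 -> 2; node 2 plays the role of the loop branch that
  // shouldIgnoreForPipelining reports, so 0 and 1 are pulled in as well.
  std::vector<Node> Nodes(3);
  Nodes[1].Preds = {0};
  Nodes[2].Preds = {1};
  Nodes[2].IgnoreForPipelining = true;
  for (int N : computeUnpipelineable(Nodes))
    std::printf("do not pipeline node %d\n", N);
  return 0;
}

normalizeNonPipelinedInstructions then moves each of these nodes to the earliest cycle its predecessors allow, which is what keeps them in stage 0 of the emitted kernel.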