diff --git a/llvm/include/llvm/CodeGen/MachinePipeliner.h b/llvm/include/llvm/CodeGen/MachinePipeliner.h
index 170f20182001fa..4559f7a9bde787 100644
--- a/llvm/include/llvm/CodeGen/MachinePipeliner.h
+++ b/llvm/include/llvm/CodeGen/MachinePipeliner.h
@@ -84,6 +84,8 @@ class MachinePipeliner : public MachineFunctionPass {
     SmallVector<MachineOperand, 4> BrCond;
     MachineInstr *LoopInductionVar = nullptr;
     MachineInstr *LoopCompare = nullptr;
+    std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> LoopPipelinerInfo =
+        nullptr;
   };
   LoopInfo LI;
 
@@ -119,6 +121,7 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
   LiveIntervals &LIS;
   const RegisterClassInfo &RegClassInfo;
   unsigned II_setByPragma = 0;
+  TargetInstrInfo::PipelinerLoopInfo *LoopPipelinerInfo = nullptr;
 
   /// A toplogical ordering of the SUnits, which is needed for changing
   /// dependences and iterating over the SUnits.
@@ -196,9 +199,11 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
 
 public:
   SwingSchedulerDAG(MachinePipeliner &P, MachineLoop &L, LiveIntervals &lis,
-                    const RegisterClassInfo &rci, unsigned II)
+                    const RegisterClassInfo &rci, unsigned II,
+                    TargetInstrInfo::PipelinerLoopInfo *PLI)
       : ScheduleDAGInstrs(*P.MF, P.MLI, false), Pass(P), Loop(L), LIS(lis),
-        RegClassInfo(rci), II_setByPragma(II), Topo(SUnits, &ExitSU) {
+        RegClassInfo(rci), II_setByPragma(II), LoopPipelinerInfo(PLI),
+        Topo(SUnits, &ExitSU) {
     P.MF->getSubtarget().getSMSMutations(Mutations);
     if (SwpEnableCopyToPhi)
       Mutations.push_back(std::make_unique<CopyToPhiMutation>());
@@ -589,6 +594,13 @@ class SMSchedule {
     return ScheduledInstrs[cycle];
   }
 
+  SmallSet<SUnit *, 8>
+  computeUnpipelineableNodes(SwingSchedulerDAG *SSD,
+                             TargetInstrInfo::PipelinerLoopInfo *PLI);
+
+  bool
+  normalizeNonPipelinedInstructions(SwingSchedulerDAG *SSD,
+                                    TargetInstrInfo::PipelinerLoopInfo *PLI);
   bool isValidSchedule(SwingSchedulerDAG *SSD);
   void finalizeSchedule(SwingSchedulerDAG *SSD);
   void orderDependence(SwingSchedulerDAG *SSD, SUnit *SU,
diff --git a/llvm/include/llvm/CodeGen/ModuloSchedule.h b/llvm/include/llvm/CodeGen/ModuloSchedule.h
index 4045df807cdb37..c515101e80fdfa 100644
--- a/llvm/include/llvm/CodeGen/ModuloSchedule.h
+++ b/llvm/include/llvm/CodeGen/ModuloSchedule.h
@@ -191,8 +191,8 @@ class ModuloScheduleExpander {
   void generateProlog(unsigned LastStage, MachineBasicBlock *KernelBB,
                       ValueMapTy *VRMap, MBBVectorTy &PrologBBs);
   void generateEpilog(unsigned LastStage, MachineBasicBlock *KernelBB,
-                      ValueMapTy *VRMap, MBBVectorTy &EpilogBBs,
-                      MBBVectorTy &PrologBBs);
+                      MachineBasicBlock *OrigBB, ValueMapTy *VRMap,
+                      MBBVectorTy &EpilogBBs, MBBVectorTy &PrologBBs);
   void generateExistingPhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1,
                             MachineBasicBlock *BB2, MachineBasicBlock *KernelBB,
                             ValueMapTy *VRMap, InstrMapTy &InstrMap,
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index 0bffa9154fc4b3..9ea6e9b981721c 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -255,6 +255,7 @@ bool MachinePipeliner::scheduleLoop(MachineLoop &L) {
              << "Failed to pipeline loop";
     });
 
+    LI.LoopPipelinerInfo.reset();
     return Changed;
   }
 
@@ -262,6 +263,7 @@ bool MachinePipeliner::scheduleLoop(MachineLoop &L) {
 
   Changed = swingModuloScheduler(L);
 
+  LI.LoopPipelinerInfo.reset();
   return Changed;
 }
 
@@ -354,7 +356,8 @@ bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {
   LI.LoopInductionVar = nullptr;
   LI.LoopCompare = nullptr;
-  if (!TII->analyzeLoopForPipelining(L.getTopBlock())) {
+  LI.LoopPipelinerInfo = TII->analyzeLoopForPipelining(L.getTopBlock());
+  if (!LI.LoopPipelinerInfo) {
     LLVM_DEBUG(dbgs() << "Unable to analyzeLoop, can NOT pipeline Loop\n");
     NumFailLoop++;
     ORE->emit([&]() {
@@ -419,7 +422,7 @@ bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) {
   assert(L.getBlocks().size() == 1 && "SMS works on single blocks only.");
 
   SwingSchedulerDAG SMS(*this, L, getAnalysis<LiveIntervals>(), RegClassInfo,
-                        II_setByPragma);
+                        II_setByPragma, LI.LoopPipelinerInfo.get());
 
   MachineBasicBlock *MBB = L.getHeader();
   // The kernel should not include any terminator instructions.  These
@@ -1422,7 +1425,7 @@ void SwingSchedulerDAG::CopyToPhiMutation::apply(ScheduleDAGInstrs *DAG) {
 /// We ignore the back-edge recurrence in order to avoid unbounded recursion
 /// in the calculation of the ASAP, ALAP, etc functions.
 static bool ignoreDependence(const SDep &D, bool isPred) {
-  if (D.isArtificial())
+  if (D.isArtificial() || D.getSUnit()->isBoundaryNode())
     return true;
   return D.getKind() == SDep::Anti && isPred;
 }
@@ -1471,6 +1474,8 @@ void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) {
     SUnit *SU = &SUnits[I];
     for (const SDep &S : SU->Succs) {
       SUnit *succ = S.getSUnit();
+      if (succ->isBoundaryNode())
+        continue;
       if (S.getLatency() == 0)
         zeroLatencyHeight =
             std::max(zeroLatencyHeight, getZeroLatencyHeight(succ) + 1);
@@ -1788,7 +1793,8 @@ void SwingSchedulerDAG::addConnectedNodes(SUnit *SU, NodeSet &NewSet,
   NodesAdded.insert(SU);
   for (auto &SI : SU->Succs) {
     SUnit *Successor = SI.getSUnit();
-    if (!SI.isArtificial() && NodesAdded.count(Successor) == 0)
+    if (!SI.isArtificial() && !Successor->isBoundaryNode() &&
+        NodesAdded.count(Successor) == 0)
       addConnectedNodes(Successor, NewSet, NodesAdded);
   }
   for (auto &PI : SU->Preds) {
@@ -2080,6 +2086,11 @@ bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
     });
   } while (++NI != NE && scheduleFound);
 
+  // If a schedule is found, ensure non-pipelined instructions are in stage 0
+  if (scheduleFound)
+    scheduleFound =
+        Schedule.normalizeNonPipelinedInstructions(this, LoopPipelinerInfo);
+
   // If a schedule is found, check if it is a valid schedule too.
   if (scheduleFound)
     scheduleFound = Schedule.isValidSchedule(this);
 
@@ -2263,7 +2274,7 @@ MachineInstr *SwingSchedulerDAG::findDefInLoop(Register Reg) {
 bool SwingSchedulerDAG::isLoopCarriedDep(SUnit *Source, const SDep &Dep,
                                          bool isSucc) {
   if ((Dep.getKind() != SDep::Order && Dep.getKind() != SDep::Output) ||
-      Dep.isArtificial())
+      Dep.isArtificial() || Dep.getSUnit()->isBoundaryNode())
     return false;
 
   if (!SwpPruneLoopCarried)
@@ -2430,7 +2441,7 @@ int SMSchedule::latestCycleInChain(const SDep &Dep) {
   while (!Worklist.empty()) {
     const SDep &Cur = Worklist.pop_back_val();
     SUnit *SuccSU = Cur.getSUnit();
-    if (Visited.count(SuccSU))
+    if (Visited.count(SuccSU) || SuccSU->isBoundaryNode())
       continue;
     std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SuccSU);
     if (it == InstrToCycle.end())
@@ -2697,21 +2708,91 @@ bool SMSchedule::isLoopCarriedDefOfUse(SwingSchedulerDAG *SSD,
   return false;
 }
 
+/// Determine transitive dependences of unpipelineable instructions
+SmallSet<SUnit *, 8> SMSchedule::computeUnpipelineableNodes(
+    SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI) {
+  SmallSet<SUnit *, 8> DoNotPipeline;
+  SmallVector<SUnit *, 8> Worklist;
+
+  for (auto &SU : SSD->SUnits)
+    if (SU.isInstr() && PLI->shouldIgnoreForPipelining(SU.getInstr()))
+      Worklist.push_back(&SU);
+
+  while (!Worklist.empty()) {
+    auto SU = Worklist.pop_back_val();
+    if (DoNotPipeline.count(SU))
+      continue;
+    LLVM_DEBUG(dbgs() << "Do not pipeline SU(" << SU->NodeNum << ")\n");
+    DoNotPipeline.insert(SU);
+    for (auto &Dep : SU->Preds)
+      Worklist.push_back(Dep.getSUnit());
+    if (SU->getInstr()->isPHI())
+      for (auto &Dep : SU->Succs)
+        if (Dep.getKind() == SDep::Anti)
+          Worklist.push_back(Dep.getSUnit());
+  }
+  return DoNotPipeline;
+}
+
+// Determine all instructions upon which any unpipelineable instruction depends
+// and ensure that they are in stage 0. If unable to do so, return false.
+bool SMSchedule::normalizeNonPipelinedInstructions(
+    SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI) {
+  SmallSet<SUnit *, 8> DNP = computeUnpipelineableNodes(SSD, PLI);
+
+  int NewLastCycle = INT_MIN;
+  for (SUnit &SU : SSD->SUnits) {
+    if (!SU.isInstr())
+      continue;
+    if (!DNP.contains(&SU) || stageScheduled(&SU) == 0) {
+      NewLastCycle = std::max(NewLastCycle, InstrToCycle[&SU]);
+      continue;
+    }
+
+    // Put the non-pipelined instruction as early as possible in the schedule
+    int NewCycle = getFirstCycle();
+    for (auto &Dep : SU.Preds)
+      NewCycle = std::max(InstrToCycle[Dep.getSUnit()], NewCycle);
+
+    int OldCycle = InstrToCycle[&SU];
+    if (OldCycle != NewCycle) {
+      InstrToCycle[&SU] = NewCycle;
+      auto &OldS = getInstructions(OldCycle);
+      OldS.erase(std::remove(OldS.begin(), OldS.end(), &SU), OldS.end());
+      getInstructions(NewCycle).emplace_back(&SU);
+      LLVM_DEBUG(dbgs() << "SU(" << SU.NodeNum
+                        << ") is not pipelined; moving from cycle " << OldCycle
+                        << " to " << NewCycle << " Instr:" << *SU.getInstr());
+    }
+    NewLastCycle = std::max(NewLastCycle, NewCycle);
+  }
+  LastCycle = NewLastCycle;
+  return true;
+}
+
 // Check if the generated schedule is valid. This function checks if
 // an instruction that uses a physical register is scheduled in a
 // different stage than the definition. The pipeliner does not handle
 // physical register values that may cross a basic block boundary.
+// Furthermore, if a physical def/use pair is assigned to the same
+// cycle, orderDependence does not guarantee def/use ordering, so that
+// case should be considered invalid. (The test checks for both
+// earlier and same-cycle use to be more robust.)
 bool SMSchedule::isValidSchedule(SwingSchedulerDAG *SSD) {
   for (SUnit &SU : SSD->SUnits) {
     if (!SU.hasPhysRegDefs)
       continue;
     int StageDef = stageScheduled(&SU);
+    int CycleDef = InstrToCycle[&SU];
     assert(StageDef != -1 && "Instruction should have been scheduled.");
     for (auto &SI : SU.Succs)
-      if (SI.isAssignedRegDep())
-        if (Register::isPhysicalRegister(SI.getReg()))
+      if (SI.isAssignedRegDep() && !SI.getSUnit()->isBoundaryNode())
+        if (Register::isPhysicalRegister(SI.getReg())) {
           if (stageScheduled(SI.getSUnit()) != StageDef)
             return false;
+          if (InstrToCycle[SI.getSUnit()] <= CycleDef)
+            return false;
+        }
   }
   return true;
 }
diff --git a/llvm/lib/CodeGen/ModuloSchedule.cpp b/llvm/lib/CodeGen/ModuloSchedule.cpp
index b974fa9846f250..20aecdf222e26b 100644
--- a/llvm/lib/CodeGen/ModuloSchedule.cpp
+++ b/llvm/lib/CodeGen/ModuloSchedule.cpp
@@ -158,7 +158,7 @@ void ModuloScheduleExpander::generatePipelinedLoop() {
   SmallVector<MachineBasicBlock *, 4> EpilogBBs;
 
   // Generate the epilog instructions to complete the pipeline.
-  generateEpilog(MaxStageCount, KernelBB, VRMap, EpilogBBs, PrologBBs);
+  generateEpilog(MaxStageCount, KernelBB, BB, VRMap, EpilogBBs, PrologBBs);
 
   // We need this step because the register allocation doesn't handle some
   // situations well, so we insert copies to help out.
@@ -240,11 +240,9 @@ void ModuloScheduleExpander::generateProlog(unsigned LastStage,
 /// Generate the pipeline epilog code. The epilog code finishes the iterations
 /// that were started in either the prolog or the kernel. We create a basic
 /// block for each stage that needs to complete.
-void ModuloScheduleExpander::generateEpilog(unsigned LastStage,
-                                            MachineBasicBlock *KernelBB,
-                                            ValueMapTy *VRMap,
-                                            MBBVectorTy &EpilogBBs,
-                                            MBBVectorTy &PrologBBs) {
+void ModuloScheduleExpander::generateEpilog(
+    unsigned LastStage, MachineBasicBlock *KernelBB, MachineBasicBlock *OrigBB,
+    ValueMapTy *VRMap, MBBVectorTy &EpilogBBs, MBBVectorTy &PrologBBs) {
   // We need to change the branch from the kernel to the first epilog block, so
   // this call to analyze branch uses the kernel rather than the original BB.
   MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
@@ -314,7 +312,12 @@ void ModuloScheduleExpander::generateEpilog(unsigned LastStage,
   // Create a branch to the new epilog from the kernel.
   // Remove the original branch and add a new branch to the epilog.
   TII->removeBranch(*KernelBB);
-  TII->insertBranch(*KernelBB, KernelBB, EpilogStart, Cond, DebugLoc());
+  assert((OrigBB == TBB || OrigBB == FBB) &&
+         "Unable to determine looping branch direction");
+  if (OrigBB != TBB)
+    TII->insertBranch(*KernelBB, EpilogStart, KernelBB, Cond, DebugLoc());
+  else
+    TII->insertBranch(*KernelBB, KernelBB, EpilogStart, Cond, DebugLoc());
   // Add a branch to the loop exit.
   if (EpilogBBs.size() > 0) {
     MachineBasicBlock *LastEpilogBB = EpilogBBs.back();
diff --git a/llvm/lib/Target/ARM/ARM.td b/llvm/lib/Target/ARM/ARM.td
index d9bc2827f7d299..4bcba38efe02d5 100644
--- a/llvm/lib/Target/ARM/ARM.td
+++ b/llvm/lib/Target/ARM/ARM.td
@@ -494,6 +494,10 @@ def FeatureNoNegativeImmediates
 def FeatureUseMISched: SubtargetFeature<"use-misched", "UseMISched", "true",
                                         "Use the MachineScheduler">;
 
+// Use the MachinePipeliner for instruction scheduling for the subtarget.
+def FeatureUseMIPipeliner: SubtargetFeature<"use-mipipeliner", "UseMIPipeliner", "true",
+                                            "Use the MachinePipeliner">;
+
 // False if scheduling should happen again after register allocation.
 def FeatureNoPostRASched : SubtargetFeature<"disable-postra-scheduler",
     "DisablePostRAScheduler", "true",
@@ -1395,6 +1399,7 @@ def : ProcessorModel<"cortex-m4", CortexM4Model, [ARMv7em,
 def : ProcessorModel<"cortex-m7", CortexM7Model,  [ARMv7em,
                                                    ProcM7,
                                                    FeatureFPARMv8_D16,
+                                                   FeatureUseMIPipeliner,
                                                    FeatureUseMISched]>;
 
 def : ProcNoItin<"cortex-m23",                          [ARMv8mBaseline,
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 9bb9df536b2eef..9a7960268a7589 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -6721,3 +6721,77 @@ unsigned llvm::getBLXpredOpcode(const MachineFunction &MF) {
   return (MF.getSubtarget<ARMSubtarget>().hardenSlsBlr()) ? ARM::BLX_pred_noip
                                                           : ARM::BLX_pred;
 }
+
+namespace {
+class ARMPipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
+  MachineInstr *EndLoop, *LoopCount;
+  MachineFunction *MF;
+  const TargetInstrInfo *TII;
+
+  // Meanings of the members for each supported loop type:
+  // t2Bcc:
+  //   Loop = null -- there is no setup.
+  //   EndLoop = branch at end of original BB that will become a kernel
+  //   LoopCount = CC setter live into branch
+public:
+  ARMPipelinerLoopInfo(MachineInstr *EndLoop, MachineInstr *LoopCount)
+      : EndLoop(EndLoop), LoopCount(LoopCount),
+        MF(EndLoop->getParent()->getParent()),
+        TII(MF->getSubtarget().getInstrInfo()) {}
+
+  bool shouldIgnoreForPipelining(const MachineInstr *MI) const override {
+    // Ignore the branch and the instruction that sets the condition flags it
+    // tests.
+    return MI == EndLoop || MI == LoopCount;
+  }
+
+  Optional<bool> createTripCountGreaterCondition(
+      int TC, MachineBasicBlock &MBB,
+      SmallVectorImpl<MachineOperand> &Cond) override {
+
+    if (isCondBranchOpcode(EndLoop->getOpcode())) {
+      Cond.push_back(EndLoop->getOperand(1));
+      Cond.push_back(EndLoop->getOperand(2));
+      if (EndLoop->getOperand(0).getMBB() == EndLoop->getParent()) {
+        TII->reverseBranchCondition(Cond);
+      }
+      return {};
+    } else
+      llvm_unreachable("Unknown EndLoop");
+  }
+
+  void setPreheader(MachineBasicBlock *NewPreheader) override {}
+
+  void adjustTripCount(int TripCountAdjust) override {}
+
+  void disposed() override {}
+};
+} // namespace
+
+std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
+ARMBaseInstrInfo::analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const {
+  MachineBasicBlock::iterator I = LoopBB->getFirstTerminator();
+  MachineBasicBlock *Preheader = *LoopBB->pred_begin();
+  if (Preheader == LoopBB)
+    Preheader = *std::next(LoopBB->pred_begin());
+
+  if (I != LoopBB->end() && I->getOpcode() == ARM::t2Bcc) {
+    // If the branch is a Bcc, then the CPSR should be set somewhere within the
+    // block. We need to determine the reaching definition of CPSR so that
+    // it can be marked as non-pipelineable, allowing the pipeliner to force
+    // it into stage 0 or give up if it cannot or will not do so.
+    MachineInstr *CCSetter = nullptr;
+    for (auto &L : LoopBB->instrs()) {
+      if (L.isCall())
+        return nullptr;
+      if (isCPSRDefined(L))
+        CCSetter = &L;
+    }
+    if (CCSetter)
+      return std::make_unique<ARMPipelinerLoopInfo>(&*I, CCSetter);
+    else
+      return nullptr; // Unable to find the CC setter, so unable to guarantee
+                      // that pipelining will work
+  }
+
+  return nullptr;
+}
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
index ab9643592724db..40acb27d1eb1ff 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -372,6 +372,11 @@ class ARMBaseInstrInfo : public ARMGenInstrInfo {
            MI->getOpcode() == ARM::t2WhileLoopStartTP;
   }
 
+  /// Analyze loop L, which must be a single-basic-block loop, and if the
+  /// conditions can be understood enough produce a PipelinerLoopInfo object.
+  std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
+  analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const override;
+
 private:
   /// Returns an unused general-purpose register which can be used for
   /// constructing an outlined call if one exists. Returns 0 otherwise.
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.cpp b/llvm/lib/Target/ARM/ARMSubtarget.cpp
index b62f447e8d5818..89e5b8762d80fc 100644
--- a/llvm/lib/Target/ARM/ARMSubtarget.cpp
+++ b/llvm/lib/Target/ARM/ARMSubtarget.cpp
@@ -393,6 +393,14 @@ bool ARMSubtarget::enableSubRegLiveness() const {
   return hasMVEIntegerOps();
 }
 
+bool ARMSubtarget::enableMachinePipeliner() const {
+  // Enable the MachinePipeliner before register allocation for subtargets
+  // with the use-mipipeliner feature.
+  return getSchedModel().hasInstrSchedModel() && useMachinePipeliner();
+}
+
+bool ARMSubtarget::useDFAforSMS() const { return false; }
+
 // This overrides the PostRAScheduler bit in the SchedModel for any CPU.
 bool ARMSubtarget::enablePostRAScheduler() const {
   if (enableMachineScheduler())
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h
index d426157c545362..8d56c70e80949a 100644
--- a/llvm/lib/Target/ARM/ARMSubtarget.h
+++ b/llvm/lib/Target/ARM/ARMSubtarget.h
@@ -417,6 +417,7 @@ class ARMSubtarget : public ARMGenSubtargetInfo {
   bool isRWPI() const;
 
   bool useMachineScheduler() const { return UseMISched; }
+  bool useMachinePipeliner() const { return UseMIPipeliner; }
   bool hasMinSize() const { return OptMinSize; }
   bool isThumb1Only() const { return isThumb() && !hasThumb2(); }
   bool isThumb2() const { return isThumb() && hasThumb2(); }
@@ -465,6 +466,10 @@ class ARMSubtarget : public ARMGenSubtargetInfo {
   /// Returns true if machine scheduler should be enabled.
   bool enableMachineScheduler() const override;
 
+  /// Returns true if machine pipeliner should be enabled.
+  bool enableMachinePipeliner() const override;
+  bool useDFAforSMS() const override;
+
   /// True for some subtargets at > -O0.
   bool enablePostRAScheduler() const override;
diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
index 401a00841747d6..dadf7a557238cc 100644
--- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
@@ -509,6 +509,9 @@ bool ARMPassConfig::addGlobalInstructionSelect() {
 
 void ARMPassConfig::addPreRegAlloc() {
   if (getOptLevel() != CodeGenOpt::None) {
+    if (getOptLevel() == CodeGenOpt::Aggressive)
+      addPass(&MachinePipelinerID);
+
     addPass(createMVETPAndVPTOptimisationsPass());
 
     addPass(createMLxExpansionPass());
diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll
index 10c56a3c495b53..6e55134581113d 100644
--- a/llvm/test/CodeGen/ARM/O3-pipeline.ll
+++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll
@@ -95,6 +95,13 @@
 ; CHECK-NEXT:      Peephole Optimizations
 ; CHECK-NEXT:      Remove dead machine instructions
 ; CHECK-NEXT:      MachineDominator Tree Construction
+; CHECK-NEXT:      Slot index numbering
+; CHECK-NEXT:      Live Interval Analysis
+; CHECK-NEXT:      Lazy Machine Block Frequency Analysis
+; CHECK-NEXT:      Machine Optimization Remark Emitter
+; CHECK-NEXT:      Modulo Software Pipelining
+; CHECK-NEXT:      MachineDominator Tree Construction
+; CHECK-NEXT:      Machine Natural Loop Construction
 ; CHECK-NEXT:      MVE TailPred and VPT Optimisation Pass
 ; CHECK-NEXT:      ARM MLA / MLS expansion pass
 ; CHECK-NEXT:      MachineDominator Tree Construction
diff --git a/llvm/test/CodeGen/Thumb2/swp-exitbranchdir.mir b/llvm/test/CodeGen/Thumb2/swp-exitbranchdir.mir
new file mode 100644
index 00000000000000..3f3ff5e4bd290a
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/swp-exitbranchdir.mir
@@ -0,0 +1,203 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=thumbv7m-none-eabi -mcpu=cortex-m7 -run-pass=pipeliner -o - %s | FileCheck %s --check-prefix=CHECK
+
+--- |
+  define hidden float @dot(float* nocapture noundef readonly %a, float* nocapture noundef readonly %b, i32 noundef %sz) local_unnamed_addr #0 {
+  entry:
+    %cmp8 = icmp sgt i32 %sz, 0
+    br i1 %cmp8, label %for.body.preheader, label %for.end
+
+  for.body.preheader:                               ; preds = %entry
+    %scevgep = getelementptr float, float* %b, i32 -1
+    %scevgep4 = getelementptr float, float* %a, i32 -1
+    br label %for.body
+
+  for.body:                                         ; preds = %for.body.preheader, %for.body
+    %lsr.iv5 = phi float* [ %scevgep4, %for.body.preheader ], [ %scevgep6, %for.body ]
+    %lsr.iv1 = phi float* [ %scevgep, %for.body.preheader ], [ %scevgep2, %for.body ]
+    %lsr.iv = phi i32 [ %sz, %for.body.preheader ], [ %lsr.iv.next, %for.body ]
+    %sum.010 = phi float [ %add, %for.body ], [ 0.000000e+00, %for.body.preheader ]
+    %scevgep7 = getelementptr float, float* %lsr.iv5, i32 1
+    %0 = load float, float* %scevgep7, align 4
+    %scevgep3 = getelementptr float, float* %lsr.iv1, i32 1
+    %1 = load float, float* %scevgep3, align 4
+    %mul = fmul fast float %1, %0
+    %add = fadd fast float %mul, %sum.010
+    %lsr.iv.next = add i32 %lsr.iv, -1
+    %scevgep2 = getelementptr float, float* %lsr.iv1, i32 1
+    %scevgep6 = getelementptr float, float* %lsr.iv5, i32 1
+    %exitcond.not = icmp ne i32 %lsr.iv.next, 0
+    br i1 %exitcond.not, label %for.body, label %for.end, !llvm.loop !0
+
+  for.end:                                          ; preds = %for.body, %entry
+    %sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
+    ret float %sum.0.lcssa
+  }
+
+  !0 = distinct !{!0, !1, !2, !3}
+  !1 = !{!"llvm.loop.mustprogress"}
+  !2 = !{!"llvm.loop.unroll.disable"}
+  !3 = !{!"llvm.loop.pipeline.initiationinterval", i32 3}
+
+...
+--- +name: dot +alignment: 2 +tracksRegLiveness: true +constants: + - id: 0 + value: 'float 0.000000e+00' + alignment: 4 + isTargetSpecific: false +body: | + ; CHECK-LABEL: name: dot + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.2(0x50000000), %bb.1(0x30000000) + ; CHECK-NEXT: liveins: $r0, $r1, $r2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnopc = COPY $r2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gprnopc = COPY $r1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gprnopc = COPY $r0 + ; CHECK-NEXT: t2CMPri [[COPY]], 1, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK-NEXT: t2Bcc %bb.2, 10 /* CC::ge */, $cpsr + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[VLDRS:%[0-9]+]]:spr = VLDRS %const.0, 0, 14 /* CC::al */, $noreg :: (load (s32) from constant-pool) + ; CHECK-NEXT: t2B %bb.4, 14 /* CC::al */, $noreg + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2.for.body.preheader: + ; CHECK-NEXT: successors: %bb.5(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[t2SUBri:%[0-9]+]]:rgpr = t2SUBri [[COPY1]], 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gprnopc = COPY [[t2SUBri]] + ; CHECK-NEXT: [[t2SUBri1:%[0-9]+]]:rgpr = t2SUBri [[COPY2]], 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK-NEXT: [[VLDRS1:%[0-9]+]]:spr = VLDRS %const.0, 0, 14 /* CC::al */, $noreg :: (load (s32) from constant-pool) + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gprnopc = COPY [[t2SUBri1]] + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.5.for.body: + ; CHECK-NEXT: successors: %bb.6(0x40000000), %bb.9(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[t2ADDri:%[0-9]+]]:rgpr = t2ADDri [[COPY4]], 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK-NEXT: [[VLDRS2:%[0-9]+]]:spr = VLDRS [[COPY4]], 1, 14 /* CC::al */, $noreg :: (load (s32) from %ir.scevgep7) + ; CHECK-NEXT: [[t2ADDri1:%[0-9]+]]:rgpr = t2ADDri [[COPY3]], 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK-NEXT: [[VLDRS3:%[0-9]+]]:spr = VLDRS [[COPY3]], 1, 14 /* CC::al */, $noreg :: (load (s32) from %ir.scevgep3) + ; CHECK-NEXT: [[VMULS:%[0-9]+]]:spr = nnan ninf nsz arcp contract afn reassoc VMULS [[VLDRS3]], [[VLDRS2]], 14 /* CC::al */, $noreg + ; CHECK-NEXT: [[t2SUBri2:%[0-9]+]]:rgpr = t2SUBri [[COPY]], 1, 14 /* CC::al */, $noreg, def $cpsr + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gprnopc = COPY [[t2SUBri2]] + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gprnopc = COPY [[t2ADDri1]] + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gprnopc = COPY [[t2ADDri]] + ; CHECK-NEXT: t2Bcc %bb.9, 0 /* CC::eq */, $cpsr + ; CHECK-NEXT: t2B %bb.6, 14 /* CC::al */, $noreg + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.6.for.body: + ; CHECK-NEXT: successors: %bb.7(0x80000000), %bb.8(0x00000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[t2ADDri2:%[0-9]+]]:rgpr = t2ADDri [[COPY7]], 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK-NEXT: [[VLDRS4:%[0-9]+]]:spr = VLDRS [[COPY7]], 1, 14 /* CC::al */, $noreg :: (load unknown-size from %ir.scevgep7, align 4) + ; CHECK-NEXT: [[t2ADDri3:%[0-9]+]]:rgpr = t2ADDri [[COPY6]], 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK-NEXT: [[VLDRS5:%[0-9]+]]:spr = VLDRS [[COPY6]], 1, 14 /* CC::al */, $noreg :: (load unknown-size from %ir.scevgep3, align 4) + ; CHECK-NEXT: [[VMULS1:%[0-9]+]]:spr = nnan ninf nsz arcp contract afn reassoc VMULS [[VLDRS5]], [[VLDRS4]], 14 /* CC::al */, $noreg + ; CHECK-NEXT: [[t2SUBri3:%[0-9]+]]:rgpr = t2SUBri [[COPY5]], 1, 14 /* CC::al */, $noreg, def $cpsr + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:gpr = COPY [[t2SUBri3]] + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:gpr = COPY [[t2ADDri3]] + ; CHECK-NEXT: 
[[COPY10:%[0-9]+]]:gpr = COPY [[t2ADDri2]] + ; CHECK-NEXT: t2Bcc %bb.8, 0 /* CC::eq */, $cpsr + ; CHECK-NEXT: t2B %bb.7, 14 /* CC::al */, $noreg + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.7.for.body: + ; CHECK-NEXT: successors: %bb.8(0x04000000), %bb.7(0x7c000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:gprnopc = PHI [[COPY10]], %bb.6, %49, %bb.7 + ; CHECK-NEXT: [[PHI1:%[0-9]+]]:gprnopc = PHI [[COPY9]], %bb.6, %50, %bb.7 + ; CHECK-NEXT: [[PHI2:%[0-9]+]]:gprnopc = PHI [[COPY8]], %bb.6, %51, %bb.7 + ; CHECK-NEXT: [[PHI3:%[0-9]+]]:spr = PHI [[VLDRS1]], %bb.6, %43, %bb.7 + ; CHECK-NEXT: [[PHI4:%[0-9]+]]:spr = PHI [[VMULS1]], %bb.6, %52, %bb.7 + ; CHECK-NEXT: [[PHI5:%[0-9]+]]:spr = PHI [[VMULS]], %bb.6, [[PHI4]], %bb.7 + ; CHECK-NEXT: [[VADDS:%[0-9]+]]:spr = nnan ninf nsz arcp contract afn reassoc VADDS [[PHI5]], [[PHI3]], 14 /* CC::al */, $noreg + ; CHECK-NEXT: [[t2SUBri4:%[0-9]+]]:rgpr = t2SUBri [[PHI2]], 1, 14 /* CC::al */, $noreg, def $cpsr + ; CHECK-NEXT: [[VLDRS6:%[0-9]+]]:spr = VLDRS [[PHI1]], 1, 14 /* CC::al */, $noreg :: (load unknown-size from %ir.scevgep3, align 4) + ; CHECK-NEXT: [[VLDRS7:%[0-9]+]]:spr = VLDRS [[PHI]], 1, 14 /* CC::al */, $noreg :: (load unknown-size from %ir.scevgep7, align 4) + ; CHECK-NEXT: [[t2ADDri4:%[0-9]+]]:rgpr = t2ADDri [[PHI]], 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK-NEXT: [[t2ADDri5:%[0-9]+]]:rgpr = t2ADDri [[PHI1]], 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:gpr = COPY [[t2ADDri4]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:gpr = COPY [[t2ADDri5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:gpr = COPY [[t2SUBri4]] + ; CHECK-NEXT: [[VMULS2:%[0-9]+]]:spr = nnan ninf nsz arcp contract afn reassoc VMULS [[VLDRS6]], [[VLDRS7]], 14 /* CC::al */, $noreg + ; CHECK-NEXT: t2Bcc %bb.8, 0 /* CC::eq */, $cpsr + ; CHECK-NEXT: t2B %bb.7, 14 /* CC::al */, $noreg + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.8: + ; CHECK-NEXT: successors: %bb.9(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI6:%[0-9]+]]:spr = PHI [[VLDRS1]], %bb.6, [[VADDS]], %bb.7 + ; CHECK-NEXT: [[PHI7:%[0-9]+]]:spr = PHI [[VMULS1]], %bb.6, [[VMULS2]], %bb.7 + ; CHECK-NEXT: [[PHI8:%[0-9]+]]:spr = PHI [[VMULS]], %bb.6, [[PHI4]], %bb.7 + ; CHECK-NEXT: [[VADDS1:%[0-9]+]]:spr = nnan ninf nsz arcp contract afn reassoc VADDS [[PHI8]], [[PHI6]], 14 /* CC::al */, $noreg + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.9: + ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI9:%[0-9]+]]:spr = PHI [[VLDRS1]], %bb.5, [[VADDS1]], %bb.8 + ; CHECK-NEXT: [[PHI10:%[0-9]+]]:spr = PHI [[VMULS]], %bb.5, [[PHI7]], %bb.8 + ; CHECK-NEXT: [[VADDS2:%[0-9]+]]:spr = nnan ninf nsz arcp contract afn reassoc VADDS [[PHI10]], [[PHI9]], 14 /* CC::al */, $noreg + ; CHECK-NEXT: t2B %bb.4, 14 /* CC::al */, $noreg + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4.for.end: + ; CHECK-NEXT: [[PHI11:%[0-9]+]]:spr = PHI [[VLDRS]], %bb.1, [[VADDS2]], %bb.9 + ; CHECK-NEXT: [[VMOVRS:%[0-9]+]]:gpr = VMOVRS [[PHI11]], 14 /* CC::al */, $noreg + ; CHECK-NEXT: $r0 = COPY [[VMOVRS]] + ; CHECK-NEXT: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 + bb.0.entry: + successors: %bb.1(0x50000000), %bb.4(0x30000000) + liveins: $r0, $r1, $r2 + + %13:gprnopc = COPY $r2 + %12:gprnopc = COPY $r1 + %11:gprnopc = COPY $r0 + t2CMPri %13, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr + t2Bcc %bb.1, 10 /* CC::ge */, $cpsr + + bb.4: + successors: %bb.3(0x80000000) + + %14:spr = VLDRS %const.0, 0, 14 /* CC::al */, $noreg :: (load (s32) from constant-pool) + t2B %bb.3, 14 /* CC::al */, $noreg + + 
bb.1.for.body.preheader: + successors: %bb.2(0x80000000) + + %16:rgpr = t2SUBri %12, 4, 14 /* CC::al */, $noreg, $noreg + %0:gpr = COPY %16 + %17:rgpr = t2SUBri %11, 4, 14 /* CC::al */, $noreg, $noreg + %15:spr = VLDRS %const.0, 0, 14 /* CC::al */, $noreg :: (load (s32) from constant-pool) + %1:gpr = COPY %17 + + bb.2.for.body: + successors: %bb.3(0x04000000), %bb.2(0x7c000000) + + %2:gprnopc = PHI %1, %bb.1, %9, %bb.2 + %3:gprnopc = PHI %0, %bb.1, %8, %bb.2 + %4:gprnopc = PHI %13, %bb.1, %7, %bb.2 + %5:spr = PHI %15, %bb.1, %6, %bb.2 + %18:rgpr = t2ADDri %2, 4, 14 /* CC::al */, $noreg, $noreg + %19:spr = VLDRS %2, 1, 14 /* CC::al */, $noreg :: (load (s32) from %ir.scevgep7) + %20:rgpr = t2ADDri %3, 4, 14 /* CC::al */, $noreg, $noreg + %21:spr = VLDRS %3, 1, 14 /* CC::al */, $noreg :: (load (s32) from %ir.scevgep3) + %22:spr = nnan ninf nsz arcp contract afn reassoc VMULS killed %21, killed %19, 14 /* CC::al */, $noreg + %6:spr = nnan ninf nsz arcp contract afn reassoc VADDS killed %22, %5, 14 /* CC::al */, $noreg + %23:rgpr = t2SUBri %4, 1, 14 /* CC::al */, $noreg, def $cpsr + %7:gpr = COPY %23 + %8:gpr = COPY %20 + %9:gpr = COPY %18 + t2Bcc %bb.3, 0 /* CC::eq */, $cpsr + t2B %bb.2, 14 /* CC::al */, $noreg + + bb.3.for.end: + %10:spr = PHI %14, %bb.4, %6, %bb.2 + %24:gpr = VMOVRS %10, 14 /* CC::al */, $noreg + $r0 = COPY %24 + tBX_RET 14 /* CC::al */, $noreg, implicit $r0 + +... diff --git a/llvm/test/CodeGen/Thumb2/swp-fixedii.mir b/llvm/test/CodeGen/Thumb2/swp-fixedii.mir new file mode 100644 index 00000000000000..579123c48a1f6d --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/swp-fixedii.mir @@ -0,0 +1,203 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=thumbv7m-none-eabi -mcpu=cortex-m7 -run-pass=pipeliner -o - %s | FileCheck %s --check-prefix=CHECK + +--- | + define hidden float @dot(float* nocapture noundef readonly %a, float* nocapture noundef readonly %b, i32 noundef %sz) local_unnamed_addr #0 { + entry: + %cmp8 = icmp sgt i32 %sz, 0 + br i1 %cmp8, label %for.body.preheader, label %for.end + + for.body.preheader: ; preds = %entry + %scevgep = getelementptr float, float* %b, i32 -1 + %scevgep4 = getelementptr float, float* %a, i32 -1 + br label %for.body + + for.body: ; preds = %for.body.preheader, %for.body + %lsr.iv5 = phi float* [ %scevgep4, %for.body.preheader ], [ %scevgep6, %for.body ] + %lsr.iv1 = phi float* [ %scevgep, %for.body.preheader ], [ %scevgep2, %for.body ] + %lsr.iv = phi i32 [ %sz, %for.body.preheader ], [ %lsr.iv.next, %for.body ] + %sum.010 = phi float [ %add, %for.body ], [ 0.000000e+00, %for.body.preheader ] + %scevgep7 = getelementptr float, float* %lsr.iv5, i32 1 + %0 = load float, float* %scevgep7, align 4 + %scevgep3 = getelementptr float, float* %lsr.iv1, i32 1 + %1 = load float, float* %scevgep3, align 4 + %mul = fmul fast float %1, %0 + %add = fadd fast float %mul, %sum.010 + %lsr.iv.next = add i32 %lsr.iv, -1 + %scevgep2 = getelementptr float, float* %lsr.iv1, i32 1 + %scevgep6 = getelementptr float, float* %lsr.iv5, i32 1 + %exitcond.not = icmp eq i32 %lsr.iv.next, 0 + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + + for.end: ; preds = %for.body, %entry + %sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ] + ret float %sum.0.lcssa + } + + !0 = distinct !{!0, !1, !2, !3} + !1 = !{!"llvm.loop.mustprogress"} + !2 = !{!"llvm.loop.unroll.disable"} + !3 = !{!"llvm.loop.pipeline.initiationinterval", i32 3} + +... 
+--- +name: dot +alignment: 2 +tracksRegLiveness: true +constants: + - id: 0 + value: 'float 0.000000e+00' + alignment: 4 + isTargetSpecific: false +body: | + ; CHECK-LABEL: name: dot + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.2(0x50000000), %bb.1(0x30000000) + ; CHECK-NEXT: liveins: $r0, $r1, $r2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnopc = COPY $r2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gprnopc = COPY $r1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gprnopc = COPY $r0 + ; CHECK-NEXT: t2CMPri [[COPY]], 1, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK-NEXT: t2Bcc %bb.2, 10 /* CC::ge */, $cpsr + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[VLDRS:%[0-9]+]]:spr = VLDRS %const.0, 0, 14 /* CC::al */, $noreg :: (load (s32) from constant-pool) + ; CHECK-NEXT: t2B %bb.4, 14 /* CC::al */, $noreg + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2.for.body.preheader: + ; CHECK-NEXT: successors: %bb.5(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[t2SUBri:%[0-9]+]]:rgpr = t2SUBri [[COPY1]], 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gprnopc = COPY [[t2SUBri]] + ; CHECK-NEXT: [[t2SUBri1:%[0-9]+]]:rgpr = t2SUBri [[COPY2]], 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK-NEXT: [[VLDRS1:%[0-9]+]]:spr = VLDRS %const.0, 0, 14 /* CC::al */, $noreg :: (load (s32) from constant-pool) + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gprnopc = COPY [[t2SUBri1]] + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.5.for.body: + ; CHECK-NEXT: successors: %bb.6(0x40000000), %bb.9(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[t2ADDri:%[0-9]+]]:rgpr = t2ADDri [[COPY4]], 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK-NEXT: [[VLDRS2:%[0-9]+]]:spr = VLDRS [[COPY4]], 1, 14 /* CC::al */, $noreg :: (load (s32) from %ir.scevgep7) + ; CHECK-NEXT: [[t2ADDri1:%[0-9]+]]:rgpr = t2ADDri [[COPY3]], 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK-NEXT: [[VLDRS3:%[0-9]+]]:spr = VLDRS [[COPY3]], 1, 14 /* CC::al */, $noreg :: (load (s32) from %ir.scevgep3) + ; CHECK-NEXT: [[VMULS:%[0-9]+]]:spr = nnan ninf nsz arcp contract afn reassoc VMULS [[VLDRS3]], [[VLDRS2]], 14 /* CC::al */, $noreg + ; CHECK-NEXT: [[t2SUBri2:%[0-9]+]]:rgpr = t2SUBri [[COPY]], 1, 14 /* CC::al */, $noreg, def $cpsr + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gprnopc = COPY [[t2SUBri2]] + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gprnopc = COPY [[t2ADDri1]] + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gprnopc = COPY [[t2ADDri]] + ; CHECK-NEXT: t2Bcc %bb.9, 0 /* CC::eq */, $cpsr + ; CHECK-NEXT: t2B %bb.6, 14 /* CC::al */, $noreg + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.6.for.body: + ; CHECK-NEXT: successors: %bb.7(0x80000000), %bb.8(0x00000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[t2ADDri2:%[0-9]+]]:rgpr = t2ADDri [[COPY7]], 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK-NEXT: [[VLDRS4:%[0-9]+]]:spr = VLDRS [[COPY7]], 1, 14 /* CC::al */, $noreg :: (load unknown-size from %ir.scevgep7, align 4) + ; CHECK-NEXT: [[t2ADDri3:%[0-9]+]]:rgpr = t2ADDri [[COPY6]], 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK-NEXT: [[VLDRS5:%[0-9]+]]:spr = VLDRS [[COPY6]], 1, 14 /* CC::al */, $noreg :: (load unknown-size from %ir.scevgep3, align 4) + ; CHECK-NEXT: [[VMULS1:%[0-9]+]]:spr = nnan ninf nsz arcp contract afn reassoc VMULS [[VLDRS5]], [[VLDRS4]], 14 /* CC::al */, $noreg + ; CHECK-NEXT: [[t2SUBri3:%[0-9]+]]:rgpr = t2SUBri [[COPY5]], 1, 14 /* CC::al */, $noreg, def $cpsr + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:gpr = COPY [[t2SUBri3]] + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:gpr = COPY [[t2ADDri3]] + ; CHECK-NEXT: 
[[COPY10:%[0-9]+]]:gpr = COPY [[t2ADDri2]] + ; CHECK-NEXT: t2Bcc %bb.8, 0 /* CC::eq */, $cpsr + ; CHECK-NEXT: t2B %bb.7, 14 /* CC::al */, $noreg + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.7.for.body: + ; CHECK-NEXT: successors: %bb.8(0x04000000), %bb.7(0x7c000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:gprnopc = PHI [[COPY10]], %bb.6, %49, %bb.7 + ; CHECK-NEXT: [[PHI1:%[0-9]+]]:gprnopc = PHI [[COPY9]], %bb.6, %50, %bb.7 + ; CHECK-NEXT: [[PHI2:%[0-9]+]]:gprnopc = PHI [[COPY8]], %bb.6, %51, %bb.7 + ; CHECK-NEXT: [[PHI3:%[0-9]+]]:spr = PHI [[VLDRS1]], %bb.6, %43, %bb.7 + ; CHECK-NEXT: [[PHI4:%[0-9]+]]:spr = PHI [[VMULS1]], %bb.6, %52, %bb.7 + ; CHECK-NEXT: [[PHI5:%[0-9]+]]:spr = PHI [[VMULS]], %bb.6, [[PHI4]], %bb.7 + ; CHECK-NEXT: [[VADDS:%[0-9]+]]:spr = nnan ninf nsz arcp contract afn reassoc VADDS [[PHI5]], [[PHI3]], 14 /* CC::al */, $noreg + ; CHECK-NEXT: [[t2SUBri4:%[0-9]+]]:rgpr = t2SUBri [[PHI2]], 1, 14 /* CC::al */, $noreg, def $cpsr + ; CHECK-NEXT: [[VLDRS6:%[0-9]+]]:spr = VLDRS [[PHI1]], 1, 14 /* CC::al */, $noreg :: (load unknown-size from %ir.scevgep3, align 4) + ; CHECK-NEXT: [[VLDRS7:%[0-9]+]]:spr = VLDRS [[PHI]], 1, 14 /* CC::al */, $noreg :: (load unknown-size from %ir.scevgep7, align 4) + ; CHECK-NEXT: [[t2ADDri4:%[0-9]+]]:rgpr = t2ADDri [[PHI]], 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK-NEXT: [[t2ADDri5:%[0-9]+]]:rgpr = t2ADDri [[PHI1]], 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:gpr = COPY [[t2ADDri4]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:gpr = COPY [[t2ADDri5]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:gpr = COPY [[t2SUBri4]] + ; CHECK-NEXT: [[VMULS2:%[0-9]+]]:spr = nnan ninf nsz arcp contract afn reassoc VMULS [[VLDRS6]], [[VLDRS7]], 14 /* CC::al */, $noreg + ; CHECK-NEXT: t2Bcc %bb.7, 1 /* CC::ne */, $cpsr + ; CHECK-NEXT: t2B %bb.8, 14 /* CC::al */, $noreg + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.8: + ; CHECK-NEXT: successors: %bb.9(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI6:%[0-9]+]]:spr = PHI [[VLDRS1]], %bb.6, [[VADDS]], %bb.7 + ; CHECK-NEXT: [[PHI7:%[0-9]+]]:spr = PHI [[VMULS1]], %bb.6, [[VMULS2]], %bb.7 + ; CHECK-NEXT: [[PHI8:%[0-9]+]]:spr = PHI [[VMULS]], %bb.6, [[PHI4]], %bb.7 + ; CHECK-NEXT: [[VADDS1:%[0-9]+]]:spr = nnan ninf nsz arcp contract afn reassoc VADDS [[PHI8]], [[PHI6]], 14 /* CC::al */, $noreg + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.9: + ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI9:%[0-9]+]]:spr = PHI [[VLDRS1]], %bb.5, [[VADDS1]], %bb.8 + ; CHECK-NEXT: [[PHI10:%[0-9]+]]:spr = PHI [[VMULS]], %bb.5, [[PHI7]], %bb.8 + ; CHECK-NEXT: [[VADDS2:%[0-9]+]]:spr = nnan ninf nsz arcp contract afn reassoc VADDS [[PHI10]], [[PHI9]], 14 /* CC::al */, $noreg + ; CHECK-NEXT: t2B %bb.4, 14 /* CC::al */, $noreg + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4.for.end: + ; CHECK-NEXT: [[PHI11:%[0-9]+]]:spr = PHI [[VLDRS]], %bb.1, [[VADDS2]], %bb.9 + ; CHECK-NEXT: [[VMOVRS:%[0-9]+]]:gpr = VMOVRS [[PHI11]], 14 /* CC::al */, $noreg + ; CHECK-NEXT: $r0 = COPY [[VMOVRS]] + ; CHECK-NEXT: tBX_RET 14 /* CC::al */, $noreg, implicit $r0 + bb.0.entry: + successors: %bb.1(0x50000000), %bb.4(0x30000000) + liveins: $r0, $r1, $r2 + + %13:gprnopc = COPY $r2 + %12:gprnopc = COPY $r1 + %11:gprnopc = COPY $r0 + t2CMPri %13, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr + t2Bcc %bb.1, 10 /* CC::ge */, $cpsr + + bb.4: + successors: %bb.3(0x80000000) + + %14:spr = VLDRS %const.0, 0, 14 /* CC::al */, $noreg :: (load (s32) from constant-pool) + t2B %bb.3, 14 /* CC::al */, $noreg + + 
bb.1.for.body.preheader: + successors: %bb.2(0x80000000) + + %16:rgpr = t2SUBri %12, 4, 14 /* CC::al */, $noreg, $noreg + %0:gpr = COPY %16 + %17:rgpr = t2SUBri %11, 4, 14 /* CC::al */, $noreg, $noreg + %15:spr = VLDRS %const.0, 0, 14 /* CC::al */, $noreg :: (load (s32) from constant-pool) + %1:gpr = COPY %17 + + bb.2.for.body: + successors: %bb.3(0x04000000), %bb.2(0x7c000000) + + %2:gprnopc = PHI %1, %bb.1, %9, %bb.2 + %3:gprnopc = PHI %0, %bb.1, %8, %bb.2 + %4:gprnopc = PHI %13, %bb.1, %7, %bb.2 + %5:spr = PHI %15, %bb.1, %6, %bb.2 + %18:rgpr = t2ADDri %2, 4, 14 /* CC::al */, $noreg, $noreg + %19:spr = VLDRS %2, 1, 14 /* CC::al */, $noreg :: (load (s32) from %ir.scevgep7) + %20:rgpr = t2ADDri %3, 4, 14 /* CC::al */, $noreg, $noreg + %21:spr = VLDRS %3, 1, 14 /* CC::al */, $noreg :: (load (s32) from %ir.scevgep3) + %22:spr = nnan ninf nsz arcp contract afn reassoc VMULS killed %21, killed %19, 14 /* CC::al */, $noreg + %6:spr = nnan ninf nsz arcp contract afn reassoc VADDS killed %22, %5, 14 /* CC::al */, $noreg + %23:rgpr = t2SUBri %4, 1, 14 /* CC::al */, $noreg, def $cpsr + %7:gpr = COPY %23 + %8:gpr = COPY %20 + %9:gpr = COPY %18 + t2Bcc %bb.2, 1 /* CC::ne */, $cpsr + t2B %bb.3, 14 /* CC::al */, $noreg + + bb.3.for.end: + %10:spr = PHI %14, %bb.4, %6, %bb.2 + %24:gpr = VMOVRS %10, 14 /* CC::al */, $noreg + $r0 = COPY %24 + tBX_RET 14 /* CC::al */, $noreg, implicit $r0 + +...
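
The stage-0 normalization added in MachinePipeliner.cpp hinges on computeUnpipelineableNodes: it seeds a worklist with every instruction the target's shouldIgnoreForPipelining hook reports (for the Cortex-M7 t2Bcc loops above, the backwards branch and the CPSR-setting subtract) and then pulls in all transitive predecessors so the whole chain can be kept in stage 0. A minimal standalone sketch of that closure, using a hypothetical Node type rather than LLVM's SUnit and omitting the extra anti-dependence walk through PHIs, is:

#include <cstdio>
#include <set>
#include <vector>

// Hypothetical stand-in for an SUnit: each node records the indices of its
// predecessors and whether the target asked to ignore it for pipelining.
struct Node {
  std::vector<int> Preds;
  bool IgnoreForPipelining = false;
};

// Mirrors the worklist closure in SMSchedule::computeUnpipelineableNodes:
// seed with the ignored nodes, then add every transitive predecessor.
std::set<int> computeUnpipelineable(const std::vector<Node> &Nodes) {
  std::set<int> DoNotPipeline;
  std::vector<int> Worklist;
  for (int I = 0, E = static_cast<int>(Nodes.size()); I != E; ++I)
    if (Nodes[I].IgnoreForPipelining)
      Worklist.push_back(I);
  while (!Worklist.empty()) {
    int N = Worklist.back();
    Worklist.pop_back();
    if (!DoNotPipeline.insert(N).second)
      continue; // already handled
    for (int P : Nodes[N].Preds)
      Worklist.push_back(P);
  }
  return DoNotPipeline;
}

int main() {
  // Tiny chain 0 -> 1 -> 2; node 2 plays the role of the loop branch that
  // shouldIgnoreForPipelining reports, so 0 and 1 are pulled in as well.
  std::vector<Node> Nodes(3);
  Nodes[1].Preds = {0};
  Nodes[2].Preds = {1};
  Nodes[2].IgnoreForPipelining = true;
  for (int N : computeUnpipelineable(Nodes))
    std::printf("do not pipeline node %d\n", N);
  return 0;
}

normalizeNonPipelinedInstructions then moves each of these nodes to the earliest cycle its predecessors allow, which is what keeps them in stage 0 of the emitted kernel.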