diff --git a/llvm/include/llvm/CodeGen/MachinePipeliner.h b/llvm/include/llvm/CodeGen/MachinePipeliner.h
index 04055ba9732dd..8f0a17cf99967 100644
--- a/llvm/include/llvm/CodeGen/MachinePipeliner.h
+++ b/llvm/include/llvm/CodeGen/MachinePipeliner.h
@@ -273,8 +273,8 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
 
   /// Return the new base register that was stored away for the changed
   /// instruction.
-  unsigned getInstrBaseReg(SUnit *SU) {
-    DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
+  unsigned getInstrBaseReg(SUnit *SU) const {
+    DenseMap<SUnit *, std::pair<unsigned, int64_t>>::const_iterator It =
         InstrChanges.find(SU);
     if (It != InstrChanges.end())
       return It->second.first;
@@ -639,16 +639,20 @@ class SMSchedule {
   computeUnpipelineableNodes(SwingSchedulerDAG *SSD,
                              TargetInstrInfo::PipelinerLoopInfo *PLI);
 
+  std::deque<SUnit *>
+  reorderInstructions(const SwingSchedulerDAG *SSD,
+                      const std::deque<SUnit *> &Instrs) const;
+
   bool
   normalizeNonPipelinedInstructions(SwingSchedulerDAG *SSD,
                                     TargetInstrInfo::PipelinerLoopInfo *PLI);
   bool isValidSchedule(SwingSchedulerDAG *SSD);
   void finalizeSchedule(SwingSchedulerDAG *SSD);
-  void orderDependence(SwingSchedulerDAG *SSD, SUnit *SU,
-                       std::deque<SUnit *> &Insts);
-  bool isLoopCarried(SwingSchedulerDAG *SSD, MachineInstr &Phi);
-  bool isLoopCarriedDefOfUse(SwingSchedulerDAG *SSD, MachineInstr *Def,
-                             MachineOperand &MO);
+  void orderDependence(const SwingSchedulerDAG *SSD, SUnit *SU,
+                       std::deque<SUnit *> &Insts) const;
+  bool isLoopCarried(const SwingSchedulerDAG *SSD, MachineInstr &Phi) const;
+  bool isLoopCarriedDefOfUse(const SwingSchedulerDAG *SSD, MachineInstr *Def,
+                             MachineOperand &MO) const;
   void print(raw_ostream &os) const;
   void dump() const;
 };
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index 8cd7f4ebe88d9..5c9f0f1703a6e 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -35,6 +35,7 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/PriorityQueue.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SetOperations.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
@@ -60,9 +61,12 @@
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/ModuloSchedule.h"
+#include "llvm/CodeGen/Register.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
 #include "llvm/CodeGen/RegisterPressure.h"
 #include "llvm/CodeGen/ScheduleDAG.h"
 #include "llvm/CodeGen/ScheduleDAGMutation.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetOpcodes.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
@@ -174,6 +178,20 @@ static cl::opt<bool> ExperimentalCodeGen(
     cl::desc(
         "Use the experimental peeling code generator for software pipelining"));
 
+static cl::opt<int> SwpIISearchRange("pipeliner-ii-search-range",
+                                     cl::desc("Range to search for II"),
+                                     cl::Hidden, cl::init(10));
+
+static cl::opt<bool>
+    LimitRegPressure("pipeliner-register-pressure", cl::Hidden, cl::init(false),
+                     cl::desc("Limit register pressure of scheduled loop"));
+
+static cl::opt<int>
+    RegPressureMargin("pipeliner-register-pressure-margin", cl::Hidden,
+                      cl::init(5),
+                      cl::desc("Margin representing the unused percentage of "
+                               "the register pressure limit"));
+
 namespace llvm {
 
 // A command line option to enable the CopyToPhi DAG mutation.
@@ -484,7 +502,7 @@ void SwingSchedulerDAG::setMAX_II() {
   else if (II_setByPragma > 0)
     MAX_II = II_setByPragma;
   else
-    MAX_II = MII + 10;
+    MAX_II = MII + SwpIISearchRange;
 }
 
 /// We override the schedule function in ScheduleDAGInstrs to implement the
@@ -695,7 +713,8 @@ static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop,
 }
 
 /// Return the Phi register value that comes the loop block.
-static unsigned getLoopPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
+static unsigned getLoopPhiReg(const MachineInstr &Phi,
+                              const MachineBasicBlock *LoopBB) {
   for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
     if (Phi.getOperand(i + 1).getMBB() == LoopBB)
       return Phi.getOperand(i).getReg();
@@ -996,6 +1015,41 @@ void SwingSchedulerDAG::changeDependences() {
   }
 }
 
+/// Create an instruction stream that represents a single iteration and stage of
+/// each instruction. This function differs from SMSchedule::finalizeSchedule in
+/// that this doesn't have any side-effect to SwingSchedulerDAG. That is, this
+/// function is an approximation of SMSchedule::finalizeSchedule with all
+/// non-const operations removed.
+static void computeScheduledInsts(const SwingSchedulerDAG *SSD,
+                                  SMSchedule &Schedule,
+                                  std::vector<MachineInstr *> &OrderedInsts,
+                                  DenseMap<MachineInstr *, unsigned> &Stages) {
+  DenseMap<int, std::deque<SUnit *>> Instrs;
+
+  // Move all instructions to the first stage from the later stages.
+  for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
+       ++Cycle) {
+    for (int Stage = 0, LastStage = Schedule.getMaxStageCount();
+         Stage <= LastStage; ++Stage) {
+      for (SUnit *SU : llvm::reverse(Schedule.getInstructions(
+               Cycle + Stage * Schedule.getInitiationInterval()))) {
+        Instrs[Cycle].push_front(SU);
+      }
+    }
+  }
+
+  for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
+       ++Cycle) {
+    std::deque<SUnit *> &CycleInstrs = Instrs[Cycle];
+    CycleInstrs = Schedule.reorderInstructions(SSD, CycleInstrs);
+    for (SUnit *SU : CycleInstrs) {
+      MachineInstr *MI = SU->getInstr();
+      OrderedInsts.push_back(MI);
+      Stages[MI] = Schedule.stageScheduled(SU);
+    }
+  }
+}
+
 namespace {
 
 // FuncUnitSorter - Comparison operator used to sort instructions by
@@ -1102,6 +1156,375 @@ struct FuncUnitSorter {
   }
 };
 
+/// Calculate the maximum register pressure of the scheduled instructions stream
+class HighRegisterPressureDetector {
+  MachineBasicBlock *OrigMBB;
+  const MachineFunction &MF;
+  const MachineRegisterInfo &MRI;
+  const TargetRegisterInfo *TRI;
+
+  const unsigned PSetNum;
+
+  // Indexed by PSet ID
+  // InitSetPressure takes into account the register pressure of live-in
+  // registers. It does not depend on how the loop is scheduled, so it is
+  // enough to calculate it once at the beginning.
+  std::vector<unsigned> InitSetPressure;
+
+  // Indexed by PSet ID
+  // Upper limit for each register pressure set
+  std::vector<unsigned> PressureSetLimit;
+
+  DenseMap<MachineInstr *, RegisterOperands> ROMap;
+
+  using Instr2LastUsesTy = DenseMap<MachineInstr *, SmallDenseSet<Register, 4>>;
+
+public:
+  using OrderedInstsTy = std::vector<MachineInstr *>;
+  using Instr2StageTy = DenseMap<MachineInstr *, unsigned>;
+
+private:
+  static void dumpRegisterPressures(const std::vector<unsigned> &Pressures) {
+    if (Pressures.size() == 0) {
+      dbgs() << "[]";
+    } else {
+      char Prefix = '[';
+      for (unsigned P : Pressures) {
+        dbgs() << Prefix << P;
+        Prefix = ' ';
+      }
+      dbgs() << ']';
+    }
+  }
+
+  void dumpPSet(Register Reg) const {
+    dbgs() << "Reg=" << printReg(Reg, TRI, 0, &MRI) << " PSet=";
+    for (auto PSetIter = MRI.getPressureSets(Reg); PSetIter.isValid();
+         ++PSetIter) {
+      dbgs() << *PSetIter << ' ';
+    }
+    dbgs() << '\n';
+  }
+
+  void increaseRegisterPressure(std::vector<unsigned> &Pressure,
+                                Register Reg) const {
+    auto PSetIter = MRI.getPressureSets(Reg);
+    unsigned Weight = PSetIter.getWeight();
+    for (; PSetIter.isValid(); ++PSetIter)
+      Pressure[*PSetIter] += Weight;
+  }
+
+  void decreaseRegisterPressure(std::vector<unsigned> &Pressure,
+                                Register Reg) const {
+    auto PSetIter = MRI.getPressureSets(Reg);
+    unsigned Weight = PSetIter.getWeight();
+    for (; PSetIter.isValid(); ++PSetIter) {
+      auto &P = Pressure[*PSetIter];
+      assert(P >= Weight &&
+             "register pressure must be greater than or equal weight");
+      P -= Weight;
+    }
+  }
+
+  // Return true if Reg is a fixed register, for example, the stack pointer
+  bool isFixedRegister(Register Reg) const {
+    return Reg.isPhysical() && TRI->isFixedRegister(MF, Reg.asMCReg());
+  }
+
+  bool isDefinedInThisLoop(Register Reg) const {
+    return Reg.isVirtual() && MRI.getVRegDef(Reg)->getParent() == OrigMBB;
+  }
+
+  // Search for live-in variables. They are factored into the register pressure
+  // from the beginning. Live-in variables used by every iteration should be
+  // considered as alive throughout the loop. For example, the variable `c` in
+  // following code. \code
+  //   int c = ...;
+  //   for (int i = 0; i < n; i++)
+  //     a[i] += b[i] + c;
+  // \endcode
+  void computeLiveIn() {
+    DenseSet<Register> Used;
+    for (auto &MI : *OrigMBB) {
+      if (MI.isDebugInstr())
+        continue;
+      for (auto Use : ROMap[&MI].Uses) {
+        auto Reg = Use.RegUnit;
+        // Ignore the variable that appears only on one side of phi instruction
+        // because it's used only at the first iteration.
+        if (MI.isPHI() && Reg != getLoopPhiReg(MI, OrigMBB))
+          continue;
+        if (isFixedRegister(Reg))
+          continue;
+        if (isDefinedInThisLoop(Reg))
+          continue;
+        Used.insert(Reg);
+      }
+    }
+
+    for (auto LiveIn : Used)
+      increaseRegisterPressure(InitSetPressure, LiveIn);
+  }
+
+  // Calculate the upper limit of each pressure set
+  void computePressureSetLimit(const RegisterClassInfo &RCI) {
+    for (unsigned PSet = 0; PSet < PSetNum; PSet++)
+      PressureSetLimit[PSet] = RCI.getRegPressureSetLimit(PSet);
+
+    // We assume fixed registers, such as stack pointer, are already in use.
+    // Therefore subtracting the weight of the fixed registers from the limit of
+    // each pressure set in advance.
+    SmallDenseSet<Register, 8> FixedRegs;
+    for (const TargetRegisterClass *TRC : TRI->regclasses()) {
+      for (const MCPhysReg Reg : *TRC)
+        if (isFixedRegister(Reg))
+          FixedRegs.insert(Reg);
+    }
+
+    LLVM_DEBUG({
+      for (auto Reg : FixedRegs) {
+        dbgs() << printReg(Reg, TRI, 0, &MRI) << ": [";
+        const int *Sets = TRI->getRegUnitPressureSets(Reg);
+        for (; *Sets != -1; Sets++) {
+          dbgs() << TRI->getRegPressureSetName(*Sets) << ", ";
+        }
+        dbgs() << "]\n";
+      }
+    });
+
+    for (auto Reg : FixedRegs) {
+      LLVM_DEBUG(dbgs() << "fixed register: " << printReg(Reg, TRI, 0, &MRI)
+                        << "\n");
+      auto PSetIter = MRI.getPressureSets(Reg);
+      unsigned Weight = PSetIter.getWeight();
+      for (; PSetIter.isValid(); ++PSetIter) {
+        unsigned &Limit = PressureSetLimit[*PSetIter];
+        assert(Limit >= Weight &&
+               "register pressure limit must be greater than or equal weight");
+        Limit -= Weight;
+        LLVM_DEBUG(dbgs() << "PSet=" << *PSetIter << " Limit=" << Limit
+                          << " (decreased by " << Weight << ")\n");
+      }
+    }
+  }
+
+  // There are two patterns of last-use.
+  //   - by an instruction of the current iteration
+  //   - by a phi instruction of the next iteration (loop carried value)
+  //
+  // Furthermore, following two groups of instructions are executed
+  // simultaneously
+  //   - next iteration's phi instructions in i-th stage
+  //   - current iteration's instructions in i+1-th stage
+  //
+  // This function calculates the last-use of each register while taking into
+  // account the above two patterns.
+  Instr2LastUsesTy computeLastUses(const OrderedInstsTy &OrderedInsts,
+                                   Instr2StageTy &Stages) const {
+    // We treat virtual registers that are defined and used in this loop.
+    // Following virtual register will be ignored
+    //   - live-in one
+    //   - defined but not used in the loop (potentially live-out)
+    DenseSet<Register> TargetRegs;
+    const auto UpdateTargetRegs = [this, &TargetRegs](Register Reg) {
+      if (isDefinedInThisLoop(Reg))
+        TargetRegs.insert(Reg);
+    };
+    for (MachineInstr *MI : OrderedInsts) {
+      if (MI->isPHI()) {
+        Register Reg = getLoopPhiReg(*MI, OrigMBB);
+        UpdateTargetRegs(Reg);
+      } else {
+        for (auto Use : ROMap.find(MI)->getSecond().Uses)
+          UpdateTargetRegs(Use.RegUnit);
+      }
+    }
+
+    const auto InstrScore = [&Stages](MachineInstr *MI) {
+      return Stages[MI] + MI->isPHI();
+    };
+
+    DenseMap<Register, MachineInstr *> LastUseMI;
+    for (MachineInstr *MI : llvm::reverse(OrderedInsts)) {
+      for (auto Use : ROMap.find(MI)->getSecond().Uses) {
+        auto Reg = Use.RegUnit;
+        if (!TargetRegs.contains(Reg))
+          continue;
+        auto Ite = LastUseMI.find(Reg);
+        if (Ite == LastUseMI.end()) {
+          LastUseMI[Reg] = MI;
+        } else {
+          MachineInstr *Orig = Ite->second;
+          MachineInstr *New = MI;
+          if (InstrScore(Orig) < InstrScore(New))
+            LastUseMI[Reg] = New;
+        }
+      }
+    }
+
+    Instr2LastUsesTy LastUses;
+    for (auto &Entry : LastUseMI)
+      LastUses[Entry.second].insert(Entry.first);
+    return LastUses;
+  }
+
+  // Compute the maximum register pressure of the kernel. We'll simulate #Stage
+  // iterations and check the register pressure at the point where all stages
+  // overlap.
+  //
+  // An example of unrolled loop where #Stage is 4.
+  // Iter   i+0 i+1 i+2 i+3
+  // ------------------------
+  // Stage   0
+  // Stage   1   0
+  // Stage   2   1   0
+  // Stage   3   2   1   0  <- All stages overlap
+  //
+  std::vector<unsigned>
+  computeMaxSetPressure(const OrderedInstsTy &OrderedInsts,
+                        Instr2StageTy &Stages,
+                        const unsigned StageCount) const {
+    using RegSetTy = SmallDenseSet<Register, 16>;
+
+    // Indexed by #Iter. To treat "local" variables of each stage separately, we
+    // manage the liveness of the registers independently by iterations.
+    SmallVector<RegSetTy> LiveRegSets(StageCount);
+
+    auto CurSetPressure = InitSetPressure;
+    auto MaxSetPressure = InitSetPressure;
+    auto LastUses = computeLastUses(OrderedInsts, Stages);
+
+    LLVM_DEBUG({
+      dbgs() << "Ordered instructions:\n";
+      for (MachineInstr *MI : OrderedInsts) {
+        dbgs() << "Stage " << Stages[MI] << ": ";
+        MI->dump();
+      }
+    });
+
+    const auto InsertReg = [this, &CurSetPressure](RegSetTy &RegSet,
+                                                   Register Reg) {
+      if (!Reg.isValid() || isFixedRegister(Reg))
+        return;
+
+      bool Inserted = RegSet.insert(Reg).second;
+      if (!Inserted)
+        return;
+
+      LLVM_DEBUG(dbgs() << "insert " << printReg(Reg, TRI, 0, &MRI) << "\n");
+      increaseRegisterPressure(CurSetPressure, Reg);
+      LLVM_DEBUG(dumpPSet(Reg));
+    };
+
+    const auto EraseReg = [this, &CurSetPressure](RegSetTy &RegSet,
+                                                  Register Reg) {
+      if (!Reg.isValid() || isFixedRegister(Reg))
+        return;
+
+      // live-in register
+      if (!RegSet.contains(Reg))
+        return;
+
+      LLVM_DEBUG(dbgs() << "erase " << printReg(Reg, TRI, 0, &MRI) << "\n");
+      RegSet.erase(Reg);
+      decreaseRegisterPressure(CurSetPressure, Reg);
+      LLVM_DEBUG(dumpPSet(Reg));
+    };
+
+    for (unsigned I = 0; I < StageCount; I++) {
+      for (MachineInstr *MI : OrderedInsts) {
+        const auto Stage = Stages[MI];
+        if (I < Stage)
+          continue;
+
+        const unsigned Iter = I - Stage;
+
+        for (auto Def : ROMap.find(MI)->getSecond().Defs)
+          InsertReg(LiveRegSets[Iter], Def.RegUnit);
+
+        for (auto LastUse : LastUses[MI]) {
+          if (MI->isPHI()) {
+            if (Iter != 0)
+              EraseReg(LiveRegSets[Iter - 1], LastUse);
+          } else {
+            EraseReg(LiveRegSets[Iter], LastUse);
+          }
+        }
+
+        for (unsigned PSet = 0; PSet < PSetNum; PSet++)
+          MaxSetPressure[PSet] =
+              std::max(MaxSetPressure[PSet], CurSetPressure[PSet]);
+
+        LLVM_DEBUG({
+          dbgs() << "CurSetPressure=";
+          dumpRegisterPressures(CurSetPressure);
+          dbgs() << " iter=" << Iter << " stage=" << Stage << ":";
+          MI->dump();
+        });
+      }
+    }
+
+    return MaxSetPressure;
+  }
+
+public:
+  HighRegisterPressureDetector(MachineBasicBlock *OrigMBB,
+                               const MachineFunction &MF)
+      : OrigMBB(OrigMBB), MF(MF), MRI(MF.getRegInfo()),
+        TRI(MF.getSubtarget().getRegisterInfo()),
+        PSetNum(TRI->getNumRegPressureSets()), InitSetPressure(PSetNum, 0),
+        PressureSetLimit(PSetNum, 0) {}
+
+  // Used to calculate register pressure, which is independent of loop
+  // scheduling.
+  void init(const RegisterClassInfo &RCI) {
+    for (MachineInstr &MI : *OrigMBB) {
+      if (MI.isDebugInstr())
+        continue;
+      ROMap[&MI].collect(MI, *TRI, MRI, false, true);
+    }
+
+    computeLiveIn();
+    computePressureSetLimit(RCI);
+  }
+
+  // Calculate the maximum register pressures of the loop and check if they
+  // exceed the limit
+  bool detect(const SwingSchedulerDAG *SSD, SMSchedule &Schedule,
+              const unsigned MaxStage) const {
+    assert(0 <= RegPressureMargin && RegPressureMargin <= 100 &&
+           "the percentage of the margin must be between 0 to 100");
+
+    OrderedInstsTy OrderedInsts;
+    Instr2StageTy Stages;
+    computeScheduledInsts(SSD, Schedule, OrderedInsts, Stages);
+    const auto MaxSetPressure =
+        computeMaxSetPressure(OrderedInsts, Stages, MaxStage + 1);
+
+    LLVM_DEBUG({
+      dbgs() << "Dump MaxSetPressure:\n";
+      for (unsigned I = 0; I < MaxSetPressure.size(); I++) {
+        dbgs() << format("MaxSetPressure[%u]=%u\n", I, MaxSetPressure[I]);
+      }
+      dbgs() << '\n';
+    });
+
+    for (unsigned PSet = 0; PSet < PSetNum; PSet++) {
+      unsigned Limit = PressureSetLimit[PSet];
+      unsigned Margin = Limit * RegPressureMargin / 100;
+      LLVM_DEBUG(dbgs() << "PSet=" << PSet << " Limit=" << Limit
+                        << " Margin=" << Margin << "\n");
+      if (Limit < MaxSetPressure[PSet] + Margin) {
+        LLVM_DEBUG(
+            dbgs()
+            << "Rejected the schedule because of too high register pressure\n");
+        return true;
+      }
+    }
+    return false;
+  }
+};
+
 } // end anonymous namespace
 
 /// Calculate the resource constrained minimum initiation interval for the
@@ -1967,6 +2390,12 @@ bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
   }
 
   bool scheduleFound = false;
+  std::unique_ptr<HighRegisterPressureDetector> HRPDetector;
+  if (LimitRegPressure) {
+    HRPDetector =
+        std::make_unique<HighRegisterPressureDetector>(Loop.getHeader(), MF);
+    HRPDetector->init(RegClassInfo);
+  }
   // Keep increasing II until a valid schedule is found.
   for (unsigned II = MII; II <= MAX_II && !scheduleFound; ++II) {
     Schedule.reset();
@@ -2044,6 +2473,12 @@ bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
     // If a schedule is found, check if it is a valid schedule too.
     if (scheduleFound)
       scheduleFound = Schedule.isValidSchedule(this);
+
+    // If a schedule was found and the option is enabled, check if the schedule
+    // might generate additional register spills/fills.
+    if (scheduleFound && LimitRegPressure)
+      scheduleFound =
+          !HRPDetector->detect(this, Schedule, Schedule.getMaxStageCount());
   }
 
   LLVM_DEBUG(dbgs() << "Schedule Found? " << scheduleFound
@@ -2483,8 +2918,8 @@ void SMSchedule::computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
 /// Order the instructions within a cycle so that the definitions occur
 /// before the uses. Returns true if the instruction is added to the start
 /// of the list, or false if added to the end.
-void SMSchedule::orderDependence(SwingSchedulerDAG *SSD, SUnit *SU,
-                                 std::deque<SUnit *> &Insts) {
+void SMSchedule::orderDependence(const SwingSchedulerDAG *SSD, SUnit *SU,
+                                 std::deque<SUnit *> &Insts) const {
   MachineInstr *MI = SU->getInstr();
   bool OrderBeforeUse = false;
   bool OrderAfterDef = false;
@@ -2611,7 +3046,8 @@ void SMSchedule::orderDependence(SwingSchedulerDAG *SSD, SUnit *SU,
 }
 
 /// Return true if the scheduled Phi has a loop carried operand.
-bool SMSchedule::isLoopCarried(SwingSchedulerDAG *SSD, MachineInstr &Phi) {
+bool SMSchedule::isLoopCarried(const SwingSchedulerDAG *SSD,
+                               MachineInstr &Phi) const {
   if (!Phi.isPHI())
     return false;
   assert(Phi.isPHI() && "Expecting a Phi.");
@@ -2639,8 +3075,9 @@ bool SMSchedule::isLoopCarried(SwingSchedulerDAG *SSD, MachineInstr &Phi) {
 ///  (MO)   = v1
 /// If MO appears before Def, then v1 and v3 may get assigned to the same
 /// register.
-bool SMSchedule::isLoopCarriedDefOfUse(SwingSchedulerDAG *SSD,
-                                       MachineInstr *Def, MachineOperand &MO) {
+bool SMSchedule::isLoopCarriedDefOfUse(const SwingSchedulerDAG *SSD,
+                                       MachineInstr *Def,
+                                       MachineOperand &MO) const {
   if (!MO.isReg())
     return false;
   if (Def->isPHI())
@@ -2895,6 +3332,23 @@ void SwingSchedulerDAG::fixupRegisterOverlaps(std::deque<SUnit *> &Instrs) {
   }
 }
 
+std::deque<SUnit *>
+SMSchedule::reorderInstructions(const SwingSchedulerDAG *SSD,
+                                const std::deque<SUnit *> &Instrs) const {
+  std::deque<SUnit *> NewOrderPhi;
+  for (SUnit *SU : Instrs) {
+    if (SU->getInstr()->isPHI())
+      NewOrderPhi.push_back(SU);
+  }
+  std::deque<SUnit *> NewOrderI;
+  for (SUnit *SU : Instrs) {
+    if (!SU->getInstr()->isPHI())
+      orderDependence(SSD, SU, NewOrderI);
+  }
+  llvm::append_range(NewOrderPhi, NewOrderI);
+  return NewOrderPhi;
+}
+
 /// After the schedule has been formed, call this function to combine
 /// the instructions from the different stages/cycles.  That is, this
 /// function creates a schedule that represents a single iteration.
@@ -2924,19 +3378,7 @@ void SMSchedule::finalizeSchedule(SwingSchedulerDAG *SSD) {
   // generated code.
   for (int Cycle = getFirstCycle(), E = getFinalCycle(); Cycle <= E; ++Cycle) {
     std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[Cycle];
-    std::deque<SUnit *> newOrderPhi;
-    for (SUnit *SU : cycleInstrs) {
-      if (SU->getInstr()->isPHI())
-        newOrderPhi.push_back(SU);
-    }
-    std::deque<SUnit *> newOrderI;
-    for (SUnit *SU : cycleInstrs) {
-      if (!SU->getInstr()->isPHI())
-        orderDependence(SSD, SU, newOrderI);
-    }
-    // Replace the old order with the new order.
-    cycleInstrs.swap(newOrderPhi);
-    llvm::append_range(cycleInstrs, newOrderI);
+    cycleInstrs = reorderInstructions(SSD, cycleInstrs);
     SSD->fixupRegisterOverlaps(cycleInstrs);
   }
 
diff --git a/llvm/test/CodeGen/PowerPC/sms-regpress.mir b/llvm/test/CodeGen/PowerPC/sms-regpress.mir
new file mode 100644
index 0000000000000..f523b4548eecc
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/sms-regpress.mir
@@ -0,0 +1,330 @@
+# RUN: llc --verify-machineinstrs -mcpu=pwr9 -o - %s -run-pass=pipeliner -ppc-enable-pipeliner -pipeliner-register-pressure -pipeliner-max-mii=50 -pipeliner-ii-search-range=30 -pipeliner-max-stages=10 -debug-only=pipeliner 2>&1 | FileCheck %s
+
+# REQUIRES: asserts
+
+# Check that if the register pressure is too high, the schedule is rejected, II is incremented, and scheduling continues.
+# The specific value of II is not important.
+
+# CHECK: Try to schedule with 21
+# CHECK: Can't schedule
+# CHECK: Try to schedule with 22
+# CHECK: Can't schedule
+# CHECK: Try to schedule with 23
+# CHECK: Rejected the schedule because of too high register pressure
+# CHECK: Try to schedule with 24
+# CHECK: Rejected the schedule because of too high register pressure
+# CHECK: Try to schedule with 25
+# CHECK: Rejected the schedule because of too high register pressure
+# CHECK: Try to schedule with 26
+# CHECK: Schedule Found?
1 (II=26) + +--- | + ; ModuleID = 'a.ll' + source_filename = "a.c" + target datalayout = "e-m:e-Fn32-i64:64-n32:64" + target triple = "ppc64le" + + ; Function Attrs: nofree nosync nounwind memory(argmem: read) uwtable + define dso_local double @kernel(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef signext %n) local_unnamed_addr #0 { + entry: + %0 = load double, ptr %a, align 8, !tbaa !3 + %arrayidx1 = getelementptr inbounds double, ptr %a, i64 1 + %1 = load double, ptr %arrayidx1, align 8, !tbaa !3 + %cmp163 = icmp sgt i32 %n, 0 + br i1 %cmp163, label %for.body.preheader, label %for.cond.cleanup + + for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %n to i64 + %scevgep1 = getelementptr i8, ptr %b, i64 -8 + call void @llvm.set.loop.iterations.i64(i64 %wide.trip.count) + br label %for.body + + for.cond.cleanup: ; preds = %for.body, %entry + %res.0.lcssa = phi double [ 0.000000e+00, %entry ], [ %30, %for.body ] + ret double %res.0.lcssa + + for.body: ; preds = %for.body, %for.body.preheader + %res.0165 = phi double [ 0.000000e+00, %for.body.preheader ], [ %30, %for.body ] + %2 = phi ptr [ %scevgep1, %for.body.preheader ], [ %3, %for.body ] + %3 = getelementptr i8, ptr %2, i64 8 + %4 = load double, ptr %3, align 8, !tbaa !3 + %5 = tail call double @llvm.fmuladd.f64(double %0, double %4, double %0) + %6 = tail call double @llvm.fmuladd.f64(double %5, double %4, double %5) + %7 = tail call double @llvm.fmuladd.f64(double %6, double %4, double %6) + %8 = tail call double @llvm.fmuladd.f64(double %7, double %4, double %7) + %9 = tail call double @llvm.fmuladd.f64(double %8, double %4, double %8) + %10 = tail call double @llvm.fmuladd.f64(double %9, double %4, double %9) + %11 = tail call double @llvm.fmuladd.f64(double %10, double %4, double %10) + %12 = tail call double @llvm.fmuladd.f64(double %11, double %4, double %11) + %13 = tail call double @llvm.fmuladd.f64(double %12, double %4, double %12) + %14 = tail call 
double @llvm.fmuladd.f64(double %13, double %4, double %13) + %15 = tail call double @llvm.fmuladd.f64(double %14, double %4, double %14) + %16 = tail call double @llvm.fmuladd.f64(double %15, double %4, double %15) + %17 = tail call double @llvm.fmuladd.f64(double %16, double %4, double %16) + %18 = tail call double @llvm.fmuladd.f64(double %17, double %4, double %17) + %19 = tail call double @llvm.fmuladd.f64(double %18, double %4, double %18) + %20 = tail call double @llvm.fmuladd.f64(double %19, double %4, double %19) + %add = fadd double %19, %20 + %21 = tail call double @llvm.fmuladd.f64(double %20, double %4, double %add) + %add35 = fadd double %12, %21 + %22 = tail call double @llvm.fmuladd.f64(double %5, double %4, double %add35) + %add38 = fadd double %13, %22 + %23 = tail call double @llvm.fmuladd.f64(double %6, double %4, double %add38) + %mul = fmul double %4, %7 + %mul46 = fmul double %mul, %14 + %24 = tail call double @llvm.fmuladd.f64(double %mul46, double %13, double %16) + %mul50 = fmul double %4, %9 + %mul51 = fmul double %1, %mul50 + %25 = tail call double @llvm.fmuladd.f64(double %mul51, double %11, double %24) + %add53 = fadd double %5, %25 + %add54 = fadd double %6, %add53 + %mul55 = fmul double %14, %16 + %mul56 = fmul double %mul55, %17 + %mul57 = fmul double %mul56, %18 + %26 = tail call double @llvm.fmuladd.f64(double %mul57, double %19, double %add54) + %27 = tail call double @llvm.fmuladd.f64(double %10, double %1, double %26) + %28 = tail call double @llvm.fmuladd.f64(double %8, double %6, double %27) + %mul61 = fmul double %20, %21 + %mul62 = fmul double %mul61, %22 + %29 = tail call double @llvm.fmuladd.f64(double %mul62, double %23, double %28) + %mul64 = fmul double %26, %29 + %mul65 = fmul double %24, %mul64 + %mul66 = fmul double %12, %mul65 + %30 = tail call double @llvm.fmuladd.f64(double %mul66, double %10, double %res.0165) + %31 = call i1 @llvm.loop.decrement.i64(i64 1) + br i1 %31, label %for.body, label %for.cond.cleanup, 
!llvm.loop !7 + } + + ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) + declare double @llvm.fmuladd.f64(double, double, double) #1 + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare void @llvm.set.loop.iterations.i64(i64) #2 + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare i1 @llvm.loop.decrement.i64(i64) #2 + + attributes #0 = { nofree nosync nounwind memory(argmem: read) uwtable "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="pwr9" "target-features"="+altivec,+bpermd,+crbits,+crypto,+direct-move,+extdiv,+htm,+isa-v206-instructions,+isa-v207-instructions,+isa-v30-instructions,+power8-vector,+power9-vector,+quadword-atomics,+vsx,-aix-small-local-exec-tls,-privileged,-rop-protect,-spe" } + attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } + attributes #2 = { nocallback noduplicate nofree nosync nounwind willreturn } + + !llvm.module.flags = !{!0, !1} + !llvm.ident = !{!2} + + !0 = !{i32 1, !"wchar_size", i32 4} + !1 = !{i32 7, !"uwtable", i32 2} + !2 = !{!"clang version 18.0.0 (https://miratech-soft@dev.azure.com/miratech-soft/llvm/_git/llvm c8d01fb665fc5d9378100a6d92ebcd3be49be655)"} + !3 = !{!4, !4, i64 0} + !4 = !{!"double", !5, i64 0} + !5 = !{!"omnipotent char", !6, i64 0} + !6 = !{!"Simple C/C++ TBAA"} + !7 = distinct !{!7, !8, !9} + !8 = !{!"llvm.loop.mustprogress"} + !9 = !{!"llvm.loop.unroll.disable"} + +... 
+--- +name: kernel +alignment: 16 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +callsEHReturn: false +callsUnwindInit: false +hasEHCatchret: false +hasEHScopes: false +hasEHFunclets: false +isOutlined: false +debugInstrRef: false +failsVerification: false +tracksDebugUserValues: false +registers: + - { id: 0, class: vsfrc, preferred-register: '' } + - { id: 1, class: vsfrc, preferred-register: '' } + - { id: 2, class: g8rc, preferred-register: '' } + - { id: 3, class: vsfrc, preferred-register: '' } + - { id: 4, class: vsfrc, preferred-register: '' } + - { id: 5, class: g8rc_and_g8rc_nox0, preferred-register: '' } + - { id: 6, class: g8rc, preferred-register: '' } + - { id: 7, class: vsfrc, preferred-register: '' } + - { id: 8, class: g8rc_and_g8rc_nox0, preferred-register: '' } + - { id: 9, class: g8rc_and_g8rc_nox0, preferred-register: '' } + - { id: 10, class: g8rc, preferred-register: '' } + - { id: 11, class: gprc, preferred-register: '' } + - { id: 12, class: vsfrc, preferred-register: '' } + - { id: 13, class: crrc, preferred-register: '' } + - { id: 14, class: vsfrc, preferred-register: '' } + - { id: 15, class: g8rc, preferred-register: '' } + - { id: 16, class: g8rc, preferred-register: '' } + - { id: 17, class: g8rc, preferred-register: '' } + - { id: 18, class: f8rc, preferred-register: '' } + - { id: 19, class: g8rc_and_g8rc_nox0, preferred-register: '' } + - { id: 20, class: vsfrc, preferred-register: '' } + - { id: 21, class: vsfrc, preferred-register: '' } + - { id: 22, class: vsfrc, preferred-register: '' } + - { id: 23, class: vsfrc, preferred-register: '' } + - { id: 24, class: vsfrc, preferred-register: '' } + - { id: 25, class: vsfrc, preferred-register: '' } + - { id: 26, class: vsfrc, preferred-register: '' } + - { id: 27, class: vsfrc, preferred-register: '' } + - { id: 28, class: vsfrc, preferred-register: '' } + - { id: 29, class: vsfrc, 
preferred-register: '' } + - { id: 30, class: vsfrc, preferred-register: '' } + - { id: 31, class: vsfrc, preferred-register: '' } + - { id: 32, class: vsfrc, preferred-register: '' } + - { id: 33, class: vsfrc, preferred-register: '' } + - { id: 34, class: vsfrc, preferred-register: '' } + - { id: 35, class: vsfrc, preferred-register: '' } + - { id: 36, class: vsfrc, preferred-register: '' } + - { id: 37, class: vsfrc, preferred-register: '' } + - { id: 38, class: vsfrc, preferred-register: '' } + - { id: 39, class: vsfrc, preferred-register: '' } + - { id: 40, class: vsfrc, preferred-register: '' } + - { id: 41, class: vsfrc, preferred-register: '' } + - { id: 42, class: vsfrc, preferred-register: '' } + - { id: 43, class: vsfrc, preferred-register: '' } + - { id: 44, class: vsfrc, preferred-register: '' } + - { id: 45, class: vsfrc, preferred-register: '' } + - { id: 46, class: vsfrc, preferred-register: '' } + - { id: 47, class: vsfrc, preferred-register: '' } + - { id: 48, class: vsfrc, preferred-register: '' } + - { id: 49, class: vsfrc, preferred-register: '' } + - { id: 50, class: vsfrc, preferred-register: '' } + - { id: 51, class: vsfrc, preferred-register: '' } + - { id: 52, class: vsfrc, preferred-register: '' } + - { id: 53, class: vsfrc, preferred-register: '' } + - { id: 54, class: vsfrc, preferred-register: '' } + - { id: 55, class: vsfrc, preferred-register: '' } + - { id: 56, class: vsfrc, preferred-register: '' } + - { id: 57, class: vsfrc, preferred-register: '' } + - { id: 58, class: vsfrc, preferred-register: '' } + - { id: 59, class: vsfrc, preferred-register: '' } + - { id: 60, class: vsfrc, preferred-register: '' } + - { id: 61, class: vsfrc, preferred-register: '' } + - { id: 62, class: crbitrc, preferred-register: '' } +liveins: + - { reg: '$x3', virtual-reg: '%8' } + - { reg: '$x4', virtual-reg: '%9' } + - { reg: '$x5', virtual-reg: '%10' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + 
hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 1 + adjustsStack: false + hasCalls: false + stackProtector: '' + functionContext: '' + maxCallFrameSize: 4294967295 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + hasTailCall: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: [] +entry_values: [] +callSites: [] +debugValueSubstitutions: [] +constants: [] +machineFunctionInfo: {} +body: | + bb.0.entry: + successors: %bb.2(0x50000000), %bb.1(0x30000000) + liveins: $x3, $x4, $x5 + + %10:g8rc = COPY killed $x5 + %9:g8rc_and_g8rc_nox0 = COPY killed $x4 + %8:g8rc_and_g8rc_nox0 = COPY killed $x3 + %11:gprc = COPY killed %10.sub_32 + %13:crrc = CMPWI %11, 0 + BCC 44, killed %13, %bb.2 + + bb.1: + successors: %bb.3(0x80000000) + + %12:vsfrc = XXLXORdpz + B %bb.3 + + bb.2.for.body.preheader: + successors: %bb.4(0x80000000) + + %0:vsfrc = DFLOADf64 0, %8 :: (load (s64) from %ir.a, !tbaa !3) + %1:vsfrc = DFLOADf64 8, killed %8 :: (load (s64) from %ir.arrayidx1, !tbaa !3) + %16:g8rc = IMPLICIT_DEF + %15:g8rc = INSERT_SUBREG killed %16, killed %11, %subreg.sub_32 + %17:g8rc = RLDICL killed %15, 0, 32 + %2:g8rc = ADDI8 killed %9, -8 + MTCTR8loop killed %17, implicit-def dead $ctr8 + %14:vsfrc = XXLXORdpz + B %bb.4 + + bb.3.for.cond.cleanup: + %3:vsfrc = PHI %12, %bb.1, %7, %bb.4 + $f1 = COPY killed %3 + BLR8 implicit $lr8, implicit $rm, implicit killed $f1 + + bb.4.for.body: + successors: %bb.4(0x7c000000), %bb.3(0x04000000) + + %4:vsfrc = PHI %14, %bb.2, %7, %bb.4 + %5:g8rc_and_g8rc_nox0 = PHI %2, %bb.2, %6, %bb.4 + %18:f8rc, %19:g8rc_and_g8rc_nox0 = LFDU 8, killed %5 :: (load (s64) from %ir.3, !tbaa !3) + %6:g8rc = COPY killed %19 + %20:vsfrc = nofpexcept XSMADDADP %0, %0, %18, implicit $rm + %21:vsfrc = nofpexcept XSMADDADP %20, %20, %18, implicit $rm + %22:vsfrc = nofpexcept XSMADDADP %21, %21, %18, implicit $rm + %23:vsfrc = nofpexcept XSMADDADP 
%22, %22, %18, implicit $rm + %24:vsfrc = nofpexcept XSMADDADP %23, %23, %18, implicit $rm + %25:vsfrc = nofpexcept XSMADDADP %24, %24, %18, implicit $rm + %26:vsfrc = nofpexcept XSMADDADP %25, %25, %18, implicit $rm + %27:vsfrc = nofpexcept XSMADDADP %26, %26, %18, implicit $rm + %28:vsfrc = nofpexcept XSMADDADP %27, %27, %18, implicit $rm + %29:vsfrc = nofpexcept XSMADDADP %28, %28, %18, implicit $rm + %30:vsfrc = nofpexcept XSMADDADP %29, %29, %18, implicit $rm + %31:vsfrc = nofpexcept XSMADDADP killed %30, %30, %18, implicit $rm + %32:vsfrc = nofpexcept XSMADDADP %31, %31, %18, implicit $rm + %33:vsfrc = nofpexcept XSMADDADP %32, %32, %18, implicit $rm + %34:vsfrc = nofpexcept XSMADDADP %33, %33, %18, implicit $rm + %35:vsfrc = nofpexcept XSMADDADP %34, %34, %18, implicit $rm + %36:vsfrc = nofpexcept XSADDDP %34, %35, implicit $rm + %37:vsfrc = nofpexcept XSMADDADP killed %36, %35, %18, implicit $rm + %38:vsfrc = nofpexcept XSADDDP %27, %37, implicit $rm + %39:vsfrc = nofpexcept XSMADDADP killed %38, %20, %18, implicit $rm + %40:vsfrc = nofpexcept XSADDDP %28, %39, implicit $rm + %41:vsfrc = nofpexcept XSMADDADP killed %40, %21, %18, implicit $rm + %42:vsfrc = nofpexcept XSMULDP %18, killed %22, implicit $rm + %43:vsfrc = nofpexcept XSMULDP killed %42, %29, implicit $rm + %44:vsfrc = nofpexcept XSMADDADP %31, killed %43, killed %28, implicit $rm + %45:vsfrc = nofpexcept XSMULDP killed %18, killed %24, implicit $rm + %46:vsfrc = nofpexcept XSMULDP %1, killed %45, implicit $rm + %47:vsfrc = nofpexcept XSMADDADP %44, killed %46, killed %26, implicit $rm + %48:vsfrc = nofpexcept XSADDDP killed %20, killed %47, implicit $rm + %49:vsfrc = nofpexcept XSADDDP %21, killed %48, implicit $rm + %50:vsfrc = nofpexcept XSMULDP killed %29, killed %31, implicit $rm + %51:vsfrc = nofpexcept XSMULDP killed %50, killed %32, implicit $rm + %52:vsfrc = nofpexcept XSMULDP killed %51, killed %33, implicit $rm + %53:vsfrc = nofpexcept XSMADDADP killed %49, killed %52, killed %34, 
implicit $rm + %54:vsfrc = nofpexcept XSMADDADP %53, %25, %1, implicit $rm + %55:vsfrc = nofpexcept XSMADDADP killed %54, killed %23, killed %21, implicit $rm + %56:vsfrc = nofpexcept XSMULDP killed %35, killed %37, implicit $rm + %57:vsfrc = nofpexcept XSMULDP killed %56, killed %39, implicit $rm + %58:vsfrc = nofpexcept XSMADDADP killed %55, killed %57, killed %41, implicit $rm + %59:vsfrc = nofpexcept XSMULDP killed %53, killed %58, implicit $rm + %60:vsfrc = nofpexcept XSMULDP killed %44, killed %59, implicit $rm + %61:vsfrc = nofpexcept XSMULDP killed %27, killed %60, implicit $rm + %7:vsfrc = nofpexcept XSMADDADP killed %4, killed %61, killed %25, implicit $rm + BDNZ8 %bb.4, implicit-def $ctr8, implicit $ctr8 + B %bb.3 + +...