release/20.x: [AArch64][SME] Spill p-regs as z-regs when streaming hazards are possible #126503
Conversation
@llvm/pr-subscribers-tablegen @llvm/pr-subscribers-backend-aarch64

Author: Benjamin Maxwell (MacDue)

Changes: This cherry-picks commits 82c6b8f and e470dca. These are needed for the `-aarch64-enable-zpr-predicate-spills` flag, an off-by-default flag that converts predicate spills to data vector spills in streaming[-compatible] functions. We think this should be fairly low risk, as this feature needs to be manually enabled, but we'd like it to be available for users to experiment with in LLVM 20.

Patch is 82.18 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/126503.diff

11 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index a082a1ebe95bf84..d3abd79b85a75f7 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -1634,6 +1634,9 @@ static bool IsSVECalleeSave(MachineBasicBlock::iterator I) {
case AArch64::STR_PXI:
case AArch64::LDR_ZXI:
case AArch64::LDR_PXI:
+ case AArch64::PTRUE_B:
+ case AArch64::CPY_ZPzI_B:
+ case AArch64::CMPNE_PPzZI_B:
return I->getFlag(MachineInstr::FrameSetup) ||
I->getFlag(MachineInstr::FrameDestroy);
}
@@ -3265,7 +3268,8 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
StrOpc = RPI.isPaired() ? AArch64::ST1B_2Z_IMM : AArch64::STR_ZXI;
break;
case RegPairInfo::PPR:
- StrOpc = AArch64::STR_PXI;
+ StrOpc =
+ Size == 16 ? AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO : AArch64::STR_PXI;
break;
case RegPairInfo::VG:
StrOpc = AArch64::STRXui;
@@ -3494,7 +3498,8 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
LdrOpc = RPI.isPaired() ? AArch64::LD1B_2Z_IMM : AArch64::LDR_ZXI;
break;
case RegPairInfo::PPR:
- LdrOpc = AArch64::LDR_PXI;
+ LdrOpc = Size == 16 ? AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO
+ : AArch64::LDR_PXI;
break;
case RegPairInfo::VG:
continue;
@@ -3720,6 +3725,14 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
continue;
}
+ // Always save P4 when PPR spills are ZPR-sized and a predicate above p8 is
+ // spilled. If all of p0-p3 are used as return values p4 must be free
+ // to reload p8-p15.
+ if (RegInfo->getSpillSize(AArch64::PPRRegClass) == 16 &&
+ AArch64::PPR_p8to15RegClass.contains(Reg)) {
+ SavedRegs.set(AArch64::P4);
+ }
+
// MachO's compact unwind format relies on all registers being stored in
// pairs.
// FIXME: the usual format is actually better if unwinding isn't needed.
@@ -4159,8 +4172,312 @@ int64_t AArch64FrameLowering::assignSVEStackObjectOffsets(
true);
}
+/// Attempts to scavenge a register from \p ScavengeableRegs given the used
+/// registers in \p UsedRegs.
+static Register tryScavengeRegister(LiveRegUnits const &UsedRegs,
+ BitVector const &ScavengeableRegs,
+ Register PreferredReg) {
+ if (PreferredReg != AArch64::NoRegister && UsedRegs.available(PreferredReg))
+ return PreferredReg;
+ for (auto Reg : ScavengeableRegs.set_bits()) {
+ if (UsedRegs.available(Reg))
+ return Reg;
+ }
+ return AArch64::NoRegister;
+}
+
+/// Propagates frame-setup/destroy flags from \p SourceMI to all instructions in
+/// \p MachineInstrs.
+static void propagateFrameFlags(MachineInstr &SourceMI,
+ ArrayRef<MachineInstr *> MachineInstrs) {
+ for (MachineInstr *MI : MachineInstrs) {
+ if (SourceMI.getFlag(MachineInstr::FrameSetup))
+ MI->setFlag(MachineInstr::FrameSetup);
+ if (SourceMI.getFlag(MachineInstr::FrameDestroy))
+ MI->setFlag(MachineInstr::FrameDestroy);
+ }
+}
+
+/// RAII helper class for scavenging or spilling a register. On construction
+/// attempts to find a free register of class \p RC (given \p UsedRegs and \p
+/// AllocatableRegs), if no register can be found spills \p SpillCandidate to \p
+/// MaybeSpillFI to free a register. The free'd register is returned via the \p
+/// FreeReg output parameter. On destruction, if there is a spill, its previous
+/// value is reloaded. The spilling and scavenging is only valid at the
+/// insertion point \p MBBI, this class should _not_ be used in places that
+/// create or manipulate basic blocks, moving the expected insertion point.
+struct ScopedScavengeOrSpill {
+ ScopedScavengeOrSpill(const ScopedScavengeOrSpill &) = delete;
+ ScopedScavengeOrSpill(ScopedScavengeOrSpill &&) = delete;
+
+ ScopedScavengeOrSpill(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ Register SpillCandidate, const TargetRegisterClass &RC,
+ LiveRegUnits const &UsedRegs,
+ BitVector const &AllocatableRegs,
+ std::optional<int> *MaybeSpillFI,
+ Register PreferredReg = AArch64::NoRegister)
+ : MBB(MBB), MBBI(MBBI), RC(RC), TII(static_cast<const AArch64InstrInfo &>(
+ *MF.getSubtarget().getInstrInfo())),
+ TRI(*MF.getSubtarget().getRegisterInfo()) {
+ FreeReg = tryScavengeRegister(UsedRegs, AllocatableRegs, PreferredReg);
+ if (FreeReg != AArch64::NoRegister)
+ return;
+ assert(MaybeSpillFI && "Expected emergency spill slot FI information "
+ "(attempted to spill in prologue/epilogue?)");
+ if (!MaybeSpillFI->has_value()) {
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ *MaybeSpillFI = MFI.CreateSpillStackObject(TRI.getSpillSize(RC),
+ TRI.getSpillAlign(RC));
+ }
+ FreeReg = SpillCandidate;
+ SpillFI = MaybeSpillFI->value();
+ TII.storeRegToStackSlot(MBB, MBBI, FreeReg, false, *SpillFI, &RC, &TRI,
+ Register());
+ }
+
+ bool hasSpilled() const { return SpillFI.has_value(); }
+
+ /// Returns the free register (found from scavenging or spilling a register).
+ Register freeRegister() const { return FreeReg; }
+
+ Register operator*() const { return freeRegister(); }
+
+ ~ScopedScavengeOrSpill() {
+ if (hasSpilled())
+ TII.loadRegFromStackSlot(MBB, MBBI, FreeReg, *SpillFI, &RC, &TRI,
+ Register());
+ }
+
+private:
+ MachineBasicBlock &MBB;
+ MachineBasicBlock::iterator MBBI;
+ const TargetRegisterClass &RC;
+ const AArch64InstrInfo &TII;
+ const TargetRegisterInfo &TRI;
+ Register FreeReg = AArch64::NoRegister;
+ std::optional<int> SpillFI;
+};
+
+/// Emergency stack slots for expanding SPILL_PPR_TO_ZPR_SLOT_PSEUDO and
+/// FILL_PPR_FROM_ZPR_SLOT_PSEUDO.
+struct EmergencyStackSlots {
+ std::optional<int> ZPRSpillFI;
+ std::optional<int> PPRSpillFI;
+ std::optional<int> GPRSpillFI;
+};
+
+/// Registers available for scavenging (ZPR, PPR3b, GPR).
+struct ScavengeableRegs {
+ BitVector ZPRRegs;
+ BitVector PPR3bRegs;
+ BitVector GPRRegs;
+};
+
+static bool isInPrologueOrEpilogue(const MachineInstr &MI) {
+ return MI.getFlag(MachineInstr::FrameSetup) ||
+ MI.getFlag(MachineInstr::FrameDestroy);
+}
+
+/// Expands:
+/// ```
+/// SPILL_PPR_TO_ZPR_SLOT_PSEUDO $p0, %stack.0, 0
+/// ```
+/// To:
+/// ```
+/// $z0 = CPY_ZPzI_B $p0, 1, 0
+/// STR_ZXI $z0, $stack.0, 0
+/// ```
+/// While ensuring a ZPR ($z0 in this example) is free for the predicate (
+/// spilling if necessary).
+static void expandSpillPPRToZPRSlotPseudo(MachineBasicBlock &MBB,
+ MachineInstr &MI,
+ const TargetRegisterInfo &TRI,
+ LiveRegUnits const &UsedRegs,
+ ScavengeableRegs const &SR,
+ EmergencyStackSlots &SpillSlots) {
+ MachineFunction &MF = *MBB.getParent();
+ auto *TII =
+ static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
+
+ ScopedScavengeOrSpill ZPredReg(
+ MF, MBB, MI, AArch64::Z0, AArch64::ZPRRegClass, UsedRegs, SR.ZPRRegs,
+ isInPrologueOrEpilogue(MI) ? nullptr : &SpillSlots.ZPRSpillFI);
+
+ SmallVector<MachineInstr *, 2> MachineInstrs;
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineInstrs.push_back(BuildMI(MBB, MI, DL, TII->get(AArch64::CPY_ZPzI_B))
+ .addReg(*ZPredReg, RegState::Define)
+ .add(MI.getOperand(0))
+ .addImm(1)
+ .addImm(0)
+ .getInstr());
+ MachineInstrs.push_back(BuildMI(MBB, MI, DL, TII->get(AArch64::STR_ZXI))
+ .addReg(*ZPredReg)
+ .add(MI.getOperand(1))
+ .addImm(MI.getOperand(2).getImm())
+ .setMemRefs(MI.memoperands())
+ .getInstr());
+ propagateFrameFlags(MI, MachineInstrs);
+}
+
+/// Expands:
+/// ```
+/// $p0 = FILL_PPR_FROM_ZPR_SLOT_PSEUDO %stack.0, 0
+/// ```
+/// To:
+/// ```
+/// $z0 = LDR_ZXI %stack.0, 0
+/// $p0 = PTRUE_B 31, implicit $vg
+/// $p0 = CMPNE_PPzZI_B $p0, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+/// ```
+/// While ensuring a ZPR ($z0 in this example) is free for the predicate (
+/// spilling if necessary). If the status flags are in use at the point of
+/// expansion they are preserved (by moving them to/from a GPR). This may cause
+/// an additional spill if no GPR is free at the expansion point.
+static bool expandFillPPRFromZPRSlotPseudo(
+ MachineBasicBlock &MBB, MachineInstr &MI, const TargetRegisterInfo &TRI,
+ LiveRegUnits const &UsedRegs, ScavengeableRegs const &SR,
+ MachineInstr *&LastPTrue, EmergencyStackSlots &SpillSlots) {
+ MachineFunction &MF = *MBB.getParent();
+ auto *TII =
+ static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
+
+ ScopedScavengeOrSpill ZPredReg(
+ MF, MBB, MI, AArch64::Z0, AArch64::ZPRRegClass, UsedRegs, SR.ZPRRegs,
+ isInPrologueOrEpilogue(MI) ? nullptr : &SpillSlots.ZPRSpillFI);
+
+ ScopedScavengeOrSpill PredReg(
+ MF, MBB, MI, AArch64::P0, AArch64::PPR_3bRegClass, UsedRegs, SR.PPR3bRegs,
+ isInPrologueOrEpilogue(MI) ? nullptr : &SpillSlots.PPRSpillFI,
+ /*PreferredReg=*/
+ LastPTrue ? LastPTrue->getOperand(0).getReg() : AArch64::NoRegister);
+
+ // Elide NZCV spills if we know it is not used.
+ bool IsNZCVUsed = !UsedRegs.available(AArch64::NZCV);
+ std::optional<ScopedScavengeOrSpill> NZCVSaveReg;
+ if (IsNZCVUsed)
+ NZCVSaveReg.emplace(
+ MF, MBB, MI, AArch64::X0, AArch64::GPR64RegClass, UsedRegs, SR.GPRRegs,
+ isInPrologueOrEpilogue(MI) ? nullptr : &SpillSlots.GPRSpillFI);
+ SmallVector<MachineInstr *, 4> MachineInstrs;
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineInstrs.push_back(BuildMI(MBB, MI, DL, TII->get(AArch64::LDR_ZXI))
+ .addReg(*ZPredReg, RegState::Define)
+ .add(MI.getOperand(1))
+ .addImm(MI.getOperand(2).getImm())
+ .setMemRefs(MI.memoperands())
+ .getInstr());
+ if (IsNZCVUsed)
+ MachineInstrs.push_back(
+ BuildMI(MBB, MI, DL, TII->get(AArch64::MRS))
+ .addReg(NZCVSaveReg->freeRegister(), RegState::Define)
+ .addImm(AArch64SysReg::NZCV)
+ .addReg(AArch64::NZCV, RegState::Implicit)
+ .getInstr());
+
+ // Reuse previous ptrue if we know it has not been clobbered.
+ if (LastPTrue) {
+ assert(*PredReg == LastPTrue->getOperand(0).getReg());
+ LastPTrue->moveBefore(&MI);
+ } else {
+ LastPTrue = BuildMI(MBB, MI, DL, TII->get(AArch64::PTRUE_B))
+ .addReg(*PredReg, RegState::Define)
+ .addImm(31);
+ }
+ MachineInstrs.push_back(LastPTrue);
+ MachineInstrs.push_back(
+ BuildMI(MBB, MI, DL, TII->get(AArch64::CMPNE_PPzZI_B))
+ .addReg(MI.getOperand(0).getReg(), RegState::Define)
+ .addReg(*PredReg)
+ .addReg(*ZPredReg)
+ .addImm(0)
+ .addReg(AArch64::NZCV, RegState::ImplicitDefine)
+ .getInstr());
+ if (IsNZCVUsed)
+ MachineInstrs.push_back(BuildMI(MBB, MI, DL, TII->get(AArch64::MSR))
+ .addImm(AArch64SysReg::NZCV)
+ .addReg(NZCVSaveReg->freeRegister())
+ .addReg(AArch64::NZCV, RegState::ImplicitDefine)
+ .getInstr());
+
+ propagateFrameFlags(MI, MachineInstrs);
+ return PredReg.hasSpilled();
+}
+
+/// Expands all FILL_PPR_FROM_ZPR_SLOT_PSEUDO and SPILL_PPR_TO_ZPR_SLOT_PSEUDO
+/// operations within the MachineBasicBlock \p MBB.
+static bool expandSMEPPRToZPRSpillPseudos(MachineBasicBlock &MBB,
+ const TargetRegisterInfo &TRI,
+ ScavengeableRegs const &SR,
+ EmergencyStackSlots &SpillSlots) {
+ LiveRegUnits UsedRegs(TRI);
+ UsedRegs.addLiveOuts(MBB);
+ bool HasPPRSpills = false;
+ MachineInstr *LastPTrue = nullptr;
+ for (MachineInstr &MI : make_early_inc_range(reverse(MBB))) {
+ UsedRegs.stepBackward(MI);
+ switch (MI.getOpcode()) {
+ case AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO:
+ if (LastPTrue &&
+ MI.definesRegister(LastPTrue->getOperand(0).getReg(), &TRI))
+ LastPTrue = nullptr;
+ HasPPRSpills |= expandFillPPRFromZPRSlotPseudo(MBB, MI, TRI, UsedRegs, SR,
+ LastPTrue, SpillSlots);
+ MI.eraseFromParent();
+ break;
+ case AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO:
+ expandSpillPPRToZPRSlotPseudo(MBB, MI, TRI, UsedRegs, SR, SpillSlots);
+ MI.eraseFromParent();
+ [[fallthrough]];
+ default:
+ LastPTrue = nullptr;
+ break;
+ }
+ }
+
+ return HasPPRSpills;
+}
+
void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
MachineFunction &MF, RegScavenger *RS) const {
+
+ AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ const TargetSubtargetInfo &TSI = MF.getSubtarget();
+ const TargetRegisterInfo &TRI = *TSI.getRegisterInfo();
+
+ // If predicates spills are 16-bytes we may need to expand
+ // SPILL_PPR_TO_ZPR_SLOT_PSEUDO/FILL_PPR_FROM_ZPR_SLOT_PSEUDO.
+ if (AFI->hasStackFrame() && TRI.getSpillSize(AArch64::PPRRegClass) == 16) {
+ auto ComputeScavengeableRegisters = [&](unsigned RegClassID) {
+ BitVector Regs = TRI.getAllocatableSet(MF, TRI.getRegClass(RegClassID));
+ assert(Regs.count() > 0 && "Expected scavengeable registers");
+ return Regs;
+ };
+
+ ScavengeableRegs SR{};
+ SR.ZPRRegs = ComputeScavengeableRegisters(AArch64::ZPRRegClassID);
+ // Only p0-7 are possible as the second operand of cmpne (needed for fills).
+ SR.PPR3bRegs = ComputeScavengeableRegisters(AArch64::PPR_3bRegClassID);
+ SR.GPRRegs = ComputeScavengeableRegisters(AArch64::GPR64RegClassID);
+
+ EmergencyStackSlots SpillSlots;
+ for (MachineBasicBlock &MBB : MF) {
+ // In the case we had to spill a predicate (in the range p0-p7) to reload
+ // a predicate (>= p8), additional spill/fill pseudos will be created.
+ // These need an additional expansion pass. Note: There will only be at
+ // most two expansion passes, as spilling/filling a predicate in the range
+ // p0-p7 never requires spilling another predicate.
+ for (int Pass = 0; Pass < 2; Pass++) {
+ bool HasPPRSpills =
+ expandSMEPPRToZPRSpillPseudos(MBB, TRI, SR, SpillSlots);
+ assert((Pass == 0 || !HasPPRSpills) && "Did not expect PPR spills");
+ if (!HasPPRSpills)
+ break;
+ }
+ }
+ }
+
MachineFrameInfo &MFI = MF.getFrameInfo();
assert(getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown &&
@@ -4170,7 +4487,6 @@ void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
int64_t SVEStackSize =
assignSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex);
- AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
AFI->setStackSizeSVE(alignTo(SVEStackSize, 16U));
AFI->setMinMaxSVECSFrameIndex(MinCSFrameIndex, MaxCSFrameIndex);
@@ -5204,9 +5520,13 @@ void AArch64FrameLowering::emitRemarks(
unsigned RegTy = StackAccess::AccessType::GPR;
if (MFI.getStackID(FrameIdx) == TargetStackID::ScalableVector) {
- if (AArch64::PPRRegClass.contains(MI.getOperand(0).getReg()))
+ // SPILL_PPR_TO_ZPR_SLOT_PSEUDO and FILL_PPR_FROM_ZPR_SLOT_PSEUDO
+ // spill/fill the predicate as a data vector (so are an FPR access).
+ if (MI.getOpcode() != AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO &&
+ MI.getOpcode() != AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO &&
+ AArch64::PPRRegClass.contains(MI.getOperand(0).getReg())) {
RegTy = StackAccess::PPR;
- else
+ } else
RegTy = StackAccess::FPR;
} else if (AArch64InstrInfo::isFpOrNEON(MI)) {
RegTy = StackAccess::FPR;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 17dd8a073eff0f9..0f2b969fba35c7c 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -81,7 +81,7 @@ static cl::opt<unsigned>
AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
: AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP,
AArch64::CATCHRET),
- RI(STI.getTargetTriple()), Subtarget(STI) {}
+ RI(STI.getTargetTriple(), STI.getHwMode()), Subtarget(STI) {}
/// GetInstSize - Return the number of bytes of code the specified
/// instruction may be. This returns the maximum number of bytes.
@@ -2438,6 +2438,8 @@ unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
case AArch64::STZ2Gi:
case AArch64::STZGi:
case AArch64::TAGPstack:
+ case AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO:
+ case AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO:
return 2;
case AArch64::LD1B_D_IMM:
case AArch64::LD1B_H_IMM:
@@ -4223,6 +4225,8 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
MinOffset = -256;
MaxOffset = 254;
break;
+ case AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO:
+ case AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO:
case AArch64::LDR_ZXI:
case AArch64::STR_ZXI:
Scale = TypeSize::getScalable(16);
@@ -5355,6 +5359,11 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
"Unexpected register store without SVE store instructions");
Opc = AArch64::STR_ZXI;
StackID = TargetStackID::ScalableVector;
+ } else if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
+ assert(Subtarget.isSVEorStreamingSVEAvailable() &&
+ "Unexpected predicate store without SVE store instructions");
+ Opc = AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO;
+ StackID = TargetStackID::ScalableVector;
}
break;
case 24:
@@ -5527,6 +5536,11 @@ void AArch64InstrInfo::loadRegFromStackSlot(
"Unexpected register load without SVE load instructions");
Opc = AArch64::LDR_ZXI;
StackID = TargetStackID::ScalableVector;
+ } else if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
+ assert(Subtarget.isSVEorStreamingSVEAvailable() &&
+ "Unexpected predicate load without SVE load instructions");
+ Opc = AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO;
+ StackID = TargetStackID::ScalableVector;
}
break;
case 24:
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 5973b63b5a80243..e9730348ba58e5b 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -38,8 +38,8 @@ using namespace llvm;
#define GET_REGINFO_TARGET_DESC
#include "AArch64GenRegisterInfo.inc"
-AArch64RegisterInfo::AArch64RegisterInfo(const Triple &TT)
- : AArch64GenRegisterInfo(AArch64::LR), TT(TT) {
+AArch64RegisterInfo::AArch64RegisterInfo(const Triple &TT, unsigned HwMode)
+ : AArch64GenRegisterInfo(AArch64::LR, 0, 0, 0, HwMode), TT(TT) {
AArch64_MC::initLLVMToCVRegMapping(this);
}
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
index 11da624af4881b4..898a509f75908f8 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
@@ -27,7 +27,7 @@ class AArch64RegisterInfo final : public AArch64GenRegisterInfo {
const Triple &TT;
public:
- AArch64RegisterInfo(const Triple &TT);
+ AArch64RegisterInfo(const Triple &TT, unsigned HwMode);
// FIXM...
[truncated]
There is no risk in adding this to the release branch, because all functionality is hidden behind a flag. The TableGen/SubtargetEmitter.cpp change should not affect anything, because it merely emits an extra enum class to the *GenSubtargetInfo.cpp file.
[AArch64][SME] Spill p-regs as z-regs when streaming hazards are possible (llvm#123752)

This patch adds a new option `-aarch64-enable-zpr-predicate-spills` (which is disabled by default); this option replaces predicate spills with vector spills in streaming[-compatible] functions. For example:

```
str p8, [sp, #7, mul vl]    // 2-byte Folded Spill
// ...
ldr p8, [sp, #7, mul vl]    // 2-byte Folded Reload
```

Becomes:

```
mov z0.b, p8/z, #1
str z0, [sp]                // 16-byte Folded Spill
// ...
ldr z0, [sp]                // 16-byte Folded Reload
ptrue p4.b
cmpne p8.b, p4/z, z0.b, #0
```

This is done to avoid streaming memory hazards between FPR/vector and predicate spills, which currently occupy the same stack area even when the `-aarch64-stack-hazard-size` flag is set. This is implemented with two new pseudos, SPILL_PPR_TO_ZPR_SLOT_PSEUDO and FILL_PPR_FROM_ZPR_SLOT_PSEUDO. The expansion of these pseudos handles scavenging the required registers (z0 in the above example) and, in the worst case, spilling a register to an emergency stack slot during the expansion. The condition flags are also preserved around the `cmpne` in case they are live at the expansion point.
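For context, the fill expansion also preserves NZCV when the status flags are live across the reload, as the MRS/MSR pair in the diff above shows. A minimal sketch of that case, with the scavenged registers z0, p4, and x8 chosen purely for illustration:

```
ldr z0, [sp]                  // reload the spilled predicate as a data vector
mrs x8, NZCV                  // save the live status flags to a scratch GPR
ptrue p4.b                    // all-true governing predicate for the compare
cmpne p8.b, p4/z, z0.b, #0    // reconstruct p8 (the compare clobbers NZCV)
msr NZCV, x8                  // restore the status flags
```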
…vm#125523) Currently, each expansion of `FILL_PPR_FROM_ZPR_SLOT_PSEUDO` creates a new ptrue instruction. This patch adds a simple method to reuse a previous ptrue instruction when expanding back-to-back fill pseudos.
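As a rough sketch of the effect (stack offsets and register numbers are assumptions for illustration), two back-to-back fills would then share a single ptrue:

```
ldr z0, [sp]                  // fill of the first predicate
ptrue p4.b                    // emitted once for the first fill
cmpne p8.b, p4/z, z0.b, #0
ldr z0, [sp, #1, mul vl]      // fill of the second predicate
cmpne p9.b, p4/z, z0.b, #0    // p4 is reused; no second ptrue is emitted
```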
Force-pushed from 19daa20 to 7b9b674.
@MacDue (or anyone else): If you would like to add a note about this fix in the release notes (completely optional), please reply to this comment with a one- or two-sentence description of the fix. When you are done, please add the release:note label to this PR.
This cherry-picks commits: 82c6b8f and e470dca. These are needed for the `-aarch64-enable-zpr-predicate-spills` flag. This is an off-by-default flag that converts predicate spills to data vector spills in streaming[-compatible] functions. We think this should be fairly low risk as this feature needs to be manually enabled, and we'd like this to be available for users to experiment with in LLVM 20.