[AArch64][SME] Support agnostic ZA functions in the MachineSMEABIPass #149064
Conversation
@llvm/pr-subscribers-backend-aarch64

Author: Benjamin Maxwell (MacDue)

Changes

This extends the MachineSMEABIPass to handle agnostic ZA functions. This case is currently handled like shared ZA functions, but we don't require ZA state to be reloaded before agnostic ZA calls.

Note: This patch does not yet fully handle agnostic ZA functions that can catch exceptions. E.g.:

```
__arm_agnostic("sme_za_state") void try_catch_agnostic_za_callee() {
  try {
    agnostic_za_call();
  } catch(...) {
    noexcept_agnostic_za_call();
  }
}
```

As in this case, we won't commit a ZA save before the `agnostic_za_call()`, which would be needed to restore ZA in the catch block. This will be handled in a later patch.
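For illustration, here is a source-level sketch of the two call cases (my own example, not from the PR; callee names and attribute placement mirror the `sme-agnostic-za.ll` tests and the example above): a private-ZA callee forces a full ZA save/restore around the call, while a call to another agnostic-ZA callee needs no save/restore at all.

```cpp
// Sketch only; names mirror the declarations in the tests below.
extern "C" long private_za_decl(long);
extern "C" __arm_agnostic("sme_za_state") long agnostic_decl(long);

__arm_agnostic("sme_za_state") long calls_private_za(long v) {
  // Lowered with __arm_sme_save before the call and __arm_sme_restore after,
  // since the callee may clobber ZA.
  return private_za_decl(v);
}

__arm_agnostic("sme_za_state") long calls_agnostic(long v) {
  // No save/restore emitted: the callee preserves ZA state itself.
  return agnostic_decl(v);
}
```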
Patch is 27.23 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/149064.diff

3 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d586942582d8b..e0f157141c899 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -8154,7 +8154,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
if (Subtarget->hasCustomCallingConv())
Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
- if (Subtarget->useNewSMEABILowering() && !Attrs.hasAgnosticZAInterface()) {
+ if (Subtarget->useNewSMEABILowering()) {
if (Subtarget->isTargetWindows() || hasInlineStackProbe(MF)) {
SDValue Size;
if (Attrs.hasZAState()) {
@@ -8965,9 +8965,13 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
bool UseNewSMEABILowering = Subtarget->useNewSMEABILowering();
bool IsAgnosticZAFunction = CallAttrs.caller().hasAgnosticZAInterface();
auto ZAMarkerNode = [&]() -> std::optional<unsigned> {
- // TODO: Handle agnostic ZA functions.
- if (!UseNewSMEABILowering || IsAgnosticZAFunction)
+ if (!UseNewSMEABILowering)
+ return std::nullopt;
+ if (IsAgnosticZAFunction) {
+ if (CallAttrs.requiresPreservingAllZAState())
+ return AArch64ISD::REQUIRES_ZA_SAVE;
return std::nullopt;
+ }
if (!CallAttrs.caller().hasZAState() && !CallAttrs.caller().hasZT0State())
return std::nullopt;
return CallAttrs.requiresLazySave() ? AArch64ISD::REQUIRES_ZA_SAVE
@@ -9047,7 +9051,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
};
bool RequiresLazySave = !UseNewSMEABILowering && CallAttrs.requiresLazySave();
- bool RequiresSaveAllZA = CallAttrs.requiresPreservingAllZAState();
+ bool RequiresSaveAllZA =
+ !UseNewSMEABILowering && CallAttrs.requiresPreservingAllZAState();
if (RequiresLazySave) {
const TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
MachinePointerInfo MPI =
diff --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
index 287cc86e19bde..7c0cad299cc64 100644
--- a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
+++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
//
// This pass implements the SME ABI requirements for ZA state. This includes
-// implementing the lazy ZA state save schemes around calls.
+// implementing the lazy (and agnostic) ZA state save schemes around calls.
//
//===----------------------------------------------------------------------===//
@@ -128,7 +128,7 @@ struct MachineSMEABI : public MachineFunctionPass {
void collectNeededZAStates(MachineFunction &MF, SMEAttrs);
void pickBundleZAStates(MachineFunction &MF);
- void insertStateChanges(MachineFunction &MF);
+ void insertStateChanges(MachineFunction &MF, bool IsAgnosticZA);
// Emission routines for private and shared ZA functions (using lazy saves).
void emitNewZAPrologue(MachineBasicBlock &MBB,
@@ -143,11 +143,46 @@ struct MachineSMEABI : public MachineFunctionPass {
void emitZAOff(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
bool ClearTPIDR2);
+ // Emission routines for agnostic ZA functions.
+ void emitSetupFullZASave(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ LiveRegs PhysLiveRegs);
+ void emitFullZASaveRestore(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ LiveRegs PhysLiveRegs, bool IsSave);
+ void emitAllocateFullZASaveBuffer(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ LiveRegs PhysLiveRegs);
+
void emitStateChange(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
- ZAState From, ZAState To, LiveRegs PhysLiveRegs);
+ ZAState From, ZAState To, LiveRegs PhysLiveRegs,
+ bool IsAgnosticZA);
+
+ // Helpers for switching between lazy/full ZA save/restore routines.
+ void emitZASave(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ LiveRegs PhysLiveRegs, bool IsAgnosticZA) {
+ if (IsAgnosticZA)
+ return emitFullZASaveRestore(MBB, MBBI, PhysLiveRegs, /*IsSave=*/true);
+ return emitSetupLazySave(MBB, MBBI);
+ }
+ void emitZARestore(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ LiveRegs PhysLiveRegs, bool IsAgnosticZA) {
+ if (IsAgnosticZA)
+ return emitFullZASaveRestore(MBB, MBBI, PhysLiveRegs, /*IsSave=*/false);
+ return emitRestoreLazySave(MBB, MBBI, PhysLiveRegs);
+ }
+ void emitAllocateZASaveBuffer(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ LiveRegs PhysLiveRegs, bool IsAgnosticZA) {
+ if (IsAgnosticZA)
+ return emitAllocateFullZASaveBuffer(MBB, MBBI, PhysLiveRegs);
+ return emitAllocateLazySaveBuffer(MBB, MBBI);
+ }
TPIDR2State getTPIDR2Block(MachineFunction &MF);
+ Register getAgnosticZABufferPtr(MachineFunction &MF);
+
private:
struct InstInfo {
ZAState NeededState{ZAState::ANY};
@@ -158,6 +193,7 @@ struct MachineSMEABI : public MachineFunctionPass {
struct BlockInfo {
ZAState FixedEntryState{ZAState::ANY};
SmallVector<InstInfo> Insts;
+ LiveRegs PhysLiveRegsAtEntry = LiveRegs::None;
LiveRegs PhysLiveRegsAtExit = LiveRegs::None;
};
@@ -167,6 +203,9 @@ struct MachineSMEABI : public MachineFunctionPass {
SmallVector<ZAState> BundleStates;
std::optional<TPIDR2State> TPIDR2Block;
std::optional<MachineBasicBlock::iterator> AfterSMEProloguePt;
+ Register AgnosticZABufferPtr = AArch64::NoRegister;
+ LiveRegs PhysLiveRegsAfterSMEPrologue = LiveRegs::None;
+ bool HasFullZASaveRestore = false;
} State;
EdgeBundles *Bundles = nullptr;
@@ -175,7 +214,8 @@ struct MachineSMEABI : public MachineFunctionPass {
void MachineSMEABI::collectNeededZAStates(MachineFunction &MF,
SMEAttrs SMEFnAttrs) {
const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
- assert((SMEFnAttrs.hasZT0State() || SMEFnAttrs.hasZAState()) &&
+ assert((SMEFnAttrs.hasAgnosticZAInterface() || SMEFnAttrs.hasZT0State() ||
+ SMEFnAttrs.hasZAState()) &&
"Expected function to have ZA/ZT0 state!");
State.Blocks.resize(MF.getNumBlockIDs());
@@ -209,6 +249,7 @@ void MachineSMEABI::collectNeededZAStates(MachineFunction &MF,
Block.PhysLiveRegsAtExit = GetPhysLiveRegs();
auto FirstTerminatorInsertPt = MBB.getFirstTerminator();
+ auto FirstNonPhiInsertPt = MBB.getFirstNonPHI();
for (MachineInstr &MI : reverse(MBB)) {
MachineBasicBlock::iterator MBBI(MI);
LiveUnits.stepBackward(MI);
@@ -219,15 +260,20 @@ void MachineSMEABI::collectNeededZAStates(MachineFunction &MF,
// block setup.
if (MI.getOpcode() == AArch64::SMEStateAllocPseudo) {
State.AfterSMEProloguePt = MBBI;
+ State.PhysLiveRegsAfterSMEPrologue = PhysLiveRegs;
}
+ // Note: We treat Agnostic ZA as inout_za with an alternate save/restore.
auto [NeededState, InsertPt] = getInstNeededZAState(
- TRI, MI, /*ZALiveAtReturn=*/SMEFnAttrs.hasSharedZAInterface());
+ TRI, MI, /*ZALiveAtReturn=*/SMEFnAttrs.hasSharedZAInterface() ||
+ SMEFnAttrs.hasAgnosticZAInterface());
assert((InsertPt == MBBI ||
InsertPt->getOpcode() == AArch64::ADJCALLSTACKDOWN) &&
"Unexpected state change insertion point!");
// TODO: Do something to avoid state changes where NZCV is live.
if (MBBI == FirstTerminatorInsertPt)
Block.PhysLiveRegsAtExit = PhysLiveRegs;
+ if (MBBI == FirstNonPhiInsertPt)
+ Block.PhysLiveRegsAtEntry = PhysLiveRegs;
if (NeededState != ZAState::ANY)
Block.Insts.push_back({NeededState, InsertPt, PhysLiveRegs});
}
@@ -294,7 +340,7 @@ void MachineSMEABI::pickBundleZAStates(MachineFunction &MF) {
}
}
-void MachineSMEABI::insertStateChanges(MachineFunction &MF) {
+void MachineSMEABI::insertStateChanges(MachineFunction &MF, bool IsAgnosticZA) {
for (MachineBasicBlock &MBB : MF) {
BlockInfo &Block = State.Blocks[MBB.getNumber()];
ZAState InState =
@@ -309,7 +355,7 @@ void MachineSMEABI::insertStateChanges(MachineFunction &MF) {
for (auto &Inst : Block.Insts) {
if (CurrentState != Inst.NeededState)
emitStateChange(MBB, Inst.InsertPt, CurrentState, Inst.NeededState,
- Inst.PhysLiveRegs);
+ Inst.PhysLiveRegs, IsAgnosticZA);
CurrentState = Inst.NeededState;
}
@@ -318,7 +364,7 @@ void MachineSMEABI::insertStateChanges(MachineFunction &MF) {
if (CurrentState != OutState)
emitStateChange(MBB, MBB.getFirstTerminator(), CurrentState, OutState,
- Block.PhysLiveRegsAtExit);
+ Block.PhysLiveRegsAtExit, IsAgnosticZA);
}
}
@@ -571,10 +617,98 @@ void MachineSMEABI::emitNewZAPrologue(MachineBasicBlock &MBB,
emitZeroZA(TII, DL, MBB, MBBI, /*Mask=*/0b11111111);
}
+Register MachineSMEABI::getAgnosticZABufferPtr(MachineFunction &MF) {
+ if (State.AgnosticZABufferPtr != AArch64::NoRegister)
+ return State.AgnosticZABufferPtr;
+ if (auto BufferPtr =
+ MF.getInfo<AArch64FunctionInfo>()->getEarlyAllocSMESaveBuffer();
+ BufferPtr != AArch64::NoRegister)
+ State.AgnosticZABufferPtr = BufferPtr;
+ else
+ State.AgnosticZABufferPtr =
+ MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
+ return State.AgnosticZABufferPtr;
+}
+
+void MachineSMEABI::emitFullZASaveRestore(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ LiveRegs PhysLiveRegs, bool IsSave) {
+ MachineFunction &MF = *MBB.getParent();
+ auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+ const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo();
+ const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ State.HasFullZASaveRestore = true;
+ DebugLoc DL = getDebugLoc(MBB, MBBI);
+ Register BufferPtr = AArch64::X0;
+
+ ScopedPhysRegSave ScopedPhysRegSave(MRI, TII, DL, MBB, MBBI, PhysLiveRegs);
+
+ // Copy the buffer pointer into X0.
+ BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::COPY), BufferPtr)
+ .addReg(getAgnosticZABufferPtr(MF));
+
+ // Call __arm_sme_save/__arm_sme_restore.
+ BuildMI(MBB, MBBI, DL, TII.get(AArch64::BL))
+ .addReg(BufferPtr, RegState::Implicit)
+ .addExternalSymbol(IsSave ? "__arm_sme_save" : "__arm_sme_restore")
+ .addRegMask(TRI.getCallPreservedMask(
+ MF,
+ CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1));
+}
+
+void MachineSMEABI::emitAllocateFullZASaveBuffer(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ LiveRegs PhysLiveRegs) {
+ MachineFunction &MF = *MBB.getParent();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+ const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ auto *AFI = MF.getInfo<AArch64FunctionInfo>();
+
+ // Buffer already allocated in SelectionDAG.
+ if (AFI->getEarlyAllocSMESaveBuffer())
+ return;
+
+ DebugLoc DL = getDebugLoc(MBB, MBBI);
+ Register BufferPtr = getAgnosticZABufferPtr(MF);
+ Register BufferSize = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+
+ ScopedPhysRegSave ScopedPhysRegSave(MRI, TII, DL, MBB, MBBI, PhysLiveRegs);
+
+ // Calculate the SME state size.
+ {
+ const AArch64RegisterInfo *TRI = Subtarget.getRegisterInfo();
+ BuildMI(MBB, MBBI, DL, TII.get(AArch64::BL))
+ .addExternalSymbol("__arm_sme_state_size")
+ .addReg(AArch64::X0, RegState::ImplicitDefine)
+ .addRegMask(TRI->getCallPreservedMask(
+ MF, CallingConv::
+ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1));
+ BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::COPY), BufferSize)
+ .addReg(AArch64::X0);
+ }
+
+ // Allocate a buffer object of the size given __arm_sme_state_size.
+ {
+ BuildMI(MBB, MBBI, DL, TII.get(AArch64::SUBXrx64), AArch64::SP)
+ .addReg(AArch64::SP)
+ .addReg(BufferSize)
+ .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0));
+ BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::COPY), BufferPtr)
+ .addReg(AArch64::SP);
+
+ // We have just allocated a variable sized object, tell this to PEI.
+ MFI.CreateVariableSizedObject(Align(16), nullptr);
+ }
+}
+
void MachineSMEABI::emitStateChange(MachineBasicBlock &MBB,
MachineBasicBlock::iterator InsertPt,
ZAState From, ZAState To,
- LiveRegs PhysLiveRegs) {
+ LiveRegs PhysLiveRegs, bool IsAgnosticZA) {
// ZA not used.
if (From == ZAState::ANY || To == ZAState::ANY)
@@ -601,10 +735,11 @@ void MachineSMEABI::emitStateChange(MachineBasicBlock &MBB,
}
if (From == ZAState::ACTIVE && To == ZAState::LOCAL_SAVED)
- emitSetupLazySave(MBB, InsertPt);
+ emitZASave(MBB, InsertPt, PhysLiveRegs, IsAgnosticZA);
else if (From == ZAState::LOCAL_SAVED && To == ZAState::ACTIVE)
- emitRestoreLazySave(MBB, InsertPt, PhysLiveRegs);
+ emitZARestore(MBB, InsertPt, PhysLiveRegs, IsAgnosticZA);
else if (To == ZAState::OFF) {
+ assert(!IsAgnosticZA && "Should not turn ZA off in agnostic ZA function");
// If we're exiting from the CALLER_DORMANT state that means this new ZA
// function did not touch ZA (so ZA was never turned on).
if (From != ZAState::CALLER_DORMANT)
@@ -627,7 +762,8 @@ bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) {
auto *AFI = MF.getInfo<AArch64FunctionInfo>();
SMEAttrs SMEFnAttrs = AFI->getSMEFnAttrs();
- if (!SMEFnAttrs.hasZAState() && !SMEFnAttrs.hasZT0State())
+ if (!SMEFnAttrs.hasZAState() && !SMEFnAttrs.hasZT0State() &&
+ !SMEFnAttrs.hasAgnosticZAInterface())
return false;
assert(MF.getRegInfo().isSSA() && "Expected to be run on SSA form!");
@@ -636,20 +772,27 @@ bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) {
State = PassState{};
Bundles = &getAnalysis<EdgeBundlesWrapperLegacy>().getEdgeBundles();
+ bool IsAgnosticZA = SMEFnAttrs.hasAgnosticZAInterface();
+
collectNeededZAStates(MF, SMEFnAttrs);
pickBundleZAStates(MF);
- insertStateChanges(MF);
+ insertStateChanges(MF, /*IsAgnosticZA=*/IsAgnosticZA);
// Allocate save buffer (if needed).
- if (State.TPIDR2Block.has_value()) {
+ if (State.HasFullZASaveRestore || State.TPIDR2Block.has_value()) {
if (State.AfterSMEProloguePt) {
// Note: With inline stack probes the AfterSMEProloguePt may not be in the
// entry block (due to the probing loop).
- emitAllocateLazySaveBuffer(*(*State.AfterSMEProloguePt)->getParent(),
- *State.AfterSMEProloguePt);
+ emitAllocateZASaveBuffer(*(*State.AfterSMEProloguePt)->getParent(),
+ *State.AfterSMEProloguePt,
+ State.PhysLiveRegsAfterSMEPrologue,
+ /*IsAgnosticZA=*/IsAgnosticZA);
} else {
MachineBasicBlock &EntryBlock = MF.front();
- emitAllocateLazySaveBuffer(EntryBlock, EntryBlock.getFirstNonPHI());
+ emitAllocateZASaveBuffer(
+ EntryBlock, EntryBlock.getFirstNonPHI(),
+ State.Blocks[EntryBlock.getNumber()].PhysLiveRegsAtEntry,
+ /*IsAgnosticZA=*/IsAgnosticZA);
}
}
diff --git a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
index d1ec53f54c702..0447166a2dde6 100644
--- a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
+++ b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mattr=+sme2 < %s | FileCheck %s
-; RUN: llc -mattr=+sme2 < %s -aarch64-new-sme-abi | FileCheck %s
+; RUN: llc -mattr=+sme2 < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK
+; RUN: llc -mattr=+sme2 < %s -aarch64-new-sme-abi | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-NEWLOWERING
target triple = "aarch64"
@@ -9,10 +9,10 @@ declare i64 @agnostic_decl(i64) "aarch64_za_state_agnostic"
; No calls. Test that no buffer is allocated.
define i64 @agnostic_caller_no_callees(ptr %ptr) nounwind "aarch64_za_state_agnostic" {
-; CHECK-LABEL: agnostic_caller_no_callees:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr x0, [x0]
-; CHECK-NEXT: ret
+; CHECK-COMMON-LABEL: agnostic_caller_no_callees:
+; CHECK-COMMON: // %bb.0:
+; CHECK-COMMON-NEXT: ldr x0, [x0]
+; CHECK-COMMON-NEXT: ret
%v = load i64, ptr %ptr
ret i64 %v
}
@@ -51,6 +51,29 @@ define i64 @agnostic_caller_private_za_callee(i64 %v) nounwind "aarch64_za_state
; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
+;
+; CHECK-NEWLOWERING-LABEL: agnostic_caller_private_za_callee:
+; CHECK-NEWLOWERING: // %bb.0:
+; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT: mov x29, sp
+; CHECK-NEWLOWERING-NEXT: mov x8, x0
+; CHECK-NEWLOWERING-NEXT: bl __arm_sme_state_size
+; CHECK-NEWLOWERING-NEXT: sub sp, sp, x0
+; CHECK-NEWLOWERING-NEXT: mov x19, sp
+; CHECK-NEWLOWERING-NEXT: mov x0, x19
+; CHECK-NEWLOWERING-NEXT: bl __arm_sme_save
+; CHECK-NEWLOWERING-NEXT: mov x0, x8
+; CHECK-NEWLOWERING-NEXT: bl private_za_decl
+; CHECK-NEWLOWERING-NEXT: bl private_za_decl
+; CHECK-NEWLOWERING-NEXT: mov x8, x0
+; CHECK-NEWLOWERING-NEXT: mov x0, x19
+; CHECK-NEWLOWERING-NEXT: bl __arm_sme_restore
+; CHECK-NEWLOWERING-NEXT: mov x0, x8
+; CHECK-NEWLOWERING-NEXT: mov sp, x29
+; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT: ret
%res = call i64 @private_za_decl(i64 %v)
%res2 = call i64 @private_za_decl(i64 %res)
ret i64 %res2
@@ -60,12 +83,12 @@ define i64 @agnostic_caller_private_za_callee(i64 %v) nounwind "aarch64_za_state
;
; Should not result in save/restore code.
define i64 @agnostic_caller_agnostic_callee(i64 %v) nounwind "aarch64_za_state_agnostic" {
-; CHECK-LABEL: agnostic_caller_agnostic_callee:
-; CHECK: // %bb.0:
-; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: bl agnostic_decl
-; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT: ret
+; CHECK-COMMON-LABEL: agnostic_caller_agnostic_callee:
+; CHECK-COMMON: // %bb.0:
+; CHECK-COMMON-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-COMMON-NEXT: bl agnostic_decl
+; CHECK-COMMON-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-COMMON-NEXT: ret
%res = call i64 @agnostic_decl(i64 %v)
ret i64 %res
}
@@ -74,12 +97,12 @@ define i64 @agnostic_caller_agnostic_callee(i64 %v) nounwind "aarch64_za_state_a
;
; Should not result in lazy-save or save of ZT0
define i64 @shared_caller_agnostic_callee(i64 %v) nounwind "aarch64_inout_za" "aarch64_inout_zt0" {
-; CHECK-LABEL: shared_caller_agnostic_callee:
-; CHECK: // %bb.0:
-; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: bl agnostic_decl
-; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT: ret
+; CHECK-COMMON-LABEL: shared_caller_agnostic_callee:
+; CHECK-COMMON: // %bb.0:
+; CHECK-COMMON-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-COMMON-NEXT: bl agnostic_decl
+; CHECK-COMMON-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-COMMON-NEXT: ret
%res = call i64 @agnostic_decl(i64 %v)
ret i64 %res
}
@@ -130,6 +153,45 @@ define i64 @streaming_agnostic_caller_nonstreaming_priv...
[truncated]
Left two nits, but LGTM otherwise.
```cpp
std::optional<MachineBasicBlock::iterator> AfterSMEProloguePt;
Register AgnosticZABufferPtr = AArch64::NoRegister;
LiveRegs PhysLiveRegsAfterSMEPrologue = LiveRegs::None;
bool HasFullZASaveRestore = false;
```
Could you document what a "full ZA save/restore" is? How does this differ from the "plain" save/restore?
Added a little comment above `emitFullZASaveRestore`. It's "full" in the sense that it uses `__arm_sme_save` or `__arm_sme_restore`, which handle saving both ZA and ZT0.
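For context, a rough sketch of the support-routine interface as this patch uses it (the signatures are my paraphrase of the call sites in the pass, not copied from an ABI header): the buffer pointer is passed in x0, the state size is returned in x0, and the calls use the `AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1` calling convention.

```cpp
#include <cstdint>

// Sketch only: these routines come from the SME support library; the shapes
// below mirror how MachineSMEABIPass calls them in this patch.
extern "C" {
// Returns the number of bytes needed to save the full SME state
// (ZA, and ZT0 when present).
uint64_t __arm_sme_state_size(void);
// Save/restore the full SME state to/from the given buffer.
void __arm_sme_save(void *buffer);
void __arm_sme_restore(void *buffer);
}
```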
```cpp
// All pass state that must be cleared between functions.
struct PassState {
  SmallVector<BlockInfo> Blocks;
  SmallVector<ZAState> BundleStates;
```
We are starting to accumulate a lot of state, which makes the code harder to follow, since any member function can modify it instead of having clear ins/outs.

I know we discussed this already, but I feel it would be nice not to delay the refactoring too much. Even having a first step that collects all the info in a struct would help. We could then pass that info around by const ref to any function that needs it. If some info needs to be mutable, it should not be in the struct, but should instead be a clear in/out parameter.

Doing something like this would clearly decouple the "collection" phase from the "let me correctly handle the state changes" phase.
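A minimal sketch of the shape being suggested (hypothetical names, not actual code from this PR; see the WIP patch linked below):

```cpp
// Sketch only: the "collection" phase builds one analysis result up front,
// and the "emission" phase consumes it read-only.
struct FunctionInfo {
  SmallVector<BlockInfo> Blocks;     // per-block needed states + live regs
  SmallVector<ZAState> BundleStates; // picked state per edge bundle
};

// Built once by the collection phase, then never mutated.
FunctionInfo collectNeededZAStates(MachineFunction &MF, SMEAttrs SMEFnAttrs);

// Read-only view of the analysis; anything that must change becomes an
// explicit in/out parameter rather than hidden member state.
void insertStateChanges(MachineFunction &MF, const FunctionInfo &FnInfo,
                        bool IsAgnosticZA);
```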
Here's a WIP patch that implements the scheme I mentioned previously: #156674
On Windows, or with stack probes on other targets, additional code needs to be inserted after dynamic stack allocations to validate stack accesses and/or ensure enough stack space has been allocated. Rather than handle this case in the MachineSMEABIPass (like we do for the standard case), we allocate the memory for the lazy save buffer in SelectionDAG, which allows the existing expansions to emit the correct code.

Note: This means in these cases, we may allocate a lazy save buffer when there are no lazy saves present in the function (as we have to allocate the buffer before the MachineSMEABIPass runs).
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/163/builds/26131