From 0c087a1eec655669182913e3c5dc2a56671db762 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell
Date: Mon, 3 Nov 2025 15:41:49 +0000
Subject: [PATCH] [AArch64][SME] Support saving/restoring ZT0 in the
 MachineSMEABIPass
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch extends the MachineSMEABIPass to support ZT0. This is done
with the addition of two new states:

- `ACTIVE_ZT0_SAVED`
  * This is used when calling a function that shares ZA, but does not
    share ZT0 (i.e., no ZT0 attributes).
  * This state indicates ZT0 must be saved to the save slot, but ZA must
    remain on, with no lazy save setup

- `LOCAL_COMMITTED`
  * This is used for saving ZT0 in functions without ZA state.
  * This state indicates ZA is off and ZT0 has been saved.
  * This state is general enough to also cover saving ZA, but that case
    has not been implemented†

To aid with readability, the state transitions have been reworked to a
switch of `transitionFrom().to()`, rather than nested ifs, which helps
manage more transitions.

† This could be implemented to handle some cases of undefined behavior
better.
Change-Id: I14be4a7f8b998fe667bfaade5088f88039515f91 --- .../AArch64/AArch64ExpandPseudoInsts.cpp | 1 + .../Target/AArch64/AArch64ISelLowering.cpp | 11 +- .../lib/Target/AArch64/AArch64SMEInstrInfo.td | 6 + llvm/lib/Target/AArch64/MachineSMEABIPass.cpp | 176 +++++++++++++++--- .../test/CodeGen/AArch64/sme-peephole-opts.ll | 4 - .../test/CodeGen/AArch64/sme-za-exceptions.ll | 124 +++++++++--- llvm/test/CodeGen/AArch64/sme-zt0-state.ll | 104 ++++++----- 7 files changed, 321 insertions(+), 105 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 34d74d04c4419..60e6a82d41cc8 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -1717,6 +1717,7 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, } case AArch64::InOutZAUsePseudo: case AArch64::RequiresZASavePseudo: + case AArch64::RequiresZT0SavePseudo: case AArch64::SMEStateAllocPseudo: case AArch64::COALESCER_BARRIER_FPR16: case AArch64::COALESCER_BARRIER_FPR32: diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 30f961043e78b..20c1c6790b2fb 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -9457,6 +9457,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, if (CallAttrs.requiresLazySave() || CallAttrs.requiresPreservingAllZAState()) ZAMarkerNode = AArch64ISD::REQUIRES_ZA_SAVE; + else if (CallAttrs.requiresPreservingZT0()) + ZAMarkerNode = AArch64ISD::REQUIRES_ZT0_SAVE; else if (CallAttrs.caller().hasZAState() || CallAttrs.caller().hasZT0State()) ZAMarkerNode = AArch64ISD::INOUT_ZA_USE; @@ -9576,7 +9578,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, SDValue ZTFrameIdx; MachineFrameInfo &MFI = MF.getFrameInfo(); - bool ShouldPreserveZT0 = CallAttrs.requiresPreservingZT0(); + bool ShouldPreserveZT0 = + 
!UseNewSMEABILowering && CallAttrs.requiresPreservingZT0(); // If the caller has ZT0 state which will not be preserved by the callee, // spill ZT0 before the call. @@ -9589,7 +9592,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // If caller shares ZT0 but the callee is not shared ZA, we need to stop // PSTATE.ZA before the call if there is no lazy-save active. - bool DisableZA = CallAttrs.requiresDisablingZABeforeCall(); + bool DisableZA = + !UseNewSMEABILowering && CallAttrs.requiresDisablingZABeforeCall(); assert((!DisableZA || !RequiresLazySave) && "Lazy-save should have PSTATE.SM=1 on entry to the function"); @@ -10074,7 +10078,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, getSMToggleCondition(CallAttrs)); } - if (RequiresLazySave || CallAttrs.requiresEnablingZAAfterCall()) + if (!UseNewSMEABILowering && + (RequiresLazySave || CallAttrs.requiresEnablingZAAfterCall())) // Unconditionally resume ZA. Result = DAG.getNode( AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue), Result, diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index 8f8f211c5fceb..2753a4561daae 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -102,6 +102,7 @@ def : Pat<(i64 (AArch64AllocateSMESaveBuffer GPR64:$size)), let hasSideEffects = 1, isMeta = 1 in { def InOutZAUsePseudo : Pseudo<(outs), (ins), []>, Sched<[]>; def RequiresZASavePseudo : Pseudo<(outs), (ins), []>, Sched<[]>; + def RequiresZT0SavePseudo : Pseudo<(outs), (ins), []>, Sched<[]>; } def SMEStateAllocPseudo : Pseudo<(outs), (ins), []>, Sched<[]>; @@ -122,6 +123,11 @@ def AArch64_requires_za_save [SDNPHasChain, SDNPInGlue]>; def : Pat<(AArch64_requires_za_save), (RequiresZASavePseudo)>; +def AArch64_requires_zt0_save + : SDNode<"AArch64ISD::REQUIRES_ZT0_SAVE", SDTypeProfile<0, 0, []>, + [SDNPHasChain, SDNPInGlue]>; +def : Pat<(AArch64_requires_zt0_save), 
(RequiresZT0SavePseudo)>; + def AArch64_sme_state_alloc : SDNode<"AArch64ISD::SME_STATE_ALLOC", SDTypeProfile<0, 0,[]>, [SDNPHasChain]>; diff --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp index bb4dfe8c60904..5ff19c8260be4 100644 --- a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp +++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp @@ -72,16 +72,30 @@ using namespace llvm; namespace { -enum ZAState { +// Note: For agnostic ZA, we assume the function is always entered/exited in the +// "ACTIVE" state -- this _may_ not be the case (since OFF is also a +// possibility, but for the purpose of placing ZA saves/restores, that does not +// matter). +enum ZAState : uint8_t { // Any/unknown state (not valid) ANY = 0, // ZA is in use and active (i.e. within the accumulator) ACTIVE, + // ZA is active, but ZT0 has been saved. + // This handles the edge case of sharedZA && !sharesZT0. + ACTIVE_ZT0_SAVED, + // A ZA save has been set up or committed (i.e. ZA is dormant or off) + // If the function uses ZT0 it must also be saved. LOCAL_SAVED, + // ZA has been committed to the lazy save buffer of the current function. + // If the function uses ZT0 it must also be saved. + // ZA is off when a save has been committed. + LOCAL_COMMITTED, + // The ZA/ZT0 state on entry to the function. ENTRY, @@ -164,6 +178,14 @@ class EmitContext { return AgnosticZABufferPtr; } + int getZT0SaveSlot(MachineFunction &MF) { + if (ZT0SaveFI) + return *ZT0SaveFI; + MachineFrameInfo &MFI = MF.getFrameInfo(); + ZT0SaveFI = MFI.CreateSpillStackObject(64, Align(16)); + return *ZT0SaveFI; + } + /// Returns true if the function must allocate a ZA save buffer on entry. This /// will be the case if, at any point in the function, a ZA save was emitted. 
   bool needsSaveBuffer() const {
@@ -173,6 +195,7 @@
   }
 
 private:
+  std::optional<int> ZT0SaveFI;
   std::optional<int> TPIDR2BlockFI;
   Register AgnosticZABufferPtr = AArch64::NoRegister;
 };
@@ -184,8 +207,10 @@ class EmitContext {
 /// state would not be legal, as transitioning to it drops the content of ZA.
 static bool isLegalEdgeBundleZAState(ZAState State) {
   switch (State) {
-  case ZAState::ACTIVE:      // ZA state within the accumulator/ZT0.
-  case ZAState::LOCAL_SAVED: // ZA state is saved on the stack.
+  case ZAState::ACTIVE:           // ZA state within the accumulator/ZT0.
+  case ZAState::ACTIVE_ZT0_SAVED: // ZT0 is saved (ZA is active).
+  case ZAState::LOCAL_SAVED:      // ZA state may be saved on the stack.
+  case ZAState::LOCAL_COMMITTED:  // ZA state is saved on the stack.
     return true;
   default:
     return false;
@@ -199,7 +224,9 @@ StringRef getZAStateString(ZAState State) {
   switch (State) {
     MAKE_CASE(ZAState::ANY)
     MAKE_CASE(ZAState::ACTIVE)
+    MAKE_CASE(ZAState::ACTIVE_ZT0_SAVED)
     MAKE_CASE(ZAState::LOCAL_SAVED)
+    MAKE_CASE(ZAState::LOCAL_COMMITTED)
     MAKE_CASE(ZAState::ENTRY)
     MAKE_CASE(ZAState::OFF)
   default:
@@ -221,18 +248,34 @@ static bool isZAorZTRegOp(const TargetRegisterInfo &TRI,
 /// Returns the required ZA state needed before \p MI and an iterator pointing
 /// to where any code required to change the ZA state should be inserted.
 static std::pair<ZAState, MachineBasicBlock::iterator>
-getZAStateBeforeInst(const TargetRegisterInfo &TRI, MachineInstr &MI,
-                     bool ZAOffAtReturn) {
+getInstNeededZAState(const TargetRegisterInfo &TRI, MachineInstr &MI,
+                     SMEAttrs SMEFnAttrs) {
   MachineBasicBlock::iterator InsertPt(MI);
 
   if (MI.getOpcode() == AArch64::InOutZAUsePseudo)
     return {ZAState::ACTIVE, std::prev(InsertPt)};
 
+  // Note: If we need to save both ZA and ZT0 we use RequiresZASavePseudo.
   if (MI.getOpcode() == AArch64::RequiresZASavePseudo)
     return {ZAState::LOCAL_SAVED, std::prev(InsertPt)};
 
-  if (MI.isReturn())
+  // If we only need to save ZT0 there are two cases to consider:
+  // 1. 
The function has ZA state (that we don't need to save). + // - In this case we switch to the "ACTIVE_ZT0_SAVED" state. + // This only saves ZT0. + // 2. The function does not have ZA state + // - In this case we switch to "LOCAL_COMMITTED" state. + // This saves ZT0 and turns ZA off. + if (MI.getOpcode() == AArch64::RequiresZT0SavePseudo) { + return {SMEFnAttrs.hasZAState() ? ZAState::ACTIVE_ZT0_SAVED + : ZAState::LOCAL_COMMITTED, + std::prev(InsertPt)}; + } + + if (MI.isReturn()) { + bool ZAOffAtReturn = SMEFnAttrs.hasPrivateZAInterface(); return {ZAOffAtReturn ? ZAState::OFF : ZAState::ACTIVE, InsertPt}; + } for (auto &MO : MI.operands()) { if (isZAorZTRegOp(TRI, MO)) @@ -280,6 +323,9 @@ struct MachineSMEABI : public MachineFunctionPass { /// predecessors). void propagateDesiredStates(FunctionInfo &FnInfo, bool Forwards = true); + void emitZT0SaveRestore(EmitContext &, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, bool IsSave); + // Emission routines for private and shared ZA functions (using lazy saves). void emitSMEPrologue(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); @@ -290,8 +336,8 @@ struct MachineSMEABI : public MachineFunctionPass { MachineBasicBlock::iterator MBBI); void emitAllocateLazySaveBuffer(EmitContext &, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); - void emitZAOff(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - bool ClearTPIDR2); + void emitZAMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + bool ClearTPIDR2, bool On); // Emission routines for agnostic ZA functions. 
void emitSetupFullZASave(MachineBasicBlock &MBB, @@ -398,7 +444,7 @@ FunctionInfo MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) { Block.FixedEntryState = ZAState::ENTRY; } else if (MBB.isEHPad()) { // EH entry block: - Block.FixedEntryState = ZAState::LOCAL_SAVED; + Block.FixedEntryState = ZAState::LOCAL_COMMITTED; } LiveRegUnits LiveUnits(*TRI); @@ -420,8 +466,7 @@ FunctionInfo MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) { PhysLiveRegsAfterSMEPrologue = PhysLiveRegs; } // Note: We treat Agnostic ZA as inout_za with an alternate save/restore. - auto [NeededState, InsertPt] = getZAStateBeforeInst( - *TRI, MI, /*ZAOffAtReturn=*/SMEFnAttrs.hasPrivateZAInterface()); + auto [NeededState, InsertPt] = getInstNeededZAState(*TRI, MI, SMEFnAttrs); assert((InsertPt == MBBI || InsertPt->getOpcode() == AArch64::ADJCALLSTACKDOWN) && "Unexpected state change insertion point!"); @@ -742,9 +787,9 @@ void MachineSMEABI::emitRestoreLazySave(EmitContext &Context, restorePhyRegSave(RegSave, MBB, MBBI, DL); } -void MachineSMEABI::emitZAOff(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - bool ClearTPIDR2) { +void MachineSMEABI::emitZAMode(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + bool ClearTPIDR2, bool On) { DebugLoc DL = getDebugLoc(MBB, MBBI); if (ClearTPIDR2) @@ -755,7 +800,7 @@ void MachineSMEABI::emitZAOff(MachineBasicBlock &MBB, // Disable ZA. BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSRpstatesvcrImm1)) .addImm(AArch64SVCR::SVCRZA) - .addImm(0); + .addImm(On ? 
1 : 0); } void MachineSMEABI::emitAllocateLazySaveBuffer( @@ -884,6 +929,28 @@ void MachineSMEABI::emitFullZASaveRestore(EmitContext &Context, restorePhyRegSave(RegSave, MBB, MBBI, DL); } +void MachineSMEABI::emitZT0SaveRestore(EmitContext &Context, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + bool IsSave) { + DebugLoc DL = getDebugLoc(MBB, MBBI); + Register ZT0Save = MRI->createVirtualRegister(&AArch64::GPR64spRegClass); + + BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDXri), ZT0Save) + .addFrameIndex(Context.getZT0SaveSlot(*MF)) + .addImm(0) + .addImm(0); + + if (IsSave) { + BuildMI(MBB, MBBI, DL, TII->get(AArch64::STR_TX)) + .addReg(AArch64::ZT0) + .addReg(ZT0Save); + } else { + BuildMI(MBB, MBBI, DL, TII->get(AArch64::LDR_TX), AArch64::ZT0) + .addReg(ZT0Save); + } +} + void MachineSMEABI::emitAllocateFullZASaveBuffer( EmitContext &Context, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, LiveRegs PhysLiveRegs) { @@ -928,6 +995,17 @@ void MachineSMEABI::emitAllocateFullZASaveBuffer( restorePhyRegSave(RegSave, MBB, MBBI, DL); } +struct FromState { + ZAState From; + + constexpr uint8_t to(ZAState To) const { + static_assert(NUM_ZA_STATE < 16, "expected ZAState to fit in 4-bits"); + return uint8_t(From) << 4 | uint8_t(To); + } +}; + +constexpr FromState transitionFrom(ZAState From) { return FromState{From}; } + void MachineSMEABI::emitStateChange(EmitContext &Context, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, @@ -959,17 +1037,63 @@ void MachineSMEABI::emitStateChange(EmitContext &Context, From = ZAState::ACTIVE; } - if (From == ZAState::ACTIVE && To == ZAState::LOCAL_SAVED) - emitZASave(Context, MBB, InsertPt, PhysLiveRegs); - else if (From == ZAState::LOCAL_SAVED && To == ZAState::ACTIVE) - emitZARestore(Context, MBB, InsertPt, PhysLiveRegs); - else if (To == ZAState::OFF) { - assert(From != ZAState::ENTRY && - "ENTRY to OFF should have already been handled"); - assert(!SMEFnAttrs.hasAgnosticZAInterface() && - "Should 
not turn ZA off in agnostic ZA function");
-    emitZAOff(MBB, InsertPt, /*ClearTPIDR2=*/From == ZAState::LOCAL_SAVED);
-  } else {
+  bool IsAgnosticZA = SMEFnAttrs.hasAgnosticZAInterface();
+  bool HasZT0State = SMEFnAttrs.hasZT0State();
+  bool HasZAState = IsAgnosticZA || SMEFnAttrs.hasZAState();
+
+  switch (transitionFrom(From).to(To)) {
+  // This section handles: ACTIVE <-> ACTIVE_ZT0_SAVED
+  case transitionFrom(ZAState::ACTIVE).to(ZAState::ACTIVE_ZT0_SAVED):
+    emitZT0SaveRestore(Context, MBB, InsertPt, /*IsSave=*/true);
+    break;
+  case transitionFrom(ZAState::ACTIVE_ZT0_SAVED).to(ZAState::ACTIVE):
+    emitZT0SaveRestore(Context, MBB, InsertPt, /*IsSave=*/false);
+    break;
+
+  // This section handles: ACTIVE -> LOCAL_SAVED
+  case transitionFrom(ZAState::ACTIVE).to(ZAState::LOCAL_SAVED):
+    if (HasZT0State)
+      emitZT0SaveRestore(Context, MBB, InsertPt, /*IsSave=*/true);
+    if (HasZAState)
+      emitZASave(Context, MBB, InsertPt, PhysLiveRegs);
+    break;
+
+  // This section handles: ACTIVE -> LOCAL_COMMITTED
+  case transitionFrom(ZAState::ACTIVE).to(ZAState::LOCAL_COMMITTED):
+    // Note: We could support ZA state here, but this transition is currently
+    // only possible when we _don't_ have ZA state.
+    assert(HasZT0State && !HasZAState && "Expect to only have ZT0 state.");
+    emitZT0SaveRestore(Context, MBB, InsertPt, /*IsSave=*/true);
+    emitZAMode(MBB, InsertPt, /*ClearTPIDR2=*/false, /*On=*/false);
+    break;
+
+  // This section handles: LOCAL_COMMITTED -> (OFF|LOCAL_SAVED)
+  case transitionFrom(ZAState::LOCAL_COMMITTED).to(ZAState::OFF):
+  case transitionFrom(ZAState::LOCAL_COMMITTED).to(ZAState::LOCAL_SAVED):
+    // These transitions are a no-op. 
+ break; + + // This section handles: LOCAL_(SAVED|COMMITTED) -> ACTIVE[_ZT0_SAVED] + case transitionFrom(ZAState::LOCAL_COMMITTED).to(ZAState::ACTIVE): + case transitionFrom(ZAState::LOCAL_COMMITTED).to(ZAState::ACTIVE_ZT0_SAVED): + case transitionFrom(ZAState::LOCAL_SAVED).to(ZAState::ACTIVE): + if (HasZAState) + emitZARestore(Context, MBB, InsertPt, PhysLiveRegs); + else + emitZAMode(MBB, InsertPt, /*ClearTPIDR2=*/false, /*On=*/true); + if (HasZT0State && To == ZAState::ACTIVE) + emitZT0SaveRestore(Context, MBB, InsertPt, /*IsSave=*/false); + break; + default: + if (To == ZAState::OFF) { + assert(From != ZAState::ENTRY && + "ENTRY to OFF should have already been handled"); + assert(SMEFnAttrs.hasPrivateZAInterface() && + "Did not expect to turn ZA off in shared/agnostic ZA function"); + emitZAMode(MBB, InsertPt, /*ClearTPIDR2=*/From == ZAState::LOCAL_SAVED, + /*On=*/false); + break; + } dbgs() << "Error: Transition from " << getZAStateString(From) << " to " << getZAStateString(To) << '\n'; llvm_unreachable("Unimplemented state transition"); diff --git a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll index ced0d41c22dab..f4a3b55e49cd7 100644 --- a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll +++ b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll @@ -230,10 +230,6 @@ define void @test7() nounwind "aarch64_inout_zt0" { ; CHECK-NEXT: str zt0, [x19] ; CHECK-NEXT: smstop za ; CHECK-NEXT: bl callee -; CHECK-NEXT: smstart za -; CHECK-NEXT: ldr zt0, [x19] -; CHECK-NEXT: str zt0, [x19] -; CHECK-NEXT: smstop za ; CHECK-NEXT: bl callee ; CHECK-NEXT: smstart za ; CHECK-NEXT: ldr zt0, [x19] diff --git a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll index dcdc56c669077..f219b1169af01 100644 --- a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll +++ b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll @@ -511,7 +511,6 @@ exit: ; ; This code may require reloading ZT0 in the cleanup for 
~ZT0Resource(). ; -; FIXME: Codegen with `-aarch64-new-sme-abi` is broken with ZT0 (as it is not implemented). define void @try_catch_shared_zt0_callee() "aarch64_inout_zt0" personality ptr @__gxx_personality_v0 { ; CHECK-LABEL: try_catch_shared_zt0_callee: ; CHECK: .Lfunc_begin3: @@ -519,52 +518,37 @@ define void @try_catch_shared_zt0_callee() "aarch64_inout_zt0" personality ptr @ ; CHECK-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0 ; CHECK-NEXT: .cfi_lsda 28, .Lexception3 ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: sub sp, sp, #80 -; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 96 ; CHECK-NEXT: .cfi_offset w19, -8 ; CHECK-NEXT: .cfi_offset w20, -16 -; CHECK-NEXT: .cfi_offset w30, -24 -; CHECK-NEXT: .cfi_offset w29, -32 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x9, x8, x8, x9 -; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: stp x9, x8, [x29, #-80] +; CHECK-NEXT: .cfi_offset w30, -32 ; CHECK-NEXT: .Ltmp9: // EH_LABEL -; CHECK-NEXT: sub x19, x29, #64 +; CHECK-NEXT: mov x19, sp ; CHECK-NEXT: str zt0, [x19] ; CHECK-NEXT: smstop za ; CHECK-NEXT: bl may_throw +; CHECK-NEXT: .Ltmp10: // EH_LABEL ; CHECK-NEXT: smstart za ; CHECK-NEXT: ldr zt0, [x19] -; CHECK-NEXT: .Ltmp10: // EH_LABEL ; CHECK-NEXT: // %bb.1: // %return_normally -; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB3_2: // %unwind_dtors ; CHECK-NEXT: .Ltmp11: // 
EH_LABEL -; CHECK-NEXT: sub x20, x29, #64 +; CHECK-NEXT: mov x20, sp ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: smstart za -; CHECK-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEXT: sub x0, x29, #80 -; CHECK-NEXT: cbnz x8, .LBB3_4 -; CHECK-NEXT: // %bb.3: // %unwind_dtors -; CHECK-NEXT: bl __arm_tpidr2_restore -; CHECK-NEXT: .LBB3_4: // %unwind_dtors -; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: ldr zt0, [x20] ; CHECK-NEXT: bl shared_zt0_call ; CHECK-NEXT: str zt0, [x20] ; CHECK-NEXT: smstop za ; CHECK-NEXT: mov x0, x19 ; CHECK-NEXT: bl _Unwind_Resume -; CHECK-NEXT: smstart za -; CHECK-NEXT: ldr zt0, [x20] ; ; CHECK-SDAG-LABEL: try_catch_shared_zt0_callee: ; CHECK-SDAG: .Lfunc_begin3: @@ -965,6 +949,90 @@ exit: ret void } +define void @try_catch_inout_zt0() "aarch64_inout_zt0" personality ptr @__gxx_personality_v0 { +; CHECK-LABEL: try_catch_inout_zt0: +; CHECK: .Lfunc_begin7: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0 +; CHECK-NEXT: .cfi_lsda 28, .Lexception7 +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #80 +; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 80 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: .Ltmp21: // EH_LABEL +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: str zt0, [x19] +; CHECK-NEXT: smstop za +; CHECK-NEXT: bl may_throw +; CHECK-NEXT: .Ltmp22: // EH_LABEL +; CHECK-NEXT: .LBB7_1: // %exit +; CHECK-NEXT: smstart za +; CHECK-NEXT: ldr zt0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #80 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB7_2: // %catch +; CHECK-NEXT: .Ltmp23: // EH_LABEL +; CHECK-NEXT: bl __cxa_begin_catch +; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: b .LBB7_1 +; +; CHECK-SDAG-LABEL: try_catch_inout_zt0: +; CHECK-SDAG: .Lfunc_begin7: +; CHECK-SDAG-NEXT: .cfi_startproc +; CHECK-SDAG-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0 +; 
CHECK-SDAG-NEXT: .cfi_lsda 28, .Lexception7 +; CHECK-SDAG-NEXT: // %bb.0: // %entry +; CHECK-SDAG-NEXT: sub sp, sp, #80 +; CHECK-SDAG-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-SDAG-NEXT: .cfi_def_cfa_offset 80 +; CHECK-SDAG-NEXT: .cfi_offset w19, -8 +; CHECK-SDAG-NEXT: .cfi_offset w30, -16 +; CHECK-SDAG-NEXT: .Ltmp21: // EH_LABEL +; CHECK-SDAG-NEXT: mov x19, sp +; CHECK-SDAG-NEXT: str zt0, [x19] +; CHECK-SDAG-NEXT: smstop za +; CHECK-SDAG-NEXT: bl may_throw +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: ldr zt0, [x19] +; CHECK-SDAG-NEXT: .Ltmp22: // EH_LABEL +; CHECK-SDAG-NEXT: .LBB7_1: // %exit +; CHECK-SDAG-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-SDAG-NEXT: add sp, sp, #80 +; CHECK-SDAG-NEXT: ret +; CHECK-SDAG-NEXT: .LBB7_2: // %catch +; CHECK-SDAG-NEXT: .Ltmp23: // EH_LABEL +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: ldr zt0, [x19] +; CHECK-SDAG-NEXT: str zt0, [x19] +; CHECK-SDAG-NEXT: smstop za +; CHECK-SDAG-NEXT: bl __cxa_begin_catch +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: ldr zt0, [x19] +; CHECK-SDAG-NEXT: str zt0, [x19] +; CHECK-SDAG-NEXT: smstop za +; CHECK-SDAG-NEXT: bl __cxa_end_catch +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: ldr zt0, [x19] +; CHECK-SDAG-NEXT: b .LBB7_1 +entry: + invoke void @may_throw() + to label %exit unwind label %catch + +catch: + %eh_info = landingpad { ptr, i32 } + catch ptr null + %exception_ptr = extractvalue { ptr, i32 } %eh_info, 0 + tail call ptr @__cxa_begin_catch(ptr %exception_ptr) + tail call void @__cxa_end_catch() + br label %exit + +exit: + ret void +} + declare ptr @__cxa_allocate_exception(i64) declare void @__cxa_throw(ptr, ptr, ptr) declare ptr @__cxa_begin_catch(ptr) diff --git a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll index 4c48e41294a3a..e8f4f6ed78b9c 100644 --- a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll +++ b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll @@ -193,7 +193,7 @@ define void 
@zt0_new_caller_zt0_new_callee(ptr %callee) "aarch64_new_zt0" nounwi ; CHECK-NEWLOWERING-LABEL: zt0_new_caller_zt0_new_callee: ; CHECK-NEWLOWERING: // %bb.0: ; CHECK-NEWLOWERING-NEXT: sub sp, sp, #80 -; CHECK-NEWLOWERING-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: str x30, [sp, #64] // 8-byte Folded Spill ; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 ; CHECK-NEWLOWERING-NEXT: cbz x8, .LBB6_2 ; CHECK-NEWLOWERING-NEXT: // %bb.1: @@ -202,14 +202,11 @@ define void @zt0_new_caller_zt0_new_callee(ptr %callee) "aarch64_new_zt0" nounwi ; CHECK-NEWLOWERING-NEXT: zero { zt0 } ; CHECK-NEWLOWERING-NEXT: .LBB6_2: ; CHECK-NEWLOWERING-NEXT: smstart za -; CHECK-NEWLOWERING-NEXT: mov x19, sp -; CHECK-NEWLOWERING-NEXT: str zt0, [x19] +; CHECK-NEWLOWERING-NEXT: mov x8, sp +; CHECK-NEWLOWERING-NEXT: str zt0, [x8] ; CHECK-NEWLOWERING-NEXT: smstop za ; CHECK-NEWLOWERING-NEXT: blr x0 -; CHECK-NEWLOWERING-NEXT: smstart za -; CHECK-NEWLOWERING-NEXT: ldr zt0, [x19] -; CHECK-NEWLOWERING-NEXT: smstop za -; CHECK-NEWLOWERING-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload ; CHECK-NEWLOWERING-NEXT: add sp, sp, #80 ; CHECK-NEWLOWERING-NEXT: ret call void %callee() "aarch64_new_zt0"; @@ -246,7 +243,7 @@ define i64 @zt0_new_caller_abi_routine_callee() "aarch64_new_zt0" nounwind { ; CHECK-NEWLOWERING-LABEL: zt0_new_caller_abi_routine_callee: ; CHECK-NEWLOWERING: // %bb.0: ; CHECK-NEWLOWERING-NEXT: sub sp, sp, #80 -; CHECK-NEWLOWERING-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: str x30, [sp, #64] // 8-byte Folded Spill ; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 ; CHECK-NEWLOWERING-NEXT: cbz x8, .LBB7_2 ; CHECK-NEWLOWERING-NEXT: // %bb.1: @@ -255,12 +252,11 @@ define i64 @zt0_new_caller_abi_routine_callee() "aarch64_new_zt0" nounwind { ; CHECK-NEWLOWERING-NEXT: zero { zt0 } ; CHECK-NEWLOWERING-NEXT: .LBB7_2: ; CHECK-NEWLOWERING-NEXT: smstart za -; 
CHECK-NEWLOWERING-NEXT: mov x19, sp -; CHECK-NEWLOWERING-NEXT: str zt0, [x19] -; CHECK-NEWLOWERING-NEXT: bl __arm_sme_state -; CHECK-NEWLOWERING-NEXT: ldr zt0, [x19] +; CHECK-NEWLOWERING-NEXT: mov x8, sp +; CHECK-NEWLOWERING-NEXT: str zt0, [x8] ; CHECK-NEWLOWERING-NEXT: smstop za -; CHECK-NEWLOWERING-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: bl __arm_sme_state +; CHECK-NEWLOWERING-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload ; CHECK-NEWLOWERING-NEXT: add sp, sp, #80 ; CHECK-NEWLOWERING-NEXT: ret %res = call {i64, i64} @__arm_sme_state() @@ -382,37 +378,57 @@ define void @shared_za_new_zt0(ptr %callee) "aarch64_inout_za" "aarch64_new_zt0" define void @zt0_multiple_private_za_calls(ptr %callee) "aarch64_in_zt0" nounwind { -; CHECK-COMMON-LABEL: zt0_multiple_private_za_calls: -; CHECK-COMMON: // %bb.0: -; CHECK-COMMON-NEXT: sub sp, sp, #96 -; CHECK-COMMON-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill -; CHECK-COMMON-NEXT: mov x20, sp -; CHECK-COMMON-NEXT: mov x19, x0 -; CHECK-COMMON-NEXT: str x30, [sp, #64] // 8-byte Folded Spill -; CHECK-COMMON-NEXT: str zt0, [x20] -; CHECK-COMMON-NEXT: smstop za -; CHECK-COMMON-NEXT: blr x0 -; CHECK-COMMON-NEXT: smstart za -; CHECK-COMMON-NEXT: ldr zt0, [x20] -; CHECK-COMMON-NEXT: str zt0, [x20] -; CHECK-COMMON-NEXT: smstop za -; CHECK-COMMON-NEXT: blr x19 -; CHECK-COMMON-NEXT: smstart za -; CHECK-COMMON-NEXT: ldr zt0, [x20] -; CHECK-COMMON-NEXT: str zt0, [x20] -; CHECK-COMMON-NEXT: smstop za -; CHECK-COMMON-NEXT: blr x19 -; CHECK-COMMON-NEXT: smstart za -; CHECK-COMMON-NEXT: ldr zt0, [x20] -; CHECK-COMMON-NEXT: str zt0, [x20] -; CHECK-COMMON-NEXT: smstop za -; CHECK-COMMON-NEXT: blr x19 -; CHECK-COMMON-NEXT: smstart za -; CHECK-COMMON-NEXT: ldr zt0, [x20] -; CHECK-COMMON-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-COMMON-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload -; CHECK-COMMON-NEXT: add sp, sp, #96 -; CHECK-COMMON-NEXT: ret +; CHECK-LABEL: 
zt0_multiple_private_za_calls: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x20, sp +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: str zt0, [x20] +; CHECK-NEXT: smstop za +; CHECK-NEXT: blr x0 +; CHECK-NEXT: smstart za +; CHECK-NEXT: ldr zt0, [x20] +; CHECK-NEXT: str zt0, [x20] +; CHECK-NEXT: smstop za +; CHECK-NEXT: blr x19 +; CHECK-NEXT: smstart za +; CHECK-NEXT: ldr zt0, [x20] +; CHECK-NEXT: str zt0, [x20] +; CHECK-NEXT: smstop za +; CHECK-NEXT: blr x19 +; CHECK-NEXT: smstart za +; CHECK-NEXT: ldr zt0, [x20] +; CHECK-NEXT: str zt0, [x20] +; CHECK-NEXT: smstop za +; CHECK-NEXT: blr x19 +; CHECK-NEXT: smstart za +; CHECK-NEXT: ldr zt0, [x20] +; CHECK-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: zt0_multiple_private_za_calls: +; CHECK-NEWLOWERING: // %bb.0: +; CHECK-NEWLOWERING-NEXT: sub sp, sp, #96 +; CHECK-NEWLOWERING-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: mov x20, sp +; CHECK-NEWLOWERING-NEXT: mov x19, x0 +; CHECK-NEWLOWERING-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: str zt0, [x20] +; CHECK-NEWLOWERING-NEXT: smstop za +; CHECK-NEWLOWERING-NEXT: blr x0 +; CHECK-NEWLOWERING-NEXT: blr x19 +; CHECK-NEWLOWERING-NEXT: blr x19 +; CHECK-NEWLOWERING-NEXT: blr x19 +; CHECK-NEWLOWERING-NEXT: smstart za +; CHECK-NEWLOWERING-NEXT: ldr zt0, [x20] +; CHECK-NEWLOWERING-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: add sp, sp, #96 +; CHECK-NEWLOWERING-NEXT: ret call void %callee() call void %callee() call void %callee()