Skip to content

Commit 1bd3cc2

Browse files
authored
[AArch64][SME] Support Windows/stack probes in MachineSMEABIPass (#149063)
On Windows or with stack probes on other targets, additional code needs to be inserted after dynamic stack allocations to validate stack accesses and/or ensure enough stack space has been allocated. Rather than handle this case in the MachineSMEABIPass (like we do for the standard case), we allocate the memory for the lazy save buffer in SelectionDAG, which allows the existing expansions to emit the correct code. Note: This means in these cases, we may allocate a lazy save buffer when there are no lazy saves present in the function (as we have to allocate the buffer before the MachineSMEABIPass runs).
1 parent a20fc93 commit 1bd3cc2

File tree

7 files changed

+124
-18
lines changed

7 files changed

+124
-18
lines changed

llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1688,6 +1688,7 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
16881688
}
16891689
case AArch64::InOutZAUsePseudo:
16901690
case AArch64::RequiresZASavePseudo:
1691+
case AArch64::SMEStateAllocPseudo:
16911692
case AArch64::COALESCER_BARRIER_FPR16:
16921693
case AArch64::COALESCER_BARRIER_FPR32:
16931694
case AArch64::COALESCER_BARRIER_FPR64:

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8489,7 +8489,30 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
84898489
if (Subtarget->hasCustomCallingConv())
84908490
Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
84918491

8492-
if (!getTM().useNewSMEABILowering() || Attrs.hasAgnosticZAInterface()) {
8492+
if (getTM().useNewSMEABILowering() && !Attrs.hasAgnosticZAInterface()) {
8493+
if (Subtarget->isTargetWindows() || hasInlineStackProbe(MF)) {
8494+
SDValue Size;
8495+
if (Attrs.hasZAState()) {
8496+
SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
8497+
DAG.getConstant(1, DL, MVT::i32));
8498+
Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
8499+
}
8500+
if (Size) {
8501+
SDValue Buffer = DAG.getNode(
8502+
ISD::DYNAMIC_STACKALLOC, DL, DAG.getVTList(MVT::i64, MVT::Other),
8503+
{Chain, Size, DAG.getConstant(1, DL, MVT::i64)});
8504+
Chain = Buffer.getValue(1);
8505+
8506+
Register BufferPtr =
8507+
MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
8508+
Chain = DAG.getCopyToReg(Chain, DL, BufferPtr, Buffer);
8509+
Chain = DAG.getNode(AArch64ISD::SME_STATE_ALLOC, DL,
8510+
DAG.getVTList(MVT::Other), Chain);
8511+
FuncInfo->setEarlyAllocSMESaveBuffer(BufferPtr);
8512+
MFI.CreateVariableSizedObject(Align(16), nullptr);
8513+
}
8514+
}
8515+
} else {
84938516
// Old SME ABI lowering (deprecated):
84948517
// Create a 16 Byte TPIDR2 object. The dynamic buffer
84958518
// will be expanded and stored in the static object later using a

llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -235,6 +235,10 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
235235
// Holds the SME function attributes (streaming mode, ZA/ZT0 state).
236236
SMEAttrs SMEFnAttrs;
237237

238+
// Holds the register containing the address of the SME save buffer if it was
239+
// allocated early, in SelectionDAG (for Windows/stack probes support).
240+
Register EarlyAllocSMESaveBuffer = AArch64::NoRegister;
241+
238242
// Note: The following properties are only used for the old SME ABI lowering:
239243
/// The frame-index for the TPIDR2 object used for lazy saves.
240244
TPIDR2Object TPIDR2;
@@ -253,6 +257,12 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
253257
const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB)
254258
const override;
255259

260+
/// Records the register that holds the address of the SME save buffer when
/// the buffer has been allocated early (during SelectionDAG lowering) instead
/// of by the MachineSMEABIPass.
void setEarlyAllocSMESaveBuffer(Register Ptr) {
261+
EarlyAllocSMESaveBuffer = Ptr;
262+
}
263+
264+
/// Returns the register holding the address of the early-allocated SME save
/// buffer, or AArch64::NoRegister if no buffer was allocated early.
/// Const-qualified (like getSMESaveBufferAddr) so it can also be queried
/// through a const AArch64FunctionInfo.
Register getEarlyAllocSMESaveBuffer() const { return EarlyAllocSMESaveBuffer; }
265+
256266
// Old SME ABI lowering state getters/setters:
257267
Register getSMESaveBufferAddr() const { return SMESaveBufferAddr; };
258268
void setSMESaveBufferAddr(Register Reg) { SMESaveBufferAddr = Reg; };

llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,8 @@ let hasSideEffects = 1, isMeta = 1 in {
9393
def RequiresZASavePseudo : Pseudo<(outs), (ins), []>, Sched<[]>;
9494
}
9595

96+
def SMEStateAllocPseudo : Pseudo<(outs), (ins), []>, Sched<[]>;
97+
9698
def CommitZASavePseudo
9799
: Pseudo<(outs),
98100
(ins GPR64:$tpidr2_el0, i1imm:$zero_za, i64imm:$commit_routine, variable_ops), []>,
@@ -108,6 +110,11 @@ def AArch64_requires_za_save
108110
[SDNPHasChain, SDNPInGlue]>;
109111
def : Pat<(AArch64_requires_za_save), (RequiresZASavePseudo)>;
110112

113+
def AArch64_sme_state_alloc
114+
: SDNode<"AArch64ISD::SME_STATE_ALLOC", SDTypeProfile<0, 0,[]>,
115+
[SDNPHasChain]>;
116+
def : Pat<(AArch64_sme_state_alloc), (SMEStateAllocPseudo)>;
117+
111118
//===----------------------------------------------------------------------===//
112119
// Instruction naming conventions.
113120
//===----------------------------------------------------------------------===//

llvm/lib/Target/AArch64/MachineSMEABIPass.cpp

Lines changed: 26 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -249,6 +249,7 @@ struct MachineSMEABI : public MachineFunctionPass {
249249
SmallVector<BlockInfo> Blocks;
250250
SmallVector<ZAState> BundleStates;
251251
std::optional<TPIDR2State> TPIDR2Block;
252+
std::optional<MachineBasicBlock::iterator> AfterSMEProloguePt;
252253
} State;
253254

254255
MachineFunction *MF = nullptr;
@@ -298,6 +299,12 @@ void MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) {
298299
MachineBasicBlock::iterator MBBI(MI);
299300
LiveUnits.stepBackward(MI);
300301
LiveRegs PhysLiveRegs = GetPhysLiveRegs();
302+
// The SMEStateAllocPseudo marker is added to a function if the save
303+
// buffer was allocated in SelectionDAG. It marks the end of the
304+
// allocation -- which is a safe point for this pass to insert any TPIDR2
305+
// block setup.
306+
if (MI.getOpcode() == AArch64::SMEStateAllocPseudo)
307+
State.AfterSMEProloguePt = MBBI;
301308
auto [NeededState, InsertPt] = getZAStateBeforeInst(
302309
*TRI, MI, /*ZAOffAtReturn=*/SMEFnAttrs.hasPrivateZAInterface());
303310
assert((InsertPt == MBBI ||
@@ -529,23 +536,27 @@ void MachineSMEABI::emitZAOff(MachineBasicBlock &MBB,
529536
void MachineSMEABI::emitAllocateLazySaveBuffer(
530537
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) {
531538
MachineFrameInfo &MFI = MF->getFrameInfo();
539+
auto *AFI = MF->getInfo<AArch64FunctionInfo>();
532540

533541
DebugLoc DL = getDebugLoc(MBB, MBBI);
534542
Register SP = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
535543
Register SVL = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
536-
Register Buffer = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
544+
Register Buffer = AFI->getEarlyAllocSMESaveBuffer();
537545

538546
// Calculate SVL.
539547
BuildMI(MBB, MBBI, DL, TII->get(AArch64::RDSVLI_XI), SVL).addImm(1);
540548

541549
// 1. Allocate the lazy save buffer.
542-
{
543-
// TODO This function grows the stack with a subtraction, which doesn't work
544-
// on Windows. Some refactoring to share the functionality in
545-
// LowerWindowsDYNAMIC_STACKALLOC will be required once the Windows ABI
546-
// supports SME
550+
if (Buffer == AArch64::NoRegister) {
551+
// TODO: On Windows, we allocate the lazy save buffer in SelectionDAG (so
552+
// Buffer != AArch64::NoRegister). This is done to reuse the existing
553+
// expansions (which can insert stack checks). This works, but it means we
554+
// will always allocate the lazy save buffer (even if the function contains
555+
// no lazy saves). If we want to handle Windows here, we'll need to
556+
// implement something similar to LowerWindowsDYNAMIC_STACKALLOC.
547557
assert(!Subtarget->isTargetWindows() &&
548558
"Lazy ZA save is not yet supported on Windows");
559+
Buffer = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
549560
// Get original stack pointer.
550561
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), SP)
551562
.addReg(AArch64::SP);
@@ -686,8 +697,15 @@ bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) {
686697

687698
// Allocate save buffer (if needed).
688699
if (State.TPIDR2Block) {
689-
MachineBasicBlock &EntryBlock = MF.front();
690-
emitAllocateLazySaveBuffer(EntryBlock, EntryBlock.getFirstNonPHI());
700+
if (State.AfterSMEProloguePt) {
701+
// Note: With inline stack probes the AfterSMEProloguePt may not be in the
702+
// entry block (due to the probing loop).
703+
emitAllocateLazySaveBuffer(*(*State.AfterSMEProloguePt)->getParent(),
704+
*State.AfterSMEProloguePt);
705+
} else {
706+
MachineBasicBlock &EntryBlock = MF.front();
707+
emitAllocateLazySaveBuffer(EntryBlock, EntryBlock.getFirstNonPHI());
708+
}
691709
}
692710

693711
return true;
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -mtriple=aarch64-windows-msvc -aarch64-streaming-hazard-size=0 -mattr=+sve,+sme < %s | FileCheck %s
3+
; RUN: llc -mtriple=aarch64-windows-msvc -aarch64-streaming-hazard-size=0 -mattr=+sve,+sme -aarch64-new-sme-abi < %s | FileCheck %s
4+
5+
declare void @private_za_callee()
6+
declare void @shared_za_callee() "aarch64_inout_za"
7+
8+
define void @test_lazy_save() nounwind "aarch64_inout_za" {
9+
; CHECK-LABEL: test_lazy_save:
10+
; CHECK: // %bb.0:
11+
; CHECK-NEXT: stp x30, x29, [sp, #-32]! // 16-byte Folded Spill
12+
; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
13+
; CHECK-NEXT: mov x29, sp
14+
; CHECK-NEXT: sub sp, sp, #16
15+
; CHECK-NEXT: rdsvl x8, #1
16+
; CHECK-NEXT: mul x9, x8, x8
17+
; CHECK-NEXT: lsr x15, x9, #4
18+
; CHECK-NEXT: bl __chkstk
19+
; CHECK-NEXT: sub x9, sp, x15, lsl #4
20+
; CHECK-NEXT: mov sp, x9
21+
; CHECK-NEXT: sub x10, x29, #16
22+
; CHECK-NEXT: stp x9, x8, [x29, #-16]
23+
; CHECK-NEXT: msr TPIDR2_EL0, x10
24+
; CHECK-NEXT: bl private_za_callee
25+
; CHECK-NEXT: smstart za
26+
; CHECK-NEXT: mrs x8, TPIDR2_EL0
27+
; CHECK-NEXT: sub x0, x29, #16
28+
; CHECK-NEXT: cbnz x8, .LBB0_2
29+
; CHECK-NEXT: // %bb.1:
30+
; CHECK-NEXT: bl __arm_tpidr2_restore
31+
; CHECK-NEXT: .LBB0_2:
32+
; CHECK-NEXT: msr TPIDR2_EL0, xzr
33+
; CHECK-NEXT: mov sp, x29
34+
; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
35+
; CHECK-NEXT: ldp x30, x29, [sp], #32 // 16-byte Folded Reload
36+
; CHECK-NEXT: ret
37+
call void @private_za_callee()
38+
ret void
39+
}

llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,6 @@ exit:
9999
ret float %ret
100100
}
101101

102-
; FIXME: This is missing stack probes with -aarch64-new-sme-abi.
103102
define float @multi_bb_stpidr2_save_required_stackprobe(i32 %a, float %b, float %c) "aarch64_inout_za" "probe-stack"="inline-asm" "stack-probe-size"="65536" {
104103
; CHECK-LABEL: multi_bb_stpidr2_save_required_stackprobe:
105104
; CHECK: // %bb.0:
@@ -157,26 +156,35 @@ define float @multi_bb_stpidr2_save_required_stackprobe(i32 %a, float %b, float
157156
; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1
158157
; CHECK-NEWLOWERING-NEXT: mov x9, sp
159158
; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9
159+
; CHECK-NEWLOWERING-NEXT: .LBB2_1: // =>This Inner Loop Header: Depth=1
160+
; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16, lsl #12 // =65536
161+
; CHECK-NEWLOWERING-NEXT: cmp sp, x9
162+
; CHECK-NEWLOWERING-NEXT: b.le .LBB2_3
163+
; CHECK-NEWLOWERING-NEXT: // %bb.2: // in Loop: Header=BB2_1 Depth=1
164+
; CHECK-NEWLOWERING-NEXT: str xzr, [sp]
165+
; CHECK-NEWLOWERING-NEXT: b .LBB2_1
166+
; CHECK-NEWLOWERING-NEXT: .LBB2_3:
160167
; CHECK-NEWLOWERING-NEXT: mov sp, x9
168+
; CHECK-NEWLOWERING-NEXT: ldr xzr, [sp]
161169
; CHECK-NEWLOWERING-NEXT: sub x10, x29, #16
162170
; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16]
163171
; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x10
164-
; CHECK-NEWLOWERING-NEXT: cbz w0, .LBB2_2
165-
; CHECK-NEWLOWERING-NEXT: // %bb.1: // %use_b
172+
; CHECK-NEWLOWERING-NEXT: cbz w0, .LBB2_5
173+
; CHECK-NEWLOWERING-NEXT: // %bb.4: // %use_b
166174
; CHECK-NEWLOWERING-NEXT: fmov s1, #4.00000000
167175
; CHECK-NEWLOWERING-NEXT: fadd s0, s0, s1
168-
; CHECK-NEWLOWERING-NEXT: b .LBB2_3
169-
; CHECK-NEWLOWERING-NEXT: .LBB2_2: // %use_c
176+
; CHECK-NEWLOWERING-NEXT: b .LBB2_6
177+
; CHECK-NEWLOWERING-NEXT: .LBB2_5: // %use_c
170178
; CHECK-NEWLOWERING-NEXT: fmov s0, s1
171179
; CHECK-NEWLOWERING-NEXT: bl cosf
172-
; CHECK-NEWLOWERING-NEXT: .LBB2_3: // %exit
180+
; CHECK-NEWLOWERING-NEXT: .LBB2_6: // %exit
173181
; CHECK-NEWLOWERING-NEXT: smstart za
174182
; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0
175183
; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16
176-
; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB2_5
177-
; CHECK-NEWLOWERING-NEXT: // %bb.4: // %exit
184+
; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB2_8
185+
; CHECK-NEWLOWERING-NEXT: // %bb.7: // %exit
178186
; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore
179-
; CHECK-NEWLOWERING-NEXT: .LBB2_5: // %exit
187+
; CHECK-NEWLOWERING-NEXT: .LBB2_8: // %exit
180188
; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
181189
; CHECK-NEWLOWERING-NEXT: mov sp, x29
182190
; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload

0 commit comments

Comments
 (0)