-
Notifications
You must be signed in to change notification settings - Fork 12k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AArch64][SME] Save VG for unwind info when changing streaming-mode #83301
[AArch64][SME] Save VG for unwind info when changing streaming-mode #83301
Conversation
@llvm/pr-subscribers-clang @llvm/pr-subscribers-backend-aarch64 Author: Kerry McLaughlin (kmclaughlin-arm) Changes: If a function requires any streaming-mode change, the vector granule (VG) value must be stored to the stack so that unwind info can describe its save location. This patch adds VG to the list of callee-saved registers and increases the callee-saved stack size if the function has streaming-mode changes. See https://github.com/ARM-software/abi-aa/blob/main/aadwarf64/aadwarf64.rst Patch is 166.12 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/83301.diff 11 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 5cc612e89162af..68564bc2ea7bf5 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -321,7 +321,7 @@ bool AArch64FrameLowering::homogeneousPrologEpilog(
return false;
auto *AFI = MF.getInfo<AArch64FunctionInfo>();
- if (AFI->hasSwiftAsyncContext())
+ if (AFI->hasSwiftAsyncContext() || AFI->hasStreamingModeChanges())
return false;
// If there are an odd number of GPRs before LR and FP in the CSRs list,
@@ -691,6 +691,9 @@ static void emitCalleeSavedRestores(MachineBasicBlock &MBB,
!static_cast<const AArch64RegisterInfo &>(TRI).regNeedsCFI(Reg, Reg))
continue;
+ if (!Info.isRestored())
+ continue;
+
unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createRestore(
nullptr, TRI.getDwarfRegNum(Info.getReg(), true)));
BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
@@ -1344,6 +1347,7 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
MachineInstr::MIFlag FrameFlag = MachineInstr::FrameSetup,
int CFAOffset = 0) {
unsigned NewOpc;
+
switch (MBBI->getOpcode()) {
default:
llvm_unreachable("Unexpected callee-save save/restore opcode!");
@@ -1651,6 +1655,13 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
LiveRegs.removeReg(AArch64::LR);
}
+ // If the function contains streaming mode changes, we expect the first
+ // instruction of MBB to be a CNTD. Move past this instruction if found.
+ if (AFI->hasStreamingModeChanges()) {
+ assert(MBBI->getOpcode() == AArch64::CNTD_XPiI && "Unexpected instruction");
+ MBBI = std::next(MBBI);
+ }
+
auto VerifyClobberOnExit = make_scope_exit([&]() {
if (NonFrameStart == MBB.end())
return;
@@ -2756,7 +2767,7 @@ struct RegPairInfo {
unsigned Reg2 = AArch64::NoRegister;
int FrameIdx;
int Offset;
- enum RegType { GPR, FPR64, FPR128, PPR, ZPR } Type;
+ enum RegType { GPR, FPR64, FPR128, PPR, ZPR, VG } Type;
RegPairInfo() = default;
@@ -2768,6 +2779,7 @@ struct RegPairInfo {
return 2;
case GPR:
case FPR64:
+ case VG:
return 8;
case ZPR:
case FPR128:
@@ -2833,6 +2845,8 @@ static void computeCalleeSaveRegisterPairs(
RPI.Type = RegPairInfo::ZPR;
else if (AArch64::PPRRegClass.contains(RPI.Reg1))
RPI.Type = RegPairInfo::PPR;
+ else if (RPI.Reg1 == AArch64::VG)
+ RPI.Type = RegPairInfo::VG;
else
llvm_unreachable("Unsupported register class.");
@@ -2860,6 +2874,7 @@ static void computeCalleeSaveRegisterPairs(
break;
case RegPairInfo::PPR:
case RegPairInfo::ZPR:
+ case RegPairInfo::VG:
break;
}
}
@@ -3047,7 +3062,23 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
Size = 2;
Alignment = Align(2);
break;
+ case RegPairInfo::VG:
+ StrOpc = AArch64::STRXui;
+ Size = 8;
+ Alignment = Align(8);
+ break;
}
+
+ if (Reg1 == AArch64::VG) {
+ // Find an available register to store value of VG to.
+ Reg1 = findScratchNonCalleeSaveRegister(&MBB);
+ assert(Reg1 != AArch64::NoRegister);
+
+ BuildMI(MBB, MBB.begin(), DL, TII.get(AArch64::CNTD_XPiI), Reg1)
+ .addImm(31)
+ .addImm(1);
+ }
+
LLVM_DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI);
if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
dbgs() << ") -> fi#(" << RPI.FrameIdx;
@@ -3171,6 +3202,8 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
Size = 2;
Alignment = Align(2);
break;
+ case RegPairInfo::VG:
+ continue;
}
LLVM_DEBUG(dbgs() << "CSR restore: (" << printReg(Reg1, TRI);
if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
@@ -3313,6 +3346,11 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
CSStackSize += RegSize;
}
+ // Increase the callee-saved stack size if the function has streaming mode
+ // changes, as we will need to spill the value of the VG register.
+ if (AFI->hasStreamingModeChanges())
+ CSStackSize += 8;
+
// Save number of saved regs, so we can easily update CSStackSize later.
unsigned NumSavedRegs = SavedRegs.count();
@@ -3449,6 +3487,23 @@ bool AArch64FrameLowering::assignCalleeSavedSpillSlots(
if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx;
}
+ // Insert VG into the list of CSRs, immediately before LR if saved.
+ if (AFI->hasStreamingModeChanges()) {
+ auto VGInfo = CalleeSavedInfo(AArch64::VG);
+ VGInfo.setRestored(false);
+ bool InsertBeforeLR = false;
+
+ for (unsigned I = 0; I < CSI.size(); I++)
+ if (CSI[I].getReg() == AArch64::LR) {
+ InsertBeforeLR = true;
+ CSI.insert(CSI.begin() + I, VGInfo);
+ break;
+ }
+
+ if (!InsertBeforeLR)
+ CSI.push_back(VGInfo);
+ }
+
for (auto &CS : CSI) {
Register Reg = CS.getReg();
const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg);
diff --git a/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll b/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll
index 3fa1ee5b9b0114..2a57e4edff8080 100644
--- a/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll
+++ b/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc < %s | FileCheck %s
+; RUN: llc -mattr=+sve < %s | FileCheck %s
; Verify that the following code can be compiled without +sme, because if the
; call is not entered in streaming-SVE mode at runtime, the codepath leading
@@ -10,11 +10,13 @@ target triple = "aarch64"
define void @streaming_compatible() #0 {
; CHECK-LABEL: streaming_compatible:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: and x19, x0, #0x1
; CHECK-NEXT: tbz w19, #0, .LBB0_2
@@ -26,11 +28,12 @@ define void @streaming_compatible() #0 {
; CHECK-NEXT: // %bb.3:
; CHECK-NEXT: smstart sm
; CHECK-NEXT: .LBB0_4:
-; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
call void @non_streaming()
ret void
diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
index 2a78012045ff42..5605556275a96c 100644
--- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
+++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
@@ -11,12 +11,14 @@ declare double @normal_callee(double)
define double @nonstreaming_caller_streaming_callee(double %x) nounwind noinline optnone {
; CHECK-FISEL-LABEL: nonstreaming_caller_streaming_callee:
; CHECK-FISEL: // %bb.0: // %entry
+; CHECK-FISEL-NEXT: cntd x9
; CHECK-FISEL-NEXT: sub sp, sp, #96
; CHECK-FISEL-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-FISEL-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-FISEL-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-FISEL-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
; CHECK-FISEL-NEXT: str x30, [sp, #80] // 8-byte Folded Spill
+; CHECK-FISEL-NEXT: str x9, [sp, #88] // 8-byte Folded Spill
; CHECK-FISEL-NEXT: str d0, [sp] // 8-byte Folded Spill
; CHECK-FISEL-NEXT: smstart sm
; CHECK-FISEL-NEXT: ldr d0, [sp] // 8-byte Folded Reload
@@ -37,12 +39,14 @@ define double @nonstreaming_caller_streaming_callee(double %x) nounwind noinline
;
; CHECK-GISEL-LABEL: nonstreaming_caller_streaming_callee:
; CHECK-GISEL: // %bb.0: // %entry
+; CHECK-GISEL-NEXT: cntd x9
; CHECK-GISEL-NEXT: sub sp, sp, #96
; CHECK-GISEL-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-GISEL-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-GISEL-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-GISEL-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
; CHECK-GISEL-NEXT: str x30, [sp, #80] // 8-byte Folded Spill
+; CHECK-GISEL-NEXT: str x9, [sp, #88] // 8-byte Folded Spill
; CHECK-GISEL-NEXT: str d0, [sp] // 8-byte Folded Spill
; CHECK-GISEL-NEXT: smstart sm
; CHECK-GISEL-NEXT: ldr d0, [sp] // 8-byte Folded Reload
@@ -70,12 +74,14 @@ entry:
define double @streaming_caller_nonstreaming_callee(double %x) nounwind noinline optnone "aarch64_pstate_sm_enabled" {
; CHECK-COMMON-LABEL: streaming_caller_nonstreaming_callee:
; CHECK-COMMON: // %bb.0: // %entry
+; CHECK-COMMON-NEXT: cntd x9
; CHECK-COMMON-NEXT: sub sp, sp, #96
; CHECK-COMMON-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: str x30, [sp, #80] // 8-byte Folded Spill
+; CHECK-COMMON-NEXT: str x9, [sp, #88] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: str d0, [sp] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: smstop sm
; CHECK-COMMON-NEXT: ldr d0, [sp] // 8-byte Folded Reload
@@ -102,12 +108,14 @@ entry:
define double @locally_streaming_caller_normal_callee(double %x) nounwind noinline optnone "aarch64_pstate_sm_body" {
; CHECK-COMMON-LABEL: locally_streaming_caller_normal_callee:
; CHECK-COMMON: // %bb.0:
+; CHECK-COMMON-NEXT: cntd x9
; CHECK-COMMON-NEXT: sub sp, sp, #112
; CHECK-COMMON-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: str x30, [sp, #96] // 8-byte Folded Spill
+; CHECK-COMMON-NEXT: str x9, [sp, #104] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: str d0, [sp, #24] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: smstart sm
; CHECK-COMMON-NEXT: ldr d0, [sp, #24] // 8-byte Folded Reload
@@ -166,11 +174,13 @@ define double @normal_caller_to_locally_streaming_callee(double %x) nounwind noi
define void @locally_streaming_caller_streaming_callee_ptr(ptr %p) nounwind noinline optnone "aarch64_pstate_sm_body" {
; CHECK-COMMON-LABEL: locally_streaming_caller_streaming_callee_ptr:
; CHECK-COMMON: // %bb.0:
+; CHECK-COMMON-NEXT: cntd x9
; CHECK-COMMON-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-COMMON-NEXT: str x9, [sp, #72] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: smstart sm
; CHECK-COMMON-NEXT: blr x0
; CHECK-COMMON-NEXT: smstop sm
@@ -187,11 +197,13 @@ define void @locally_streaming_caller_streaming_callee_ptr(ptr %p) nounwind noin
define void @normal_call_to_streaming_callee_ptr(ptr %p) nounwind noinline optnone {
; CHECK-COMMON-LABEL: normal_call_to_streaming_callee_ptr:
; CHECK-COMMON: // %bb.0:
+; CHECK-COMMON-NEXT: cntd x9
; CHECK-COMMON-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-COMMON-NEXT: str x9, [sp, #72] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: smstart sm
; CHECK-COMMON-NEXT: blr x0
; CHECK-COMMON-NEXT: smstop sm
@@ -325,12 +337,13 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind {
define fp128 @f128_call_sm(fp128 %a, fp128 %b) "aarch64_pstate_sm_enabled" nounwind {
; CHECK-COMMON-LABEL: f128_call_sm:
; CHECK-COMMON: // %bb.0:
+; CHECK-COMMON-NEXT: cntd x9
; CHECK-COMMON-NEXT: sub sp, sp, #112
; CHECK-COMMON-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill
-; CHECK-COMMON-NEXT: str x30, [sp, #96] // 8-byte Folded Spill
+; CHECK-COMMON-NEXT: stp x30, x9, [sp, #96] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp q1, q0, [sp] // 32-byte Folded Spill
; CHECK-COMMON-NEXT: smstop sm
; CHECK-COMMON-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload
@@ -386,12 +399,13 @@ define double @frem_call_za(double %a, double %b) "aarch64_inout_za" nounwind {
define float @frem_call_sm(float %a, float %b) "aarch64_pstate_sm_enabled" nounwind {
; CHECK-COMMON-LABEL: frem_call_sm:
; CHECK-COMMON: // %bb.0:
+; CHECK-COMMON-NEXT: cntd x9
; CHECK-COMMON-NEXT: sub sp, sp, #96
; CHECK-COMMON-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-COMMON-NEXT: str x30, [sp, #80] // 8-byte Folded Spill
+; CHECK-COMMON-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp s1, s0, [sp, #8] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: smstop sm
; CHECK-COMMON-NEXT: ldp s1, s0, [sp, #8] // 8-byte Folded Reload
@@ -414,12 +428,14 @@ define float @frem_call_sm(float %a, float %b) "aarch64_pstate_sm_enabled" nounw
define float @frem_call_sm_compat(float %a, float %b) "aarch64_pstate_sm_compatible" nounwind {
; CHECK-COMMON-LABEL: frem_call_sm_compat:
; CHECK-COMMON: // %bb.0:
-; CHECK-COMMON-NEXT: sub sp, sp, #96
+; CHECK-COMMON-NEXT: cntd x9
+; CHECK-COMMON-NEXT: sub sp, sp, #112
; CHECK-COMMON-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-COMMON-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-COMMON-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-COMMON-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: bl __arm_sme_state
; CHECK-COMMON-NEXT: ldp s2, s0, [sp, #8] // 8-byte Folded Reload
@@ -436,13 +452,14 @@ define float @frem_call_sm_compat(float %a, float %b) "aarch64_pstate_sm_compati
; CHECK-COMMON-NEXT: // %bb.3:
; CHECK-COMMON-NEXT: smstart sm
; CHECK-COMMON-NEXT: .LBB12_4:
-; CHECK-COMMON-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-COMMON-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload
; CHECK-COMMON-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-COMMON-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload
; CHECK-COMMON-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-COMMON-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
+; CHECK-COMMON-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-COMMON-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-COMMON-NEXT: add sp, sp, #96
+; CHECK-COMMON-NEXT: add sp, sp, #112
; CHECK-COMMON-NEXT: ret
%res = frem float %a, %b
ret float %res
diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
index 9d635f0b88f191..c24585a971fb7a 100644
--- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
+++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
@@ -121,13 +121,14 @@ define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_inou
define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za" "aarch64_pstate_sm_compatible" {
; CHECK-LABEL: test_lazy_save_and_conditional_smstart:
; CHECK: // %bb.0:
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: add x29, sp, #64
-; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mov x9, sp
@@ -160,7 +161,7 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za
; CHECK-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEXT: sub sp, x29, #64
; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll
index d5bea725b6d14d..0fb85bb7e05a14 100644
--- a/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll
+++ b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll
@@ -15,12 +15,13 @@ target triple = "aarch64-unknown-unknown-eabi-elf"
define void @dont_coalesce_arg_i8(i8 %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_i8:
; CHECK: // %bb.0:
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: fmov s0, w0
; CHECK-NEXT: mov x19, x1
@@ -32,8 +33,8 @@ define void @dont_coalesce_arg_i8(i8 %arg, ptr %ptr) #0 {
; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: st1b { z0.b }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folde...
[truncated]
|
It seems weird to be generating actual instructions in the text section for the sake of unwind info, particularly for functions marked "nounwind". Suppose we have a function that's aarch64_pstate_sm_body or aarch64_pstate_sm_enabled. We can statically determine whether SME is supposed to be enabled at any given point in the function. If we encode that information into the DWARF, the unwinder/debugger can figure it out too. For aarch64_pstate_sm_compatible, we always have the result of __arm_sme_state saved in some register. (With one minor exception: if we're currently executing code in the function itself or __arm_sme_state, we might not have the value saved yet. But in that case a debugger can directly inspect the PSTATE register). So if we invent an appropriate DWARF encoding, we should be able to represent all the necessary information without requiring any code at runtime, I think. Given that, what's the advantage of generating code to explicitly compute/spill "VG"? |
Hi @efriedma-quic, thank you for taking a look at this!
In the latest commit I have made changes to ensure that functions marked with 'nounwind' will not store VG to the stack.
I created this patch to spill the value of VG at the beginning of the function as this is requested by the ABI, specifically the AArch64 DWARF specification (aadwarf64) linked above. However, I'm not sure why we don't instead encode this information in debug info; perhaps this is something @rsandifo-arm might know more about? |
The mechanism is intended to be more general than simply streaming VL vs non-streaming VL. It is possible to change the non-streaming VL from one value to another, such as via a linux prctl. In those cases there would be no way for the unwinder to recover the previous VL unless the previous VL was specifically saved. So yes, I agree it's unusual to emit code specifically for unwinding purposes, but it seemed like the best compromise. FWIW, there are already other situations where unwinding constrains code generation. For example, if a big-endian function follows the SVE PCS and needs to save Z8, it must do the save using ST1D rather than STR, so that the low 64 bits (D8) are laid out as the unwinder expects. Using ST1D requires a predicate input and has a smaller range, so it isn't the instruction that would naturally be chosen. |
Not sure this is actually the right choice, given the interaction with debug info. Presumably we want debuggers to be able to produce a stack trace for nounwind code. On a related note, I'm not sure what we're generating here is really what we want for aarch64_pstate_sm_body. An aarch64_pstate_sm_body function has two vector lengths: the length used for spilling the callee-saves, and the length used for local variables. If we only store the former, debuggers can't display local variables.
You don't necessarily need to use the same mechanism for this. And I suspect using the same mechanism is going to be sort of awkward.
Yes, I know, there are situations where this comes up. Particularly on Windows. But I'd like to avoid it when it isn't necessary, and if we're defining the spec, we should consider this situation as we write the spec. |
Thank you for raising this @efriedma-quic, it's something that was not handled correctly in my original patch. After some offline discussion about this scenario, I have made changes to the patch to ensure the correct value of VG can be recovered from locally-streaming functions. As I understand it, we need to store both the streaming and non-streaming vector length in the prologue and additionally save the streaming length before calls which require streaming-mode changes. After such calls, I believe we need to use .cfi_restore to set the unwind rule for VG back to what it was on entry to the function. This is based on my understanding of the specification as it is today, but I am happy to try a different implementation if this changes.
Do you know if there is something else I could be checking to decide whether to emit VG for the nounwind case, or if it's better to remove the calls to |
The sequences here seem fine.
I think just remove the needsUnwindTableEntry() checks; it's more important to have consistent debug vs. non-debug codegen, vs. saving a couple instructions in the prologue. One additional concern I just realized looking at the patch again: I don't think we can use cntd in non-streaming mode on targets without SVE. (See #86743 for more discussion of SME without SVE.) |
c521c76
to
fd56aa3
Compare
If the target doesn't have SVE, I don't think there is a need to save VG even in functions which have streaming-mode changes. I've added a helper to AArch64MachineFunctionInfo which will ensure we only spill VG if both hasSVE and hasStreamingModeChanges are true. |
Sure, but "-sve" doesn't mean the target doesn't have SVE; it just means the target may or may not have SVE. What if you compile the code with SVE disabled, and the target actually does end up having SVE at runtime? (e.g. you compile a library in SME-but-not-SVE mode, but it gets linked into an application that uses SVE.) |
I believe checking for hasSVE should be sufficient here. However, there are a couple of cases not covered here if the code is compiled without SVE, but the target has SVE:
It should be quite straightforward to extend this patch to cover the first of the cases above, as we can still emit |
Streaming caller -> non-streaming callee works, sure. From your description, not completely sure how you plan to handle the streaming-compatible caller -> non-streaming callee case; there's no such thing as a conditional cfi_offset, as far as I know. I'd at least like to see the proposed assembly sequence, even if you don't implement it in this patch. The problem case is "non-streaming SVE-enabled caller -> locally-streaming SVE-disabled callee". The caller doesn't set VG because it doesn't know anything about streaming. The callee can't set VG because it can't read the non-streaming vector length. The only way I can think of to make it work, given the specified structure of the unwind data, is a runtime check for whether SVE is available. But the ABI doesn't provide any way to check that at the moment. I don't have a problem with splitting the patches however you think is appropriate, but we need to make sure the proposed scheme actually works before we go forward with it. |
If I understand you correctly, you're concerned about the following case:
where in I think the critical point here is that having SME without also having SVE available in non-streaming mode is an atypical use-case for which the ABI was never really intended. SME is an Armv9-A feature which generally expects SVE to be available (in non-streaming mode), similar to how Armv8-A generally expects AdvSIMD to be available (see note in section A1.5 of the Arm Reference Manual). In LLVM we have tried to keep the two features conceptually separate so as not to tie ourselves in, in case this requirement ever needs to be relaxed. If this is a use-case that ever needs supporting in the ABI, I guess this could be done by adding an extra ABI routine that safely provides the current value of VG iff SVE is available at runtime.
If the ABI spec is intentionally not supporting the combination for now, that's fine, I guess? As you note, it should be feasible to extend the ABI without breaking backward-compatibility. Probably the compiler should report an error on the unsupported combinations, though. |
clang/lib/Sema/SemaChecking.cpp
Outdated
if (CallerFD->hasAttr<ArmLocallyStreamingAttr>()) | ||
Diag(Loc, diag::warn_sme_locally_streaming_no_sve); | ||
|
||
auto CallerStreamingTy = getArmStreamingFnType(CallerFD); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Indentation?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed in latest commit
ae250d9
to
255f040
Compare
Gentle ping :) |
// the streaming value of VG around streaming-mode changes in locally-streaming | ||
// functions. | ||
def VGUnwindInfoPseudo : | ||
Pseudo<(outs), (ins timm0_1:$save_restore), []>, Sched<[]>; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is there a reason to make this one pseudo, instead of two? The two operations have opposite semantics, and opcode space isn't that scarce.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There wasn't any particular reason for only adding only pseudo, so I've split this out into two (VGSavePseudo & VGRestorePseudo).
@@ -3730,6 +3730,12 @@ def warn_gnu_inline_cplusplus_without_extern : Warning< | |||
"'gnu_inline' attribute without 'extern' in C++ treated as externally" | |||
" available, this changed in Clang 10">, | |||
InGroup<DiagGroup<"gnu-inline-cpp-without-extern">>; | |||
def warn_sme_streaming_mode_change_no_sve : Warning< | |||
"function requires a streaming-mode change, unwinding is not possible without 'sve'">, | |||
InGroup<AArch64SMEAttributes>; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This should probably be an error if it's possible to unwind: it's effectively a miscompile. Both here, and in the backend. Add a note suggesting marking the function noexcept
/__attribute((nothrow))
.
If it isn't possible to unwind, you just end up with slightly inaccurate debug info, which is just annoying; probably not worth warning for that.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks @efriedma-quic, I have changed these warnings to errors as suggested and also updated SemaChecking.cpp to only emit them if the callee is not marked with either noexcept
or nothrow
. I thought that we should not emit the errors if fno-exceptions
is used as well, so I've added a check for getLangOpts().Exceptions
in the same place.
I started adding similar asserts to LLVM to ensure that if SVE is not available, that nounwind is set on the callee. I haven't included these in the latest commit however, as many of the LLVM tests for SME do not currently pass -mattr=+sve
and would require updating a lot of CHECK lines. I would prefer to work on this separately and post a new patch following this PR if possible, as I think including all of the test changes will make this PR more difficult to review.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
if (!AFI->requiresVGSpill(MF)) | ||
return false; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This can be removed, because the pseudo should not have been emitted if the function didn't require a spill of VG.
int64_t getVGIdx() const { return VGIdx; }; | ||
void setVGIdx(unsigned Idx) { VGIdx = Idx; }; | ||
|
||
int64_t getStreamingVGIdx() const { return StreamingVGIdx; }; | ||
void setStreamingVGIdx(unsigned Idx) { StreamingVGIdx = Idx; }; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
minor nit: Could you use FrameIdx
instead of just Idx
?
// This is a restore of VG after returning from the call. Emit the | ||
// .cfi_restore instruction, which sets the rule for VG to the same | ||
// as it was on entry to the function. | ||
++MBBI; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not sure why you're incrementing the iterator? I don't think this makes a difference if you remove the pseudo? In fact, when I remove this all the tests still pass.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This was a mistake, the iterator shouldn't be changing here as the pseudos are emitted in the correct place around the call.
TII.get(TargetOpcode::CFI_INSTRUCTION)) | ||
.addCFIIndex(CFIIndex) | ||
.setMIFlags(MachineInstr::FrameSetup); | ||
} else { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There is little common code between the two, so I'd rather see this written as:
case AArch64::VGSavePseudo: {
...
}
case AArch64::VGRestorePseudo: {
...
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Expanding the pseudos has moved to AArch64FrameLowering.cpp. There is more common code here, so I have only added one function to expand both (emitVGSaveRestore
). I can split this into an emitVGSave
/emitVGRestore
if you'd still prefer they be kept separate.
"Expected FrameIdx for VG"); | ||
|
||
const TargetSubtargetInfo &STI = MF.getSubtarget(); | ||
const TargetInstrInfo &TII = *STI.getInstrInfo(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: TII
is already available as a member of AArch64ExpandPseudo
, no need to get it again.
return false; | ||
|
||
int64_t VGFrameIdx = | ||
LocallyStreaming ? AFI->getStreamingVGIdx() : AFI->getVGIdx(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It would be nice if we could keep all this knowledge within the FrameLowering without having information that we implicitly pass (through AFI) between different passes (in this case, PEI and PseudoExpansion).
PEI has a callback named processFunctionBeforeFrameIndicesReplaced
. You could update the VGSave/RestorePseudo
nodes to add the offset, so that the code here simply has to replace the pseudo by a CFI_INSTRUCTION. Or you could replace the pseudo in that callback itself.
@@ -8287,6 +8289,13 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, | |||
|
|||
SDValue InGlue; | |||
if (RequiresSMChange) { | |||
|
|||
if (Subtarget->hasSVE()) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think we can only emit this pseudo when we have asynchronous unwind tables enabled. At the moment, when I compile some code with -fno-asynchronous-unwind-tables
, it will still generate these directives. Perhaps this should have a diagnostic in Clang?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not sure I follow what the issue is... I think we discussed before that we want debuggers to be able to unwind the stack even if a function is nounwind.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
when I compile some code with -fno-asynchronous-unwind-tables, it will still generate these directives. Perhaps this should have a diagnostic in Clang?
I'm happy to add such a diagnostic, but I want to make sure I have the reason for this requirement correct.
If I understand correctly, enabling asynchronous unwind tables is required now that I am emitting the save and restore of VG at the point of each call in the function which changes streaming-mode.
we want debuggers to be able to unwind the stack even if a function is nounwind.
Given this, I'm not entirely sure why we would expect to be able to unwind if -fno-asynchronous-unwind-tables
was used or if the function has been marked as nounwind? I might have misunderstood something though!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There are two different kinds of DWARF "unwind info"; one is the kind that's in a loadable section, and used for EH. The other is in a debug info section, not loaded at runtime. If you specify -fasynchronous-unwind-tables
, you get the former; if you specify -g -fno-asynchronous-unwind-tables
, you get the latter.
If you request no debug info and no unwind tables, we shouldn't emit any DWARF directives.
But like I mentioned, I think we want to unconditionally emit the code to save the VG, whether or not we emit the corresponding DWARF directives.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If we still emit the spill of VG with -fno-asynchronous-unwind-tables
, I don't know how we would be able to recover the correct value without the per-call CFI saves and restores. And without the correct VG value, I don't think it will be possible to recover any VG-based values in the stack frame.
I'm still not sure why this wouldn't require a diagnostic, because if there is not enough information to recover VG then I don't think we can unwind correctly?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If there's no DWARF unwind, nothing can unwind the stack whether or not we store the VG.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@efriedma-quic I think the issue is that when we emit unwind info that is not asynchronous, then the unwinder can't correctly unwind the stack because it would use the wrong value for VG to compute the offsets of callee-saves. So any unwind info that would be produced is broken.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh, right, there's also that dimension. I'm not sure I understand the interaction here, but if there's an issue, can we just force on "asynchronous" unwind info in that case? The point of non-async unwind info isn't that it's a different unwind format; it's just an optimization to reduce the size of the unwind info.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've made changes to needsAsyncDwarfUnwindInfo
in AArch64MachineFunctionInfo.cpp to always return true if the function has streaming-mode changes. I believe this will ensure we emit the correct information when -fno-asynchronous-unwind-tables
is specified.
@@ -8443,9 +8452,16 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, | |||
|
|||
if (RequiresSMChange) { | |||
assert(PStateSM && "Expected a PStateSM to be set"); | |||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: unnecessary newline.
@@ -3768,6 +3768,12 @@ def err_conflicting_attributes_arm_state : Error< | |||
"conflicting attributes for state '%0'">; | |||
def err_sme_streaming_cannot_be_multiversioned : Error< | |||
"streaming function cannot be multi-versioned">; | |||
def err_sme_streaming_mode_change_no_sve : Error< | |||
"function requires a streaming-mode change, unwinding is not possible without 'sve'. " | |||
"Consider marking this function as 'noexcept' or '__attribute__((nothrow))'">; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It probably makes more sense to add a routine to compiler-rt that returns the value of VG if SVE is available, rather than emitting an error here. You can implement that function using the (already existing) interfaces for function-multiversioning to check if SVE is available at runtime. In that case, I think there is little value in having these Clang changes here.
@efriedma-quic are you happy going with that approach instead?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If we call new functions, they need to be part of the ABI. If you're happy to work with your ABI people to document the new interface, I guess it's not a problem.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Documentation for this function was added in ARM-software/abi-aa#263 and #92921 adds the routine to compiler-rt.
d73fa84
to
6d258dc
Compare
6d258dc
to
2750fb2
Compare
F.getUWTableKind() == UWTableKind::Async && | ||
!F.hasMinSize(); | ||
NeedsAsyncDwarfUnwindInfo = | ||
(needsDwarfUnwindInfo(MF) && F.getUWTableKind() == UWTableKind::Async && |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If we can't emit correct async unwind info for functions with outlining/homogeneous epilogues, does that mean we also need to disable outlining/homogeneous epilogues for functions with streaming mode changes?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh, also, this should probably be NeedsAsyncDwarfUnwindInfo = needsDwarfUnwindInfo(MF) && ((F.getUWTableKind() == UWTableKind::Async && !F.hasMinSize()) || AFI->hasStreamingModeChanges());
.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This PR does include a change to AArch64FrameLowering::homogeneousPrologEpilog
which disables homogeneous epilogues if the function has streaming-mode changes.
I hadn't considered outlining, but I can see that when considering candidates we must be able to outline all CFI instructions in the function. Am I correct in thinking that this is the reason we would need to disable outlining when there are streaming-mode changes which require async unwind info?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not sure exactly what the issues are with outlining at this point, but the last time async-unwind was looked at, there apparently were issues.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think outlining from functions with streaming-mode changes needs more investigation. I don't think this is just a concern for async unwind; I noticed that when passing -enable-machine-outliner
to sme-vg-to-stack.ll that some calls are outlined with only one of the smstart/smstop instructions surrounding the call. I'm not sure if this is safe yet, so for now I've disabled outlining for these functions in isFunctionSafeToOutlineFrom
.
@@ -214,7 +232,8 @@ declare double @za_shared_callee(double) "aarch64_inout_za" | |||
define double @za_new_caller_to_za_shared_callee(double %x) nounwind noinline optnone "aarch64_new_za"{ | |||
; CHECK-COMMON-LABEL: za_new_caller_to_za_shared_callee: | |||
; CHECK-COMMON: // %bb.0: // %prelude | |||
; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill | |||
; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill | |||
; CHECK-COMMON-NEXT: str x19, [sp, #16] // 8-byte Folded Spill |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I feel like I must have asked about this at some point, but where is the x19 spill coming from?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The base pointer x19 is added to the list of saved registers in determineCalleeSaves
if the function is in streaming mode or has SVE. The spill was introduced here when I enabled SVE for this test.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh, right, it's not directly connected to this patch, it's just because you're changing the RUN line.
Please file a bug for this.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've taken a closer look at this and I think the changes in @za_new_caller_to_za_shared_callee
are correct. Because it's an aarch64_new_za
function, it has to set up a lazy-save buffer on entry and a variable sized object is allocated on the stack for this. The function is not streaming, so x19 will only be spilled if the function has SVE.
The buffer is not used however, and #81648 was created to remove the lazy-save from tests like this where it is not required.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
If a function requires any streaming-mode change, the vector granule value must be stored to the stack and unwind info must also describe the save of VG to this location. This patch adds VG to the list of callee-saved registers and increases the callee-saved stack size in determineCalleeSaves if the function requires streaming-mode changes. A new type is added to RegPairInfo for VG, which is also used to skip restoring the register in the restore block. See https://github.com/ARM-software/abi-aa/blob/main/aadwarf64/aadwarf64.rst
- Added a test to sme-vg-to-stack.ll with the 'nounwind' attribute
…nges: - Emit both the streaming and non-streaming value of VG in the prologue of functions with the aarch64_pstate_sm_body attribute. - Added the VGUnwindInfoPseudo node which expands to either .cfi_restore or a .cfi_offset depending on the value of the immediate used (0 or 1 respectively). - VGUnwindInfoPseudo nodes are emitted with the smstop/smstart pair around calls to streaming-mode functions from a locally-streaming caller. The .cfi_offset will save the streaming-VG value, whilst the restore sets the rule for VG to the same as it was at the beginning of the function (non-streaming). - The frame index used for the streaming VG value is saved in AArch64FunctionInfo so that it can be used to calculate the offset when expanding the pseudo. - Added the @vg_locally_streaming_fn() test to sme-vg-to-stack.ll
spilled if there are streaming mode changes in the function. - Added requiresVGSpill() to AArch64MachineFunctionInfo which returns true if the function has streaming mode changes and hasSVE is true. With this change, we will no longer spill VG at the beginning of functions if the target does not also have SVE. - Removed SpilledStreamingVG flag from spillCalleeSavedRegisters. - Rebased to include recent changes to the changeStreamingMode interface.
…function. - Change emitCalleeSavedGPRLocations to only emit the non-streaming location of VG in the prologue for locally-streaming functions. - Move the .cfi_offset directive before the smstart/smstop. - Added streaming-compatible tests.
…possible because of streaming-mode changes without SVE available. - Fixed incorrect labels in sme-vg-to-stack.ll
- Rebased after new SME warnings were added to SemaChecking.cpp in main.
- Check for noexcept or nothrow when emitting Clang errors.
…IndicesReplaced and removed handling from AArch64ExpandPseudoInsts. - Removed diagnostics from Clang for unwinding without +sve. - Removed hasSVE() check when emitting pseudos around calls in AArch64ISelLowering. - Emit a call to __arm_get_current_vg from spillCalleeSavedRegisters if HasSVE is false & preserve X0 around the call if live. - Updated LLVM tests with streaming-mode changes to also pass +sve.
a streaming-mode change.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Some very minor nits, but overall LGTM.
bool NeedsWinCFI = needsWinCFI(MF); | ||
bool HasSVE = MF.getSubtarget<AArch64Subtarget>().hasSVE(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: this only has one use, which seems quite far away from the definition. Perhaps just inline it?
} else { | ||
if (HasSVE) | ||
BuildMI(MBB, MI, DL, TII.get(AArch64::CNTD_XPiI), Reg1) | ||
.addImm(31) | ||
.addImm(1) | ||
.setMIFlag(MachineInstr::FrameSetup); | ||
else { | ||
const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>(); | ||
for (const auto &LiveIn : MBB.liveins()) | ||
if (STI.getRegisterInfo()->isSuperOrSubRegisterEq(AArch64::X0, | ||
LiveIn.PhysReg)) | ||
X0Scratch = Reg1; | ||
|
||
if (X0Scratch != AArch64::NoRegister) | ||
BuildMI(MBB, MI, DL, TII.get(AArch64::ORRXrr), Reg1) | ||
.addReg(AArch64::XZR) | ||
.addReg(AArch64::X0, RegState::Undef) | ||
.addReg(AArch64::X0, RegState::Implicit) | ||
.setMIFlag(MachineInstr::FrameSetup); | ||
|
||
const uint32_t *RegMask = TRI->getCallPreservedMask( | ||
MF, CallingConv:: | ||
AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1); | ||
BuildMI(MBB, MI, DL, TII.get(AArch64::BL)) | ||
.addExternalSymbol("__arm_get_current_vg") | ||
.addRegMask(RegMask) | ||
.addReg(AArch64::X0, RegState::ImplicitDefine) | ||
.setMIFlag(MachineInstr::FrameSetup); | ||
Reg1 = AArch64::X0; | ||
} | ||
AFI->setVGIdx(RPI.FrameIdx); | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: put AFI->setVGIdx(RPI.FrameIdx);
in both the if(HasSVE)
and the else
branch, and remove a level of indentation?
} else { | |
if (HasSVE) | |
BuildMI(MBB, MI, DL, TII.get(AArch64::CNTD_XPiI), Reg1) | |
.addImm(31) | |
.addImm(1) | |
.setMIFlag(MachineInstr::FrameSetup); | |
else { | |
const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>(); | |
for (const auto &LiveIn : MBB.liveins()) | |
if (STI.getRegisterInfo()->isSuperOrSubRegisterEq(AArch64::X0, | |
LiveIn.PhysReg)) | |
X0Scratch = Reg1; | |
if (X0Scratch != AArch64::NoRegister) | |
BuildMI(MBB, MI, DL, TII.get(AArch64::ORRXrr), Reg1) | |
.addReg(AArch64::XZR) | |
.addReg(AArch64::X0, RegState::Undef) | |
.addReg(AArch64::X0, RegState::Implicit) | |
.setMIFlag(MachineInstr::FrameSetup); | |
const uint32_t *RegMask = TRI->getCallPreservedMask( | |
MF, CallingConv:: | |
AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1); | |
BuildMI(MBB, MI, DL, TII.get(AArch64::BL)) | |
.addExternalSymbol("__arm_get_current_vg") | |
.addRegMask(RegMask) | |
.addReg(AArch64::X0, RegState::ImplicitDefine) | |
.setMIFlag(MachineInstr::FrameSetup); | |
Reg1 = AArch64::X0; | |
} | |
AFI->setVGIdx(RPI.FrameIdx); | |
} | |
} else if (HasSVE) { | |
BuildMI(MBB, MI, DL, TII.get(AArch64::CNTD_XPiI), Reg1) | |
.addImm(31) | |
.addImm(1) | |
.setMIFlag(MachineInstr::FrameSetup); | |
AFI->setVGIdx(RPI.FrameIdx); | |
} else { | |
const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>(); | |
for (const auto &LiveIn : MBB.liveins()) | |
if (STI.getRegisterInfo()->isSuperOrSubRegisterEq(AArch64::X0, | |
LiveIn.PhysReg)) | |
X0Scratch = Reg1; | |
if (X0Scratch != AArch64::NoRegister) | |
BuildMI(MBB, MI, DL, TII.get(AArch64::ORRXrr), Reg1) | |
.addReg(AArch64::XZR) | |
.addReg(AArch64::X0, RegState::Undef) | |
.addReg(AArch64::X0, RegState::Implicit) | |
.setMIFlag(MachineInstr::FrameSetup); | |
const uint32_t *RegMask = TRI->getCallPreservedMask( | |
MF, CallingConv:: | |
AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1); | |
BuildMI(MBB, MI, DL, TII.get(AArch64::BL)) | |
.addExternalSymbol("__arm_get_current_vg") | |
.addRegMask(RegMask) | |
.addReg(AArch64::X0, RegState::ImplicitDefine) | |
.setMIFlag(MachineInstr::FrameSetup); | |
Reg1 = AArch64::X0; | |
AFI->setVGIdx(RPI.FrameIdx); | |
} |
for (const auto &LiveIn : MBB.liveins()) | ||
if (STI.getRegisterInfo()->isSuperOrSubRegisterEq(AArch64::X0, | ||
LiveIn.PhysReg)) | ||
X0Scratch = Reg1; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this missing a break
?
If so, you can write this using any_of, e.g.:
if (llvm::any_of(MBB.liveins(), [&STI](const RegisterMaskPair &LiveIn) {
return STI.getRegisterInfo()->isSuperOrSubRegisterEq(AArch64::X0,
LiveIn.PhysReg);
}))
X0Scratch = Reg1;
- Removed HasSVE - Use any_of to check if X0 is a livein - Removed level of indentation from block handling VG spill
d6a7fb1
to
24b7e5e
Compare
…lvm#83301) If a function requires any streaming-mode change, the vector granule value must be stored to the stack and unwind info must also describe the save of VG to this location. This patch adds VG to the list of callee-saved registers and increases the callee-saved stack size if the function requires streaming-mode changes. A new type is added to RegPairInfo, which is also used to skip restoring the register used to spill the VG value in the epilogue. See https://github.com/ARM-software/abi-aa/blob/main/aadwarf64/aadwarf64.rst
If a function requires any streaming-mode change, the vector granule
value must be stored to the stack and unwind info must also describe the
save of VG to this location.
This patch adds VG to the list of callee-saved registers and increases the
callee-saved stack size if the function requires streaming-mode changes.
A new type is added to RegPairInfo, which is also used to skip restoring
the register used to spill the VG value in the epilogue.
See https://github.com/ARM-software/abi-aa/blob/main/aadwarf64/aadwarf64.rst