
[AArch64][SME] Save VG for unwind info when changing streaming-mode #83301

Merged
merged 15 commits into llvm:main
Jun 13, 2024

Conversation

kmclaughlin-arm
Contributor

If a function requires any streaming-mode change, the vector granule
value must be stored to the stack and unwind info must also describe the
save of VG to this location.

This patch adds VG to the list of callee-saved registers and increases the
callee-saved stack size if the function requires streaming-mode changes.
A new type is added to RegPairInfo, which is also used to skip restoring
the register used to spill the VG value in the epilogue.

See https://github.com/ARM-software/abi-aa/blob/main/aadwarf64/aadwarf64.rst
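As an illustration, a hand-written sketch of the intended prologue shape for a non-streaming function that calls a streaming callee (function names, register choice, stack offsets and the exact directives below are placeholders, not output from this patch):

    caller:                               // non-streaming caller of a streaming callee
            cntd    x9                    // read the current value of VG
            stp     x29, x30, [sp, #-32]! // spill the usual callee-saves
            str     x9, [sp, #16]         // spill VG alongside them
            .cfi_def_cfa_offset 32
            .cfi_offset w29, -32
            .cfi_offset w30, -24
            .cfi_offset vg, -16           // unwind info describes the save of VG
            smstart sm                    // the streaming-mode change happens after the save
            bl      streaming_callee
            smstop  sm
            ldp     x29, x30, [sp], #32   // the scratch register that held VG is not reloaded
            ret

In the common case the extra cost is the cntd, one store, and one additional 8-byte slot in the callee-save area.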

@llvmbot

llvmbot commented Feb 28, 2024

@llvm/pr-subscribers-clang

@llvm/pr-subscribers-backend-aarch64

Author: Kerry McLaughlin (kmclaughlin-arm)

Changes

If a function requires any streaming-mode change, the vector granule
value must be stored to the stack and unwind info must also describe the
save of VG to this location.

This patch adds VG to the list of callee-saved registers and increases the
callee-saved stack size if the function requires streaming-mode changes.
A new type is added to RegPairInfo, which is also used to skip restoring
the register used to spill the VG value in the epilogue.

See https://github.com/ARM-software/abi-aa/blob/main/aadwarf64/aadwarf64.rst


Patch is 166.12 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/83301.diff

11 Files Affected:

  • (modified) llvm/lib/Target/AArch64/AArch64FrameLowering.cpp (+57-2)
  • (modified) llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll (+8-5)
  • (modified) llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll (+24-7)
  • (modified) llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll (+3-2)
  • (modified) llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll (+279-193)
  • (modified) llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll (+19-11)
  • (modified) llvm/test/CodeGen/AArch64/sme-streaming-body.ll (+57-40)
  • (modified) llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll (+66-45)
  • (modified) llvm/test/CodeGen/AArch64/sme-streaming-interface.ll (+26-13)
  • (modified) llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll (+5-4)
  • (added) llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll (+641)
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 5cc612e89162af..68564bc2ea7bf5 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -321,7 +321,7 @@ bool AArch64FrameLowering::homogeneousPrologEpilog(
     return false;
 
   auto *AFI = MF.getInfo<AArch64FunctionInfo>();
-  if (AFI->hasSwiftAsyncContext())
+  if (AFI->hasSwiftAsyncContext() || AFI->hasStreamingModeChanges())
     return false;
 
   // If there are an odd number of GPRs before LR and FP in the CSRs list,
@@ -691,6 +691,9 @@ static void emitCalleeSavedRestores(MachineBasicBlock &MBB,
         !static_cast<const AArch64RegisterInfo &>(TRI).regNeedsCFI(Reg, Reg))
       continue;
 
+    if (!Info.isRestored())
+      continue;
+
     unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createRestore(
         nullptr, TRI.getDwarfRegNum(Info.getReg(), true)));
     BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
@@ -1344,6 +1347,7 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
     MachineInstr::MIFlag FrameFlag = MachineInstr::FrameSetup,
     int CFAOffset = 0) {
   unsigned NewOpc;
+
   switch (MBBI->getOpcode()) {
   default:
     llvm_unreachable("Unexpected callee-save save/restore opcode!");
@@ -1651,6 +1655,13 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
     LiveRegs.removeReg(AArch64::LR);
   }
 
+  // If the function contains streaming mode changes, we expect the first
+  // instruction of MBB to be a CNTD. Move past this instruction if found.
+  if (AFI->hasStreamingModeChanges()) {
+    assert(MBBI->getOpcode() == AArch64::CNTD_XPiI && "Unexpected instruction");
+    MBBI = std::next(MBBI);
+  }
+
   auto VerifyClobberOnExit = make_scope_exit([&]() {
     if (NonFrameStart == MBB.end())
       return;
@@ -2756,7 +2767,7 @@ struct RegPairInfo {
   unsigned Reg2 = AArch64::NoRegister;
   int FrameIdx;
   int Offset;
-  enum RegType { GPR, FPR64, FPR128, PPR, ZPR } Type;
+  enum RegType { GPR, FPR64, FPR128, PPR, ZPR, VG } Type;
 
   RegPairInfo() = default;
 
@@ -2768,6 +2779,7 @@ struct RegPairInfo {
       return 2;
     case GPR:
     case FPR64:
+    case VG:
       return 8;
     case ZPR:
     case FPR128:
@@ -2833,6 +2845,8 @@ static void computeCalleeSaveRegisterPairs(
       RPI.Type = RegPairInfo::ZPR;
     else if (AArch64::PPRRegClass.contains(RPI.Reg1))
       RPI.Type = RegPairInfo::PPR;
+    else if (RPI.Reg1 == AArch64::VG)
+      RPI.Type = RegPairInfo::VG;
     else
       llvm_unreachable("Unsupported register class.");
 
@@ -2860,6 +2874,7 @@ static void computeCalleeSaveRegisterPairs(
         break;
       case RegPairInfo::PPR:
       case RegPairInfo::ZPR:
+      case RegPairInfo::VG:
         break;
       }
     }
@@ -3047,7 +3062,23 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
        Size = 2;
        Alignment = Align(2);
        break;
+    case RegPairInfo::VG:
+      StrOpc = AArch64::STRXui;
+      Size = 8;
+      Alignment = Align(8);
+      break;
     }
+
+    if (Reg1 == AArch64::VG) {
+      // Find an available register to store value of VG to.
+      Reg1 = findScratchNonCalleeSaveRegister(&MBB);
+      assert(Reg1 != AArch64::NoRegister);
+
+      BuildMI(MBB, MBB.begin(), DL, TII.get(AArch64::CNTD_XPiI), Reg1)
+          .addImm(31)
+          .addImm(1);
+    }
+
     LLVM_DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI);
                if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
                dbgs() << ") -> fi#(" << RPI.FrameIdx;
@@ -3171,6 +3202,8 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
        Size = 2;
        Alignment = Align(2);
        break;
+    case RegPairInfo::VG:
+      continue;
     }
     LLVM_DEBUG(dbgs() << "CSR restore: (" << printReg(Reg1, TRI);
                if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
@@ -3313,6 +3346,11 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
       CSStackSize += RegSize;
   }
 
+  // Increase the callee-saved stack size if the function has streaming mode
+  // changes, as we will need to spill the value of the VG register.
+  if (AFI->hasStreamingModeChanges())
+    CSStackSize += 8;
+
   // Save number of saved regs, so we can easily update CSStackSize later.
   unsigned NumSavedRegs = SavedRegs.count();
 
@@ -3449,6 +3487,23 @@ bool AArch64FrameLowering::assignCalleeSavedSpillSlots(
     if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx;
   }
 
+  // Insert VG into the list of CSRs, immediately before LR if saved.
+  if (AFI->hasStreamingModeChanges()) {
+    auto VGInfo = CalleeSavedInfo(AArch64::VG);
+    VGInfo.setRestored(false);
+    bool InsertBeforeLR = false;
+
+    for (unsigned I = 0; I < CSI.size(); I++)
+      if (CSI[I].getReg() == AArch64::LR) {
+        InsertBeforeLR = true;
+        CSI.insert(CSI.begin() + I, VGInfo);
+        break;
+      }
+
+    if (!InsertBeforeLR)
+      CSI.push_back(VGInfo);
+  }
+
   for (auto &CS : CSI) {
     Register Reg = CS.getReg();
     const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg);
diff --git a/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll b/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll
index 3fa1ee5b9b0114..2a57e4edff8080 100644
--- a/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll
+++ b/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc < %s | FileCheck %s
+; RUN: llc -mattr=+sve < %s | FileCheck %s
 
 ; Verify that the following code can be compiled without +sme, because if the
 ; call is not entered in streaming-SVE mode at runtime, the codepath leading
@@ -10,11 +10,13 @@ target triple = "aarch64"
 define void @streaming_compatible() #0 {
 ; CHECK-LABEL: streaming_compatible:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    str x19, [sp, #80] // 8-byte Folded Spill
 ; CHECK-NEXT:    bl __arm_sme_state
 ; CHECK-NEXT:    and x19, x0, #0x1
 ; CHECK-NEXT:    tbz w19, #0, .LBB0_2
@@ -26,11 +28,12 @@ define void @streaming_compatible() #0 {
 ; CHECK-NEXT:  // %bb.3:
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:  .LBB0_4:
-; CHECK-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x19, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
   call void @non_streaming()
   ret void
diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
index 2a78012045ff42..5605556275a96c 100644
--- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
+++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
@@ -11,12 +11,14 @@ declare double @normal_callee(double)
 define double @nonstreaming_caller_streaming_callee(double %x) nounwind noinline optnone {
 ; CHECK-FISEL-LABEL: nonstreaming_caller_streaming_callee:
 ; CHECK-FISEL:       // %bb.0: // %entry
+; CHECK-FISEL-NEXT:    cntd x9
 ; CHECK-FISEL-NEXT:    sub sp, sp, #96
 ; CHECK-FISEL-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
 ; CHECK-FISEL-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-FISEL-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-FISEL-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
 ; CHECK-FISEL-NEXT:    str x30, [sp, #80] // 8-byte Folded Spill
+; CHECK-FISEL-NEXT:    str x9, [sp, #88] // 8-byte Folded Spill
 ; CHECK-FISEL-NEXT:    str d0, [sp] // 8-byte Folded Spill
 ; CHECK-FISEL-NEXT:    smstart sm
 ; CHECK-FISEL-NEXT:    ldr d0, [sp] // 8-byte Folded Reload
@@ -37,12 +39,14 @@ define double @nonstreaming_caller_streaming_callee(double %x) nounwind noinline
 ;
 ; CHECK-GISEL-LABEL: nonstreaming_caller_streaming_callee:
 ; CHECK-GISEL:       // %bb.0: // %entry
+; CHECK-GISEL-NEXT:    cntd x9
 ; CHECK-GISEL-NEXT:    sub sp, sp, #96
 ; CHECK-GISEL-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
 ; CHECK-GISEL-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-GISEL-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-GISEL-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
 ; CHECK-GISEL-NEXT:    str x30, [sp, #80] // 8-byte Folded Spill
+; CHECK-GISEL-NEXT:    str x9, [sp, #88] // 8-byte Folded Spill
 ; CHECK-GISEL-NEXT:    str d0, [sp] // 8-byte Folded Spill
 ; CHECK-GISEL-NEXT:    smstart sm
 ; CHECK-GISEL-NEXT:    ldr d0, [sp] // 8-byte Folded Reload
@@ -70,12 +74,14 @@ entry:
 define double @streaming_caller_nonstreaming_callee(double %x) nounwind noinline optnone "aarch64_pstate_sm_enabled" {
 ; CHECK-COMMON-LABEL: streaming_caller_nonstreaming_callee:
 ; CHECK-COMMON:       // %bb.0: // %entry
+; CHECK-COMMON-NEXT:    cntd x9
 ; CHECK-COMMON-NEXT:    sub sp, sp, #96
 ; CHECK-COMMON-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    str x30, [sp, #80] // 8-byte Folded Spill
+; CHECK-COMMON-NEXT:    str x9, [sp, #88] // 8-byte Folded Spill
 ; CHECK-COMMON-NEXT:    str d0, [sp] // 8-byte Folded Spill
 ; CHECK-COMMON-NEXT:    smstop sm
 ; CHECK-COMMON-NEXT:    ldr d0, [sp] // 8-byte Folded Reload
@@ -102,12 +108,14 @@ entry:
 define double @locally_streaming_caller_normal_callee(double %x) nounwind noinline optnone "aarch64_pstate_sm_body" {
 ; CHECK-COMMON-LABEL: locally_streaming_caller_normal_callee:
 ; CHECK-COMMON:       // %bb.0:
+; CHECK-COMMON-NEXT:    cntd x9
 ; CHECK-COMMON-NEXT:    sub sp, sp, #112
 ; CHECK-COMMON-NEXT:    stp d15, d14, [sp, #32] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp d13, d12, [sp, #48] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp d11, d10, [sp, #64] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp d9, d8, [sp, #80] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    str x30, [sp, #96] // 8-byte Folded Spill
+; CHECK-COMMON-NEXT:    str x9, [sp, #104] // 8-byte Folded Spill
 ; CHECK-COMMON-NEXT:    str d0, [sp, #24] // 8-byte Folded Spill
 ; CHECK-COMMON-NEXT:    smstart sm
 ; CHECK-COMMON-NEXT:    ldr d0, [sp, #24] // 8-byte Folded Reload
@@ -166,11 +174,13 @@ define double @normal_caller_to_locally_streaming_callee(double %x) nounwind noi
 define void @locally_streaming_caller_streaming_callee_ptr(ptr %p) nounwind noinline optnone "aarch64_pstate_sm_body" {
 ; CHECK-COMMON-LABEL: locally_streaming_caller_streaming_callee_ptr:
 ; CHECK-COMMON:       // %bb.0:
+; CHECK-COMMON-NEXT:    cntd x9
 ; CHECK-COMMON-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-COMMON-NEXT:    str x9, [sp, #72] // 8-byte Folded Spill
 ; CHECK-COMMON-NEXT:    smstart sm
 ; CHECK-COMMON-NEXT:    blr x0
 ; CHECK-COMMON-NEXT:    smstop sm
@@ -187,11 +197,13 @@ define void @locally_streaming_caller_streaming_callee_ptr(ptr %p) nounwind noin
 define void @normal_call_to_streaming_callee_ptr(ptr %p) nounwind noinline optnone {
 ; CHECK-COMMON-LABEL: normal_call_to_streaming_callee_ptr:
 ; CHECK-COMMON:       // %bb.0:
+; CHECK-COMMON-NEXT:    cntd x9
 ; CHECK-COMMON-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-COMMON-NEXT:    str x9, [sp, #72] // 8-byte Folded Spill
 ; CHECK-COMMON-NEXT:    smstart sm
 ; CHECK-COMMON-NEXT:    blr x0
 ; CHECK-COMMON-NEXT:    smstop sm
@@ -325,12 +337,13 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind {
 define fp128 @f128_call_sm(fp128 %a, fp128 %b) "aarch64_pstate_sm_enabled" nounwind {
 ; CHECK-COMMON-LABEL: f128_call_sm:
 ; CHECK-COMMON:       // %bb.0:
+; CHECK-COMMON-NEXT:    cntd x9
 ; CHECK-COMMON-NEXT:    sub sp, sp, #112
 ; CHECK-COMMON-NEXT:    stp d15, d14, [sp, #32] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp d13, d12, [sp, #48] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp d11, d10, [sp, #64] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp d9, d8, [sp, #80] // 16-byte Folded Spill
-; CHECK-COMMON-NEXT:    str x30, [sp, #96] // 8-byte Folded Spill
+; CHECK-COMMON-NEXT:    stp x30, x9, [sp, #96] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp q1, q0, [sp] // 32-byte Folded Spill
 ; CHECK-COMMON-NEXT:    smstop sm
 ; CHECK-COMMON-NEXT:    ldp q1, q0, [sp] // 32-byte Folded Reload
@@ -386,12 +399,13 @@ define double @frem_call_za(double %a, double %b) "aarch64_inout_za" nounwind {
 define float @frem_call_sm(float %a, float %b) "aarch64_pstate_sm_enabled" nounwind {
 ; CHECK-COMMON-LABEL: frem_call_sm:
 ; CHECK-COMMON:       // %bb.0:
+; CHECK-COMMON-NEXT:    cntd x9
 ; CHECK-COMMON-NEXT:    sub sp, sp, #96
 ; CHECK-COMMON-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-COMMON-NEXT:    str x30, [sp, #80] // 8-byte Folded Spill
+; CHECK-COMMON-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp s1, s0, [sp, #8] // 8-byte Folded Spill
 ; CHECK-COMMON-NEXT:    smstop sm
 ; CHECK-COMMON-NEXT:    ldp s1, s0, [sp, #8] // 8-byte Folded Reload
@@ -414,12 +428,14 @@ define float @frem_call_sm(float %a, float %b) "aarch64_pstate_sm_enabled" nounw
 define float @frem_call_sm_compat(float %a, float %b) "aarch64_pstate_sm_compatible" nounwind {
 ; CHECK-COMMON-LABEL: frem_call_sm_compat:
 ; CHECK-COMMON:       // %bb.0:
-; CHECK-COMMON-NEXT:    sub sp, sp, #96
+; CHECK-COMMON-NEXT:    cntd x9
+; CHECK-COMMON-NEXT:    sub sp, sp, #112
 ; CHECK-COMMON-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-COMMON-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-COMMON-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-COMMON-NEXT:    str x19, [sp, #96] // 8-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp s0, s1, [sp, #8] // 8-byte Folded Spill
 ; CHECK-COMMON-NEXT:    bl __arm_sme_state
 ; CHECK-COMMON-NEXT:    ldp s2, s0, [sp, #8] // 8-byte Folded Reload
@@ -436,13 +452,14 @@ define float @frem_call_sm_compat(float %a, float %b) "aarch64_pstate_sm_compati
 ; CHECK-COMMON-NEXT:  // %bb.3:
 ; CHECK-COMMON-NEXT:    smstart sm
 ; CHECK-COMMON-NEXT:  .LBB12_4:
-; CHECK-COMMON-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-COMMON-NEXT:    ldr s0, [sp, #12] // 4-byte Folded Reload
 ; CHECK-COMMON-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-COMMON-NEXT:    ldr s0, [sp, #12] // 4-byte Folded Reload
 ; CHECK-COMMON-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-COMMON-NEXT:    ldr x19, [sp, #96] // 8-byte Folded Reload
+; CHECK-COMMON-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
 ; CHECK-COMMON-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
 ; CHECK-COMMON-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-COMMON-NEXT:    add sp, sp, #96
+; CHECK-COMMON-NEXT:    add sp, sp, #112
 ; CHECK-COMMON-NEXT:    ret
   %res = frem float %a, %b
   ret float %res
diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
index 9d635f0b88f191..c24585a971fb7a 100644
--- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
+++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
@@ -121,13 +121,14 @@ define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_inou
 define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za" "aarch64_pstate_sm_compatible" {
 ; CHECK-LABEL: test_lazy_save_and_conditional_smstart:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEXT:    add x29, sp, #64
-; CHECK-NEXT:    str x19, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    rdsvl x8, #1
 ; CHECK-NEXT:    mov x9, sp
@@ -160,7 +161,7 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za
 ; CHECK-NEXT:    msr TPIDR2_EL0, xzr
 ; CHECK-NEXT:    sub sp, x29, #64
 ; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll
index d5bea725b6d14d..0fb85bb7e05a14 100644
--- a/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll
+++ b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll
@@ -15,12 +15,13 @@ target triple = "aarch64-unknown-unknown-eabi-elf"
 define void @dont_coalesce_arg_i8(i8 %arg, ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_arg_i8:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    fmov s0, w0
 ; CHECK-NEXT:    mov x19, x1
@@ -32,8 +33,8 @@ define void @dont_coalesce_arg_i8(i8 %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    ldr z0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    st1b { z0.b }, p0, [x19]
 ; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folde...
[truncated]

@efriedma-quic
Collaborator

It seems weird to be generating actual instructions in the text section for the sake of unwind info, particularly for functions marked "nounwind".

Suppose we have a function that's aarch64_pstate_sm_body or aarch64_pstate_sm_enabled. We can statically determine whether SME is supposed to be enabled at any given point in the function. If we encode that information into the DWARF, the unwinder/debugger can figure it out too.

For aarch64_pstate_sm_compatible, we always have the result of __arm_sme_state saved in some register. (With one minor exception: if we're currently executing code in the function itself or __arm_sme_state, we might not have the value saved yet. But in that case a debugger can directly inspect the PSTATE register).

So if we invent an appropriate DWARF encoding, we should be able to represent all the necessary information without requiring any code at runtime, I think. Given that, what's the advantage of generating code to explicitly compute/spill "VG"?

@kmclaughlin-arm
Contributor Author

Hi @efriedma-quic, thank you for taking a look at this!

It seems weird to be generating actual instructions in the text section for the sake of unwind info, particularly for functions marked "nounwind".

In the latest commit I have made changes to ensure that functions marked with 'nounwind' will not store VG to the stack.

So if we invent an appropriate DWARF encoding, we should be able to represent all the necessary information without requiring any code at runtime, I think. Given that, what's the advantage of generating code to explicitly compute/spill "VG"?

I created this patch to spill the value of VG at the beginning of the function as this is requested by the ABI, specifically "the function's executable code must save the old value of VG to some location L before the operation that might change VG."

However, I'm not sure why we don't instead encode this information in debug info; perhaps this is something @rsandifo-arm might know more about?

@rsandifo-arm
Collaborator

The mechanism is intended to be more general than simply streaming VL vs non-streaming VL. It is possible to change the non-streaming VL from one value to another, such as via a linux prctl. In those cases there would be no way for the unwinder to recover the previous VL unless the previous VL was specifically saved.

So yes, I agree it's unusual to emit code specifically for unwinding purposes, but it seemed like the best compromise.

FWIW, there are already other situations where unwinding constrains code generation. For example, if a big-endian function follows the SVE PCS and needs to save Z8, it must do the save using ST1D rather than STR, so that the low 64 bits (D8) are laid out as the unwinder expects. Using ST1D requires a predicate input and has a smaller range, so it isn't the instruction that would naturally be chosen.
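For example (a hand-written sketch, not compiler output; stack allocation omitted), the big-endian save of z8 looks roughly like:

            ptrue   p0.d
            st1d    { z8.d }, p0, [sp]    // element-wise store: the low 64 bits (d8) land where the unwinder expects
            // rather than:  str z8, [sp] // byte-stream store: wrong layout for big-endian unwinding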

@efriedma-quic
Collaborator

In the latest commit I have made changes to ensure that functions marked with 'nounwind' will not store VG to the stack.

Not sure this is actually the right choice, given the interaction with debug info. Presumably we want debuggers to be able to produce a stack trace for nounwind code.

On a related note, I'm not sure what we're generating here is really what we want for aarch64_pstate_sm_body. A aarch64_pstate_sm_body has two vector lengths: the length used for spilling the callee-saves, and the length used for local variables. If we only store the former, debuggers can't display local variables.

The mechanism is intended to be more general than simply streaming VL vs non-streaming VL. It is possible to change the non-streaming VL from one value to another, such as via a linux prctl. In those cases there would be no way for the unwinder to recover the previous VL unless the previous VL was specifically saved.

You don't necessarily need to use the same mechanism for this. And I suspect using the same mechanism is going to be sort of awkward.

FWIW, there are already other situations where unwinding constrains code generation.

Yes, I know, there are situations where this comes up. Particularly on Windows. But I'd like to avoid it when it isn't necessary, and if we're defining the spec, we should consider this situation as we write the spec.

@kmclaughlin-arm
Contributor Author

On a related note, I'm not sure what we're generating here is really what we want for aarch64_pstate_sm_body. A aarch64_pstate_sm_body has two vector lengths: the length used for spilling the callee-saves, and the length used for local variables. If we only store the former, debuggers can't display local variables.

Thank you for raising this @efriedma-quic , it's something that was not handled correctly in my original patch. After some offline discussion about this scenario, I have made changes to the patch to ensure the correct value of VG can be recovered from locally-streaming functions. As I understand it, we need to store both the streaming and non-streaming vector length in the prologue and additionally save the streaming length before calls which require streaming-mode changes. After such calls, I believe we need to use .cfi_restore to set the rule for VG to the same as it was at the beginning of the function (which will be the non-streaming length).

This is based on my understanding of the specification as it is today, but I am happy to try a different implementation if this changes.
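Concretely, the directive ordering around such a call would look something like the following (offsets and the callee name are placeholders; the prologue is assumed to have already saved both the non-streaming and the streaming VG):

            .cfi_offset vg, -16           // point the VG rule at the streaming value saved in the prologue
            smstop  sm                    // leave streaming mode for the call
            bl      non_streaming_callee
            smstart sm
            .cfi_restore vg               // back to the entry rule, i.e. the non-streaming value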

Not sure this is actually the right choice, given the interaction with debug info. Presumably we want debuggers to be able to produce a stack trace for nounwind code.

Do you know if there is something else I could be checking to decide whether to emit VG for the nounwind case, or if it's better to remove the calls to F.needsUnwindTableEntry() that I added in the last commit altogether?

@efriedma-quic
Collaborator

On a related note, I'm not sure what we're generating here is really what we want for aarch64_pstate_sm_body. A aarch64_pstate_sm_body has two vector lengths: the length used for spilling the callee-saves, and the length used for local variables. If we only store the former, debuggers can't display local variables.

Thank you for raising this @efriedma-quic , it's something that was not handled correctly in my original patch. After some offline discussion about this scenario, I have made changes to the patch to ensure the correct value of VG can be recovered from locally-streaming functions. As I understand it, we need to store both the streaming and non-streaming vector length in the prologue and additionally save the streaming length before calls which require streaming-mode changes. After such calls, I believe we need to use .cfi_restore to set the rule for VG to the same as it was at the beginning of the function (which will be the non-streaming length).

This is based on my understanding of the specification as it is today, but I am happy to try a different implementation if this changes.

The sequences here seem fine.

Not sure this is actually the right choice, given the interaction with debug info. Presumably we want debuggers to be able to produce a stack trace for nounwind code.

Do you know if there is something else I could be checking to decide whether to emit VG for the nounwind case, or if it's better to remove the calls to F.needsUnwindTableEntry() that I added in the last commit altogether?

I think just remove the needsUnwindTableEntry() checks; it's more important to have consistent debug vs. non-debug codegen, vs. saving a couple instructions in the prologue.


One additional concern I just realized looking at the patch again: I don't think we can use cntd in non-streaming mode on targets without SVE. (See #86743 for more discussion of SME without SVE.)

@kmclaughlin-arm
Contributor Author

One additional concern I just realized looking at the patch again: I don't think we can use cntd in non-streaming mode on targets without SVE. (See #86743 for more discussion of SME without SVE.)

If the target doesn't have SVE, I don't think there is a need to save VG even in functions which have streaming-mode changes. I've added a helper to AArch64MachineFunctionInfo which will ensure we only spill VG if both hasSVE and hasStreamingModeChanges are true.

@efriedma-quic
Collaborator

efriedma-quic commented Apr 5, 2024

If the target doesn't have SVE, I don't think there is a need to save VG even in functions which have streaming-mode changes

Sure, but "-sve" doesn't mean the target doesn't have SVE; it just means the target may or may not have SVE. What if you compile the code with SVE disabled, and the target actually does end up having SVE at runtime? (e.g. you compile a library in SME-but-not-SVE mode, but it gets linked into an application that uses SVE.)

@kmclaughlin-arm
Contributor Author

What if you compile the code with SVE disabled, and the target actually does end up having SVE at runtime? (e.g. you compile a library in SME-but-not-SVE mode, but it gets linked into an application that uses SVE.)

I believe checking for hasSVE will be sufficient for non-streaming functions which contain a call to a streaming-mode function. Even if the target does end up having SVE, there shouldn't be any relevant VG value in the function to save if it was only compiled with SME.

However, there are a couple of cases not covered here if the code is compiled without SVE, but the target has SVE:

  • Streaming caller -> non-streaming callee.
  • Streaming-compatible caller -> non-streaming callee.

It should be quite straightforward to extend this patch to cover the first of the cases above, as we can still emit cntd in the prologue if the function is in streaming mode. However, the second will be more involved as we can only emit the cntd & .cfi_offset if the condition guarding the smstart/smstop is true at runtime. To keep this first patch simpler, would you be happy for this to be limited to handle only the typical cases where the target has SVE?

@efriedma-quic
Collaborator

Streaming caller -> non-streaming callee works, sure.

From your description, not completely sure how you plan to handle the streaming-compatible caller -> non-streaming callee case; there's no such thing as a conditional cfi_offset, as far as I know. I'd at least like to see the proposed assembly sequence, even if you don't implement it in this patch.

The problem case is "non-streaming SVE-enabled caller -> locally-streaming SVE-disabled callee". The caller doesn't set VG because it doesn't know anything about streaming. The callee can't set VG because it can't read the non-streaming vector length. The only way I can think of to make it work, given the specified structure of the unwind data, is a runtime check for whether SVE is available. But the ABI doesn't provide any way to check that at the moment.

I don't have a problem with splitting the patches however you think is appropriate, but we need to make sure the proposed scheme actually works before we go forward with it.

@sdesmalen-arm
Collaborator

The problem case is "non-streaming SVE-enabled caller -> locally-streaming SVE-disabled callee". The caller doesn't set VG because it doesn't know anything about streaming. The callee can't set VG because it can't read the non-streaming vector length. The only way I can think of to make it work, given the specified structure of the unwind data, is a runtime check for whether SVE is available. But the ABI doesn't provide any way to check that at the moment.

If I understand you correctly, you're concerned about the following case:

__attribute__((target("+sme,+nosve")))
void c() __arm_streaming {
  ...
}

__attribute__((target("+sme,+nosve")))
void b() { c(); }

__attribute__((target("+sve")))
void a() { b(); }

where in b() it's not possible to read and save VG (since the function is compiled with +nosve and the function is not in streaming mode), so when unwinding from c->b->a there is no information to tell the unwinder/debugger what the value of VG is inside a().

I think the critical point here is that having SME without also having SVE available in non-streaming mode is an atypical use-case for which the ABI was never really intended. SME is an Armv9-A feature which generally expects SVE to be available (in non-streaming mode), similar to how Armv8-A generally expects AdvSIMD to be available (see the note in section A1.5 of the Arm Reference Manual). In LLVM we have tried to keep the two features conceptually separate so as not to tie ourselves in, in case this requirement ever needs to be relaxed.

If this is a use-case that ever needs supporting in the ABI, I guess this could be done by adding an extra ABI routine that safely provides the current value of VG iff SVE is available at runtime.

@efriedma-quic
Collaborator

If the ABI spec is intentionally not supporting the combination for now, that's fine, I guess? As you note, it should be feasible to extend the ABI without breaking backward-compatibility. Probably the compiler should report an error on the unsupported combinations, though.

@llvmbot added the clang and clang:frontend labels Apr 19, 2024
if (CallerFD->hasAttr<ArmLocallyStreamingAttr>())
Diag(Loc, diag::warn_sme_locally_streaming_no_sve);

auto CallerStreamingTy = getArmStreamingFnType(CallerFD);
Collaborator

Indentation?

Contributor Author

Fixed in latest commit

@kmclaughlin-arm
Contributor Author

Gentle ping :)

// the streaming value of VG around streaming-mode changes in locally-streaming
// functions.
def VGUnwindInfoPseudo :
Pseudo<(outs), (ins timm0_1:$save_restore), []>, Sched<[]>;
Collaborator

Is there a reason to make this one pseudo, instead of two? The two operations have opposite semantics, and opcode space isn't that scarce.

Contributor Author

There wasn't any particular reason for adding only one pseudo, so I've split this out into two (VGSavePseudo & VGRestorePseudo).

@@ -3730,6 +3730,12 @@ def warn_gnu_inline_cplusplus_without_extern : Warning<
  "'gnu_inline' attribute without 'extern' in C++ treated as externally"
  " available, this changed in Clang 10">,
  InGroup<DiagGroup<"gnu-inline-cpp-without-extern">>;
def warn_sme_streaming_mode_change_no_sve : Warning<
  "function requires a streaming-mode change, unwinding is not possible without 'sve'">,
  InGroup<AArch64SMEAttributes>;
Collaborator

This should probably be an error if it's possible to unwind: it's effectively a miscompile. Both here, and in the backend. Add a note suggesting marking the function noexcept/__attribute__((nothrow)).

If it isn't possible to unwind, you just end up with slightly inaccurate debug info, which is just annoying; probably not worth warning for that.

Contributor Author

Thanks @efriedma-quic, I have changed these warnings to errors as suggested and also updated SemaChecking.cpp to only emit them if the callee is not marked with either noexcept or nothrow. I thought that we should not emit the errors if -fno-exceptions is used as well, so I've added a check for getLangOpts().Exceptions in the same place.

I started adding similar asserts to LLVM to ensure that, if SVE is not available, nounwind is set on the callee. I haven't included these in the latest commit however, as many of the LLVM tests for SME do not currently pass -mattr=+sve and would require updating a lot of CHECK lines. I would prefer to work on this separately and post a new patch following this PR if possible, as I think including all of the test changes will make this PR more difficult to review.

Collaborator

@efriedma-quic left a comment

LGTM

Comment on lines 1564 to 1565
if (!AFI->requiresVGSpill(MF))
  return false;
Collaborator

This can be removed, because the pseudo should not have been emitted if the function didn't require a spill of VG.

Comment on lines 231 to 235
int64_t getVGIdx() const { return VGIdx; };
void setVGIdx(unsigned Idx) { VGIdx = Idx; };

int64_t getStreamingVGIdx() const { return StreamingVGIdx; };
void setStreamingVGIdx(unsigned Idx) { StreamingVGIdx = Idx; };
Collaborator

minor nit: Could you use FrameIdx instead of just Idx?

// This is a restore of VG after returning from the call. Emit the
// .cfi_restore instruction, which sets the rule for VG to the same
// as it was on entry to the function.
++MBBI;
Collaborator

I'm not sure why you're incrementing the iterator? I don't think this makes a difference if you remove the pseudo? In fact, when I remove this all the tests still pass.

Contributor Author

This was a mistake, the iterator shouldn't be changing here as the pseudos are emitted in the correct place around the call.

TII.get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlags(MachineInstr::FrameSetup);
} else {
Collaborator

There is little common code between the two, so I'd rather see this written as:

case AArch64::VGSavePseudo: {
  ...
}
case AArch64::VGRestorePseudo: {
  ...
}

Contributor Author

Expanding the pseudos has moved to AArch64FrameLowering.cpp. There is more common code here, so I have only added one function to expand both (emitVGSaveRestore). I can split this into an emitVGSave/emitVGRestore if you'd still prefer they be kept separate.

"Expected FrameIdx for VG");

const TargetSubtargetInfo &STI = MF.getSubtarget();
const TargetInstrInfo &TII = *STI.getInstrInfo();
Collaborator

nit: TII is already available as a member of AArch64ExpandPseudo, no need to get it again.

return false;

int64_t VGFrameIdx =
    LocallyStreaming ? AFI->getStreamingVGIdx() : AFI->getVGIdx();
Collaborator

It would be nice if we could keep all this knowledge within the FrameLowering without having information that we implicitly pass (through AFI) between different passes (in this case, PEI and PseudoExpansion).

PEI has a callback named processFunctionBeforeFrameIndicesReplaced. You could update the VGSave/RestorePseudo nodes to add the offset, so that the code here simply has to replace the pseudo by a CFI_INSTRUCTION. Or you could replace the pseudo in that callback itself.

@@ -8287,6 +8289,13 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,

SDValue InGlue;
if (RequiresSMChange) {

if (Subtarget->hasSVE()) {
Collaborator

I think we can only emit this pseudo when we have asynchronous unwind tables enabled. At the moment, when I compile some code with -fno-asynchronous-unwind-tables, it will still generate these directives. Perhaps this should have a diagnostic in Clang?

Collaborator

I'm not sure I follow what the issue is... I think we discussed before that we want debuggers to be able to unwind the stack even if a function is nounwind.

Contributor Author

when I compile some code with -fno-asynchronous-unwind-tables, it will still generate these directives. Perhaps this should have a diagnostic in Clang?

I'm happy to add such a diagnostic, but I want to make sure I have the reason for this requirement correct.
If I understand correctly, enabling asynchronous unwind tables is required now that I am emitting the save and restore of VG at the point of each call in the function which changes streaming-mode.

we want debuggers to be able to unwind the stack even if a function is nounwind.

Given this, I'm not entirely sure why we would expect to be able to unwind if -fno-asynchronous-unwind-tables was used or if the function has been marked as nounwind? I might have misunderstood something though!

Collaborator

There are two different kinds of DWARF "unwind info"; one is the kind that's in a loadable section, and used for EH. The other is in a debug info section, not loaded at runtime. If you specify -fasynchronous-unwind-tables, you get the former; if you specify -g -fno-asynchronous-unwind-tables, you get the latter.

If you request no debug info and no unwind tables, we shouldn't emit any DWARF directives.

But like I mentioned, I think we want to unconditionally emit the code to save the VG, whether or not we emit the corresponding DWARF directives.

Contributor Author

If we still emit the spill of VG with -fno-asynchronous-unwind-tables, I don't know how we would be able to recover the correct value without the per-call CFI saves and restores. And without the correct VG value, I don't think it will be possible to recover any VG based values in the stack frame.

I'm still not sure why this wouldn't require a diagnostic, because if there is not enough information to recover VG then I don't think we can unwind correctly?

Collaborator

If there's no DWARF unwind, nothing can unwind the stack whether or not we store the VG.

Collaborator

@efriedma-quic I think the issue is that when we emit unwind info that is not asynchronous, then the unwinder can't correctly unwind the stack because it would use the wrong value for VG to compute the offsets of callee-saves. So any unwind info that would be produced is broken.

Collaborator

Oh, right, there's also that dimension. I'm not sure I understand the interaction here, but if there's an issue, can we just force on "asynchronous" unwind info in that case? The point of non-async unwind info isn't that it's a different unwind format; it's just an optimization to reduce the size of the unwind info.

Contributor Author

I've made changes to needsAsyncDwarfUnwindInfo in AArch64MachineFunctionInfo.cpp to always return true if the function has streaming-mode changes. I believe this will ensure we emit the correct information when -fno-asynchronous-unwind-tables is specified.

@@ -8443,9 +8452,16 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,

if (RequiresSMChange) {
assert(PStateSM && "Expected a PStateSM to be set");

Collaborator

nit: unnecessary newline.

@@ -3768,6 +3768,12 @@ def err_conflicting_attributes_arm_state : Error<
  "conflicting attributes for state '%0'">;
def err_sme_streaming_cannot_be_multiversioned : Error<
  "streaming function cannot be multi-versioned">;
def err_sme_streaming_mode_change_no_sve : Error<
  "function requires a streaming-mode change, unwinding is not possible without 'sve'. "
  "Consider marking this function as 'noexcept' or '__attribute__((nothrow))'">;
Collaborator

It probably makes more sense to add a routine to compiler-rt that returns the value of VG if SVE is available, rather than emitting an error here. You can implement that function using the (already existing) interfaces for function-multiversioning to check if SVE is available at runtime. In that case, I think there is little value in having these Clang changes here.

@efriedma-quic are you happy going with that approach instead?

Collaborator

If we call new functions, they need to be part of the ABI. If you're happy to work with your ABI people to document the new interface, I guess it's not a problem.

Contributor Author

Documentation for this function was added in ARM-software/abi-aa#263 and #92921 adds the routine to compiler-rt.

F.getUWTableKind() == UWTableKind::Async &&
!F.hasMinSize();
NeedsAsyncDwarfUnwindInfo =
(needsDwarfUnwindInfo(MF) && F.getUWTableKind() == UWTableKind::Async &&
Collaborator

If we can't emit correct async unwind info for functions with outlining/homogeneous epilogues, does that mean we also need to disable outlining/homogeneous epilogues for functions with streaming mode changes?

Collaborator

Oh, also, this should probably be NeedsAsyncDwarfUnwindInfo = needsDwarfUnwindInfo(MF) && ((F.getUWTableKind() == UWTableKind::Async && !F.hasMinSize()) || AFI->hasStreamingModeChanges());.

Contributor Author

This PR does include a change to AArch64FrameLowering::homogeneousPrologEpilog which disables homogeneous epilogues if the function has streaming-mode changes.

I hadn't considered outlining, but I can see that when considering candidates we must be able to outline all CFI instructions in the function. Am I correct in thinking that this is the reason we would need to disable outlining when there are streaming-mode changes which require async unwind info?

Collaborator

I'm not sure exactly what the issues are with outlining at this point, but the last time async-unwind was looked at, there apparently were issues.

Contributor Author

I think outlining from functions with streaming-mode changes needs more investigation. I don't think this is just a concern for async unwind; I noticed that when passing -enable-machine-outliner to sme-vg-to-stack.ll that some calls are outlined with only one of the smstart/smstop instructions surrounding the call. I'm not sure if this is safe yet, so for now I've disabled outlining for these functions in isFunctionSafeToOutlineFrom.

@@ -214,7 +232,8 @@ declare double @za_shared_callee(double) "aarch64_inout_za"
define double @za_new_caller_to_za_shared_callee(double %x) nounwind noinline optnone "aarch64_new_za"{
; CHECK-COMMON-LABEL: za_new_caller_to_za_shared_callee:
; CHECK-COMMON: // %bb.0: // %prelude
; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
; CHECK-COMMON-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
Collaborator

I feel like I must have asked about this at some point, but where is the x19 spill coming from?

Contributor Author

The base pointer x19 is added to the list of saved registers in determineCalleeSaves if the function is in streaming mode or has SVE. The spill was introduced here when I enabled SVE for this test.

Collaborator

Oh, right, it's not directly connected to this patch, it's just because you're changing the RUN line.

Please file a bug for this.

Contributor Author

I've taken a closer look at this and I think the changes in @za_new_caller_to_za_shared_callee are correct. Because it's an aarch64_new_za function, it has to set up a lazy-save buffer on entry and a variable sized object is allocated on the stack for this. The function is not streaming, so x19 will only be spilled if the function has SVE.

The buffer is not used however, and #81648 was created to remove the lazy-save from tests like this where it is not required.

Collaborator

@efriedma-quic left a comment

LGTM

If a function requires any streaming-mode change, the vector granule
value must be stored to the stack and unwind info must also describe the
save of VG to this location.

This patch adds VG to the list of callee-saved registers and increases the
callee-saved stack size in determineCalleeSaves if the function requires
streaming-mode changes. A new type is added to RegPairInfo for VG, which is
also used to skip restoring the register in the restore block.

See https://github.com/ARM-software/abi-aa/blob/main/aadwarf64/aadwarf64.rst
- Added a test to sme-vg-to-stack.ll with the 'nounwind' attribute
…nges:

 - Emit both the streaming and non-streaming value of VG in the prologue of
   functions with the aarch64_pstate_sm_body attribute.

 - Added the VGUnwindInfoPseudo node which expands to either .cfi_restore or a
   .cfi_offset depending on the value of the immediate used (0 or 1 respectively).

 - VGUnwindInfoPseudo nodes are emitted with the smstop/smstart pair around
   calls to streaming-mode functions from a locally-streaming caller. The
   .cfi_offset will save the streaming-VG value, whilst the restore sets the rule
   for VG to the same as it was at the beginning of the function (non-streaming).

 - The frame index used for the streaming VG value is saved in AArch64FunctionInfo
   so that it can be used to calculate the offset when expanding the pseudo.

 - Added the @vg_locally_streaming_fn() test to sme-vg-to-stack.ll
  spilled if there are streaming mode changes in the function.

- Added requiresVGSpill() to AArch64MachineFunctionInfo which returns
  true if the function has streaming mode changes and hasSVE is true.
  With this change, we will no longer spill VG at the beginning of
  functions if the target does not also have SVE.

- Removed SpilledStreamingVG flag from spillCalleeSavedRegisters.

- Rebased to include recent changes to the changeStreamingMode interface.
…function.

- Change emitCalleeSavedGPRLocations to only emit the non-streaming location
  of VG in the prologue for locally-streaming functions.

- Move the .cfi_offset directive before the smstart/smstop.

- Added streaming-compatible tests.
…possible

  because of streaming-mode changes without SVE available.

- Fixed incorrect labels in sme-vg-to-stack.ll
- Rebased after new SME warnings were added to SemaChecking.cpp in main.
- Check for noexcept or nothrow when emitting Clang errors.
…IndicesReplaced

  and removed handling from AArch64ExpandPseudoInsts.

- Removed diagnostics from Clang for unwinding without +sve.

- Removed hasSVE() check when emitting pseudos around calls in AArch64ISelLowering.

- Emit a call to __arm_get_current_vg from spillCalleeSavedRegisters if
  HasSVE is false & preserve X0 around the call if live.

- Updated LLVM tests with streaming-mode changes to also pass +sve.
Collaborator

@sdesmalen-arm left a comment

Some very minor nits, but overall LGTM.

bool NeedsWinCFI = needsWinCFI(MF);
bool HasSVE = MF.getSubtarget<AArch64Subtarget>().hasSVE();
Collaborator

nit: this only has one use, which seems quite far away from the definition. Perhaps just inline it?

Comment on lines 3162 to 3202
} else {
  if (HasSVE)
    BuildMI(MBB, MI, DL, TII.get(AArch64::CNTD_XPiI), Reg1)
        .addImm(31)
        .addImm(1)
        .setMIFlag(MachineInstr::FrameSetup);
  else {
    const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
    for (const auto &LiveIn : MBB.liveins())
      if (STI.getRegisterInfo()->isSuperOrSubRegisterEq(AArch64::X0,
                                                        LiveIn.PhysReg))
        X0Scratch = Reg1;

    if (X0Scratch != AArch64::NoRegister)
      BuildMI(MBB, MI, DL, TII.get(AArch64::ORRXrr), Reg1)
          .addReg(AArch64::XZR)
          .addReg(AArch64::X0, RegState::Undef)
          .addReg(AArch64::X0, RegState::Implicit)
          .setMIFlag(MachineInstr::FrameSetup);

    const uint32_t *RegMask = TRI->getCallPreservedMask(
        MF, CallingConv::
                AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1);
    BuildMI(MBB, MI, DL, TII.get(AArch64::BL))
        .addExternalSymbol("__arm_get_current_vg")
        .addRegMask(RegMask)
        .addReg(AArch64::X0, RegState::ImplicitDefine)
        .setMIFlag(MachineInstr::FrameSetup);
    Reg1 = AArch64::X0;
  }
  AFI->setVGIdx(RPI.FrameIdx);
}
Collaborator

nit: put AFI->setVGIdx(RPI.FrameIdx); in both the if(HasSVE) and the else branch, and remove a level of indentation?

Suggested change
} else if (HasSVE) {
  BuildMI(MBB, MI, DL, TII.get(AArch64::CNTD_XPiI), Reg1)
      .addImm(31)
      .addImm(1)
      .setMIFlag(MachineInstr::FrameSetup);
  AFI->setVGIdx(RPI.FrameIdx);
} else {
  const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
  for (const auto &LiveIn : MBB.liveins())
    if (STI.getRegisterInfo()->isSuperOrSubRegisterEq(AArch64::X0,
                                                      LiveIn.PhysReg))
      X0Scratch = Reg1;
  if (X0Scratch != AArch64::NoRegister)
    BuildMI(MBB, MI, DL, TII.get(AArch64::ORRXrr), Reg1)
        .addReg(AArch64::XZR)
        .addReg(AArch64::X0, RegState::Undef)
        .addReg(AArch64::X0, RegState::Implicit)
        .setMIFlag(MachineInstr::FrameSetup);
  const uint32_t *RegMask = TRI->getCallPreservedMask(
      MF, CallingConv::
              AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1);
  BuildMI(MBB, MI, DL, TII.get(AArch64::BL))
      .addExternalSymbol("__arm_get_current_vg")
      .addRegMask(RegMask)
      .addReg(AArch64::X0, RegState::ImplicitDefine)
      .setMIFlag(MachineInstr::FrameSetup);
  Reg1 = AArch64::X0;
  AFI->setVGIdx(RPI.FrameIdx);
}

Comment on lines 3170 to 3173
for (const auto &LiveIn : MBB.liveins())
  if (STI.getRegisterInfo()->isSuperOrSubRegisterEq(AArch64::X0,
                                                    LiveIn.PhysReg))
    X0Scratch = Reg1;
Collaborator

Is this missing a break?

If so, you can write this using any_of, e.g.:

if (llvm::any_of(MBB.liveins(), [&STI](const RegisterMaskPair &LiveIn) {
      return STI.getRegisterInfo()->isSuperOrSubRegisterEq(AArch64::X0,
                                                           LiveIn.PhysReg);
    }))
  X0Scratch = Reg1;

  - Removed HasSVE
  - Use any_of to check if X0 is a livein
  - Removed level of indentation from block handling VG spill
@kmclaughlin-arm kmclaughlin-arm merged commit 93c8e0f into llvm:main Jun 13, 2024
5 of 6 checks passed
EthanLuisMcDonough pushed a commit to EthanLuisMcDonough/llvm-project that referenced this pull request Aug 13, 2024
…lvm#83301)

If a function requires any streaming-mode change, the vector granule
value must be stored to the stack and unwind info must also describe the
save of VG to this location.

This patch adds VG to the list of callee-saved registers and increases the
callee-saved stack size if the function requires streaming-mode changes.
A new type is added to RegPairInfo, which is also used to skip restoring
the register used to spill the VG value in the epilogue.

See
https://github.com/ARM-software/abi-aa/blob/main/aadwarf64/aadwarf64.rst
Labels
backend:AArch64, clang:frontend, clang