
[LLVM][AArch64] Use load/store with consecutive registers in SME2 or SVE2.1 for spill/fill #77665

Merged: 16 commits into llvm:main on May 17, 2024

Conversation

CarolineConcatto
Contributor

…VE2.1 for spill/fill

When possible, the spill/fill of registers in Frame Lowering uses the ld/st consecutive-pair instructions available in SME2 or SVE2.1.

@llvmbot
Collaborator

llvmbot commented Jan 10, 2024

@llvm/pr-subscribers-backend-aarch64

Author: None (CarolineConcatto)

Changes

…VE2.1 for spill/fill

When possible, the spill/fill of registers in Frame Lowering uses the ld/st consecutive-pair instructions available in SME2 or SVE2.1.


Patch is 302.50 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/77665.diff

5 Files Affected:

  • (modified) llvm/lib/Target/AArch64/AArch64FrameLowering.cpp (+78-22)
  • (modified) llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h (+2)
  • (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll (+528-928)
  • (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll (+528-928)
  • (added) llvm/test/CodeGen/AArch64/sve2p1-intrinsics-ldst-pair.ll (+295)
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index caab59201a8d69..7a332bfd18c21b 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -1480,6 +1480,11 @@ static bool IsSVECalleeSave(MachineBasicBlock::iterator I) {
   switch (I->getOpcode()) {
   default:
     return false;
+  case AArch64::PTRUE_C_B:
+  case AArch64::LD1B_2Z_IMM:
+  case AArch64::ST1B_2Z_IMM:
+    return I->getMF()->getSubtarget<AArch64Subtarget>().hasSVE2p1() ||
+           I->getMF()->getSubtarget<AArch64Subtarget>().hasSME2();
   case AArch64::STR_ZXI:
   case AArch64::STR_PXI:
   case AArch64::LDR_ZXI:
@@ -2753,6 +2758,16 @@ struct RegPairInfo {
 
 } // end anonymous namespace
 
+unsigned findFreePredicateAsCounterReg(MachineFunction &MF) {
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  for (MCRegister PReg :
+       {AArch64::PN8, AArch64::PN9, AArch64::PN10, AArch64::PN11, AArch64::PN12,
+        AArch64::PN13, AArch64::PN14, AArch64::PN15}) {
+    if (!MRI.isReserved(PReg))
+      return PReg;
+  }
+  llvm_unreachable("cannot find a free predicate");
+}
 static void computeCalleeSaveRegisterPairs(
     MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI,
     const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs,
@@ -2763,6 +2778,7 @@ static void computeCalleeSaveRegisterPairs(
 
   bool IsWindows = isTargetWindows(MF);
   bool NeedsWinCFI = needsWinCFI(MF);
+  const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
   MachineFrameInfo &MFI = MF.getFrameInfo();
   CallingConv::ID CC = MF.getFunction().getCallingConv();
@@ -2831,7 +2847,11 @@ static void computeCalleeSaveRegisterPairs(
           RPI.Reg2 = NextReg;
         break;
       case RegPairInfo::PPR:
+        break;
       case RegPairInfo::ZPR:
+        if (Subtarget.hasSVE2p1() || Subtarget.hasSME2())
+          if (((RPI.Reg1 - AArch64::Z0) & 1) == 0 && (NextReg == RPI.Reg1 + 1))
+            RPI.Reg2 = NextReg;
         break;
       }
     }
@@ -2876,7 +2896,7 @@ static void computeCalleeSaveRegisterPairs(
     assert(OffsetPre % Scale == 0);
 
     if (RPI.isScalable())
-      ScalableByteOffset += StackFillDir * Scale;
+      ScalableByteOffset += StackFillDir * (RPI.isPaired() ? 2 * Scale : Scale);
     else
       ByteOffset += StackFillDir * (RPI.isPaired() ? 2 * Scale : Scale);
 
@@ -2887,9 +2907,6 @@ static void computeCalleeSaveRegisterPairs(
          (IsWindows && RPI.Reg2 == AArch64::LR)))
       ByteOffset += StackFillDir * 8;
 
-    assert(!(RPI.isScalable() && RPI.isPaired()) &&
-           "Paired spill/fill instructions don't exist for SVE vectors");
-
     // Round up size of non-pair to pair size if we need to pad the
     // callee-save area to ensure 16-byte alignment.
     if (NeedGapToAlignStack && !NeedsWinCFI &&
@@ -2976,6 +2993,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
     }
     return true;
   }
+  bool PtrueCreated = false;
   for (const RegPairInfo &RPI : llvm::reverse(RegPairs)) {
     unsigned Reg1 = RPI.Reg1;
     unsigned Reg2 = RPI.Reg2;
@@ -3010,10 +3028,10 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
        Alignment = Align(16);
        break;
     case RegPairInfo::ZPR:
-       StrOpc = AArch64::STR_ZXI;
-       Size = 16;
-       Alignment = Align(16);
-       break;
+      StrOpc = RPI.isPaired() ? AArch64::ST1B_2Z_IMM : AArch64::STR_ZXI;
+      Size = 16;
+      Alignment = Align(16);
+      break;
     case RegPairInfo::PPR:
        StrOpc = AArch64::STR_PXI;
        Size = 2;
@@ -3037,19 +3055,37 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
       std::swap(Reg1, Reg2);
       std::swap(FrameIdxReg1, FrameIdxReg2);
     }
+
+    unsigned PnReg;
+    unsigned PairRegs;
+    if (RPI.isPaired() && RPI.isScalable()) {
+      PnReg = findFreePredicateAsCounterReg(MF);
+      PairRegs = AArch64::Z0_Z1 + (RPI.Reg1 - AArch64::Z0);
+      if (!PtrueCreated) {
+        PtrueCreated = true;
+        BuildMI(MBB, MI, DL, TII.get(AArch64::PTRUE_C_B), PnReg)
+            .setMIFlags(MachineInstr::FrameDestroy);
+      }
+    }
     MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
     if (!MRI.isReserved(Reg1))
       MBB.addLiveIn(Reg1);
     if (RPI.isPaired()) {
       if (!MRI.isReserved(Reg2))
         MBB.addLiveIn(Reg2);
-      MIB.addReg(Reg2, getPrologueDeath(MF, Reg2));
+      if (RPI.isScalable())
+        MIB.addReg(PairRegs);
+      else
+        MIB.addReg(Reg2, getPrologueDeath(MF, Reg2));
       MIB.addMemOperand(MF.getMachineMemOperand(
           MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
           MachineMemOperand::MOStore, Size, Alignment));
     }
-    MIB.addReg(Reg1, getPrologueDeath(MF, Reg1))
-        .addReg(AArch64::SP)
+    if (RPI.isPaired() && RPI.isScalable())
+      MIB.addReg(PnReg);
+    else
+      MIB.addReg(Reg1, getPrologueDeath(MF, Reg1));
+    MIB.addReg(AArch64::SP)
         .addImm(RPI.Offset) // [sp, #offset*scale],
                             // where factor*scale is implicit
         .setMIFlag(MachineInstr::FrameSetup);
@@ -3061,8 +3097,11 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
 
     // Update the StackIDs of the SVE stack slots.
     MachineFrameInfo &MFI = MF.getFrameInfo();
-    if (RPI.Type == RegPairInfo::ZPR || RPI.Type == RegPairInfo::PPR)
-      MFI.setStackID(RPI.FrameIdx, TargetStackID::ScalableVector);
+    if (RPI.Type == RegPairInfo::ZPR || RPI.Type == RegPairInfo::PPR) {
+      MFI.setStackID(FrameIdxReg1, TargetStackID::ScalableVector);
+      if (RPI.isPaired())
+         MFI.setStackID(FrameIdxReg2, TargetStackID::ScalableVector);
+    }
 
   }
   return true;
@@ -3082,7 +3121,8 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
 
   computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, hasFP(MF));
 
-  auto EmitMI = [&](const RegPairInfo &RPI) -> MachineBasicBlock::iterator {
+  auto EmitMI = [&](const RegPairInfo &RPI,
+                    bool *PtrueCreated) -> MachineBasicBlock::iterator {
     unsigned Reg1 = RPI.Reg1;
     unsigned Reg2 = RPI.Reg2;
 
@@ -3114,7 +3154,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
        Alignment = Align(16);
        break;
     case RegPairInfo::ZPR:
-       LdrOpc = AArch64::LDR_ZXI;
+       LdrOpc = RPI.isPaired() ? AArch64::LD1B_2Z_IMM : AArch64::LDR_ZXI;
        Size = 16;
        Alignment = Align(16);
        break;
@@ -3139,15 +3179,31 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
       std::swap(Reg1, Reg2);
       std::swap(FrameIdxReg1, FrameIdxReg2);
     }
+
+    unsigned PnReg;
+    unsigned PairRegs;
+    if (RPI.isPaired() && RPI.isScalable()) {
+      PnReg = findFreePredicateAsCounterReg(MF);
+      PairRegs = AArch64::Z0_Z1 + (RPI.Reg1 - AArch64::Z0);
+      if (!*PtrueCreated) {
+        *PtrueCreated = true;
+        BuildMI(MBB, MBBI, DL, TII.get(AArch64::PTRUE_C_B), PnReg)
+            .setMIFlags(MachineInstr::FrameDestroy);
+      }
+    }
+
     MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII.get(LdrOpc));
     if (RPI.isPaired()) {
-      MIB.addReg(Reg2, getDefRegState(true));
+      MIB.addReg(RPI.isScalable() ? PairRegs : Reg2, getDefRegState(true));
       MIB.addMemOperand(MF.getMachineMemOperand(
           MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
           MachineMemOperand::MOLoad, Size, Alignment));
     }
-    MIB.addReg(Reg1, getDefRegState(true))
-        .addReg(AArch64::SP)
+    if (RPI.isPaired() && RPI.isScalable())
+      MIB.addReg(PnReg);
+    else
+      MIB.addReg(Reg1, getDefRegState(true));
+    MIB.addReg(AArch64::SP)
         .addImm(RPI.Offset) // [sp, #offset*scale]
                             // where factor*scale is implicit
         .setMIFlag(MachineInstr::FrameDestroy);
@@ -3161,9 +3217,10 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
   };
 
   // SVE objects are always restored in reverse order.
+  bool PtrueCreated = false;
   for (const RegPairInfo &RPI : reverse(RegPairs))
     if (RPI.isScalable())
-      EmitMI(RPI);
+      EmitMI(RPI, &PtrueCreated);
 
   if (homogeneousPrologEpilog(MF, &MBB)) {
     auto MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::HOM_Epilog))
@@ -3174,13 +3231,12 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
     }
     return true;
   }
-
   if (ReverseCSRRestoreSeq) {
     MachineBasicBlock::iterator First = MBB.end();
     for (const RegPairInfo &RPI : reverse(RegPairs)) {
       if (RPI.isScalable())
         continue;
-      MachineBasicBlock::iterator It = EmitMI(RPI);
+      MachineBasicBlock::iterator It = EmitMI(RPI, &PtrueCreated);
       if (First == MBB.end())
         First = It;
     }
@@ -3190,7 +3246,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
     for (const RegPairInfo &RPI : RegPairs) {
       if (RPI.isScalable())
         continue;
-      (void)EmitMI(RPI);
+      (void)EmitMI(RPI, &PtrueCreated);
     }
   }
 
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index cd4a18bfbc23a8..b44cc8d0d0dc9b 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -300,6 +300,8 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
         int FrameIdx = Info.getFrameIdx();
         if (MFI.getStackID(FrameIdx) != TargetStackID::Default)
           continue;
+        if (MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector)
+          continue;
         int64_t Offset = MFI.getObjectOffset(FrameIdx);
         int64_t ObjSize = MFI.getObjectSize(FrameIdx);
         MinOffset = std::min<int64_t>(Offset, MinOffset);
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
index b7119fc0825673..6c94546c9525aa 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
@@ -55,45 +55,31 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale x
 ; STRIDED-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; STRIDED-NEXT:    addvl sp, sp, #-17
 ; STRIDED-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT:    ptrue pn8.b
+; STRIDED-NEXT:    st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT:    st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT:    st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT:    st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT:    st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT:    st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT:    st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT:    st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
 ; STRIDED-NEXT:    mov p8.b, p0.b
-; STRIDED-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    ld1b { z0.b, z8.b }, pn8/z, [x0]
 ; STRIDED-NEXT:    //APP
 ; STRIDED-NEXT:    nop
 ; STRIDED-NEXT:    //NO_APP
 ; STRIDED-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT:    ptrue pn8.b
+; STRIDED-NEXT:    ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT:    ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT:    ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
 ; STRIDED-NEXT:    mov z1.d, z8.d
-; STRIDED-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT:    ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT:    ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT:    ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT:    ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT:    ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT:    ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT:    ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT:    ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT:    ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT:    ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
 ; STRIDED-NEXT:    addvl sp, sp, #17
 ; STRIDED-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; STRIDED-NEXT:    ret
@@ -103,20 +89,14 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale x
 ; CONTIGUOUS-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #-16
 ; CONTIGUOUS-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT:    str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT:    str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT:    str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT:    str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT:    str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT:    str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT:    str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT:    str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT:    str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT:    str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT:    ptrue pn8.b
+; CONTIGUOUS-NEXT:    st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT:    st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT:    st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT:    st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT:    st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT:    st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT:    st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
 ; CONTIGUOUS-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #-2
 ; CONTIGUOUS-NEXT:    mov p8.b, p0.b
@@ -130,21 +110,15 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale x
 ; CONTIGUOUS-NEXT:    ldr z1, [sp, #1, mul vl]
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #2
 ; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; CONTIGUOUS-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT:    ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT:    ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT:    ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT:    ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT:    ptrue pn8.b
+; CONTIGUOUS-NEXT:    ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT:    ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT:    ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT:    ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT:    ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT:    ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT:    ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #16
 ; CONTIGUOUS-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ret
@@ -210,45 +184,31 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused, <v
 ; STRIDED-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; STRIDED-NEXT:    addvl sp, sp, #-17
 ; STRIDED-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT:    ptrue pn8.b
+; STRIDED-NEXT:    st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT:    st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT:    st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT:    st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT:    st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT:    st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT:    st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT:    st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
 ; STRIDED-NEXT:    mov p8.b, p0.b
-; STRIDED-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STR...
[truncated]


github-actions bot commented Jan 10, 2024

⚠️ C/C++ code formatter, clang-format found issues in your code. ⚠️

You can test this locally with the following command:
git-clang-format --diff 99be3875fb161a5786aaad5dab0b92fa052e47d1 c636d7dac943879cb4b60dec2a56f4e00efb4137 -- llvm/lib/Target/AArch64/AArch64FrameLowering.cpp llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
View the diff from clang-format here.
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 48a781b64de..1f2b7149b81 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -3222,10 +3222,10 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
        Alignment = Align(16);
        break;
     case RegPairInfo::ZPR:
-       LdrOpc = RPI.isPaired() ? AArch64::LD1B_2Z_IMM : AArch64::LDR_ZXI;
-       Size = 16;
-       Alignment = Align(16);
-       break;
+      LdrOpc = RPI.isPaired() ? AArch64::LD1B_2Z_IMM : AArch64::LDR_ZXI;
+      Size = 16;
+      Alignment = Align(16);
+      break;
     case RegPairInfo::PPR:
        LdrOpc = AArch64::LDR_PXI;
        Size = 2;

CarolineConcatto added a commit that referenced this pull request Feb 19, 2024
This is needed by PR#77665[1] that uses a P-register while restoring
Z-registers.

The reversed order for the SVE register restore in the epilogue was added to
guarantee performance, but further work has since improved the SVE frame
restore, and the scheduler may also change the order of the restores,
undoing the reversal.

[1]#77665
…VE2.1 for spill/fill

When possible, the spill/fill of registers in Frame Lowering uses the ld/st
consecutive-pair instructions available in SME2 or SVE2.1.

…VE2.1 for spill/fill

When possible, the spill/fill of registers in Frame Lowering uses the ld/st
consecutive-pair instructions available in SME2 or SVE2.1.
CarolineConcatto added a commit to CarolineConcatto/llvm-project that referenced this pull request Feb 21, 2024
This is needed by PR#77665[1] that uses a P-register while restoring
Z-registers.

The reversed order for the SVE register restore in the epilogue was added to
guarantee performance, but further work has since improved the SVE frame
restore, and the scheduler may also change the order of the restores,
undoing the reversal.

This also fixes the problem reported on Windows with std::reverse and .base().

[1]llvm#77665
CarolineConcatto added a commit that referenced this pull request Feb 22, 2024
This is needed by PR#77665[1] that uses a P-register while restoring
Z-registers.

The reversed order for the SVE register restore in the epilogue was added to
guarantee performance, but further work has since improved the SVE frame
restore, and the scheduler may also change the order of the restores,
undoing the reversal.

This also fixes the problem reported in PR #79623 on Windows with
std::reverse and .base().

[1]#77665
PtrueCreated = true;
// Any one of predicate-as-count will be free to use
// This can be replaced in the future if needed
PnReg = AArch64::PN8;
Collaborator

It's not correct to blindly pick PN8 (P8) here. You can only clobber P8 if it is preserved by the preceding predicate callee-saves.

i.e.

define void @test_clobbers_3_z_regs(<vscale x 16 x i8> %v) {
  call void asm sideeffect "", "~{z8},~{z9}"()
  ret void
}

results in:

        str     x29, [sp, #-16]!
        addvl   sp, sp, #-2
        ptrue   pn8.b       ; pn8 is not preserved by this function, even though the AAPCS says it should be.
        st1b    { z8.b, z9.b }, pn8, [sp]
        ld1b    { z8.b, z9.b }, pn8/z, [sp]
        addvl   sp, sp, #2
        ldr     x29, [sp], #16
        ret

One thing you could do is try to see if one of the argument registers is available (p0 - p3), so that you can reuse one of those. Alternatively, you could mark p8 as clobbered by the function so that the preceding callee-save spills will include p8.

Collaborator

It's not correct to blindly pick PN8 (P8) here. You can only clobber P8 if it is preserved by the preceding predicate callee-saves.

Good point. I guess I misread the AAPCS when I suggested to just pick an arbitrary register as scratch.

Alternatively, you could mark p8 as clobbered by the function so that the preceding callee-save spills will include p8.

I would prefer this solution.

Contributor Author

The P8 register is added to the list of SavedRegs now.

Collaborator

I'm a little concerned with just blindly picking pn8 here, because this may not match the given calling convention (if someone would choose to use a different one from the standard SVE calling convention), defined in AArch64CallingConvention.td.

Can you create a function that finds a suitable caller-saved register instead?

llvm/lib/Target/AArch64/AArch64FrameLowering.cpp Outdated Show resolved Hide resolved
MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
if (!MRI.isReserved(Reg1))
  MBB.addLiveIn(Reg1);
if (RPI.isPaired()) {
  if (!MRI.isReserved(Reg2))
    MBB.addLiveIn(Reg2);
  MIB.addReg(Reg2, getPrologueDeath(MF, Reg2));
  if (RPI.isScalable())
    MIB.addReg(PairRegs);
Collaborator

Should PairRegs also use getPrologueDeath?

Contributor Author

I am not sure. I can see in getPrologueDeath that it kills the reg if it is not live-in.
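For reference, the helper being discussed is roughly shaped like the sketch below (paraphrased from memory of AArch64FrameLowering.cpp rather than quoted from this patch): the register operand gets a kill flag only when the register is not also a live-in of the function.

static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) {
  // Do not set a kill flag on values that are also marked as live-in; only
  // registers that are not live into the function are marked as killed by
  // the spill.
  bool LiveIn = MF.getRegInfo().isLiveIn(Reg);
  return getKillRegState(!LiveIn);
}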

PtrueCreated = true;
// Any one of predicate-as-count will be free to use
// This can be replaced in the future if needed
PnReg = AArch64::PN8;
Collaborator

I'm a little concerned with just blindly picking pn8 here, because this may not match the given calling convention (if someone would choose to use a different one from the standard SVE calling convention), defined in AArch64CallingConvention.td.

Can you create a function that finds a suitable caller-saved register instead?

@momchil-velikov
Collaborator

I think it's converging, but how about squashing all the commits, if other reviewers don't mind? I know the advice is to do fixup commits, but there are too many now and it has become hard to see the final picture.

unsigned PairRegs = AArch64::Z0_Z1 + (RPI.Reg1 - AArch64::Z0);
unsigned PnReg;
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
unsigned PnReg = AFI->getPredicateRegForFillSpill();
Collaborator

This looks good, the following are more like style remarks:

  • PnReg can be declared outside the loop. Then you can initialise it under if (!PtrueCreated) ..., which seems a more logical place to do it (see the sketch below).
  • PairRegs has only one use, and it's on the following line, so you don't save anything by naming the corresponding expression.
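A minimal sketch of the first remark, based on the hunks shown earlier in this patch (illustration only, not the final code): PnReg is declared once, outside the per-pair emission, and is only initialised where the PTRUE is created.

unsigned PnReg = 0;
bool PtrueCreated = false;
// ... later, for each scalable register pair being spilled/filled:
if (RPI.isPaired() && RPI.isScalable() && !PtrueCreated) {
  PtrueCreated = true;
  PnReg = AFI->getPredicateRegForFillSpill();
  BuildMI(MBB, MBBI, DL, TII.get(AArch64::PTRUE_C_B), PnReg)
      .setMIFlags(MachineInstr::FrameDestroy);
}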

AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
if (Subtarget.hasSVE2p1() || Subtarget.hasSME2()) {
if (AArch64::PPRRegClass.contains(Reg) &&
(Reg > AArch64::P8 || Reg < AArch64::P15) && SavedRegs.test(Reg) &&
Collaborator

Looks like that should be Reg >= AArch64::P8 && Reg <= AArch64::P15

@momchil-velikov
Collaborator

Can you create a function that finds a suitable caller-saved register instead?

But there are no caller-saved predicate registers, are there? p0-p3 are argument registers and p4-p15 are callee-saved and we can only use pn8-pn15 anyway.

@sdesmalen-arm
Collaborator

sdesmalen-arm commented Apr 24, 2024

Can you create a function that finds a suitable caller-saved register instead?

But there are no caller-saved predicate registers, are there? p0-p3 are argument registers and p4-p15 are callee-saved and we can only use pn8-pn15 anyway.

Doesn't that suggest you can use p0-p3 if they are not used for passing arguments? (and if so, the caller will preserve them around the call, e.g. https://godbolt.org/z/1jM547WP8)

@sdesmalen-arm
Collaborator

I guess my point more widely was that when choosing a different calling convention (other than the one we currently implement), the choice of PN8 may not be right.

@sdesmalen-arm
Collaborator

I guess my point more widely was that when choosing a different calling convention (other than the one we currently implement), the choice of PN8 may not be right.

It seems that PTRUE (predicate-as-counter) can only use PN8-PN15, so I guess we can't use any of the other registers. So the choice of always using PN8 for this purpose is fine, although depending on the calling convention it may result in an extra callee-save spill/fill. Perhaps I'm just overcomplicating it, but it feels a bit wrong to hard-code a register like that when we have TableGen support for many different calling conventions.

It would probably be useful to add an assert to make sure that PN8/P8 is not a reserved register though.

@momchil-velikov
Collaborator

I guess my point more widely was that when choosing a different calling convention (other than the one we currently implement), the choice of PN8 may not be right.

All right, how about this strategy:

  • don't optimise for hypothetical calling conventions

  • in determineCalleeSaves, make sure we choose an already-saved register, or, if there is none, choose pn8 if the calling convention of the function is one where we know pn8 is a callee-saved register (a rough sketch follows after this list):

    • AArch64_SVE_VectorCall
    • AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0
    • AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2

    Not a correctness issue if we miss some.

  • everywhere else we use only AFI->getPredicateRegForFillSpill() != 0 as the condition determining we can and should load/store in pairs
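A rough sketch of the second bullet, as it might look in AArch64FrameLowering::determineCalleeSaves (HasPairZReg is the placeholder name used in a later suggestion in this review; the "reuse an already-saved p8-p15 register" branch is elided; this illustrates the strategy, not the final patch):

if (HasPairZReg && (Subtarget.hasSVE2p1() || Subtarget.hasSME2())) {
  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
  CallingConv::ID CC = MF.getFunction().getCallingConv();
  // pn8 is known to be callee-saved under these calling conventions, so it is
  // safe to pick it; force p8 into the save list so the PTRUE that defines
  // pn8 does not clobber a value the caller expects to be preserved.
  if (CC == CallingConv::AArch64_SVE_VectorCall ||
      CC == CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0 ||
      CC == CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2) {
    SavedRegs.set(AArch64::P8);
    AFI->setPredicateRegForFillSpill(AArch64::PN8);
  }
}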

@sdesmalen-arm
Collaborator

I guess my point more widely was that when choosing a different calling convention (other than the one we currently implement), the choice of PN8 may not be right.

All right, how about this strategy:

  • don't optimise for hypothetical calling conventions

  • in determineCalleeSaves, make sure we choose an already-saved register, or, if there is none, choose pn8 if the calling convention of the function is one where we know pn8 is a callee-saved register:

    • AArch64_SVE_VectorCall
    • AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0
    • AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2

    Not a correctness issue if we miss some.

  • everywhere else we use only AFI->getPredicateRegForFillSpill() != 0 as the condition determining we can and should load/store in pairs

Sounds good. The function would otherwise be free to choose any register that is not used as an argument/return register and is not a reserved register, but I'm happy fixing it to pn8 with the checks you outlined above. Just in case, please add an assert that PN8 is not also a reserved register.

I'd also suggest wrapping the AFI->getPredicateRegForFillSpill() != 0 check in a function like canUseMultiVectorLoadStore (feel free to pick a different name) that combines it with && (Subtarget.hasSVE2p1() || Subtarget.hasSME2()), to keep the logic in other places a bit simpler.
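A minimal sketch of that suggested wrapper, assuming it sits next to the other static helpers in AArch64FrameLowering.cpp (the name and the combined check are the suggestion above, not necessarily what was committed):

static bool canUseMultiVectorLoadStore(const AArch64Subtarget &Subtarget,
                                       const AArch64FunctionInfo *AFI) {
  // Paired ST1B/LD1B spills need both the instructions (SVE2.1 or SME2) and a
  // predicate-as-counter register reserved for the fill/spill.
  return (Subtarget.hasSVE2p1() || Subtarget.hasSME2()) &&
         AFI->getPredicateRegForFillSpill() != 0;
}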

@momchil-velikov
Collaborator

I'd also suggest wrapping the AFI->getPredicateRegForFillSpill() != 0 in a function like canUseMultiVectorLoadStore (feel free to pick a different name) that combines it with && (Subtarget.hasSVE2p1() || Subtarget.hasSME2()), to keep the logic in other places a bit simpler.

It should not need to combine it, since !(Subtarget.hasSVE2p1() || Subtarget.hasSME2()) must imply AFI->getPredicateRegForFillSpill() == 0, i.e. we don't go through the trouble of choosing a register if there's no chance for using it.

Otherwise, yes

  • everywhere else we use only AFI->getPredicateRegForFillSpill() != 0 as the condition determining we can and should load/store in pairs

@CarolineConcatto
Contributor Author

I have addressed the comments about the calling conventions and added tests for it.

Collaborator

@sdesmalen-arm left a comment

Hi @CarolineConcatto, thanks for making the changes. The logic looks pretty good to me. I merely left some nits, stylistic comments and requests for asserts, but am happy with the changes otherwise.

@@ -1508,6 +1508,9 @@ static bool IsSVECalleeSave(MachineBasicBlock::iterator I) {
  switch (I->getOpcode()) {
  default:
    return false;
  case AArch64::PTRUE_C_B:
  case AArch64::LD1B_2Z_IMM:
  case AArch64::ST1B_2Z_IMM:
Collaborator

As future work, I wonder if we can extend this further to use the quad variants of these instructions as well.

AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
if (Subtarget.hasSVE2p1() || Subtarget.hasSME2()) {
if (AArch64::PPRRegClass.contains(Reg) &&
(Reg >= AArch64::P8 && Reg <= AArch64::P15) && SavedRegs.test(Reg) &&
Collaborator

When choosing a callee-saved register, there is the assumption that in the prologue this register will already be spilled before overwriting it with a new ptrue, and in the epilogue that it will be filled after defining it with a ptrue. This is dependent on the order in which the registers are specified in the AArch64CallingConvention.td file and the order in which they are iterated. To avoid this ever silently doing the wrong thing, can you add some asserts in restoreCalleeSavedRegisters and spillCalleeSavedRegisters to guard that?

Collaborator

I can't see where you've addressed this. Did you miss this comment?

AFI->setPredicateRegForFillSpill(AArch64::PN8);
}

assert(!RegInfo->isReservedReg(MF, AFI->getPredicateRegForFillSpill()) &&
Collaborator

nit: Is it worth putting this functionality into a helper function to keep this function a bit simpler?

e.g.

if (HasPairZReg && (Subtarget.hasSVE2p1() || Subtarget.hasSME2()))
  if (unsigned Reg = findFreePredicateReg())
    AFI->setPredicateRegForFillSpill(Reg);

?

AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
if (Subtarget.hasSVE2p1() || Subtarget.hasSME2()) {
if (AArch64::PPRRegClass.contains(Reg) &&
(Reg >= AArch64::P8 && Reg <= AArch64::P15) && SavedRegs.test(Reg) &&
Collaborator

I can't see where you've addressed this. Did you miss this comment?

Comment on lines 2787 to 2795
void verify(SmallVectorImpl<RegPairInfo> &RegPairs) {
  auto IsPPR = [](const RegPairInfo &c) { return c.Reg1 == RegPairInfo::PPR; };
  auto PPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsPPR);
  auto IsZPR = [](const RegPairInfo &c) { return c.Type == RegPairInfo::ZPR; };
  auto ZPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsZPR);
  assert(!(PPRBegin < ZPRBegin) &&
         "Expected callee save predicate to be handled first");
}

Collaborator

Does this work when there are no PPR registers to save (PPRBegin == RegPairs.end())?

Additionally:

  • You only need to check this when using a predicate reg to do the paired spill/fill instructions.
  • nit: verify is quite a generic name, but the checks are rather specific. Maybe just inline this function where it is used?

Contributor Author

Does this work when there are no PPR registers to save (PPRBegin == RegPairs.end())?

If there is no PReg to save then it should be fine to clobber the PReg.
AFAIU the check is to avoid spilling/storing the PRegs after the ZRegs, and filling/loading the PRegs before the ZRegs. If there is no PReg to spill and fill then the compiler should be fine.
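For illustration, a sketch of how the check behaves when there is no PPR entry (this sketch uses c.Type in both lambdas; the snippet quoted above compares c.Reg1 in the PPR case, which looks like a typo):

auto IsPPR = [](const RegPairInfo &c) { return c.Type == RegPairInfo::PPR; };
auto IsZPR = [](const RegPairInfo &c) { return c.Type == RegPairInfo::ZPR; };
auto PPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsPPR);
auto ZPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsZPR);
// With no PPR saves, PPRBegin is RegPairs.end(), which never compares less
// than ZPRBegin, so the assertion still holds.
assert(!(PPRBegin < ZPRBegin) &&
       "Expected callee-save predicates to be handled before the ZPR saves");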

Collaborator

@sdesmalen-arm left a comment

LGTM, thanks for your patience @CarolineConcatto! :)

CarolineConcatto merged commit c4bac7f into llvm:main on May 17, 2024
3 of 4 checks passed
CarolineConcatto deleted the ldstpair branch on May 20, 2024