
[LLVM][AArch64] Use load/store with consecutive registers in SME2 or SVE2.1 for spill/fill #77665

Merged: 16 commits into llvm:main on May 17, 2024

Conversation

CarolineConcatto
Contributor

…VE2.1 for spill/fill

When possible, the spill/fill of registers in Frame Lowering uses the ld/st consecutive-pair instructions available in SME2 or SVE2.1.

@llvmbot
Collaborator

llvmbot commented Jan 10, 2024

@llvm/pr-subscribers-backend-aarch64

Author: None (CarolineConcatto)

Changes

…VE2.1 for spill/fill

When possible, the spill/fill of registers in Frame Lowering uses the ld/st consecutive-pair instructions available in SME2 or SVE2.1.


Patch is 302.50 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/77665.diff

5 Files Affected:

  • (modified) llvm/lib/Target/AArch64/AArch64FrameLowering.cpp (+78-22)
  • (modified) llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h (+2)
  • (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll (+528-928)
  • (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll (+528-928)
  • (added) llvm/test/CodeGen/AArch64/sve2p1-intrinsics-ldst-pair.ll (+295)
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index caab59201a8d69..7a332bfd18c21b 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -1480,6 +1480,11 @@ static bool IsSVECalleeSave(MachineBasicBlock::iterator I) {
   switch (I->getOpcode()) {
   default:
     return false;
+  case AArch64::PTRUE_C_B:
+  case AArch64::LD1B_2Z_IMM:
+  case AArch64::ST1B_2Z_IMM:
+    return I->getMF()->getSubtarget<AArch64Subtarget>().hasSVE2p1() ||
+           I->getMF()->getSubtarget<AArch64Subtarget>().hasSME2();
   case AArch64::STR_ZXI:
   case AArch64::STR_PXI:
   case AArch64::LDR_ZXI:
@@ -2753,6 +2758,16 @@ struct RegPairInfo {
 
 } // end anonymous namespace
 
+unsigned findFreePredicateAsCounterReg(MachineFunction &MF) {
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  for (MCRegister PReg :
+       {AArch64::PN8, AArch64::PN9, AArch64::PN10, AArch64::PN11, AArch64::PN12,
+        AArch64::PN13, AArch64::PN14, AArch64::PN15}) {
+    if (!MRI.isReserved(PReg))
+      return PReg;
+  }
+  llvm_unreachable("cannot find a free predicate");
+}
 static void computeCalleeSaveRegisterPairs(
     MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI,
     const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs,
@@ -2763,6 +2778,7 @@ static void computeCalleeSaveRegisterPairs(
 
   bool IsWindows = isTargetWindows(MF);
   bool NeedsWinCFI = needsWinCFI(MF);
+  const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
   MachineFrameInfo &MFI = MF.getFrameInfo();
   CallingConv::ID CC = MF.getFunction().getCallingConv();
@@ -2831,7 +2847,11 @@ static void computeCalleeSaveRegisterPairs(
           RPI.Reg2 = NextReg;
         break;
       case RegPairInfo::PPR:
+        break;
       case RegPairInfo::ZPR:
+        if (Subtarget.hasSVE2p1() || Subtarget.hasSME2())
+          if (((RPI.Reg1 - AArch64::Z0) & 1) == 0 && (NextReg == RPI.Reg1 + 1))
+            RPI.Reg2 = NextReg;
         break;
       }
     }
@@ -2876,7 +2896,7 @@ static void computeCalleeSaveRegisterPairs(
     assert(OffsetPre % Scale == 0);
 
     if (RPI.isScalable())
-      ScalableByteOffset += StackFillDir * Scale;
+      ScalableByteOffset += StackFillDir * (RPI.isPaired() ? 2 * Scale : Scale);
     else
       ByteOffset += StackFillDir * (RPI.isPaired() ? 2 * Scale : Scale);
 
@@ -2887,9 +2907,6 @@ static void computeCalleeSaveRegisterPairs(
          (IsWindows && RPI.Reg2 == AArch64::LR)))
       ByteOffset += StackFillDir * 8;
 
-    assert(!(RPI.isScalable() && RPI.isPaired()) &&
-           "Paired spill/fill instructions don't exist for SVE vectors");
-
     // Round up size of non-pair to pair size if we need to pad the
     // callee-save area to ensure 16-byte alignment.
     if (NeedGapToAlignStack && !NeedsWinCFI &&
@@ -2976,6 +2993,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
     }
     return true;
   }
+  bool PtrueCreated = false;
   for (const RegPairInfo &RPI : llvm::reverse(RegPairs)) {
     unsigned Reg1 = RPI.Reg1;
     unsigned Reg2 = RPI.Reg2;
@@ -3010,10 +3028,10 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
        Alignment = Align(16);
        break;
     case RegPairInfo::ZPR:
-       StrOpc = AArch64::STR_ZXI;
-       Size = 16;
-       Alignment = Align(16);
-       break;
+      StrOpc = RPI.isPaired() ? AArch64::ST1B_2Z_IMM : AArch64::STR_ZXI;
+      Size = 16;
+      Alignment = Align(16);
+      break;
     case RegPairInfo::PPR:
        StrOpc = AArch64::STR_PXI;
        Size = 2;
@@ -3037,19 +3055,37 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
       std::swap(Reg1, Reg2);
       std::swap(FrameIdxReg1, FrameIdxReg2);
     }
+
+    unsigned PnReg;
+    unsigned PairRegs;
+    if (RPI.isPaired() && RPI.isScalable()) {
+      PnReg = findFreePredicateAsCounterReg(MF);
+      PairRegs = AArch64::Z0_Z1 + (RPI.Reg1 - AArch64::Z0);
+      if (!PtrueCreated) {
+        PtrueCreated = true;
+        BuildMI(MBB, MI, DL, TII.get(AArch64::PTRUE_C_B), PnReg)
+            .setMIFlags(MachineInstr::FrameDestroy);
+      }
+    }
     MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
     if (!MRI.isReserved(Reg1))
       MBB.addLiveIn(Reg1);
     if (RPI.isPaired()) {
       if (!MRI.isReserved(Reg2))
         MBB.addLiveIn(Reg2);
-      MIB.addReg(Reg2, getPrologueDeath(MF, Reg2));
+      if (RPI.isScalable())
+        MIB.addReg(PairRegs);
+      else
+        MIB.addReg(Reg2, getPrologueDeath(MF, Reg2));
       MIB.addMemOperand(MF.getMachineMemOperand(
           MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
           MachineMemOperand::MOStore, Size, Alignment));
     }
-    MIB.addReg(Reg1, getPrologueDeath(MF, Reg1))
-        .addReg(AArch64::SP)
+    if (RPI.isPaired() && RPI.isScalable())
+      MIB.addReg(PnReg);
+    else
+      MIB.addReg(Reg1, getPrologueDeath(MF, Reg1));
+    MIB.addReg(AArch64::SP)
         .addImm(RPI.Offset) // [sp, #offset*scale],
                             // where factor*scale is implicit
         .setMIFlag(MachineInstr::FrameSetup);
@@ -3061,8 +3097,11 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
 
     // Update the StackIDs of the SVE stack slots.
     MachineFrameInfo &MFI = MF.getFrameInfo();
-    if (RPI.Type == RegPairInfo::ZPR || RPI.Type == RegPairInfo::PPR)
-      MFI.setStackID(RPI.FrameIdx, TargetStackID::ScalableVector);
+    if (RPI.Type == RegPairInfo::ZPR || RPI.Type == RegPairInfo::PPR) {
+      MFI.setStackID(FrameIdxReg1, TargetStackID::ScalableVector);
+      if (RPI.isPaired())
+         MFI.setStackID(FrameIdxReg2, TargetStackID::ScalableVector);
+    }
 
   }
   return true;
@@ -3082,7 +3121,8 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
 
   computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, hasFP(MF));
 
-  auto EmitMI = [&](const RegPairInfo &RPI) -> MachineBasicBlock::iterator {
+  auto EmitMI = [&](const RegPairInfo &RPI,
+                    bool *PtrueCreated) -> MachineBasicBlock::iterator {
     unsigned Reg1 = RPI.Reg1;
     unsigned Reg2 = RPI.Reg2;
 
@@ -3114,7 +3154,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
        Alignment = Align(16);
        break;
     case RegPairInfo::ZPR:
-       LdrOpc = AArch64::LDR_ZXI;
+       LdrOpc = RPI.isPaired() ? AArch64::LD1B_2Z_IMM : AArch64::LDR_ZXI;
        Size = 16;
        Alignment = Align(16);
        break;
@@ -3139,15 +3179,31 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
       std::swap(Reg1, Reg2);
       std::swap(FrameIdxReg1, FrameIdxReg2);
     }
+
+    unsigned PnReg;
+    unsigned PairRegs;
+    if (RPI.isPaired() && RPI.isScalable()) {
+      PnReg = findFreePredicateAsCounterReg(MF);
+      PairRegs = AArch64::Z0_Z1 + (RPI.Reg1 - AArch64::Z0);
+      if (!*PtrueCreated) {
+        *PtrueCreated = true;
+        BuildMI(MBB, MBBI, DL, TII.get(AArch64::PTRUE_C_B), PnReg)
+            .setMIFlags(MachineInstr::FrameDestroy);
+      }
+    }
+
     MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII.get(LdrOpc));
     if (RPI.isPaired()) {
-      MIB.addReg(Reg2, getDefRegState(true));
+      MIB.addReg(RPI.isScalable() ? PairRegs : Reg2, getDefRegState(true));
       MIB.addMemOperand(MF.getMachineMemOperand(
           MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
           MachineMemOperand::MOLoad, Size, Alignment));
     }
-    MIB.addReg(Reg1, getDefRegState(true))
-        .addReg(AArch64::SP)
+    if (RPI.isPaired() && RPI.isScalable())
+      MIB.addReg(PnReg);
+    else
+      MIB.addReg(Reg1, getDefRegState(true));
+    MIB.addReg(AArch64::SP)
         .addImm(RPI.Offset) // [sp, #offset*scale]
                             // where factor*scale is implicit
         .setMIFlag(MachineInstr::FrameDestroy);
@@ -3161,9 +3217,10 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
   };
 
   // SVE objects are always restored in reverse order.
+  bool PtrueCreated = false;
   for (const RegPairInfo &RPI : reverse(RegPairs))
     if (RPI.isScalable())
-      EmitMI(RPI);
+      EmitMI(RPI, &PtrueCreated);
 
   if (homogeneousPrologEpilog(MF, &MBB)) {
     auto MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::HOM_Epilog))
@@ -3174,13 +3231,12 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
     }
     return true;
   }
-
   if (ReverseCSRRestoreSeq) {
     MachineBasicBlock::iterator First = MBB.end();
     for (const RegPairInfo &RPI : reverse(RegPairs)) {
       if (RPI.isScalable())
         continue;
-      MachineBasicBlock::iterator It = EmitMI(RPI);
+      MachineBasicBlock::iterator It = EmitMI(RPI, &PtrueCreated);
       if (First == MBB.end())
         First = It;
     }
@@ -3190,7 +3246,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
     for (const RegPairInfo &RPI : RegPairs) {
       if (RPI.isScalable())
         continue;
-      (void)EmitMI(RPI);
+      (void)EmitMI(RPI, &PtrueCreated);
     }
   }
 
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index cd4a18bfbc23a8..b44cc8d0d0dc9b 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -300,6 +300,8 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
         int FrameIdx = Info.getFrameIdx();
         if (MFI.getStackID(FrameIdx) != TargetStackID::Default)
           continue;
+        if (MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector)
+          continue;
         int64_t Offset = MFI.getObjectOffset(FrameIdx);
         int64_t ObjSize = MFI.getObjectSize(FrameIdx);
         MinOffset = std::min<int64_t>(Offset, MinOffset);
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
index b7119fc0825673..6c94546c9525aa 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
@@ -55,45 +55,31 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale x
 ; STRIDED-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; STRIDED-NEXT:    addvl sp, sp, #-17
 ; STRIDED-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT:    ptrue pn8.b
+; STRIDED-NEXT:    st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT:    st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT:    st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT:    st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT:    st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT:    st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT:    st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT:    st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
 ; STRIDED-NEXT:    mov p8.b, p0.b
-; STRIDED-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    ld1b { z0.b, z8.b }, pn8/z, [x0]
 ; STRIDED-NEXT:    //APP
 ; STRIDED-NEXT:    nop
 ; STRIDED-NEXT:    //NO_APP
 ; STRIDED-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; STRIDED-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT:    ptrue pn8.b
+; STRIDED-NEXT:    ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT:    ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT:    ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
 ; STRIDED-NEXT:    mov z1.d, z8.d
-; STRIDED-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT:    ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT:    ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT:    ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT:    ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT:    ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT:    ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT:    ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT:    ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT:    ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
+; STRIDED-NEXT:    ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload
 ; STRIDED-NEXT:    addvl sp, sp, #17
 ; STRIDED-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; STRIDED-NEXT:    ret
@@ -103,20 +89,14 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale x
 ; CONTIGUOUS-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #-16
 ; CONTIGUOUS-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CONTIGUOUS-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT:    str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT:    str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT:    str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT:    str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT:    str z16, [sp, #8, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT:    str z15, [sp, #9, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT:    str z14, [sp, #10, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT:    str z13, [sp, #11, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT:    str z12, [sp, #12, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT:    str z11, [sp, #13, mul vl] // 16-byte Folded Spill
-; CONTIGUOUS-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CONTIGUOUS-NEXT:    ptrue pn8.b
+; CONTIGUOUS-NEXT:    st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT:    st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT:    st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT:    st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT:    st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT:    st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; CONTIGUOUS-NEXT:    st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
 ; CONTIGUOUS-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #-2
 ; CONTIGUOUS-NEXT:    mov p8.b, p0.b
@@ -130,21 +110,15 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale x
 ; CONTIGUOUS-NEXT:    ldr z1, [sp, #1, mul vl]
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #2
 ; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; CONTIGUOUS-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT:    ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT:    ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT:    ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT:    ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
-; CONTIGUOUS-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT:    ptrue pn8.b
+; CONTIGUOUS-NEXT:    ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT:    ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT:    ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT:    ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT:    ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT:    ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload
+; CONTIGUOUS-NEXT:    ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload
 ; CONTIGUOUS-NEXT:    addvl sp, sp, #16
 ; CONTIGUOUS-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CONTIGUOUS-NEXT:    ret
@@ -210,45 +184,31 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused, <v
 ; STRIDED-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; STRIDED-NEXT:    addvl sp, sp, #-17
 ; STRIDED-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT:    ptrue pn8.b
+; STRIDED-NEXT:    st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT:    st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT:    st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT:    st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT:    st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT:    st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT:    st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill
+; STRIDED-NEXT:    st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill
 ; STRIDED-NEXT:    mov p8.b, p0.b
-; STRIDED-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    str z20, [sp, #4, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    str z19, [sp, #5, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    str z18, [sp, #6, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    str z17, [sp, #7, mul vl] // 16-byte Folded Spill
-; STR...
[truncated]


github-actions bot commented Jan 10, 2024

⚠️ C/C++ code formatter, clang-format found issues in your code. ⚠️

You can test this locally with the following command:
git-clang-format --diff 99be3875fb161a5786aaad5dab0b92fa052e47d1 c636d7dac943879cb4b60dec2a56f4e00efb4137 -- llvm/lib/Target/AArch64/AArch64FrameLowering.cpp llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
View the diff from clang-format here.
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 48a781b64de..1f2b7149b81 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -3222,10 +3222,10 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
        Alignment = Align(16);
        break;
     case RegPairInfo::ZPR:
-       LdrOpc = RPI.isPaired() ? AArch64::LD1B_2Z_IMM : AArch64::LDR_ZXI;
-       Size = 16;
-       Alignment = Align(16);
-       break;
+      LdrOpc = RPI.isPaired() ? AArch64::LD1B_2Z_IMM : AArch64::LDR_ZXI;
+      Size = 16;
+      Alignment = Align(16);
+      break;
     case RegPairInfo::PPR:
        LdrOpc = AArch64::LDR_PXI;
        Size = 2;

CarolineConcatto added a commit that referenced this pull request Feb 19, 2024
This is needed by PR#77665[1] that uses a P-register while restoring
Z-registers.

The reversed order for the SVE register restore in the epilogue was added to
guarantee performance, but further work has since improved the SVE frame
restore, and the scheduler may also change the order of the restores,
undoing the reversal.

[1]#77665
…VE2.1 for spill/fill

When possible, the spill/fill of registers in Frame Lowering uses the ld/st
consecutive-pair instructions available in SME2 or SVE2.1.

…VE2.1 for spill/fill

When possible, the spill/fill of registers in Frame Lowering uses the ld/st
consecutive-pair instructions available in SME2 or SVE2.1.
CarolineConcatto added a commit to CarolineConcatto/llvm-project that referenced this pull request Feb 21, 2024
This is needed by PR#77665[1] that uses a P-register while restoring
Z-registers.

The reversed order for the SVE register restore in the epilogue was added to
guarantee performance, but further work has since improved the SVE frame
restore, and the scheduler may also change the order of the restores,
undoing the reversal.

This also fixes the problem reported on Windows with std::reverse and .base().

[1]llvm#77665
CarolineConcatto added a commit that referenced this pull request Feb 22, 2024
This is needed by PR#77665[1] that uses a P-register while restoring
Z-registers.

The reversed order for the SVE register restore in the epilogue was added to
guarantee performance, but further work has since improved the SVE frame
restore, and the scheduler may also change the order of the restores,
undoing the reversal.

This also fixes the problem reported in PR #79623 on Windows with
std::reverse and .base().

[1]#77665
PtrueCreated = true;
// Any one of predicate-as-count will be free to use
// This can be replaced in the future if needed
PnReg = AArch64::PN8;
Collaborator

It's not correct to blindly pick PN8 (P8) here. You can only clobber P8 if it is preserved by the preceding predicate callee-saves.

i.e.

define void @test_clobbers_3_z_regs(<vscale x 16 x i8> %v) {
  call void asm sideeffect "", "~{z8},~{z9}"()
  ret void
}

results in:

        str     x29, [sp, #-16]!
        addvl   sp, sp, #-2
        ptrue   pn8.b       ; pn8 is not preserved by this function, even though the AAPCS says it should be.
        st1b    { z8.b, z9.b }, pn8, [sp]
        ld1b    { z8.b, z9.b }, pn8/z, [sp]
        addvl   sp, sp, #2
        ldr     x29, [sp], #16
        ret

One thing you could do is try to see if one of the argument registers is available (p0 - p3), so that you can reuse one of those. Alternatively, you could mark p8 as clobbered by the function so that the preceding callee-save spills will include p8.

Collaborator

It's not correct to blindly pick PN8 (P8) here. You can only clobber P8 if it is preserved by the preceding predicate callee-saves.

Good point. I guess I misread the AAPCS when I suggested to just pick an arbitrary register as scratch.

Alternatively, you could mark p8 as clobbered by the function so that the preceding callee-save spills will include p8.

I would prefer this solution.

Contributor Author

The P8 register is added to the list of SavedRegs now.

Collaborator

I'm a little concerned with just blindly picking pn8 here, because this may not match the given calling convention (if someone would choose to use a different one from the standard SVE calling convention), defined in AArch64CallingConvention.td.

Can you create a function that finds a suitable caller-saved register instead?

llvm/lib/Target/AArch64/AArch64FrameLowering.cpp Outdated Show resolved Hide resolved
MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
if (!MRI.isReserved(Reg1))
  MBB.addLiveIn(Reg1);
if (RPI.isPaired()) {
  if (!MRI.isReserved(Reg2))
    MBB.addLiveIn(Reg2);
  MIB.addReg(Reg2, getPrologueDeath(MF, Reg2));
  if (RPI.isScalable())
    MIB.addReg(PairRegs);
Collaborator

Should PairRegs also use getPrologueDeath?

Contributor Author

I am not sure. I can see in getPrologueDeath that it kills the reg if it is not live-in.
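For reference, the helper being discussed is roughly shaped like the sketch below (paraphrased from memory of AArch64FrameLowering.cpp rather than quoted from this patch): the register operand gets a kill flag only when the register is not also a live-in of the function.

static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) {
  // Do not set a kill flag on values that are also marked as live-in; only
  // registers that are not live into the function are marked as killed by
  // the spill.
  bool LiveIn = MF.getRegInfo().isLiveIn(Reg);
  return getKillRegState(!LiveIn);
}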

PtrueCreated = true;
// Any one of predicate-as-count will be free to use
// This can be replaced in the future if needed
PnReg = AArch64::PN8;
Collaborator

I'm a little concerned with just blindly picking pn8 here, because this may not match the given calling convention (if someone would choose to use a different one from the standard SVE calling convention), defined in AArch64CallingConvention.td.

Can you create a function that finds a suitable caller-saved register instead?

@momchil-velikov
Collaborator

I think it's converging, but how about squashing all the commits, if other reviewers don't mind? I know the advice is to do fixup commits, but there are too many now and it has become hard to see the final picture.

unsigned PairRegs = AArch64::Z0_Z1 + (RPI.Reg1 - AArch64::Z0);
unsigned PnReg;
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
unsigned PnReg = AFI->getPredicateRegForFillSpill();
Collaborator

This looks good, the following are more like style remarks:

  • PnReg can be declared outside the loop. Then you can initialise it under if (!PtrueCreated) ..., which seems a more logical place to do it (see the sketch below).
  • PairRegs has only one use, and it's on the following line, so you don't save anything by naming the corresponding expression.
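A minimal sketch of the first remark, based on the hunks shown earlier in this patch (illustration only, not the final code): PnReg is declared once, outside the per-pair emission, and is only initialised where the PTRUE is created.

unsigned PnReg = 0;
bool PtrueCreated = false;
// ... later, for each scalable register pair being spilled/filled:
if (RPI.isPaired() && RPI.isScalable() && !PtrueCreated) {
  PtrueCreated = true;
  PnReg = AFI->getPredicateRegForFillSpill();
  BuildMI(MBB, MBBI, DL, TII.get(AArch64::PTRUE_C_B), PnReg)
      .setMIFlags(MachineInstr::FrameDestroy);
}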

AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
if (Subtarget.hasSVE2p1() || Subtarget.hasSME2()) {
if (AArch64::PPRRegClass.contains(Reg) &&
(Reg > AArch64::P8 || Reg < AArch64::P15) && SavedRegs.test(Reg) &&
Collaborator

Looks like that should be Reg >= AArch64::P8 && Reg <= AArch64::P15

@momchil-velikov
Collaborator

Can you create a function that finds a suitable caller-saved register instead?

But there are no caller-saved predicate registers, are there? p0-p3 are argument registers and p4-p15 are callee-saved and we can only use pn8-pn15 anyway.

@sdesmalen-arm
Collaborator

sdesmalen-arm commented Apr 24, 2024

Can you create a function that finds a suitable caller-saved register instead?

But there are no caller-saved predicate registers, are there? p0-p3 are argument registers and p4-p15 are callee-saved and we can only use pn8-pn15 anyway.

Doesn't that suggest you can use p0-p3 if they are not used for passing arguments? (and if so, the caller will preserve them around the call, e.g. https://godbolt.org/z/1jM547WP8)

@sdesmalen-arm
Collaborator

I guess my point more widely was that when choosing a different calling convention (other than the one we currently implement), the choice of PN8 may not be right.

@sdesmalen-arm
Collaborator

I guess my point more widely was that when choosing a different calling convention (other than the one we currently implement), the choice of PN8 may not be right.

It seems that PTRUE (predicate-as-counter) can only use PN8-PN15, so I guess we can't use any of the other registers. So the choice of always using PN8 for this purpose is fine, although depending on the calling convention it may result in an extra callee-save spill/fill. Perhaps I'm just overcomplicating it, but it feels a bit wrong to hard-code a register like that when we have TableGen support for many different calling conventions.

It would probably be useful to add an assert to make sure that PN8/P8 is not a reserved register though.

@momchil-velikov
Collaborator

I guess my point more widely was that when choosing a different calling convention (other than the one we currently implement), the choice of PN8 may not be right.

All right, how about this strategy:

  • don't optimise for hypothetical calling conventions

  • in determineCalleeSaves, make sure we choose an already-saved register, or, if there is none, choose pn8 if the calling convention of the function is one where we know pn8 is a callee-saved register (a rough sketch follows after this list):

    • AArch64_SVE_VectorCall
    • AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0
    • AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2

    Not a correctness issue if we miss some.

  • everywhere else we use only AFI->getPredicateRegForFillSpill() != 0 as the condition determining we can and should load/store in pairs
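A rough sketch of the second bullet, as it might look in AArch64FrameLowering::determineCalleeSaves (HasPairZReg is the placeholder name used in a later suggestion in this review; the "reuse an already-saved p8-p15 register" branch is elided; this illustrates the strategy, not the final patch):

if (HasPairZReg && (Subtarget.hasSVE2p1() || Subtarget.hasSME2())) {
  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
  CallingConv::ID CC = MF.getFunction().getCallingConv();
  // pn8 is known to be callee-saved under these calling conventions, so it is
  // safe to pick it; force p8 into the save list so the PTRUE that defines
  // pn8 does not clobber a value the caller expects to be preserved.
  if (CC == CallingConv::AArch64_SVE_VectorCall ||
      CC == CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0 ||
      CC == CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2) {
    SavedRegs.set(AArch64::P8);
    AFI->setPredicateRegForFillSpill(AArch64::PN8);
  }
}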

@sdesmalen-arm
Collaborator

I guess my point more widely was that when choosing a different calling convention (other than the one we currently implement), the choice of PN8 may not be right.

All right, how about this strategy:

  • don't optimise for hypothetical calling conventions

  • in determineCalleeSaves, make sure we choose an already-saved register, or, if there is none, choose pn8 if the calling convention of the function is one where we know pn8 is a callee-saved register:

    • AArch64_SVE_VectorCall
    • AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0
    • AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2

    Not a correctness issue if we miss some.

  • everywhere else we use only AFI->getPredicateRegForFillSpill() != 0 as the condition determining we can and should load/store in pairs

Sounds good. The function would otherwise be free to choose any register that is not used as an argument/return register and is not a reserved register, but I'm happy fixing it to pn8 with the checks you outlined above. Just in case, please add an assert that PN8 is not also a reserved register.

I'd also suggest wrapping the AFI->getPredicateRegForFillSpill() != 0 check in a function like canUseMultiVectorLoadStore (feel free to pick a different name) that combines it with && (Subtarget.hasSVE2p1() || Subtarget.hasSME2()), to keep the logic in other places a bit simpler.
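A minimal sketch of that suggested wrapper, assuming it sits next to the other static helpers in AArch64FrameLowering.cpp (the name and the combined check are the suggestion above, not necessarily what was committed):

static bool canUseMultiVectorLoadStore(const AArch64Subtarget &Subtarget,
                                       const AArch64FunctionInfo *AFI) {
  // Paired ST1B/LD1B spills need both the instructions (SVE2.1 or SME2) and a
  // predicate-as-counter register reserved for the fill/spill.
  return (Subtarget.hasSVE2p1() || Subtarget.hasSME2()) &&
         AFI->getPredicateRegForFillSpill() != 0;
}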

@momchil-velikov
Collaborator

I'd also suggest wrapping the AFI->getPredicateRegForFillSpill() != 0 in a function like canUseMultiVectorLoadStore (feel free to pick a different name) that combines it with && (Subtarget.hasSVE2p1() || Subtarget.hasSME2()), to keep the logic in other places a bit simpler.

It should not need to combine it, since !(Subtarget.hasSVE2p1() || Subtarget.hasSME2()) must imply AFI->getPredicateRegForFillSpill() == 0, i.e. we don't go through the trouble of choosing a register if there's no chance for using it.

Otherwise, yes

  • everywhere else we use only AFI->getPredicateRegForFillSpill() != 0 as the condition determining we can and should load/store in pairs

@CarolineConcatto
Contributor Author

I have addressed the comments about the calling conventions and added tests for it.

Collaborator

@sdesmalen-arm left a comment

Hi @CarolineConcatto, thanks for making the changes. The logic looks pretty good to me. I merely left some nits, stylistic comments and requests for asserts, but am happy with the changes otherwise.

@@ -1508,6 +1508,9 @@ static bool IsSVECalleeSave(MachineBasicBlock::iterator I) {
  switch (I->getOpcode()) {
  default:
    return false;
  case AArch64::PTRUE_C_B:
  case AArch64::LD1B_2Z_IMM:
  case AArch64::ST1B_2Z_IMM:
Collaborator

As future work, I wonder if we can extend this further to use the quad variants of these instructions as well.

AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
if (Subtarget.hasSVE2p1() || Subtarget.hasSME2()) {
if (AArch64::PPRRegClass.contains(Reg) &&
(Reg >= AArch64::P8 && Reg <= AArch64::P15) && SavedRegs.test(Reg) &&
Collaborator

When choosing a callee-saved register, there is the assumption that in the prologue this register will already be spilled before overwriting it with a new ptrue, and in the epilogue that it will be filled after defining it with a ptrue. This is dependent on the order in which the registers are specified in the AArch64CallingConvention.td file and the order in which they are iterated. To avoid this ever silently doing the wrong thing, can you add some asserts in restoreCalleeSavedRegisters and spillCalleeSavedRegisters to guard that?

Collaborator

I can't see where you've addressed this. Did you miss this comment?

AFI->setPredicateRegForFillSpill(AArch64::PN8);
}

assert(!RegInfo->isReservedReg(MF, AFI->getPredicateRegForFillSpill()) &&
Collaborator

nit: Is it worth putting this functionality into a helper function to keep this function a bit simpler?

e.g.

if (HasPairZReg && (Subtarget.hasSVE2p1() || Subtarget.hasSME2()))
  if (unsigned Reg = findFreePredicateReg())
    AFI->setPredicateRegForFillSpill(Reg);

?

AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
if (Subtarget.hasSVE2p1() || Subtarget.hasSME2()) {
if (AArch64::PPRRegClass.contains(Reg) &&
(Reg >= AArch64::P8 && Reg <= AArch64::P15) && SavedRegs.test(Reg) &&
Collaborator

I can't see where you've addressed this. Did you miss this comment?

Comment on lines 2787 to 2795
void verify(SmallVectorImpl<RegPairInfo> &RegPairs) {
  auto IsPPR = [](const RegPairInfo &c) { return c.Reg1 == RegPairInfo::PPR; };
  auto PPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsPPR);
  auto IsZPR = [](const RegPairInfo &c) { return c.Type == RegPairInfo::ZPR; };
  auto ZPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsZPR);
  assert(!(PPRBegin < ZPRBegin) &&
         "Expected callee save predicate to be handled first");
}

Collaborator

Does this work when there are no PPR registers to save (PPRBegin == RegPairs.end())?

Additionally:

  • You only need to check this when using a predicate reg to do the paired spill/fill instructions.
  • nit: verify is quite a generic name, but the checks are rather specific. Maybe just inline this function where it is used?

Contributor Author

Does this work when there are no PPR registers to save (PPRBegin == RegPairs.end())?

If there is no PReg to save then it should be fine to clobber the PReg.
AFAIU the check is to avoid spilling/storing the PRegs after the ZRegs, and filling/loading the PRegs before the ZRegs. If there is no PReg to spill and fill then the compiler should be fine.
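For illustration, a sketch of how the check behaves when there is no PPR entry (this sketch uses c.Type in both lambdas; the snippet quoted above compares c.Reg1 in the PPR case, which looks like a typo):

auto IsPPR = [](const RegPairInfo &c) { return c.Type == RegPairInfo::PPR; };
auto IsZPR = [](const RegPairInfo &c) { return c.Type == RegPairInfo::ZPR; };
auto PPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsPPR);
auto ZPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsZPR);
// With no PPR saves, PPRBegin is RegPairs.end(), which never compares less
// than ZPRBegin, so the assertion still holds.
assert(!(PPRBegin < ZPRBegin) &&
       "Expected callee-save predicates to be handled before the ZPR saves");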

Collaborator

@sdesmalen-arm left a comment

LGTM, thanks for your patience @CarolineConcatto! :)

CarolineConcatto merged commit c4bac7f into llvm:main on May 17, 2024
3 of 4 checks passed
CarolineConcatto deleted the ldstpair branch on May 20, 2024