From f589e5067fc8c15d8fc228169055c63ff29b2c14 Mon Sep 17 00:00:00 2001
From: wanglei <wanglei@loongson.cn>
Date: Fri, 28 Oct 2022 16:18:40 +0800
Subject: [PATCH] [LoongArch] Split SP adjustment

This patch splits the SP adjustment to reduce the number of instructions
in the prologue and epilogue. This way, the offsets of the callee-saved
registers can each fit in a single store.

This is similar to D68011 (RISCV).

Differential Revision: https://reviews.llvm.org/D136222
---
 .../LoongArch/LoongArchFrameLowering.cpp      | 71 +++++++++++++-
 .../Target/LoongArch/LoongArchFrameLowering.h |  2 +
 .../test/CodeGen/LoongArch/split-sp-adjust.ll | 45 +++++++++
 .../CodeGen/LoongArch/stack-realignment.ll    | 94 ++++++++-----------
 4 files changed, 157 insertions(+), 55 deletions(-)
 create mode 100644 llvm/test/CodeGen/LoongArch/split-sp-adjust.ll

diff --git a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp
index 45472157b4821..e8985d9282432 100644
--- a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp
@@ -138,11 +138,17 @@ void LoongArchFrameLowering::emitPrologue(MachineFunction &MF,
 
   // First, compute final stack size.
   uint64_t StackSize = MFI.getStackSize();
+  uint64_t RealStackSize = StackSize;
 
   // Early exit if there is no need to allocate space in the stack.
   if (StackSize == 0 && !MFI.adjustsStack())
     return;
 
+  uint64_t FirstSPAdjustAmount = getFirstSPAdjustAmount(MF);
+  // Split the SP adjustment to reduce the offsets of callee-saved spills.
+  if (FirstSPAdjustAmount)
+    StackSize = FirstSPAdjustAmount;
+
   // Adjust stack.
   adjustReg(MBB, MBBI, DL, SPReg, SPReg, -StackSize, MachineInstr::FrameSetup);
   // Emit ".cfi_def_cfa_offset StackSize".
@@ -184,7 +190,29 @@ void LoongArchFrameLowering::emitPrologue(MachineFunction &MF,
     BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
         .addCFIIndex(CFIIndex)
         .setMIFlag(MachineInstr::FrameSetup);
+  }
+
+  // Emit the second SP adjustment after saving callee saved registers.
+  if (FirstSPAdjustAmount) {
+    uint64_t SecondSPAdjustAmount = RealStackSize - FirstSPAdjustAmount;
+    assert(SecondSPAdjustAmount > 0 &&
+           "SecondSPAdjustAmount should be greater than zero");
+    adjustReg(MBB, MBBI, DL, SPReg, SPReg, -SecondSPAdjustAmount,
+              MachineInstr::FrameSetup);
 
+    if (!hasFP(MF)) {
+      // Without a frame pointer the CFA is still SP-based, so emit
+      // ".cfi_def_cfa_offset RealStackSize". With a frame pointer,
+      // ".cfi_def_cfa fp, 0" was emitted instead and no update is needed.
+      unsigned CFIIndex = MF.addFrameInst(
+          MCCFIInstruction::cfiDefCfaOffset(nullptr, RealStackSize));
+      BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex)
+          .setMIFlag(MachineInstr::FrameSetup);
+    }
+  }
+
+  if (hasFP(MF)) {
     // Realign stack.
     if (RI->hasStackRealignment(MF)) {
       unsigned ShiftAmount = Log2(MFI.getMaxAlign());
@@ -244,10 +272,47 @@ void LoongArchFrameLowering::emitEpilogue(MachineFunction &MF,
               MachineInstr::FrameDestroy);
   }
 
+  uint64_t FirstSPAdjustAmount = getFirstSPAdjustAmount(MF);
+  if (FirstSPAdjustAmount) {
+    uint64_t SecondSPAdjustAmount = StackSize - FirstSPAdjustAmount;
+    assert(SecondSPAdjustAmount > 0 &&
+           "SecondSPAdjustAmount should be greater than zero");
+
+    adjustReg(MBB, LastFrameDestroy, DL, SPReg, SPReg, SecondSPAdjustAmount,
+              MachineInstr::FrameDestroy);
+    StackSize = FirstSPAdjustAmount;
+  }
+
   // Deallocate stack
   adjustReg(MBB, MBBI, DL, SPReg, SPReg, StackSize, MachineInstr::FrameDestroy);
 }
 
+// Returns the size of the first SP adjustment when the prologue/epilogue
+// should adjust SP in two steps, or 0 when no split is needed. Splitting
+// lets each callee-saved register be spilled with a single store, e.g.:
+//
+//   addi.d  $sp, $sp, -2032
+//   st.d    $ra, $sp,  2024
+//   st.d    $fp, $sp,  2016
+//   addi.d  $sp, $sp,   -16
+uint64_t LoongArchFrameLowering::getFirstSPAdjustAmount(
+    const MachineFunction &MF) const {
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
+
+  // Split only when the stack size does not fit in a signed 12-bit immediate
+  // and at least one callee-saved register has to be spilled.
+  if (!isInt<12>(MFI.getStackSize()) && (CSI.size() > 0)) {
+    // Use (2048 - StackAlign) as the first adjustment: 2048 itself does not
+    // fit in a signed 12-bit immediate, so sp = sp + 2048 in the epilogue
+    // would be split into multiple instructions, while offsets below 2048
+    // fit in a single load/store instruction. (2048 - StackAlign) is the
+    // largest such amount that still satisfies the stack alignment.
+    return 2048 - getStackAlign().value();
+  }
+  return 0;
+}
+
 void LoongArchFrameLowering::determineCalleeSaves(MachineFunction &MF,
                                                   BitVector &SavedRegs,
                                                   RegScavenger *RS) const {
@@ -307,6 +372,7 @@ StackOffset LoongArchFrameLowering::getFrameIndexReference(
   const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
   auto *LoongArchFI = MF.getInfo<LoongArchMachineFunctionInfo>();
   uint64_t StackSize = MFI.getStackSize();
+  uint64_t FirstSPAdjustAmount = getFirstSPAdjustAmount(MF);
 
   // Callee-saved registers should be referenced relative to the stack
   // pointer (positive offset), otherwise use the frame pointer (negative
@@ -325,7 +391,10 @@ StackOffset LoongArchFrameLowering::getFrameIndexReference(
 
   if (FI >= MinCSFI && FI <= MaxCSFI) {
     FrameReg = LoongArch::R3;
-    Offset += StackOffset::getFixed(StackSize);
+    if (FirstSPAdjustAmount)
+      Offset += StackOffset::getFixed(FirstSPAdjustAmount);
+    else
+      Offset += StackOffset::getFixed(StackSize);
   } else if (RI->hasStackRealignment(MF) && !MFI.isFixedObjectIndex(FI)) {
     // If the stack was realigned, the frame pointer is set in order to allow
     // SP to be restored, so we need another base register to record the stack
diff --git a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.h b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.h
index e1e3e260f97a2..7ef79aaf32999 100644
--- a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.h
@@ -45,6 +45,8 @@ class LoongArchFrameLowering : public TargetFrameLowering {
   bool hasFP(const MachineFunction &MF) const override;
   bool hasBP(const MachineFunction &MF) const;
 
+  uint64_t getFirstSPAdjustAmount(const MachineFunction &MF) const;
+
 private:
   void determineFrameLayout(MachineFunction &MF) const;
   void adjustReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
diff --git a/llvm/test/CodeGen/LoongArch/split-sp-adjust.ll b/llvm/test/CodeGen/LoongArch/split-sp-adjust.ll
new file mode 100644
index 0000000000000..093c92b0dadbb
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/split-sp-adjust.ll
@@ -0,0 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc --mtriple=loongarch64 --verify-machineinstrs < %s \
+; RUN:   | FileCheck %s
+
+;; The stack size is 2048 and the SP adjustment will be split.
+define i32 @SplitSP() nounwind {
+; CHECK-LABEL: SplitSP:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    addi.d $sp, $sp, -2032
+; CHECK-NEXT:    st.d $ra, $sp, 2024 # 8-byte Folded Spill
+; CHECK-NEXT:    addi.d $sp, $sp, -16
+; CHECK-NEXT:    addi.d $a0, $sp, 12
+; CHECK-NEXT:    bl %plt(foo)
+; CHECK-NEXT:    move $a0, $zero
+; CHECK-NEXT:    addi.d $sp, $sp, 16
+; CHECK-NEXT:    ld.d $ra, $sp, 2024 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 2032
+; CHECK-NEXT:    ret
+entry:
+  %xx = alloca [2028 x i8], align 1
+  %0 = getelementptr inbounds [2028 x i8], ptr %xx, i32 0, i32 0
+  %call = call i32 @foo(ptr nonnull %0)
+  ret i32 0
+}
+
+;; The stack size is 2032 and the SP adjustment will not be split.
+define i32 @NoSplitSP() nounwind {
+; CHECK-LABEL: NoSplitSP:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    addi.d $sp, $sp, -2032
+; CHECK-NEXT:    st.d $ra, $sp, 2024 # 8-byte Folded Spill
+; CHECK-NEXT:    addi.d $a0, $sp, 0
+; CHECK-NEXT:    bl %plt(foo)
+; CHECK-NEXT:    move $a0, $zero
+; CHECK-NEXT:    ld.d $ra, $sp, 2024 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 2032
+; CHECK-NEXT:    ret
+entry:
+  %xx = alloca [2024 x i8], align 1
+  %0 = getelementptr inbounds [2024 x i8], ptr %xx, i32 0, i32 0
+  %call = call i32 @foo(ptr nonnull %0)
+  ret i32 0
+}
+
+declare i32 @foo(ptr)
diff --git a/llvm/test/CodeGen/LoongArch/stack-realignment.ll b/llvm/test/CodeGen/LoongArch/stack-realignment.ll
index 16c7bcd8b1c5a..89672fbe8b0c2 100644
--- a/llvm/test/CodeGen/LoongArch/stack-realignment.ll
+++ b/llvm/test/CodeGen/LoongArch/stack-realignment.ll
@@ -453,46 +453,46 @@ define void @caller_no_realign1024() "no-realign-stack" {
 define void @caller2048() {
 ; LA32-LABEL: caller2048:
 ; LA32:       # %bb.0:
-; LA32-NEXT:    addi.w $sp, $sp, -2048
-; LA32-NEXT:    .cfi_def_cfa_offset 2048
-; LA32-NEXT:    st.w $ra, $sp, 2044 # 4-byte Folded Spill
-; LA32-NEXT:    st.w $fp, $sp, 2040 # 4-byte Folded Spill
+; LA32-NEXT:    addi.w $sp, $sp, -2032
+; LA32-NEXT:    .cfi_def_cfa_offset 2032
+; LA32-NEXT:    st.w $ra, $sp, 2028 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $fp, $sp, 2024 # 4-byte Folded Spill
 ; LA32-NEXT:    .cfi_offset 1, -4
 ; LA32-NEXT:    .cfi_offset 22, -8
 ; LA32-NEXT:    addi.w $fp, $sp, 2032
-; LA32-NEXT:    addi.w $fp, $fp, 16
 ; LA32-NEXT:    .cfi_def_cfa 22, 0
+; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    srli.w $a0, $sp, 11
 ; LA32-NEXT:    slli.w $sp, $a0, 11
 ; LA32-NEXT:    addi.w $a0, $sp, 0
 ; LA32-NEXT:    bl %plt(callee)
 ; LA32-NEXT:    addi.w $sp, $fp, -2048
-; LA32-NEXT:    ld.w $fp, $sp, 2040 # 4-byte Folded Reload
-; LA32-NEXT:    ld.w $ra, $sp, 2044 # 4-byte Folded Reload
-; LA32-NEXT:    addi.w $sp, $sp, 2032
 ; LA32-NEXT:    addi.w $sp, $sp, 16
+; LA32-NEXT:    ld.w $fp, $sp, 2024 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $ra, $sp, 2028 # 4-byte Folded Reload
+; LA32-NEXT:    addi.w $sp, $sp, 2032
 ; LA32-NEXT:    ret
 ;
 ; LA64-LABEL: caller2048:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    addi.d $sp, $sp, -2048
-; LA64-NEXT:    .cfi_def_cfa_offset 2048
-; LA64-NEXT:    st.d $ra, $sp, 2040 # 8-byte Folded Spill
-; LA64-NEXT:    st.d $fp, $sp, 2032 # 8-byte Folded Spill
+; LA64-NEXT:    addi.d $sp, $sp, -2032
+; LA64-NEXT:    .cfi_def_cfa_offset 2032
+; LA64-NEXT:    st.d $ra, $sp, 2024 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $fp, $sp, 2016 # 8-byte Folded Spill
 ; LA64-NEXT:    .cfi_offset 1, -8
 ; LA64-NEXT:    .cfi_offset 22, -16
 ; LA64-NEXT:    addi.d $fp, $sp, 2032
-; LA64-NEXT:    addi.d $fp, $fp, 16
 ; LA64-NEXT:    .cfi_def_cfa 22, 0
+; LA64-NEXT:    addi.d $sp, $sp, -16
 ; LA64-NEXT:    srli.d $a0, $sp, 11
 ; LA64-NEXT:    slli.d $sp, $a0, 11
 ; LA64-NEXT:    addi.d $a0, $sp, 0
 ; LA64-NEXT:    bl %plt(callee)
 ; LA64-NEXT:    addi.d $sp, $fp, -2048
-; LA64-NEXT:    ld.d $fp, $sp, 2032 # 8-byte Folded Reload
-; LA64-NEXT:    ld.d $ra, $sp, 2040 # 8-byte Folded Reload
-; LA64-NEXT:    addi.d $sp, $sp, 2032
 ; LA64-NEXT:    addi.d $sp, $sp, 16
+; LA64-NEXT:    ld.d $fp, $sp, 2016 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $ra, $sp, 2024 # 8-byte Folded Reload
+; LA64-NEXT:    addi.d $sp, $sp, 2032
 ; LA64-NEXT:    ret
   %1 = alloca i8, align 2048
   call void @callee(i8* %1)
@@ -531,66 +531,52 @@ define void @caller_no_realign2048() "no-realign-stack" {
 define void @caller4096() {
 ; LA32-LABEL: caller4096:
 ; LA32:       # %bb.0:
-; LA32-NEXT:    lu12i.w $a0, 1
-; LA32-NEXT:    sub.w $sp, $sp, $a0
-; LA32-NEXT:    .cfi_def_cfa_offset 4096
-; LA32-NEXT:    ori $a0, $zero, 4092
-; LA32-NEXT:    add.w $a0, $sp, $a0
-; LA32-NEXT:    st.w $ra, $a0, 0 # 4-byte Folded Spill
-; LA32-NEXT:    ori $a0, $zero, 4088
-; LA32-NEXT:    add.w $a0, $sp, $a0
-; LA32-NEXT:    st.w $fp, $a0, 0 # 4-byte Folded Spill
+; LA32-NEXT:    addi.w $sp, $sp, -2032
+; LA32-NEXT:    .cfi_def_cfa_offset 2032
+; LA32-NEXT:    st.w $ra, $sp, 2028 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $fp, $sp, 2024 # 4-byte Folded Spill
 ; LA32-NEXT:    .cfi_offset 1, -4
 ; LA32-NEXT:    .cfi_offset 22, -8
-; LA32-NEXT:    lu12i.w $a0, 1
-; LA32-NEXT:    add.w $fp, $sp, $a0
+; LA32-NEXT:    addi.w $fp, $sp, 2032
 ; LA32-NEXT:    .cfi_def_cfa 22, 0
+; LA32-NEXT:    addi.w $sp, $sp, -2048
+; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    srli.w $a0, $sp, 12
 ; LA32-NEXT:    slli.w $sp, $a0, 12
 ; LA32-NEXT:    addi.w $a0, $sp, 0
 ; LA32-NEXT:    bl %plt(callee)
 ; LA32-NEXT:    lu12i.w $a0, 1
 ; LA32-NEXT:    sub.w $sp, $fp, $a0
-; LA32-NEXT:    ori $a0, $zero, 4088
-; LA32-NEXT:    add.w $a0, $sp, $a0
-; LA32-NEXT:    ld.w $fp, $a0, 0 # 4-byte Folded Reload
-; LA32-NEXT:    ori $a0, $zero, 4092
-; LA32-NEXT:    add.w $a0, $sp, $a0
-; LA32-NEXT:    ld.w $ra, $a0, 0 # 4-byte Folded Reload
-; LA32-NEXT:    lu12i.w $a0, 1
-; LA32-NEXT:    add.w $sp, $sp, $a0
+; LA32-NEXT:    addi.w $sp, $sp, 2032
+; LA32-NEXT:    addi.w $sp, $sp, 32
+; LA32-NEXT:    ld.w $fp, $sp, 2024 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $ra, $sp, 2028 # 4-byte Folded Reload
+; LA32-NEXT:    addi.w $sp, $sp, 2032
 ; LA32-NEXT:    ret
 ;
 ; LA64-LABEL: caller4096:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    lu12i.w $a0, 1
-; LA64-NEXT:    sub.d $sp, $sp, $a0
-; LA64-NEXT:    .cfi_def_cfa_offset 4096
-; LA64-NEXT:    ori $a0, $zero, 4088
-; LA64-NEXT:    add.d $a0, $sp, $a0
-; LA64-NEXT:    st.d $ra, $a0, 0 # 8-byte Folded Spill
-; LA64-NEXT:    ori $a0, $zero, 4080
-; LA64-NEXT:    add.d $a0, $sp, $a0
-; LA64-NEXT:    st.d $fp, $a0, 0 # 8-byte Folded Spill
+; LA64-NEXT:    addi.d $sp, $sp, -2032
+; LA64-NEXT:    .cfi_def_cfa_offset 2032
+; LA64-NEXT:    st.d $ra, $sp, 2024 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $fp, $sp, 2016 # 8-byte Folded Spill
 ; LA64-NEXT:    .cfi_offset 1, -8
 ; LA64-NEXT:    .cfi_offset 22, -16
-; LA64-NEXT:    lu12i.w $a0, 1
-; LA64-NEXT:    add.d $fp, $sp, $a0
+; LA64-NEXT:    addi.d $fp, $sp, 2032
 ; LA64-NEXT:    .cfi_def_cfa 22, 0
+; LA64-NEXT:    addi.d $sp, $sp, -2048
+; LA64-NEXT:    addi.d $sp, $sp, -16
 ; LA64-NEXT:    srli.d $a0, $sp, 12
 ; LA64-NEXT:    slli.d $sp, $a0, 12
 ; LA64-NEXT:    addi.d $a0, $sp, 0
 ; LA64-NEXT:    bl %plt(callee)
 ; LA64-NEXT:    lu12i.w $a0, 1
 ; LA64-NEXT:    sub.d $sp, $fp, $a0
-; LA64-NEXT:    ori $a0, $zero, 4080
-; LA64-NEXT:    add.d $a0, $sp, $a0
-; LA64-NEXT:    ld.d $fp, $a0, 0 # 8-byte Folded Reload
-; LA64-NEXT:    ori $a0, $zero, 4088
-; LA64-NEXT:    add.d $a0, $sp, $a0
-; LA64-NEXT:    ld.d $ra, $a0, 0 # 8-byte Folded Reload
-; LA64-NEXT:    lu12i.w $a0, 1
-; LA64-NEXT:    add.d $sp, $sp, $a0
+; LA64-NEXT:    addi.d $sp, $sp, 2032
+; LA64-NEXT:    addi.d $sp, $sp, 32
+; LA64-NEXT:    ld.d $fp, $sp, 2016 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $ra, $sp, 2024 # 8-byte Folded Reload
+; LA64-NEXT:    addi.d $sp, $sp, 2032
 ; LA64-NEXT:    ret
   %1 = alloca i8, align 4096
   call void @callee(i8* %1)