From f589e5067fc8c15d8fc228169055c63ff29b2c14 Mon Sep 17 00:00:00 2001 From: wanglei Date: Fri, 28 Oct 2022 16:18:40 +0800 Subject: [PATCH] [LoongArch] Split SP adjustment This patch splits the SP adjustment to reduce the number of instructions in the prologue and epilogue. In this way, the offset of the callee saved register can fit in a single store. Similar to D68011 (RISCV). Differential Revision: https://reviews.llvm.org/D136222 --- .../LoongArch/LoongArchFrameLowering.cpp | 71 +++++++++++++- .../Target/LoongArch/LoongArchFrameLowering.h | 2 + .../test/CodeGen/LoongArch/split-sp-adjust.ll | 45 +++++++++ .../CodeGen/LoongArch/stack-realignment.ll | 94 ++++++++----------- 4 files changed, 157 insertions(+), 55 deletions(-) create mode 100644 llvm/test/CodeGen/LoongArch/split-sp-adjust.ll diff --git a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp index 45472157b4821..e8985d9282432 100644 --- a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp @@ -138,11 +138,17 @@ void LoongArchFrameLowering::emitPrologue(MachineFunction &MF, // First, compute final stack size. uint64_t StackSize = MFI.getStackSize(); + uint64_t RealStackSize = StackSize; // Early exit if there is no need to allocate space in the stack. if (StackSize == 0 && !MFI.adjustsStack()) return; + uint64_t FirstSPAdjustAmount = getFirstSPAdjustAmount(MF); + // Split the SP adjustment to reduce the offsets of callee saved spill. + if (FirstSPAdjustAmount) + StackSize = FirstSPAdjustAmount; + // Adjust stack. adjustReg(MBB, MBBI, DL, SPReg, SPReg, -StackSize, MachineInstr::FrameSetup); // Emit ".cfi_def_cfa_offset StackSize". 
@@ -184,7 +190,29 @@ void LoongArchFrameLowering::emitPrologue(MachineFunction &MF, BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) .setMIFlag(MachineInstr::FrameSetup); + } + + // Emit the second SP adjustment after saving callee saved registers. + if (FirstSPAdjustAmount) { + uint64_t SecondSPAdjustAmount = RealStackSize - FirstSPAdjustAmount; + assert(SecondSPAdjustAmount > 0 && + "SecondSPAdjustAmount should be greater than zero"); + adjustReg(MBB, MBBI, DL, SPReg, SPReg, -SecondSPAdjustAmount, + MachineInstr::FrameSetup); + if (!hasFP(MF)) { + // If we are using a frame-pointer, and thus emitted ".cfi_def_cfa fp, 0", + // don't emit an sp-based .cfi_def_cfa_offset + // Emit ".cfi_def_cfa_offset RealStackSize" + unsigned CFIIndex = MF.addFrameInst( + MCCFIInstruction::cfiDefCfaOffset(nullptr, RealStackSize)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlag(MachineInstr::FrameSetup); + } + } + + if (hasFP(MF)) { // Realign stack. if (RI->hasStackRealignment(MF)) { unsigned ShiftAmount = Log2(MFI.getMaxAlign()); @@ -244,10 +272,47 @@ void LoongArchFrameLowering::emitEpilogue(MachineFunction &MF, MachineInstr::FrameDestroy); } + uint64_t FirstSPAdjustAmount = getFirstSPAdjustAmount(MF); + if (FirstSPAdjustAmount) { + uint64_t SecondSPAdjustAmount = StackSize - FirstSPAdjustAmount; + assert(SecondSPAdjustAmount > 0 && + "SecondSPAdjustAmount should be greater than zero"); + + adjustReg(MBB, LastFrameDestroy, DL, SPReg, SPReg, SecondSPAdjustAmount, + MachineInstr::FrameDestroy); + StackSize = FirstSPAdjustAmount; + } + // Deallocate stack adjustReg(MBB, MBBI, DL, SPReg, SPReg, StackSize, MachineInstr::FrameDestroy); } +// We would like to split the SP adjustment to reduce the prologue/epilogue +// to a sequence like the one below. In this way, the offset of the callee +// saved register can fit in a single store. +// e.g. 
+// addi.d $sp, $sp, -2032 +// st.d $ra, $sp, 2024 +// st.d $fp, $sp, 2016 +// addi.d $sp, $sp, -16 +uint64_t LoongArchFrameLowering::getFirstSPAdjustAmount( + const MachineFunction &MF) const { + const MachineFrameInfo &MFI = MF.getFrameInfo(); + const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo(); + + // Return the FirstSPAdjustAmount if the StackSize can not fit in a signed + // 12-bit and there exists a callee-saved register needing to be pushed. + if (!isInt<12>(MFI.getStackSize()) && (CSI.size() > 0)) { + // FirstSPAdjustAmount is chosen as (2048 - StackAlign) because 2048 will + // cause sp = sp + 2048 in the epilogue to be split into multiple + // instructions. Offsets smaller than 2048 can fit in a single load/store + // instruction, and we have to stick with the stack alignment. + // So (2048 - StackAlign) will satisfy the stack alignment. + return 2048 - getStackAlign().value(); + } + return 0; +} + void LoongArchFrameLowering::determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS) const { @@ -307,6 +372,7 @@ StackOffset LoongArchFrameLowering::getFrameIndexReference( const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo(); auto *LoongArchFI = MF.getInfo<LoongArchMachineFunctionInfo>(); uint64_t StackSize = MFI.getStackSize(); + uint64_t FirstSPAdjustAmount = getFirstSPAdjustAmount(MF); // Callee-saved registers should be referenced relative to the stack // pointer (positive offset), otherwise use the frame pointer (negative @@ -325,7 +391,10 @@ StackOffset LoongArchFrameLowering::getFrameIndexReference( if (FI >= MinCSFI && FI <= MaxCSFI) { FrameReg = LoongArch::R3; - Offset += StackOffset::getFixed(StackSize); + if (FirstSPAdjustAmount) + Offset += StackOffset::getFixed(FirstSPAdjustAmount); + else + Offset += StackOffset::getFixed(StackSize); } else if (RI->hasStackRealignment(MF) && !MFI.isFixedObjectIndex(FI)) { // If the stack was realigned, the frame pointer is set in order to allow // SP to be restored, so we need another base register to 
record the stack diff --git a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.h b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.h index e1e3e260f97a2..7ef79aaf32999 100644 --- a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.h +++ b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.h @@ -45,6 +45,8 @@ class LoongArchFrameLowering : public TargetFrameLowering { bool hasFP(const MachineFunction &MF) const override; bool hasBP(const MachineFunction &MF) const; + uint64_t getFirstSPAdjustAmount(const MachineFunction &MF) const; + private: void determineFrameLayout(MachineFunction &MF) const; void adjustReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, diff --git a/llvm/test/CodeGen/LoongArch/split-sp-adjust.ll b/llvm/test/CodeGen/LoongArch/split-sp-adjust.ll new file mode 100644 index 0000000000000..093c92b0dadbb --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/split-sp-adjust.ll @@ -0,0 +1,45 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch64 --verify-machineinstrs < %s \ +; RUN: | FileCheck %s + +;; The stack size is 2048 and the SP adjustment will be split. +define i32 @SplitSP() nounwind { +; CHECK-LABEL: SplitSP: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addi.d $sp, $sp, -2032 +; CHECK-NEXT: st.d $ra, $sp, 2024 # 8-byte Folded Spill +; CHECK-NEXT: addi.d $sp, $sp, -16 +; CHECK-NEXT: addi.d $a0, $sp, 12 +; CHECK-NEXT: bl %plt(foo) +; CHECK-NEXT: move $a0, $zero +; CHECK-NEXT: addi.d $sp, $sp, 16 +; CHECK-NEXT: ld.d $ra, $sp, 2024 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 2032 +; CHECK-NEXT: ret +entry: + %xx = alloca [2028 x i8], align 1 + %0 = getelementptr inbounds [2028 x i8], ptr %xx, i32 0, i32 0 + %call = call i32 @foo(ptr nonnull %0) + ret i32 0 +} + +;; The stack size is 2032 and the SP adjustment will not be split. 
+define i32 @NoSplitSP() nounwind { +; CHECK-LABEL: NoSplitSP: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addi.d $sp, $sp, -2032 +; CHECK-NEXT: st.d $ra, $sp, 2024 # 8-byte Folded Spill +; CHECK-NEXT: addi.d $a0, $sp, 0 +; CHECK-NEXT: bl %plt(foo) +; CHECK-NEXT: move $a0, $zero +; CHECK-NEXT: ld.d $ra, $sp, 2024 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 2032 +; CHECK-NEXT: ret +entry: + %xx = alloca [2024 x i8], align 1 + %0 = getelementptr inbounds [2024 x i8], ptr %xx, i32 0, i32 0 + %call = call i32 @foo(ptr nonnull %0) + ret i32 0 +} + +declare i32 @foo(ptr) diff --git a/llvm/test/CodeGen/LoongArch/stack-realignment.ll b/llvm/test/CodeGen/LoongArch/stack-realignment.ll index 16c7bcd8b1c5a..89672fbe8b0c2 100644 --- a/llvm/test/CodeGen/LoongArch/stack-realignment.ll +++ b/llvm/test/CodeGen/LoongArch/stack-realignment.ll @@ -453,46 +453,46 @@ define void @caller_no_realign1024() "no-realign-stack" { define void @caller2048() { ; LA32-LABEL: caller2048: ; LA32: # %bb.0: -; LA32-NEXT: addi.w $sp, $sp, -2048 -; LA32-NEXT: .cfi_def_cfa_offset 2048 -; LA32-NEXT: st.w $ra, $sp, 2044 # 4-byte Folded Spill -; LA32-NEXT: st.w $fp, $sp, 2040 # 4-byte Folded Spill +; LA32-NEXT: addi.w $sp, $sp, -2032 +; LA32-NEXT: .cfi_def_cfa_offset 2032 +; LA32-NEXT: st.w $ra, $sp, 2028 # 4-byte Folded Spill +; LA32-NEXT: st.w $fp, $sp, 2024 # 4-byte Folded Spill ; LA32-NEXT: .cfi_offset 1, -4 ; LA32-NEXT: .cfi_offset 22, -8 ; LA32-NEXT: addi.w $fp, $sp, 2032 -; LA32-NEXT: addi.w $fp, $fp, 16 ; LA32-NEXT: .cfi_def_cfa 22, 0 +; LA32-NEXT: addi.w $sp, $sp, -16 ; LA32-NEXT: srli.w $a0, $sp, 11 ; LA32-NEXT: slli.w $sp, $a0, 11 ; LA32-NEXT: addi.w $a0, $sp, 0 ; LA32-NEXT: bl %plt(callee) ; LA32-NEXT: addi.w $sp, $fp, -2048 -; LA32-NEXT: ld.w $fp, $sp, 2040 # 4-byte Folded Reload -; LA32-NEXT: ld.w $ra, $sp, 2044 # 4-byte Folded Reload -; LA32-NEXT: addi.w $sp, $sp, 2032 ; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ld.w $fp, $sp, 2024 # 4-byte Folded Reload +; LA32-NEXT: ld.w $ra, 
$sp, 2028 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 2032 ; LA32-NEXT: ret ; ; LA64-LABEL: caller2048: ; LA64: # %bb.0: -; LA64-NEXT: addi.d $sp, $sp, -2048 -; LA64-NEXT: .cfi_def_cfa_offset 2048 -; LA64-NEXT: st.d $ra, $sp, 2040 # 8-byte Folded Spill -; LA64-NEXT: st.d $fp, $sp, 2032 # 8-byte Folded Spill +; LA64-NEXT: addi.d $sp, $sp, -2032 +; LA64-NEXT: .cfi_def_cfa_offset 2032 +; LA64-NEXT: st.d $ra, $sp, 2024 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 2016 # 8-byte Folded Spill ; LA64-NEXT: .cfi_offset 1, -8 ; LA64-NEXT: .cfi_offset 22, -16 ; LA64-NEXT: addi.d $fp, $sp, 2032 -; LA64-NEXT: addi.d $fp, $fp, 16 ; LA64-NEXT: .cfi_def_cfa 22, 0 +; LA64-NEXT: addi.d $sp, $sp, -16 ; LA64-NEXT: srli.d $a0, $sp, 11 ; LA64-NEXT: slli.d $sp, $a0, 11 ; LA64-NEXT: addi.d $a0, $sp, 0 ; LA64-NEXT: bl %plt(callee) ; LA64-NEXT: addi.d $sp, $fp, -2048 -; LA64-NEXT: ld.d $fp, $sp, 2032 # 8-byte Folded Reload -; LA64-NEXT: ld.d $ra, $sp, 2040 # 8-byte Folded Reload -; LA64-NEXT: addi.d $sp, $sp, 2032 ; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ld.d $fp, $sp, 2016 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 2024 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 2032 ; LA64-NEXT: ret %1 = alloca i8, align 2048 call void @callee(i8* %1) @@ -531,66 +531,52 @@ define void @caller_no_realign2048() "no-realign-stack" { define void @caller4096() { ; LA32-LABEL: caller4096: ; LA32: # %bb.0: -; LA32-NEXT: lu12i.w $a0, 1 -; LA32-NEXT: sub.w $sp, $sp, $a0 -; LA32-NEXT: .cfi_def_cfa_offset 4096 -; LA32-NEXT: ori $a0, $zero, 4092 -; LA32-NEXT: add.w $a0, $sp, $a0 -; LA32-NEXT: st.w $ra, $a0, 0 # 4-byte Folded Spill -; LA32-NEXT: ori $a0, $zero, 4088 -; LA32-NEXT: add.w $a0, $sp, $a0 -; LA32-NEXT: st.w $fp, $a0, 0 # 4-byte Folded Spill +; LA32-NEXT: addi.w $sp, $sp, -2032 +; LA32-NEXT: .cfi_def_cfa_offset 2032 +; LA32-NEXT: st.w $ra, $sp, 2028 # 4-byte Folded Spill +; LA32-NEXT: st.w $fp, $sp, 2024 # 4-byte Folded Spill ; LA32-NEXT: .cfi_offset 1, -4 ; 
LA32-NEXT: .cfi_offset 22, -8 -; LA32-NEXT: lu12i.w $a0, 1 -; LA32-NEXT: add.w $fp, $sp, $a0 +; LA32-NEXT: addi.w $fp, $sp, 2032 ; LA32-NEXT: .cfi_def_cfa 22, 0 +; LA32-NEXT: addi.w $sp, $sp, -2048 +; LA32-NEXT: addi.w $sp, $sp, -16 ; LA32-NEXT: srli.w $a0, $sp, 12 ; LA32-NEXT: slli.w $sp, $a0, 12 ; LA32-NEXT: addi.w $a0, $sp, 0 ; LA32-NEXT: bl %plt(callee) ; LA32-NEXT: lu12i.w $a0, 1 ; LA32-NEXT: sub.w $sp, $fp, $a0 -; LA32-NEXT: ori $a0, $zero, 4088 -; LA32-NEXT: add.w $a0, $sp, $a0 -; LA32-NEXT: ld.w $fp, $a0, 0 # 4-byte Folded Reload -; LA32-NEXT: ori $a0, $zero, 4092 -; LA32-NEXT: add.w $a0, $sp, $a0 -; LA32-NEXT: ld.w $ra, $a0, 0 # 4-byte Folded Reload -; LA32-NEXT: lu12i.w $a0, 1 -; LA32-NEXT: add.w $sp, $sp, $a0 +; LA32-NEXT: addi.w $sp, $sp, 2032 +; LA32-NEXT: addi.w $sp, $sp, 32 +; LA32-NEXT: ld.w $fp, $sp, 2024 # 4-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 2028 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 2032 ; LA32-NEXT: ret ; ; LA64-LABEL: caller4096: ; LA64: # %bb.0: -; LA64-NEXT: lu12i.w $a0, 1 -; LA64-NEXT: sub.d $sp, $sp, $a0 -; LA64-NEXT: .cfi_def_cfa_offset 4096 -; LA64-NEXT: ori $a0, $zero, 4088 -; LA64-NEXT: add.d $a0, $sp, $a0 -; LA64-NEXT: st.d $ra, $a0, 0 # 8-byte Folded Spill -; LA64-NEXT: ori $a0, $zero, 4080 -; LA64-NEXT: add.d $a0, $sp, $a0 -; LA64-NEXT: st.d $fp, $a0, 0 # 8-byte Folded Spill +; LA64-NEXT: addi.d $sp, $sp, -2032 +; LA64-NEXT: .cfi_def_cfa_offset 2032 +; LA64-NEXT: st.d $ra, $sp, 2024 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 2016 # 8-byte Folded Spill ; LA64-NEXT: .cfi_offset 1, -8 ; LA64-NEXT: .cfi_offset 22, -16 -; LA64-NEXT: lu12i.w $a0, 1 -; LA64-NEXT: add.d $fp, $sp, $a0 +; LA64-NEXT: addi.d $fp, $sp, 2032 ; LA64-NEXT: .cfi_def_cfa 22, 0 +; LA64-NEXT: addi.d $sp, $sp, -2048 +; LA64-NEXT: addi.d $sp, $sp, -16 ; LA64-NEXT: srli.d $a0, $sp, 12 ; LA64-NEXT: slli.d $sp, $a0, 12 ; LA64-NEXT: addi.d $a0, $sp, 0 ; LA64-NEXT: bl %plt(callee) ; LA64-NEXT: lu12i.w $a0, 1 ; LA64-NEXT: sub.d $sp, $fp, 
$a0 -; LA64-NEXT: ori $a0, $zero, 4080 -; LA64-NEXT: add.d $a0, $sp, $a0 -; LA64-NEXT: ld.d $fp, $a0, 0 # 8-byte Folded Reload -; LA64-NEXT: ori $a0, $zero, 4088 -; LA64-NEXT: add.d $a0, $sp, $a0 -; LA64-NEXT: ld.d $ra, $a0, 0 # 8-byte Folded Reload -; LA64-NEXT: lu12i.w $a0, 1 -; LA64-NEXT: add.d $sp, $sp, $a0 +; LA64-NEXT: addi.d $sp, $sp, 2032 +; LA64-NEXT: addi.d $sp, $sp, 32 +; LA64-NEXT: ld.d $fp, $sp, 2016 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 2024 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 2032 ; LA64-NEXT: ret %1 = alloca i8, align 4096 call void @callee(i8* %1)