[LoongArch] Split SP adjustment
This patch splits the SP adjustment to reduce the number of instructions in
the prologue and epilogue. This way, the offsets of the callee-saved
register spills can fit in a single store.

Similar to D68011 (RISCV).

Differential Revision: https://reviews.llvm.org/D136222
wangleiat authored and SixWeining committed Oct 28, 2022
1 parent b251b60 commit f589e50
Showing 4 changed files with 157 additions and 55 deletions.
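For reference, below is a minimal standalone C++ sketch (not part of this commit) of the split heuristic the patch implements; the function name firstSPAdjust, the HasCalleeSaves flag, and the fixed 16-byte stack alignment are illustrative assumptions. With a 2048-byte frame it reproduces the 2032 + 16 split exercised by the new split-sp-adjust.ll test further down.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Hypothetical helper mirroring the idea behind getFirstSPAdjustAmount():
// split only when the stack size does not fit in a signed 12-bit immediate
// and there are callee-saved registers to spill.
uint64_t firstSPAdjust(uint64_t StackSize, bool HasCalleeSaves,
                       uint64_t StackAlign = 16) {
  if (StackSize > 2047 && HasCalleeSaves)
    return 2048 - StackAlign; // keeps sp aligned; spill offsets stay < 2048
  return 0;
}

int main() {
  uint64_t RealStackSize = 2048; // as in the SplitSP test below
  uint64_t First = firstSPAdjust(RealStackSize, /*HasCalleeSaves=*/true);
  uint64_t Second = RealStackSize - First;
  assert(First == 2032 && Second == 16);
  // Prologue becomes: addi.d $sp,$sp,-2032 ; st.d $ra,$sp,2024 ; addi.d $sp,$sp,-16
  std::printf("first = %llu, second = %llu\n", (unsigned long long)First,
              (unsigned long long)Second);
  return 0;
}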
71 changes: 70 additions & 1 deletion llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp
@@ -138,11 +138,17 @@ void LoongArchFrameLowering::emitPrologue(MachineFunction &MF,

// First, compute final stack size.
uint64_t StackSize = MFI.getStackSize();
uint64_t RealStackSize = StackSize;

// Early exit if there is no need to allocate space in the stack.
if (StackSize == 0 && !MFI.adjustsStack())
return;

uint64_t FirstSPAdjustAmount = getFirstSPAdjustAmount(MF);
// Split the SP adjustment to reduce the offsets of callee-saved register spills.
if (FirstSPAdjustAmount)
StackSize = FirstSPAdjustAmount;

// Adjust stack.
adjustReg(MBB, MBBI, DL, SPReg, SPReg, -StackSize, MachineInstr::FrameSetup);
// Emit ".cfi_def_cfa_offset StackSize".
@@ -184,7 +190,29 @@ void LoongArchFrameLowering::emitPrologue(MachineFunction &MF,
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlag(MachineInstr::FrameSetup);
}

// Emit the second SP adjustment after saving callee saved registers.
if (FirstSPAdjustAmount) {
uint64_t SecondSPAdjustAmount = RealStackSize - FirstSPAdjustAmount;
assert(SecondSPAdjustAmount > 0 &&
"SecondSPAdjustAmount should be greater than zero");
adjustReg(MBB, MBBI, DL, SPReg, SPReg, -SecondSPAdjustAmount,
MachineInstr::FrameSetup);

if (!hasFP(MF)) {
// If we are using a frame pointer, ".cfi_def_cfa fp, 0" has already been
// emitted, so don't emit an sp-based ".cfi_def_cfa_offset" here.
// Otherwise, emit ".cfi_def_cfa_offset RealStackSize".
unsigned CFIIndex = MF.addFrameInst(
MCCFIInstruction::cfiDefCfaOffset(nullptr, RealStackSize));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlag(MachineInstr::FrameSetup);
}
}

if (hasFP(MF)) {
// Realign stack.
if (RI->hasStackRealignment(MF)) {
unsigned ShiftAmount = Log2(MFI.getMaxAlign());
@@ -244,10 +272,47 @@ void LoongArchFrameLowering::emitEpilogue(MachineFunction &MF,
MachineInstr::FrameDestroy);
}

uint64_t FirstSPAdjustAmount = getFirstSPAdjustAmount(MF);
if (FirstSPAdjustAmount) {
uint64_t SecondSPAdjustAmount = StackSize - FirstSPAdjustAmount;
assert(SecondSPAdjustAmount > 0 &&
"SecondSPAdjustAmount should be greater than zero");

adjustReg(MBB, LastFrameDestroy, DL, SPReg, SPReg, SecondSPAdjustAmount,
MachineInstr::FrameDestroy);
StackSize = FirstSPAdjustAmount;
}

// Deallocate stack
adjustReg(MBB, MBBI, DL, SPReg, SPReg, StackSize, MachineInstr::FrameDestroy);
}

// We would like to split the SP adjustment to reduce the number of prologue
// and epilogue instructions, as in the sequence below. In this way, the
// offsets of the callee-saved register spills fit in a single store.
// e.g.
// addi.d $sp, $sp, -2032
// st.d $ra, $sp, 2024
// st.d $fp, $sp, 2016
// addi.d $sp, $sp, -16
uint64_t LoongArchFrameLowering::getFirstSPAdjustAmount(
const MachineFunction &MF) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();

// Return the FirstSPAdjustAmount if the StackSize cannot fit in a signed
// 12-bit immediate and there is at least one callee-saved register to push.
if (!isInt<12>(MFI.getStackSize()) && (CSI.size() > 0)) {
// FirstSPAdjustAmount is chosen as (2048 - StackAlign) because 2048 would
// cause sp = sp + 2048 in the epilogue to be split into multiple
// instructions. Offsets smaller than 2048 fit in a single load/store
// instruction, and the stack pointer must stay aligned, so
// (2048 - StackAlign) satisfies both constraints.
return 2048 - getStackAlign().value();
}
return 0;
}

void LoongArchFrameLowering::determineCalleeSaves(MachineFunction &MF,
BitVector &SavedRegs,
RegScavenger *RS) const {
@@ -307,6 +372,7 @@ StackOffset LoongArchFrameLowering::getFrameIndexReference(
const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
auto *LoongArchFI = MF.getInfo<LoongArchMachineFunctionInfo>();
uint64_t StackSize = MFI.getStackSize();
uint64_t FirstSPAdjustAmount = getFirstSPAdjustAmount(MF);

// Callee-saved registers should be referenced relative to the stack
// pointer (positive offset), otherwise use the frame pointer (negative
@@ -325,7 +391,10 @@ StackOffset LoongArchFrameLowering::getFrameIndexReference(

if (FI >= MinCSFI && FI <= MaxCSFI) {
FrameReg = LoongArch::R3;
Offset += StackOffset::getFixed(StackSize);
if (FirstSPAdjustAmount)
Offset += StackOffset::getFixed(FirstSPAdjustAmount);
else
Offset += StackOffset::getFixed(StackSize);
} else if (RI->hasStackRealignment(MF) && !MFI.isFixedObjectIndex(FI)) {
// If the stack was realigned, the frame pointer is set in order to allow
// SP to be restored, so we need another base register to record the stack
2 changes: 2 additions & 0 deletions llvm/lib/Target/LoongArch/LoongArchFrameLowering.h
@@ -45,6 +45,8 @@ class LoongArchFrameLowering : public TargetFrameLowering {
bool hasFP(const MachineFunction &MF) const override;
bool hasBP(const MachineFunction &MF) const;

uint64_t getFirstSPAdjustAmount(const MachineFunction &MF) const;

private:
void determineFrameLayout(MachineFunction &MF) const;
void adjustReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
45 changes: 45 additions & 0 deletions llvm/test/CodeGen/LoongArch/split-sp-adjust.ll
@@ -0,0 +1,45 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc --mtriple=loongarch64 --verify-machineinstrs < %s \
; RUN: | FileCheck %s

;; The stack size is 2048 and the SP adjustment will be split.
define i32 @SplitSP() nounwind {
; CHECK-LABEL: SplitSP:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: addi.d $sp, $sp, -2032
; CHECK-NEXT: st.d $ra, $sp, 2024 # 8-byte Folded Spill
; CHECK-NEXT: addi.d $sp, $sp, -16
; CHECK-NEXT: addi.d $a0, $sp, 12
; CHECK-NEXT: bl %plt(foo)
; CHECK-NEXT: move $a0, $zero
; CHECK-NEXT: addi.d $sp, $sp, 16
; CHECK-NEXT: ld.d $ra, $sp, 2024 # 8-byte Folded Reload
; CHECK-NEXT: addi.d $sp, $sp, 2032
; CHECK-NEXT: ret
entry:
%xx = alloca [2028 x i8], align 1
%0 = getelementptr inbounds [2028 x i8], ptr %xx, i32 0, i32 0
%call = call i32 @foo(ptr nonnull %0)
ret i32 0
}

;; The stack size is 2032 and the SP adjustment will not be split.
define i32 @NoSplitSP() nounwind {
; CHECK-LABEL: NoSplitSP:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: addi.d $sp, $sp, -2032
; CHECK-NEXT: st.d $ra, $sp, 2024 # 8-byte Folded Spill
; CHECK-NEXT: addi.d $a0, $sp, 0
; CHECK-NEXT: bl %plt(foo)
; CHECK-NEXT: move $a0, $zero
; CHECK-NEXT: ld.d $ra, $sp, 2024 # 8-byte Folded Reload
; CHECK-NEXT: addi.d $sp, $sp, 2032
; CHECK-NEXT: ret
entry:
%xx = alloca [2024 x i8], align 1
%0 = getelementptr inbounds [2024 x i8], ptr %xx, i32 0, i32 0
%call = call i32 @foo(ptr nonnull %0)
ret i32 0
}

declare i32 @foo(ptr)
94 changes: 40 additions & 54 deletions llvm/test/CodeGen/LoongArch/stack-realignment.ll
@@ -453,46 +453,46 @@ define void @caller_no_realign1024() "no-realign-stack" {
define void @caller2048() {
; LA32-LABEL: caller2048:
; LA32: # %bb.0:
; LA32-NEXT: addi.w $sp, $sp, -2048
; LA32-NEXT: .cfi_def_cfa_offset 2048
; LA32-NEXT: st.w $ra, $sp, 2044 # 4-byte Folded Spill
; LA32-NEXT: st.w $fp, $sp, 2040 # 4-byte Folded Spill
; LA32-NEXT: addi.w $sp, $sp, -2032
; LA32-NEXT: .cfi_def_cfa_offset 2032
; LA32-NEXT: st.w $ra, $sp, 2028 # 4-byte Folded Spill
; LA32-NEXT: st.w $fp, $sp, 2024 # 4-byte Folded Spill
; LA32-NEXT: .cfi_offset 1, -4
; LA32-NEXT: .cfi_offset 22, -8
; LA32-NEXT: addi.w $fp, $sp, 2032
; LA32-NEXT: addi.w $fp, $fp, 16
; LA32-NEXT: .cfi_def_cfa 22, 0
; LA32-NEXT: addi.w $sp, $sp, -16
; LA32-NEXT: srli.w $a0, $sp, 11
; LA32-NEXT: slli.w $sp, $a0, 11
; LA32-NEXT: addi.w $a0, $sp, 0
; LA32-NEXT: bl %plt(callee)
; LA32-NEXT: addi.w $sp, $fp, -2048
; LA32-NEXT: ld.w $fp, $sp, 2040 # 4-byte Folded Reload
; LA32-NEXT: ld.w $ra, $sp, 2044 # 4-byte Folded Reload
; LA32-NEXT: addi.w $sp, $sp, 2032
; LA32-NEXT: addi.w $sp, $sp, 16
; LA32-NEXT: ld.w $fp, $sp, 2024 # 4-byte Folded Reload
; LA32-NEXT: ld.w $ra, $sp, 2028 # 4-byte Folded Reload
; LA32-NEXT: addi.w $sp, $sp, 2032
; LA32-NEXT: ret
;
; LA64-LABEL: caller2048:
; LA64: # %bb.0:
; LA64-NEXT: addi.d $sp, $sp, -2048
; LA64-NEXT: .cfi_def_cfa_offset 2048
; LA64-NEXT: st.d $ra, $sp, 2040 # 8-byte Folded Spill
; LA64-NEXT: st.d $fp, $sp, 2032 # 8-byte Folded Spill
; LA64-NEXT: addi.d $sp, $sp, -2032
; LA64-NEXT: .cfi_def_cfa_offset 2032
; LA64-NEXT: st.d $ra, $sp, 2024 # 8-byte Folded Spill
; LA64-NEXT: st.d $fp, $sp, 2016 # 8-byte Folded Spill
; LA64-NEXT: .cfi_offset 1, -8
; LA64-NEXT: .cfi_offset 22, -16
; LA64-NEXT: addi.d $fp, $sp, 2032
; LA64-NEXT: addi.d $fp, $fp, 16
; LA64-NEXT: .cfi_def_cfa 22, 0
; LA64-NEXT: addi.d $sp, $sp, -16
; LA64-NEXT: srli.d $a0, $sp, 11
; LA64-NEXT: slli.d $sp, $a0, 11
; LA64-NEXT: addi.d $a0, $sp, 0
; LA64-NEXT: bl %plt(callee)
; LA64-NEXT: addi.d $sp, $fp, -2048
; LA64-NEXT: ld.d $fp, $sp, 2032 # 8-byte Folded Reload
; LA64-NEXT: ld.d $ra, $sp, 2040 # 8-byte Folded Reload
; LA64-NEXT: addi.d $sp, $sp, 2032
; LA64-NEXT: addi.d $sp, $sp, 16
; LA64-NEXT: ld.d $fp, $sp, 2016 # 8-byte Folded Reload
; LA64-NEXT: ld.d $ra, $sp, 2024 # 8-byte Folded Reload
; LA64-NEXT: addi.d $sp, $sp, 2032
; LA64-NEXT: ret
%1 = alloca i8, align 2048
call void @callee(i8* %1)
@@ -531,66 +531,52 @@ define void @caller_no_realign2048() "no-realign-stack" {
define void @caller4096() {
; LA32-LABEL: caller4096:
; LA32: # %bb.0:
; LA32-NEXT: lu12i.w $a0, 1
; LA32-NEXT: sub.w $sp, $sp, $a0
; LA32-NEXT: .cfi_def_cfa_offset 4096
; LA32-NEXT: ori $a0, $zero, 4092
; LA32-NEXT: add.w $a0, $sp, $a0
; LA32-NEXT: st.w $ra, $a0, 0 # 4-byte Folded Spill
; LA32-NEXT: ori $a0, $zero, 4088
; LA32-NEXT: add.w $a0, $sp, $a0
; LA32-NEXT: st.w $fp, $a0, 0 # 4-byte Folded Spill
; LA32-NEXT: addi.w $sp, $sp, -2032
; LA32-NEXT: .cfi_def_cfa_offset 2032
; LA32-NEXT: st.w $ra, $sp, 2028 # 4-byte Folded Spill
; LA32-NEXT: st.w $fp, $sp, 2024 # 4-byte Folded Spill
; LA32-NEXT: .cfi_offset 1, -4
; LA32-NEXT: .cfi_offset 22, -8
; LA32-NEXT: lu12i.w $a0, 1
; LA32-NEXT: add.w $fp, $sp, $a0
; LA32-NEXT: addi.w $fp, $sp, 2032
; LA32-NEXT: .cfi_def_cfa 22, 0
; LA32-NEXT: addi.w $sp, $sp, -2048
; LA32-NEXT: addi.w $sp, $sp, -16
; LA32-NEXT: srli.w $a0, $sp, 12
; LA32-NEXT: slli.w $sp, $a0, 12
; LA32-NEXT: addi.w $a0, $sp, 0
; LA32-NEXT: bl %plt(callee)
; LA32-NEXT: lu12i.w $a0, 1
; LA32-NEXT: sub.w $sp, $fp, $a0
; LA32-NEXT: ori $a0, $zero, 4088
; LA32-NEXT: add.w $a0, $sp, $a0
; LA32-NEXT: ld.w $fp, $a0, 0 # 4-byte Folded Reload
; LA32-NEXT: ori $a0, $zero, 4092
; LA32-NEXT: add.w $a0, $sp, $a0
; LA32-NEXT: ld.w $ra, $a0, 0 # 4-byte Folded Reload
; LA32-NEXT: lu12i.w $a0, 1
; LA32-NEXT: add.w $sp, $sp, $a0
; LA32-NEXT: addi.w $sp, $sp, 2032
; LA32-NEXT: addi.w $sp, $sp, 32
; LA32-NEXT: ld.w $fp, $sp, 2024 # 4-byte Folded Reload
; LA32-NEXT: ld.w $ra, $sp, 2028 # 4-byte Folded Reload
; LA32-NEXT: addi.w $sp, $sp, 2032
; LA32-NEXT: ret
;
; LA64-LABEL: caller4096:
; LA64: # %bb.0:
; LA64-NEXT: lu12i.w $a0, 1
; LA64-NEXT: sub.d $sp, $sp, $a0
; LA64-NEXT: .cfi_def_cfa_offset 4096
; LA64-NEXT: ori $a0, $zero, 4088
; LA64-NEXT: add.d $a0, $sp, $a0
; LA64-NEXT: st.d $ra, $a0, 0 # 8-byte Folded Spill
; LA64-NEXT: ori $a0, $zero, 4080
; LA64-NEXT: add.d $a0, $sp, $a0
; LA64-NEXT: st.d $fp, $a0, 0 # 8-byte Folded Spill
; LA64-NEXT: addi.d $sp, $sp, -2032
; LA64-NEXT: .cfi_def_cfa_offset 2032
; LA64-NEXT: st.d $ra, $sp, 2024 # 8-byte Folded Spill
; LA64-NEXT: st.d $fp, $sp, 2016 # 8-byte Folded Spill
; LA64-NEXT: .cfi_offset 1, -8
; LA64-NEXT: .cfi_offset 22, -16
; LA64-NEXT: lu12i.w $a0, 1
; LA64-NEXT: add.d $fp, $sp, $a0
; LA64-NEXT: addi.d $fp, $sp, 2032
; LA64-NEXT: .cfi_def_cfa 22, 0
; LA64-NEXT: addi.d $sp, $sp, -2048
; LA64-NEXT: addi.d $sp, $sp, -16
; LA64-NEXT: srli.d $a0, $sp, 12
; LA64-NEXT: slli.d $sp, $a0, 12
; LA64-NEXT: addi.d $a0, $sp, 0
; LA64-NEXT: bl %plt(callee)
; LA64-NEXT: lu12i.w $a0, 1
; LA64-NEXT: sub.d $sp, $fp, $a0
; LA64-NEXT: ori $a0, $zero, 4080
; LA64-NEXT: add.d $a0, $sp, $a0
; LA64-NEXT: ld.d $fp, $a0, 0 # 8-byte Folded Reload
; LA64-NEXT: ori $a0, $zero, 4088
; LA64-NEXT: add.d $a0, $sp, $a0
; LA64-NEXT: ld.d $ra, $a0, 0 # 8-byte Folded Reload
; LA64-NEXT: lu12i.w $a0, 1
; LA64-NEXT: add.d $sp, $sp, $a0
; LA64-NEXT: addi.d $sp, $sp, 2032
; LA64-NEXT: addi.d $sp, $sp, 32
; LA64-NEXT: ld.d $fp, $sp, 2016 # 8-byte Folded Reload
; LA64-NEXT: ld.d $ra, $sp, 2024 # 8-byte Folded Reload
; LA64-NEXT: addi.d $sp, $sp, 2032
; LA64-NEXT: ret
%1 = alloca i8, align 4096
call void @callee(i8* %1)