[AArch64] Limit immediate offsets when folding instructions into addressing modes #67345
Conversation
[AArch64] Limit immediate offsets when folding instructions into addressing modes

Don't increase/decrease immediate offsets in folded instructions beyond the limits of `LDP`.

Change-Id: I1228550f21d1e698866b2b0705caae8517bb1def

@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-aarch64

Changes: Don't increase/decrease immediate offsets in folded instructions beyond the limits of `LDP`.

Full diff: https://github.com/llvm/llvm-project/pull/67345.diff

3 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index c6a59ec44ef80e9..3cd7965d8d6fb9c 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -2866,14 +2866,43 @@ bool AArch64InstrInfo::canFoldIntoAddrMode(const MachineInstr &MemI,
}
// Handle memory instructions with a [Reg, #Imm] addressing mode.
- auto canFoldAddSubImmIntoAddrMode = [&](int64_t Offset) -> bool {
- Offset += MemI.getOperand(2).getImm() * OffsetScale;
- if (!isLegalAddressingMode(NumBytes, Offset, /* Scale */ 0))
+
+ // Check we are not breaking a potential conversion to an LDP.
+ auto validateOffsetForLDP = [](unsigned NumBytes, int64_t OldOffset,
+ int64_t NewOffset) -> bool {
+ int64_t MinOffset, MaxOffset;
+ switch (NumBytes) {
+ default:
+ return true;
+ case 4:
+ MinOffset = -256;
+ MaxOffset = 252;
+ break;
+ case 8:
+ MinOffset = -512;
+ MaxOffset = 504;
+ break;
+ case 16:
+ MinOffset = -1024;
+ MaxOffset = 1008;
+ break;
+ }
+ return OldOffset < MinOffset || OldOffset > MaxOffset ||
+ (NewOffset >= MinOffset && NewOffset <= MaxOffset);
+ };
+ auto canFoldAddSubImmIntoAddrMode = [&](int64_t Disp) -> bool {
+ int64_t OldOffset = MemI.getOperand(2).getImm() * OffsetScale;
+ int64_t NewOffset = OldOffset + Disp;
+ if (!isLegalAddressingMode(NumBytes, NewOffset, /* Scale */ 0))
+ return false;
+ // If the old offset would fit into an LDP, but the new offset wouldn't,
+ // bail out.
+ if (!validateOffsetForLDP(NumBytes, OldOffset, NewOffset))
return false;
AM.BaseReg = AddrI.getOperand(1).getReg();
AM.ScaledReg = 0;
AM.Scale = 0;
- AM.Displacement = Offset;
+ AM.Displacement = NewOffset;
AM.Form = ExtAddrMode::Formula::Basic;
return true;
};
@@ -2899,7 +2928,7 @@ bool AArch64InstrInfo::canFoldIntoAddrMode(const MachineInstr &MemI,
Subtarget.isSTRQroSlow();
};
- int64_t Offset = 0;
+ int64_t Disp = 0;
const bool OptSize = MemI.getMF()->getFunction().hasOptSize();
switch (AddrI.getOpcode()) {
default:
@@ -2910,16 +2939,16 @@ bool AArch64InstrInfo::canFoldIntoAddrMode(const MachineInstr &MemI,
// ldr Xd, [Xa, #M]
// ->
// ldr Xd, [Xn, #N'+M]
- Offset = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();
- return canFoldAddSubImmIntoAddrMode(Offset);
+ Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();
+ return canFoldAddSubImmIntoAddrMode(Disp);
case AArch64::SUBXri:
// sub Xa, Xn, #N
// ldr Xd, [Xa, #M]
// ->
// ldr Xd, [Xn, #N'+M]
- Offset = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();
- return canFoldAddSubImmIntoAddrMode(-Offset);
+ Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();
+ return canFoldAddSubImmIntoAddrMode(-Disp);
case AArch64::ADDXrs: {
// add Xa, Xn, Xm, lsl #N
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/split-offsets-for-stp.ll b/llvm/test/CodeGen/AArch64/GlobalISel/split-offsets-for-stp.ll
index 6aaefff1f724062..2c2d72ce6afd079 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/split-offsets-for-stp.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/split-offsets-for-stp.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -mtriple=aarch64-apple-ios -verify-machineinstrs -global-isel -aarch64-postlegalizer-consecutive-memops=0 < %s | FileCheck %s --check-prefix=CHECK-NO-SPLIT
-; RUN: llc -mtriple=aarch64-apple-ios -verify-machineinstrs -global-isel < %s | FileCheck %s --check-prefix=CHECK-SPLIT
+; RUN: llc -mtriple=aarch64-apple-ios -verify-machineinstrs -global-isel -aarch64-enable-sink-fold=true -aarch64-postlegalizer-consecutive-memops=0 < %s | FileCheck %s --check-prefix=CHECK-NO-SPLIT
+; RUN: llc -mtriple=aarch64-apple-ios -verify-machineinstrs -global-isel -aarch64-enable-sink-fold=true < %s | FileCheck %s --check-prefix=CHECK-SPLIT
define void @basic_split(ptr %p) {
; CHECK-NO-SPLIT-LABEL: basic_split:
diff --git a/llvm/test/CodeGen/AArch64/pow.ll b/llvm/test/CodeGen/AArch64/pow.ll
index ba9a5398298af7a..5141b21c7976a90 100644
--- a/llvm/test/CodeGen/AArch64/pow.ll
+++ b/llvm/test/CodeGen/AArch64/pow.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=aarch64-- | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64-- -aarch64-enable-sink-fold=true | FileCheck %s
declare float @llvm.pow.f32(float, float)
declare <4 x float> @llvm.pow.v4f32(<4 x float>, <4 x float>)
@@ -110,17 +110,17 @@ define <2 x double> @pow_v2f64_one_fourth_not_enough_fmf(<2 x double> %x) nounwi
; CHECK-LABEL: pow_v2f64_one_fourth_not_enough_fmf:
; CHECK: // %bb.0:
; CHECK-NEXT: sub sp, sp, #48
-; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: mov d0, v0.d[1]
; CHECK-NEXT: fmov d1, #0.25000000
; CHECK-NEXT: str x30, [sp, #32] // 8-byte Folded Spill
; CHECK-NEXT: bl pow
; CHECK-NEXT: fmov d1, #0.25000000
-; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: bl pow
-; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
; CHECK-NEXT: mov v0.d[1], v1.d[0]
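For context on the constants in `validateOffsetForLDP`: LDP/STP encode their offset as a signed 7-bit immediate scaled by the access size, which is where the [-256, 252], [-512, 504] and [-1024, 1008] ranges for 4-, 8- and 16-byte accesses come from. The standalone sketch below is illustrative only and not part of the patch; the function names `inLDPRange` and `foldKeepsLDP` are hypothetical, and it deliberately ignores the additional requirement that the offset be a multiple of the access size.

```cpp
#include <cstdint>
#include <cstdio>

// True if Offset lies within the scaled signed 7-bit immediate range of an
// LDP/STP of NumBytes-wide registers: imm7 in [-64, 63], scaled by NumBytes.
static bool inLDPRange(unsigned NumBytes, int64_t Offset) {
  int64_t Min = -64 * static_cast<int64_t>(NumBytes); // imm7 == -64
  int64_t Max = 63 * static_cast<int64_t>(NumBytes);  // imm7 == +63
  return Offset >= Min && Offset <= Max;
}

// Mirrors the intent of the check added in this PR: reject a fold that would
// move an offset from inside the pairable range to outside it.
static bool foldKeepsLDP(unsigned NumBytes, int64_t OldOffset, int64_t Disp) {
  return !inLDPRange(NumBytes, OldOffset) ||
         inLDPRange(NumBytes, OldOffset + Disp);
}

int main() {
  // For a 16-byte access at offset 0: folding a displacement of 1008 keeps it
  // pairable, while folding 1024 would not, so that fold would be rejected.
  std::printf("%d %d\n", foldKeepsLDP(16, 0, 1008), foldKeepsLDP(16, 0, 1024));
}
```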
Maybe I'm misunderstanding, but isn't this pessimistic if there isn't another LDR to form an LDP with?
Yes, it's pessimistic from the point of view of "sink-and-fold" and optimistic from the point of view of load/store opt :D (see llvm/lib/CodeGen/MachineSink.cpp, line 507 at 1db42fa). We could add another target hook to check for consecutive loads/stores; however, I'm quite skeptical of how much we would gain for the effort.
Oh I see, this is tweaking a new optimization I didn't have in my local checkout. If canFoldIntoAddrMode() isn't being used elsewhere then it seems fine to me.
Sounds good to me. Thanks.
[AArch64] Limit immediate offsets when folding instructions into addressing modes (llvm#67345)

Don't increase/decrease immediate offsets in folded instructions beyond the limits of `LDP`.