[AArch64] Limit immediate offsets when folding instructions into addressing modes #67345
Conversation
[AArch64] Limit immediate offsets when folding instructions into addressing modes

Don't increase/decrease immediate offsets in folded instructions beyond the limits of `LDP`.

Change-Id: I1228550f21d1e698866b2b0705caae8517bb1def

@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-aarch64

Changes: Don't increase/decrease immediate offsets in folded instructions beyond the limits of `LDP`.

Full diff: https://github.com/llvm/llvm-project/pull/67345.diff

3 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index c6a59ec44ef80e9..3cd7965d8d6fb9c 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -2866,14 +2866,43 @@ bool AArch64InstrInfo::canFoldIntoAddrMode(const MachineInstr &MemI,
}
// Handle memory instructions with a [Reg, #Imm] addressing mode.
- auto canFoldAddSubImmIntoAddrMode = [&](int64_t Offset) -> bool {
- Offset += MemI.getOperand(2).getImm() * OffsetScale;
- if (!isLegalAddressingMode(NumBytes, Offset, /* Scale */ 0))
+
+ // Check we are not breaking a potential conversion to an LDP.
+ auto validateOffsetForLDP = [](unsigned NumBytes, int64_t OldOffset,
+ int64_t NewOffset) -> bool {
+ int64_t MinOffset, MaxOffset;
+ switch (NumBytes) {
+ default:
+ return true;
+ case 4:
+ MinOffset = -256;
+ MaxOffset = 252;
+ break;
+ case 8:
+ MinOffset = -512;
+ MaxOffset = 504;
+ break;
+ case 16:
+ MinOffset = -1024;
+ MaxOffset = 1008;
+ break;
+ }
+ return OldOffset < MinOffset || OldOffset > MaxOffset ||
+ (NewOffset >= MinOffset && NewOffset <= MaxOffset);
+ };
+ auto canFoldAddSubImmIntoAddrMode = [&](int64_t Disp) -> bool {
+ int64_t OldOffset = MemI.getOperand(2).getImm() * OffsetScale;
+ int64_t NewOffset = OldOffset + Disp;
+ if (!isLegalAddressingMode(NumBytes, NewOffset, /* Scale */ 0))
+ return false;
+ // If the old offset would fit into an LDP, but the new offset wouldn't,
+ // bail out.
+ if (!validateOffsetForLDP(NumBytes, OldOffset, NewOffset))
return false;
AM.BaseReg = AddrI.getOperand(1).getReg();
AM.ScaledReg = 0;
AM.Scale = 0;
- AM.Displacement = Offset;
+ AM.Displacement = NewOffset;
AM.Form = ExtAddrMode::Formula::Basic;
return true;
};
@@ -2899,7 +2928,7 @@ bool AArch64InstrInfo::canFoldIntoAddrMode(const MachineInstr &MemI,
Subtarget.isSTRQroSlow();
};
- int64_t Offset = 0;
+ int64_t Disp = 0;
const bool OptSize = MemI.getMF()->getFunction().hasOptSize();
switch (AddrI.getOpcode()) {
default:
@@ -2910,16 +2939,16 @@ bool AArch64InstrInfo::canFoldIntoAddrMode(const MachineInstr &MemI,
// ldr Xd, [Xa, #M]
// ->
// ldr Xd, [Xn, #N'+M]
- Offset = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();
- return canFoldAddSubImmIntoAddrMode(Offset);
+ Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();
+ return canFoldAddSubImmIntoAddrMode(Disp);
case AArch64::SUBXri:
// sub Xa, Xn, #N
// ldr Xd, [Xa, #M]
// ->
// ldr Xd, [Xn, #N'+M]
- Offset = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();
- return canFoldAddSubImmIntoAddrMode(-Offset);
+ Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();
+ return canFoldAddSubImmIntoAddrMode(-Disp);
case AArch64::ADDXrs: {
// add Xa, Xn, Xm, lsl #N
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/split-offsets-for-stp.ll b/llvm/test/CodeGen/AArch64/GlobalISel/split-offsets-for-stp.ll
index 6aaefff1f724062..2c2d72ce6afd079 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/split-offsets-for-stp.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/split-offsets-for-stp.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -mtriple=aarch64-apple-ios -verify-machineinstrs -global-isel -aarch64-postlegalizer-consecutive-memops=0 < %s | FileCheck %s --check-prefix=CHECK-NO-SPLIT
-; RUN: llc -mtriple=aarch64-apple-ios -verify-machineinstrs -global-isel < %s | FileCheck %s --check-prefix=CHECK-SPLIT
+; RUN: llc -mtriple=aarch64-apple-ios -verify-machineinstrs -global-isel -aarch64-enable-sink-fold=true -aarch64-postlegalizer-consecutive-memops=0 < %s | FileCheck %s --check-prefix=CHECK-NO-SPLIT
+; RUN: llc -mtriple=aarch64-apple-ios -verify-machineinstrs -global-isel -aarch64-enable-sink-fold=true < %s | FileCheck %s --check-prefix=CHECK-SPLIT
define void @basic_split(ptr %p) {
; CHECK-NO-SPLIT-LABEL: basic_split:
diff --git a/llvm/test/CodeGen/AArch64/pow.ll b/llvm/test/CodeGen/AArch64/pow.ll
index ba9a5398298af7a..5141b21c7976a90 100644
--- a/llvm/test/CodeGen/AArch64/pow.ll
+++ b/llvm/test/CodeGen/AArch64/pow.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=aarch64-- | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64-- -aarch64-enable-sink-fold=true | FileCheck %s
declare float @llvm.pow.f32(float, float)
declare <4 x float> @llvm.pow.v4f32(<4 x float>, <4 x float>)
@@ -110,17 +110,17 @@ define <2 x double> @pow_v2f64_one_fourth_not_enough_fmf(<2 x double> %x) nounwi
; CHECK-LABEL: pow_v2f64_one_fourth_not_enough_fmf:
; CHECK: // %bb.0:
; CHECK-NEXT: sub sp, sp, #48
-; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: mov d0, v0.d[1]
; CHECK-NEXT: fmov d1, #0.25000000
; CHECK-NEXT: str x30, [sp, #32] // 8-byte Folded Spill
; CHECK-NEXT: bl pow
; CHECK-NEXT: fmov d1, #0.25000000
-; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: bl pow
-; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
; CHECK-NEXT: mov v0.d[1], v1.d[0]
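For context on the constants in `validateOffsetForLDP`: LDP/STP encode their offset as a signed 7-bit immediate scaled by the access size, which is where the [-256, 252], [-512, 504] and [-1024, 1008] ranges for 4-, 8- and 16-byte accesses come from. The standalone sketch below is illustrative only and not part of the patch; the function names `inLDPRange` and `foldKeepsLDP` are hypothetical, and it deliberately ignores the additional requirement that the offset be a multiple of the access size.

```cpp
#include <cstdint>
#include <cstdio>

// True if Offset lies within the scaled signed 7-bit immediate range of an
// LDP/STP of NumBytes-wide registers: imm7 in [-64, 63], scaled by NumBytes.
static bool inLDPRange(unsigned NumBytes, int64_t Offset) {
  int64_t Min = -64 * static_cast<int64_t>(NumBytes); // imm7 == -64
  int64_t Max = 63 * static_cast<int64_t>(NumBytes);  // imm7 == +63
  return Offset >= Min && Offset <= Max;
}

// Mirrors the intent of the check added in this PR: reject a fold that would
// move an offset from inside the pairable range to outside it.
static bool foldKeepsLDP(unsigned NumBytes, int64_t OldOffset, int64_t Disp) {
  return !inLDPRange(NumBytes, OldOffset) ||
         inLDPRange(NumBytes, OldOffset + Disp);
}

int main() {
  // For a 16-byte access at offset 0: folding a displacement of 1008 keeps it
  // pairable, while folding 1024 would not, so that fold would be rejected.
  std::printf("%d %d\n", foldKeepsLDP(16, 0, 1008), foldKeepsLDP(16, 0, 1024));
}
```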
Maybe I'm misunderstanding, but isn't this pessimistic if there isn't another LDR to form an LDP with?
Yes, it's pessimistic from the point of view of "sink-and-fold" and optimistic from the point of view of load/store opt :D (see llvm/lib/CodeGen/MachineSink.cpp, line 507 at 1db42fa). We could add another target hook to check for consecutive loads/stores; however, I'm quite skeptical of how much we would gain for the effort.
Oh I see, this is tweaking a new optimization I didn't have in my local checkout. If canFoldIntoAddrMode() isn't being used elsewhere then it seems fine to me.
Sounds good to me. Thanks.
[AArch64] Limit immediate offsets when folding instructions into addressing modes (llvm#67345)

Don't increase/decrease immediate offsets in folded instructions beyond the limits of `LDP`.