
[AArch64] merge index address with large offset into base address #75343

Merged: 3 commits into llvm:main, Dec 21, 2023

Conversation

vfdff (Contributor) commented Dec 13, 2023

A case for this transformation: https://gcc.godbolt.org/z/nhYcWq1WE
Fold

  mov     w8, #56952
  movk    w8, #15, lsl #16
  ldrb    w0, [x0, x8]

into

  add     x0, x0, 1036288
  ldrb    w0, [x0, 3704]

Only LDRBBroX is supported in this first version.
Note: it is difficult for AArch64DAGToDAGISel to check loop invariance, so doing this fold there may introduce regressions.
Fix #71917
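
For concreteness, here is the arithmetic behind the split (1039992 = (15 << 16) + 56952, rebuilt from the movz/movk pair). A minimal standalone sketch, assuming the constants from the arm64-addrmode.ll test below; illustrative only, not code from the patch:

  // Rebuild the large offset encoded by the movz/movk pair, then split it
  // the way the patch's mergeConstOffsetInsn does: a 12-bit low part for
  // the load's scaled-immediate field, and a 4KB-aligned high part that
  // folds into an ADDXri with lsl #12.
  #include <cassert>
  #include <cstdint>

  int main() {
    std::uint64_t Offset = (15u << 16) + 56952; // movk #15, lsl #16 over movz #56952
    assert(Offset == 1039992);                  // the i8 GEP index in the test
    std::uint64_t Imm12 = Offset & 0xfff;       // 3704 -> ldrb w0, [x8, #3704]
    std::uint64_t High = Offset - Imm12;        // 1036288 == 253 << 12
    assert((High >> 12) == 253);                // -> add x8, x0, #253, lsl #12
    return 0;
  }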

llvmbot (Collaborator) commented Dec 13, 2023

@llvm/pr-subscribers-backend-aarch64

Author: Allen (vfdff)



Full diff: https://github.com/llvm/llvm-project/pull/75343.diff

5 Files Affected:

  • (modified) llvm/lib/Target/AArch64/AArch64InstrInfo.cpp (+5)
  • (modified) llvm/lib/Target/AArch64/AArch64InstrInfo.h (+3)
  • (modified) llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp (+225)
  • (modified) llvm/test/CodeGen/AArch64/arm64-addrmode.ll (+2-3)
  • (added) llvm/test/CodeGen/AArch64/large-offset-ldr-merge.mir (+47)
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 50cbd3672fbd0d..9214e54706f8ba 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -4089,6 +4089,11 @@ AArch64InstrInfo::getLdStOffsetOp(const MachineInstr &MI) {
   return MI.getOperand(Idx);
 }
 
+const MachineOperand &
+AArch64InstrInfo::getLdStAmountOp(const MachineInstr &MI) {
+  return MI.getOperand(4);
+}
+
 static const TargetRegisterClass *getRegClass(const MachineInstr &MI,
                                               Register Reg) {
   if (MI.getParent() == nullptr)
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index e97ff0a9758d69..6ad1ef643a6897 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -111,6 +111,9 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo {
   /// Returns the immediate offset operator of a load/store.
   static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI);
 
+  /// Returns the shift amount operator of a load/store.
+  static const MachineOperand &getLdStAmountOp(const MachineInstr &MI);
+
   /// Returns whether the instruction is FP or NEON.
   static bool isFpOrNEON(const MachineInstr &MI);
 
diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index dc6d5b8950c34a..98dd9262463ecc 100644
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -62,6 +62,8 @@ STATISTIC(NumUnscaledPairCreated,
           "Number of load/store from unscaled generated");
 STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
 STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
+STATISTIC(NumConstOffsetFolded,
+          "Number of const offset of index address folded");
 
 DEBUG_COUNTER(RegRenamingCounter, DEBUG_TYPE "-reg-renaming",
               "Controls which pairs are considered for renaming");
@@ -75,6 +77,11 @@ static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
 static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),
                                      cl::Hidden);
 
+// The LdStConstLimit limits how far we search for const offset instructions
+// when we form index address load/store instructions.
+static cl::opt<unsigned> LdStConstLimit("aarch64-load-store-const-scan-limit",
+                                        cl::init(10), cl::Hidden);
+
 // Enable register renaming to find additional store pairing opportunities.
 static cl::opt<bool> EnableRenaming("aarch64-load-store-renaming",
                                     cl::init(true), cl::Hidden);
@@ -171,6 +178,13 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
   findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
                                 int UnscaledOffset, unsigned Limit);
 
+  // Scan the instruction list to find a register assigned with a const
+  // value that can be combined with the current instruction (a load or store)
+  // using base addressing. Scan backwards.
+  MachineBasicBlock::iterator
+  findMatchingConstOffsetBackward(MachineBasicBlock::iterator I, unsigned Limit,
+                                  unsigned &Offset);
+
   // Scan the instruction list to find a base register update that can
   // be combined with the current instruction (a load or store) using
   // pre or post indexed addressing with writeback. Scan backwards.
@@ -182,11 +196,18 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
   bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
                             unsigned BaseReg, int Offset);
 
+  bool isMatchingMovConstInsn(MachineInstr &MemMI, MachineInstr &MI,
+                              unsigned IndexReg, unsigned &Offset);
+
   // Merge a pre- or post-index base register update into a ld/st instruction.
   MachineBasicBlock::iterator
   mergeUpdateInsn(MachineBasicBlock::iterator I,
                   MachineBasicBlock::iterator Update, bool IsPreIdx);
 
+  MachineBasicBlock::iterator
+  mergeConstOffsetInsn(MachineBasicBlock::iterator I,
+                       MachineBasicBlock::iterator Update, unsigned Offset);
+
   // Find and merge zero store instructions.
   bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI);
 
@@ -199,6 +220,9 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
   // Find and merge a base register updates before or after a ld/st instruction.
   bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);
 
+  // Find and merge an index ld/st instruction into a base ld/st instruction.
+  bool tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI);
+
   bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);
 
   bool runOnMachineFunction(MachineFunction &Fn) override;
@@ -481,6 +505,16 @@ static unsigned getPreIndexedOpcode(unsigned Opc) {
   }
 }
 
+static unsigned getBaseAddressOpcode(unsigned Opc) {
+  // TODO: Add more index address loads/stores.
+  switch (Opc) {
+  default:
+    llvm_unreachable("Opcode has no base address equivalent!");
+  case AArch64::LDRBBroX:
+    return AArch64::LDRBBui;
+  }
+}
+
 static unsigned getPostIndexedOpcode(unsigned Opc) {
   switch (Opc) {
   default:
@@ -722,6 +756,19 @@ static bool isMergeableLdStUpdate(MachineInstr &MI) {
   }
 }
 
+// Make sure this is a reg+reg Ld/St
+static bool isMergeableIndexLdSt(MachineInstr &MI) {
+  unsigned Opc = MI.getOpcode();
+  switch (Opc) {
+  default:
+    return false;
+  // Scaled instructions.
+  // TODO: Add more index address loads/stores.
+  case AArch64::LDRBBroX:
+    return true;
+  }
+}
+
 static bool isRewritableImplicitDef(unsigned Opc) {
   switch (Opc) {
   default:
@@ -2018,6 +2065,62 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
   return NextI;
 }
 
+MachineBasicBlock::iterator
+AArch64LoadStoreOpt::mergeConstOffsetInsn(MachineBasicBlock::iterator I,
+                                          MachineBasicBlock::iterator Update,
+                                          unsigned Offset) {
+  assert((Update->getOpcode() == AArch64::MOVKWi) &&
+         "Unexpected const mov instruction to merge!");
+  MachineBasicBlock::iterator E = I->getParent()->end();
+  MachineBasicBlock::iterator NextI = next_nodbg(I, E);
+  MachineBasicBlock::iterator PrevI = prev_nodbg(Update, E);
+  MachineInstr &MemMI = *I;
+  unsigned Imm12 = Offset & 0x0fff;
+  unsigned High = Offset - Imm12;
+  Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg();
+  Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MemMI).getReg();
+  MachineInstrBuilder AddMIB, MemMIB;
+
+  // Add IndexReg, BaseReg, High (the BaseReg may be SP)
+  AddMIB =
+      BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(AArch64::ADDXri))
+          .addDef(IndexReg)
+          .addUse(BaseReg)
+          .addImm(High >> 12) // shifted value
+          .addImm(12);        // shift 12
+  (void)AddMIB;
+  // Ld/St DestReg, IndexReg, Imm12
+  unsigned NewOpc = getBaseAddressOpcode(I->getOpcode());
+  MemMIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
+               .add(getLdStRegOp(MemMI))
+               .add(AArch64InstrInfo::getLdStOffsetOp(MemMI))
+               .addImm(Imm12)
+               .setMemRefs(I->memoperands())
+               .setMIFlags(I->mergeFlagsWith(*Update));
+  (void)MemMIB;
+
+  ++NumConstOffsetFolded;
+  LLVM_DEBUG(dbgs() << "Creating base address load/store.\n");
+  LLVM_DEBUG(dbgs() << "    Replacing instructions:\n    ");
+  LLVM_DEBUG(PrevI->print(dbgs()));
+  LLVM_DEBUG(dbgs() << "    ");
+  LLVM_DEBUG(Update->print(dbgs()));
+  LLVM_DEBUG(dbgs() << "    ");
+  LLVM_DEBUG(I->print(dbgs()));
+  LLVM_DEBUG(dbgs() << "  with instruction:\n    ");
+  LLVM_DEBUG(((MachineInstr *)AddMIB)->print(dbgs()));
+  LLVM_DEBUG(dbgs() << "    ");
+  LLVM_DEBUG(((MachineInstr *)MemMIB)->print(dbgs()));
+  LLVM_DEBUG(dbgs() << "\n");
+
+  // Erase the old instructions for the block.
+  I->eraseFromParent();
+  PrevI->eraseFromParent();
+  Update->eraseFromParent();
+
+  return NextI;
+}
+
 bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
                                                MachineInstr &MI,
                                                unsigned BaseReg, int Offset) {
@@ -2065,6 +2168,30 @@ bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
   return false;
 }
 
+bool AArch64LoadStoreOpt::isMatchingMovConstInsn(MachineInstr &MemMI,
+                                                 MachineInstr &MI,
+                                                 unsigned IndexReg,
+                                                 unsigned &Offset) {
+  // The update instruction source and destination register must be the
+  // same as the load/store index register.
+  if (MI.getOpcode() == AArch64::MOVKWi &&
+      TRI->isSuperOrSubRegisterEq(IndexReg, MI.getOperand(1).getReg())) {
+
+    // movz + movk hold a large offset of a Ld/St instruction.
+    MachineBasicBlock::iterator B = MI.getParent()->begin();
+    MachineBasicBlock::iterator MBBI = &MI;
+    MBBI = prev_nodbg(MBBI, B);
+    MachineInstr &MovzMI = *MBBI;
+    if (MovzMI.getOpcode() == AArch64::MOVZWi) {
+      unsigned Low = MovzMI.getOperand(1).getImm();
+      unsigned High = MI.getOperand(2).getImm() << MI.getOperand(3).getImm();
+      Offset = High + Low;
+      return true;
+    }
+  }
+  return false;
+}
+
 MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
     MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
   MachineBasicBlock::iterator E = I->getParent()->end();
@@ -2220,6 +2347,61 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
   return E;
 }
 
+MachineBasicBlock::iterator
+AArch64LoadStoreOpt::findMatchingConstOffsetBackward(
+    MachineBasicBlock::iterator I, unsigned Limit, unsigned &Offset) {
+  MachineBasicBlock::iterator B = I->getParent()->begin();
+  MachineBasicBlock::iterator E = I->getParent()->end();
+  MachineInstr &MemMI = *I;
+  MachineBasicBlock::iterator MBBI = I;
+
+  // If the load is the first instruction in the block, there's obviously
+  // no matching const mov to combine with.
+  if (MBBI == B)
+    return E;
+
+  // Make sure the IndexReg is killed and the shift amount is zero.
+  // TODO: Relax this restriction to support extends; keep processing simple for now.
+  if (!AArch64InstrInfo::getLdStOffsetOp(MemMI).isKill() ||
+      !AArch64InstrInfo::getLdStAmountOp(MemMI).isImm() ||
+      (AArch64InstrInfo::getLdStAmountOp(MemMI).getImm() != 0))
+    return E;
+
+  Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg();
+  Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MemMI).getReg();
+
+  // Track which register units have been modified and used between the first
+  // insn (inclusive) and the second insn.
+  ModifiedRegUnits.clear();
+  UsedRegUnits.clear();
+  unsigned Count = 0;
+  do {
+    MBBI = prev_nodbg(MBBI, B);
+    MachineInstr &MI = *MBBI;
+
+    // Don't count transient instructions towards the search limit since there
+    // may be different numbers of them if e.g. debug information is present.
+    if (!MI.isTransient())
+      ++Count;
+
+    // If we found a match, return it.
+    if (isMatchingMovConstInsn(*I, MI, IndexReg, Offset)) {
+      return MBBI;
+    }
+
+    // Update the status of what the instruction clobbered and used.
+    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
+
+    // Otherwise, if the index register is used or modified, we have no match,
+    // so return early.
+    if (!ModifiedRegUnits.available(IndexReg) ||
+        !UsedRegUnits.available(IndexReg))
+      return E;
+
+  } while (MBBI != B && Count < Limit);
+  return E;
+}
+
 bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
     MachineBasicBlock::iterator &MBBI) {
   MachineInstr &MI = *MBBI;
@@ -2404,6 +2586,34 @@ bool AArch64LoadStoreOpt::tryToMergeLdStUpdate
   return false;
 }
 
+bool AArch64LoadStoreOpt::tryToMergeIndexLdSt(
+    MachineBasicBlock::iterator &MBBI) {
+  MachineInstr &MI = *MBBI;
+  MachineBasicBlock::iterator E = MI.getParent()->end();
+  MachineBasicBlock::iterator Update;
+
+  // Don't know how to handle unscaled pre/post-index versions below, so bail.
+  if (TII->hasUnscaledLdStOffset(MI.getOpcode()))
+    return false;
+
+  // Look back to try to find a const offset for index LdSt instruction. For
+  // example,
+  // mov x8, #LargeImm   ; = a * (1<<12) + imm12
+  // ldr x1, [x0, x8]
+  // merged into:
+  // add x8, x0, a * (1<<12)
+  // ldr x1, [x8, imm12]
+  unsigned Offset;
+  Update = findMatchingConstOffsetBackward(MBBI, LdStConstLimit, Offset);
+  if (Update != E) {
+    // Merge the imm12 into the ld/st.
+    MBBI = mergeConstOffsetInsn(MBBI, Update, Offset);
+    return true;
+  }
+
+  return false;
+}
+
 bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
                                         bool EnableNarrowZeroStOpt) {
 
@@ -2482,6 +2692,21 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
       ++MBBI;
   }
 
+  // 5) Find a register assigned with a const value that can be combined
+  // into the load or store. e.g.,
+  //        mov x8, #LargeImm   ; = a * (1<<12) + imm12
+  //        ldr x1, [x0, x8]
+  //        ; becomes
+  //        add x8, x0, a * (1<<12)
+  //        ldr x1, [x8, imm12]
+  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+       MBBI != E;) {
+    if (isMergeableIndexLdSt(*MBBI) && tryToMergeIndexLdSt(MBBI))
+      Modified = true;
+    else
+      ++MBBI;
+  }
+
   return Modified;
 }
 
diff --git a/llvm/test/CodeGen/AArch64/arm64-addrmode.ll b/llvm/test/CodeGen/AArch64/arm64-addrmode.ll
index 3d4749a7b8e7df..e457fbdc34e3c4 100644
--- a/llvm/test/CodeGen/AArch64/arm64-addrmode.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-addrmode.ll
@@ -213,9 +213,8 @@ define void @t17(i64 %a) {
 define i32 @LdOffset_i8(ptr %a)  {
 ; CHECK-LABEL: LdOffset_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #56952 // =0xde78
-; CHECK-NEXT:    movk w8, #15, lsl #16
-; CHECK-NEXT:    ldrb w0, [x0, x8]
+; CHECK-NEXT:    add x8, x0, #253, lsl #12 // =1036288
+; CHECK-NEXT:    ldrb w0, [x8, #3704]
 ; CHECK-NEXT:    ret
   %arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992
   %val = load i8, ptr %arrayidx, align 1
diff --git a/llvm/test/CodeGen/AArch64/large-offset-ldr-merge.mir b/llvm/test/CodeGen/AArch64/large-offset-ldr-merge.mir
new file mode 100644
index 00000000000000..15b6700398ea08
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/large-offset-ldr-merge.mir
@@ -0,0 +1,47 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=aarch64 -run-pass aarch64-ldst-opt %s -o - | FileCheck %s
+
+
+---
+name:            LdOffset
+tracksRegLiveness: true
+liveins:
+  - { reg: '$x0', virtual-reg: '' }
+body:             |
+  bb.0.entry:
+    liveins: $x0
+
+    ; CHECK-LABEL: name: LdOffset
+    ; CHECK: liveins: $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: $x8 = ADDXri $x0, 253, 12
+    ; CHECK-NEXT: renamable $w0 = LDRBBui killed renamable $x8, 3704
+    ; CHECK-NEXT: RET undef $lr, implicit $w0
+    renamable $w8 = MOVZWi 56952, 0
+    renamable $w8 = MOVKWi $w8, 15, 16, implicit-def $x8
+    renamable $w0 = LDRBBroX killed renamable $x0, killed renamable $x8, 0, 0
+    RET undef $lr, implicit $w0
+...
+
+# Negative test: the IndexReg missing killed flags
+---
+name:            LdOffset_missing_killed
+tracksRegLiveness: true
+liveins:
+  - { reg: '$x0', virtual-reg: '' }
+body:             |
+  bb.0.entry:
+    liveins: $x0
+
+    ; CHECK-LABEL: name: LdOffset_missing_killed
+    ; CHECK: liveins: $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: renamable $w8 = MOVZWi 56952, 0
+    ; CHECK-NEXT: renamable $w8 = MOVKWi $w8, 15, 16, implicit-def $x8
+    ; CHECK-NEXT: renamable $w0 = LDRBBroX killed renamable $x0, renamable $x8, 0, 0
+    ; CHECK-NEXT: RET undef $lr, implicit $w0
+    renamable $w8 = MOVZWi 56952, 0
+    renamable $w8 = MOVKWi $w8, 15, 16, implicit-def $x8
+    renamable $w0 = LDRBBroX killed renamable $x0, renamable $x8, 0, 0
+    RET undef $lr, implicit $w0
+...

@@ -4089,6 +4089,11 @@ AArch64InstrInfo::getLdStOffsetOp(const MachineInstr &MI) {
return MI.getOperand(Idx);
}

const MachineOperand &
AArch64InstrInfo::getLdStAmountOp(const MachineInstr &MI) {
return MI.getOperand(4);
Contributor:

How do you know this is always going to be operand 4 without inspecting the opcode? For example, see the functions above.

vfdff (Author):

Yes, it needs to be extended with a switch over the opcode when we add more supported instructions.
In this first version I only support LDRBBroX, and for that opcode the shift amount is operand 4.
I added only one instruction at first to confirm whether the optimization direction itself is reasonable.
If it is adopted, I will add the other memory access instructions in a subsequent patch.

Contributor:

OK, but my point is that if your patch lands, another engineer may start using this function believing that it is complete. However, it might do the wrong thing if called for a different opcode. I think you need to explicitly add an error if the opcode is not what you expect, so that it fails sensibly.

vfdff (Author):

Thanks for your idea; fixed with commit 01029d9.
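
For reference, a plausible shape for the fixed accessor, following that suggestion of failing explicitly on unexpected opcodes. A sketch, not a verbatim quote of commit 01029d9:

  const MachineOperand &
  AArch64InstrInfo::getLdStAmountOp(const MachineInstr &MI) {
    switch (MI.getOpcode()) {
    default:
      llvm_unreachable("Unexpected opcode");
    case AArch64::LDRBBroX: // the only load/store supported so far
      return MI.getOperand(4);
    }
  }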

// TODO: Add more index address loads/stores.
switch (Opc) {
default:
llvm_unreachable("Opcode has no base address equivalent!");
Contributor:

At first glance this doesn't look right to me. Are there any checks to actually prevent us getting here with other opcodes? If not, then I think you may need to fill in the remaining opcodes in the switch statement, similar to getPostIndexedOpcode.

vfdff (Author):

Yes, similar to getLdStAmountOp above, I only support LDRBBroX in the first version.

Contributor:

I guess the point I'm trying to make here is that usually we add an llvm_unreachable because we actually expect the code to be unreachable. However, I can't see anything that prevents us hitting the default case. For example, I'd expect code to be written in this way:

void foo(MI) {
  if (opcode == my_expected_opcode)
    foo2(MI);
}

void foo2(MI) {
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unexpected opcode");
  case my_expected_opcode:
    do_something();
    break;
  }
}

My question was - do you have anything to explicitly prevent us from calling the function with an unexpected opcode?

vfdff (Author):

Yes: the function isMergeableIndexLdSt, called from AArch64LoadStoreOpt::optimizeBlock, ensures that only the expected opcodes attempt this fold. (Currently only the opcode LDRBBroX is supported.)

@nikic nikic removed their request for review December 14, 2023 09:20
@vfdff vfdff requested a review from david-arm December 15, 2023 03:19
vfdff (Author) commented Dec 18, 2023

Support more load.x instructions; the list follows canFoldIntoAddrMode.
Hi @davemgreen, I also checked TSVC with this patch; the regression disappears now.
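
To illustrate the load.x extension mentioned above, the base-address mapping plausibly grows along these lines. A sketch with a few representative reg+reg (roX) to scaled unsigned-immediate (ui) pairs; the exact committed list follows canFoldIntoAddrMode and may differ:

  static unsigned getBaseAddressOpcode(unsigned Opc) {
    switch (Opc) {
    default:
      llvm_unreachable("Opcode has no base address equivalent!");
    case AArch64::LDRBBroX:
      return AArch64::LDRBBui;
    case AArch64::LDRHHroX:
      return AArch64::LDRHHui;
    case AArch64::LDRWroX:
      return AArch64::LDRWui;
    case AArch64::LDRXroX:
      return AArch64::LDRXui;
    case AArch64::LDRSroX: // FP single form, added after review below
      return AArch64::LDRSui;
    }
  }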

vfdff (Author) commented Dec 19, 2023

Ping?

davemgreen (Collaborator) left a comment:

Yeah thanks, I think this is looking better now.

Can you add some tests for different types and edge conditions?


// Make sure the IndexReg is killed and the shift amount is zero.
// TODO: Relex this restriction to extend, simplify processing now.
if (!AArch64InstrInfo::getLdStOffsetOp(MemMI).isKill() ||
Collaborator:

Is this essentially a one-use check on the MOV reg?

vfdff (Author) commented Dec 20, 2023:

Yes. Because we delete the old instructions at the end of mergeConstOffsetInsn once the fold matches, the index register must have no other uses. (I added the test LdOffset_missing_killed for this in large-offset-ldr-merge.mir.)

vfdff (Author) commented Dec 20, 2023

Can you add some tests for different types and edge conditions?
Done, thanks

davemgreen (Collaborator) left a comment:

Thanks. I noticed LDRSroX is missing, but so long as the perf looks OK with that added this looks OK.

@@ -295,3 +459,27 @@ define i32 @LdOffset_i16_odd_offset(ptr nocapture noundef readonly %a) {
ret i32 %conv
}

; Already encoded with a signle mov MOVNWi
Collaborator:

single

vfdff (Author):

Done

Scale = 2;
return true;
case AArch64::LDRWroX:
case AArch64::LDRSWroX:
Collaborator:

Should there be LDRSroX too?

vfdff (Author):

Applied your comment, thanks.

vfdff (Author) commented Dec 21, 2023

Thanks. I noticed LDRSroX is missing, but so long as the perf looks OK with that added this looks OK.
Added the missing LDRSroX and related tests, thanks.

davemgreen (Collaborator) left a comment:

Thanks. Every time we write patches like this they end up going wrong in some way. Please make sure you've run some decent testing, and keep an eye out in case this has gone wrong. From what I can tell it LGTM though. Cheers

A case for this transformation: https://gcc.godbolt.org/z/nhYcWq1WE
Fold
  mov     w8, #56952
  movk    w8, #15, lsl #16
  ldrb    w0, [x0, x8]
into
  add     x0, x0, 1036288
  ldrb    w0, [x0, 3704]

Only LDRBBroX is supported in this first version.
Fix #71917
The list of load.x instructions is taken from canFoldIntoAddrMode in D152828.
Also support LDRSroX, which was missed in canFoldIntoAddrMode.
@vfdff vfdff merged commit f568763 into llvm:main Dec 21, 2023
2 of 4 checks passed
asmok-g commented Dec 29, 2023

Heads-up: this is causing some tests to crash internally at Google. Trying to come up with a reproducer.

vfdff (Author) commented Dec 30, 2023

Heads-up: this is causing some tests to crash internally at Google. Trying to come up with a reproducer.
Thank you for your feedback. I will analyze this problem ASAP once you provide a way to reproduce it.

davemgreen (Collaborator) commented:
It looks like this is causing #79756 too. I hadn't realised that it was committed in separate commits and then only partially reverted. I'll remove the LDRBBroX part too to make sure we don't see problems on the llvm-19 branch.

@vfdff there is a test case in #79756 that looks useful for figuring out at least one of the problems going on.

vfdff (Author) commented Jan 29, 2024

@davemgreen Thanks very much for the case, I'll take a look at it.

Linked issue: [AArch64] Optimize the offset of memory access (#71917)