[AArch64] merge index address with large offset into base address #72187

Closed

Conversation

@vfdff (Contributor) commented Nov 14, 2023

A case for this transformation, https://gcc.godbolt.org/z/nhYcWq1WE

Fold
  mov     w8, #56952
  movk    w8, #15, lsl #16
  ldrb    w0, [x0, x8]
into
  add     x0, x0, 1036288
  ldrb    w0, [x0, 3704]

Only single-use bases are supported; multi-use cases are handled by PR #74046.
Fix #71917
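
For reference, a minimal source-level reproducer (assumed to correspond to the godbolt case above; the function name is illustrative) together with the arithmetic behind the fold:

```cpp
// The byte offset 1039992 does not fit a 12-bit scaled immediate, so today it
// is materialized into a register with mov+movk and used as a register offset.
// Splitting it as 1039992 = 1036288 + 3704 = (253 << 12) + 3704 lets the high
// part fold into an ADD (#imm, lsl #12) and the low part into LDRB's uimm12.
unsigned int load_at_large_offset(unsigned char *a) {
  return a[1039992];
}
```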

@llvmbot (Collaborator) commented Nov 14, 2023

@llvm/pr-subscribers-llvm-transforms

@llvm/pr-subscribers-backend-aarch64

Author: Allen (vfdff)

Changes

A case for this transformation: https://gcc.godbolt.org/z/nhYcWq1WE

Fold
  mov     w8, #56952
  movk    w8, #15, lsl #16
  ldrb    w0, [x0, x8]
into
  add     x0, x0, 1036288
  ldrb    w0, [x0, 3704]

Only LDRBBroX is supported initially.
Fix #71917


Full diff: https://github.com/llvm/llvm-project/pull/72187.diff

4 Files Affected:

  • (modified) llvm/lib/Target/AArch64/AArch64InstrInfo.cpp (+5)
  • (modified) llvm/lib/Target/AArch64/AArch64InstrInfo.h (+3)
  • (modified) llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp (+219)
  • (added) llvm/test/CodeGen/AArch64/large-offset-ldr-merge.mir (+32)
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 6fdf5363bae2928..87d23a9e2935fae 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -4081,6 +4081,11 @@ AArch64InstrInfo::getLdStOffsetOp(const MachineInstr &MI) {
   return MI.getOperand(Idx);
 }
 
+const MachineOperand &
+AArch64InstrInfo::getLdStAmountOp(const MachineInstr &MI) {
+  return MI.getOperand(4);
+}
+
 static const TargetRegisterClass *getRegClass(const MachineInstr &MI,
                                               Register Reg) {
   if (MI.getParent() == nullptr)
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index a934103c90cbf92..08b568bb3484135 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -111,6 +111,9 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo {
   /// Returns the immediate offset operator of a load/store.
   static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI);
 
+  /// Returns the shift amount operator of a load/store.
+  static const MachineOperand &getLdStAmountOp(const MachineInstr &MI);
+
   /// Returns whether the instruction is FP or NEON.
   static bool isFpOrNEON(const MachineInstr &MI);
 
diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index 7add127e21e3e55..eeb24dfc72465bf 100644
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -62,6 +62,8 @@ STATISTIC(NumUnscaledPairCreated,
           "Number of load/store from unscaled generated");
 STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
 STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
+STATISTIC(NumConstOffsetFolded,
+          "Number of const offset of index address folded");
 
 DEBUG_COUNTER(RegRenamingCounter, DEBUG_TYPE "-reg-renaming",
               "Controls which pairs are considered for renaming");
@@ -171,6 +173,13 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
   findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
                                 int UnscaledOffset, unsigned Limit);
 
+  // Scan the instruction list to find a constant assigned to the index
+  // register of the current instruction (a load or store) that can be
+  // folded into it as an immediate offset. Scan backwards.
+  MachineBasicBlock::iterator
+  findMatchingConstOffsetBackward(MachineBasicBlock::iterator I, unsigned Limit,
+                                  unsigned &Offset);
+
   // Scan the instruction list to find a base register update that can
   // be combined with the current instruction (a load or store) using
   // pre or post indexed addressing with writeback. Scan backwards.
@@ -182,11 +191,18 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
   bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
                             unsigned BaseReg, int Offset);
 
+  bool isMatchingMovConstInsn(MachineInstr &MemMI, MachineInstr &MI,
+                              unsigned IndexReg, unsigned &Offset);
+
   // Merge a pre- or post-index base register update into a ld/st instruction.
   MachineBasicBlock::iterator
   mergeUpdateInsn(MachineBasicBlock::iterator I,
                   MachineBasicBlock::iterator Update, bool IsPreIdx);
 
+  MachineBasicBlock::iterator
+  mergeConstOffsetInsn(MachineBasicBlock::iterator I,
+                       MachineBasicBlock::iterator Update, unsigned Offset);
+
   // Find and merge zero store instructions.
   bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI);
 
@@ -199,6 +215,9 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
   // Find and merge a base register updates before or after a ld/st instruction.
   bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);
 
+  // Find and merge an index ld/st instruction into a base-address ld/st instruction.
+  bool tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI);
+
   bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);
 
   bool runOnMachineFunction(MachineFunction &Fn) override;
@@ -481,6 +500,16 @@ static unsigned getPreIndexedOpcode(unsigned Opc) {
   }
 }
 
+static unsigned getBaseAddressOpcode(unsigned Opc) {
+  // TODO: Add more index address loads/stores.
+  switch (Opc) {
+  default:
+    llvm_unreachable("Opcode has no base address equivalent!");
+  case AArch64::LDRBBroX:
+    return AArch64::LDRBBui;
+  }
+}
+
 static unsigned getPostIndexedOpcode(unsigned Opc) {
   switch (Opc) {
   default:
@@ -722,6 +751,19 @@ static bool isMergeableLdStUpdate(MachineInstr &MI) {
   }
 }
 
+// Make sure this is a reg+reg Ld/St
+static bool isMergeableIndexLdSt(MachineInstr &MI) {
+  unsigned Opc = MI.getOpcode();
+  switch (Opc) {
+  default:
+    return false;
+  // Scaled instructions.
+  // TODO: Add more index address loads/stores.
+  case AArch64::LDRBBroX:
+    return true;
+  }
+}
+
 static bool isRewritableImplicitDef(unsigned Opc) {
   switch (Opc) {
   default:
@@ -1925,6 +1967,62 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
   return NextI;
 }
 
+MachineBasicBlock::iterator
+AArch64LoadStoreOpt::mergeConstOffsetInsn(MachineBasicBlock::iterator I,
+                                          MachineBasicBlock::iterator Update,
+                                          unsigned Offset) {
+  assert((Update->getOpcode() == AArch64::MOVKWi) &&
+         "Unexpected const mov instruction to merge!");
+  MachineBasicBlock::iterator E = I->getParent()->end();
+  MachineBasicBlock::iterator NextI = next_nodbg(I, E);
+  MachineBasicBlock::iterator PrevI = prev_nodbg(Update, E);
+  MachineInstr &MemMI = *I;
+  unsigned Imm12 = Offset & 0x0fff;
+  unsigned High = Offset - Imm12;
+  Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg();
+  Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MemMI).getReg();
+  MachineInstrBuilder AddMIB, MemMIB;
+
+  // Add IndexReg, BaseReg, High (the BaseReg may be SP)
+  AddMIB =
+      BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(AArch64::ADDXri))
+          .addDef(IndexReg)
+          .addUse(BaseReg)
+          .addImm(High >> 12) // shifted value
+          .addImm(12);        // shift 12
+  (void)AddMIB;
+  // Ld/St DestReg, IndexReg, Imm12
+  unsigned NewOpc = getBaseAddressOpcode(I->getOpcode());
+  MemMIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
+               .add(getLdStRegOp(MemMI))
+               .add(AArch64InstrInfo::getLdStOffsetOp(MemMI))
+               .addImm(Imm12)
+               .setMemRefs(I->memoperands())
+               .setMIFlags(I->mergeFlagsWith(*Update));
+  (void)MemMIB;
+
+  ++NumConstOffsetFolded;
+  LLVM_DEBUG(dbgs() << "Creating base address load/store.\n");
+  LLVM_DEBUG(dbgs() << "    Replacing instructions:\n    ");
+  LLVM_DEBUG(PrevI->print(dbgs()));
+  LLVM_DEBUG(dbgs() << "    ");
+  LLVM_DEBUG(Update->print(dbgs()));
+  LLVM_DEBUG(dbgs() << "    ");
+  LLVM_DEBUG(I->print(dbgs()));
+  LLVM_DEBUG(dbgs() << "  with instruction:\n    ");
+  LLVM_DEBUG(((MachineInstr *)AddMIB)->print(dbgs()));
+  LLVM_DEBUG(dbgs() << "    ");
+  LLVM_DEBUG(((MachineInstr *)MemMIB)->print(dbgs()));
+  LLVM_DEBUG(dbgs() << "\n");
+
+  // Erase the old instructions for the block.
+  I->eraseFromParent();
+  PrevI->eraseFromParent();
+  Update->eraseFromParent();
+
+  return NextI;
+}
+
 bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
                                                MachineInstr &MI,
                                                unsigned BaseReg, int Offset) {
@@ -1972,6 +2070,30 @@ bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
   return false;
 }
 
+bool AArch64LoadStoreOpt::isMatchingMovConstInsn(MachineInstr &MemMI,
+                                                 MachineInstr &MI,
+                                                 unsigned IndexReg,
+                                                 unsigned &Offset) {
+  // The update instruction source and destination register must be the
+  // same as the load/store index register.
+  if (MI.getOpcode() == AArch64::MOVKWi &&
+      TRI->isSuperOrSubRegisterEq(IndexReg, MI.getOperand(1).getReg())) {
+
+    // movz + movk hold a large offset of a Ld/St instruction.
+    MachineBasicBlock::iterator B = MI.getParent()->begin();
+    MachineBasicBlock::iterator MBBI = &MI;
+    MBBI = prev_nodbg(MBBI, B);
+    MachineInstr &MovzMI = *MBBI;
+    if (MovzMI.getOpcode() == AArch64::MOVZWi) {
+      unsigned Low = MovzMI.getOperand(1).getImm();
+      unsigned High = MI.getOperand(2).getImm() << MI.getOperand(3).getImm();
+      Offset = High + Low;
+      return true;
+    }
+  }
+  return false;
+}
+
 MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
     MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
   MachineBasicBlock::iterator E = I->getParent()->end();
@@ -2127,6 +2249,60 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
   return E;
 }
 
+MachineBasicBlock::iterator
+AArch64LoadStoreOpt::findMatchingConstOffsetBackward(
+    MachineBasicBlock::iterator I, unsigned Limit, unsigned &Offset) {
+  MachineBasicBlock::iterator B = I->getParent()->begin();
+  MachineBasicBlock::iterator E = I->getParent()->end();
+  MachineInstr &MemMI = *I;
+  MachineBasicBlock::iterator MBBI = I;
+
+  // If the load is the first instruction in the block, there's obviously
+  // not any matching load or store.
+  if (MBBI == B)
+    return E;
+
+  // Make sure the shift amount is zero.
+  // TODO: Relax this restriction to allow extends; keep the processing simple for now.
+  if (!AArch64InstrInfo::getLdStAmountOp(MemMI).isImm() ||
+      (AArch64InstrInfo::getLdStAmountOp(MemMI).getImm() != 0))
+    return E;
+
+  Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg();
+  Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MemMI).getReg();
+
+  // Track which register units have been modified and used between the first
+  // insn (inclusive) and the second insn.
+  ModifiedRegUnits.clear();
+  UsedRegUnits.clear();
+  unsigned Count = 0;
+  do {
+    MBBI = prev_nodbg(MBBI, B);
+    MachineInstr &MI = *MBBI;
+
+    // Don't count transient instructions towards the search limit since there
+    // may be different numbers of them if e.g. debug information is present.
+    if (!MI.isTransient())
+      ++Count;
+
+    // If we found a match, return it.
+    if (isMatchingMovConstInsn(*I, MI, IndexReg, Offset)) {
+      return MBBI;
+    }
+
+    // Update the status of what the instruction clobbered and used.
+    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
+
+    // Otherwise, if the base register is used or modified, we have no match, so
+    // return early.
+    if (!ModifiedRegUnits.available(BaseReg) ||
+        !UsedRegUnits.available(BaseReg))
+      return E;
+
+  } while (MBBI != B && Count < Limit);
+  return E;
+}
+
 bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
     MachineBasicBlock::iterator &MBBI) {
   MachineInstr &MI = *MBBI;
@@ -2311,6 +2487,34 @@ bool AArch64LoadStoreOpt::tryToMergeLdStUpdate
   return false;
 }
 
+bool AArch64LoadStoreOpt::tryToMergeIndexLdSt(
+    MachineBasicBlock::iterator &MBBI) {
+  MachineInstr &MI = *MBBI;
+  MachineBasicBlock::iterator E = MI.getParent()->end();
+  MachineBasicBlock::iterator Update;
+
+  // Don't know how to handle unscaled pre/post-index versions below, so bail.
+  if (TII->hasUnscaledLdStOffset(MI.getOpcode()))
+    return false;
+
+  // Look back to try to find a const offset for index LdSt instruction. For
+  // example,
+  // mov x8, #LargeImm   ; = a * (1<<12) + imm12
+  // ldr x1, [x0, x8]
+  // merged into:
+  // add x8, x0, a * (1<<12)
+  // ldr x1, [x8, imm12]
+  unsigned Offset;
+  Update = findMatchingConstOffsetBackward(MBBI, UpdateLimit, Offset);
+  if (Update != E) {
+    // Merge the imm12 into the ld/st.
+    MBBI = mergeConstOffsetInsn(MBBI, Update, Offset);
+    return true;
+  }
+
+  return false;
+}
+
 bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
                                         bool EnableNarrowZeroStOpt) {
 
@@ -2389,6 +2593,21 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
       ++MBBI;
   }
 
+  // 5) Find a register assigned with a const value that can be combined
+  // into the load or store, e.g.:
+  //        mov x8, #LargeImm   ; = a * (1<<12) + imm12
+  //        ldr x1, [x0, x8]
+  //        ; becomes
+  //        add x8, x0, a * (1<<12)
+  //        ldr x1, [x8, imm12]
+  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+       MBBI != E;) {
+    if (isMergeableIndexLdSt(*MBBI) && tryToMergeIndexLdSt(MBBI))
+      Modified = true;
+    else
+      ++MBBI;
+  }
+
   return Modified;
 }
 
diff --git a/llvm/test/CodeGen/AArch64/large-offset-ldr-merge.mir b/llvm/test/CodeGen/AArch64/large-offset-ldr-merge.mir
new file mode 100644
index 000000000000000..719f666213c10e2
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/large-offset-ldr-merge.mir
@@ -0,0 +1,32 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=aarch64 -run-pass aarch64-ldst-opt %s -o - | FileCheck %s
+--- |
+  define i32 @testOffset(ptr nocapture noundef readonly %a) {
+  entry:
+    %arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992
+    %0 = load i8, ptr %arrayidx, align 1
+    %conv = zext i8 %0 to i32
+    ret i32 %conv
+  }
+
+...
+---
+name:            testOffset
+liveins:
+  - { reg: '$x0', virtual-reg: '' }
+body:             |
+  bb.0.entry:
+    liveins: $x0
+
+    ; CHECK-LABEL: name: testOffset
+    ; CHECK: liveins: $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: $x8 = ADDXri $x0, 253, 12
+    ; CHECK-NEXT: renamable $w0 = LDRBBui killed renamable $x8, 3704 :: (load (s8) from %ir.arrayidx)
+    ; CHECK-NEXT: RET undef $lr, implicit $w0
+    renamable $w8 = MOVZWi 56952, 0
+    renamable $w8 = MOVKWi $w8, 15, 16, implicit-def $x8
+    renamable $w0 = LDRBBroX killed renamable $x0, killed renamable $x8, 0, 0 :: (load (s8) from %ir.arrayidx)
+    RET undef $lr, implicit $w0
+
+...

@efriedma-quic (Collaborator):
This seems like it would be simpler to do at the point where we're computing the addressing mode in the first place (in ISelDAGToDAG).

@vfdff (Contributor, Author) commented Nov 17, 2023

This seems like it would be simpler to do at the point where we're computing the addressing mode in the first place (in ISelDAGToDAG).

Thanks for the suggestion, @efriedma-quic.
I tried to implement this in AArch64DAGToDAGISel::SelectAddrModeXRO as you suggested, but found I can only modify the Base and Offset of the load/store instruction.
The adjustment therefore produces the form below. LDRWroX requires [reg + reg], so an error is reported (checked with -verify-machineinstrs). Is there a way to adjust the opcode as well?

%1:gpr32 = MOVi32imm 4160992
%2:gpr64 = SUBREG_TO_REG 0, killed %1:gpr32, %subreg.sub_32
%3:gpr32 = LDRWroX %0:gpr64common, %2:gpr64, 0, 0
 ---> 
%1:gpr64sp = ADDXri %0:gpr64common, 1012, 12
%2:gpr32 = LDRWroX killed %1:gpr64sp, 15840, 0, 0  // the 2nd offset operand (15840) must be a register

@vfdff (Contributor, Author) commented Nov 21, 2023

Hi @efriedma-quic, I reworked this in AArch64DAGToDAGISel::SelectAddrModeIndexed to fix the crash above.

@davemgreen (Collaborator) left a comment:

It might be worth doing this as a (late) AArch64ISelLowering DAG combine so that the add gets shared between multiple load/store instructions.

llvm/test/CodeGen/AArch64/arm64-addrmode.ll (resolved review thread)
// ADD BaseReg, WideImmediate & 0x0fff000
// LDR X2, [BaseReg, WideImmediate & 0x0fff]
SDValue LHS = N.getOperand(0);
if (isPreferredBaseAddrMode(RHSC)) {
Collaborator review comment:
I think there needs to be a check for Scale here too.

@vfdff (Contributor, Author), Nov 22, 2023:

Thanks, I'll add the check for Scale. Since the shifted value ImmOffLow >> Scale is used as the load/store offset, the check is necessary.
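
A minimal sketch of the kind of check being discussed (the helper name and the ImmOffLow/ImmOffHigh/Scale names are illustrative, not the exact patch code):

```cpp
#include <cstdint>

// Sketch: the scaled uimm12 addressing mode encodes ImmOffLow >> Scale, so the
// low 12 bits must be a multiple of the access size, and the high part must be
// expressible as "ADD Xd, Xn, #imm, lsl #12".
static bool isLegalSplitOffset(int64_t WideImm, unsigned Scale) {
  int64_t ImmOffLow = WideImm & 0x0fff;     // candidate uimm12 field
  int64_t ImmOffHigh = WideImm - ImmOffLow; // folded into the ADD
  if (ImmOffLow & ((1LL << Scale) - 1))     // must scale evenly
    return false;
  return ImmOffHigh >= 0 && (ImmOffHigh >> 12) <= 0xfff;
}
```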

@vfdff (Contributor, Author) commented Nov 22, 2023

It might be worth doing this as a (late) AArch64ISelLowering DAG combine so that the add gets shared between multiple load/store instructions.

  • Yes, the current solution does not reuse the add, e.g.:
define i64 @LdOffset_i64_multi_offset(ptr %a) {
; CHECK-LABEL: LdOffset_i64_multi_offset:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    add x8, x0, #2031, lsl #12 // =8318976
; CHECK-NEXT:    add x9, x0, #2031, lsl #12 // =8318976
; CHECK-NEXT:    add x8, x8, #960
; CHECK-NEXT:    ldr x9, [x9, #960]
; CHECK-NEXT:    ldr x8, [x8, #2056]
; CHECK-NEXT:    add x0, x8, x9
; CHECK-NEXT:    ret
entry:
  %arrayidx = getelementptr inbounds i64, ptr %a, i64 1039992
  %0 = load i64, ptr %arrayidx, align 8
  %arrayidx1 = getelementptr inbounds i64, ptr %a, i64 1040249
  %1 = load i64, ptr %arrayidx1, align 8
  %add = add nsw i64 %1, %0
  ret i64 %add
}
  • In that case, I think it's reasonable to go back to my previous implementation? (I'll add more test cases if that's the right direction.)
    I'd rather not use a DAG combine because it seems we would need similar code to handle loads and stores separately, perhaps performLOADCombine and performStoreCombine.

  • Here is the link to the previous implementation: bff3b81

@vfdff vfdff closed this Nov 22, 2023
@davemgreen (Collaborator):
I'd rather not use a DAG combine because it seems we would need similar code to handle loads and stores separately, perhaps performLOADCombine and performStoreCombine.

Could it share a similar method called from both performLOADCombine and performStoreCombine? I'm not sure if the opposite transform exists in the DAG, but if it doesn't, this should help produce only a single add.

@vfdff vfdff reopened this Nov 23, 2023
@vfdff (Contributor, Author) commented Nov 27, 2023

hi @davemgreen

  • I checked that the DAG combine runs before AArch64DAGToDAGISel and does not operate on machine IR, so it only uses a register (t4) to specify the destination address:
t4: i64 = add nuw t2, Constant:i64<1039992>
    t12: i32,ch = load<(load (s8) from %ir.arrayidx, !tbaa !6), zext from i8> t0, t4, undef:i64
  • When I split the const offset into two immediates, they are merged back in DAGCombiner::visitADD:
t15: i64 = add t2, Constant:i64<1036288>
      t17: i64 = add t15, Constant:i64<3704>
    t18: i32,ch = load<(load (s8) from %ir.arrayidx, !tbaa !6), zext from i8> t0, t17, undef:i64
  • Then I tried to prevent that re-merging in DAGCombiner::visitADD -> ... -> FoldConstantArithmetic, but the add instruction still can't be reused,
    i.e. the output is the same as with the approach done in AArch64DAGToDAGISel.
    Test with multiple offsets in one case: https://gcc.godbolt.org/z/dE8dETcvK
      add	x8, x0, #1015, lsl #12          // =4157440
      add	x9, x0, #1015, lsl #12          // =4157440
      add	x8, x8, #2528
      ldr	w9, [x9, #2528]
      ldr	w8, [x8, #8]
      add	w0, w8, w9
    

So would this need more support in CodeGenPrepare::optimizeMemoryInst?
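
For context, a rough sketch of the DAG-combine experiment described above (hypothetical helper, not the patch itself); as noted, the generic combiner folds the two adds back together unless the first add ends up with multiple uses:

```cpp
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Split "base + WideImm" feeding a memory address into
// "(base + (WideImm & ~0xfff)) + (WideImm & 0xfff)" so that several loads and
// stores could in principle share the first ADD.
static SDValue splitWideOffset(SDValue Addr, SelectionDAG &DAG, const SDLoc &DL) {
  if (Addr.getOpcode() != ISD::ADD)
    return SDValue();
  auto *C = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
  if (!C)
    return SDValue();
  int64_t Imm = C->getSExtValue();
  int64_t Low = Imm & 0xfff;
  int64_t High = Imm - Low;
  if (Low == 0 || High <= 0 || (High >> 12) > 0xfff)
    return SDValue(); // not profitable, or not encodable as ADD #imm, lsl #12
  EVT VT = Addr.getValueType();
  SDValue Base = DAG.getNode(ISD::ADD, DL, VT, Addr.getOperand(0),
                             DAG.getConstant(High, DL, VT));
  return DAG.getNode(ISD::ADD, DL, VT, Base, DAG.getConstant(Low, DL, VT));
}
```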

@davemgreen (Collaborator):
Ah I see. Like this: https://gcc.godbolt.org/z/4Ph36xdEv? It sounds like that might work; so long as the operands have multiple uses, it shouldn't just transform back to a single add. It might need something like the current patch to handle single-use cases, with something different for multiple uses.

@vfdff (Contributor, Author) commented Nov 28, 2023

Thanks @davemgreen for the suggestion.
Based on the above discussion, I restricted the first improvement to the single-use case.

github-actions bot commented Nov 29, 2023

✅ With the latest revision this PR passed the C/C++ code formatter.

@vfdff (Contributor, Author) commented Nov 29, 2023

Added the 2nd patch to handle the remaining multi-use common-base cases.

@vfdff (Contributor, Author) commented Nov 30, 2023

Updated to fix Transforms/CodeGenPrepare/AArch64/large-offset-gep.ll.

@davemgreen (Collaborator) left a comment:

It is probably best to split the CGP and DAG parts out into separate patches if we can. I'm seeing some failures in the test-suite, although they look like they might be post-isel, so there might be something else going on. I'm still not 100% sure how to structure this without causing regressions in places.

; CHECK-NEXT: add w9, w9, #1
; CHECK-NEXT: str w9, [x8]
; CHECK-NEXT: cmp w9, w1
; CHECK-NEXT: add x9, x0, #9, lsl #12 // =36864
Collaborator review comment:
Do you know why this add is not being hoisted out of the loop? It could lead to some performance regressions if it is kept in. I think this same thing is happening in the TSVC benchmarks from the llvm-test-suite.

In this case the add is loop invariant. Could there be other cases where we split a non-loop-invariant add in a loop, leading to more instructions in the loop? The immediate could be moved out and kept in a register in the original version.

@vfdff (Contributor, Author):
Fixed this issue in PR #74046.

llvm/test/CodeGen/AArch64/arm64-addrmode.ll (outdated; resolved review thread)
// 12bits are same.
int64_t AArch64TargetLowering::getPreferBaseOffset(int64_t MinOffset,
int64_t MaxOffset) const {
if (MinOffset >> 12 == MaxOffset >> 12 && MinOffset >> 24 == 0) {
Collaborator review comment:
I think this is assuming a uimm12s1 addressing mode for the load/store? Should it be more precise about the type being used, and maybe make use of isLegalAddressingMode if it can?

@vfdff (Contributor, Author):
Thanks, I updated it to use the isLegalAddImmediate API.

@@ -1069,6 +1069,42 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexedBitWidth(SDValue N, bool IsSigned
return true;
}

// 16-bit optionally shifted immediates are legal for single mov.
static bool isLegalSingleMOVImmediate(int64_t Immed) {
Collaborator review comment:
Can this make use of expandMOVImm?

    AArch64_IMM::expandMOVImm(UImm, BitSize, Insn);
    if (Insn.size() != 1)
      return;

@vfdff (Contributor, Author):
Applied your comment, thanks.
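
A possible shape for that change, as a hedged sketch (assuming the AArch64_IMM::expandMOVImm interface quoted in the review comment above; the include path is the backend-local header):

```cpp
#include "AArch64ExpandImm.h"     // AArch64 backend-local header
#include "llvm/ADT/SmallVector.h"

// Instead of open-coding which immediates a single MOV can materialize, ask
// the existing expander how many instructions the constant needs.
static bool isLegalSingleMOVImmediate(int64_t Immed) {
  llvm::SmallVector<llvm::AArch64_IMM::ImmInsnModel, 4> Insn;
  llvm::AArch64_IMM::expandMOVImm(static_cast<uint64_t>(Immed), 64, Insn);
  return Insn.size() == 1;
}
```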

@vfdff (Contributor, Author) commented Dec 1, 2023

It is probably best to split the CGP and DAG parts out into separate patches if we can. I'm seeing some failures in the test-suite, although they look like they might be post-isel, so there might be something else going on. I'm still not 100% sure how to structure this without causing regressions in places.

Hi @davemgreen, I split the 2nd patch out into PR #74046 as you suggested, and I'll rebase this patch after that one is merged, because the OneUse check can then be avoided for the 1st patch.

@vfdff (Contributor, Author) commented Dec 4, 2023

Rebased the 1st change on top of PR #74046.

A case for this transformation, https://gcc.godbolt.org/z/nhYcWq1WE
```
Fold
  mov     w8, #56952
  movk    w8, #15, lsl #16
  ldrb    w0, [x0, x8]
into
  add     x0, x0, 1036288
  ldrb    w0, [x0, 3704]
```
Only single-use bases are supported; multi-use cases are handled by PR #74046.
Fix #71917

TODO: support multiple uses by reusing a common base offset.
https://gcc.godbolt.org/z/Mr7srTjnz
@davemgreen (Collaborator):
I still seem to see some performance issues from this, in the TSVC benchmarks in the llvm test suite. Do you see the same thing?

@vfdff (Contributor, Author) commented Dec 6, 2023

I still seem to see some performance issues from this, in the TSVC benchmarks in the llvm test suite. Do you see the same thing?
Hi @davemgreen:
I don't see any changes from this patch with the command clang -march=armv8-a+sve -O3 -ffp-contract=fast -mllvm -scalable-vectorization=preferred -ffast-math -fvectorize -S tsvc.c; could you share your command? Thanks.

PS: this patch was applied on top of base commit 57a11b7.

@davemgreen (Collaborator):
It looks like it was running with -O3 -flto -mcpu=neoverse-v1, although I can reproduce without the -mcpu.
bin/clang -target aarch64 -O3 llvm-test-suite/MultiSource/Benchmarks/TSVC/StatementReordering-flt/tsc.c

I think it is in these loops that look like:

.LBB6_3:                               |  10079 .LBB6_3:                          
        ldr     s1, [x9, x23]          |  10082         add     x10, x9, #93, lsl #12   
        ldr     s2, [x9, x24]          |  10083         add     x11, x9, #62, lsl #12   
        subs    x8, x8, #1             |  10084         subs    x8, x8, #1              
---------------------------------------|  10085         ldr     s1, [x11, #2100]        
---------------------------------------|  10086         ldr     s2, [x10, #3156]        
        fmadd   s0, s1, s2, s0         |  10087         fmadd   s0, s1, s2, s0          
        str     s0, [x9, #4]!          |  10088         str     s0, [x9, #4]!           
        add     x10, x9, x22           |  10089         add     x10, x9, #125, lsl #12  
        ldr     s0, [x10, #4]          |  10090         add     x11, x9, x22            
        ldr     s1, [x9, x25]          |  10091         ldr     s1, [x10, #128]         
---------------------------------------|  10092         add     x10, x9, #31, lsl #12   
---------------------------------------|  10093         ldr     s0, [x11, #4]           
        fmsub   s0, s1, s2, s0         |  10094         fmsub   s0, s1, s2, s0          
        str     s0, [x10]              |  10095         str     s0, [x10, #1040]        
        b.ne    .LBB6_3                |  10096         b.ne    .LBB6_3                 

@vfdff (Contributor, Author) commented Dec 9, 2023

Thanks for your help. I can reproduce the regression now, and a simplified case, s211, shows it.
a) Without this change, the constant MOVi32imm is loop invariant, so it is hoisted out of the loop.
b) With this change, %12:gpr64sp = ADDXri %2:gpr64sp, 93, 12 is not loop invariant (as checked by MachineLoop::isLoopInvariant), so it is kept in the loop body.

  // If the loop contains the definition of an operand, then the instruction isn't loop invariant.
 if (contains(MRI->getVRegDef(Reg)))
   return false;
  • Related MIR fragment:
bb.2.for.body:
; predecessors: %bb.0, %bb.2
%2:gpr64sp = PHI %8:gpr64all, %bb.0, %5:gpr64all, %bb.2  
%3:fpr32 = PHI %0:fpr32, %bb.0, %4:fpr32, %bb.2
%12:gpr64sp = ADDXri %2:gpr64sp, 93, 12     # 
%13:gpr64sp = ADDXri %2:gpr64sp, 62, 12
%14:fpr32 = LDRSui killed %13:gpr64sp, 525 :: (load (s32) from %ir.splitgep34, !tbaa !4)
%15:fpr32 = LDRSui killed %12:gpr64sp, 789 :: (load (s32) from %ir.splitgep35, !tbaa !4)
 ....
  • Meanwhile, there is no loop information available in DAGISel, so it seems difficult to check whether the current value is loop invariant. (A sketch of the kind of loop involved follows below.)
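
For illustration, a hypothetical kernel shaped like the TSVC loops discussed above (array names and the size are illustrative, not the exact benchmark source). Because the arrays are large globals accessed relative to one moving pointer, the neighbouring arrays end up at large constant offsets from it, which is exactly the pattern this fold splits inside the loop:

```cpp
// Roughly s211-shaped: one pointer is advanced every iteration, and the loads
// from the neighbouring global arrays become "base + wide constant offset".
// After the fold, each such access needs an in-loop ADD #imm, lsl #12 because
// that add depends on the per-iteration base register and cannot be hoisted.
#define LEN 32000
static float a[LEN], b[LEN], c[LEN], d[LEN], e[LEN];

void s211_like(int n) {
  for (int i = 1; i < n - 1; ++i) {
    a[i] = b[i - 1] + c[i] * d[i];
    b[i] = b[i + 1] - e[i] * d[i];
  }
}
```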

@davemgreen (Collaborator):
Meanwhile, there is no loop information available in DAGISel, so it seems difficult to check whether the current value is loop invariant.

I've wondered if this was worth adding, at least to check if the current block was in a loop. Otherwise we have had to move transforms to after ISel in order to check they are profitable. In this case I guess we need to know that the base is loop invariant?

@vfdff (Contributor, Author) commented Dec 13, 2023

Hi @efriedma-quic,
Because I haven't figured out how to check loop invariance in AArch64DAGToDAGISel to avoid some regression cases, I rolled back to the 1st approach and rebased the patch as #75343.

@vfdff (Contributor, Author) commented Dec 27, 2023

Closing in favor of the new solution: #75343.

@vfdff vfdff closed this Dec 27, 2023
Successfully merging this pull request may close: [AArch64] Optimize the offset of memory access (#71917).