[ARM] Distribute MVE post-increments
This adds some extra processing into the Pre-RA ARM load/store optimizer
to detect and merge MVE loads/stores and adds of the same base. We don't
always turn these into post-increments during ISel: the DAG being a graph,
we don't always have an order to use for the nodes, so we cannot tell
which nodes to make post-inc and which should use the new post-inc value.
After ISel, we have an order that we can use to post-inc the following
instructions.

So this looks for a load/store with a starting offset of 0 and an
add/sub from the same base, plus a number of other loads/stores. We then
do some checks and convert the zero-offset load/store into a post-inc
variant. Any loads/stores after it have the increment subtracted from
their immediate offsets.  For example:
  LDR #4           LDR #4
  LDR #0           LDR_POSTINC #16
  LDR #8           LDR #-8
  LDR #12          LDR #-4
  ADD #16
It only handles MVE loads/stores at the moment. Normal loads/stores will
be added in a follow-up patch; they just have some extra details to
ensure that we keep generating LDRD/LDM successfully.

Differential Revision: https://reviews.llvm.org/D77813
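
To make the pattern above concrete, the following is a hedged C++ sketch (not
part of this commit) using the ACLE MVE intrinsics from <arm_mve.h>. The
function name and the annotated offsets are illustrative assumptions; when
built for an MVE target (for example at -O3 for a Cortex-M55-class CPU), code
of this shape tends to produce several VLDRW accesses off one base register at
immediate offsets 0/16/32/48 plus an add of the base by #64, which is the
pattern the new DistributeIncrements code folds into a post-increment. The
exact instruction selection depends on the target and compiler version.

  #include <stdint.h>
  #include <arm_mve.h>

  // Sum four consecutive vectors per iteration. The four loads share one base
  // pointer; the "src += 16" is the add that would be distributed.
  void sum16(const int32_t *src, int32_t *dst, int blocks) {
    for (int i = 0; i < blocks; ++i) {
      int32x4_t a = vldrwq_s32(src);      // offset #0  -> post-inc #64
      int32x4_t b = vldrwq_s32(src + 4);  // offset #16 -> rewritten to #-48
      int32x4_t c = vldrwq_s32(src + 8);  // offset #32 -> rewritten to #-32
      int32x4_t d = vldrwq_s32(src + 12); // offset #48 -> rewritten to #-16
      src += 16;                          // the ADD #64 being distributed
      vstrwq_s32(dst, vaddq_s32(vaddq_s32(a, b), vaddq_s32(c, d)));
      dst += 4;
    }
  }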
davemgreen committed Apr 22, 2020
1 parent 4eca1c0 commit 892af45
Showing 14 changed files with 482 additions and 279 deletions.
43 changes: 43 additions & 0 deletions llvm/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -788,6 +788,49 @@ bool HasLowerConstantMaterializationCost(unsigned Val1, unsigned Val2,
const ARMSubtarget *Subtarget,
bool ForCodesize = false);

// Return the immediate if this is ADDri or SUBri, scaled as appropriate.
// Returns 0 for unknown instructions.
inline int getAddSubImmediate(MachineInstr &MI) {
int Scale = 1;
unsigned ImmOp;
switch (MI.getOpcode()) {
case ARM::t2ADDri:
ImmOp = 2;
break;
case ARM::t2SUBri:
case ARM::t2SUBri12:
ImmOp = 2;
Scale = -1;
break;
case ARM::tSUBi3:
case ARM::tSUBi8:
ImmOp = 3;
Scale = -1;
break;
default:
return 0;
}
return Scale * MI.getOperand(ImmOp).getImm();
}

// Given a memory access Opcode, check that the given Imm would be a valid Offset
// for this instruction using its addressing mode.
inline bool isLegalAddressImm(unsigned Opcode, int Imm,
const TargetInstrInfo *TII) {
const MCInstrDesc &Desc = TII->get(Opcode);
unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
switch (AddrMode) {
case ARMII::AddrModeT2_i7:
return std::abs(Imm) < (((1 << 7) * 1) - 1);
case ARMII::AddrModeT2_i7s2:
return std::abs(Imm) < (((1 << 7) * 2) - 1) && Imm % 2 == 0;
case ARMII::AddrModeT2_i7s4:
return std::abs(Imm) < (((1 << 7) * 4) - 1) && Imm % 4 == 0;
default:
llvm_unreachable("Unhandled Addressing mode");
}
}

} // end namespace llvm

#endif // LLVM_LIB_TARGET_ARM_ARMBASEINSTRINFO_H
241 changes: 238 additions & 3 deletions llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -32,6 +32,7 @@
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
@@ -50,6 +51,7 @@
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Type.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Allocator.h"
@@ -1383,6 +1385,38 @@ static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc,
case ARM::t2STRi8:
case ARM::t2STRi12:
return ARM::t2STR_POST;

case ARM::MVE_VLDRBS16:
return ARM::MVE_VLDRBS16_post;
case ARM::MVE_VLDRBS32:
return ARM::MVE_VLDRBS32_post;
case ARM::MVE_VLDRBU16:
return ARM::MVE_VLDRBU16_post;
case ARM::MVE_VLDRBU32:
return ARM::MVE_VLDRBU32_post;
case ARM::MVE_VLDRHS32:
return ARM::MVE_VLDRHS32_post;
case ARM::MVE_VLDRHU32:
return ARM::MVE_VLDRHU32_post;
case ARM::MVE_VLDRBU8:
return ARM::MVE_VLDRBU8_post;
case ARM::MVE_VLDRHU16:
return ARM::MVE_VLDRHU16_post;
case ARM::MVE_VLDRWU32:
return ARM::MVE_VLDRWU32_post;
case ARM::MVE_VSTRB16:
return ARM::MVE_VSTRB16_post;
case ARM::MVE_VSTRB32:
return ARM::MVE_VSTRB32_post;
case ARM::MVE_VSTRH32:
return ARM::MVE_VSTRH32_post;
case ARM::MVE_VSTRBU8:
return ARM::MVE_VSTRBU8_post;
case ARM::MVE_VSTRHU16:
return ARM::MVE_VSTRHU16_post;
case ARM::MVE_VSTRWU32:
return ARM::MVE_VSTRWU32_post;

default: llvm_unreachable("Unhandled opcode!");
}
}
@@ -2046,6 +2080,7 @@ namespace {
const TargetRegisterInfo *TRI;
const ARMSubtarget *STI;
MachineRegisterInfo *MRI;
MachineDominatorTree *DT;
MachineFunction *MF;

ARMPreAllocLoadStoreOpt() : MachineFunctionPass(ID) {}
@@ -2058,6 +2093,8 @@

void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<MachineDominatorTree>();
AU.addPreserved<MachineDominatorTree>();
MachineFunctionPass::getAnalysisUsage(AU);
}

@@ -2071,14 +2108,19 @@ namespace {
unsigned Base, bool isLd,
DenseMap<MachineInstr*, unsigned> &MI2LocMap);
bool RescheduleLoadStoreInstrs(MachineBasicBlock *MBB);
bool DistributeIncrements();
bool DistributeIncrements(Register Base);
};

} // end anonymous namespace

char ARMPreAllocLoadStoreOpt::ID = 0;

INITIALIZE_PASS(ARMPreAllocLoadStoreOpt, "arm-prera-ldst-opt",
ARM_PREALLOC_LOAD_STORE_OPT_NAME, false, false)
INITIALIZE_PASS_BEGIN(ARMPreAllocLoadStoreOpt, "arm-prera-ldst-opt",
ARM_PREALLOC_LOAD_STORE_OPT_NAME, false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_END(ARMPreAllocLoadStoreOpt, "arm-prera-ldst-opt",
ARM_PREALLOC_LOAD_STORE_OPT_NAME, false, false)

// Limit the number of instructions to be rescheduled.
// FIXME: tune this limit, and/or come up with some better heuristics.
@@ -2094,10 +2136,11 @@ bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
TII = STI->getInstrInfo();
TRI = STI->getRegisterInfo();
MRI = &Fn.getRegInfo();
DT = &getAnalysis<MachineDominatorTree>();
MF = &Fn;
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

bool Modified = false;
bool Modified = DistributeIncrements();
for (MachineBasicBlock &MFI : Fn)
Modified |= RescheduleLoadStoreInstrs(&MFI);

@@ -2475,6 +2518,198 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
return RetVal;
}

// Get the Base register operand index from the memory access MachineInstr if we
// should attempt to distribute postinc on it. Return -1 if it is not a valid
// instruction type. If it returns an index, it is assumed the instruction uses
// an r+i addressing mode, and getBaseOperandIndex() + 1 is the Offset index.
static int getBaseOperandIndex(MachineInstr &MI) {
switch (MI.getOpcode()) {
case ARM::MVE_VLDRBS16:
case ARM::MVE_VLDRBS32:
case ARM::MVE_VLDRBU16:
case ARM::MVE_VLDRBU32:
case ARM::MVE_VLDRHS32:
case ARM::MVE_VLDRHU32:
case ARM::MVE_VLDRBU8:
case ARM::MVE_VLDRHU16:
case ARM::MVE_VLDRWU32:
case ARM::MVE_VSTRB16:
case ARM::MVE_VSTRB32:
case ARM::MVE_VSTRH32:
case ARM::MVE_VSTRBU8:
case ARM::MVE_VSTRHU16:
case ARM::MVE_VSTRWU32:
return 1;
}
return -1;
}

static MachineInstr *createPostIncLoadStore(MachineInstr *MI, int Offset,
Register NewReg,
const TargetInstrInfo *TII,
const TargetRegisterInfo *TRI) {
MachineFunction *MF = MI->getMF();
MachineRegisterInfo &MRI = MF->getRegInfo();

unsigned NewOpcode = getPostIndexedLoadStoreOpcode(
MI->getOpcode(), Offset > 0 ? ARM_AM::add : ARM_AM::sub);

const MCInstrDesc &MCID = TII->get(NewOpcode);
// Constrain the def register class
const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0, TRI, *MF);
MRI.constrainRegClass(NewReg, TRC);
// And do the same for the base operand
TRC = TII->getRegClass(MCID, 2, TRI, *MF);
MRI.constrainRegClass(MI->getOperand(1).getReg(), TRC);

return BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), MCID)
.addReg(NewReg, RegState::Define)
.add(MI->getOperand(0))
.add(MI->getOperand(1))
.addImm(Offset)
.add(MI->getOperand(3))
.add(MI->getOperand(4))
.cloneMemRefs(*MI);
}

// Given a Base Register, optimise the load/store uses to attempt to create more
// post-inc accesses. We do this by taking zero-offset loads/stores with an add,
// and converting them to a postinc load/store of the same type. Any subsequent
// accesses will be adjusted to use and account for the post-inc value.
// For example:
// LDR #0 LDR_POSTINC #16
// LDR #4 LDR #-12
// LDR #8 LDR #-8
// LDR #12 LDR #-4
// ADD #16
bool ARMPreAllocLoadStoreOpt::DistributeIncrements(Register Base) {
// We are looking for:
// One zero offset load/store that can become postinc
MachineInstr *BaseAccess = nullptr;
// An increment that can be folded in
MachineInstr *Increment = nullptr;
// Other accesses after BaseAccess that will need to be updated to use the
// postinc value
SmallPtrSet<MachineInstr *, 8> OtherAccesses;
for (auto &Use : MRI->use_nodbg_instructions(Base)) {
if (!Increment && getAddSubImmediate(Use) != 0) {
Increment = &Use;
continue;
}

int BaseOp = getBaseOperandIndex(Use);
if (BaseOp == -1)
return false;

if (!Use.getOperand(BaseOp).isReg() ||
Use.getOperand(BaseOp).getReg() != Base)
return false;
if (Use.getOperand(BaseOp + 1).getImm() == 0)
BaseAccess = &Use;
else
OtherAccesses.insert(&Use);
}

if (!BaseAccess || !Increment ||
BaseAccess->getParent() != Increment->getParent())
return false;
Register PredReg;
if (Increment->definesRegister(ARM::CPSR) ||
getInstrPredicate(*Increment, PredReg) != ARMCC::AL)
return false;

LLVM_DEBUG(dbgs() << "\nAttempting to distribute increments on VirtualReg "
<< Base.virtRegIndex() << "\n");

// Make sure that Increment has no uses before BaseAccess.
for (MachineInstr &Use :
MRI->use_nodbg_instructions(Increment->getOperand(0).getReg())) {
if (!DT->dominates(BaseAccess, &Use) || &Use == BaseAccess) {
LLVM_DEBUG(dbgs() << " BaseAccess doesn't dominate use of increment\n");
return false;
}
}

// Make sure that Increment can be folded into Base
int IncrementOffset = getAddSubImmediate(*Increment);
unsigned NewPostIncOpcode = getPostIndexedLoadStoreOpcode(
BaseAccess->getOpcode(), IncrementOffset > 0 ? ARM_AM::add : ARM_AM::sub);
if (!isLegalAddressImm(NewPostIncOpcode, IncrementOffset, TII)) {
LLVM_DEBUG(dbgs() << " Illegal addressing mode immediate on postinc\n");
return false;
}

// And make sure that the negative value of increment can be added to all
// other offsets after the BaseAccess. We rely on either
// dominates(BaseAccess, OtherAccess) or dominates(OtherAccess, BaseAccess)
// to keep things simple.
SmallPtrSet<MachineInstr *, 4> SuccessorAccesses;
for (auto *Use : OtherAccesses) {
if (DT->dominates(BaseAccess, Use)) {
SuccessorAccesses.insert(Use);
unsigned BaseOp = getBaseOperandIndex(*Use);
if (!isLegalAddressImm(
Use->getOpcode(),
Use->getOperand(BaseOp + 1).getImm() - IncrementOffset, TII)) {
LLVM_DEBUG(dbgs() << " Illegal addressing mode immediate on use\n");
return false;
}
} else if (!DT->dominates(Use, BaseAccess)) {
LLVM_DEBUG(
dbgs() << " Unknown dominance relation between Base and Use\n");
return false;
}
}

// Replace BaseAccess with a post inc
LLVM_DEBUG(dbgs() << "Changing: "; BaseAccess->dump());
LLVM_DEBUG(dbgs() << " And : "; Increment->dump());
Register NewBaseReg = Increment->getOperand(0).getReg();
MachineInstr *BaseAccessPost =
createPostIncLoadStore(BaseAccess, IncrementOffset, NewBaseReg, TII, TRI);
BaseAccess->eraseFromParent();
Increment->eraseFromParent();
LLVM_DEBUG(dbgs() << " To : "; BaseAccessPost->dump());

for (auto *Use : SuccessorAccesses) {
LLVM_DEBUG(dbgs() << "Changing: "; Use->dump());
unsigned BaseOp = getBaseOperandIndex(*Use);
Use->getOperand(BaseOp).setReg(NewBaseReg);
int OldOffset = Use->getOperand(BaseOp + 1).getImm();
Use->getOperand(BaseOp + 1).setImm(OldOffset - IncrementOffset);
LLVM_DEBUG(dbgs() << " To : "; Use->dump());
}

// Remove the kill flag from all uses of NewBaseReg, in case any old uses
// remain.
for (MachineOperand &Op : MRI->use_nodbg_operands(NewBaseReg))
Op.setIsKill(false);
return true;
}

bool ARMPreAllocLoadStoreOpt::DistributeIncrements() {
bool Changed = false;
SmallSetVector<Register, 4> Visited;
for (auto &MBB : *MF) {
for (auto &MI : MBB) {
int BaseOp = getBaseOperandIndex(MI);
if (BaseOp == -1 || !MI.getOperand(BaseOp).isReg())
continue;

Register Base = MI.getOperand(BaseOp).getReg();
if (!Base.isVirtual() || Visited.count(Base))
continue;

Visited.insert(Base);
}
}

for (auto Base : Visited)
Changed |= DistributeIncrements(Base);

return Changed;
}

/// Returns an instance of the load / store optimization pass.
FunctionPass *llvm::createARMLoadStoreOptimizationPass(bool PreAlloc) {
if (PreAlloc)
17 changes: 2 additions & 15 deletions llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
@@ -463,21 +463,8 @@ bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt) {
// that the predication will be equivalent. For this we need:
// NumElements = NumElements - VectorWidth. The sub will be a sub immediate
// and we can also allow register copies within the chain too.
auto IsValidSub = [](MachineInstr *MI, unsigned ExpectedVecWidth) {
unsigned ImmOpIdx = 0;
switch (MI->getOpcode()) {
default:
llvm_unreachable("unhandled sub opcode");
case ARM::tSUBi3:
case ARM::tSUBi8:
ImmOpIdx = 3;
break;
case ARM::t2SUBri:
case ARM::t2SUBri12:
ImmOpIdx = 2;
break;
}
return MI->getOperand(ImmOpIdx).getImm() == ExpectedVecWidth;
auto IsValidSub = [](MachineInstr *MI, int ExpectedVecWidth) {
return -getAddSubImmediate(*MI) == ExpectedVecWidth;
};

MBB = VCTP->getParent();
1 change: 1 addition & 0 deletions llvm/test/CodeGen/ARM/O3-pipeline.ll
@@ -94,6 +94,7 @@
; CHECK-NEXT: Remove dead machine instructions
; CHECK-NEXT: MVE VPT Optimisation Pass
; CHECK-NEXT: ARM MLA / MLS expansion pass
; CHECK-NEXT: MachineDominator Tree Construction
; CHECK-NEXT: ARM pre- register allocation load / store optimization pass
; CHECK-NEXT: ARM A15 S->D optimizer
; CHECK-NEXT: Detect Dead Lanes
