[ARM] Distribute MVE post-increments
This adds some extra processing into the Pre-RA ARM load/store optimizer
to detect and merge MVE loads/stores and adds of the same base. We don't
always turn these into post-increments during ISel: the DAG being a graph,
we don't always have an order to use for the nodes, so we cannot tell
which nodes to make post-inc and which should use the new post-inc value.
After ISel, we have an order that we can use to post-inc the following
instructions.

So this looks for a load/store with a starting offset of 0 and an
add/sub from the same base, plus a number of other loads/stores. We then
do some checks and convert the zero-offset load/store into a post-inc
variant. Any loads/stores after it have the increment subtracted from
their immediate offsets.  For example:
  LDR #4           LDR #4
  LDR #0           LDR_POSTINC #16
  LDR #8           LDR #-8
  LDR #12          LDR #-4
  ADD #16
It only handles MVE loads/stores at the moment. Normal loads/stores will
be added in a follow-up patch; they just have some extra details to
ensure that we keep generating LDRD/LDM successfully.

Differential Revision: https://reviews.llvm.org/D77813
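
To make the pattern above concrete, the following is a hedged C++ sketch (not
part of this commit) using the ACLE MVE intrinsics from <arm_mve.h>. The
function name and the annotated offsets are illustrative assumptions; when
built for an MVE target (for example at -O3 for a Cortex-M55-class CPU), code
of this shape tends to produce several VLDRW accesses off one base register at
immediate offsets 0/16/32/48 plus an add of the base by #64, which is the
pattern the new DistributeIncrements code folds into a post-increment. The
exact instruction selection depends on the target and compiler version.

  #include <stdint.h>
  #include <arm_mve.h>

  // Sum four consecutive vectors per iteration. The four loads share one base
  // pointer; the "src += 16" is the add that would be distributed.
  void sum16(const int32_t *src, int32_t *dst, int blocks) {
    for (int i = 0; i < blocks; ++i) {
      int32x4_t a = vldrwq_s32(src);      // offset #0  -> post-inc #64
      int32x4_t b = vldrwq_s32(src + 4);  // offset #16 -> rewritten to #-48
      int32x4_t c = vldrwq_s32(src + 8);  // offset #32 -> rewritten to #-32
      int32x4_t d = vldrwq_s32(src + 12); // offset #48 -> rewritten to #-16
      src += 16;                          // the ADD #64 being distributed
      vstrwq_s32(dst, vaddq_s32(vaddq_s32(a, b), vaddq_s32(c, d)));
      dst += 4;
    }
  }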
davemgreen committed Apr 22, 2020
1 parent 4eca1c0 commit 892af45
Showing 14 changed files with 482 additions and 279 deletions.
43 changes: 43 additions & 0 deletions llvm/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -788,6 +788,49 @@ bool HasLowerConstantMaterializationCost(unsigned Val1, unsigned Val2,
const ARMSubtarget *Subtarget,
bool ForCodesize = false);

// Return the immediate if this is ADDri or SUBri, scaled as appropriate.
// Returns 0 for unknown instructions.
inline int getAddSubImmediate(MachineInstr &MI) {
int Scale = 1;
unsigned ImmOp;
switch (MI.getOpcode()) {
case ARM::t2ADDri:
ImmOp = 2;
break;
case ARM::t2SUBri:
case ARM::t2SUBri12:
ImmOp = 2;
Scale = -1;
break;
case ARM::tSUBi3:
case ARM::tSUBi8:
ImmOp = 3;
Scale = -1;
break;
default:
return 0;
}
return Scale * MI.getOperand(ImmOp).getImm();
}

// Given a memory access Opcode, check that the given Imm would be a valid Offset
// for this instruction using its addressing mode.
inline bool isLegalAddressImm(unsigned Opcode, int Imm,
const TargetInstrInfo *TII) {
const MCInstrDesc &Desc = TII->get(Opcode);
unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
switch (AddrMode) {
case ARMII::AddrModeT2_i7:
return std::abs(Imm) < (((1 << 7) * 1) - 1);
case ARMII::AddrModeT2_i7s2:
return std::abs(Imm) < (((1 << 7) * 2) - 1) && Imm % 2 == 0;
case ARMII::AddrModeT2_i7s4:
return std::abs(Imm) < (((1 << 7) * 4) - 1) && Imm % 4 == 0;
default:
llvm_unreachable("Unhandled Addressing mode");
}
}

} // end namespace llvm

#endif // LLVM_LIB_TARGET_ARM_ARMBASEINSTRINFO_H
241 changes: 238 additions & 3 deletions llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -32,6 +32,7 @@
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
@@ -50,6 +51,7 @@
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Type.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Allocator.h"
@@ -1383,6 +1385,38 @@ static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc,
case ARM::t2STRi8:
case ARM::t2STRi12:
return ARM::t2STR_POST;

case ARM::MVE_VLDRBS16:
return ARM::MVE_VLDRBS16_post;
case ARM::MVE_VLDRBS32:
return ARM::MVE_VLDRBS32_post;
case ARM::MVE_VLDRBU16:
return ARM::MVE_VLDRBU16_post;
case ARM::MVE_VLDRBU32:
return ARM::MVE_VLDRBU32_post;
case ARM::MVE_VLDRHS32:
return ARM::MVE_VLDRHS32_post;
case ARM::MVE_VLDRHU32:
return ARM::MVE_VLDRHU32_post;
case ARM::MVE_VLDRBU8:
return ARM::MVE_VLDRBU8_post;
case ARM::MVE_VLDRHU16:
return ARM::MVE_VLDRHU16_post;
case ARM::MVE_VLDRWU32:
return ARM::MVE_VLDRWU32_post;
case ARM::MVE_VSTRB16:
return ARM::MVE_VSTRB16_post;
case ARM::MVE_VSTRB32:
return ARM::MVE_VSTRB32_post;
case ARM::MVE_VSTRH32:
return ARM::MVE_VSTRH32_post;
case ARM::MVE_VSTRBU8:
return ARM::MVE_VSTRBU8_post;
case ARM::MVE_VSTRHU16:
return ARM::MVE_VSTRHU16_post;
case ARM::MVE_VSTRWU32:
return ARM::MVE_VSTRWU32_post;

default: llvm_unreachable("Unhandled opcode!");
}
}
@@ -2046,6 +2080,7 @@ namespace {
const TargetRegisterInfo *TRI;
const ARMSubtarget *STI;
MachineRegisterInfo *MRI;
MachineDominatorTree *DT;
MachineFunction *MF;

ARMPreAllocLoadStoreOpt() : MachineFunctionPass(ID) {}
@@ -2058,6 +2093,8 @@

void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<MachineDominatorTree>();
AU.addPreserved<MachineDominatorTree>();
MachineFunctionPass::getAnalysisUsage(AU);
}

@@ -2071,14 +2108,19 @@ namespace {
unsigned Base, bool isLd,
DenseMap<MachineInstr*, unsigned> &MI2LocMap);
bool RescheduleLoadStoreInstrs(MachineBasicBlock *MBB);
bool DistributeIncrements();
bool DistributeIncrements(Register Base);
};

} // end anonymous namespace

char ARMPreAllocLoadStoreOpt::ID = 0;

INITIALIZE_PASS(ARMPreAllocLoadStoreOpt, "arm-prera-ldst-opt",
ARM_PREALLOC_LOAD_STORE_OPT_NAME, false, false)
INITIALIZE_PASS_BEGIN(ARMPreAllocLoadStoreOpt, "arm-prera-ldst-opt",
ARM_PREALLOC_LOAD_STORE_OPT_NAME, false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_END(ARMPreAllocLoadStoreOpt, "arm-prera-ldst-opt",
ARM_PREALLOC_LOAD_STORE_OPT_NAME, false, false)

// Limit the number of instructions to be rescheduled.
// FIXME: tune this limit, and/or come up with some better heuristics.
@@ -2094,10 +2136,11 @@ bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
TII = STI->getInstrInfo();
TRI = STI->getRegisterInfo();
MRI = &Fn.getRegInfo();
DT = &getAnalysis<MachineDominatorTree>();
MF = &Fn;
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

bool Modified = false;
bool Modified = DistributeIncrements();
for (MachineBasicBlock &MFI : Fn)
Modified |= RescheduleLoadStoreInstrs(&MFI);

@@ -2475,6 +2518,198 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
return RetVal;
}

// Get the Base register operand index from the memory access MachineInstr if we
// should attempt to distribute postinc on it. Return -1 if it is not a valid
// instruction type. If it returns an index, it is assumed the instruction uses
// an r+i addressing mode, and getBaseOperandIndex() + 1 is the Offset index.
static int getBaseOperandIndex(MachineInstr &MI) {
switch (MI.getOpcode()) {
case ARM::MVE_VLDRBS16:
case ARM::MVE_VLDRBS32:
case ARM::MVE_VLDRBU16:
case ARM::MVE_VLDRBU32:
case ARM::MVE_VLDRHS32:
case ARM::MVE_VLDRHU32:
case ARM::MVE_VLDRBU8:
case ARM::MVE_VLDRHU16:
case ARM::MVE_VLDRWU32:
case ARM::MVE_VSTRB16:
case ARM::MVE_VSTRB32:
case ARM::MVE_VSTRH32:
case ARM::MVE_VSTRBU8:
case ARM::MVE_VSTRHU16:
case ARM::MVE_VSTRWU32:
return 1;
}
return -1;
}

static MachineInstr *createPostIncLoadStore(MachineInstr *MI, int Offset,
Register NewReg,
const TargetInstrInfo *TII,
const TargetRegisterInfo *TRI) {
MachineFunction *MF = MI->getMF();
MachineRegisterInfo &MRI = MF->getRegInfo();

unsigned NewOpcode = getPostIndexedLoadStoreOpcode(
MI->getOpcode(), Offset > 0 ? ARM_AM::add : ARM_AM::sub);

const MCInstrDesc &MCID = TII->get(NewOpcode);
// Constrain the def register class
const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0, TRI, *MF);
MRI.constrainRegClass(NewReg, TRC);
// And do the same for the base operand
TRC = TII->getRegClass(MCID, 2, TRI, *MF);
MRI.constrainRegClass(MI->getOperand(1).getReg(), TRC);

return BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), MCID)
.addReg(NewReg, RegState::Define)
.add(MI->getOperand(0))
.add(MI->getOperand(1))
.addImm(Offset)
.add(MI->getOperand(3))
.add(MI->getOperand(4))
.cloneMemRefs(*MI);
}

// Given a Base Register, optimise the load/store uses to attempt to create more
// post-inc accesses. We do this by taking zero-offset loads/stores with an add,
// and converting them to a postinc load/store of the same type. Any subsequent
// accesses will be adjusted to use and account for the post-inc value.
// For example:
// LDR #0 LDR_POSTINC #16
// LDR #4 LDR #-12
// LDR #8 LDR #-8
// LDR #12 LDR #-4
// ADD #16
bool ARMPreAllocLoadStoreOpt::DistributeIncrements(Register Base) {
// We are looking for:
// One zero offset load/store that can become postinc
MachineInstr *BaseAccess = nullptr;
// An increment that can be folded in
MachineInstr *Increment = nullptr;
// Other accesses after BaseAccess that will need to be updated to use the
// postinc value
SmallPtrSet<MachineInstr *, 8> OtherAccesses;
for (auto &Use : MRI->use_nodbg_instructions(Base)) {
if (!Increment && getAddSubImmediate(Use) != 0) {
Increment = &Use;
continue;
}

int BaseOp = getBaseOperandIndex(Use);
if (BaseOp == -1)
return false;

if (!Use.getOperand(BaseOp).isReg() ||
Use.getOperand(BaseOp).getReg() != Base)
return false;
if (Use.getOperand(BaseOp + 1).getImm() == 0)
BaseAccess = &Use;
else
OtherAccesses.insert(&Use);
}

if (!BaseAccess || !Increment ||
BaseAccess->getParent() != Increment->getParent())
return false;
Register PredReg;
if (Increment->definesRegister(ARM::CPSR) ||
getInstrPredicate(*Increment, PredReg) != ARMCC::AL)
return false;

LLVM_DEBUG(dbgs() << "\nAttempting to distribute increments on VirtualReg "
<< Base.virtRegIndex() << "\n");

// Make sure that Increment has no uses before BaseAccess.
for (MachineInstr &Use :
MRI->use_nodbg_instructions(Increment->getOperand(0).getReg())) {
if (!DT->dominates(BaseAccess, &Use) || &Use == BaseAccess) {
LLVM_DEBUG(dbgs() << " BaseAccess doesn't dominate use of increment\n");
return false;
}
}

// Make sure that Increment can be folded into Base
int IncrementOffset = getAddSubImmediate(*Increment);
unsigned NewPostIncOpcode = getPostIndexedLoadStoreOpcode(
BaseAccess->getOpcode(), IncrementOffset > 0 ? ARM_AM::add : ARM_AM::sub);
if (!isLegalAddressImm(NewPostIncOpcode, IncrementOffset, TII)) {
LLVM_DEBUG(dbgs() << " Illegal addressing mode immediate on postinc\n");
return false;
}

// And make sure that the negative value of increment can be added to all
// other offsets after the BaseAccess. We rely on either
// dominates(BaseAccess, OtherAccess) or dominates(OtherAccess, BaseAccess)
// to keep things simple.
SmallPtrSet<MachineInstr *, 4> SuccessorAccesses;
for (auto *Use : OtherAccesses) {
if (DT->dominates(BaseAccess, Use)) {
SuccessorAccesses.insert(Use);
unsigned BaseOp = getBaseOperandIndex(*Use);
if (!isLegalAddressImm(
Use->getOpcode(),
Use->getOperand(BaseOp + 1).getImm() - IncrementOffset, TII)) {
LLVM_DEBUG(dbgs() << " Illegal addressing mode immediate on use\n");
return false;
}
} else if (!DT->dominates(Use, BaseAccess)) {
LLVM_DEBUG(
dbgs() << " Unknown dominance relation between Base and Use\n");
return false;
}
}

// Replace BaseAccess with a post inc
LLVM_DEBUG(dbgs() << "Changing: "; BaseAccess->dump());
LLVM_DEBUG(dbgs() << " And : "; Increment->dump());
Register NewBaseReg = Increment->getOperand(0).getReg();
MachineInstr *BaseAccessPost =
createPostIncLoadStore(BaseAccess, IncrementOffset, NewBaseReg, TII, TRI);
BaseAccess->eraseFromParent();
Increment->eraseFromParent();
LLVM_DEBUG(dbgs() << " To : "; BaseAccessPost->dump());

for (auto *Use : SuccessorAccesses) {
LLVM_DEBUG(dbgs() << "Changing: "; Use->dump());
unsigned BaseOp = getBaseOperandIndex(*Use);
Use->getOperand(BaseOp).setReg(NewBaseReg);
int OldOffset = Use->getOperand(BaseOp + 1).getImm();
Use->getOperand(BaseOp + 1).setImm(OldOffset - IncrementOffset);
LLVM_DEBUG(dbgs() << " To : "; Use->dump());
}

// Remove the kill flag from all uses of NewBaseReg, in case any old uses
// remain.
for (MachineOperand &Op : MRI->use_nodbg_operands(NewBaseReg))
Op.setIsKill(false);
return true;
}

bool ARMPreAllocLoadStoreOpt::DistributeIncrements() {
bool Changed = false;
SmallSetVector<Register, 4> Visited;
for (auto &MBB : *MF) {
for (auto &MI : MBB) {
int BaseOp = getBaseOperandIndex(MI);
if (BaseOp == -1 || !MI.getOperand(BaseOp).isReg())
continue;

Register Base = MI.getOperand(BaseOp).getReg();
if (!Base.isVirtual() || Visited.count(Base))
continue;

Visited.insert(Base);
}
}

for (auto Base : Visited)
Changed |= DistributeIncrements(Base);

return Changed;
}

/// Returns an instance of the load / store optimization pass.
FunctionPass *llvm::createARMLoadStoreOptimizationPass(bool PreAlloc) {
if (PreAlloc)
17 changes: 2 additions & 15 deletions llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
@@ -463,21 +463,8 @@ bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt) {
// that the predication will be equivalent. For this we need:
// NumElements = NumElements - VectorWidth. The sub will be a sub immediate
// and we can also allow register copies within the chain too.
auto IsValidSub = [](MachineInstr *MI, unsigned ExpectedVecWidth) {
unsigned ImmOpIdx = 0;
switch (MI->getOpcode()) {
default:
llvm_unreachable("unhandled sub opcode");
case ARM::tSUBi3:
case ARM::tSUBi8:
ImmOpIdx = 3;
break;
case ARM::t2SUBri:
case ARM::t2SUBri12:
ImmOpIdx = 2;
break;
}
return MI->getOperand(ImmOpIdx).getImm() == ExpectedVecWidth;
auto IsValidSub = [](MachineInstr *MI, int ExpectedVecWidth) {
return -getAddSubImmediate(*MI) == ExpectedVecWidth;
};

MBB = VCTP->getParent();
1 change: 1 addition & 0 deletions llvm/test/CodeGen/ARM/O3-pipeline.ll
@@ -94,6 +94,7 @@
; CHECK-NEXT: Remove dead machine instructions
; CHECK-NEXT: MVE VPT Optimisation Pass
; CHECK-NEXT: ARM MLA / MLS expansion pass
; CHECK-NEXT: MachineDominator Tree Construction
; CHECK-NEXT: ARM pre- register allocation load / store optimization pass
; CHECK-NEXT: ARM A15 S->D optimizer
; CHECK-NEXT: Detect Dead Lanes
