Skip to content

Commit

Permalink
[ARM] DLS/LE low-overhead loop code generation
Browse files Browse the repository at this point in the history
Introduce three pseudo instructions to be used during DAG ISel to
represent v8.1-m low-overhead loops. One maps to set_loop_iterations
while loop_decrement_reg is lowered to two, so that we can separate
the decrement and branching operations. The pseudo instructions are
expanded pre-emission, where we can still decide whether we actually
want to generate a low-overhead loop, in a new pass:
ARMLowOverheadLoops. The pass currently bails, reverting to an sub,
icmp and br, in the cases where a call or stack spill/restore happens
between the decrement and branching instructions, or if the loop is
too large.

Differential Revision: https://reviews.llvm.org/D63476

llvm-svn: 364288
  • Loading branch information
sparker-arm committed Jun 25, 2019
1 parent ecd9348 commit a6fd919
Show file tree
Hide file tree
Showing 17 changed files with 1,644 additions and 2 deletions.
3 changes: 2 additions & 1 deletion llvm/lib/Target/ARM/ARM.h
Expand Up @@ -35,7 +35,7 @@ class MachineInstr;
class MCInst;
class PassRegistry;


FunctionPass *createARMLowOverheadLoopsPass();
Pass *createARMParallelDSPPass();
FunctionPass *createARMISelDag(ARMBaseTargetMachine &TM,
CodeGenOpt::Level OptLevel);
Expand Down Expand Up @@ -66,6 +66,7 @@ void initializeARMExpandPseudoPass(PassRegistry &);
void initializeThumb2SizeReducePass(PassRegistry &);
void initializeThumb2ITBlockPass(PassRegistry &);
void initializeMVEVPTBlockPass(PassRegistry &);
void initializeARMLowOverheadLoopsPass(PassRegistry &);

} // end namespace llvm

Expand Down
30 changes: 30 additions & 0 deletions llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
Expand Up @@ -2986,6 +2986,36 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
unsigned CC = (unsigned) cast<ConstantSDNode>(N2)->getZExtValue();

if (InFlag.getOpcode() == ARMISD::CMPZ) {
if (InFlag.getOperand(0).getOpcode() == ISD::INTRINSIC_W_CHAIN) {
SDValue Int = InFlag.getOperand(0);
uint64_t ID = cast<ConstantSDNode>(Int->getOperand(1))->getZExtValue();

// Handle low-overhead loops.
if (ID == Intrinsic::loop_decrement_reg) {
SDValue Elements = Int.getOperand(2);
SDValue Size = CurDAG->getTargetConstant(
cast<ConstantSDNode>(Int.getOperand(3))->getZExtValue(), dl,
MVT::i32);

SDValue Args[] = { Elements, Size, Int.getOperand(0) };
SDNode *LoopDec =
CurDAG->getMachineNode(ARM::t2LoopDec, dl,
CurDAG->getVTList(MVT::i32, MVT::Other),
Args);
ReplaceUses(Int.getNode(), LoopDec);

SDValue EndArgs[] = { SDValue(LoopDec, 0), N1, Chain };
SDNode *LoopEnd =
CurDAG->getMachineNode(ARM::t2LoopEnd, dl, MVT::Other, EndArgs);

ReplaceUses(N, LoopEnd);
CurDAG->RemoveDeadNode(N);
CurDAG->RemoveDeadNode(InFlag.getNode());
CurDAG->RemoveDeadNode(Int.getNode());
return;
}
}

bool SwitchEQNEToPLMI;
SelectCMPZ(InFlag.getNode(), SwitchEQNEToPLMI);
InFlag = N->getOperand(4);
Expand Down
16 changes: 16 additions & 0 deletions llvm/lib/Target/ARM/ARMInstrThumb2.td
Expand Up @@ -5135,6 +5135,7 @@ class t2LOL<dag oops, dag iops, string asm, string ops>
let Predicates = [IsThumb2, HasV8_1MMainline, HasLOB];
}

let isNotDuplicable = 1 in {
def t2WLS : t2LOL<(outs GPRlr:$LR),
(ins rGPR:$Rn, wlslabel_u11:$label),
"wls", "$LR, $Rn, $label"> {
Expand Down Expand Up @@ -5178,6 +5179,21 @@ def t2LE : t2LOL<(outs ), (ins lelabel_u11:$label), "le", "$label"> {
let Inst{10-1} = label{10-1};
}

def t2DoLoopStart :
t2PseudoInst<(outs), (ins rGPR:$elts), 4, IIC_Br,
[(int_set_loop_iterations rGPR:$elts)]>, Sched<[WriteBr]>;

def t2LoopDec :
t2PseudoInst<(outs GPRlr:$Rm), (ins GPRlr:$Rn, imm0_7:$size),
4, IIC_Br, []>, Sched<[WriteBr]>;

let isBranch = 1, isTerminator = 1, hasSideEffects = 1 in
def t2LoopEnd :
t2PseudoInst<(outs), (ins GPRlr:$elts, brtarget:$target),
8, IIC_Br, []>, Sched<[WriteBr]>;

} // end isNotDuplicable

class CS<string iname, bits<4> opcode, list<dag> pattern=[]>
: V8_1MI<(outs rGPR:$Rd), (ins GPRwithZR:$Rn, GPRwithZR:$Rm, pred_noal:$fcond),
AddrModeNone, NoItinerary, iname, "$Rd, $Rn, $Rm, $fcond", "", pattern> {
Expand Down
295 changes: 295 additions & 0 deletions llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
@@ -0,0 +1,295 @@
//===-- ARMLowOverheadLoops.cpp - CodeGen Low-overhead Loops ---*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// Finalize v8.1-m low-overhead loops by converting the associated pseudo
/// instructions into machine operations.
/// The expectation is that the loop contains three pseudo instructions:
/// - t2*LoopStart - placed in the preheader or pre-preheader. The do-loop
/// form should be in the preheader, whereas the while form should be in the
/// preheaders only predecessor. TODO: Could DoLoopStart get moved into the
/// pre-preheader?
/// - t2LoopDec - placed within in the loop body.
/// - t2LoopEnd - the loop latch terminator.
///
//===----------------------------------------------------------------------===//

#include "ARM.h"
#include "ARMBaseInstrInfo.h"
#include "ARMBaseRegisterInfo.h"
#include "ARMBasicBlockInfo.h"
#include "ARMSubtarget.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"

using namespace llvm;

#define DEBUG_TYPE "arm-low-overhead-loops"
#define ARM_LOW_OVERHEAD_LOOPS_NAME "ARM Low Overhead Loops pass"

namespace {

class ARMLowOverheadLoops : public MachineFunctionPass {
const ARMBaseInstrInfo *TII = nullptr;
MachineRegisterInfo *MRI = nullptr;
std::unique_ptr<ARMBasicBlockUtils> BBUtils = nullptr;

public:
static char ID;

ARMLowOverheadLoops() : MachineFunctionPass(ID) { }

void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
AU.addRequired<MachineLoopInfo>();
MachineFunctionPass::getAnalysisUsage(AU);
}

bool runOnMachineFunction(MachineFunction &MF) override;

bool ProcessLoop(MachineLoop *ML);

void Expand(MachineLoop *ML, MachineInstr *Start,
MachineInstr *Dec, MachineInstr *End, bool Revert);

MachineFunctionProperties getRequiredProperties() const override {
return MachineFunctionProperties().set(
MachineFunctionProperties::Property::NoVRegs);
}

StringRef getPassName() const override {
return ARM_LOW_OVERHEAD_LOOPS_NAME;
}
};
}

char ARMLowOverheadLoops::ID = 0;

INITIALIZE_PASS(ARMLowOverheadLoops, DEBUG_TYPE, ARM_LOW_OVERHEAD_LOOPS_NAME,
false, false)

bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &MF) {
//if (!static_cast<const ARMSubtarget&>(MF.getSubtarget()).hasLOB())
//return false;

LLVM_DEBUG(dbgs() << "ARM Loops on " << MF.getName() << " ------------- \n");

auto &MLI = getAnalysis<MachineLoopInfo>();
MRI = &MF.getRegInfo();
TII = static_cast<const ARMBaseInstrInfo*>(
MF.getSubtarget().getInstrInfo());
BBUtils = std::unique_ptr<ARMBasicBlockUtils>(new ARMBasicBlockUtils(MF));
BBUtils->computeAllBlockSizes();

bool Changed = false;
for (auto ML : MLI) {
if (!ML->getParentLoop())
Changed |= ProcessLoop(ML);
}
return Changed;
}

bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {

bool Changed = false;

// Process inner loops first.
for (auto I = ML->begin(), E = ML->end(); I != E; ++I)
Changed |= ProcessLoop(*I);

LLVM_DEBUG(dbgs() << "ARM Loops: Processing " << *ML);

auto IsLoopStart = [](MachineInstr &MI) {
return MI.getOpcode() == ARM::t2DoLoopStart;
};

auto SearchForStart =
[&IsLoopStart](MachineBasicBlock *MBB) -> MachineInstr* {
for (auto &MI : *MBB) {
if (IsLoopStart(MI))
return &MI;
}
return nullptr;
};

MachineInstr *Start = nullptr;
MachineInstr *Dec = nullptr;
MachineInstr *End = nullptr;
bool Revert = false;

if (auto *Preheader = ML->getLoopPreheader())
Start = SearchForStart(Preheader);

// Find the low-overhead loop components and decide whether or not to fall
// back to a normal loop.
for (auto *MBB : reverse(ML->getBlocks())) {
for (auto &MI : *MBB) {
if (MI.getOpcode() == ARM::t2LoopDec)
Dec = &MI;
else if (MI.getOpcode() == ARM::t2LoopEnd)
End = &MI;

if (!Dec)
continue;

// TODO: Though the call will require LE to execute again, does this
// mean we should revert? Always executing LE hopefully should be faster
// than performing a sub,cmp,br or even subs,br.
if (MI.getDesc().isCall())
Revert = true;

// If we find that we load/store LR between LoopDec and LoopEnd, expect
// that the decremented value has been spilled to the stack. Because
// this value isn't actually going to be produced until the latch, by LE,
// we would need to generate a real sub. The value is also likely to be
// reloaded for use of LoopEnd - in which in case we'd need to perform
// an add because it gets negated again by LE! The other option is to
// then generate the other form of LE which doesn't perform the sub.
if (MI.mayLoad() || MI.mayStore())
Revert =
MI.getOperand(0).isReg() && MI.getOperand(0).getReg() == ARM::LR;
}

if (Dec && End && Revert)
break;
}

if (Start || Dec || End) {
if (!Start || !Dec || !End)
report_fatal_error("Failed to find all loop components");
} else {
LLVM_DEBUG(dbgs() << "ARM Loops: Not a low-overhead loop.\n");
return Changed;
}

if (!End->getOperand(1).isMBB() ||
End->getOperand(1).getMBB() != ML->getHeader())
report_fatal_error("Expected LoopEnd to target Loop Header");

// The LE instructions has 12-bits for the label offset.
if (!BBUtils->isBBInRange(End, ML->getHeader(), 4096)) {
LLVM_DEBUG(dbgs() << "ARM Loops: Too large for a low-overhead loop!\n");
Revert = true;
}

LLVM_DEBUG(dbgs() << "ARM Loops:\n - Found Loop Start: " << *Start
<< " - Found Loop Dec: " << *Dec
<< " - Found Loop End: " << *End);

Expand(ML, Start, Dec, End, Revert);
return true;
}

void ARMLowOverheadLoops::Expand(MachineLoop *ML, MachineInstr *Start,
MachineInstr *Dec, MachineInstr *End,
bool Revert) {

auto ExpandLoopStart = [this](MachineLoop *ML, MachineInstr *Start) {
// The trip count should already been held in LR since the instructions
// within the loop can only read and write to LR. So, there should be a
// mov to setup the count. WLS/DLS perform this move, so find the original
// and delete it - inserting WLS/DLS in its place.
MachineBasicBlock *MBB = Start->getParent();
MachineInstr *InsertPt = Start;
for (auto &I : MRI->def_instructions(ARM::LR)) {
if (I.getParent() != MBB)
continue;

// Always execute.
if (!I.getOperand(2).isImm() || I.getOperand(2).getImm() != ARMCC::AL)
continue;

// Only handle move reg, if the trip count it will need moving into a reg
// before the setup instruction anyway.
if (!I.getDesc().isMoveReg() ||
!I.getOperand(1).isIdenticalTo(Start->getOperand(0)))
continue;
InsertPt = &I;
break;
}

MachineInstrBuilder MIB =
BuildMI(*MBB, InsertPt, InsertPt->getDebugLoc(), TII->get(ARM::t2DLS));
if (InsertPt != Start)
InsertPt->eraseFromParent();

MIB.addDef(ARM::LR);
MIB.add(Start->getOperand(0));
LLVM_DEBUG(dbgs() << "ARM Loops: Inserted DLS: " << *MIB);
Start->eraseFromParent();
};

// Combine the LoopDec and LoopEnd instructions into LE(TP).
auto ExpandLoopEnd = [this](MachineLoop *ML, MachineInstr *Dec,
MachineInstr *End) {
MachineBasicBlock *MBB = End->getParent();
MachineInstrBuilder MIB = BuildMI(*MBB, End, End->getDebugLoc(),
TII->get(ARM::t2LEUpdate));
MIB.addDef(ARM::LR);
MIB.add(End->getOperand(0));
MIB.add(End->getOperand(1));
LLVM_DEBUG(dbgs() << "ARM Loops: Inserted LE: " << *MIB);

// If there is a branch after loop end, which branches to the fallthrough
// block, remove the branch.
MachineBasicBlock *Latch = End->getParent();
MachineInstr *Terminator = &Latch->instr_back();
if (End != Terminator) {
MachineBasicBlock *Exit = ML->getExitBlock();
if (Latch->isLayoutSuccessor(Exit)) {
LLVM_DEBUG(dbgs() << "ARM Loops: Removing loop exit branch: "
<< *Terminator);
Terminator->eraseFromParent();
}
}
End->eraseFromParent();
Dec->eraseFromParent();
};

// Generate a subs, or sub and cmp, and a branch instead of an LE.
// TODO: Check flags so that we can possibly generate a subs.
auto ExpandBranch = [this](MachineInstr *Dec, MachineInstr *End) {
LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to sub, cmp, br.\n");
// Create sub
MachineBasicBlock *MBB = Dec->getParent();
MachineInstrBuilder MIB = BuildMI(*MBB, Dec, Dec->getDebugLoc(),
TII->get(ARM::t2SUBri));
MIB.addDef(ARM::LR);
MIB.add(Dec->getOperand(1));
MIB.add(Dec->getOperand(2));
MIB.addImm(ARMCC::AL);
MIB.addReg(0);
MIB.addReg(0);

// Create cmp
MBB = End->getParent();
MIB = BuildMI(*MBB, End, End->getDebugLoc(), TII->get(ARM::t2CMPri));
MIB.addReg(ARM::LR);
MIB.addImm(0);
MIB.addImm(ARMCC::AL);

// Create bne
MIB = BuildMI(*MBB, End, End->getDebugLoc(), TII->get(ARM::t2Bcc));
MIB.add(End->getOperand(1)); // branch target
MIB.addImm(ARMCC::NE); // condition code
End->eraseFromParent();
Dec->eraseFromParent();
};

if (Revert) {
Start->eraseFromParent();
ExpandBranch(Dec, End);
} else {
ExpandLoopStart(ML, Start);
ExpandLoopEnd(ML, Dec, End);
}
}

FunctionPass *llvm::createARMLowOverheadLoopsPass() {
return new ARMLowOverheadLoops();
}

0 comments on commit a6fd919

Please sign in to comment.