Skip to content

Commit

Permalink
[ARM][LowOverheadLoops] Add checks for narrowing
Browse files Browse the repository at this point in the history
Modify ValidateLiveOuts to track 'FalseLaneZeros' more precisely,
including checks on specific operations that can generate non-zeros
from zero values, e.g VMVN. We can then check that any instructions
that retain some information in their output register (all narrowing
instructions) that they only use and def registers that always have
zeros in their falsely predicated bytes, whether or not tail
predication happens.

Most of the logic remains the same, just the names of the data
structures and helpers have been renamed to reflect the change in
logic. The key change, apart from the opcode checkers, is that the
FalseZeros set now strictly contains only instructions which will
always generate zeros, and not instructions that could also have
their false bytes masked away later.

Differential Revision: https://reviews.llvm.org/D76235
  • Loading branch information
sparker-arm committed Mar 24, 2020
1 parent 6f86e6b commit 94caceb
Showing 1 changed file with 77 additions and 34 deletions.
111 changes: 77 additions & 34 deletions llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
Expand Up @@ -61,6 +61,8 @@ using namespace llvm;

namespace {

using InstSet = SmallPtrSetImpl<MachineInstr *>;

class PostOrderLoopTraversal {
MachineLoop &ML;
MachineLoopInfo &MLI;
Expand Down Expand Up @@ -518,6 +520,59 @@ static bool isRegInClass(const MachineOperand &MO,
return MO.isReg() && MO.getReg() && Class->contains(MO.getReg());
}

// Can this instruction generate a non-zero result when given only zeroed
// operands? This allows us to know that, given operands with false bytes
// zeroed by masked loads, that the result will also contain zeros in those
// bytes.
static bool canGenerateNonZeros(const MachineInstr &MI) {
switch (MI.getOpcode()) {
default:
break;
// FIXME: FP minus 0?
//case ARM::MVE_VNEGf16:
//case ARM::MVE_VNEGf32:
case ARM::MVE_VMVN:
case ARM::MVE_VORN:
case ARM::MVE_VCLZs8:
case ARM::MVE_VCLZs16:
case ARM::MVE_VCLZs32:
return true;
}
return false;
}

// MVE 'narrowing' operate on half a lane, reading from half and writing
// to half, which are referred to has the top and bottom half. The other
// half retains its previous value.
static bool retainsPreviousHalfElement(const MachineInstr &MI) {
const MCInstrDesc &MCID = MI.getDesc();
uint64_t Flags = MCID.TSFlags;
return (Flags & ARMII::RetainsPreviousHalfElement) != 0;
}

// Look at its register uses to see if it only can only receive zeros
// into its false lanes which would then produce zeros. Also check that
// the output register is also defined by an FalseLaneZeros instruction
// so that if tail-predication happens, the lanes that aren't updated will
// still be zeros.
static bool producesFalseLaneZeros(MachineInstr &MI,
const TargetRegisterClass *QPRs,
const ReachingDefAnalysis &RDA,
InstSet &FalseLaneZeros) {
if (canGenerateNonZeros(MI))
return false;
for (auto &MO : MI.operands()) {
if (!MO.isReg() || !MO.getReg())
continue;
if (auto *OpDef = RDA.getMIOperand(&MI, MO))
if (FalseLaneZeros.count(OpDef))
continue;
return false;
}
LLVM_DEBUG(dbgs() << "ARM Loops: Always False Zeros: " << MI);
return true;
}

bool LowOverheadLoop::ValidateLiveOuts() const {
// We want to find out if the tail-predicated version of this loop will
// produce the same values as the loop in its original form. For this to
Expand All @@ -538,76 +593,64 @@ bool LowOverheadLoop::ValidateLiveOuts() const {
// operands, or stored results are equivalent already. Other explicitly
// predicated instructions will perform the same operation in the original
// loop and the tail-predicated form too. Because of this, we can insert
// loads, stores and other predicated instructions into our KnownFalseZeros
// loads, stores and other predicated instructions into our Predicated
// set and build from there.
const TargetRegisterClass *QPRs = TRI.getRegClass(ARM::MQPRRegClassID);
SetVector<MachineInstr *> UnknownFalseLanes;
SmallPtrSet<MachineInstr *, 4> KnownFalseZeros;
SetVector<MachineInstr *> Unknown;
SmallPtrSet<MachineInstr *, 4> FalseLaneZeros;
SmallPtrSet<MachineInstr *, 4> Predicated;
MachineBasicBlock *MBB = ML.getHeader();

for (auto &MI : *MBB) {
const MCInstrDesc &MCID = MI.getDesc();
uint64_t Flags = MCID.TSFlags;
if ((Flags & ARMII::DomainMask) != ARMII::DomainMVE)
continue;

if (isVectorPredicated(&MI)) {
KnownFalseZeros.insert(&MI);
if (MI.mayLoad())
FalseLaneZeros.insert(&MI);
Predicated.insert(&MI);
continue;
}

if (MI.getNumDefs() == 0)
continue;

// Only evaluate instructions which produce a single value.
assert((MI.getNumDefs() == 1 && MI.defs().begin()->isReg()) &&
"Expected no more than one register def");

Register DefReg = MI.defs().begin()->getReg();
for (auto &MO : MI.operands()) {
if (!isRegInClass(MO, QPRs) || !MO.isUse() || MO.getReg() != DefReg)
continue;

// If this instruction overwrites one of its operands, and that register
// has known lanes, then this instruction also has known predicated false
// lanes.
if (auto *OpDef = RDA.getMIOperand(&MI, MO)) {
if (KnownFalseZeros.count(OpDef)) {
KnownFalseZeros.insert(&MI);
break;
}
}
}
if (!KnownFalseZeros.count(&MI))
UnknownFalseLanes.insert(&MI);
if (producesFalseLaneZeros(MI, QPRs, RDA, FalseLaneZeros))
FalseLaneZeros.insert(&MI);
else if (retainsPreviousHalfElement(MI))
return false;
else
Unknown.insert(&MI);
}

auto HasKnownUsers = [this](MachineInstr *MI, const MachineOperand &MO,
SmallPtrSetImpl<MachineInstr *> &Knowns) {
auto HasPredicatedUsers = [this](MachineInstr *MI, const MachineOperand &MO,
SmallPtrSetImpl<MachineInstr *> &Predicated) {
SmallPtrSet<MachineInstr *, 2> Uses;
RDA.getGlobalUses(MI, MO.getReg(), Uses);
for (auto *Use : Uses) {
if (Use != MI && !Knowns.count(Use))
if (Use != MI && !Predicated.count(Use))
return false;
}
return true;
};

// Now for all the unknown values, see if they're only consumed by known
// instructions. Visit in reverse so that we can start at the values being
// Visit the unknowns in reverse so that we can start at the values being
// stored and then we can work towards the leaves, hopefully adding more
// instructions to KnownFalseZeros.
for (auto *MI : reverse(UnknownFalseLanes)) {
// instructions to Predicated.
for (auto *MI : reverse(Unknown)) {
for (auto &MO : MI->operands()) {
if (!isRegInClass(MO, QPRs) || !MO.isDef())
continue;
if (!HasKnownUsers(MI, MO, KnownFalseZeros)) {
if (!HasPredicatedUsers(MI, MO, Predicated)) {
LLVM_DEBUG(dbgs() << "ARM Loops: Found an unknown def of : "
<< TRI.getRegAsmName(MO.getReg()) << " at " << *MI);
return false;
}
}
// Any unknown false lanes have been masked away by the user(s).
KnownFalseZeros.insert(MI);
Predicated.insert(MI);
}

// Collect Q-regs that are live in the exit blocks. We don't collect scalars
Expand Down

0 comments on commit 94caceb

Please sign in to comment.