[AArch64][GlobalISel] Move dup optimization into post-legalizer combiner
Since all of the other G_SHUFFLE_VECTOR transforms are going there, let's do
this with dup as well. This is nice because it lets us split the original
code up into matching, register bank selection, and instruction selection.

- Create G_DUP, make it equivalent to AArch64dup

- Add a post-legalizer combine which is 90% a copy-and-paste from
  tryOptVectorDup, except with shuffle-mask matching closer to what SelectionDAG
  does in `ShuffleVectorSDNode::isSplatMask` (see the sketch after this list).

- Teach RegBankSelect about G_DUP. This is necessary because choosing between
  the FPR and GPR dup variants at selection time relies on the register bank of
  the scalar operand.

- Kill `tryOptVectorDup`, since its job is now handled entirely by the G_DUP
  combine.

- Add testcases for the combine, RegBankSelect, and selection. The selection
  test produces the same results as the old test.
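
For reference, the mask rule borrowed from `ShuffleVectorSDNode::isSplatMask` is: every defined (non-negative) mask element must equal the first defined one, and an all-undef mask counts as a splat of lane 0. Below is a minimal standalone sketch of that check in plain C++17 with no LLVM types; the name `findSplatIndex` and the example masks are made up for this note, and the combine itself additionally requires the splat lane to be 0.

#include <algorithm>
#include <cassert>
#include <iterator>
#include <optional>
#include <vector>

// Return the splat lane of a shuffle mask, or nullopt if the mask is not a
// splat. Undef elements are encoded as -1, matching LLVM's convention.
static std::optional<int> findSplatIndex(const std::vector<int> &Mask) {
  auto FirstDefined = std::find_if(Mask.begin(), Mask.end(),
                                   [](int Elt) { return Elt >= 0; });
  // An all-undef mask is treated as a splat of lane 0 so callers can simplify.
  if (FirstDefined == Mask.end())
    return 0;
  int Splat = *FirstDefined;
  // Every remaining defined element must match the first defined one.
  if (std::any_of(std::next(FirstDefined), Mask.end(),
                  [Splat](int Elt) { return Elt >= 0 && Elt != Splat; }))
    return std::nullopt;
  return Splat;
}

int main() {
  assert(findSplatIndex({0, 0, 0, 0}) == 0);   // zero-lane splat
  assert(findSplatIndex({-1, 2, -1, 2}) == 2); // undef lanes are ignored
  assert(!findSplatIndex({0, 1, 0, 1}));       // not a splat
  return 0;
}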

Differential Revision: https://reviews.llvm.org/D81221
Jessica Paquette committed Jun 6, 2020
1 parent 7d59f49 commit 8f262a6
Showing 8 changed files with 565 additions and 185 deletions.
9 changes: 8 additions & 1 deletion llvm/lib/Target/AArch64/AArch64Combine.td
@@ -49,9 +49,16 @@ def uzp : GICombineRule<
  (apply [{ applyShuffleVectorPseudo(*${root}, ${matchinfo}); }])
>;

def dup: GICombineRule <
  (defs root:$root, shuffle_matchdata:$matchinfo),
  (match (wip_match_opcode G_SHUFFLE_VECTOR):$root,
         [{ return matchDup(*${root}, MRI, ${matchinfo}); }]),
  (apply [{ applyShuffleVectorPseudo(*${root}, ${matchinfo}); }])
>;

// Combines which replace a G_SHUFFLE_VECTOR with a target-specific pseudo
// instruction.
def shuffle_vector_pseudos : GICombineGroup<[rev, zip, uzp]>;
def shuffle_vector_pseudos : GICombineGroup<[dup, rev, zip, uzp]>;

def AArch64PostLegalizerCombinerHelper
    : GICombinerHelper<"AArch64GenPostLegalizerCombinerHelper",
8 changes: 8 additions & 0 deletions llvm/lib/Target/AArch64/AArch64InstrGISel.td
@@ -74,10 +74,18 @@ def G_ZIP2 : AArch64GenericInstruction {
  let InOperandList = (ins type0:$v1, type0:$v2);
}

// Represents a dup instruction. Produced post-legalization from
// G_SHUFFLE_VECTORs with appropriate masks.
def G_DUP: AArch64GenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type1:$lane);
}

def : GINodeEquiv<G_REV16, AArch64rev16>;
def : GINodeEquiv<G_REV32, AArch64rev32>;
def : GINodeEquiv<G_REV64, AArch64rev64>;
def : GINodeEquiv<G_UZP1, AArch64uzp1>;
def : GINodeEquiv<G_UZP2, AArch64uzp2>;
def : GINodeEquiv<G_ZIP1, AArch64zip1>;
def : GINodeEquiv<G_ZIP2, AArch64zip2>;
def : GINodeEquiv<G_DUP, AArch64dup>;
11 changes: 11 additions & 0 deletions llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
@@ -501,6 +501,7 @@ bool AArch64RegisterBankInfo::onlyDefinesFP(
    const MachineInstr &MI, const MachineRegisterInfo &MRI,
    const TargetRegisterInfo &TRI) const {
  switch (MI.getOpcode()) {
  case AArch64::G_DUP:
  case TargetOpcode::G_SITOFP:
  case TargetOpcode::G_UITOFP:
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
@@ -642,6 +643,16 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
  // Some of the floating-point instructions have mixed GPR and FPR operands:
  // fine-tune the computed mapping.
  switch (Opc) {
  case AArch64::G_DUP: {
    Register ScalarReg = MI.getOperand(1).getReg();
    auto ScalarDef = MRI.getVRegDef(ScalarReg);
    if (getRegBank(ScalarReg, MRI, TRI) == &AArch64::FPRRegBank ||
        onlyDefinesFP(*ScalarDef, MRI, TRI))
      OpRegBankIdx = {PMI_FirstFPR, PMI_FirstFPR};
    else
      OpRegBankIdx = {PMI_FirstFPR, PMI_FirstGPR};
    break;
  }
  case TargetOpcode::G_TRUNC: {
    LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
    if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 128)
113 changes: 0 additions & 113 deletions llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -306,8 +306,6 @@ class AArch64InstructionSelector : public InstructionSelector {
                             unsigned OpFlags) const;

  // Optimization methods.
  bool tryOptVectorShuffle(MachineInstr &I) const;
  bool tryOptVectorDup(MachineInstr &MI) const;
  bool tryOptSelect(MachineInstr &MI) const;
  MachineInstr *tryFoldIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
                                      MachineOperand &Predicate,
@@ -4211,119 +4209,8 @@ MachineInstr *AArch64InstructionSelector::tryOptArithShiftedCompare(
  return &*CmpMI;
}

bool AArch64InstructionSelector::tryOptVectorDup(MachineInstr &I) const {
  // Try to match a vector splat operation into a dup instruction.
  // We're looking for this pattern:
  // %scalar:gpr(s64) = COPY $x0
  // %undef:fpr(<2 x s64>) = G_IMPLICIT_DEF
  // %cst0:gpr(s32) = G_CONSTANT i32 0
  // %zerovec:fpr(<2 x s32>) = G_BUILD_VECTOR %cst0(s32), %cst0(s32)
  // %ins:fpr(<2 x s64>) = G_INSERT_VECTOR_ELT %undef, %scalar(s64), %cst0(s32)
  // %splat:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %ins(<2 x s64>), %undef,
  //                         %zerovec(<2 x s32>)
  //
  // ...into:
  // %splat = DUP %scalar
  // We use the regbank of the scalar to determine which kind of dup to use.
  MachineIRBuilder MIB(I);
  MachineRegisterInfo &MRI = *MIB.getMRI();
  const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
  using namespace TargetOpcode;
  using namespace MIPatternMatch;

  // Begin matching the insert.
  auto *InsMI =
      getOpcodeDef(G_INSERT_VECTOR_ELT, I.getOperand(1).getReg(), MRI);
  if (!InsMI)
    return false;
  // Match the undef vector operand.
  auto *UndefMI =
      getOpcodeDef(G_IMPLICIT_DEF, InsMI->getOperand(1).getReg(), MRI);
  if (!UndefMI)
    return false;
  // Match the scalar being splatted.
  Register ScalarReg = InsMI->getOperand(2).getReg();
  const RegisterBank *ScalarRB = RBI.getRegBank(ScalarReg, MRI, TRI);
  // Match the index constant 0.
  int64_t Index = 0;
  if (!mi_match(InsMI->getOperand(3).getReg(), MRI, m_ICst(Index)) || Index)
    return false;

  // The shuffle's second operand doesn't matter if the mask is all zero.
  ArrayRef<int> Mask = I.getOperand(3).getShuffleMask();
  if (!all_of(Mask, [](int Elem) { return Elem == 0; }))
    return false;

  // We're done, now find out what kind of splat we need.
  LLT VecTy = MRI.getType(I.getOperand(0).getReg());
  LLT EltTy = VecTy.getElementType();
  if (EltTy.getSizeInBits() < 32) {
    LLVM_DEBUG(dbgs() << "Could not optimize splat pattern < 32b elts yet");
    return false;
  }
  bool IsFP = ScalarRB->getID() == AArch64::FPRRegBankID;
  unsigned Opc = 0;
  if (IsFP) {
    switch (EltTy.getSizeInBits()) {
    case 32:
      if (VecTy.getNumElements() == 2) {
        Opc = AArch64::DUPv2i32lane;
      } else {
        Opc = AArch64::DUPv4i32lane;
        assert(VecTy.getNumElements() == 4);
      }
      break;
    case 64:
      assert(VecTy.getNumElements() == 2 && "Unexpected num elts");
      Opc = AArch64::DUPv2i64lane;
      break;
    }
  } else {
    switch (EltTy.getSizeInBits()) {
    case 32:
      if (VecTy.getNumElements() == 2) {
        Opc = AArch64::DUPv2i32gpr;
      } else {
        Opc = AArch64::DUPv4i32gpr;
        assert(VecTy.getNumElements() == 4);
      }
      break;
    case 64:
      assert(VecTy.getNumElements() == 2 && "Unexpected num elts");
      Opc = AArch64::DUPv2i64gpr;
      break;
    }
  }
  assert(Opc && "Did not compute an opcode for a dup");

  // For FP splats, we need to widen the scalar reg via undef too.
  if (IsFP) {
    MachineInstr *Widen = emitScalarToVector(
        EltTy.getSizeInBits(), &AArch64::FPR128RegClass, ScalarReg, MIB);
    if (!Widen)
      return false;
    ScalarReg = Widen->getOperand(0).getReg();
  }
  auto Dup = MIB.buildInstr(Opc, {I.getOperand(0).getReg()}, {ScalarReg});
  if (IsFP)
    Dup.addImm(0);
  constrainSelectedInstRegOperands(*Dup, TII, TRI, RBI);
  I.eraseFromParent();
  return true;
}

bool AArch64InstructionSelector::tryOptVectorShuffle(MachineInstr &I) const {
  if (TM.getOptLevel() == CodeGenOpt::None)
    return false;
  if (tryOptVectorDup(I))
    return true;
  return false;
}

bool AArch64InstructionSelector::selectShuffleVector(
    MachineInstr &I, MachineRegisterInfo &MRI) const {
  if (tryOptVectorShuffle(I))
    return true;
  const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
  Register Src1Reg = I.getOperand(1).getReg();
  const LLT Src1Ty = MRI.getType(Src1Reg);
72 changes: 72 additions & 0 deletions llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
@@ -19,6 +19,7 @@
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/TargetPassConfig.h"
@@ -27,6 +28,7 @@
#define DEBUG_TYPE "aarch64-postlegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;

/// Represents a pseudo instruction which replaces a G_SHUFFLE_VECTOR.
///
@@ -41,6 +43,29 @@ struct ShuffleVectorPseudo {
  ShuffleVectorPseudo() {}
};

/// \returns The splat index of a G_SHUFFLE_VECTOR \p MI when \p MI is a splat.
/// If \p MI is not a splat, returns None.
static Optional<int> getSplatIndex(MachineInstr &MI) {
  assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR &&
         "Only G_SHUFFLE_VECTOR can have a splat index!");
  ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
  auto FirstDefinedIdx = find_if(Mask, [](int Elt) { return Elt >= 0; });

  // If all elements are undefined, this shuffle can be considered a splat.
  // Return 0 for better potential for callers to simplify.
  if (FirstDefinedIdx == Mask.end())
    return 0;

  // Make sure all remaining elements are either undef or the same
  // as the first non-undef value.
  int SplatValue = *FirstDefinedIdx;
  if (any_of(make_range(std::next(FirstDefinedIdx), Mask.end()),
             [&SplatValue](int Elt) { return Elt >= 0 && Elt != SplatValue; }))
    return None;

  return SplatValue;
}

/// Check if a vector shuffle corresponds to a REV instruction with the
/// specified blocksize.
static bool isREVMask(ArrayRef<int> M, unsigned EltSize, unsigned NumElts,
@@ -170,6 +195,53 @@ static bool matchZip(MachineInstr &MI, MachineRegisterInfo &MRI,
  return true;
}

static bool matchDup(MachineInstr &MI, MachineRegisterInfo &MRI,
                     ShuffleVectorPseudo &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
  auto Lane = getSplatIndex(MI);
  if (!Lane || *Lane != 0)
    return false;

  // Try to match a vector splat operation into a dup instruction.
  // We're looking for this pattern:
  //
  // %scalar:gpr(s64) = COPY $x0
  // %undef:fpr(<2 x s64>) = G_IMPLICIT_DEF
  // %cst0:gpr(s32) = G_CONSTANT i32 0
  // %zerovec:fpr(<2 x s32>) = G_BUILD_VECTOR %cst0(s32), %cst0(s32)
  // %ins:fpr(<2 x s64>) = G_INSERT_VECTOR_ELT %undef, %scalar(s64), %cst0(s32)
  // %splat:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %ins(<2 x s64>), %undef, %zerovec(<2 x s32>)
  //
  // ...into:
  // %splat = G_DUP %scalar

  // Begin matching the insert.
  auto *InsMI = getOpcodeDef(TargetOpcode::G_INSERT_VECTOR_ELT,
                             MI.getOperand(1).getReg(), MRI);
  if (!InsMI)
    return false;

  // Match the undef vector operand.
  if (!getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF,
                    InsMI->getOperand(1).getReg(), MRI))
    return false;

  // Match the index constant 0.
  int64_t Index = 0;
  if (!mi_match(InsMI->getOperand(3).getReg(), MRI, m_ICst(Index)) || Index)
    return false;

  Register Dst = MI.getOperand(0).getReg();
  if (MRI.getType(Dst).getScalarSizeInBits() < 32) {
    LLVM_DEBUG(dbgs() << "Could not optimize splat pattern < 32b elts yet");
    return false;
  }

  MatchInfo =
      ShuffleVectorPseudo(AArch64::G_DUP, Dst, {InsMI->getOperand(2).getReg()});
  return true;
}

/// Replace a G_SHUFFLE_VECTOR instruction with a pseudo.
/// \p Opc is the opcode to use. \p MI is the G_SHUFFLE_VECTOR.
static bool applyShuffleVectorPseudo(MachineInstr &MI,
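
The body of `applyShuffleVectorPseudo` is collapsed in this view. Below is a rough sketch of what the apply step amounts to, assuming the `ShuffleVectorPseudo` fields mirror the constructor arguments used in `matchDup` (Opc, Dst, SrcOps); this is an illustration, not code copied from the hunk above.

static bool applyShuffleVectorPseudo(MachineInstr &MI,
                                     ShuffleVectorPseudo &MatchInfo) {
  // Build the target pseudo recorded by the matcher, reusing the shuffle's
  // destination register, then drop the original G_SHUFFLE_VECTOR.
  MachineIRBuilder MIRBuilder(MI);
  MIRBuilder.buildInstr(MatchInfo.Opc, {MatchInfo.Dst}, MatchInfo.SrcOps);
  MI.eraseFromParent();
  return true;
}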
