[AArch64][GlobalISel] Move dup optimization into post-legalizer combiner
Since all of the other G_SHUFFLE_VECTOR transforms are going there, let's do
this with dup as well. This is nice because it lets us split the original
code up into matching, register bank selection, and instruction selection.

- Create G_DUP, make it equivalent to AArch64dup

- Add a post-legalizer combine which is 90% a copy-and-paste from
  tryOptVectorDup, except with shuffle-mask matching closer to what SelectionDAG
  does in `ShuffleVectorSDNode::isSplatMask` (see the sketch after this list).

- Teach RegBankSelect about G_DUP. This is necessary because choosing between
  the FPR and GPR dup variants at selection time relies on the register bank of
  the scalar operand.

- Kill `tryOptVectorDup`, since its job is now handled entirely by the G_DUP
  combine.

- Add testcases for the combine, RegBankSelect, and selection. The selection
  test produces the same results as the old test.
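
For reference, the mask rule borrowed from `ShuffleVectorSDNode::isSplatMask` is: every defined (non-negative) mask element must equal the first defined one, and an all-undef mask counts as a splat of lane 0. Below is a minimal standalone sketch of that check in plain C++17 with no LLVM types; the name `findSplatIndex` and the example masks are made up for this note, and the combine itself additionally requires the splat lane to be 0.

#include <algorithm>
#include <cassert>
#include <iterator>
#include <optional>
#include <vector>

// Return the splat lane of a shuffle mask, or nullopt if the mask is not a
// splat. Undef elements are encoded as -1, matching LLVM's convention.
static std::optional<int> findSplatIndex(const std::vector<int> &Mask) {
  auto FirstDefined = std::find_if(Mask.begin(), Mask.end(),
                                   [](int Elt) { return Elt >= 0; });
  // An all-undef mask is treated as a splat of lane 0 so callers can simplify.
  if (FirstDefined == Mask.end())
    return 0;
  int Splat = *FirstDefined;
  // Every remaining defined element must match the first defined one.
  if (std::any_of(std::next(FirstDefined), Mask.end(),
                  [Splat](int Elt) { return Elt >= 0 && Elt != Splat; }))
    return std::nullopt;
  return Splat;
}

int main() {
  assert(findSplatIndex({0, 0, 0, 0}) == 0);   // zero-lane splat
  assert(findSplatIndex({-1, 2, -1, 2}) == 2); // undef lanes are ignored
  assert(!findSplatIndex({0, 1, 0, 1}));       // not a splat
  return 0;
}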

Differential Revision: https://reviews.llvm.org/D81221
Jessica Paquette committed Jun 6, 2020
1 parent 7d59f49 commit 8f262a6
Showing 8 changed files with 565 additions and 185 deletions.
9 changes: 8 additions & 1 deletion llvm/lib/Target/AArch64/AArch64Combine.td
@@ -49,9 +49,16 @@ def uzp : GICombineRule<
  (apply [{ applyShuffleVectorPseudo(*${root}, ${matchinfo}); }])
>;

def dup: GICombineRule <
  (defs root:$root, shuffle_matchdata:$matchinfo),
  (match (wip_match_opcode G_SHUFFLE_VECTOR):$root,
         [{ return matchDup(*${root}, MRI, ${matchinfo}); }]),
  (apply [{ applyShuffleVectorPseudo(*${root}, ${matchinfo}); }])
>;

// Combines which replace a G_SHUFFLE_VECTOR with a target-specific pseudo
// instruction.
def shuffle_vector_pseudos : GICombineGroup<[rev, zip, uzp]>;
def shuffle_vector_pseudos : GICombineGroup<[dup, rev, zip, uzp]>;

def AArch64PostLegalizerCombinerHelper
    : GICombinerHelper<"AArch64GenPostLegalizerCombinerHelper",
8 changes: 8 additions & 0 deletions llvm/lib/Target/AArch64/AArch64InstrGISel.td
@@ -74,10 +74,18 @@ def G_ZIP2 : AArch64GenericInstruction {
  let InOperandList = (ins type0:$v1, type0:$v2);
}

// Represents a dup instruction. Produced post-legalization from
// G_SHUFFLE_VECTORs with appropriate masks.
def G_DUP: AArch64GenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type1:$lane);
}

def : GINodeEquiv<G_REV16, AArch64rev16>;
def : GINodeEquiv<G_REV32, AArch64rev32>;
def : GINodeEquiv<G_REV64, AArch64rev64>;
def : GINodeEquiv<G_UZP1, AArch64uzp1>;
def : GINodeEquiv<G_UZP2, AArch64uzp2>;
def : GINodeEquiv<G_ZIP1, AArch64zip1>;
def : GINodeEquiv<G_ZIP2, AArch64zip2>;
def : GINodeEquiv<G_DUP, AArch64dup>;
11 changes: 11 additions & 0 deletions llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
@@ -501,6 +501,7 @@ bool AArch64RegisterBankInfo::onlyDefinesFP(
    const MachineInstr &MI, const MachineRegisterInfo &MRI,
    const TargetRegisterInfo &TRI) const {
  switch (MI.getOpcode()) {
  case AArch64::G_DUP:
  case TargetOpcode::G_SITOFP:
  case TargetOpcode::G_UITOFP:
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
@@ -642,6 +643,16 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
  // Some of the floating-point instructions have mixed GPR and FPR operands:
  // fine-tune the computed mapping.
  switch (Opc) {
  case AArch64::G_DUP: {
    Register ScalarReg = MI.getOperand(1).getReg();
    auto ScalarDef = MRI.getVRegDef(ScalarReg);
    if (getRegBank(ScalarReg, MRI, TRI) == &AArch64::FPRRegBank ||
        onlyDefinesFP(*ScalarDef, MRI, TRI))
      OpRegBankIdx = {PMI_FirstFPR, PMI_FirstFPR};
    else
      OpRegBankIdx = {PMI_FirstFPR, PMI_FirstGPR};
    break;
  }
  case TargetOpcode::G_TRUNC: {
    LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
    if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 128)
113 changes: 0 additions & 113 deletions llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -306,8 +306,6 @@ class AArch64InstructionSelector : public InstructionSelector {
                             unsigned OpFlags) const;

  // Optimization methods.
  bool tryOptVectorShuffle(MachineInstr &I) const;
  bool tryOptVectorDup(MachineInstr &MI) const;
  bool tryOptSelect(MachineInstr &MI) const;
  MachineInstr *tryFoldIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
                                      MachineOperand &Predicate,
@@ -4211,119 +4209,8 @@ MachineInstr *AArch64InstructionSelector::tryOptArithShiftedCompare(
  return &*CmpMI;
}

bool AArch64InstructionSelector::tryOptVectorDup(MachineInstr &I) const {
  // Try to match a vector splat operation into a dup instruction.
  // We're looking for this pattern:
  // %scalar:gpr(s64) = COPY $x0
  // %undef:fpr(<2 x s64>) = G_IMPLICIT_DEF
  // %cst0:gpr(s32) = G_CONSTANT i32 0
  // %zerovec:fpr(<2 x s32>) = G_BUILD_VECTOR %cst0(s32), %cst0(s32)
  // %ins:fpr(<2 x s64>) = G_INSERT_VECTOR_ELT %undef, %scalar(s64), %cst0(s32)
  // %splat:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %ins(<2 x s64>), %undef,
  //                         %zerovec(<2 x s32>)
  //
  // ...into:
  // %splat = DUP %scalar
  // We use the regbank of the scalar to determine which kind of dup to use.
  MachineIRBuilder MIB(I);
  MachineRegisterInfo &MRI = *MIB.getMRI();
  const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
  using namespace TargetOpcode;
  using namespace MIPatternMatch;

  // Begin matching the insert.
  auto *InsMI =
      getOpcodeDef(G_INSERT_VECTOR_ELT, I.getOperand(1).getReg(), MRI);
  if (!InsMI)
    return false;
  // Match the undef vector operand.
  auto *UndefMI =
      getOpcodeDef(G_IMPLICIT_DEF, InsMI->getOperand(1).getReg(), MRI);
  if (!UndefMI)
    return false;
  // Match the scalar being splatted.
  Register ScalarReg = InsMI->getOperand(2).getReg();
  const RegisterBank *ScalarRB = RBI.getRegBank(ScalarReg, MRI, TRI);
  // Match the index constant 0.
  int64_t Index = 0;
  if (!mi_match(InsMI->getOperand(3).getReg(), MRI, m_ICst(Index)) || Index)
    return false;

  // The shuffle's second operand doesn't matter if the mask is all zero.
  ArrayRef<int> Mask = I.getOperand(3).getShuffleMask();
  if (!all_of(Mask, [](int Elem) { return Elem == 0; }))
    return false;

  // We're done, now find out what kind of splat we need.
  LLT VecTy = MRI.getType(I.getOperand(0).getReg());
  LLT EltTy = VecTy.getElementType();
  if (EltTy.getSizeInBits() < 32) {
    LLVM_DEBUG(dbgs() << "Could not optimize splat pattern < 32b elts yet");
    return false;
  }
  bool IsFP = ScalarRB->getID() == AArch64::FPRRegBankID;
  unsigned Opc = 0;
  if (IsFP) {
    switch (EltTy.getSizeInBits()) {
    case 32:
      if (VecTy.getNumElements() == 2) {
        Opc = AArch64::DUPv2i32lane;
      } else {
        Opc = AArch64::DUPv4i32lane;
        assert(VecTy.getNumElements() == 4);
      }
      break;
    case 64:
      assert(VecTy.getNumElements() == 2 && "Unexpected num elts");
      Opc = AArch64::DUPv2i64lane;
      break;
    }
  } else {
    switch (EltTy.getSizeInBits()) {
    case 32:
      if (VecTy.getNumElements() == 2) {
        Opc = AArch64::DUPv2i32gpr;
      } else {
        Opc = AArch64::DUPv4i32gpr;
        assert(VecTy.getNumElements() == 4);
      }
      break;
    case 64:
      assert(VecTy.getNumElements() == 2 && "Unexpected num elts");
      Opc = AArch64::DUPv2i64gpr;
      break;
    }
  }
  assert(Opc && "Did not compute an opcode for a dup");

  // For FP splats, we need to widen the scalar reg via undef too.
  if (IsFP) {
    MachineInstr *Widen = emitScalarToVector(
        EltTy.getSizeInBits(), &AArch64::FPR128RegClass, ScalarReg, MIB);
    if (!Widen)
      return false;
    ScalarReg = Widen->getOperand(0).getReg();
  }
  auto Dup = MIB.buildInstr(Opc, {I.getOperand(0).getReg()}, {ScalarReg});
  if (IsFP)
    Dup.addImm(0);
  constrainSelectedInstRegOperands(*Dup, TII, TRI, RBI);
  I.eraseFromParent();
  return true;
}

bool AArch64InstructionSelector::tryOptVectorShuffle(MachineInstr &I) const {
  if (TM.getOptLevel() == CodeGenOpt::None)
    return false;
  if (tryOptVectorDup(I))
    return true;
  return false;
}

bool AArch64InstructionSelector::selectShuffleVector(
    MachineInstr &I, MachineRegisterInfo &MRI) const {
  if (tryOptVectorShuffle(I))
    return true;
  const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
  Register Src1Reg = I.getOperand(1).getReg();
  const LLT Src1Ty = MRI.getType(Src1Reg);
72 changes: 72 additions & 0 deletions llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
@@ -19,6 +19,7 @@
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/TargetPassConfig.h"
@@ -27,6 +28,7 @@
#define DEBUG_TYPE "aarch64-postlegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;

/// Represents a pseudo instruction which replaces a G_SHUFFLE_VECTOR.
///
@@ -41,6 +43,29 @@ struct ShuffleVectorPseudo {
  ShuffleVectorPseudo() {}
};

/// \returns The splat index of a G_SHUFFLE_VECTOR \p MI when \p MI is a splat.
/// If \p MI is not a splat, returns None.
static Optional<int> getSplatIndex(MachineInstr &MI) {
  assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR &&
         "Only G_SHUFFLE_VECTOR can have a splat index!");
  ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
  auto FirstDefinedIdx = find_if(Mask, [](int Elt) { return Elt >= 0; });

  // If all elements are undefined, this shuffle can be considered a splat.
  // Return 0 for better potential for callers to simplify.
  if (FirstDefinedIdx == Mask.end())
    return 0;

  // Make sure all remaining elements are either undef or the same
  // as the first non-undef value.
  int SplatValue = *FirstDefinedIdx;
  if (any_of(make_range(std::next(FirstDefinedIdx), Mask.end()),
             [&SplatValue](int Elt) { return Elt >= 0 && Elt != SplatValue; }))
    return None;

  return SplatValue;
}

/// Check if a vector shuffle corresponds to a REV instruction with the
/// specified blocksize.
static bool isREVMask(ArrayRef<int> M, unsigned EltSize, unsigned NumElts,
@@ -170,6 +195,53 @@ static bool matchZip(MachineInstr &MI, MachineRegisterInfo &MRI,
  return true;
}

static bool matchDup(MachineInstr &MI, MachineRegisterInfo &MRI,
                     ShuffleVectorPseudo &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
  auto Lane = getSplatIndex(MI);
  if (!Lane || *Lane != 0)
    return false;

  // Try to match a vector splat operation into a dup instruction.
  // We're looking for this pattern:
  //
  // %scalar:gpr(s64) = COPY $x0
  // %undef:fpr(<2 x s64>) = G_IMPLICIT_DEF
  // %cst0:gpr(s32) = G_CONSTANT i32 0
  // %zerovec:fpr(<2 x s32>) = G_BUILD_VECTOR %cst0(s32), %cst0(s32)
  // %ins:fpr(<2 x s64>) = G_INSERT_VECTOR_ELT %undef, %scalar(s64), %cst0(s32)
  // %splat:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %ins(<2 x s64>), %undef, %zerovec(<2 x s32>)
  //
  // ...into:
  // %splat = G_DUP %scalar

  // Begin matching the insert.
  auto *InsMI = getOpcodeDef(TargetOpcode::G_INSERT_VECTOR_ELT,
                             MI.getOperand(1).getReg(), MRI);
  if (!InsMI)
    return false;

  // Match the undef vector operand.
  if (!getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF,
                    InsMI->getOperand(1).getReg(), MRI))
    return false;

  // Match the index constant 0.
  int64_t Index = 0;
  if (!mi_match(InsMI->getOperand(3).getReg(), MRI, m_ICst(Index)) || Index)
    return false;

  Register Dst = MI.getOperand(0).getReg();
  if (MRI.getType(Dst).getScalarSizeInBits() < 32) {
    LLVM_DEBUG(dbgs() << "Could not optimize splat pattern < 32b elts yet");
    return false;
  }

  MatchInfo =
      ShuffleVectorPseudo(AArch64::G_DUP, Dst, {InsMI->getOperand(2).getReg()});
  return true;
}

/// Replace a G_SHUFFLE_VECTOR instruction with a pseudo.
/// \p Opc is the opcode to use. \p MI is the G_SHUFFLE_VECTOR.
static bool applyShuffleVectorPseudo(MachineInstr &MI,
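
The body of `applyShuffleVectorPseudo` is collapsed in this view. Below is a rough sketch of what the apply step amounts to, assuming the `ShuffleVectorPseudo` fields mirror the constructor arguments used in `matchDup` (Opc, Dst, SrcOps); this is an illustration, not code copied from the hunk above.

static bool applyShuffleVectorPseudo(MachineInstr &MI,
                                     ShuffleVectorPseudo &MatchInfo) {
  // Build the target pseudo recorded by the matcher, reusing the shuffle's
  // destination register, then drop the original G_SHUFFLE_VECTOR.
  MachineIRBuilder MIRBuilder(MI);
  MIRBuilder.buildInstr(MatchInfo.Opc, {MatchInfo.Dst}, MatchInfo.SrcOps);
  MI.eraseFromParent();
  return true;
}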
