Skip to content

Commit

Permalink
AMDGPU/GlobalISel: Add support for wide loads >= 256-bits
Browse files Browse the repository at this point in the history
Summary:
This adds support for the most commonly used wide load types:
<8xi32>, <16xi32>, <4xi64>, and <8xi64>

Reviewers: arsenm

Reviewed By: arsenm

Subscribers: hiraditya, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, kristof.beyls, dstuttard, tpr, t-tye, volkan, Petar.Avramovic, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D57399

llvm-svn: 365586
  • Loading branch information
tstellar committed Jul 10, 2019
1 parent 693936a commit d0ba79f
Show file tree
Hide file tree
Showing 7 changed files with 767 additions and 37 deletions.
72 changes: 72 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
Expand Up @@ -161,5 +161,77 @@ const RegisterBankInfo::ValueMapping *getValueMappingSGPR64Only(unsigned BankID,
return &ValMappingsSGPR64OnlyVGPR32[2];
}

// Partial mappings used to break a wide (>= 256-bit) load into pieces when
// its result must be assigned to the VGPR bank.  Entries are grouped by load
// shape; ValMappingsLoadSGPROnly below indexes the first entry of each group:
//   [0]  unsplit 256-bit SGPR load
//   [1]  unsplit 512-bit SGPR load
//   [2]  8 x 32-bit pieces   (<8 x s32>)
//   [10] 16 x 32-bit pieces  (<16 x s32>)
//   [26] 4 x 64-bit pieces   (<4 x s64>)
//   [30] 8 x 64-bit pieces   (<8 x s64>)
//   [38] 2 x 128-bit pieces  (currently unused, see FIXME below)
//   [40] 4 x 128-bit pieces  (currently unused, see FIXME below)
const RegisterBankInfo::PartialMapping LoadSGPROnlyBreakDown[] {
  /* 256-bit load */   {0, 256, SGPRRegBank},
  /* 512-bit load */   {0, 512, SGPRRegBank},
  /* 8 32-bit loads */ {0, 32, VGPRRegBank}, {32, 32, VGPRRegBank},
                       {64, 32, VGPRRegBank}, {96, 32, VGPRRegBank},
                       {128, 32, VGPRRegBank}, {160, 32, VGPRRegBank},
                       {192, 32, VGPRRegBank}, {224, 32, VGPRRegBank},
  /* 16 32-bit loads */ {0, 32, VGPRRegBank}, {32, 32, VGPRRegBank},
                        {64, 32, VGPRRegBank}, {96, 32, VGPRRegBank},
                        {128, 32, VGPRRegBank}, {160, 32, VGPRRegBank},
                        {192, 32, VGPRRegBank}, {224, 32, VGPRRegBank},
                        {256, 32, VGPRRegBank}, {288, 32, VGPRRegBank},
                        {320, 32, VGPRRegBank}, {352, 32, VGPRRegBank},
                        {384, 32, VGPRRegBank}, {416, 32, VGPRRegBank},
                        {448, 32, VGPRRegBank}, {480, 32, VGPRRegBank},
  /* 4 64-bit loads */ {0, 64, VGPRRegBank}, {64, 64, VGPRRegBank},
                       {128, 64, VGPRRegBank}, {192, 64, VGPRRegBank},
  /* 8 64-bit loads */ {0, 64, VGPRRegBank}, {64, 64, VGPRRegBank},
                       {128, 64, VGPRRegBank}, {192, 64, VGPRRegBank},
                       {256, 64, VGPRRegBank}, {320, 64, VGPRRegBank},
                       {384, 64, VGPRRegBank}, {448, 64, VGPRRegBank},

  /* FIXME: The generic register bank select does not support complex
   * break downs where the number of vector elements does not equal the
   * number of breakdowns.
   * FIXME: register bank select now tries to handle complex break downs,
   * but it emits an illegal instruction:
   * %1:vgpr(<8 x s32>) = G_CONCAT_VECTORS %2:vgpr(s128), %3:vgpr(s128)
   */
  /* 2 128-bit loads */ {0, 128, VGPRRegBank}, {128, 128, VGPRRegBank},
  /* 4 128-bit loads */ {0, 128, VGPRRegBank}, {128, 128, VGPRRegBank},
                        {256, 128, VGPRRegBank}, {384, 128, VGPRRegBank}
};

// One ValueMapping per wide-load shape handled by
// getValueMappingLoadSGPROnly.  BreakDown points at the first
// PartialMapping of the corresponding group in LoadSGPROnlyBreakDown, and
// NumBreakDowns is the number of pieces in that group (1 means the load is
// left unsplit).
const RegisterBankInfo::ValueMapping ValMappingsLoadSGPROnly[] {
  /* 256-bit load */    {&LoadSGPROnlyBreakDown[0], 1},
  /* 512-bit load */    {&LoadSGPROnlyBreakDown[1], 1},
  /* <8 x i32> load */  {&LoadSGPROnlyBreakDown[2], 8},
  /* <16 x i32> load */ {&LoadSGPROnlyBreakDown[10], 16},
  /* <4 x i64> load */  {&LoadSGPROnlyBreakDown[26], 4},
  /* <8 x i64> load */  {&LoadSGPROnlyBreakDown[30], 8}
};

// Pick a value mapping for a load result.  Narrow (< 256-bit) results and
// SGPR results use the ordinary mappings; wide VGPR results of a known
// vector type get a split mapping from ValMappingsLoadSGPROnly.
const RegisterBankInfo::ValueMapping *
getValueMappingLoadSGPROnly(unsigned BankID, LLT SizeTy) {
  const unsigned Size = SizeTy.getSizeInBits();
  if (Size < 256 || BankID == AMDGPU::SGPRRegBankID)
    return getValueMapping(BankID, Size);

  assert((Size == 256 || Size == 512) && BankID == AMDGPU::VGPRRegBankID);

  // Entries 0/1 are the unsplit 256/512-bit mappings.  They remain the
  // fallback when the vector type is not one we know how to break down.
  unsigned Idx = (Size == 256) ? 0 : 1;

  // A load with a vgpr pointer must be split into smaller pieces.
  if (BankID == AMDGPU::VGPRRegBankID) {
    struct SplitEntry {
      unsigned NumElts;
      unsigned EltBits;
      unsigned MapIdx;
    };
    const SplitEntry Table[] = {
        {8, 32, 2}, {16, 32, 3}, {4, 64, 4}, {8, 64, 5}};
    for (const SplitEntry &E : Table) {
      if (SizeTy == LLT::vector(E.NumElts, E.EltBits)) {
        Idx = E.MapIdx;
        break;
      }
    }
  }

  return &ValMappingsLoadSGPROnly[Idx];
}


} // End AMDGPU namespace.
} // End llvm namespace.
9 changes: 8 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Expand Up @@ -517,7 +517,14 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,

case 256:
case 512:
// TODO: constant loads
// TODO: Possibly support loads of i256 and i512. This will require
// adding i256 and i512 types to MVT in order to be able to use
// TableGen.
// TODO: Add support for other vector types, this will require
// defining more value mappings for the new types.
return Ty0.isVector() && (Ty0.getScalarType().getSizeInBits() == 32 ||
Ty0.getScalarType().getSizeInBits() == 64);

default:
return false;
}
Expand Down
172 changes: 136 additions & 36 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Expand Up @@ -37,22 +37,23 @@ using namespace llvm;
namespace {

// Observer to apply a register bank to new registers created by LegalizerHelper.
class ApplySALUMapping final : public GISelChangeObserver {
class ApplyRegBankMapping final : public GISelChangeObserver {
private:
MachineRegisterInfo &MRI;
const RegisterBank *NewBank;
SmallVector<MachineInstr *, 4> NewInsts;

public:
ApplySALUMapping(MachineRegisterInfo &MRI_)
: MRI(MRI_) {}
ApplyRegBankMapping(MachineRegisterInfo &MRI_, const RegisterBank *RB)
: MRI(MRI_), NewBank(RB) {}

~ApplySALUMapping() {
~ApplyRegBankMapping() {
for (MachineInstr *MI : NewInsts)
applySALUBank(*MI);
applyBank(*MI);
}

/// Set any registers that don't have a set register class or bank to SALU.
void applySALUBank(MachineInstr &MI) {
void applyBank(MachineInstr &MI) {
for (MachineOperand &Op : MI.operands()) {
if (!Op.isReg())
continue;
Expand All @@ -61,10 +62,13 @@ class ApplySALUMapping final : public GISelChangeObserver {
if (MRI.getRegClassOrRegBank(Reg))
continue;

const RegisterBank *RB = NewBank;
// FIXME: This might not be enough to detect when SCC should be used.
const RegisterBank &RB = MRI.getType(Reg) == LLT::scalar(1) ?
AMDGPU::SCCRegBank : AMDGPU::SGPRRegBank;
MRI.setRegBank(Reg, RB);
if (MRI.getType(Reg) == LLT::scalar(1))
RB = (NewBank == &AMDGPU::SGPRRegBank ?
&AMDGPU::SCCRegBank : &AMDGPU::VCCRegBank);

MRI.setRegBank(Reg, *RB);
}
}

Expand All @@ -80,7 +84,6 @@ class ApplySALUMapping final : public GISelChangeObserver {
};

}

AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI)
: AMDGPUGenRegisterBankInfo(),
TRI(static_cast<const SIRegisterInfo*>(&TRI)) {
Expand Down Expand Up @@ -128,6 +131,12 @@ unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
unsigned AMDGPURegisterBankInfo::getBreakDownCost(
const ValueMapping &ValMapping,
const RegisterBank *CurBank) const {
// Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
// VGPR.
// FIXME: Is there a better way to do this?
if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
return 10; // This is expensive.

assert(ValMapping.NumBreakDowns == 2 &&
ValMapping.BreakDown[0].Length == 32 &&
ValMapping.BreakDown[0].StartIdx == 0 &&
Expand Down Expand Up @@ -302,6 +311,14 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
}
}

static bool isInstrUniform(const MachineInstr &MI) {
if (!MI.hasOneMemOperand())
return false;

const MachineMemOperand *MMO = *MI.memoperands_begin();
return AMDGPUInstrInfo::isUniformMMO(MMO);
}

RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappings(
const MachineInstr &MI) const {
Expand Down Expand Up @@ -356,29 +373,29 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings(
}
case TargetOpcode::G_LOAD: {
unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
// FIXME: Should we be hard coding the size for these mappings?
const InstructionMapping &SSMapping = getInstructionMapping(
1, 1, getOperandsMapping(
{AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}),
2); // Num Operands
AltMappings.push_back(&SSMapping);
if (isInstrUniform(MI)) {
const InstructionMapping &SSMapping = getInstructionMapping(
1, 1, getOperandsMapping(
{AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}),
2); // Num Operands
AltMappings.push_back(&SSMapping);
}

const InstructionMapping &VVMapping = getInstructionMapping(
2, 1, getOperandsMapping(
{AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
{AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy),
AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64)}),
2); // Num Operands
AltMappings.push_back(&VVMapping);

// FIXME: Should this be the pointer-size (64-bits) or the size of the
// register that will hold the buffer resource (128-bits).
const InstructionMapping &VSMapping = getInstructionMapping(
3, 1, getOperandsMapping(
{AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}),
2); // Num Operands
AltMappings.push_back(&VSMapping);
// It may be possible to have a vgpr = load sgpr mapping here, because
// the mubuf instructions support this kind of load, but probably for only
// gfx7 and older. However, the addressing mode matching in the instruction
// selector should be able to do a better job of detecting and selecting
// these kinds of loads from the vgpr = load vgpr mapping.

return AltMappings;

Expand Down Expand Up @@ -874,6 +891,91 @@ void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
MI.getOperand(OpIdx).setReg(SGPR);
}

// When regbankselect repairs registers, it will insert a repair instruction
// which defines the repaired register. Then it calls applyMapping and expects
// that the targets will either delete or rewrite the instructions that
// originally wrote to the repaired registers. Because of this, we end up in a
// situation where we have 2 instructions defining the same registers.
// Return a definition of \p Reg other than \p MI itself, or null when
// \p MI is the only instruction defining it.
// Is there some way we can assert that there are exactly 2 def instructions?
static MachineInstr *getOtherVRegDef(const MachineRegisterInfo &MRI,
                                     Register Reg,
                                     const MachineInstr &MI) {
  MachineInstr *OtherDef = nullptr;
  for (MachineInstr &Def : MRI.def_instructions(Reg)) {
    if (&Def == &MI)
      continue;
    OtherDef = &Def;
    break;
  }
  return OtherDef;
}

// Split a wide (> 128-bit) G_LOAD whose pointer register was repaired into
// VGPRs.  The load is broken into 128-bit pieces via
// LegalizerHelper::fewerElementsVector, and the partial result registers
// that RegBankSelect expects are filled by extracting elements from the
// recombined value.  Returns true if MI was rewritten, false when nothing
// needed to change (narrow load, or SGPR pointer).
bool AMDGPURegisterBankInfo::applyMappingWideLoad(MachineInstr &MI,
                        const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
                                                  MachineRegisterInfo &MRI) const {
  Register DstReg = MI.getOperand(0).getReg();
  const LLT LoadTy = MRI.getType(DstReg);
  unsigned LoadSize = LoadTy.getSizeInBits();
  const unsigned MaxNonSmrdLoadSize = 128;
  // 128-bit loads are supported for all instruction types.
  if (LoadSize <= MaxNonSmrdLoadSize)
    return false;

  SmallVector<unsigned, 16> DefRegs(OpdMapper.getVRegs(0));
  SmallVector<unsigned, 1> SrcRegs(OpdMapper.getVRegs(1));

  // If the pointer is an SGPR, we have nothing to do.
  if (SrcRegs.empty())
    return false;

  // The partial mappings only cover whole multiples of 128 bits.
  assert(LoadSize % MaxNonSmrdLoadSize == 0);

  // We want to get the repair instruction now, because it will help us
  // determine which instruction the legalizer inserts that will also
  // write to DstReg.
  MachineInstr *RepairInst = getOtherVRegDef(MRI, DstReg, MI);

  // RegBankSelect only emits scalar types, so we need to reset the pointer
  // operand to a pointer type.
  Register BasePtrReg = SrcRegs[0];
  LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
  MRI.setType(BasePtrReg, PtrTy);

  MachineIRBuilder B(MI);

  // Number of elements that fit in one 128-bit piece; the legalizer splits
  // the load into LoadSize / 128 loads of this vector type.
  unsigned SplitElts =
      MaxNonSmrdLoadSize / LoadTy.getScalarType().getSizeInBits();
  const LLT LoadSplitTy = LLT::vector(SplitElts, LoadTy.getScalarType());
  // Assign the VGPR bank to every register the legalizer creates.
  ApplyRegBankMapping O(MRI, &AMDGPU::VGPRRegBank);
  GISelObserverWrapper Observer(&O);
  B.setChangeObserver(Observer);
  LegalizerHelper Helper(B.getMF(), Observer, B);
  if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
    return false;

  // At this point, the legalizer has split the original load into smaller
  // loads.  At the end of lowering, it inserts an instruction (LegalizedInst)
  // that combines the outputs of the lower loads and writes it to DstReg.
  // The register bank selector has also added the RepairInst which writes to
  // DstReg as well.
  MachineInstr *LegalizedInst = getOtherVRegDef(MRI, DstReg, *RepairInst);

  // Replace the output of the LegalizedInst with a temporary register, since
  // RepairInst already defines DstReg.
  Register TmpReg = MRI.createGenericVirtualRegister(MRI.getType(DstReg));
  LegalizedInst->getOperand(0).setReg(TmpReg);
  B.setInsertPt(*RepairInst->getParent(), RepairInst);

  // Feed each partial result register the mapping expects by extracting the
  // corresponding element from the combined temporary value.
  for (unsigned DefIdx = 0, e = DefRegs.size(); DefIdx != e; ++DefIdx) {
    Register IdxReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
    B.buildConstant(IdxReg, DefIdx);
    MRI.setRegBank(IdxReg, getRegBank(AMDGPU::VGPRRegBankID));
    B.buildExtractVectorElement(DefRegs[DefIdx], TmpReg, IdxReg);
  }

  MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID));
  return true;
}

// For cases where only a single copy is inserted for matching register banks.
// Replace the register in the instruction operand
static void substituteSimpleCopyRegs(
Expand Down Expand Up @@ -1008,7 +1110,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
// 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
MachineFunction *MF = MI.getParent()->getParent();
MachineIRBuilder B(MI);
ApplySALUMapping ApplySALU(MRI);
ApplyRegBankMapping ApplySALU(MRI, &AMDGPU::SGPRRegBank);
GISelObserverWrapper Observer(&ApplySALU);
LegalizerHelper Helper(*MF, Observer, B);

Expand All @@ -1028,7 +1130,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(

MachineFunction *MF = MI.getParent()->getParent();
MachineIRBuilder B(MI);
ApplySALUMapping ApplySALU(MRI);
ApplyRegBankMapping ApplySALU(MRI, &AMDGPU::SGPRRegBank);
GISelObserverWrapper Observer(&ApplySALU);
LegalizerHelper Helper(*MF, Observer, B);

Expand Down Expand Up @@ -1212,21 +1314,18 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
}
break;
}
case AMDGPU::G_LOAD: {
if (applyMappingWideLoad(MI, OpdMapper, MRI))
return;
break;
}
default:
break;
}

return applyDefaultMapping(OpdMapper);
}

static bool isInstrUniform(const MachineInstr &MI) {
if (!MI.hasOneMemOperand())
return false;

const MachineMemOperand *MMO = *MI.memoperands_begin();
return AMDGPUInstrInfo::isUniformMMO(MMO);
}

bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
const MachineFunction &MF = *MI.getParent()->getParent();
const MachineRegisterInfo &MRI = MF.getRegInfo();
Expand Down Expand Up @@ -1322,6 +1421,7 @@ AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
const MachineRegisterInfo &MRI = MF.getRegInfo();
SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
unsigned PtrSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);

const ValueMapping *ValMapping;
Expand All @@ -1332,7 +1432,7 @@ AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
} else {
ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
ValMapping = AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy);
// FIXME: What would happen if we used SGPRRegBankID here?
PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
}
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
Expand Up @@ -44,6 +44,9 @@ class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo {

void constrainOpWithReadfirstlane(MachineInstr &MI, MachineRegisterInfo &MRI,
unsigned OpIdx) const;
bool applyMappingWideLoad(MachineInstr &MI,
const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
MachineRegisterInfo &MRI) const;

/// See RegisterBankInfo::applyMapping.
void applyMappingImpl(const OperandsMapper &OpdMapper) const override;
Expand Down

0 comments on commit d0ba79f

Please sign in to comment.