[AMDGPU] Define 16 bit VGPR subregs
We have loads that preserve the low or high 16 bits of their
destinations. However, we always use a whole 32 bit register
for these. The same happens with 16 bit stores: we have to
use a full 32 bit register, so if the high bits are clobbered
the register needs to be copied. One example of such code is
added to load-hi16.ll.

The proper solution to the problem is to define 16 bit subregs
and use them in operations which either do not read the other
half of a VGPR or preserve it when the VGPR is written.

This patch simply defines subregisters and register classes.
At the moment there should be no difference in code generation.
A lot more work is needed to actually use these new register
classes. Therefore, there are no new tests at this time.

Register weight calculation has changed with the new subregs,
so appropriate adjustments were made to keep all calculations,
especially register pressure, just as they were.

Differential Revision: https://reviews.llvm.org/D74873
rampitec committed Mar 31, 2020
1 parent 28518d9 commit 08682dc
Showing 10 changed files with 189 additions and 62 deletions.
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
@@ -11,7 +11,7 @@ def SGPRRegBank : RegisterBank<"SGPR",
>;

def VGPRRegBank : RegisterBank<"VGPR",
[VGPR_32, VReg_64, VReg_96, VReg_128, VReg_256, VReg_512, VReg_1024]
[VGPR_LO16, VGPR_HI16, VGPR_32, VReg_64, VReg_96, VReg_128, VReg_256, VReg_512, VReg_1024]
>;

// It is helpful to distinguish conditions from ordinary SGPRs.
20 changes: 18 additions & 2 deletions llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -40,7 +40,20 @@ static cl::opt<bool> EnableSpillSGPRToVGPR(

SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST)
: AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour()), ST(ST),
SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) {}
SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) {

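// Each 32-bit subN index must occupy two adjacent lane-mask bits (sub0 =
// 0b11 up to sub31 in the top pair), and lo16/hi16 together must cover
// exactly sub0; getNumCoveredRegs() in SIRegisterInfo.h relies on this layout.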
assert(getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() == 3 &&
getSubRegIndexLaneMask(AMDGPU::sub31).getAsInteger() == (3ULL << 62) &&
(getSubRegIndexLaneMask(AMDGPU::lo16) |
getSubRegIndexLaneMask(AMDGPU::hi16)).getAsInteger() ==
getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() &&
"getNumCoveredRegs() will not work with generated subreg masks!");

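// Each 32-bit VGPR is now made of two register units (its lo16 and hi16
// halves). Ignore the hi16 units when computing pressure so that pressure
// is still counted in whole VGPRs, exactly as before this change.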
RegPressureIgnoredUnits.resize(getNumRegUnits());
RegPressureIgnoredUnits.set(*MCRegUnitIterator(AMDGPU::M0, this));
for (auto Reg : AMDGPU::VGPR_HI16RegClass)
RegPressureIgnoredUnits.set(*MCRegUnitIterator(Reg, this));
}

void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved,
unsigned Reg) const {
@@ -1777,6 +1790,8 @@ unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
default:
return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF);
case AMDGPU::VGPR_32RegClassID:
case AMDGPU::VGPR_LO16RegClassID:
case AMDGPU::VGPR_HI16RegClassID:
return std::min(ST.getMaxNumVGPRs(Occupancy), ST.getMaxNumVGPRs(MF));
case AMDGPU::SGPR_32RegClassID:
return std::min(ST.getMaxNumSGPRs(Occupancy, true), ST.getMaxNumSGPRs(MF));
@@ -1800,8 +1815,9 @@ unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const {
static const int Empty[] = { -1 };

if (hasRegUnit(AMDGPU::M0, RegUnit))
if (RegPressureIgnoredUnits[RegUnit])
return Empty;

return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit);
}

9 changes: 8 additions & 1 deletion llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -32,6 +32,7 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
const GCNSubtarget &ST;
bool SpillSGPRToVGPR;
bool isWave32;
BitVector RegPressureIgnoredUnits;

void reserveRegisterTuples(BitVector &, unsigned Reg) const;

@@ -269,7 +270,13 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {

// \returns number of 32 bit registers covered by a \p LM
static unsigned getNumCoveredRegs(LaneBitmask LM) {
return LM.getNumLanes();
// The assumption is that every lo16 subreg is an even bit and every hi16
// is an adjacent odd bit or vice versa.
uint64_t Mask = LM.getAsInteger();
uint64_t Even = Mask & 0xAAAAAAAAAAAAAAAAULL;
Mask = (Even >> 1) | Mask;
uint64_t Odd = Mask & 0x5555555555555555ULL;
return countPopulation(Odd);
}

// \returns the DWORD offset of \p SubReg
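To make the bit trick above concrete, here is a minimal standalone sketch of the same folding (hypothetical helper name; C++20 std::popcount stands in for LLVM's countPopulation):

#include <bit>
#include <cassert>
#include <cstdint>

// lo16 lanes occupy even mask bits and hi16 lanes the adjacent odd bits, so
// any combination of a register's two lane bits must count as one register.
static unsigned numCoveredRegs(uint64_t Mask) {
  uint64_t Even = Mask & 0xAAAAAAAAAAAAAAAAULL; // lanes at odd bit positions
  Mask = (Even >> 1) | Mask;                    // fold each pair onto its even bit
  uint64_t Odd = Mask & 0x5555555555555555ULL;  // one bit left per 32-bit register
  return static_cast<unsigned>(std::popcount(Odd));
}

int main() {
  assert(numCoveredRegs(0b0011) == 1); // sub0: both halves of one VGPR
  assert(numCoveredRegs(0b0001) == 1); // sub0_lo16 alone still covers one VGPR
  assert(numCoveredRegs(0b0010) == 1); // sub0_hi16 alone, likewise
  assert(numCoveredRegs(0b1111) == 2); // sub0 and sub1: two VGPRs
  return 0;
}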
90 changes: 87 additions & 3 deletions llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -10,10 +10,44 @@
// Subregister declarations
//===----------------------------------------------------------------------===//

class Indexes<int N> {
list<int> all = [0, 1, 2, 3, 4, 5, 6 , 7,
8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23,
24, 25, 26, 27, 28, 29, 30, 31];

// Returns list of indexes [0..N)
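// e.g. Indexes<3>.slice is [0, 1, 2]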
list<int> slice =
!foldl([]<int>, all, acc, cur,
!listconcat(acc, !if(!lt(cur, N), [cur], [])));
}

let Namespace = "AMDGPU" in {

def lo16 : SubRegIndex<16, 0>;
def hi16 : SubRegIndex<16, 16>;

foreach Index = 0-31 in {
def sub#Index : SubRegIndex<32, !shl(Index, 5)>;
}

foreach Index = 1-31 in {
def sub#Index#_lo16 : ComposedSubRegIndex<!cast<SubRegIndex>(sub#Index), lo16>;
def sub#Index#_hi16 : ComposedSubRegIndex<!cast<SubRegIndex>(sub#Index), hi16>;
}

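// Generate the composite indices for register tuples: for each Size in
// dwords (the 64..160 bit, 256 bit and 512 bit tuples) and each start Index,
// e.g. Size = 2, Index = 0 yields sub0_sub1 : SubRegIndex<64, 0> with
// CoveringSubRegIndices = [sub0, sub1].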
foreach Size = {2-5,8,16} in {
foreach Index = Indexes<!add(33, !mul(Size, -1))>.slice in {
def !foldl("", Indexes<Size>.slice, acc, cur,
!strconcat(acc#!if(!eq(acc,""),"","_"), "sub"#!add(cur, Index))) :
SubRegIndex<!mul(Size, 32), !shl(Index, 5)> {
let CoveringSubRegIndices =
!foldl([]<SubRegIndex>, Indexes<Size>.slice, acc, cur,
!listconcat(acc, [!cast<SubRegIndex>(sub#!add(cur, Index))]));
}
}
}

}

//===----------------------------------------------------------------------===//
@@ -92,6 +126,16 @@ class SIReg <string n, bits<16> regIdx = 0> :
let HWEncoding = regIdx;
}

class SIRegWithSubRegs <string n, list<Register> subregs, bits<16> regIdx = 0> :
RegisterWithSubRegs<n, subregs> {
let Namespace = "AMDGPU";

// This is not yet the complete register encoding. An additional
// bit is set for VGPRs.
let HWEncoding = regIdx;
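// The register is fully covered by its lo16/hi16 halves, so defining both
// halves defines the whole register.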
let CoveredBySubRegs = 1;
}

// Special Registers
def VCC_LO : SIReg<"vcc_lo", 106>;
def VCC_HI : SIReg<"vcc_hi", 107>;
@@ -221,10 +265,29 @@ foreach Index = 0-255 in {
// The ratio index/allocation_granularity is taken as the cost value.
// The allocation granularity is taken to be 4 here.
let CostPerUse=!if(!gt(Index, 31), !srl(Index, 2), 0) in {
def VGPR#Index :
SIReg <"v"#Index, Index>,
DwarfRegNum<[!add(Index, 2560), !add(Index, 1536)]> {

// There is no special encoding for the low 16 bit subreg: this is not a real
// register but rather an operand for instructions preserving the high 16 bits
// of the result or reading just the low 16 bits of a 32 bit VGPR.
// It is encoded as the corresponding 32 bit register.
def VGPR#Index#_LO16 : SIReg <"v"#Index#".l", Index>,
DwarfRegNum<[!add(Index, 2560), !add(Index, 1536)]> {
let HWEncoding{8} = 1;
}
// There is no special encoding for the high 16 bit subreg: this is not a real
// register but rather an operand for instructions preserving the low 16 bits
// of the result or reading just the high 16 bits of a 32 bit VGPR.
// It is encoded as the corresponding 32 bit register.
def VGPR#Index#_HI16 : SIReg <"v"#Index#".h", Index>,
DwarfRegNum<[!add(Index, 2560), !add(Index, 1536)]> {
let HWEncoding{8} = 1;
}
def VGPR#Index : SIRegWithSubRegs <"v"#Index,
[!cast<Register>("VGPR"#Index#"_LO16"), !cast<Register>("VGPR"#Index#"_HI16")],
Index>,
DwarfRegNum<[!add(Index, 2560), !add(Index, 1536)]> {
let HWEncoding{8} = 1;
let SubRegIndices = [lo16, hi16];
}
}
}
@@ -386,13 +449,27 @@ class RegisterTypes<list<ValueType> reg_types> {
def Reg16Types : RegisterTypes<[i16, f16]>;
def Reg32Types : RegisterTypes<[i32, f32, v2i16, v2f16, p2, p3, p5, p6]>;

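// The 16-bit halves of the VGPRs. GeneratePressureSet = 0 suppresses the
// TableGen-generated pressure sets for these classes; pressure is tracked
// in whole 32-bit VGPRs instead (see RegPressureIgnoredUnits in
// SIRegisterInfo.cpp).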
def VGPR_LO16 : RegisterClass<"AMDGPU", Reg16Types.types, 16,
(add (sequence "VGPR%u_LO16", 0, 255))> {
let AllocationPriority = 1;
let Size = 16;
let GeneratePressureSet = 0;
}

def VGPR_HI16 : RegisterClass<"AMDGPU", Reg16Types.types, 16,
(add (sequence "VGPR%u_HI16", 0, 255))> {
let AllocationPriority = 1;
let Size = 16;
let GeneratePressureSet = 0;
}

// VGPR 32-bit registers
// i16/f16 only on VI+
def VGPR_32 : RegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32,
(add (sequence "VGPR%u", 0, 255))> {
let AllocationPriority = 1;
let Size = 32;
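// The 16-bit subregs change the default (unit-based) register weights; set
// Weight explicitly to the class's size in 32-bit registers so weight
// calculations stay just as they were before the new subregs.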
let Weight = 1;
}

// VGPR 64-bit registers
@@ -634,6 +711,7 @@ def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16, p0,
// Requires 2 v_mov_b32 to copy
let CopyCost = 2;
let AllocationPriority = 2;
let Weight = 2;
}

@@ -642,6 +720,7 @@ def VReg_96 : RegisterClass<"AMDGPU", [v3i32, v3f32], 32, (add VGPR_96)> {
// Requires 3 v_mov_b32 to copy
let CopyCost = 3;
let AllocationPriority = 3;
let Weight = 3;
}

@@ -651,6 +730,7 @@ def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64, i128], 32,
// Requires 4 v_mov_b32 to copy
let CopyCost = 4;
let AllocationPriority = 4;
let Weight = 4;
}

@@ -660,27 +740,31 @@ def VReg_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32,
// Requires 5 v_mov_b32 to copy
let CopyCost = 5;
let AllocationPriority = 5;
let Weight = 5;
}

def VReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32,
(add VGPR_256)> {
let Size = 256;
let CopyCost = 8;
let AllocationPriority = 6;
let Weight = 8;
}

def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32,
(add VGPR_512)> {
let Size = 512;
let CopyCost = 16;
let AllocationPriority = 7;
let Weight = 16;
}

def VReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32,
(add VGPR_1024)> {
let Size = 1024;
let CopyCost = 32;
let AllocationPriority = 8;
let Weight = 32;
}

def AReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16], 32,
