AMDGPU/SI: Fold operands with sub-registers
Summary:
Multi-dword constant loads generated unnecessary moves from SGPRs into VGPRs,
increasing the code size and VGPR pressure. These moves are now folded away.

Note that this lack of operand folding was not a problem for VMEM loads,
because COPY nodes from VReg_Nnn to VGPR_32 are eliminated by the register
coalescer.

Some tests are updated; note that the fsub.ll test explicitly checks that
the move is elided.
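
For a single lane of the affected pattern, the effect looks roughly like the
sketch below: hand-written SI assembly with invented registers and offsets,
not actual compiler output. Before the fold, a dword of a multi-dword SGPR
load was first copied into a VGPR:

    s_load_dwordx2 s[0:1], s[2:3], 0xb
    v_mov_b32_e32 v0, s0          ; unnecessary SGPR -> VGPR move
    v_subrev_f32_e32 v2, v0, v1

With the fold, the SGPR is used directly, which is legal because a VOP2
instruction may take one SGPR source:

    s_load_dwordx2 s[0:1], s[2:3], 0xb
    v_subrev_f32_e32 v2, s0, v1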

With the IR generated by current Mesa, the overall changes are, as expected,
relatively minor:

7063 shaders in 3531 tests
Totals:
SGPRS: 351872 -> 352560 (0.20 %)
VGPRS: 199984 -> 200732 (0.37 %)
Code Size: 9876968 -> 9881112 (0.04 %) bytes
LDS: 91 -> 91 (0.00 %) blocks
Scratch: 1779712 -> 1767424 (-0.69 %) bytes per wave
Wait states: 295164 -> 295337 (0.06 %)

Totals from affected shaders:
SGPRS: 65784 -> 66472 (1.05 %)
VGPRS: 38064 -> 38812 (1.97 %)
Code Size: 1993828 -> 1997972 (0.21 %) bytes
LDS: 42 -> 42 (0.00 %) blocks
Scratch: 795648 -> 783360 (-1.54 %) bytes per wave
Wait states: 54026 -> 54199 (0.32 %)

Reviewers: tstellarAMD, arsenm, mareko

Subscribers: arsenm, llvm-commits

Differential Revision: http://reviews.llvm.org/D15875

llvm-svn: 257074
nhaehnle committed Jan 7, 2016 (commit 82fc962, 1 parent: 3c05d6d)
Showing 7 changed files with 45 additions and 21 deletions.
llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp (1 addition & 1 deletion)
@@ -215,7 +215,7 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
 
   for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {
     unsigned SrcReg = MI.getOperand(I).getReg();
-    unsigned SrcSubReg = MI.getOperand(I).getReg();
+    unsigned SrcSubReg = MI.getOperand(I).getSubReg();
 
     const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
     assert(TRI->isSGPRClass(SrcRC) &&
llvm/lib/Target/AMDGPU/SIFoldOperands.cpp (1 addition & 4 deletions)
@@ -334,13 +334,10 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
           !MRI.hasOneUse(MI.getOperand(0).getReg()))
         continue;
 
-      // FIXME: Fold operands with subregs.
       if (OpToFold.isReg() &&
-          (!TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()) ||
-           OpToFold.getSubReg()))
+          !TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()))
         continue;
 
-
       // We need mutate the operands of new mov instructions to add implicit
       // uses of EXEC, but adding them invalidates the use_iterator, so defer
       // this.
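
In machine-IR terms, the rejected case that is now allowed is a fold through
a sub-register use. A hypothetical MIR-style sketch (virtual register numbers
and opcode selection invented for illustration):

    %vreg1 = S_LOAD_DWORDX2_IMM ...           ; 64-bit SGPR result
    %vreg2 = COPY %vreg1:sub0                 ; SGPR -> VGPR move
    %vreg3 = V_SUBREV_F32_e32 %vreg2, %vreg0

can now be rewritten by SIFoldOperands into:

    %vreg1 = S_LOAD_DWORDX2_IMM ...
    %vreg3 = V_SUBREV_F32_e32 %vreg1:sub0, %vreg0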
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (4 additions & 0 deletions)
@@ -1777,6 +1777,10 @@ bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
     MRI.getRegClass(Reg) :
     RI.getPhysRegClass(Reg);
 
+  const SIRegisterInfo *TRI =
+      static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
+  RC = TRI->getSubRegClass(RC, MO.getSubReg());
+
   // In order to be legal, the common sub-class must be equal to the
   // class of the current operand. For example:
   //
llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp (30 additions & 4 deletions)
@@ -464,12 +464,38 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
   if (SubIdx == AMDGPU::NoSubRegister)
     return RC;
 
-  // If this register has a sub-register, we can safely assume it is a 32-bit
-  // register, because all of SI's sub-registers are 32-bit.
+  // We can assume that each lane corresponds to one 32-bit register.
+  unsigned Count = countPopulation(getSubRegIndexLaneMask(SubIdx));
   if (isSGPRClass(RC)) {
-    return &AMDGPU::SGPR_32RegClass;
+    switch (Count) {
+    case 1:
+      return &AMDGPU::SGPR_32RegClass;
+    case 2:
+      return &AMDGPU::SReg_64RegClass;
+    case 4:
+      return &AMDGPU::SReg_128RegClass;
+    case 8:
+      return &AMDGPU::SReg_256RegClass;
+    case 16: /* fall-through */
+    default:
+      llvm_unreachable("Invalid sub-register class size");
+    }
   } else {
-    return &AMDGPU::VGPR_32RegClass;
+    switch (Count) {
+    case 1:
+      return &AMDGPU::VGPR_32RegClass;
+    case 2:
+      return &AMDGPU::VReg_64RegClass;
+    case 3:
+      return &AMDGPU::VReg_96RegClass;
+    case 4:
+      return &AMDGPU::VReg_128RegClass;
+    case 8:
+      return &AMDGPU::VReg_256RegClass;
+    case 16: /* fall-through */
+    default:
+      llvm_unreachable("Invalid sub-register class size");
+    }
   }
 }
 
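The effect of the new switch can be illustrated with a small caller. This is
a minimal sketch, not code from this commit; it assumes the composed
sub-register index AMDGPU::sub0_sub1 and the usual AMDGPU backend headers:

    // Ask for the class covering a two-lane sub-register of a 128-bit
    // scalar value. The lane mask of sub0_sub1 has two bits set, so
    // Count == 2 and the SGPR branch of the switch returns the 64-bit class.
    static const TargetRegisterClass *
    exampleSubRegClass(const SIRegisterInfo &TRI) {
      return TRI.getSubRegClass(&AMDGPU::SReg_128RegClass,
                                AMDGPU::sub0_sub1);
      // == &AMDGPU::SReg_64RegClass
    }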
llvm/test/CodeGen/AMDGPU/fmin_legacy.ll (2 additions & 2 deletions)
@@ -8,8 +8,8 @@ declare i32 @llvm.r600.read.tidig.x() #1
 
 ; FUNC-LABEL: @test_fmin_legacy_f32
 ; EG: MIN *
-; SI-SAFE: v_min_legacy_f32_e32
-; SI-NONAN: v_min_f32_e32
+; SI-SAFE: v_min_legacy_f32_e64
+; SI-NONAN: v_min_f32_e64
 define void @test_fmin_legacy_f32(<4 x float> addrspace(1)* %out, <4 x float> inreg %reg0) #0 {
   %r0 = extractelement <4 x float> %reg0, i32 0
   %r1 = extractelement <4 x float> %reg0, i32 1
llvm/test/CodeGen/AMDGPU/fsub.ll (6 additions & 9 deletions)
@@ -32,9 +32,8 @@ declare void @llvm.AMDGPU.store.output(float, i32)
 ; R600-DAG: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, -KC0[3].Z
 ; R600-DAG: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, -KC0[3].Y
 
-; FIXME: Should be using SGPR directly for first operand
-; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
 define void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
   %sub = fsub <2 x float> %a, %b
   store <2 x float> %sub, <2 x float> addrspace(1)* %out, align 8
@@ -60,13 +59,11 @@ define void @v_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(
   ret void
 }
 
-; FIXME: Should be using SGPR directly for first operand
-
 ; FUNC-LABEL: {{^}}s_fsub_v4f32:
-; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
 ; SI: s_endpgm
 define void @s_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) {
   %result = fsub <4 x float> %a, %b
llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll (1 addition & 1 deletion)
@@ -21,7 +21,7 @@ define void @round_f64(double addrspace(1)* %out, double %x) #0 {
 ; SI-DAG: v_cmp_eq_i32
 
 ; SI-DAG: s_mov_b32 [[BFIMASK:s[0-9]+]], 0x7fffffff
-; SI-DAG: v_cmp_gt_i32_e32
+; SI-DAG: v_cmp_gt_i32
 ; SI-DAG: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[BFIMASK]]
 
 ; SI: buffer_store_dwordx2
