AMDGPU: Select f64 fmul by negative power of 2 to ldexp
Select fmul x, -K -> ldexp(-x, log2(fabs(K)))
Select fmul fabs(x), -K -> ldexp(-|x|, log2(fabs(K)))

https://reviews.llvm.org/D158173
arsenm committed Aug 24, 2023
1 parent 4c4ff50 commit 16bc07a
Showing 4 changed files with 486 additions and 255 deletions.
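
The identity behind both selections is that multiplying by a power of two is exact, so the mul can be replaced by an exponent adjustment. A minimal host-side sanity check (not part of the patch; plain C++ with std::ldexp standing in for V_LDEXP_F64):

```cpp
#include <cassert>
#include <cmath>
#include <initializer_list>

int main() {
  const double K = -16.0;                   // negative power of two, as in the tests below
  const int Exp = std::ilogb(std::fabs(K)); // log2(16.0) == 4
  for (double X : {1.0, -3.5, 0.3333333333333333}) {
    assert(X * K == std::ldexp(-X, Exp));                       // fmul x, -K
    assert(std::fabs(X) * K == std::ldexp(-std::fabs(X), Exp)); // fmul fabs(x), -K
  }
  return 0;
}
```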
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp (2 changes: 1 addition & 1 deletion)
@@ -5131,7 +5131,7 @@ void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB,
                                                        const MachineInstr &MI,
                                                        int OpIdx) const {
   const APFloat &APF = MI.getOperand(1).getFPImm()->getValueAPF();
-  int ExpVal = APF.getExactLog2();
+  int ExpVal = APF.getExactLog2Abs();
   assert(ExpVal != INT_MIN);
   MIB.addImm(ExpVal);
 }
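
getExactLog2Abs() differs from getExactLog2() only in ignoring the sign of the value, which is what lets the same render hook serve both the positive and negative patterns. A rough sketch of its behavior (assuming the APFloat interface from this tree; the wrapper function is made up):

```cpp
#include <climits>
#include "llvm/ADT/APFloat.h"

// Returns the exponent if |V| is an exact power of two, INT_MIN otherwise.
static int exactLog2Abs(double V) {
  llvm::APFloat APF(V);
  return APF.getExactLog2Abs(); // e.g. 16.0 -> 4, -16.0 -> 4, 3.0 -> INT_MIN
}
```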
llvm/lib/Target/AMDGPU/SIInstructions.td (41 changes: 36 additions & 5 deletions)
@@ -3330,7 +3330,7 @@ defm : Int16Med3Pat<V_MED3_U16_e64, umin, umax>;
 // Convert a floating-point power of 2 to the integer exponent.
 def FPPow2ToExponentXForm : SDNodeXForm<fpimm, [{
   const auto &APF = N->getValueAPF();
-  int Log2 = APF.getExactLog2();
+  int Log2 = APF.getExactLog2Abs();
   assert(Log2 != INT_MIN);
   return CurDAG->getTargetConstant(Log2, SDLoc(N), MVT::i32);
 }]>;
@@ -3339,8 +3339,24 @@ def FPPow2ToExponentXForm : SDNodeXForm<fpimm, [{
 // immediate where it's preferable to emit a multiply by as an
 // ldexp. We skip over 0.5 to 4.0 as those are inline immediates
 // anyway.
-def fpimm_pow2_prefer_ldexp_f64 : FPImmLeaf<f64, [{
-  int Exp = Imm.getExactLog2();
+def fpimm_pos_pow2_prefer_ldexp_f64 : FPImmLeaf<f64, [{
+  if (Imm.isNegative())
+    return false;
+
+  int Exp = Imm.getExactLog2Abs();
+  // Prefer leaving the FP inline immediates as they are.
+  // 0.5, 1.0, 2.0, 4.0
+
+  // For f64 ldexp is always better than materializing a 64-bit
+  // constant.
+  return Exp != INT_MIN && (Exp < -1 || Exp > 2);
+}], FPPow2ToExponentXForm
+>;
+
+def fpimm_neg_pow2_prefer_ldexp_f64 : FPImmLeaf<f64, [{
+  if (!Imm.isNegative())
+    return false;
+  int Exp = Imm.getExactLog2Abs();
   // Prefer leaving the FP inline immediates as they are.
   // 0.5, 1.0, 2.0, 4.0
 
@@ -3353,17 +3369,32 @@ def fpimm_pow2_prefer_ldexp_f64 : FPImmLeaf<f64, [{
 // f64 is different because we also want to handle cases that may
 // require materialization of the exponent.
 // TODO: If we know f64 ops are fast, prefer add (ldexp x, N), y over fma
-// TODO: fmul x, -2^n -> ldexp(-x, n)
 // TODO: For f32/f16, it's not a clear win on code size to use ldexp
 // in place of mul since we have to use the vop3 form. Are there power
 // savings or some other reason to prefer ldexp over mul?
 def : GCNPat<
   (any_fmul (f64 (VOP3Mods f64:$src0, i32:$src0_mods)),
-            fpimm_pow2_prefer_ldexp_f64:$src1),
+            fpimm_pos_pow2_prefer_ldexp_f64:$src1),
   (V_LDEXP_F64_e64 i32:$src0_mods, VSrc_b64:$src0,
                    0, (S_MOV_B32 (i32 (FPPow2ToExponentXForm $src1))))
 >;
 
+def : GCNPat<
+  (any_fmul f64:$src0, fpimm_neg_pow2_prefer_ldexp_f64:$src1),
+  (V_LDEXP_F64_e64 SRCMODS.NEG, VSrc_b64:$src0,
+                   0, (S_MOV_B32 (i32 (FPPow2ToExponentXForm $src1))))
+>;
+
+// We want to avoid using VOP3Mods which could pull in another fneg
+// which we would need to be re-negated (which should never happen in
+// practice). I don't see a way to apply an SDNodeXForm that accounts
+// for a second operand.
+def : GCNPat<
+  (any_fmul (fabs f64:$src0), fpimm_neg_pow2_prefer_ldexp_f64:$src1),
+  (V_LDEXP_F64_e64 SRCMODS.NEG_ABS, VSrc_b64:$src0,
+                   0, (S_MOV_B32 (i32 (FPPow2ToExponentXForm $src1))))
+>;
 
 class AMDGPUGenericInstruction : GenericInstruction {
   let Namespace = "AMDGPU";
 }
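
Taken together, the two leaves say: any exact power of two outside the inline-immediate range ±[0.5, 4.0] is worth an ldexp. A standalone C++ mirror of that predicate logic (illustration only; the function name is made up, and the negative leaf's return clause is collapsed above, so it is assumed to match the positive one):

```cpp
#include <cmath>

// True if an f64 multiply by K should be selected as ldexp rather than
// kept as a mul with an inline or materialized immediate.
bool prefersLdexpF64(double K) {
  int MantExp;
  // |K| is an exact power of two iff frexp returns a mantissa of exactly 0.5.
  if (K == 0.0 || !std::isfinite(K) ||
      std::frexp(std::fabs(K), &MantExp) != 0.5)
    return false;
  int Exp = MantExp - 1; // frexp: |K| = 0.5 * 2^MantExp = 2^(MantExp - 1)
  // +/-0.5, +/-1.0, +/-2.0, +/-4.0 are inline immediates; leave those as mul.
  return Exp < -1 || Exp > 2;
}
```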
llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmul.mir (29 changes: 14 additions & 15 deletions)
@@ -289,11 +289,9 @@ body: |
     ; GCN: liveins: $vgpr0_vgpr1
     ; GCN-NEXT: {{ $}}
     ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1070596096, implicit $exec
-    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
-    ; GCN-NEXT: [[V_MUL_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_MUL_F64_e64 0, [[COPY]], 0, [[REG_SEQUENCE]], 0, 0, implicit $mode, implicit $exec
-    ; GCN-NEXT: $vgpr0_vgpr1 = COPY [[V_MUL_F64_e64_]]
+    ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4
+    ; GCN-NEXT: [[V_LDEXP_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_LDEXP_F64_e64 1, [[COPY]], 0, [[S_MOV_B32_]], 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr0_vgpr1 = COPY [[V_LDEXP_F64_e64_]]
     ; GCN-NEXT: SI_RETURN implicit $vgpr0_vgpr1
     %0:vgpr(s64) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_FCONSTANT double -16.0
@@ -315,11 +313,9 @@
     ; GCN: liveins: $vgpr0_vgpr1
     ; GCN-NEXT: {{ $}}
     ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1070596096, implicit $exec
-    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
-    ; GCN-NEXT: [[V_MUL_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_MUL_F64_e64 2, [[COPY]], 0, [[REG_SEQUENCE]], 0, 0, implicit $mode, implicit $exec
-    ; GCN-NEXT: $vgpr0_vgpr1 = COPY [[V_MUL_F64_e64_]]
+    ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4
+    ; GCN-NEXT: [[V_LDEXP_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_LDEXP_F64_e64 3, [[COPY]], 0, [[S_MOV_B32_]], 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr0_vgpr1 = COPY [[V_LDEXP_F64_e64_]]
     ; GCN-NEXT: SI_RETURN implicit $vgpr0_vgpr1
     %0:vgpr(s64) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_FABS %0
@@ -342,11 +338,14 @@
     ; GCN: liveins: $vgpr0_vgpr1
     ; GCN-NEXT: {{ $}}
     ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1070596096, implicit $exec
-    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
-    ; GCN-NEXT: [[V_MUL_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_MUL_F64_e64 3, [[COPY]], 0, [[REG_SEQUENCE]], 0, 0, implicit $mode, implicit $exec
-    ; GCN-NEXT: $vgpr0_vgpr1 = COPY [[V_MUL_F64_e64_]]
+    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
+    ; GCN-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_MOV_B32_]], [[COPY1]], implicit $exec
+    ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[V_OR_B32_e64_]], %subreg.sub1
+    ; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4
+    ; GCN-NEXT: [[V_LDEXP_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_LDEXP_F64_e64 1, [[REG_SEQUENCE]], 0, [[S_MOV_B32_1]], 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: $vgpr0_vgpr1 = COPY [[V_LDEXP_F64_e64_]]
     ; GCN-NEXT: SI_RETURN implicit $vgpr0_vgpr1
     %0:vgpr(s64) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_FABS %0
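
In this last case the selector first materializes -|x| by ORing 0x80000000 (2147483648) into the high dword, then lets the ldexp NEG modifier cancel the sign. A host-side sketch of the sign-bit trick (assumes IEEE-754 binary64 doubles; the helper name is made up):

```cpp
#include <cstdint>
#include <cstring>

// What the V_OR_B32 on sub1 computes: setting bit 63 of a binary64
// value yields fneg(fabs(X)) for every input, including NaNs.
double fnegFabs(double X) {
  std::uint64_t Bits;
  std::memcpy(&Bits, &X, sizeof(Bits));
  Bits |= std::uint64_t(1) << 63; // OR 0x80000000 into the high 32 bits
  std::memcpy(&X, &Bits, sizeof(X));
  return X; // == -std::fabs(X)
}
```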
