[RISCV] Undo fneg (fmul x, y) -> fmul x, (fneg y) transform #157388
Conversation
InstCombine will hoist an fneg through an fmul, but not through an fadd/fsub. This prevents us from matching fmsub and fnmadd in some cases. This patch adds a DAG combine to undo the InstCombine transform, which helps some hot loops in 508.namd_r:

```diff
@@ -983,18 +983,15 @@
 fld ft2, 48(a5)
 fld ft3, 64(a5)
 fld ft4, 72(a5)
-fneg.d fa0, fa0
-fneg.d ft0, ft0
-fneg.d ft2, ft2
 fmul.d fa3, ft5, fa3
 fmul.d fa0, fa3, fa0
 fmul.d ft0, fa3, ft0
 fmul.d fa3, fa3, ft2
 fld ft2, 0(s1)
 fmul.d fa4, ft5, fa4
-fmadd.d fa2, fa4, fa2, fa0
-fmadd.d ft6, fa4, ft6, ft0
-fmadd.d fa4, fa4, ft1, fa3
+fmsub.d fa2, fa4, fa2, fa0
+fmsub.d ft6, fa4, ft6, ft0
+fmsub.d fa4, fa4, ft1, fa3
```

This gives a [1.77% improvement in both instruction count and runtime on 508.namd_r](https://lnt.lukelau.me/db_default/v4/nts/profile/1/1022/1021).

This also causes some more fnegs to be sunk past a bitcast to integer, so they're now done as an xor. From glancing at some of the schedules for WriteFSGN, my guess is that this is also profitable.
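For orientation before the full diff: the functional change is a small `ISD::FMUL` combine in RISCVISelLowering.cpp plus an `m_FNeg` matcher in SDPatternMatch.h. A sketch of the matching logic, mirroring the hunk in the patch below (`N` and `DAG` come from the surrounding `PerformDAGCombine` context):

```cpp
// Sketch of the new combine (see the RISCVISelLowering.cpp hunk below).
// If the fmul's second operand is a single-use fneg, pull the fneg out so
// the whole product is negated; an fma whose addend is a negated product
// can then be selected as fmsub/fnmadd
// (fmsub.d rd, rs1, rs2, rs3 computes rs1*rs2 - rs3).
using namespace SDPatternMatch;
SDLoc DL(N);
EVT VT = N->getValueType(0);
SDValue X, Y;
if (sd_match(N, m_FMul(m_Value(X), m_OneUse(m_FNeg(m_Value(Y))))))
  return DAG.getNode(ISD::FNEG, DL, VT,
                     DAG.getNode(ISD::FMUL, DL, VT, X, Y));
```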
@llvm/pr-subscribers-backend-risc-v

Author: Luke Lau (lukel97)
Patch is 26.47 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/157388.diff

5 Files Affected:

- llvm/include/llvm/CodeGen/SDPatternMatch.h
- llvm/lib/Target/RISCV/RISCVISelLowering.cpp
- llvm/test/CodeGen/RISCV/double-arith.ll
- llvm/test/CodeGen/RISCV/float-arith.ll
- llvm/test/CodeGen/RISCV/float-bit-preserving-dagcombines.ll
```diff
diff --git a/llvm/include/llvm/CodeGen/SDPatternMatch.h b/llvm/include/llvm/CodeGen/SDPatternMatch.h
index c468f2f676281..21dec19e3cb9d 100644
--- a/llvm/include/llvm/CodeGen/SDPatternMatch.h
+++ b/llvm/include/llvm/CodeGen/SDPatternMatch.h
@@ -1076,6 +1076,10 @@ template <typename Opnd> inline UnaryOpc_match<Opnd> m_Cttz(const Opnd &Op) {
return UnaryOpc_match<Opnd>(ISD::CTTZ, Op);
}
+template <typename Opnd> inline UnaryOpc_match<Opnd> m_FNeg(const Opnd &Op) {
+ return UnaryOpc_match<Opnd>(ISD::FNEG, Op);
+}
+
// === Constants ===
struct ConstantInt_match {
APInt *BindVal;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index f8ec1be1fd8d6..7dd84a0eee69c 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -20250,6 +20250,17 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
return V;
break;
case ISD::FMUL: {
+ using namespace SDPatternMatch;
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+ SDValue X, Y;
+ // InstCombine canonicalizes fneg (fmul x, y) -> fmul x, (fneg y), see
+ // hoistFNegAboveFMulFDiv.
+ // Undo this and sink the fneg so we match more fmsub/fnmadd patterns.
+ if (sd_match(N, m_FMul(m_Value(X), m_OneUse(m_FNeg(m_Value(Y))))))
+ return DAG.getNode(ISD::FNEG, DL, VT,
+ DAG.getNode(ISD::FMUL, DL, VT, X, Y));
+
// fmul X, (copysign 1.0, Y) -> fsgnjx X, Y
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
@@ -20260,7 +20271,6 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N0->getOperand(0));
if (!C || !C->getValueAPF().isExactlyValue(+1.0))
return SDValue();
- EVT VT = N->getValueType(0);
if (VT.isVector() || !isOperationLegal(ISD::FCOPYSIGN, VT))
return SDValue();
SDValue Sign = N0->getOperand(1);
diff --git a/llvm/test/CodeGen/RISCV/double-arith.ll b/llvm/test/CodeGen/RISCV/double-arith.ll
index d6c4f8d5f350f..911692ec32fb6 100644
--- a/llvm/test/CodeGen/RISCV/double-arith.ll
+++ b/llvm/test/CodeGen/RISCV/double-arith.ll
@@ -610,6 +610,86 @@ define double @fmsub_d(double %a, double %b, double %c) nounwind {
ret double %1
}
+define double @fmsub_d_fmul_fneg(double %a, double %b, double %c, double %d) nounwind {
+; CHECKIFD-LABEL: fmsub_d_fmul_fneg:
+; CHECKIFD: # %bb.0:
+; CHECKIFD-NEXT: fmul.d fa5, fa2, fa3
+; CHECKIFD-NEXT: fmsub.d fa0, fa0, fa1, fa5
+; CHECKIFD-NEXT: ret
+;
+; RV32IZFINXZDINX-LABEL: fmsub_d_fmul_fneg:
+; RV32IZFINXZDINX: # %bb.0:
+; RV32IZFINXZDINX-NEXT: fmul.d a4, a4, a6
+; RV32IZFINXZDINX-NEXT: fmsub.d a0, a0, a2, a4
+; RV32IZFINXZDINX-NEXT: ret
+;
+; RV64IZFINXZDINX-LABEL: fmsub_d_fmul_fneg:
+; RV64IZFINXZDINX: # %bb.0:
+; RV64IZFINXZDINX-NEXT: fmul.d a2, a2, a3
+; RV64IZFINXZDINX-NEXT: fmsub.d a0, a0, a1, a2
+; RV64IZFINXZDINX-NEXT: ret
+;
+; RV32I-LABEL: fmsub_d_fmul_fneg:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -32
+; RV32I-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: mv s0, a3
+; RV32I-NEXT: mv s1, a2
+; RV32I-NEXT: mv s2, a1
+; RV32I-NEXT: mv s3, a0
+; RV32I-NEXT: lui a0, 524288
+; RV32I-NEXT: xor a3, a7, a0
+; RV32I-NEXT: mv a0, a4
+; RV32I-NEXT: mv a1, a5
+; RV32I-NEXT: mv a2, a6
+; RV32I-NEXT: call __muldf3
+; RV32I-NEXT: mv a4, a0
+; RV32I-NEXT: mv a5, a1
+; RV32I-NEXT: mv a0, s3
+; RV32I-NEXT: mv a1, s2
+; RV32I-NEXT: mv a2, s1
+; RV32I-NEXT: mv a3, s0
+; RV32I-NEXT: call fma
+; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 32
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: fmsub_d_fmul_fneg:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -32
+; RV64I-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: mv s0, a1
+; RV64I-NEXT: mv s1, a0
+; RV64I-NEXT: li a0, -1
+; RV64I-NEXT: slli a0, a0, 63
+; RV64I-NEXT: xor a1, a3, a0
+; RV64I-NEXT: mv a0, a2
+; RV64I-NEXT: call __muldf3
+; RV64I-NEXT: mv a2, a0
+; RV64I-NEXT: mv a0, s1
+; RV64I-NEXT: mv a1, s0
+; RV64I-NEXT: call fma
+; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 32
+; RV64I-NEXT: ret
+ %negd = fneg double %d
+ %fmul = fmul double %c, %negd
+ %1 = call double @llvm.fma.f64(double %a, double %b, double %fmul)
+ ret double %1
+}
+
define double @fnmadd_d(double %a, double %b, double %c) nounwind {
; RV32IFD-LABEL: fnmadd_d:
; RV32IFD: # %bb.0:
@@ -877,6 +957,88 @@ define double @fnmadd_d_3(double %a, double %b, double %c) nounwind {
ret double %neg
}
+define double @fnmadd_d_fmul_fneg(double %a, double %b, double %c, double %d) nounwind {
+; CHECKIFD-LABEL: fnmadd_d_fmul_fneg:
+; CHECKIFD: # %bb.0:
+; CHECKIFD-NEXT: fmul.d fa5, fa1, fa0
+; CHECKIFD-NEXT: fmsub.d fa0, fa2, fa3, fa5
+; CHECKIFD-NEXT: ret
+;
+; RV32IZFINXZDINX-LABEL: fnmadd_d_fmul_fneg:
+; RV32IZFINXZDINX: # %bb.0:
+; RV32IZFINXZDINX-NEXT: fmul.d a0, a2, a0
+; RV32IZFINXZDINX-NEXT: fmsub.d a0, a4, a6, a0
+; RV32IZFINXZDINX-NEXT: ret
+;
+; RV64IZFINXZDINX-LABEL: fnmadd_d_fmul_fneg:
+; RV64IZFINXZDINX: # %bb.0:
+; RV64IZFINXZDINX-NEXT: fmul.d a0, a1, a0
+; RV64IZFINXZDINX-NEXT: fmsub.d a0, a2, a3, a0
+; RV64IZFINXZDINX-NEXT: ret
+;
+; RV32I-LABEL: fnmadd_d_fmul_fneg:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -32
+; RV32I-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: mv s0, a7
+; RV32I-NEXT: mv s1, a6
+; RV32I-NEXT: mv s2, a5
+; RV32I-NEXT: mv s3, a4
+; RV32I-NEXT: mv a5, a3
+; RV32I-NEXT: mv a4, a0
+; RV32I-NEXT: lui a3, 524288
+; RV32I-NEXT: xor a3, a1, a3
+; RV32I-NEXT: mv a0, a2
+; RV32I-NEXT: mv a1, a5
+; RV32I-NEXT: mv a2, a4
+; RV32I-NEXT: call __muldf3
+; RV32I-NEXT: mv a4, a0
+; RV32I-NEXT: mv a5, a1
+; RV32I-NEXT: mv a0, s3
+; RV32I-NEXT: mv a1, s2
+; RV32I-NEXT: mv a2, s1
+; RV32I-NEXT: mv a3, s0
+; RV32I-NEXT: call fma
+; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 32
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: fnmadd_d_fmul_fneg:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -32
+; RV64I-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: mv s0, a3
+; RV64I-NEXT: mv s1, a2
+; RV64I-NEXT: mv a2, a1
+; RV64I-NEXT: li a1, -1
+; RV64I-NEXT: slli a1, a1, 63
+; RV64I-NEXT: xor a1, a0, a1
+; RV64I-NEXT: mv a0, a2
+; RV64I-NEXT: call __muldf3
+; RV64I-NEXT: mv a2, a0
+; RV64I-NEXT: mv a0, s1
+; RV64I-NEXT: mv a1, s0
+; RV64I-NEXT: call fma
+; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 32
+; RV64I-NEXT: ret
+ %nega = fneg double %a
+ %mul = fmul double %b, %nega
+ %1 = call double @llvm.fma.f64(double %c, double %d, double %mul)
+ ret double %1
+}
define double @fnmadd_nsz(double %a, double %b, double %c) nounwind {
; CHECKIFD-LABEL: fnmadd_nsz:
diff --git a/llvm/test/CodeGen/RISCV/float-arith.ll b/llvm/test/CodeGen/RISCV/float-arith.ll
index 57b3423da69a6..95f1fc6899206 100644
--- a/llvm/test/CodeGen/RISCV/float-arith.ll
+++ b/llvm/test/CodeGen/RISCV/float-arith.ll
@@ -529,6 +529,89 @@ define float @fmsub_s(float %a, float %b, float %c) nounwind {
ret float %1
}
+define float @fmsub_s_fmul_fneg(float %a, float %b, float %c, float %d) nounwind {
+; CHECKIFD-LABEL: fmsub_d_fmul_fneg:
+; CHECKIFD: # %bb.0:
+; CHECKIFD-NEXT: fneg.d fa5, fa3
+; CHECKIFD-NEXT: fmul.d fa5, fa2, fa5
+; CHECKIFD-NEXT: fmadd.d fa0, fa0, fa1, fa5
+; CHECKIFD-NEXT: ret
+;
+; RV32IZFINXZDINX-LABEL: fmsub_d_fmul_fneg:
+; RV32IZFINXZDINX: # %bb.0:
+; RV32IZFINXZDINX-NEXT: fneg.d a6, a6
+; RV32IZFINXZDINX-NEXT: fmul.d a4, a4, a6
+; RV32IZFINXZDINX-NEXT: fmadd.d a0, a0, a2, a4
+; RV32IZFINXZDINX-NEXT: ret
+;
+; RV64IZFINXZDINX-LABEL: fmsub_d_fmul_fneg:
+; RV64IZFINXZDINX: # %bb.0:
+; RV64IZFINXZDINX-NEXT: fneg.d a3, a3
+; RV64IZFINXZDINX-NEXT: fmul.d a2, a2, a3
+; RV64IZFINXZDINX-NEXT: fmadd.d a0, a0, a1, a2
+; RV64IZFINXZDINX-NEXT: ret
+;
+; CHECKIF-LABEL: fmsub_s_fmul_fneg:
+; CHECKIF: # %bb.0:
+; CHECKIF-NEXT: fmul.s fa5, fa2, fa3
+; CHECKIF-NEXT: fmsub.s fa0, fa0, fa1, fa5
+; CHECKIF-NEXT: ret
+;
+; CHECKIZFINX-LABEL: fmsub_s_fmul_fneg:
+; CHECKIZFINX: # %bb.0:
+; CHECKIZFINX-NEXT: fmul.s a2, a2, a3
+; CHECKIZFINX-NEXT: fmsub.s a0, a0, a1, a2
+; CHECKIZFINX-NEXT: ret
+;
+; RV32I-LABEL: fmsub_s_fmul_fneg:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT: mv s0, a1
+; RV32I-NEXT: mv s1, a0
+; RV32I-NEXT: lui a1, 524288
+; RV32I-NEXT: xor a1, a3, a1
+; RV32I-NEXT: mv a0, a2
+; RV32I-NEXT: call __mulsf3
+; RV32I-NEXT: mv a2, a0
+; RV32I-NEXT: mv a0, s1
+; RV32I-NEXT: mv a1, s0
+; RV32I-NEXT: call fmaf
+; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: fmsub_s_fmul_fneg:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -32
+; RV64I-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: mv s0, a1
+; RV64I-NEXT: mv s1, a0
+; RV64I-NEXT: lui a1, 524288
+; RV64I-NEXT: xor a1, a3, a1
+; RV64I-NEXT: mv a0, a2
+; RV64I-NEXT: call __mulsf3
+; RV64I-NEXT: mv a2, a0
+; RV64I-NEXT: mv a0, s1
+; RV64I-NEXT: mv a1, s0
+; RV64I-NEXT: call fmaf
+; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 32
+; RV64I-NEXT: ret
+ %negd = fneg float %d
+ %fmul = fmul float %c, %negd
+ %1 = call float @llvm.fma.f32(float %a, float %b, float %fmul)
+ ret float %1
+}
+
define float @fnmadd_s(float %a, float %b, float %c) nounwind {
; CHECKIF-LABEL: fnmadd_s:
; CHECKIF: # %bb.0:
@@ -738,6 +821,91 @@ define float @fnmadd_s_3(float %a, float %b, float %c) nounwind {
ret float %neg
}
+define float @fnmadd_s_fmul_fneg(float %a, float %b, float %c, float %d) nounwind {
+; CHECKIFD-LABEL: fnmadd_d_fmul_fneg:
+; CHECKIFD: # %bb.0:
+; CHECKIFD-NEXT: fneg.d fa5, fa0
+; CHECKIFD-NEXT: fmul.d fa5, fa1, fa5
+; CHECKIFD-NEXT: fmadd.d fa0, fa2, fa3, fa5
+; CHECKIFD-NEXT: ret
+;
+; RV32IZFINXZDINX-LABEL: fnmadd_d_fmul_fneg:
+; RV32IZFINXZDINX: # %bb.0:
+; RV32IZFINXZDINX-NEXT: fneg.d a0, a0
+; RV32IZFINXZDINX-NEXT: fmul.d a0, a2, a0
+; RV32IZFINXZDINX-NEXT: fmadd.d a0, a4, a6, a0
+; RV32IZFINXZDINX-NEXT: ret
+;
+; RV64IZFINXZDINX-LABEL: fnmadd_d_fmul_fneg:
+; RV64IZFINXZDINX: # %bb.0:
+; RV64IZFINXZDINX-NEXT: fneg.d a0, a0
+; RV64IZFINXZDINX-NEXT: fmul.d a0, a1, a0
+; RV64IZFINXZDINX-NEXT: fmadd.d a0, a2, a3, a0
+; RV64IZFINXZDINX-NEXT: ret
+;
+; CHECKIF-LABEL: fnmadd_s_fmul_fneg:
+; CHECKIF: # %bb.0:
+; CHECKIF-NEXT: fmul.s fa5, fa1, fa0
+; CHECKIF-NEXT: fmsub.s fa0, fa2, fa3, fa5
+; CHECKIF-NEXT: ret
+;
+; CHECKIZFINX-LABEL: fnmadd_s_fmul_fneg:
+; CHECKIZFINX: # %bb.0:
+; CHECKIZFINX-NEXT: fmul.s a0, a1, a0
+; CHECKIZFINX-NEXT: fmsub.s a0, a2, a3, a0
+; CHECKIZFINX-NEXT: ret
+;
+; RV32I-LABEL: fnmadd_s_fmul_fneg:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT: mv s0, a3
+; RV32I-NEXT: mv s1, a2
+; RV32I-NEXT: mv a2, a1
+; RV32I-NEXT: lui a1, 524288
+; RV32I-NEXT: xor a1, a0, a1
+; RV32I-NEXT: mv a0, a2
+; RV32I-NEXT: call __mulsf3
+; RV32I-NEXT: mv a2, a0
+; RV32I-NEXT: mv a0, s1
+; RV32I-NEXT: mv a1, s0
+; RV32I-NEXT: call fmaf
+; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: fnmadd_s_fmul_fneg:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -32
+; RV64I-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: mv s0, a3
+; RV64I-NEXT: mv s1, a2
+; RV64I-NEXT: mv a2, a1
+; RV64I-NEXT: lui a1, 524288
+; RV64I-NEXT: xor a1, a0, a1
+; RV64I-NEXT: mv a0, a2
+; RV64I-NEXT: call __mulsf3
+; RV64I-NEXT: mv a2, a0
+; RV64I-NEXT: mv a0, s1
+; RV64I-NEXT: mv a1, s0
+; RV64I-NEXT: call fmaf
+; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 32
+; RV64I-NEXT: ret
+ %nega = fneg float %a
+ %mul = fmul float %b, %nega
+ %1 = call float @llvm.fma.f32(float %c, float %d, float %mul)
+ ret float %1
+}
+
define float @fnmadd_nsz(float %a, float %b, float %c) nounwind {
; RV32IF-LABEL: fnmadd_nsz:
; RV32IF: # %bb.0:
diff --git a/llvm/test/CodeGen/RISCV/float-bit-preserving-dagcombines.ll b/llvm/test/CodeGen/RISCV/float-bit-preserving-dagcombines.ll
index 6aa6dedba548f..acf533a6c3e27 100644
--- a/llvm/test/CodeGen/RISCV/float-bit-preserving-dagcombines.ll
+++ b/llvm/test/CodeGen/RISCV/float-bit-preserving-dagcombines.ll
@@ -197,16 +197,17 @@ define float @bitcast_xor(float %a1, float %a2) nounwind {
; RV32F-NEXT: fmv.w.x fa5, a1
; RV32F-NEXT: fmv.w.x fa4, a0
; RV32F-NEXT: fmul.s fa5, fa4, fa5
-; RV32F-NEXT: fneg.s fa5, fa5
; RV32F-NEXT: fmul.s fa5, fa4, fa5
; RV32F-NEXT: fmv.x.w a0, fa5
+; RV32F-NEXT: lui a1, 524288
+; RV32F-NEXT: xor a0, a0, a1
; RV32F-NEXT: ret
;
; RV32ZFINX-LABEL: bitcast_xor:
; RV32ZFINX: # %bb.0:
; RV32ZFINX-NEXT: fmul.s a1, a0, a1
-; RV32ZFINX-NEXT: fneg.s a1, a1
; RV32ZFINX-NEXT: fmul.s a0, a0, a1
+; RV32ZFINX-NEXT: fneg.s a0, a0
; RV32ZFINX-NEXT: ret
;
; RV32FD-LABEL: bitcast_xor:
@@ -214,9 +215,10 @@ define float @bitcast_xor(float %a1, float %a2) nounwind {
; RV32FD-NEXT: fmv.w.x fa5, a1
; RV32FD-NEXT: fmv.w.x fa4, a0
; RV32FD-NEXT: fmul.s fa5, fa4, fa5
-; RV32FD-NEXT: fneg.s fa5, fa5
; RV32FD-NEXT: fmul.s fa5, fa4, fa5
; RV32FD-NEXT: fmv.x.w a0, fa5
+; RV32FD-NEXT: lui a1, 524288
+; RV32FD-NEXT: xor a0, a0, a1
; RV32FD-NEXT: ret
;
; RV64F-LABEL: bitcast_xor:
@@ -224,16 +226,17 @@ define float @bitcast_xor(float %a1, float %a2) nounwind {
; RV64F-NEXT: fmv.w.x fa5, a1
; RV64F-NEXT: fmv.w.x fa4, a0
; RV64F-NEXT: fmul.s fa5, fa4, fa5
-; RV64F-NEXT: fneg.s fa5, fa5
; RV64F-NEXT: fmul.s fa5, fa4, fa5
; RV64F-NEXT: fmv.x.w a0, fa5
+; RV64F-NEXT: lui a1, 524288
+; RV64F-NEXT: xor a0, a0, a1
; RV64F-NEXT: ret
;
; RV64ZFINX-LABEL: bitcast_xor:
; RV64ZFINX: # %bb.0:
; RV64ZFINX-NEXT: fmul.s a1, a0, a1
-; RV64ZFINX-NEXT: fneg.s a1, a1
; RV64ZFINX-NEXT: fmul.s a0, a0, a1
+; RV64ZFINX-NEXT: fneg.s a0, a0
; RV64ZFINX-NEXT: ret
;
; RV64FD-LABEL: bitcast_xor:
@@ -241,9 +244,10 @@ define float @bitcast_xor(float %a1, float %a2) nounwind {
; RV64FD-NEXT: fmv.w.x fa5, a1
; RV64FD-NEXT: fmv.w.x fa4, a0
; RV64FD-NEXT: fmul.s fa5, fa4, fa5
-; RV64FD-NEXT: fneg.s fa5, fa5
; RV64FD-NEXT: fmul.s fa5, fa4, fa5
; RV64FD-NEXT: fmv.x.w a0, fa5
+; RV64FD-NEXT: lui a1, 524288
+; RV64FD-NEXT: xor a0, a0, a1
; RV64FD-NEXT: ret
%a3 = fmul float %a1, %a2
%bc1 = bitcast float %a3 to i32
@@ -264,11 +268,12 @@ define double @bitcast_double_xor(double %a1, double %a2) nounwind {
; RV32F-NEXT: mv s1, a0
; RV32F-NEXT: call __muldf3
; RV32F-NEXT: mv a2, a0
-; RV32F-NEXT: lui a3, 524288
-; RV32F-NEXT: xor a3, a1, a3
+; RV32F-NEXT: mv a3, a1
; RV32F-NEXT: mv a0, s1
; RV32F-NEXT: mv a1, s0
; RV32F-NEXT: call __muldf3
+; RV32F-NEXT: lui a2, 524288
+; RV32F-NEXT: xor a1, a1, a2
; RV32F-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32F-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32F-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
@@ -285,11 +290,12 @@ define double @bitcast_double_xor(double %a1, double %a2) nounwind {
; RV32ZFINX-NEXT: mv s1, a0
; RV32ZFINX-NEXT: call __muldf3
; RV32ZFINX-NEXT: mv a2, a0
-; RV32ZFINX-NEXT: lui a3, 524288
-; RV32ZFINX-NEXT: xor a3, a1, a3
+; RV32ZFINX-NEXT: mv a3, a1
; RV32ZFINX-NEXT: mv a0, s1
; RV32ZFINX-NEXT: mv a1, s0
; RV32ZFINX-NEXT: call __muldf3
+; RV32ZFINX-NEXT: lui a2, 524288
+; RV32ZFINX-NEXT: xor a1, a1, a2
; RV32ZFINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32ZFINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32ZFINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
@@ -306,11 +312,12 @@ define double @bitcast_double_xor(double %a1, double %a2) nounwind {
; RV32FD-NEXT: sw a1, 12(sp)
; RV32FD-NEXT: fld fa4, 8(sp)
; RV32FD-NEXT: fmul.d fa5, fa4, fa5
-; RV32FD-NEXT: fneg.d fa5, fa5
; RV32FD-NEXT: fmul.d fa5, fa4, fa5
; RV32FD-NEXT: fsd fa5, 8(sp)
-; RV32FD-NEXT: lw a0, 8(sp)
; RV32FD-NEXT: lw a1, 12(sp)
+; RV32FD-NEXT: lw a0, 8(sp)
+; RV32FD-NEXT: lui a2, 524288
+; RV32FD-NEXT: xor a1, a1, a2
; RV32FD-NEXT: addi sp, sp, 16
; RV32FD-NEXT: ret
;
@@ -321,11 +328,12 @@ define double @bitcast_double_xor(double %a1, double %a2) nounwind {
; RV64F-NEXT: sd s0, 0(sp) # 8-byte Folded Spill
; RV64F-NEXT: mv s0, a0
; RV64F-NEXT: call __muldf3
-; RV64F-NEXT: li a1, -1
-; RV64F-NEXT: slli a1, a1, 63
-; RV64F-NEXT: xor a1, a0, a1
+; RV64F-NEXT: mv a1, a0
; RV64F-NEXT: mv a0, s0
; RV64F-NEXT: call __muldf3
+; RV64F-NEXT: li a1, -1
+; RV64F-NEXT: slli a1, a1, 63
+; RV64F-NEXT: xor a0, a0, a1
; RV64F-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
; RV64F-NEXT: ld s0, 0(sp) # 8-byte Folded Reload
; RV64F-NEXT: addi sp, sp, 16
@@ -338,11 +346,12 @@ define double @bitcast_double_xor(double %a1, double %a2) nounwind {
; RV64ZFINX-NEXT: sd s0, 0(sp) # 8-byte Folded Spill
; RV64ZFINX-NEXT: mv s0, a0
; RV64ZFINX-NEXT: call __muldf3
-; RV64ZFINX-NEXT: li a1, -1
-; RV64ZFINX-NEXT: slli a1, a1, 63
-; RV64ZFINX-NEXT: xor a1, a0, a1
+; RV64ZFINX-NEXT: mv a1, a0
; RV64ZFINX-NEXT: mv a0, s0
; RV64ZFINX-NEXT: call __muldf3
+; RV64ZFINX-NEXT: li a1, -1
+; RV64ZFINX-NEXT: slli a1, a1, 63
+; RV64ZFINX-NEXT: xor a0, a0, a1
; RV64ZFINX-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
; RV64ZFINX-NEXT: ld s0, 0(sp) # 8-byte Folded Reload
; RV64ZFINX-NEXT: addi sp, sp, 16
@@ -352,10 +...
```

[truncated]
I'm pretty sure this requires the nsz fast math flag.
I take that back. That might be for the full (fneg (fma)) case.
Why is that, if the transform is for a single use of fneg in fmul?
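For the single-use fneg-through-fmul case the fold should be exact: IEEE-754 multiplication gives the product a sign equal to the XOR of the operand signs, so `x * (-y)` and `-(x * y)` agree even for signed zeros, which is why nsz shouldn't be needed here (the earlier concern would apply to negating a whole fma instead). A minimal standalone check, not part of the patch, assuming the default round-to-nearest environment and ignoring NaN payloads:

```cpp
#include <cmath>
#include <cstdio>

// Compare fmul x, (fneg y) against fneg (fmul x, y) for sample values,
// including signed zeros; signbit() catches -0.0 vs +0.0, which
// operator== would treat as equal.
int main() {
  const double vals[] = {0.0, -0.0, 1.0, -1.0, 1.5, -2.25};
  for (double x : vals)
    for (double y : vals) {
      double sunk = x * -y;      // fmul x, (fneg y)
      double hoisted = -(x * y); // fneg (fmul x, y)
      if (sunk != hoisted || std::signbit(sunk) != std::signbit(hoisted))
        std::printf("mismatch for x=%g, y=%g\n", x, y);
    }
  std::printf("checked all pairs\n");
  return 0;
}
```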
It should be, especially with BINVI.
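For context on the xor lowering: once the value has been bitcast to an integer register, fneg is just a flip of the IEEE-754 sign bit, which is the `lui` + `xor` pair (or, with Zbs, a single `binvi`) seen in the updated bitcast tests. A hypothetical standalone sketch, not taken from the patch (requires C++20 for `std::bit_cast`):

```cpp
#include <bit>
#include <cstdint>
#include <cstdio>

// Flip the sign bit of a float through its integer representation; on
// RV32 the 0x80000000 mask is materialized with lui 524288, and Zbs can
// flip the bit directly with binvi.
float fneg_via_xor(float x) {
  std::uint32_t bits = std::bit_cast<std::uint32_t>(x);
  bits ^= 0x80000000u; // sign bit of an IEEE-754 binary32
  return std::bit_cast<float>(bits);
}

int main() {
  std::printf("%g %g\n", fneg_via_xor(1.5f), fneg_via_xor(-0.0f));
  return 0;
}
```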
We go from
LGTM