-
Notifications
You must be signed in to change notification settings - Fork 12k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[DAG] Lower frem of power-2 using div/trunc/mul+sub #91148
Conversation
@llvm/pr-subscribers-backend-arm @llvm/pr-subscribers-llvm-selectiondag Author: David Green (davemgreen) ChangesIf we are lowering a frem and the divisor is known to me an integer power-2, we Unfortunately Alive2 does not handle this well at the moment. I was using I found this in cpythons implementation of float_pow. I currently added it as a Full diff: https://github.com/llvm/llvm-project/pull/91148.diff 2 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index fe932ca68c1288..acccde1611e5ef 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -17,6 +17,7 @@
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/APSInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/IntervalMap.h"
@@ -17261,6 +17262,25 @@ SDValue DAGCombiner::visitFREM(SDNode *N) {
if (SDValue NewSel = foldBinOpIntoSelect(N))
return NewSel;
+ if (ConstantFPSDNode *C1 = isConstOrConstSplatFP(N1, true)) {
+ bool IsExact;
+ APSInt C1I(64, 0);
+ if (C1->getValueAPF().isInteger() && !C1->getValueAPF().isNegative() &&
+ C1->getValueAPF().convertToInteger(C1I, APFloat::rmTowardZero,
+ &IsExact) == APFloat::opOK &&
+ IsExact && isPowerOf2_64(C1I.getSExtValue()) &&
+ (Flags.hasNoSignedZeros() || N0.getOpcode() == ISD::FABS) &&
+ !TLI.isOperationLegal(ISD::FREM, VT) &&
+ TLI.isOperationLegalOrCustom(ISD::FMUL, VT) &&
+ TLI.isOperationLegalOrCustom(ISD::FDIV, VT) &&
+ TLI.isOperationLegalOrCustom(ISD::FTRUNC, VT)) {
+ SDValue Div = DAG.getNode(ISD::FDIV, SDLoc(N), VT, N0, N1);
+ SDValue Rnd = DAG.getNode(ISD::FTRUNC, SDLoc(N), VT, Div);
+ SDValue Mul = DAG.getNode(ISD::FMUL, SDLoc(N), VT, Rnd, N1);
+ return DAG.getNode(ISD::FSUB, SDLoc(N), VT, N0, Mul);
+ }
+ }
+
return SDValue();
}
diff --git a/llvm/test/CodeGen/AArch64/frem-power2.ll b/llvm/test/CodeGen/AArch64/frem-power2.ll
new file mode 100644
index 00000000000000..01eff2e921024c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/frem-power2.ll
@@ -0,0 +1,383 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+
+define float @frem2(float %x) {
+; CHECK-LABEL: frem2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov s1, #2.00000000
+; CHECK-NEXT: b fmodf
+entry:
+ %fmod = frem float %x, 2.0
+ ret float %fmod
+}
+
+define float @frem2_nsz(float %x) {
+; CHECK-SD-LABEL: frem2_nsz:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fmov s1, #2.00000000
+; CHECK-SD-NEXT: fdiv s1, s0, s1
+; CHECK-SD-NEXT: frintz s1, s1
+; CHECK-SD-NEXT: fadd s1, s1, s1
+; CHECK-SD-NEXT: fsub s0, s0, s1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: frem2_nsz:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: fmov s1, #2.00000000
+; CHECK-GI-NEXT: b fmodf
+entry:
+ %fmod = frem nsz float %x, 2.0
+ ret float %fmod
+}
+
+define float @frem2_abs(float %x) {
+; CHECK-SD-LABEL: frem2_abs:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fabs s0, s0
+; CHECK-SD-NEXT: fmov s1, #2.00000000
+; CHECK-SD-NEXT: fdiv s1, s0, s1
+; CHECK-SD-NEXT: frintz s1, s1
+; CHECK-SD-NEXT: fadd s1, s1, s1
+; CHECK-SD-NEXT: fsub s0, s0, s1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: frem2_abs:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: fabs s0, s0
+; CHECK-GI-NEXT: fmov s1, #2.00000000
+; CHECK-GI-NEXT: b fmodf
+entry:
+ %a = tail call float @llvm.fabs.f32(float %x)
+ %fmod = frem float %a, 2.0
+ ret float %fmod
+}
+
+define half @hrem2_nsz(half %x) {
+; CHECK-SD-LABEL: hrem2_nsz:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fmov h1, #2.00000000
+; CHECK-SD-NEXT: fdiv h1, h0, h1
+; CHECK-SD-NEXT: frintz h1, h1
+; CHECK-SD-NEXT: fadd h1, h1, h1
+; CHECK-SD-NEXT: fsub h0, h0, h1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: hrem2_nsz:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT: .cfi_offset w30, -16
+; CHECK-GI-NEXT: fmov h1, #2.00000000
+; CHECK-GI-NEXT: fcvt s0, h0
+; CHECK-GI-NEXT: fcvt s1, h1
+; CHECK-GI-NEXT: bl fmodf
+; CHECK-GI-NEXT: fcvt h0, s0
+; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-GI-NEXT: ret
+entry:
+ %fmod = frem nsz half %x, 2.0
+ ret half %fmod
+}
+
+define double @drem2_nsz(double %x) {
+; CHECK-SD-LABEL: drem2_nsz:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fmov d1, #2.00000000
+; CHECK-SD-NEXT: fdiv d1, d0, d1
+; CHECK-SD-NEXT: frintz d1, d1
+; CHECK-SD-NEXT: fadd d1, d1, d1
+; CHECK-SD-NEXT: fsub d0, d0, d1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: drem2_nsz:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: fmov d1, #2.00000000
+; CHECK-GI-NEXT: b fmod
+entry:
+ %fmod = frem nsz double %x, 2.0
+ ret double %fmod
+}
+
+define float @frem3_nsz(float %x) {
+; CHECK-LABEL: frem3_nsz:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov s1, #3.00000000
+; CHECK-NEXT: b fmodf
+entry:
+ %fmod = frem nsz float %x, 3.0
+ ret float %fmod
+}
+
+define float @frem05_nsz(float %x) {
+; CHECK-LABEL: frem05_nsz:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov s1, #0.50000000
+; CHECK-NEXT: b fmodf
+entry:
+ %fmod = frem nsz float %x, 0.5
+ ret float %fmod
+}
+
+define float @fremm2_nsz(float %x) {
+; CHECK-LABEL: fremm2_nsz:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov s1, #-2.00000000
+; CHECK-NEXT: b fmodf
+entry:
+ %fmod = frem nsz float %x, -2.0
+ ret float %fmod
+}
+
+define float @frem4_abs(float %x) {
+; CHECK-SD-LABEL: frem4_abs:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fabs s0, s0
+; CHECK-SD-NEXT: fmov s1, #4.00000000
+; CHECK-SD-NEXT: fdiv s2, s0, s1
+; CHECK-SD-NEXT: frintz s2, s2
+; CHECK-SD-NEXT: fmul s1, s2, s1
+; CHECK-SD-NEXT: fsub s0, s0, s1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: frem4_abs:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: fabs s0, s0
+; CHECK-GI-NEXT: fmov s1, #4.00000000
+; CHECK-GI-NEXT: b fmodf
+entry:
+ %a = tail call float @llvm.fabs.f32(float %x)
+ %fmod = frem float %a, 4.0
+ ret float %fmod
+}
+
+define float @frem16_abs(float %x) {
+; CHECK-SD-LABEL: frem16_abs:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fabs s0, s0
+; CHECK-SD-NEXT: fmov s1, #16.00000000
+; CHECK-SD-NEXT: fdiv s2, s0, s1
+; CHECK-SD-NEXT: frintz s2, s2
+; CHECK-SD-NEXT: fmul s1, s2, s1
+; CHECK-SD-NEXT: fsub s0, s0, s1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: frem16_abs:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: fabs s0, s0
+; CHECK-GI-NEXT: fmov s1, #16.00000000
+; CHECK-GI-NEXT: b fmodf
+entry:
+ %a = tail call float @llvm.fabs.f32(float %x)
+ %fmod = frem float %a, 16.0
+ ret float %fmod
+}
+
+define float @frem4294967296_abs(float %x) {
+; CHECK-SD-LABEL: frem4294967296_abs:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fabs s0, s0
+; CHECK-SD-NEXT: mov w8, #1333788672 // =0x4f800000
+; CHECK-SD-NEXT: fmov s1, w8
+; CHECK-SD-NEXT: fdiv s2, s0, s1
+; CHECK-SD-NEXT: frintz s2, s2
+; CHECK-SD-NEXT: fmul s1, s2, s1
+; CHECK-SD-NEXT: fsub s0, s0, s1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: frem4294967296_abs:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: fabs s0, s0
+; CHECK-GI-NEXT: mov w8, #1333788672 // =0x4f800000
+; CHECK-GI-NEXT: fmov s1, w8
+; CHECK-GI-NEXT: b fmodf
+entry:
+ %a = tail call float @llvm.fabs.f32(float %x)
+ %fmod = frem float %a, 4294967296.0
+ ret float %fmod
+}
+
+define float @frem1152921504606846976_abs(float %x) {
+; CHECK-SD-LABEL: frem1152921504606846976_abs:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fabs s0, s0
+; CHECK-SD-NEXT: mov w8, #1568669696 // =0x5d800000
+; CHECK-SD-NEXT: fmov s1, w8
+; CHECK-SD-NEXT: fdiv s2, s0, s1
+; CHECK-SD-NEXT: frintz s2, s2
+; CHECK-SD-NEXT: fmul s1, s2, s1
+; CHECK-SD-NEXT: fsub s0, s0, s1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: frem1152921504606846976_abs:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: fabs s0, s0
+; CHECK-GI-NEXT: mov w8, #1568669696 // =0x5d800000
+; CHECK-GI-NEXT: fmov s1, w8
+; CHECK-GI-NEXT: b fmodf
+entry:
+ %a = tail call float @llvm.fabs.f32(float %x)
+ %fmod = frem float %a, 1152921504606846976.0
+ ret float %fmod
+}
+
+define float @frem4611686018427387904_abs(float %x) {
+; CHECK-SD-LABEL: frem4611686018427387904_abs:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fabs s0, s0
+; CHECK-SD-NEXT: mov w8, #1585446912 // =0x5e800000
+; CHECK-SD-NEXT: fmov s1, w8
+; CHECK-SD-NEXT: fdiv s2, s0, s1
+; CHECK-SD-NEXT: frintz s2, s2
+; CHECK-SD-NEXT: fmul s1, s2, s1
+; CHECK-SD-NEXT: fsub s0, s0, s1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: frem4611686018427387904_abs:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: fabs s0, s0
+; CHECK-GI-NEXT: mov w8, #1585446912 // =0x5e800000
+; CHECK-GI-NEXT: fmov s1, w8
+; CHECK-GI-NEXT: b fmodf
+entry:
+ %a = tail call float @llvm.fabs.f32(float %x)
+ %fmod = frem float %a, 4611686018427387904.0
+ ret float %fmod
+}
+
+define float @frem9223372036854775808_abs(float %x) {
+; CHECK-LABEL: frem9223372036854775808_abs:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fabs s0, s0
+; CHECK-NEXT: movi v1.2s, #95, lsl #24
+; CHECK-NEXT: b fmodf
+entry:
+ %a = tail call float @llvm.fabs.f32(float %x)
+ %fmod = frem float %a, 9223372036854775808.0
+ ret float %fmod
+}
+
+define <4 x float> @frem2_nsz_vec(<4 x float> %x) {
+; CHECK-SD-LABEL: frem2_nsz_vec:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: movi v1.4s, #64, lsl #24
+; CHECK-SD-NEXT: fdiv v1.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: frintz v1.4s, v1.4s
+; CHECK-SD-NEXT: fadd v1.4s, v1.4s, v1.4s
+; CHECK-SD-NEXT: fsub v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: frem2_nsz_vec:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: sub sp, sp, #80
+; CHECK-GI-NEXT: str d10, [sp, #48] // 8-byte Folded Spill
+; CHECK-GI-NEXT: stp d9, d8, [sp, #56] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str x30, [sp, #72] // 8-byte Folded Spill
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 80
+; CHECK-GI-NEXT: .cfi_offset w30, -8
+; CHECK-GI-NEXT: .cfi_offset b8, -16
+; CHECK-GI-NEXT: .cfi_offset b9, -24
+; CHECK-GI-NEXT: .cfi_offset b10, -32
+; CHECK-GI-NEXT: fmov s1, #2.00000000
+; CHECK-GI-NEXT: mov s8, v0.s[1]
+; CHECK-GI-NEXT: mov s9, v0.s[2]
+; CHECK-GI-NEXT: mov s10, v0.s[3]
+; CHECK-GI-NEXT: // kill: def $s0 killed $s0 killed $q0
+; CHECK-GI-NEXT: bl fmodf
+; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0
+; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT: fmov s1, #2.00000000
+; CHECK-GI-NEXT: fmov s0, s8
+; CHECK-GI-NEXT: bl fmodf
+; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0
+; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT: fmov s1, #2.00000000
+; CHECK-GI-NEXT: fmov s0, s9
+; CHECK-GI-NEXT: bl fmodf
+; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0
+; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NEXT: fmov s1, #2.00000000
+; CHECK-GI-NEXT: fmov s0, s10
+; CHECK-GI-NEXT: bl fmodf
+; CHECK-GI-NEXT: ldp q2, q1, [sp, #16] // 32-byte Folded Reload
+; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0
+; CHECK-GI-NEXT: ldr x30, [sp, #72] // 8-byte Folded Reload
+; CHECK-GI-NEXT: ldp d9, d8, [sp, #56] // 16-byte Folded Reload
+; CHECK-GI-NEXT: ldr d10, [sp, #48] // 8-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.s[1], v2.s[0]
+; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.s[2], v2.s[0]
+; CHECK-GI-NEXT: mov v1.s[3], v0.s[0]
+; CHECK-GI-NEXT: mov v0.16b, v1.16b
+; CHECK-GI-NEXT: add sp, sp, #80
+; CHECK-GI-NEXT: ret
+entry:
+ %fmod = frem nsz <4 x float> %x, <float 2.0, float 2.0, float 2.0, float 2.0>
+ ret <4 x float> %fmod
+}
+
+define <4 x float> @frem1152921504606846976_absv(<4 x float> %x) {
+; CHECK-SD-LABEL: frem1152921504606846976_absv:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: mov w8, #1568669696 // =0x5d800000
+; CHECK-SD-NEXT: fabs v0.4s, v0.4s
+; CHECK-SD-NEXT: dup v1.4s, w8
+; CHECK-SD-NEXT: fdiv v2.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: frintz v2.4s, v2.4s
+; CHECK-SD-NEXT: fmul v1.4s, v2.4s, v1.4s
+; CHECK-SD-NEXT: fsub v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: frem1152921504606846976_absv:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: sub sp, sp, #96
+; CHECK-GI-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str x30, [sp, #80] // 8-byte Folded Spill
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 96
+; CHECK-GI-NEXT: .cfi_offset w30, -16
+; CHECK-GI-NEXT: .cfi_offset b8, -24
+; CHECK-GI-NEXT: .cfi_offset b9, -32
+; CHECK-GI-NEXT: .cfi_offset b10, -40
+; CHECK-GI-NEXT: .cfi_offset b11, -48
+; CHECK-GI-NEXT: mov w8, #1568669696 // =0x5d800000
+; CHECK-GI-NEXT: fabs v0.4s, v0.4s
+; CHECK-GI-NEXT: fmov s11, w8
+; CHECK-GI-NEXT: fmov s1, s11
+; CHECK-GI-NEXT: mov s8, v0.s[1]
+; CHECK-GI-NEXT: mov s9, v0.s[2]
+; CHECK-GI-NEXT: mov s10, v0.s[3]
+; CHECK-GI-NEXT: // kill: def $s0 killed $s0 killed $q0
+; CHECK-GI-NEXT: bl fmodf
+; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0
+; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT: fmov s1, s11
+; CHECK-GI-NEXT: fmov s0, s8
+; CHECK-GI-NEXT: bl fmodf
+; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0
+; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT: fmov s1, s11
+; CHECK-GI-NEXT: fmov s0, s9
+; CHECK-GI-NEXT: bl fmodf
+; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0
+; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NEXT: fmov s1, s11
+; CHECK-GI-NEXT: fmov s0, s10
+; CHECK-GI-NEXT: bl fmodf
+; CHECK-GI-NEXT: ldp q2, q1, [sp, #16] // 32-byte Folded Reload
+; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0
+; CHECK-GI-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
+; CHECK-GI-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-GI-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.s[1], v2.s[0]
+; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.s[2], v2.s[0]
+; CHECK-GI-NEXT: mov v1.s[3], v0.s[0]
+; CHECK-GI-NEXT: mov v0.16b, v1.16b
+; CHECK-GI-NEXT: add sp, sp, #96
+; CHECK-GI-NEXT: ret
+entry:
+ %a = tail call <4 x float> @llvm.fabs.v4f32(<4 x float> %x)
+ %fmod = frem <4 x float> %a, <float 1152921504606846976.0, float 1152921504606846976.0, float 1152921504606846976.0, float 1152921504606846976.0>
+ ret <4 x float> %fmod
+}
|
This expansion should also work for negative power-of-two divisors like -4.0. This expansion should also work for fractional power-of-two divisors like +/ 0.25, unless there is some concern about the division overflowing? |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This expansion should also work for negative power-of-two divisors like -4.0.
This expansion should also work for fractional power-of-two divisors like +/ 0.25, unless there is some concern about the division overflowing?
Thanks for the suggestions. Negative power-2 sounds good -- I'll add those in. Values smaller than 1 unfortunately make it round to infinity in cases where the frem would be valid.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Basically LGTM, except should try to emit FMA if possible
@@ -2111,6 +2115,10 @@ class SelectionDAG { | |||
/// Test whether the given SDValue is known to contain non-zero value(s). | |||
bool isKnownNeverZero(SDValue Op, unsigned Depth = 0) const; | |||
|
|||
/// Test whether the given float value is known to not be negative. 0.0 is | |||
/// considered non-negative, -0.0 is considered negative. | |||
bool isKnownNonNegativeFP(SDValue Op) const; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Someday we'll have to reinvent computeKnownFPClass in the DAG... (maybe we could do this in CGP to avoid that)
@@ -2111,6 +2115,10 @@ class SelectionDAG { | |||
/// Test whether the given SDValue is known to contain non-zero value(s). | |||
bool isKnownNeverZero(SDValue Op, unsigned Depth = 0) const; | |||
|
|||
/// Test whether the given float value is known to be positive. +0.0, +inf and | |||
/// +nan are considered positive, -0.0, -inf and -nan are not. | |||
bool isKnownPositiveOrNaNFP(SDValue Op) const; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Best to follow the ValueTracking naming and call this cannotBeOrderedLessThanZero. Makes the comment off for nan/zero
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Right, you can only call it that if you're happy with the semantics change too, which is that it would return true for -0.0.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I feel like cannotBeOrderedLessThanZero is a slightly different concept. I might be misunderstanding, but at current this function is more about whether the number is bitwise negative or not, with it primarily dealing with fabs. -0.0 should be false for what we need for frem.
(Which, I might be able to add a copysign for too, but I will wait for another patch on that).
What do you think of sticking with the current name, and when a computeKnownFPClass is added for DAG we can switch it over to use the new functions?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
bitwise negative
Not quite, sign bit of nan is meaningless. How about cannotBeOrderedNegativeFP?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I see, The IEEE 754 standard specifies that the sign bit of a NaN has no significance. The Arm architecture is obviously more opinionated about what the correct answer is, and I have been testing bitwise.
That probably helps with copysign too. I can change it to the new name.
If we are lowering a frem and the divisor is known to me an integer power-2, we can use the formula 'frem = x - trunc(x / d) * d'. This avoids the more expensive call to fmod. The results are identical as fmod so long as d is a power-2 (so the mul does not round incorrectly), and the sign of the return is either always positive or signed-zero not important (nsz). Unfortunately Alive2 does not handle this well at the moment. I was using exhaustive checking to test this, hopefully I didn't make a mistake in it (https://gist.github.com/davemgreen/6078015f30d3bacd1e9572f8db5d4b64). I found this in cpythons implementation of float_pow. I currently added it as a DAG combine for frem with power-2 fp constants, with some extra utility functions for checking if a floating-point value is known non-negative or a integer power-2.
6e9c199
to
d5013f1
Compare
d5013f1
to
b2cbe5c
Compare
As a small addition to llvm#91148, this uses copysign to produce the correct sign for zero when converting frem to div/trunc/mul when we do not know that the input is positive (and we care about sign bits). The copysign lets us get the sign of zero correct. In testing, the only case this produced different results thant fmod was: frem -inf, 4.0 -> nan vs -nan
; CHECK-SD-LABEL: frem2_nsz: | ||
; CHECK-SD: // %bb.0: // %entry | ||
; CHECK-SD-NEXT: fmov s1, #2.00000000 | ||
; CHECK-SD-NEXT: fdiv s2, s0, s1 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is there some reason to prefer this sequence over fmov s1, #0.5; fmul s2, s0, s1
?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sounds like a nice idea. It looks like there is an instcombiner for transforming that, but no equivalent in the backend at the moment. I'll try and take a look.
As a small addition to llvm#91148, this uses copysign to produce the correct sign for zero when converting frem to div/trunc/mul when we do not know that the input is positive (and we care about sign bits). The copysign lets us get the sign of zero correct. In testing, the only case this produced different results thant fmod was: frem -inf, 4.0 -> nan vs -nan
As a small addition to #91148, this uses copysign to produce the correct sign for zero when converting frem to div/trunc/mul when we do not know that the input is positive (and we care about sign bits). The copysign lets us get the sign of zero correct. In testing, the only case this produced different results than fmod was: frem -inf, 4.0 -> nan vs -nan
If we are lowering a frem and the divisor is known to be an integer power-2, we
can use the formula 'frem = x - trunc(x / d) * d'. This avoids the more
expensive call to fmod. The results are identical as fmod so long as d is a
power-2 (so the mul does not round incorrectly), and the sign of the return is
either always positive or not important for zeroes (nsz).
Unfortunately Alive2 does not handle this well at the moment. I was using
exhaustive checking to test this, hopefully I didn't make a mistake in it
(https://gist.github.com/davemgreen/6078015f30d3bacd1e9572f8db5d4b64).
I found this in cpythons implementation of float_pow. I currently added it as a
DAG combine for frem with power-2 fp constants.