Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DAGCombiner] Combine frem into fdiv+ftrunc+fma #67642

Open
wants to merge 5 commits into
base: main
Choose a base branch
from

Conversation

ecnelises
Copy link
Member

@llvmbot
Copy link
Collaborator

llvmbot commented Sep 28, 2023

@llvm/pr-subscribers-backend-x86
@llvm/pr-subscribers-backend-aarch64

@llvm/pr-subscribers-llvm-selectiondag

Changes

Migrated from https://reviews.llvm.org/D108284


Full diff: https://github.com/llvm/llvm-project/pull/67642.diff

2 Files Affected:

  • (modified) llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (+12)
  • (modified) llvm/test/CodeGen/PowerPC/frem.ll (+37-105)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 0d34ebb117667aa..2f5f295e199188a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -16958,6 +16958,18 @@ SDValue DAGCombiner::visitFREM(SDNode *N) {
   if (SDValue NewSel = foldBinOpIntoSelect(N))
     return NewSel;
 
+  // (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
+  if (Flags.hasApproximateFuncs() && Flags.hasNoSignedZeros() &&
+      Flags.hasNoInfs() && !TLI.isOperationLegalOrCustom(ISD::FREM, VT) &&
+      TLI.isOperationLegalOrCustom(ISD::FTRUNC, VT) &&
+      TLI.isOperationLegalOrCustom(ISD::FMA, VT)) {
+    SDLoc Loc(N);
+    SDValue Div = DAG.getNode(ISD::FDIV, Loc, VT, N0, N1);
+    SDValue Trunc = DAG.getNode(ISD::FTRUNC, Loc, VT, Div);
+    return DAG.getNode(ISD::FMA, Loc, VT,
+                       DAG.getNode(ISD::FNEG, Loc, VT, Trunc), N1, N0);
+  }
+
   return SDValue();
 }
 
diff --git a/llvm/test/CodeGen/PowerPC/frem.ll b/llvm/test/CodeGen/PowerPC/frem.ll
index 8cb68e60f7f9b71..dff9c796289e96e 100644
--- a/llvm/test/CodeGen/PowerPC/frem.ll
+++ b/llvm/test/CodeGen/PowerPC/frem.ll
@@ -4,16 +4,13 @@
 define float @frem32(float %a, float %b) {
 ; CHECK-LABEL: frem32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    mflr 0
-; CHECK-NEXT:    stdu 1, -32(1)
-; CHECK-NEXT:    std 0, 48(1)
-; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    .cfi_offset lr, 16
-; CHECK-NEXT:    bl fmodf
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    addi 1, 1, 32
-; CHECK-NEXT:    ld 0, 16(1)
-; CHECK-NEXT:    mtlr 0
+; CHECK-NEXT:    xsresp 0, 2
+; CHECK-NEXT:    fmr 4, 1
+; CHECK-NEXT:    xsmulsp 3, 1, 0
+; CHECK-NEXT:    xsnmsubasp 4, 2, 3
+; CHECK-NEXT:    xsmaddasp 3, 0, 4
+; CHECK-NEXT:    xsrdpiz 0, 3
+; CHECK-NEXT:    xsnmsubasp 1, 0, 2
 ; CHECK-NEXT:    blr
 entry:
   %rem = frem fast float %a, %b
@@ -23,16 +20,17 @@ entry:
 define double @frem64(double %a, double %b) {
 ; CHECK-LABEL: frem64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    mflr 0
-; CHECK-NEXT:    stdu 1, -32(1)
-; CHECK-NEXT:    std 0, 48(1)
-; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    .cfi_offset lr, 16
-; CHECK-NEXT:    bl fmod
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    addi 1, 1, 32
-; CHECK-NEXT:    ld 0, 16(1)
-; CHECK-NEXT:    mtlr 0
+; CHECK-NEXT:    vspltisw 2, -1
+; CHECK-NEXT:    xsredp 0, 2
+; CHECK-NEXT:    fmr 4, 1
+; CHECK-NEXT:    xvcvsxwdp 3, 34
+; CHECK-NEXT:    xsmaddadp 3, 2, 0
+; CHECK-NEXT:    xsnmsubadp 0, 0, 3
+; CHECK-NEXT:    xsmuldp 3, 1, 0
+; CHECK-NEXT:    xsnmsubadp 4, 2, 3
+; CHECK-NEXT:    xsmaddadp 3, 0, 4
+; CHECK-NEXT:    xsrdpiz 0, 3
+; CHECK-NEXT:    xsnmsubadp 1, 0, 2
 ; CHECK-NEXT:    blr
 entry:
   %rem = frem fast double %a, %b
@@ -42,59 +40,13 @@ entry:
 define <4 x float> @frem4x32(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: frem4x32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    mflr 0
-; CHECK-NEXT:    stdu 1, -96(1)
-; CHECK-NEXT:    std 0, 112(1)
-; CHECK-NEXT:    .cfi_def_cfa_offset 96
-; CHECK-NEXT:    .cfi_offset lr, 16
-; CHECK-NEXT:    .cfi_offset v28, -64
-; CHECK-NEXT:    .cfi_offset v29, -48
-; CHECK-NEXT:    .cfi_offset v30, -32
-; CHECK-NEXT:    .cfi_offset v31, -16
-; CHECK-NEXT:    xxsldwi 0, 34, 34, 3
-; CHECK-NEXT:    stxv 60, 32(1) # 16-byte Folded Spill
-; CHECK-NEXT:    xscvspdpn 1, 0
-; CHECK-NEXT:    xxsldwi 0, 35, 35, 3
-; CHECK-NEXT:    stxv 61, 48(1) # 16-byte Folded Spill
-; CHECK-NEXT:    stxv 62, 64(1) # 16-byte Folded Spill
-; CHECK-NEXT:    stxv 63, 80(1) # 16-byte Folded Spill
-; CHECK-NEXT:    xscvspdpn 2, 0
-; CHECK-NEXT:    vmr 31, 3
-; CHECK-NEXT:    vmr 30, 2
-; CHECK-NEXT:    bl fmodf
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    xxsldwi 0, 62, 62, 1
-; CHECK-NEXT:    xscpsgndp 61, 1, 1
-; CHECK-NEXT:    xscvspdpn 1, 0
-; CHECK-NEXT:    xxsldwi 0, 63, 63, 1
-; CHECK-NEXT:    xscvspdpn 2, 0
-; CHECK-NEXT:    bl fmodf
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    # kill: def $f1 killed $f1 def $vsl1
-; CHECK-NEXT:    xxmrghd 0, 1, 61
-; CHECK-NEXT:    xscvspdpn 1, 62
-; CHECK-NEXT:    xscvspdpn 2, 63
-; CHECK-NEXT:    xvcvdpsp 60, 0
-; CHECK-NEXT:    bl fmodf
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    xxswapd 0, 62
-; CHECK-NEXT:    xscpsgndp 61, 1, 1
-; CHECK-NEXT:    xscvspdpn 1, 0
-; CHECK-NEXT:    xxswapd 0, 63
-; CHECK-NEXT:    xscvspdpn 2, 0
-; CHECK-NEXT:    bl fmodf
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    # kill: def $f1 killed $f1 def $vsl1
-; CHECK-NEXT:    xxmrghd 0, 61, 1
-; CHECK-NEXT:    lxv 63, 80(1) # 16-byte Folded Reload
-; CHECK-NEXT:    lxv 62, 64(1) # 16-byte Folded Reload
-; CHECK-NEXT:    lxv 61, 48(1) # 16-byte Folded Reload
-; CHECK-NEXT:    xvcvdpsp 34, 0
-; CHECK-NEXT:    vmrgew 2, 2, 28
-; CHECK-NEXT:    lxv 60, 32(1) # 16-byte Folded Reload
-; CHECK-NEXT:    addi 1, 1, 96
-; CHECK-NEXT:    ld 0, 16(1)
-; CHECK-NEXT:    mtlr 0
+; CHECK-NEXT:    xvresp 0, 35
+; CHECK-NEXT:    vmr 4, 2
+; CHECK-NEXT:    xvmulsp 1, 34, 0
+; CHECK-NEXT:    xvnmsubasp 36, 35, 1
+; CHECK-NEXT:    xvmaddasp 1, 0, 36
+; CHECK-NEXT:    xvrspiz 0, 1
+; CHECK-NEXT:    xvnmsubasp 34, 0, 35
 ; CHECK-NEXT:    blr
 entry:
   %rem = frem fast <4 x float> %a, %b
@@ -104,38 +56,18 @@ entry:
 define <2 x double> @frem2x64(<2 x double> %a, <2 x double> %b) {
 ; CHECK-LABEL: frem2x64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    mflr 0
-; CHECK-NEXT:    stdu 1, -80(1)
-; CHECK-NEXT:    std 0, 96(1)
-; CHECK-NEXT:    .cfi_def_cfa_offset 80
-; CHECK-NEXT:    .cfi_offset lr, 16
-; CHECK-NEXT:    .cfi_offset v29, -48
-; CHECK-NEXT:    .cfi_offset v30, -32
-; CHECK-NEXT:    .cfi_offset v31, -16
-; CHECK-NEXT:    stxv 62, 48(1) # 16-byte Folded Spill
-; CHECK-NEXT:    stxv 63, 64(1) # 16-byte Folded Spill
-; CHECK-NEXT:    vmr 31, 3
-; CHECK-NEXT:    xscpsgndp 2, 63, 63
-; CHECK-NEXT:    vmr 30, 2
-; CHECK-NEXT:    xscpsgndp 1, 62, 62
-; CHECK-NEXT:    stxv 61, 32(1) # 16-byte Folded Spill
-; CHECK-NEXT:    bl fmod
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    xscpsgndp 61, 1, 1
-; CHECK-NEXT:    xxswapd 1, 62
-; CHECK-NEXT:    xxswapd 2, 63
-; CHECK-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
-; CHECK-NEXT:    # kill: def $f2 killed $f2 killed $vsl2
-; CHECK-NEXT:    bl fmod
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    # kill: def $f1 killed $f1 def $vsl1
-; CHECK-NEXT:    xxmrghd 34, 61, 1
-; CHECK-NEXT:    lxv 63, 64(1) # 16-byte Folded Reload
-; CHECK-NEXT:    lxv 62, 48(1) # 16-byte Folded Reload
-; CHECK-NEXT:    lxv 61, 32(1) # 16-byte Folded Reload
-; CHECK-NEXT:    addi 1, 1, 80
-; CHECK-NEXT:    ld 0, 16(1)
-; CHECK-NEXT:    mtlr 0
+; CHECK-NEXT:    addis 3, 2, .LCPI3_0@toc@ha
+; CHECK-NEXT:    xvredp 0, 35
+; CHECK-NEXT:    vmr 4, 2
+; CHECK-NEXT:    addi 3, 3, .LCPI3_0@toc@l
+; CHECK-NEXT:    lxv 1, 0(3)
+; CHECK-NEXT:    xvmaddadp 1, 35, 0
+; CHECK-NEXT:    xvnmsubadp 0, 0, 1
+; CHECK-NEXT:    xvmuldp 1, 34, 0
+; CHECK-NEXT:    xvnmsubadp 36, 35, 1
+; CHECK-NEXT:    xvmaddadp 1, 0, 36
+; CHECK-NEXT:    xvrdpiz 0, 1
+; CHECK-NEXT:    xvnmsubadp 34, 0, 35
 ; CHECK-NEXT:    blr
 entry:
   %rem = frem fast <2 x double> %a, %b

TLI.isOperationLegalOrCustom(ISD::FTRUNC, VT) &&
TLI.isOperationLegalOrCustom(ISD::FMA, VT)) {
SDLoc Loc(N);
SDValue Div = DAG.getNode(ISD::FDIV, Loc, VT, N0, N1);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should any flags (NSZ/NINF?) be passed on to fdiv?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's a FlagInserter above, which automatically inserts flags to getNode call in the scope. (see c0f8e4c) I confirm the flags exist after transformation by debug log:

    t32: v2f64 = PPCISD::FNMSUB nnan ninf nsz arcp contract afn reassoc t4, t21, t2
  t28: v2f64 = fma nnan ninf nsz arcp contract afn reassoc t42, t32, t21
t10: v2f64 = ftrunc nnan ninf nsz arcp contract afn reassoc t28

Copy link
Collaborator

@RKSimon RKSimon left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Based off https://reviews.llvm.org/D108284#2999490 - did you investigate adding test coverage on additional targets?

@davemgreen
Copy link
Collaborator

Do you have any analysis on the expected magnitude of the inaccuracy we might expect from performing the fdiv/trunc/fma vs the call to fmod?

@ecnelises
Copy link
Member Author

I tested with a number of random floating values. In most of the cases, the expanded result is exactly the same as libcall result.

But when fmod(a,b) is very close to b (smaller than 1e-10, for example, fmod(521862.045173469, 31.048432006988875)), the result would be totally wrong.. I'm thinking about a solution.

@arsenm
Copy link
Contributor

arsenm commented Mar 15, 2024

I tested with a number of random floating values. In most of the cases, the expanded result is exactly the same as libcall result.

But when fmod(a,b) is very close to b (smaller than 1e-10, for example, fmod(521862.045173469, 31.048432006988875)), the result would be totally wrong.. I'm thinking about a solution.

I am interested an in inline expansion of frem that is 100% correct, but I think this is fine for afn

; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-GI-NEXT: ret
entry:
%c = frem fast double %a, %b
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You should trim the flags to the minimum required, and have negative tests for each missing individual flag

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

None yet

5 participants