diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 2a3425a42607e..83bbe86392adf 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -17144,6 +17144,26 @@ SDValue DAGCombiner::visitFREM(SDNode *N) {
   if (SDValue NewSel = foldBinOpIntoSelect(N))
     return NewSel;
 
+  // (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
+  // i.e. x - trunc(x / y) * y. This only approximates frem, hence the flags:
+  //  - afn:  the rounded quotient x / y may truncate to the wrong integer.
+  //  - nsz:  when x is an exact multiple of y the expansion can yield +0.0
+  //          where frem would return a zero carrying the sign of x.
+  //  - ninf: frem(x, inf) is x, but the expansion evaluates 0 * inf.
+  // Only fire when FREM is neither legal nor custom for VT and the target
+  // supports every operation the expansion introduces.
+  if (Flags.hasApproximateFuncs() && Flags.hasNoSignedZeros() &&
+      Flags.hasNoInfs() && !TLI.isOperationLegalOrCustom(ISD::FREM, VT) &&
+      TLI.isOperationLegalOrCustom(ISD::FDIV, VT) &&
+      TLI.isOperationLegalOrCustom(ISD::FTRUNC, VT) &&
+      TLI.isOperationLegalOrCustom(ISD::FMA, VT)) {
+    SDLoc Loc(N);
+    SDValue Div = DAG.getNode(ISD::FDIV, Loc, VT, N0, N1);
+    SDValue Trunc = DAG.getNode(ISD::FTRUNC, Loc, VT, Div);
+    return DAG.getNode(ISD::FMA, Loc, VT,
+                       DAG.getNode(ISD::FNEG, Loc, VT, Trunc), N1, N0);
+  }
+
   return SDValue();
 }
 
diff --git a/llvm/test/CodeGen/AArch64/frem.ll b/llvm/test/CodeGen/AArch64/frem.ll
index ed0f6c442ece1..e7021bbb05e29 100644
--- a/llvm/test/CodeGen/AArch64/frem.ll
+++ b/llvm/test/CodeGen/AArch64/frem.ll
@@ -1597,3 +1597,150 @@ entry:
   %c = frem <16 x half> %a, %b
   ret <16 x half> %c
 }
+
+define double @frem_f64_fast(double %a, double %b) {
+; CHECK-SD-LABEL: frem_f64_fast:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fdiv d2, d0, d1
+; CHECK-SD-NEXT:    frintz d2, d2
+; CHECK-SD-NEXT:    fmsub d0, d2, d1, d0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: frem_f64_fast:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    bl fmod
+; CHECK-GI-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ret
+entry:
+  %c = frem fast double %a, %b
+  ret double %c
+}
+
+define float @frem_f32_fast(float %a, float %b) {
+; CHECK-SD-LABEL: frem_f32_fast:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fdiv s2, s0, s1
+; CHECK-SD-NEXT:    frintz s2, s2
+; CHECK-SD-NEXT:    fmsub s0, s2, s1, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: frem_f32_fast:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    bl fmodf
+; CHECK-GI-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ret
+entry:
+  %c = frem fast float %a, %b
+  ret float %c
+}
+
+define <2 x double> @frem_v2f64_fast(<2 x double> %a, <2 x double> %b) {
+; CHECK-SD-LABEL: frem_v2f64_fast:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fdiv v2.2d, v0.2d, v1.2d
+; CHECK-SD-NEXT:    frintz v2.2d, v2.2d
+; CHECK-SD-NEXT:    fmls v0.2d, v1.2d, v2.2d
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: frem_v2f64_fast:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #48
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    mov d8, v0.d[1]
+; CHECK-GI-NEXT:    mov d9, v1.d[1]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; CHECK-GI-NEXT:    bl fmod
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    fmov d1, d9
+; CHECK-GI-NEXT:    fmov d0, d8
+; CHECK-GI-NEXT:    bl fmod
+; CHECK-GI-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    ldp d9, d8, [sp, #16] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.d[1], v0.d[0]
+; CHECK-GI-NEXT:    mov v0.16b, v1.16b
+; CHECK-GI-NEXT:    add sp, sp, #48
+; CHECK-GI-NEXT:    ret
+entry:
+  %c = frem fast <2 x double> %a, %b
+  ret <2 x double> %c
+}
+
+define <4 x float> @frem_v4f32_fast(<4 x float> %a, <4 x float> %b) {
+; CHECK-SD-LABEL: frem_v4f32_fast:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fdiv v2.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    frintz v2.4s, v2.4s
+; CHECK-SD-NEXT:    fmls v0.4s, v1.4s, v2.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: frem_v4f32_fast:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #112
+; CHECK-GI-NEXT:    stp d13, d12, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp d11, d10, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #80] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x30, [sp, #96] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 112
+; CHECK-GI-NEXT:    .cfi_offset w30, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    .cfi_offset b10, -40
+; CHECK-GI-NEXT:    .cfi_offset b11, -48
+; CHECK-GI-NEXT:    .cfi_offset b12, -56
+; CHECK-GI-NEXT:    .cfi_offset b13, -64
+; CHECK-GI-NEXT:    mov s8, v0.s[1]
+; CHECK-GI-NEXT:    mov s9, v0.s[2]
+; CHECK-GI-NEXT:    mov s10, v0.s[3]
+; CHECK-GI-NEXT:    mov s11, v1.s[1]
+; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-GI-NEXT:    mov s12, v1.s[2]
+; CHECK-GI-NEXT:    mov s13, v1.s[3]
+; CHECK-GI-NEXT:    // kill: def $s1 killed $s1 killed $q1
+; CHECK-GI-NEXT:    bl fmodf
+; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-GI-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    fmov s1, s11
+; CHECK-GI-NEXT:    fmov s0, s8
+; CHECK-GI-NEXT:    bl fmodf
+; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    fmov s1, s12
+; CHECK-GI-NEXT:    fmov s0, s9
+; CHECK-GI-NEXT:    bl fmodf
+; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    fmov s1, s13
+; CHECK-GI-NEXT:    fmov s0, s10
+; CHECK-GI-NEXT:    bl fmodf
+; CHECK-GI-NEXT:    ldp q2, q1, [sp, #16] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-GI-NEXT:    ldr x30, [sp, #96] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    ldp d9, d8, [sp, #80] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp d11, d10, [sp, #64] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.s[1], v2.s[0]
+; CHECK-GI-NEXT:    ldr q2, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp d13, d12, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.s[2], v2.s[0]
+; CHECK-GI-NEXT:    mov v1.s[3], v0.s[0]
+; CHECK-GI-NEXT:    mov v0.16b, v1.16b
+; CHECK-GI-NEXT:    add sp, sp, #112
+; CHECK-GI-NEXT:    ret
+entry:
+  %c = frem fast <4 x float> %a, %b
+  ret <4 x float> %c
+}
diff --git a/llvm/test/CodeGen/PowerPC/frem.ll b/llvm/test/CodeGen/PowerPC/frem.ll
index 8cb68e60f7f9b..dff9c796289e9 100644
--- a/llvm/test/CodeGen/PowerPC/frem.ll
+++ b/llvm/test/CodeGen/PowerPC/frem.ll
@@ -4,16 +4,13 @@
 define float @frem32(float %a, float %b) {
 ; CHECK-LABEL: frem32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    mflr 0
-; CHECK-NEXT:    stdu 1, -32(1)
-; CHECK-NEXT:    std 0, 48(1)
-; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    .cfi_offset lr, 16
-; CHECK-NEXT:    bl fmodf
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    addi 1, 1, 32
-; CHECK-NEXT:    ld 0, 16(1)
-; CHECK-NEXT:    mtlr 0
+; CHECK-NEXT:    xsresp 0, 2
+; CHECK-NEXT:    fmr 4, 1
+; CHECK-NEXT:    xsmulsp 3, 1, 0
+; CHECK-NEXT:    xsnmsubasp 4, 2, 3
+; CHECK-NEXT:    xsmaddasp 3, 0, 4
+; CHECK-NEXT:    xsrdpiz 0, 3
+; CHECK-NEXT:    xsnmsubasp 1, 0, 2
 ; CHECK-NEXT:    blr
 entry:
   %rem = frem fast float %a, %b
@@ -23,16 +20,17 @@ entry:
 define double @frem64(double %a, double %b) {
 ; CHECK-LABEL: frem64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    mflr 0
-; CHECK-NEXT:    stdu 1, -32(1)
-; CHECK-NEXT:    std 0, 48(1)
-; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    .cfi_offset lr, 16
-; CHECK-NEXT:    bl fmod
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    addi 1, 1, 32
-; CHECK-NEXT:    ld 0, 16(1)
-; CHECK-NEXT:    mtlr 0
+; CHECK-NEXT:    vspltisw 2, -1
+; CHECK-NEXT:    xsredp 0, 2
+; CHECK-NEXT:    fmr 4, 1
+; CHECK-NEXT:    xvcvsxwdp 3, 34
+; CHECK-NEXT:    xsmaddadp 3, 2, 0
+; CHECK-NEXT:    xsnmsubadp 0, 0, 3
+; CHECK-NEXT:    xsmuldp 3, 1, 0
+; CHECK-NEXT:    xsnmsubadp 4, 2, 3
+; CHECK-NEXT:    xsmaddadp 3, 0, 4
+; CHECK-NEXT:    xsrdpiz 0, 3
+; CHECK-NEXT:    xsnmsubadp 1, 0, 2
 ; CHECK-NEXT:    blr
 entry:
   %rem = frem fast double %a, %b
@@ -42,59 +40,13 @@ entry:
 define <4 x float> @frem4x32(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: frem4x32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    mflr 0
-; CHECK-NEXT:    stdu 1, -96(1)
-; CHECK-NEXT:    std 0, 112(1)
-; CHECK-NEXT:    .cfi_def_cfa_offset 96
-; CHECK-NEXT:    .cfi_offset lr, 16
-; CHECK-NEXT:    .cfi_offset v28, -64
-; CHECK-NEXT:    .cfi_offset v29, -48
-; CHECK-NEXT:    .cfi_offset v30, -32
-; CHECK-NEXT:    .cfi_offset v31, -16
-; CHECK-NEXT:    xxsldwi 0, 34, 34, 3
-; CHECK-NEXT:    stxv 60, 32(1) # 16-byte Folded Spill
-; CHECK-NEXT:    xscvspdpn 1, 0
-; CHECK-NEXT:    xxsldwi 0, 35, 35, 3
-; CHECK-NEXT:    stxv 61, 48(1) # 16-byte Folded Spill
-; CHECK-NEXT:    stxv 62, 64(1) # 16-byte Folded Spill
-; CHECK-NEXT:    stxv 63, 80(1) # 16-byte Folded Spill
-; CHECK-NEXT:    xscvspdpn 2, 0
-; CHECK-NEXT:    vmr 31, 3
-; CHECK-NEXT:    vmr 30, 2
-; CHECK-NEXT:    bl fmodf
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    xxsldwi 0, 62, 62, 1
-; CHECK-NEXT:    xscpsgndp 61, 1, 1
-; CHECK-NEXT:    xscvspdpn 1, 0
-; CHECK-NEXT:    xxsldwi 0, 63, 63, 1
-; CHECK-NEXT:    xscvspdpn 2, 0
-; CHECK-NEXT:    bl fmodf
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    # kill: def $f1 killed $f1 def $vsl1
-; CHECK-NEXT:    xxmrghd 0, 1, 61
-; CHECK-NEXT:    xscvspdpn 1, 62
-; CHECK-NEXT:    xscvspdpn 2, 63
-; CHECK-NEXT:    xvcvdpsp 60, 0
-; CHECK-NEXT:    bl fmodf
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    xxswapd 0, 62
-; CHECK-NEXT:    xscpsgndp 61, 1, 1
-; CHECK-NEXT:    xscvspdpn 1, 0
-; CHECK-NEXT:    xxswapd 0, 63
-; CHECK-NEXT:    xscvspdpn 2, 0
-; CHECK-NEXT:    bl fmodf
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    # kill: def $f1 killed $f1 def $vsl1
-; CHECK-NEXT:    xxmrghd 0, 61, 1
-; CHECK-NEXT:    lxv 63, 80(1) # 16-byte Folded Reload
-; CHECK-NEXT:    lxv 62, 64(1) # 16-byte Folded Reload
-; CHECK-NEXT:    lxv 61, 48(1) # 16-byte Folded Reload
-; CHECK-NEXT:    xvcvdpsp 34, 0
-; CHECK-NEXT:    vmrgew 2, 2, 28
-; CHECK-NEXT:    lxv 60, 32(1) # 16-byte Folded Reload
-; CHECK-NEXT:    addi 1, 1, 96
-; CHECK-NEXT:    ld 0, 16(1)
-; CHECK-NEXT:    mtlr 0
+; CHECK-NEXT:    xvresp 0, 35
+; CHECK-NEXT:    vmr 4, 2
+; CHECK-NEXT:    xvmulsp 1, 34, 0
+; CHECK-NEXT:    xvnmsubasp 36, 35, 1
+; CHECK-NEXT:    xvmaddasp 1, 0, 36
+; CHECK-NEXT:    xvrspiz 0, 1
+; CHECK-NEXT:    xvnmsubasp 34, 0, 35
 ; CHECK-NEXT:    blr
 entry:
   %rem = frem fast <4 x float> %a, %b
@@ -104,38 +56,18 @@ entry:
 define <2 x double> @frem2x64(<2 x double> %a, <2 x double> %b) {
 ; CHECK-LABEL: frem2x64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    mflr 0
-; CHECK-NEXT:    stdu 1, -80(1)
-; CHECK-NEXT:    std 0, 96(1)
-; CHECK-NEXT:    .cfi_def_cfa_offset 80
-; CHECK-NEXT:    .cfi_offset lr, 16
-; CHECK-NEXT:    .cfi_offset v29, -48
-; CHECK-NEXT:    .cfi_offset v30, -32
-; CHECK-NEXT:    .cfi_offset v31, -16
-; CHECK-NEXT:    stxv 62, 48(1) # 16-byte Folded Spill
-; CHECK-NEXT:    stxv 63, 64(1) # 16-byte Folded Spill
-; CHECK-NEXT:    vmr 31, 3
-; CHECK-NEXT:    xscpsgndp 2, 63, 63
-; CHECK-NEXT:    vmr 30, 2
-; CHECK-NEXT:    xscpsgndp 1, 62, 62
-; CHECK-NEXT:    stxv 61, 32(1) # 16-byte Folded Spill
-; CHECK-NEXT:    bl fmod
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    xscpsgndp 61, 1, 1
-; CHECK-NEXT:    xxswapd 1, 62
-; CHECK-NEXT:    xxswapd 2, 63
-; CHECK-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
-; CHECK-NEXT:    # kill: def $f2 killed $f2 killed $vsl2
-; CHECK-NEXT:    bl fmod
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    # kill: def $f1 killed $f1 def $vsl1
-; CHECK-NEXT:    xxmrghd 34, 61, 1
-; CHECK-NEXT:    lxv 63, 64(1) # 16-byte Folded Reload
-; CHECK-NEXT:    lxv 62, 48(1) # 16-byte Folded Reload
-; CHECK-NEXT:    lxv 61, 32(1) # 16-byte Folded Reload
-; CHECK-NEXT:    addi 1, 1, 80
-; CHECK-NEXT:    ld 0, 16(1)
-; CHECK-NEXT:    mtlr 0
+; CHECK-NEXT:    addis 3, 2, .LCPI3_0@toc@ha
+; CHECK-NEXT:    xvredp 0, 35
+; CHECK-NEXT:    vmr 4, 2
+; CHECK-NEXT:    addi 3, 3, .LCPI3_0@toc@l
+; CHECK-NEXT:    lxv 1, 0(3)
+; CHECK-NEXT:    xvmaddadp 1, 35, 0
+; CHECK-NEXT:    xvnmsubadp 0, 0, 1
+; CHECK-NEXT:    xvmuldp 1, 34, 0
+; CHECK-NEXT:    xvnmsubadp 36, 35, 1
+; CHECK-NEXT:    xvmaddadp 1, 0, 36
+; CHECK-NEXT:    xvrdpiz 0, 1
+; CHECK-NEXT:    xvnmsubadp 34, 0, 35
 ; CHECK-NEXT:    blr
 entry:
   %rem = frem fast <2 x double> %a, %b
diff --git a/llvm/test/CodeGen/X86/frem.ll b/llvm/test/CodeGen/X86/frem.ll
index d91d4289a5994..c2613dfa744f8 100644
--- a/llvm/test/CodeGen/X86/frem.ll
+++ b/llvm/test/CodeGen/X86/frem.ll
@@ -1448,3 +1448,135 @@ define void @frem_v4f80(<4 x x86_fp80> %a0, <4 x x86_fp80> %a1, ptr%p3) nounwind {
   store <4 x x86_fp80> %frem, ptr%p3
   ret void
 }
+
+define void @frem_f32_fast(float %a0, float %a1, ptr%p3) nounwind {
+; CHECK-LABEL: frem_f32_fast:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vdivss %xmm1, %xmm0, %xmm2
+; CHECK-NEXT:    vroundss $11, %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vfnmadd213ss {{.*#+}} xmm2 = -(xmm1 * xmm2) + xmm0
+; CHECK-NEXT:    vmovss %xmm2, (%rdi)
+; CHECK-NEXT:    retq
+  %frem = frem fast float %a0, %a1
+  store float %frem, ptr%p3
+  ret void
+}
+
+define void @frem_f64_fast(double %a0, double %a1, ptr%p3) nounwind {
+; CHECK-LABEL: frem_f64_fast:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vdivsd %xmm1, %xmm0, %xmm2
+; CHECK-NEXT:    vroundsd $11, %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vfnmadd213sd {{.*#+}} xmm2 = -(xmm1 * xmm2) + xmm0
+; CHECK-NEXT:    vmovsd %xmm2, (%rdi)
+; CHECK-NEXT:    retq
+  %frem = frem fast double %a0, %a1
+  store double %frem, ptr%p3
+  ret void
+}
+
+define void @frem_v16f32_fast(<16 x float> %a0, <16 x float> %a1, ptr%p3) nounwind {
+; CHECK-LABEL: frem_v16f32_fast:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vrcpps %ymm3, %ymm4
+; CHECK-NEXT:    vmulps %ymm4, %ymm1, %ymm5
+; CHECK-NEXT:    vmovaps %ymm5, %ymm6
+; CHECK-NEXT:    vfmsub213ps {{.*#+}} ymm6 = (ymm3 * ymm6) - ymm1
+; CHECK-NEXT:    vfnmadd213ps {{.*#+}} ymm6 = -(ymm4 * ymm6) + ymm5
+; CHECK-NEXT:    vrcpps %ymm2, %ymm4
+; CHECK-NEXT:    vmulps %ymm4, %ymm0, %ymm5
+; CHECK-NEXT:    vmovaps %ymm5, %ymm7
+; CHECK-NEXT:    vfmsub213ps {{.*#+}} ymm7 = (ymm2 * ymm7) - ymm0
+; CHECK-NEXT:    vfnmadd213ps {{.*#+}} ymm7 = -(ymm4 * ymm7) + ymm5
+; CHECK-NEXT:    vroundps $11, %ymm7, %ymm4
+; CHECK-NEXT:    vfnmadd213ps {{.*#+}} ymm4 = -(ymm2 * ymm4) + ymm0
+; CHECK-NEXT:    vroundps $11, %ymm6, %ymm0
+; CHECK-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm1
+; CHECK-NEXT:    vmovaps %ymm0, 32(%rdi)
+; CHECK-NEXT:    vmovaps %ymm4, (%rdi)
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+  %frem = frem fast <16 x float> %a0, %a1
+  store <16 x float> %frem, ptr%p3
+  ret void
+}
+
+define void @frem_v8f32_fast(<8 x float> %a0, <8 x float> %a1, ptr%p3) nounwind {
+; CHECK-LABEL: frem_v8f32_fast:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vrcpps %ymm1, %ymm2
+; CHECK-NEXT:    vmulps %ymm2, %ymm0, %ymm3
+; CHECK-NEXT:    vmovaps %ymm3, %ymm4
+; CHECK-NEXT:    vfmsub213ps {{.*#+}} ymm4 = (ymm1 * ymm4) - ymm0
+; CHECK-NEXT:    vfnmadd213ps {{.*#+}} ymm4 = -(ymm2 * ymm4) + ymm3
+; CHECK-NEXT:    vroundps $11, %ymm4, %ymm2
+; CHECK-NEXT:    vfnmadd213ps {{.*#+}} ymm2 = -(ymm1 * ymm2) + ymm0
+; CHECK-NEXT:    vmovaps %ymm2, (%rdi)
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+  %frem = frem fast <8 x float> %a0, %a1
+  store <8 x float> %frem, ptr%p3
+  ret void
+}
+
+define void @frem_v4f32_fast(<4 x float> %a0, <4 x float> %a1, ptr%p3) nounwind {
+; CHECK-LABEL: frem_v4f32_fast:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vrcpps %xmm1, %xmm2
+; CHECK-NEXT:    vmulps %xmm2, %xmm0, %xmm3
+; CHECK-NEXT:    vmovaps %xmm3, %xmm4
+; CHECK-NEXT:    vfmsub213ps {{.*#+}} xmm4 = (xmm1 * xmm4) - xmm0
+; CHECK-NEXT:    vfnmadd213ps {{.*#+}} xmm4 = -(xmm2 * xmm4) + xmm3
+; CHECK-NEXT:    vroundps $11, %xmm4, %xmm2
+; CHECK-NEXT:    vfnmadd213ps {{.*#+}} xmm2 = -(xmm1 * xmm2) + xmm0
+; CHECK-NEXT:    vmovaps %xmm2, (%rdi)
+; CHECK-NEXT:    retq
+  %frem = frem fast <4 x float> %a0, %a1
+  store <4 x float> %frem, ptr%p3
+  ret void
+}
+
+define void @frem_v8f64_fast(<8 x double> %a0, <8 x double> %a1, ptr%p3) nounwind {
+; CHECK-LABEL: frem_v8f64_fast:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vdivpd %ymm2, %ymm0, %ymm4
+; CHECK-NEXT:    vroundpd $11, %ymm4, %ymm4
+; CHECK-NEXT:    vfnmadd213pd {{.*#+}} ymm4 = -(ymm2 * ymm4) + ymm0
+; CHECK-NEXT:    vdivpd %ymm3, %ymm1, %ymm0
+; CHECK-NEXT:    vroundpd $11, %ymm0, %ymm0
+; CHECK-NEXT:    vfnmadd213pd {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm1
+; CHECK-NEXT:    vmovapd %ymm0, 32(%rdi)
+; CHECK-NEXT:    vmovapd %ymm4, (%rdi)
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+  %frem = frem fast <8 x double> %a0, %a1
+  store <8 x double> %frem, ptr%p3
+  ret void
+}
+
+define void @frem_v4f64_fast(<4 x double> %a0, <4 x double> %a1, ptr%p3) nounwind {
+; CHECK-LABEL: frem_v4f64_fast:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vdivpd %ymm1, %ymm0, %ymm2
+; CHECK-NEXT:    vroundpd $11, %ymm2, %ymm2
+; CHECK-NEXT:    vfnmadd213pd {{.*#+}} ymm2 = -(ymm1 * ymm2) + ymm0
+; CHECK-NEXT:    vmovapd %ymm2, (%rdi)
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+  %frem = frem fast <4 x double> %a0, %a1
+  store <4 x double> %frem, ptr%p3
+  ret void
+}
+
+define void @frem_v2f64_fast(<2 x double> %a0, <2 x double> %a1, ptr%p3) nounwind {
+; CHECK-LABEL: frem_v2f64_fast:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vdivpd %xmm1, %xmm0, %xmm2
+; CHECK-NEXT:    vroundpd $11, %xmm2, %xmm2
+; CHECK-NEXT:    vfnmadd213pd {{.*#+}} xmm2 = -(xmm1 * xmm2) + xmm0
+; CHECK-NEXT:    vmovapd %xmm2, (%rdi)
+; CHECK-NEXT:    retq
+  %frem = frem fast <2 x double> %a0, %a1
+  store <2 x double> %frem, ptr%p3
+  ret void
+}