[X86] fmaddsub/fmsubadd combines - add NOFMA target for reference
RKSimon committed Apr 5, 2022
1 parent e90d8f0 commit b1e6ca9
Showing 2 changed files with 314 additions and 18 deletions.
llvm/test/CodeGen/X86/fmaddsub-combine.ll: 247 additions, 0 deletions
@@ -1,11 +1,18 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s -check-prefixes=NOFMA
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma | FileCheck %s -check-prefixes=FMA3,FMA3_256
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma,+avx512f | FileCheck %s -check-prefixes=FMA3,FMA3_512
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma4 | FileCheck %s -check-prefixes=FMA4
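;
; To regenerate the CHECK lines after editing the RUN lines above, one would
; typically rerun the update script named in the NOTE, e.g. from the LLVM
; source root (invocation shown as an assumed example):
;   python3 llvm/utils/update_llc_test_checks.py llvm/test/CodeGen/X86/fmaddsub-combine.ll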

; This test checks the fusing of MUL + ADDSUB to FMADDSUB.
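;
; As a minimal sketch (assuming the shape used throughout this file), the IR
; pattern being matched combines an fmul with an even-lane fsub and an
; odd-lane fadd via a shufflevector:
;
;   %AB = fmul <2 x double> %A, %B
;   %Sub = fsub <2 x double> %AB, %C
;   %Add = fadd <2 x double> %AB, %C
;   %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add, <2 x i32> <i32 0, i32 3>
;
; Without FMA (the new NOFMA prefix) this stays a vmulp* + vaddsubp* pair, as
; the CHECK lines below show; with FMA3/FMA4 it fuses into a single vfmaddsub*.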

define <2 x double> @mul_addsub_pd128(<2 x double> %A, <2 x double> %B, <2 x double> %C) #0 {
; NOFMA-LABEL: mul_addsub_pd128:
; NOFMA: # %bb.0: # %entry
; NOFMA-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; NOFMA-NEXT: vaddsubpd %xmm2, %xmm0, %xmm0
; NOFMA-NEXT: retq
;
; FMA3-LABEL: mul_addsub_pd128:
; FMA3: # %bb.0: # %entry
; FMA3-NEXT: vfmaddsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2
@@ -24,6 +31,12 @@ entry:
}

define <4 x float> @mul_addsub_ps128(<4 x float> %A, <4 x float> %B, <4 x float> %C) #0 {
; NOFMA-LABEL: mul_addsub_ps128:
; NOFMA: # %bb.0: # %entry
; NOFMA-NEXT: vmulps %xmm1, %xmm0, %xmm0
; NOFMA-NEXT: vaddsubps %xmm2, %xmm0, %xmm0
; NOFMA-NEXT: retq
;
; FMA3-LABEL: mul_addsub_ps128:
; FMA3: # %bb.0: # %entry
; FMA3-NEXT: vfmaddsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2
@@ -42,6 +55,12 @@ entry:
}

define <4 x double> @mul_addsub_pd256(<4 x double> %A, <4 x double> %B, <4 x double> %C) #0 {
; NOFMA-LABEL: mul_addsub_pd256:
; NOFMA: # %bb.0: # %entry
; NOFMA-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; NOFMA-NEXT: vaddsubpd %ymm2, %ymm0, %ymm0
; NOFMA-NEXT: retq
;
; FMA3-LABEL: mul_addsub_pd256:
; FMA3: # %bb.0: # %entry
; FMA3-NEXT: vfmaddsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2
@@ -60,6 +79,12 @@ entry:
}

define <8 x float> @mul_addsub_ps256(<8 x float> %A, <8 x float> %B, <8 x float> %C) #0 {
; NOFMA-LABEL: mul_addsub_ps256:
; NOFMA: # %bb.0: # %entry
; NOFMA-NEXT: vmulps %ymm1, %ymm0, %ymm0
; NOFMA-NEXT: vaddsubps %ymm2, %ymm0, %ymm0
; NOFMA-NEXT: retq
;
; FMA3-LABEL: mul_addsub_ps256:
; FMA3: # %bb.0: # %entry
; FMA3-NEXT: vfmaddsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2
@@ -78,6 +103,14 @@ entry:
}

define <8 x double> @mul_addsub_pd512(<8 x double> %A, <8 x double> %B, <8 x double> %C) #0 {
; NOFMA-LABEL: mul_addsub_pd512:
; NOFMA: # %bb.0: # %entry
; NOFMA-NEXT: vmulpd %ymm3, %ymm1, %ymm1
; NOFMA-NEXT: vmulpd %ymm2, %ymm0, %ymm0
; NOFMA-NEXT: vaddsubpd %ymm4, %ymm0, %ymm0
; NOFMA-NEXT: vaddsubpd %ymm5, %ymm1, %ymm1
; NOFMA-NEXT: retq
;
; FMA3_256-LABEL: mul_addsub_pd512:
; FMA3_256: # %bb.0: # %entry
; FMA3_256-NEXT: vfmaddsub213pd {{.*#+}} ymm0 = (ymm2 * ymm0) +/- ymm4
@@ -103,6 +136,14 @@ entry:
}

define <16 x float> @mul_addsub_ps512(<16 x float> %A, <16 x float> %B, <16 x float> %C) #0 {
; NOFMA-LABEL: mul_addsub_ps512:
; NOFMA: # %bb.0: # %entry
; NOFMA-NEXT: vmulps %ymm3, %ymm1, %ymm1
; NOFMA-NEXT: vmulps %ymm2, %ymm0, %ymm0
; NOFMA-NEXT: vaddsubps %ymm4, %ymm0, %ymm0
; NOFMA-NEXT: vaddsubps %ymm5, %ymm1, %ymm1
; NOFMA-NEXT: retq
;
; FMA3_256-LABEL: mul_addsub_ps512:
; FMA3_256: # %bb.0: # %entry
; FMA3_256-NEXT: vfmaddsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) +/- ymm4
@@ -128,6 +169,12 @@ entry:
}

define <4 x float> @buildvector_mul_addsub_ps128(<4 x float> %C, <4 x float> %D, <4 x float> %B) #0 {
; NOFMA-LABEL: buildvector_mul_addsub_ps128:
; NOFMA: # %bb.0: # %bb
; NOFMA-NEXT: vmulps %xmm1, %xmm0, %xmm0
; NOFMA-NEXT: vaddsubps %xmm2, %xmm0, %xmm0
; NOFMA-NEXT: retq
;
; FMA3-LABEL: buildvector_mul_addsub_ps128:
; FMA3: # %bb.0: # %bb
; FMA3-NEXT: vfmaddsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2
@@ -159,6 +206,12 @@ bb:
}

define <2 x double> @buildvector_mul_addsub_pd128(<2 x double> %C, <2 x double> %D, <2 x double> %B) #0 {
; NOFMA-LABEL: buildvector_mul_addsub_pd128:
; NOFMA: # %bb.0: # %bb
; NOFMA-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; NOFMA-NEXT: vaddsubpd %xmm2, %xmm0, %xmm0
; NOFMA-NEXT: retq
;
; FMA3-LABEL: buildvector_mul_addsub_pd128:
; FMA3: # %bb.0: # %bb
; FMA3-NEXT: vfmaddsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2
@@ -182,6 +235,12 @@ bb:
}

define <8 x float> @buildvector_mul_addsub_ps256(<8 x float> %C, <8 x float> %D, <8 x float> %B) #0 {
; NOFMA-LABEL: buildvector_mul_addsub_ps256:
; NOFMA: # %bb.0: # %bb
; NOFMA-NEXT: vmulps %ymm1, %ymm0, %ymm0
; NOFMA-NEXT: vaddsubps %ymm2, %ymm0, %ymm0
; NOFMA-NEXT: retq
;
; FMA3-LABEL: buildvector_mul_addsub_ps256:
; FMA3: # %bb.0: # %bb
; FMA3-NEXT: vfmaddsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2
@@ -229,6 +288,12 @@ bb:
}

define <4 x double> @buildvector_mul_addsub_pd256(<4 x double> %C, <4 x double> %D, <4 x double> %B) #0 {
; NOFMA-LABEL: buildvector_mul_addsub_pd256:
; NOFMA: # %bb.0: # %bb
; NOFMA-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; NOFMA-NEXT: vaddsubpd %ymm2, %ymm0, %ymm0
; NOFMA-NEXT: retq
;
; FMA3-LABEL: buildvector_mul_addsub_pd256:
; FMA3: # %bb.0: # %bb
; FMA3-NEXT: vfmaddsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2
@@ -260,6 +325,14 @@ bb:
}

define <16 x float> @buildvector_mul_addsub_ps512(<16 x float> %C, <16 x float> %D, <16 x float> %B) #0 {
; NOFMA-LABEL: buildvector_mul_addsub_ps512:
; NOFMA: # %bb.0: # %bb
; NOFMA-NEXT: vmulps %ymm3, %ymm1, %ymm1
; NOFMA-NEXT: vmulps %ymm2, %ymm0, %ymm0
; NOFMA-NEXT: vaddsubps %ymm4, %ymm0, %ymm0
; NOFMA-NEXT: vaddsubps %ymm5, %ymm1, %ymm1
; NOFMA-NEXT: retq
;
; FMA3_256-LABEL: buildvector_mul_addsub_ps512:
; FMA3_256: # %bb.0: # %bb
; FMA3_256-NEXT: vfmaddsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) +/- ymm4
@@ -346,6 +419,14 @@ bb:
}

define <8 x double> @buildvector_mul_addsub_pd512(<8 x double> %C, <8 x double> %D, <8 x double> %B) #0 {
; NOFMA-LABEL: buildvector_mul_addsub_pd512:
; NOFMA: # %bb.0: # %bb
; NOFMA-NEXT: vmulpd %ymm3, %ymm1, %ymm1
; NOFMA-NEXT: vmulpd %ymm2, %ymm0, %ymm0
; NOFMA-NEXT: vaddsubpd %ymm4, %ymm0, %ymm0
; NOFMA-NEXT: vaddsubpd %ymm5, %ymm1, %ymm1
; NOFMA-NEXT: retq
;
; FMA3_256-LABEL: buildvector_mul_addsub_pd512:
; FMA3_256: # %bb.0: # %bb
; FMA3_256-NEXT: vfmaddsub213pd {{.*#+}} ymm0 = (ymm2 * ymm0) +/- ymm4
@@ -397,6 +478,24 @@ bb:
}

define <4 x float> @buildvector_mul_subadd_ps128(<4 x float> %C, <4 x float> %D, <4 x float> %B) #0 {
; NOFMA-LABEL: buildvector_mul_subadd_ps128:
; NOFMA: # %bb.0: # %bb
; NOFMA-NEXT: vmulps %xmm1, %xmm0, %xmm0
; NOFMA-NEXT: vaddss %xmm2, %xmm0, %xmm1
; NOFMA-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; NOFMA-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0]
; NOFMA-NEXT: vaddss %xmm4, %xmm3, %xmm3
; NOFMA-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
; NOFMA-NEXT: vmovshdup {{.*#+}} xmm5 = xmm2[1,1,3,3]
; NOFMA-NEXT: vsubss %xmm5, %xmm4, %xmm4
; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2,3]
; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
; NOFMA-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; NOFMA-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; NOFMA-NEXT: vsubss %xmm2, %xmm0, %xmm0
; NOFMA-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; NOFMA-NEXT: retq
;
; FMA3-LABEL: buildvector_mul_subadd_ps128:
; FMA3: # %bb.0: # %bb
; FMA3-NEXT: vfmsubadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ xmm2
@@ -428,6 +527,16 @@ bb:
}

define <2 x double> @buildvector_mul_subadd_pd128(<2 x double> %C, <2 x double> %D, <2 x double> %B) #0 {
; NOFMA-LABEL: buildvector_mul_subadd_pd128:
; NOFMA: # %bb.0: # %bb
; NOFMA-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; NOFMA-NEXT: vaddsd %xmm2, %xmm0, %xmm1
; NOFMA-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; NOFMA-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; NOFMA-NEXT: vsubsd %xmm2, %xmm0, %xmm0
; NOFMA-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; NOFMA-NEXT: retq
;
; FMA3-LABEL: buildvector_mul_subadd_pd128:
; FMA3: # %bb.0: # %bb
; FMA3-NEXT: vfmsubadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ xmm2
@@ -451,6 +560,40 @@ bb:
}

define <8 x float> @buildvector_mul_subadd_ps256(<8 x float> %C, <8 x float> %D, <8 x float> %B) #0 {
; NOFMA-LABEL: buildvector_mul_subadd_ps256:
; NOFMA: # %bb.0: # %bb
; NOFMA-NEXT: vmulps %ymm1, %ymm0, %ymm0
; NOFMA-NEXT: vaddss %xmm2, %xmm0, %xmm1
; NOFMA-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; NOFMA-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0]
; NOFMA-NEXT: vaddss %xmm4, %xmm3, %xmm3
; NOFMA-NEXT: vextractf128 $1, %ymm0, %xmm4
; NOFMA-NEXT: vextractf128 $1, %ymm2, %xmm5
; NOFMA-NEXT: vaddss %xmm5, %xmm4, %xmm8
; NOFMA-NEXT: vpermilpd {{.*#+}} xmm7 = xmm4[1,0]
; NOFMA-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0]
; NOFMA-NEXT: vaddss %xmm6, %xmm7, %xmm9
; NOFMA-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3]
; NOFMA-NEXT: vmovshdup {{.*#+}} xmm6 = xmm2[1,1,3,3]
; NOFMA-NEXT: vsubss %xmm6, %xmm7, %xmm6
; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[2,3]
; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
; NOFMA-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; NOFMA-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; NOFMA-NEXT: vsubss %xmm2, %xmm0, %xmm0
; NOFMA-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; NOFMA-NEXT: vmovshdup {{.*#+}} xmm1 = xmm4[1,1,3,3]
; NOFMA-NEXT: vmovshdup {{.*#+}} xmm2 = xmm5[1,1,3,3]
; NOFMA-NEXT: vsubss %xmm2, %xmm1, %xmm1
; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[2,3]
; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm9[0],xmm1[3]
; NOFMA-NEXT: vpermilps {{.*#+}} xmm2 = xmm4[3,3,3,3]
; NOFMA-NEXT: vpermilps {{.*#+}} xmm3 = xmm5[3,3,3,3]
; NOFMA-NEXT: vsubss %xmm3, %xmm2, %xmm2
; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
; NOFMA-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; NOFMA-NEXT: retq
;
; FMA3-LABEL: buildvector_mul_subadd_ps256:
; FMA3: # %bb.0: # %bb
; FMA3-NEXT: vfmsubadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ ymm2
@@ -498,6 +641,24 @@ bb:
}

define <4 x double> @buildvector_mul_subadd_pd256(<4 x double> %C, <4 x double> %D, <4 x double> %B) #0 {
; NOFMA-LABEL: buildvector_mul_subadd_pd256:
; NOFMA: # %bb.0: # %bb
; NOFMA-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; NOFMA-NEXT: vaddsd %xmm2, %xmm0, %xmm1
; NOFMA-NEXT: vextractf128 $1, %ymm0, %xmm3
; NOFMA-NEXT: vextractf128 $1, %ymm2, %xmm4
; NOFMA-NEXT: vaddsd %xmm4, %xmm3, %xmm5
; NOFMA-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; NOFMA-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; NOFMA-NEXT: vsubsd %xmm2, %xmm0, %xmm0
; NOFMA-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; NOFMA-NEXT: vpermilpd {{.*#+}} xmm1 = xmm3[1,0]
; NOFMA-NEXT: vpermilpd {{.*#+}} xmm2 = xmm4[1,0]
; NOFMA-NEXT: vsubsd %xmm2, %xmm1, %xmm1
; NOFMA-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm5[0],xmm1[0]
; NOFMA-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; NOFMA-NEXT: retq
;
; FMA3-LABEL: buildvector_mul_subadd_pd256:
; FMA3: # %bb.0: # %bb
; FMA3-NEXT: vfmsubadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ ymm2
@@ -529,6 +690,64 @@ bb:
}

define <16 x float> @buildvector_mul_subadd_ps512(<16 x float> %C, <16 x float> %D, <16 x float> %B) #0 {
; NOFMA-LABEL: buildvector_mul_subadd_ps512:
; NOFMA: # %bb.0: # %bb
; NOFMA-NEXT: vmulps %ymm3, %ymm1, %ymm1
; NOFMA-NEXT: vmulps %ymm2, %ymm0, %ymm0
; NOFMA-NEXT: vaddss %xmm4, %xmm0, %xmm8
; NOFMA-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; NOFMA-NEXT: vpermilpd {{.*#+}} xmm6 = xmm4[1,0]
; NOFMA-NEXT: vaddss %xmm6, %xmm3, %xmm9
; NOFMA-NEXT: vextractf128 $1, %ymm0, %xmm6
; NOFMA-NEXT: vextractf128 $1, %ymm4, %xmm7
; NOFMA-NEXT: vaddss %xmm7, %xmm6, %xmm10
; NOFMA-NEXT: vpermilpd {{.*#+}} xmm3 = xmm6[1,0]
; NOFMA-NEXT: vpermilpd {{.*#+}} xmm2 = xmm7[1,0]
; NOFMA-NEXT: vaddss %xmm2, %xmm3, %xmm2
; NOFMA-NEXT: vinsertps {{.*#+}} xmm11 = xmm10[0,1],xmm2[0],xmm10[3]
; NOFMA-NEXT: vaddss %xmm5, %xmm1, %xmm10
; NOFMA-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
; NOFMA-NEXT: vpermilpd {{.*#+}} xmm2 = xmm5[1,0]
; NOFMA-NEXT: vaddss %xmm2, %xmm3, %xmm12
; NOFMA-NEXT: vextractf128 $1, %ymm1, %xmm14
; NOFMA-NEXT: vpermilpd {{.*#+}} xmm13 = xmm14[1,0]
; NOFMA-NEXT: vextractf128 $1, %ymm5, %xmm15
; NOFMA-NEXT: vpermilpd {{.*#+}} xmm3 = xmm15[1,0]
; NOFMA-NEXT: vaddss %xmm3, %xmm13, %xmm13
; NOFMA-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; NOFMA-NEXT: vmovshdup {{.*#+}} xmm2 = xmm4[1,1,3,3]
; NOFMA-NEXT: vsubss %xmm2, %xmm3, %xmm2
; NOFMA-NEXT: vinsertps {{.*#+}} xmm2 = xmm8[0],xmm2[0],xmm8[2,3]
; NOFMA-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
; NOFMA-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; NOFMA-NEXT: vpermilps {{.*#+}} xmm3 = xmm4[3,3,3,3]
; NOFMA-NEXT: vsubss %xmm3, %xmm0, %xmm0
; NOFMA-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
; NOFMA-NEXT: vpermilps {{.*#+}} xmm2 = xmm6[3,3,3,3]
; NOFMA-NEXT: vpermilps {{.*#+}} xmm3 = xmm7[3,3,3,3]
; NOFMA-NEXT: vsubss %xmm3, %xmm2, %xmm2
; NOFMA-NEXT: vinsertps {{.*#+}} xmm2 = xmm11[0,1,2],xmm2[0]
; NOFMA-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; NOFMA-NEXT: vmovshdup {{.*#+}} xmm4 = xmm5[1,1,3,3]
; NOFMA-NEXT: vsubss %xmm4, %xmm3, %xmm3
; NOFMA-NEXT: vinsertps {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[2,3]
; NOFMA-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm12[0],xmm3[3]
; NOFMA-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; NOFMA-NEXT: vpermilps {{.*#+}} xmm4 = xmm5[3,3,3,3]
; NOFMA-NEXT: vsubss %xmm4, %xmm1, %xmm1
; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0]
; NOFMA-NEXT: vmovshdup {{.*#+}} xmm3 = xmm14[1,1,3,3]
; NOFMA-NEXT: vmovshdup {{.*#+}} xmm4 = xmm15[1,1,3,3]
; NOFMA-NEXT: vsubss %xmm4, %xmm3, %xmm3
; NOFMA-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,0],xmm13[0,0]
; NOFMA-NEXT: vpermilps {{.*#+}} xmm4 = xmm14[3,3,3,3]
; NOFMA-NEXT: vpermilps {{.*#+}} xmm5 = xmm15[3,3,3,3]
; NOFMA-NEXT: vsubss %xmm5, %xmm4, %xmm4
; NOFMA-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
; NOFMA-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; NOFMA-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; NOFMA-NEXT: retq
;
; FMA3_256-LABEL: buildvector_mul_subadd_ps512:
; FMA3_256: # %bb.0: # %bb
; FMA3_256-NEXT: vfmsubadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) -/+ ymm4
@@ -615,6 +834,34 @@ bb:
}

define <8 x double> @buildvector_mul_subadd_pd512(<8 x double> %C, <8 x double> %D, <8 x double> %B) #0 {
; NOFMA-LABEL: buildvector_mul_subadd_pd512:
; NOFMA: # %bb.0: # %bb
; NOFMA-NEXT: vmulpd %ymm3, %ymm1, %ymm1
; NOFMA-NEXT: vmulpd %ymm2, %ymm0, %ymm0
; NOFMA-NEXT: vaddsd %xmm4, %xmm0, %xmm2
; NOFMA-NEXT: vextractf128 $1, %ymm0, %xmm3
; NOFMA-NEXT: vextractf128 $1, %ymm4, %xmm6
; NOFMA-NEXT: vaddsd %xmm6, %xmm3, %xmm9
; NOFMA-NEXT: vaddsd %xmm5, %xmm1, %xmm8
; NOFMA-NEXT: vextractf128 $1, %ymm1, %xmm1
; NOFMA-NEXT: vextractf128 $1, %ymm5, %xmm5
; NOFMA-NEXT: vaddsd %xmm5, %xmm1, %xmm7
; NOFMA-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; NOFMA-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
; NOFMA-NEXT: vsubsd %xmm4, %xmm0, %xmm0
; NOFMA-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; NOFMA-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0]
; NOFMA-NEXT: vpermilpd {{.*#+}} xmm3 = xmm6[1,0]
; NOFMA-NEXT: vsubsd %xmm3, %xmm2, %xmm2
; NOFMA-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm9[0],xmm2[0]
; NOFMA-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; NOFMA-NEXT: vpermilpd {{.*#+}} xmm3 = xmm5[1,0]
; NOFMA-NEXT: vsubsd %xmm3, %xmm1, %xmm1
; NOFMA-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm7[0],xmm1[0]
; NOFMA-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; NOFMA-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm1
; NOFMA-NEXT: retq
;
; FMA3_256-LABEL: buildvector_mul_subadd_pd512:
; FMA3_256: # %bb.0: # %bb
; FMA3_256-NEXT: vfmsubadd213pd {{.*#+}} ymm0 = (ymm2 * ymm0) -/+ ymm4
