[X86][AVX] Fold CONCAT(HOP(X,Y),HOP(Z,W)) -> HOP(CONCAT(X,Z),CONCAT(Y,W)) for float types

We can now enable this for AVX1 targets, which can assist with canonicalizeShuffleMaskWithHorizOp cleanup.

There are still a few missed opportunities for merging subvector insert/extracts into shuffles, but they shouldn't cause any regressions now.
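
In DAG terms the fold is concat_vectors(FHADD(X,Y), FHADD(Z,W)) -> FHADD(concat_vectors(X,Z), concat_vectors(Y,W)); because the 256-bit horizontal ops operate per 128-bit lane, the two forms are equivalent. A minimal sketch of how the rewritten node would be built through the SelectionDAG API, assuming DAG, DL and v2f64 operands X, Y, Z, W are in scope (illustrative only, not code from the patch):

// Concatenate the original first and second operands into 256-bit vectors.
SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f64, X, Z);
SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f64, Y, W);
// A single wide horizontal add replaces the two 128-bit ops and their concat.
SDValue Wide = DAG.getNode(X86ISD::FHADD, DL, MVT::v4f64, LHS, RHS);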
RKSimon committed Aug 16, 2020
1 parent 29e1d16 commit f25d47b
Showing 4 changed files with 46 additions and 101 deletions.
3 changes: 2 additions & 1 deletion llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -48341,7 +48341,8 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
case X86ISD::FHSUB:
case X86ISD::PACKSS:
case X86ISD::PACKUS:
-if (!IsSplat && VT.is256BitVector() && Subtarget.hasInt256()) {
+if (!IsSplat && VT.is256BitVector() &&
+    (VT.isFloatingPoint() || Subtarget.hasInt256())) {
SmallVector<SDValue, 2> LHS, RHS;
for (unsigned i = 0; i != NumOps; ++i) {
LHS.push_back(Ops[i].getOperand(0));
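The tail of this case is collapsed above. A hedged reconstruction of the remainder, inferred from the visible context (the SrcVT computation and getNode calls below are assumptions, not quoted from the diff): after both operand halves are collected, they are concatenated and the horizontal op is re-emitted at the full 256-bit width.

      RHS.push_back(Ops[i].getOperand(1));
    }
    // Widen the (possibly narrower) source type to cover all concatenated ops.
    MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
    SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
                             NumOps * SrcVT.getVectorNumElements());
    // Re-emit one wide HOP/PACK node on the concatenated operands.
    return DAG.getNode(Op0.getOpcode(), DL, VT,
                       DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, LHS),
                       DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, RHS));
  }
  break;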
42 changes: 12 additions & 30 deletions llvm/test/CodeGen/X86/haddsub-2.ll
@@ -444,21 +444,12 @@ define <4 x double> @avx_vhadd_pd_test(<4 x double> %A, <4 x double> %B) {
; SSE-NEXT: movapd %xmm2, %xmm1
; SSE-NEXT: retq
;
-; AVX1-LABEL: avx_vhadd_pd_test:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vhaddpd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vhaddpd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: avx_vhadd_pd_test:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
-; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vhaddpd %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: retq
+; AVX-LABEL: avx_vhadd_pd_test:
+; AVX: # %bb.0:
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vhaddpd %ymm2, %ymm0, %ymm0
+; AVX-NEXT: retq
%vecext = extractelement <4 x double> %A, i32 0
%vecext1 = extractelement <4 x double> %A, i32 1
%add = fadd double %vecext, %vecext1
@@ -486,21 +477,12 @@ define <4 x double> @avx_vhsub_pd_test(<4 x double> %A, <4 x double> %B) {
; SSE-NEXT: movapd %xmm2, %xmm1
; SSE-NEXT: retq
;
-; AVX1-LABEL: avx_vhsub_pd_test:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vhsubpd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vhsubpd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: avx_vhsub_pd_test:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
-; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vhsubpd %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: retq
+; AVX-LABEL: avx_vhsub_pd_test:
+; AVX: # %bb.0:
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vhsubpd %ymm2, %ymm0, %ymm0
+; AVX-NEXT: retq
%vecext = extractelement <4 x double> %A, i32 0
%vecext1 = extractelement <4 x double> %A, i32 1
%sub = fsub double %vecext, %vecext1
12 changes: 4 additions & 8 deletions llvm/test/CodeGen/X86/haddsub-shuf.ll
@@ -53,10 +53,8 @@ define <8 x float> @hadd_v8f32a(<8 x float> %a) {
;
; AVX1_FAST-LABEL: hadd_v8f32a:
; AVX1_FAST: # %bb.0:
-; AVX1_FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1_FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm1
-; AVX1_FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
-; AVX1_FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1_FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
+; AVX1_FAST-NEXT: vhaddps %ymm0, %ymm1, %ymm0
; AVX1_FAST-NEXT: retq
;
; AVX2-LABEL: hadd_v8f32a:
@@ -134,10 +132,8 @@ define <8 x float> @hsub_v8f32a(<8 x float> %a) {
;
; AVX1_FAST-LABEL: hsub_v8f32a:
; AVX1_FAST: # %bb.0:
-; AVX1_FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1_FAST-NEXT: vhsubps %xmm1, %xmm0, %xmm1
-; AVX1_FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
-; AVX1_FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1_FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
+; AVX1_FAST-NEXT: vhsubps %ymm0, %ymm1, %ymm0
; AVX1_FAST-NEXT: retq
;
; AVX2-LABEL: hsub_v8f32a:
90 changes: 28 additions & 62 deletions llvm/test/CodeGen/X86/haddsub-undef.ll
@@ -709,9 +709,9 @@ define <4 x double> @add_pd_011(<4 x double> %0, <4 x double> %1) {
;
; AVX1-FAST-LABEL: add_pd_011:
; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm1, %xmm2
-; AVX1-FAST-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm2
+; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-FAST-NEXT: vhaddpd %ymm2, %ymm0, %ymm0
; AVX1-FAST-NEXT: retq
;
; AVX512-LABEL: add_pd_011:
@@ -1190,22 +1190,13 @@ define <4 x double> @PR34724_add_v4f64_u123(<4 x double> %0, <4 x double> %1) {
; AVX-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-SLOW-NEXT: retq
;
-; AVX1-FAST-LABEL: PR34724_add_v4f64_u123:
-; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-FAST-NEXT: vhaddpd %xmm2, %xmm1, %xmm1
-; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-FAST-NEXT: retq
-;
-; AVX512-FAST-LABEL: PR34724_add_v4f64_u123:
-; AVX512-FAST: # %bb.0:
-; AVX512-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX512-FAST-NEXT: vblendpd {{.*#+}} ymm2 = ymm0[0,1],ymm1[2,3]
-; AVX512-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512-FAST-NEXT: vhaddpd %ymm2, %ymm0, %ymm0
-; AVX512-FAST-NEXT: retq
+; AVX-FAST-LABEL: PR34724_add_v4f64_u123:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-FAST-NEXT: vblendpd {{.*#+}} ymm2 = ymm0[0,1],ymm1[2,3]
+; AVX-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-FAST-NEXT: vhaddpd %ymm2, %ymm0, %ymm0
+; AVX-FAST-NEXT: retq
%3 = shufflevector <4 x double> %0, <4 x double> %1, <2 x i32> <i32 2, i32 4>
%4 = shufflevector <4 x double> %0, <4 x double> %1, <2 x i32> <i32 3, i32 5>
%5 = fadd <2 x double> %3, %4
@@ -1248,19 +1239,11 @@ define <4 x double> @PR34724_add_v4f64_0u23(<4 x double> %0, <4 x double> %1) {
; AVX-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-SLOW-NEXT: retq
;
-; AVX1-FAST-LABEL: PR34724_add_v4f64_0u23:
-; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-FAST-NEXT: vhaddpd %xmm2, %xmm1, %xmm1
-; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-FAST-NEXT: retq
-;
-; AVX512-FAST-LABEL: PR34724_add_v4f64_0u23:
-; AVX512-FAST: # %bb.0:
-; AVX512-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512-FAST-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
-; AVX512-FAST-NEXT: retq
+; AVX-FAST-LABEL: PR34724_add_v4f64_0u23:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-FAST-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
+; AVX-FAST-NEXT: retq
%3 = shufflevector <4 x double> %0, <4 x double> %1, <2 x i32> <i32 0, i32 4>
%4 = shufflevector <4 x double> %0, <4 x double> %1, <2 x i32> <i32 1, i32 5>
%5 = fadd <2 x double> %3, %4
@@ -1303,21 +1286,12 @@ define <4 x double> @PR34724_add_v4f64_01u3(<4 x double> %0, <4 x double> %1) {
; AVX-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-SLOW-NEXT: retq
;
-; AVX1-FAST-LABEL: PR34724_add_v4f64_01u3:
-; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-FAST-NEXT: vhaddpd %xmm2, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
-; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-FAST-NEXT: retq
-;
-; AVX512-FAST-LABEL: PR34724_add_v4f64_01u3:
-; AVX512-FAST: # %bb.0:
-; AVX512-FAST-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
-; AVX512-FAST-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
-; AVX512-FAST-NEXT: vhaddpd %ymm2, %ymm0, %ymm0
-; AVX512-FAST-NEXT: retq
+; AVX-FAST-LABEL: PR34724_add_v4f64_01u3:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX-FAST-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; AVX-FAST-NEXT: vhaddpd %ymm2, %ymm0, %ymm0
+; AVX-FAST-NEXT: retq
%3 = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 0, i32 2>
%4 = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 1, i32 3>
%5 = fadd <2 x double> %3, %4
@@ -1357,21 +1331,13 @@ define <4 x double> @PR34724_add_v4f64_012u(<4 x double> %0, <4 x double> %1) {
; AVX-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-SLOW-NEXT: retq
;
-; AVX1-FAST-LABEL: PR34724_add_v4f64_012u:
-; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-FAST-NEXT: vhaddpd %xmm2, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
-; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-FAST-NEXT: retq
-;
-; AVX512-FAST-LABEL: PR34724_add_v4f64_012u:
-; AVX512-FAST: # %bb.0:
-; AVX512-FAST-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX512-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX512-FAST-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
-; AVX512-FAST-NEXT: retq
+; AVX-FAST-LABEL: PR34724_add_v4f64_012u:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX-FAST-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
+; AVX-FAST-NEXT: retq
%3 = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 0, i32 2>
%4 = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 1, i32 3>
%5 = fadd <2 x double> %3, %4
