Commit

[X86][SSE] canonicalizeShuffleWithBinOps - add PERMILPS/PERMILPD + PERMPD/PERMQ + INSERTPS handling.

Bail if the INSERTPS would introduce zeros across the binop.
RKSimon committed Mar 16, 2021
1 parent 43f2d26 commit 64687f2
Showing 4 changed files with 46 additions and 44 deletions.
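For context, canonicalizeShuffleWithBinOps moves a target shuffle across a single-use binary op that feeds it, turning shuffle(binop(x, y)) into binop(shuffle(x), shuffle(y)); the test updates below show the effect, with the permute/insert instructions now emitted ahead of the vaddps/vaddpd they previously followed. The INSERTPS bail-out exists because the low four bits of the INSERTPS immediate are a zero mask: if any of them are set, the shuffle does not merely rearrange lanes but forces result lanes to 0.0, and hoisting it would presumably leave the binop operating on the newly zeroed lanes, which is not equivalent for every opcode. A minimal standalone sketch of that guard, using a hypothetical helper name (canHoistInsertPSAcrossBinOp) rather than any LLVM API:

#include <cstdint>

// INSERTPS imm8 layout: bits [7:6] select the source element, bits [5:4]
// select the destination element, and bits [3:0] are a zero mask -- each set
// bit forces the corresponding result lane to 0.0f.
inline bool canHoistInsertPSAcrossBinOp(uint8_t InsertPSImm) {
  const uint8_t ZeroMask = InsertPSImm & 0xF; // same "& 0xF" as the patch
  return ZeroMask == 0;                       // any zeroed lane -> do not hoist
}

For example, an immediate of 0x41 (zero mask 0x1) makes the helper return false, mirroring the new "ZeroMask != 0 -> break" path added below, while 0x40 (no zeroing) would still be eligible for the Binary and Binary+Permute handling the new case falls through to.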
12 changes: 11 additions & 1 deletion llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -36852,7 +36852,9 @@ static SDValue canonicalizeShuffleWithBinOps(SDValue N, SelectionDAG &DAG,
}
case X86ISD::VBROADCAST:
case X86ISD::MOVDDUP:
case X86ISD::PSHUFD: {
case X86ISD::PSHUFD:
case X86ISD::VPERMI:
case X86ISD::VPERMILPI: {
if (N.getOperand(0).getValueType() == ShuffleVT &&
N->isOnlyUserOf(N.getOperand(0).getNode())) {
SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
@@ -36882,6 +36884,14 @@ static SDValue canonicalizeShuffleWithBinOps(SDValue N, SelectionDAG &DAG,
break;
}
// Binary and Binary+Permute Shuffles.
case X86ISD::INSERTPS: {
// Don't merge INSERTPS if it contains zero'd elements.
unsigned InsertPSMask = N.getConstantOperandVal(2);
unsigned ZeroMask = InsertPSMask & 0xF;
if (ZeroMask != 0)
break;
LLVM_FALLTHROUGH;
}
case X86ISD::BLENDI:
case X86ISD::SHUFP:
case X86ISD::UNPCKH:
20 changes: 10 additions & 10 deletions llvm/test/CodeGen/X86/haddsub-undef.ll
@@ -475,8 +475,8 @@ define <2 x double> @add_pd_010(<2 x double> %x) {
; AVX-SLOW-LABEL: add_pd_010:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0]
; AVX-SLOW-NEXT: vaddpd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddpd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: add_pd_010:
@@ -607,9 +607,9 @@ define <4 x float> @add_ps_017(<4 x float> %x) {
;
; AVX-SLOW-LABEL: add_ps_017:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,2,2]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: add_ps_017:
@@ -931,9 +931,9 @@ define <4 x float> @PR45747_1(<4 x float> %a, <4 x float> %b) nounwind {
;
; AVX-SLOW-LABEL: PR45747_1:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,2,2]
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,2,2,2]
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: PR45747_1:
@@ -963,9 +963,9 @@ define <4 x float> @PR45747_2(<4 x float> %a, <4 x float> %b) nounwind {
;
; AVX-SLOW-LABEL: PR45747_2:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,1,1]
; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: PR45747_2:
52 changes: 23 additions & 29 deletions llvm/test/CodeGen/X86/horizontal-sum.ll
@@ -46,13 +46,12 @@ define <4 x float> @pair_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm1
; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[1,3],xmm1[1,1]
; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1]
; AVX1-SLOW-NEXT: vaddps %xmm2, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,1]
; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,1]
; AVX1-SLOW-NEXT: vhaddps %xmm3, %xmm3, %xmm1
; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX1-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[1]
; AVX1-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX1-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: pair_sum_v4f32_v4f32:
@@ -66,13 +65,12 @@ define <4 x float> @pair_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm1
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[1,3],xmm1[1,1]
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,3]
; AVX2-SLOW-NEXT: vaddps %xmm2, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,3]
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,1]
; AVX2-SLOW-NEXT: vhaddps %xmm3, %xmm3, %xmm1
; AVX2-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[1]
; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
; AVX2-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX2-SLOW-NEXT: retq
%5 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> <i32 0, i32 2>
%6 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> <i32 1, i32 3>
@@ -648,17 +646,15 @@ define <4 x float> @sequential_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; AVX-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm4[0,2],xmm1[0,1]
; AVX-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm2[2,3]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm4, %xmm1
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[3,3]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
; AVX-SLOW-NEXT: vaddps %xmm3, %xmm1, %xmm1
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0]
; AVX-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3]
; AVX-SLOW-NEXT: vaddps %xmm3, %xmm4, %xmm4
; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[2]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm3[3,3,3,3]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1
; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: sequential_sum_v4f32_v4f32:
@@ -669,16 +665,14 @@ define <4 x float> @sequential_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <
; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero
; AVX-FAST-NEXT: vhaddps %xmm2, %xmm2, %xmm1
; AVX-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[0,2],xmm1[0,1]
; AVX-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm2[2,3]
; AVX-FAST-NEXT: vaddps %xmm1, %xmm4, %xmm1
; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[3,3]
; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm3, %xmm3, %xmm1
; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0]
; AVX-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
; AVX-FAST-NEXT: vhaddps %xmm3, %xmm3, %xmm4
; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[2]
; AVX-FAST-NEXT: vaddps %xmm1, %xmm2, %xmm1
; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm3[3,3,3,3]
; AVX-FAST-NEXT: vaddps %xmm1, %xmm2, %xmm1
; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3]
; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
%5 = shufflevector <4 x float> %0, <4 x float> %1, <2 x i32> <i32 0, i32 4>
%6 = shufflevector <4 x float> %0, <4 x float> %1, <2 x i32> <i32 1, i32 5>
6 changes: 2 additions & 4 deletions llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
@@ -1277,10 +1277,8 @@ define <16 x i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind {
; AVX512VL-LABEL: negative:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm0[2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512VL-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512VL-NEXT: vpternlogq $206, %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm2[0,3,2,3]
; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX512VL-NEXT: vpternlogq $248, {{.*}}(%rip), %xmm1, %xmm0
; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
