[X86] combineAnd - don't demand operand vector elements if the other operand element is zero

If either operand has a zero element, then we don't need the equivalent element from the other operand, as no bits will be set.
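To illustrate the idea behind the combine (a minimal standalone C++ sketch; the demandedLanes helper and the example mask values are hypothetical, not part of the patch): for `and x, mask` with a constant vector mask, a lane of x only needs to be demanded when the matching mask lane has at least one bit set, since a zero mask lane forces that result lane to zero regardless of x.

// Illustrative sketch only, not the LLVM implementation.
#include <cstdint>
#include <cstdio>
#include <vector>

// Given the constant mask lanes of an AND, report which lanes of the other
// operand are actually demanded: only those where the mask lane is nonzero.
std::vector<bool> demandedLanes(const std::vector<uint32_t> &MaskElts) {
  std::vector<bool> Demanded(MaskElts.size());
  for (size_t I = 0; I != MaskElts.size(); ++I)
    Demanded[I] = MaskElts[I] != 0; // zero mask lane -> lane not demanded
  return Demanded;
}

int main() {
  // e.g. `and <4 x i32> %x, <i32 0xFF, i32 0, i32 0xFFFF, i32 0>`:
  // only lanes 0 and 2 of %x can affect the result.
  for (bool D : demandedLanes({0xFF, 0, 0xFFFF, 0}))
    std::printf("%d ", D ? 1 : 0); // prints: 1 0 1 0
  std::printf("\n");
  return 0;
}

In the patch itself, the equivalent lane set is built with getTargetConstantBitsFromNode and passed to SimplifyDemandedVectorElts, as the X86ISelLowering.cpp hunk below shows.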
RKSimon committed Dec 16, 2021
1 parent 333d66b commit a640f16
Showing 10 changed files with 310 additions and 306 deletions.
27 changes: 26 additions & 1 deletion llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -46322,11 +46322,36 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
   if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
     return R;
 
-  // Attempt to recursively combine a bitmask AND with shuffles.
   if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
+    // Attempt to recursively combine a bitmask AND with shuffles.
     SDValue Op(N, 0);
     if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
       return Res;
+
+    // If either operand is a constant mask, then only the elements that aren't
+    // zero are actually demanded by the other operand.
+    auto SimplifyUndemandedElts = [&](SDValue Op, SDValue OtherOp) {
+      APInt UndefElts;
+      SmallVector<APInt> EltBits;
+      int NumElts = VT.getVectorNumElements();
+      int EltSizeInBits = VT.getScalarSizeInBits();
+      if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits))
+        return false;
+
+      APInt DemandedElts = APInt::getZero(NumElts);
+      for (int I = 0; I != NumElts; ++I)
+        if (!EltBits[I].isZero())
+          DemandedElts.setBit(I);
+
+      APInt KnownUndef, KnownZero;
+      return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, KnownUndef,
+                                            KnownZero, DCI);
+    };
+    if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) {
+      if (N->getOpcode() != ISD::DELETED_NODE)
+        DCI.AddToWorklist(N);
+      return SDValue(N, 0);
+    }
   }
 
   // Attempt to combine a scalar bitmask AND with an extracted shuffle.
3 changes: 1 addition & 2 deletions llvm/test/CodeGen/X86/avx512fp16-cvt-ph-w-vl-intrinsics.ll
@@ -739,8 +739,7 @@ define <2 x half> @test_s8tofp2(<2 x i8> %arg0) {
 define <2 x half> @test_u1tofp2(<2 x i1> %arg0) {
 ; CHECK-LABEL: test_u1tofp2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; CHECK-NEXT: vpmovqw %xmm0, %xmm0
 ; CHECK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; CHECK-NEXT: vcvtuw2ph %xmm0, %xmm0
 ; CHECK-NEXT: retq
@@ -613,7 +613,6 @@ define <4 x i1> @vec_4xi32_nonsplat_undef0_eq(<4 x i32> %x, <4 x i32> %y) nounwi
 ; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1
 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
 ; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; X86-SSE2-NEXT: pmuludq %xmm3, %xmm1
 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
@@ -639,7 +638,6 @@ define <4 x i1> @vec_4xi32_nonsplat_undef0_eq(<4 x i32> %x, <4 x i32> %y) nounwi
 ; X64-SSE2-NEXT: cvttps2dq %xmm1, %xmm1
 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
 ; X64-SSE2-NEXT: pmuludq %xmm1, %xmm0
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; X64-SSE2-NEXT: pmuludq %xmm3, %xmm1
 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
6 changes: 2 additions & 4 deletions llvm/test/CodeGen/X86/oddshuffles.ll
@@ -840,9 +840,8 @@ define void @interleave_24i8_in(<24 x i8>* %p, <8 x i8>* %q1, <8 x i8>* %q2, <8
 ; SSE2-NEXT: pandn %xmm5, %xmm4
 ; SSE2-NEXT: por %xmm3, %xmm4
 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,1,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,1,3,3]
 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,1,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
 ; SSE2-NEXT: packuswb %xmm1, %xmm1
 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,255,0,255,255,0,255,255,255,255,255,255,255,255]
 ; SSE2-NEXT: pand %xmm2, %xmm1
@@ -1087,9 +1086,8 @@ define void @interleave_24i16_out_reverse(<24 x i16>* %p, <8 x i16>* %q1, <8 x i
 ; SSE2-NEXT: por %xmm5, %xmm6
 ; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[0,3,2,3,4,5,6,7]
 ; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,1,1]
 ; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,0,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,6,6,6]
 ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,0,0,0]
 ; SSE2-NEXT: pand %xmm6, %xmm5
 ; SSE2-NEXT: pshufhw {{.*#+}} xmm7 = xmm8[0,1,2,3,4,7,6,7]
274 changes: 135 additions & 139 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll

Large diffs are not rendered by default.

@@ -919,7 +919,7 @@ define void @vf16(<16 x i16>* %in.vecptr0, <16 x i16>* %in.vecptr1, <16 x i16>*
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,3,3]
 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3
 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[0,1,6,7,4,5,6,7,8,9,4,5,10,11,6,7]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[0,1,6,7,u,u,u,u,8,9,4,5,10,11,u,u]
 ; AVX1-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,6]
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5
