Skip to content

Commit

Permalink
[X86] Improve matchBinaryShuffle()'s BLEND lowering with per-element all-zero/all-ones knowledge
Browse files Browse the repository at this point in the history

We can use `OR` instead of `BLEND` if either the element we are not picking is known to be zero (or is masked away),
or the element we are picking overwhelms whatever the element we are not picking contributes (e.g. it is all-ones):
https://alive2.llvm.org/ce/z/RKejao

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D109726
  • Loading branch information
LebedevRI committed Sep 17, 2021
1 parent df1ab7d commit 358df06
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 37 deletions.
58 changes: 52 additions & 6 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
Expand Up @@ -36226,12 +36226,58 @@ static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
IsBlend = false;
break;
}
if (IsBlend &&
DAG.computeKnownBits(V1, DemandedZeroV1).isZero() &&
DAG.computeKnownBits(V2, DemandedZeroV2).isZero()) {
Shuffle = ISD::OR;
SrcVT = DstVT = MaskVT.changeTypeToInteger();
return true;
if (IsBlend) {
if (DAG.computeKnownBits(V1, DemandedZeroV1).isZero() &&
DAG.computeKnownBits(V2, DemandedZeroV2).isZero()) {
Shuffle = ISD::OR;
SrcVT = DstVT = MaskVT.changeTypeToInteger();
return true;
}
if (NumV1Elts == NumV2Elts && NumV1Elts == NumMaskElts) {
// FIXME: handle mismatched sizes?
// TODO: investigate if `ISD::OR` handling in
// `TargetLowering::SimplifyDemandedVectorElts` can be improved instead.
auto computeKnownBitsElementWise = [&DAG](SDValue V) {
unsigned NumElts = V.getValueType().getVectorNumElements();
KnownBits Known(NumElts);
for (unsigned EltIdx = 0; EltIdx != NumElts; ++EltIdx) {
APInt Mask = APInt::getOneBitSet(NumElts, EltIdx);
KnownBits PeepholeKnown = DAG.computeKnownBits(V, Mask);
if (PeepholeKnown.isZero())
Known.Zero.setBit(EltIdx);
if (PeepholeKnown.isAllOnes())
Known.One.setBit(EltIdx);
}
return Known;
};

KnownBits V1Known = computeKnownBitsElementWise(V1);
KnownBits V2Known = computeKnownBitsElementWise(V2);

for (unsigned i = 0; i != NumMaskElts && IsBlend; ++i) {
int M = Mask[i];
if (M == SM_SentinelUndef)
continue;
if (M == SM_SentinelZero) {
IsBlend &= V1Known.Zero[i] && V2Known.Zero[i];
continue;
}
if (M == (int)i) {
IsBlend &= V2Known.Zero[i] || V1Known.One[i];
continue;
}
if (M == (int)(i + NumMaskElts)) {
IsBlend &= V1Known.Zero[i] || V2Known.One[i];
continue;
}
llvm_unreachable("will not get here.");
}
if (IsBlend) {
Shuffle = ISD::OR;
SrcVT = DstVT = MaskVT.changeTypeToInteger();
return true;
}
}
}
}

Expand Down
43 changes: 12 additions & 31 deletions llvm/test/CodeGen/X86/insertelement-ones.ll
Expand Up @@ -280,11 +280,8 @@ define <16 x i16> @insert_v16i16_x12345x789ABCDEx(<16 x i16> %a) {
;
; AVX1-LABEL: insert_v16i16_x12345x789ABCDEx:
; AVX1: # %bb.0:
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
Expand Down Expand Up @@ -315,26 +312,22 @@ define <16 x i8> @insert_v16i8_x123456789ABCDEx(<16 x i8> %a) {
; SSE2-LABEL: insert_v16i8_x123456789ABCDEx:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: movl $255, %eax
; SSE2-NEXT: movd %eax, %xmm2
; SSE2-NEXT: pandn %xmm2, %xmm1
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: insert_v16i8_x123456789ABCDEx:
; SSE3: # %bb.0:
; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; SSE3-NEXT: pand %xmm1, %xmm0
; SSE3-NEXT: movl $255, %eax
; SSE3-NEXT: movd %eax, %xmm2
; SSE3-NEXT: pandn %xmm2, %xmm1
; SSE3-NEXT: por %xmm1, %xmm0
; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE3-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
; SSE3-NEXT: por %xmm1, %xmm2
; SSE3-NEXT: por %xmm2, %xmm0
; SSE3-NEXT: retq
;
Expand All @@ -344,7 +337,7 @@ define <16 x i8> @insert_v16i8_x123456789ABCDEx(<16 x i8> %a) {
; SSSE3-NEXT: movd %eax, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm2
; SSSE3-NEXT: palignr {{.*#+}} xmm2 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13],zero
; SSSE3-NEXT: palignr {{.*#+}} xmm2 = xmm2[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; SSSE3-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
; SSSE3-NEXT: por %xmm2, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm0
Expand Down Expand Up @@ -372,41 +365,31 @@ define <32 x i8> @insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx(<32 x i8> %a) {
; SSE2-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: movl $255, %eax
; SSE2-NEXT: movd %eax, %xmm3
; SSE2-NEXT: pandn %xmm3, %xmm2
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: por %xmm4, %xmm2
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
; SSE2-NEXT: por %xmm4, %xmm3
; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: por %xmm4, %xmm1
; SSE2-NEXT: retq
;
; SSE3-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx:
; SSE3: # %bb.0:
; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; SSE3-NEXT: pand %xmm2, %xmm0
; SSE3-NEXT: movl $255, %eax
; SSE3-NEXT: movd %eax, %xmm3
; SSE3-NEXT: pandn %xmm3, %xmm2
; SSE3-NEXT: por %xmm2, %xmm0
; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
; SSE3-NEXT: pand %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm3, %xmm4
; SSE3-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
; SSE3-NEXT: por %xmm4, %xmm0
; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE3-NEXT: por %xmm4, %xmm2
; SSE3-NEXT: por %xmm2, %xmm0
; SSE3-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
; SSE3-NEXT: por %xmm4, %xmm3
; SSE3-NEXT: por %xmm3, %xmm1
; SSE3-NEXT: pand %xmm2, %xmm1
; SSE3-NEXT: por %xmm4, %xmm1
; SSE3-NEXT: retq
;
; SSSE3-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx:
Expand All @@ -415,15 +398,13 @@ define <32 x i8> @insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx(<32 x i8> %a) {
; SSSE3-NEXT: movd %eax, %xmm3
; SSSE3-NEXT: movdqa %xmm3, %xmm2
; SSSE3-NEXT: palignr {{.*#+}} xmm2 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13],zero
; SSSE3-NEXT: palignr {{.*#+}} xmm2 = xmm2[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; SSSE3-NEXT: movdqa %xmm3, %xmm0
; SSSE3-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
; SSSE3-NEXT: por %xmm0, %xmm2
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13],zero,xmm1[u]
; SSSE3-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
; SSSE3-NEXT: pshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0],zero
; SSSE3-NEXT: por %xmm0, %xmm3
; SSSE3-NEXT: por %xmm3, %xmm1
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero
; SSSE3-NEXT: por %xmm0, %xmm1
; SSSE3-NEXT: movdqa %xmm2, %xmm0
; SSSE3-NEXT: retq
;
Expand Down

0 comments on commit 358df06

Please sign in to comment.