Skip to content

Commit

Permalink
Revert rG66e7dce714fab "Revert "[X86][SSE] Shuffle combine blends to OR(X,Y) if the relevant elements are known zero.""
Browse files Browse the repository at this point in the history

[X86][SSE] Shuffle combine blends to OR(X,Y) if the relevant elements are known zero (REAPPLIED)

This allows us to remove the (depth-violating) code in getFauxShuffleMask, which was combining the shuffle inputs of an OR(SHUFFLE,SHUFFLE) pattern as well as the OR() itself.

This is a minor step toward being able to shuffle combine from/to SELECT/BLENDV as a faux shuffle.

Reapplied with fixed signed/unsigned comparisons.
  • Loading branch information
RKSimon committed Aug 4, 2020
1 parent f765824 commit 47cea9e
Show file tree
Hide file tree
Showing 5 changed files with 61 additions and 35 deletions.
65 changes: 48 additions & 17 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
Expand Up @@ -7401,8 +7401,8 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,

// Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
// is a valid shuffle index.
SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
SDValue N0 = peekThroughBitcasts(N.getOperand(0));
SDValue N1 = peekThroughBitcasts(N.getOperand(1));
if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
return false;
SmallVector<int, 64> SrcMask0, SrcMask1;
Expand All @@ -7413,34 +7413,24 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
true))
return false;

// Shuffle inputs must be the same size as the result.
if (llvm::any_of(SrcInputs0, [VT](SDValue Op) {
return VT.getSizeInBits() != Op.getValueSizeInBits();
}))
return false;
if (llvm::any_of(SrcInputs1, [VT](SDValue Op) {
return VT.getSizeInBits() != Op.getValueSizeInBits();
}))
return false;

size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
SmallVector<int, 64> Mask0, Mask1;
narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
for (size_t i = 0; i != MaskSize; ++i) {
for (int i = 0; i != (int)MaskSize; ++i) {
if (Mask0[i] == SM_SentinelUndef && Mask1[i] == SM_SentinelUndef)
Mask.push_back(SM_SentinelUndef);
else if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
Mask.push_back(SM_SentinelZero);
else if (Mask1[i] == SM_SentinelZero)
Mask.push_back(Mask0[i]);
Mask.push_back(i);
else if (Mask0[i] == SM_SentinelZero)
Mask.push_back(Mask1[i] + (int)(MaskSize * SrcInputs0.size()));
Mask.push_back(i + MaskSize);
else
return false;
}
Ops.append(SrcInputs0.begin(), SrcInputs0.end());
Ops.append(SrcInputs1.begin(), SrcInputs1.end());
Ops.push_back(N0);
Ops.push_back(N1);
return true;
}
case ISD::INSERT_SUBVECTOR: {
Expand Down Expand Up @@ -34219,6 +34209,7 @@ static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
SelectionDAG &DAG, const X86Subtarget &Subtarget,
unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
bool IsUnary) {
unsigned NumMaskElts = Mask.size();
unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();

if (MaskVT.is128BitVector()) {
Expand Down Expand Up @@ -34276,6 +34267,46 @@ static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
}
}

// Attempt to match against an OR if we're performing a blend shuffle and the
// non-blended source element is zero in each case.
if ((EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
(EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
bool IsBlend = true;
unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
unsigned Scale1 = NumV1Elts / NumMaskElts;
unsigned Scale2 = NumV2Elts / NumMaskElts;
APInt DemandedZeroV1 = APInt::getNullValue(NumV1Elts);
APInt DemandedZeroV2 = APInt::getNullValue(NumV2Elts);
for (unsigned i = 0; i != NumMaskElts; ++i) {
int M = Mask[i];
if (M == SM_SentinelUndef)
continue;
if (M == SM_SentinelZero) {
DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
continue;
}
if (M == (int)i) {
DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
continue;
}
if (M == (int)(i + NumMaskElts)) {
DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
continue;
}
IsBlend = false;
break;
}
if (IsBlend &&
DAG.computeKnownBits(V1, DemandedZeroV1).isZero() &&
DAG.computeKnownBits(V2, DemandedZeroV2).isZero()) {
Shuffle = ISD::OR;
SrcVT = DstVT = EVT(MaskVT).changeTypeToInteger().getSimpleVT();
return true;
}
}

return false;
}

Expand Down
12 changes: 4 additions & 8 deletions llvm/test/CodeGen/X86/insertelement-ones.ll
Expand Up @@ -389,11 +389,9 @@ define <32 x i8> @insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx(<32 x i8> %a) {
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
; SSE2-NEXT: pand %xmm5, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
; SSE2-NEXT: pandn %xmm3, %xmm5
; SSE2-NEXT: por %xmm5, %xmm1
; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: por %xmm4, %xmm1
; SSE2-NEXT: retq
Expand All @@ -411,11 +409,9 @@ define <32 x i8> @insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx(<32 x i8> %a) {
; SSE3-NEXT: movdqa %xmm3, %xmm4
; SSE3-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
; SSE3-NEXT: por %xmm4, %xmm0
; SSE3-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
; SSE3-NEXT: pand %xmm5, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSE3-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
; SSE3-NEXT: pandn %xmm3, %xmm5
; SSE3-NEXT: por %xmm5, %xmm1
; SSE3-NEXT: por %xmm3, %xmm1
; SSE3-NEXT: pand %xmm2, %xmm1
; SSE3-NEXT: por %xmm4, %xmm1
; SSE3-NEXT: retq
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
Expand Up @@ -1314,10 +1314,10 @@ define void @trunc_v4i64_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
define <16 x i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind {
; AVX1-LABEL: negative:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[u,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u],zero,zero,zero,zero,zero,zero,zero,xmm0[0,2,4,6,8,10,12,14]
; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vzeroupper
Expand Down
5 changes: 2 additions & 3 deletions llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
Expand Up @@ -1713,9 +1713,8 @@ define <8 x i16> @shuffle_v8i16_XX4X8acX(<8 x i16> %a, <8 x i16> %b) {
;
; SSSE3-LABEL: shuffle_v8i16_XX4X8acX:
; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,8,9,u,u],zero,zero,zero,zero,zero,zero,xmm0[u,u]
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,xmm1[u,u,0,1,4,5,8,9,u,u]
; SSSE3-NEXT: por %xmm1, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,0,1,4,5,8,9,u,u]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_XX4X8acX:
Expand Down
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
Expand Up @@ -3358,9 +3358,9 @@ define <32 x i8> @shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_
; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[u,u],zero,zero,xmm2[12],zero,xmm2[u,u,u],zero,zero,xmm2[u,0,3]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,13,u,u,3,3],zero,xmm1[8,u,u,u,12,1,u],zero,zero
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u],zero,zero,xmm4[u,u,u,u,1,6,13,u,u],zero,xmm4[u,u]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13,u,u,u,u],zero,zero,zero,xmm0[u,u,12,u,u]
; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,u,u,u,u,u,u,1,6,13,u,u,u,u,u]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13,u,u,u,u,u,u,u,u,u,12,u,u]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5],xmm0[6,7]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,0,0,255,255,255,255,0,0,0,255,255,0,255,255]
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
Expand Down

0 comments on commit 47cea9e

Please sign in to comment.