Skip to content

Commit

Permalink
[X86] Try to zero elts when lowering 256-bit shuffle with PSHUFB.
Browse files Browse the repository at this point in the history
Otherwise we fall back to a blend of PSHUFBs later on.

Differential Revision: http://reviews.llvm.org/D19661

llvm-svn: 271113
  • Loading branch information
ahmedbougacha committed May 28, 2016
1 parent 2d39bb3 commit a3dc1ba
Show file tree
Hide file tree
Showing 3 changed files with 100 additions and 35 deletions.
101 changes: 66 additions & 35 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
Expand Up @@ -7170,6 +7170,59 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
return Zeroable;
}

/// Mutate a shuffle mask, replacing zeroable elements with SM_SentinelZero.
static void computeZeroableShuffleMask(MutableArrayRef<int> Mask,
                                       SDValue V1, SDValue V2) {
  // Ask the generic analysis which lanes are known to be zero, then fold
  // that knowledge directly into the mask representation.
  const SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
  const int NumElts = Mask.size();
  for (int Elt = 0; Elt != NumElts; ++Elt)
    if (Zeroable[Elt] && Mask[Elt] != SM_SentinelUndef)
      Mask[Elt] = SM_SentinelZero;
}

/// Try to lower a shuffle with a single PSHUFB of V1.
/// This is only possible if V2 is unused (at all, or only for zero elements):
/// PSHUFB reads from a single source, but zeroable elements can be produced
/// directly via the sign bit of the shuffle-control byte.
static SDValue lowerVectorShuffleWithPSHUFB(SDLoc DL, MVT VT,
                                            ArrayRef<int> Mask, SDValue V1,
                                            SDValue V2,
                                            const X86Subtarget &Subtarget,
                                            SelectionDAG &DAG) {
  const int NumBytes = VT.is128BitVector() ? 16 : 32;
  const int NumEltBytes = VT.getScalarSizeInBits() / 8;

  // PSHUFB needs SSSE3 for 128-bit vectors; the 256-bit form (two in-lane
  // 128-bit shuffles) needs AVX2.
  assert(((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
          (Subtarget.hasAVX2() && VT.is256BitVector())) &&
         "Unexpected vector type/subtarget for PSHUFB lowering!");

  // Fold known-zero elements into the mask as SM_SentinelZero so they no
  // longer count as references to V2.
  SmallVector<int, 32> ZeroableMask(Mask.begin(), Mask.end());
  computeZeroableShuffleMask(ZeroableMask, V1, V2);

  // After zeroing, every remaining element must come from V1, and PSHUFB
  // cannot move bytes across 128-bit lanes.
  if (!isSingleInputShuffleMask(ZeroableMask) ||
      is128BitLaneCrossingShuffleMask(VT, Mask))
    return SDValue();

  SmallVector<SDValue, 32> PSHUFBMask(NumBytes);
  // Sign bit set in i8 mask means zero element.
  SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);

  for (int i = 0; i < NumBytes; ++i) {
    int M = ZeroableMask[i / NumEltBytes];
    if (M == SM_SentinelUndef) {
      PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
    } else if (M == SM_SentinelZero) {
      PSHUFBMask[i] = ZeroMask;
    } else {
      // Scale the element index to a byte index, then rebase into the
      // current 128-bit lane: PSHUFB indices are per-lane on 256-bit types.
      M = M * NumEltBytes + (i % NumEltBytes);
      M = i < 16 ? M : M - 16;
      assert(M >= 0 && M < 16 && "PSHUFB byte index out of lane range!");
      PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
    }
  }

  // Perform the shuffle as a v16i8/v32i8 PSHUFB and bitcast back to VT.
  MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
  return DAG.getBitcast(
      VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V1),
                      DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
}

// X86 has dedicated unpack instructions that can handle specific blend
// operations: UNPCKH and UNPCKL.
static SDValue lowerVectorShuffleWithUNPCK(SDLoc DL, MVT VT, ArrayRef<int> Mask,
Expand Down Expand Up @@ -11389,26 +11442,12 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
return lowerV8I16GeneralSingleInputVectorShuffle(
DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
}

SDValue PSHUFBMask[32];
for (int i = 0; i < 16; ++i) {
if (Mask[i] == -1) {
PSHUFBMask[2 * i] = PSHUFBMask[2 * i + 1] = DAG.getUNDEF(MVT::i8);
continue;
}

int M = i < 8 ? Mask[i] : Mask[i] - 8;
assert(M >= 0 && M < 8 && "Invalid single-input mask!");
PSHUFBMask[2 * i] = DAG.getConstant(2 * M, DL, MVT::i8);
PSHUFBMask[2 * i + 1] = DAG.getConstant(2 * M + 1, DL, MVT::i8);
}
return DAG.getBitcast(
MVT::v16i16,
DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8,
DAG.getBitcast(MVT::v32i8, V1),
DAG.getBuildVector(MVT::v32i8, DL, PSHUFBMask)));
}

if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1,
V2, Subtarget, DAG))
return PSHUFB;

// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
Expand Down Expand Up @@ -11471,24 +11510,16 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
return V;

if (isSingleInputShuffleMask(Mask)) {
// There are no generalized cross-lane shuffle operations available on i8
// element types.
if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2,
Mask, DAG);

SDValue PSHUFBMask[32];
for (int i = 0; i < 32; ++i)
PSHUFBMask[i] =
Mask[i] < 0
? DAG.getUNDEF(MVT::i8)
: DAG.getConstant(Mask[i] < 16 ? Mask[i] : Mask[i] - 16, DL,
MVT::i8);
// There are no generalized cross-lane shuffle operations available on i8
// element types.
if (isSingleInputShuffleMask(Mask) &&
is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
DAG);

return DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, V1,
DAG.getBuildVector(MVT::v32i8, DL, PSHUFBMask));
}
if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1,
V2, Subtarget, DAG))
return PSHUFB;

// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
Expand Down
18 changes: 18 additions & 0 deletions llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
Expand Up @@ -2378,6 +2378,24 @@ define <16 x i16> @shuffle_v16i16_04_05_06_03_uu_uu_uu_uu_12_13_14_11_uu_uu_uu_u
ret <16 x i16> %shuffle
}

; Elements taken from the zeroinitializer operand should be folded into a
; single PSHUFB on AVX2 (sign bit of the control byte zeroes the element),
; rather than lowering to a blend of PSHUFBs.
define <16 x i16> @shuffle_v16i16_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15(<16 x i16> %a) {
; AVX1-LABEL: shuffle_v16i16_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[1,1,2,3,4,5,6,7]
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4,5,6,7]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v16i16_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[2,3],zero,zero,ymm0[4,5],zero,zero,ymm0[8,9,u,u,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
; AVX2-NEXT:    retq
  %shuffle = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> <i32 1, i32 16, i32 2, i32 16, i32 4, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i16> %shuffle
}

define <16 x i16> @shuffle_v16i16_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14_11(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14_11:
; AVX1: # BB#0:
Expand Down
16 changes: 16 additions & 0 deletions llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
Expand Up @@ -953,6 +953,22 @@ define <32 x i8> @shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_
ret <32 x i8> %shuffle
}

; Same as the v16i16 case for byte elements: zeroable lanes from the
; zeroinitializer operand are encoded in the PSHUFB control mask (sign-bit
; byte) so a single PSHUFB suffices on AVX2.
define <32 x i8> @shuffle_v32i8_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15_u6_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31(<32 x i8> %a) {
; AVX1-LABEL: shuffle_v32i8_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15_u6_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[1],zero,xmm0[2],zero,xmm0[4,u,6,7,8,9,10,11,12,13,14,15]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v32i8_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15_u6_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1],zero,ymm0[2],zero,ymm0[4,u,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
; AVX2-NEXT:    retq
  %shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 1, i32 32, i32 2, i32 32, i32 4, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <32 x i8> %shuffle
}

define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32:
; AVX1: # BB#0:
Expand Down

0 comments on commit a3dc1ba

Please sign in to comment.