[X86][SSE] pslldq/psrldq byte shifts/rotation for SSE2
This patch builds on http://reviews.llvm.org/D5598 to perform byte rotation shuffles (lowerVectorShuffleAsByteRotate) on pre-SSSE3 targets, which lack palignr. The pre-SSSE3 path is only enabled for i8 and i16 vector types, where it is a more clear-cut performance gain.

I've also added a separate byte shift shuffle lowering (lowerVectorShuffleAsByteShift) that exploits the ability of the PSLLDQ/PSRLDQ instructions to implicitly shift in zero bytes, avoiding the zero register we would have needed had we used palignr.
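
As a rough illustration of the zero-register point (an editor's sketch with SSE intrinsics, not part of the patch; the function name is invented), a byte shift right by five bytes lets the hardware supply the zeros:

#include <emmintrin.h>   // SSE2: _mm_srli_si128 (psrldq)

__m128i shift_right_5_bytes(__m128i v) {
  // psrldq fills the top five result bytes with zeros implicitly,
  // so no zero register has to be materialized first.
  return _mm_srli_si128(v, 5);
}

The palignr form of the same shuffle, _mm_alignr_epi8(_mm_setzero_si128(), v, 5), needs an extra pxor to build the zero operand.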

Differential Revision: http://reviews.llvm.org/D5699

llvm-svn: 222340
RKSimon committed Nov 19, 2014
1 parent 59229dc commit 3ac3b25
Showing 7 changed files with 303 additions and 385 deletions.
4 changes: 2 additions & 2 deletions llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -96,7 +96,7 @@ bool ConstantFPSDNode::isValueValidForType(EVT VT,
/// BUILD_VECTOR where all of the elements are ~0 or undef.
bool ISD::isBuildVectorAllOnes(const SDNode *N) {
// Look through a bit convert.
if (N->getOpcode() == ISD::BITCAST)
while (N->getOpcode() == ISD::BITCAST)
N = N->getOperand(0).getNode();

if (N->getOpcode() != ISD::BUILD_VECTOR) return false;
@@ -144,7 +144,7 @@ bool ISD::isBuildVectorAllOnes(const SDNode *N) {
/// BUILD_VECTOR where all of the elements are 0 or undef.
bool ISD::isBuildVectorAllZeros(const SDNode *N) {
// Look through a bit convert.
if (N->getOpcode() == ISD::BITCAST)
while (N->getOpcode() == ISD::BITCAST)
N = N->getOperand(0).getNode();

if (N->getOpcode() != ISD::BUILD_VECTOR) return false;
189 changes: 158 additions & 31 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7453,12 +7453,13 @@ static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT,

/// \brief Try to lower a vector shuffle as a byte rotation.
///
/// We have a generic PALIGNR instruction in x86 that will do an arbitrary
/// byte-rotation of the concatenation of two vectors. This routine will
/// try to generically lower a vector shuffle through such an instruction. It
/// does not check for the availability of PALIGNR-based lowerings, only the
/// applicability of this strategy to the given mask. This matches shuffle
/// vectors that look like:
/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
/// try to generically lower a vector shuffle through such a pattern. It
/// does not check for the profitability of lowering either as PALIGNR or
/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
/// This matches shuffle vectors that look like:
///
/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
///
@@ -7471,6 +7472,7 @@ static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT,
static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
SDValue V2,
ArrayRef<int> Mask,
const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");

@@ -7531,21 +7533,40 @@ static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
else if (!Hi)
Hi = Lo;

// Cast the inputs to v16i8 to match PALIGNR.
Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Lo);
Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Hi);

assert(VT.getSizeInBits() == 128 &&
"Rotate-based lowering only supports 128-bit lowering!");
assert(Mask.size() <= 16 &&
"Can shuffle at most 16 bytes in a 128-bit vector!");

// The actual rotate instruction rotates bytes, so we need to scale the
// rotation based on how many bytes are in the vector.
int Scale = 16 / Mask.size();

// SSSE3 targets can use the palignr instruction
if (Subtarget->hasSSSE3()) {
// Cast the inputs to v16i8 to match PALIGNR.
Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Lo);
Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Hi);

return DAG.getNode(ISD::BITCAST, DL, VT,
DAG.getNode(X86ISD::PALIGNR, DL, MVT::v16i8, Hi, Lo,
DAG.getConstant(Rotation * Scale, MVT::i8)));
}

// Default SSE2 implementation
int LoByteShift = 16 - Rotation * Scale;
int HiByteShift = Rotation * Scale;

// Cast the inputs to v2i64 to match PSLLDQ/PSRLDQ.
Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Lo);
Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Hi);

SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, Lo,
DAG.getConstant(8 * LoByteShift, MVT::i8));
SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, Hi,
DAG.getConstant(8 * HiByteShift, MVT::i8));
return DAG.getNode(ISD::BITCAST, DL, VT,
DAG.getNode(X86ISD::PALIGNR, DL, MVT::v16i8, Hi, Lo,
DAG.getConstant(Rotation * Scale, MVT::i8)));
DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift));
}
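
For comparison, a minimal scalar model of the SSE2 sequence above (an editor's sketch, not LLVM code; R is the byte rotation Rotation * Scale, and per this routine's convention Hi supplies the low bytes of the result while Lo supplies the high bytes):

#include <array>
#include <cstdint>

using Bytes = std::array<uint8_t, 16>;

// Models por(psrldq(Hi, R), pslldq(Lo, 16 - R)) for 0 < R < 16.
Bytes rotateViaByteShifts(const Bytes &Lo, const Bytes &Hi, int R) {
  Bytes Out{};                      // zero-init mirrors the shifted-in zeros
  for (int i = 0; i + R < 16; ++i)  // psrldq Hi, R: tail of Hi -> low bytes
    Out[i] = Hi[i + R];
  for (int i = 16 - R; i < 16; ++i) // pslldq Lo, 16-R: head of Lo -> high bytes
    Out[i] = Lo[i - (16 - R)];
  return Out;
}

PALIGNR produces the same 16 bytes in a single instruction, which is why the SSSE3 path above is taken when available.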

/// \brief Compute whether each element of a shuffle is zeroable.
@@ -7587,6 +7608,88 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
return Zeroable;
}

/// \brief Try to lower a vector shuffle as a byte shift (shifts in zeros).
///
/// Attempts to match a shuffle mask against the PSRLDQ and PSLLDQ SSE2
/// byte-shift instructions. The mask must consist of a shifted sequential
/// shuffle from one of the input vectors and zeroable elements for the
/// remaining 'shifted in' elements.
///
/// Note that this only handles 128-bit vector widths currently.
static SDValue lowerVectorShuffleAsByteShift(SDLoc DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
SelectionDAG &DAG) {
assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");

SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);

int Size = Mask.size();
int Scale = 16 / Size;

auto isSequential = [](int Base, int StartIndex, int EndIndex, int MaskOffset,
ArrayRef<int> Mask) {
for (int i = StartIndex; i < EndIndex; i++) {
if (Mask[i] < 0)
continue;
if (i + Base != Mask[i] - MaskOffset)
return false;
}
return true;
};

for (int Shift = 1; Shift < Size; Shift++) {
int ByteShift = Shift * Scale;

// PSRLDQ : (little-endian) right byte shift
// [ 5, 6, 7, zz, zz, zz, zz, zz]
// [ -1, 5, 6, 7, zz, zz, zz, zz]
// [ 1, 2, -1, -1, -1, -1, zz, zz]
bool ZeroableRight = true;
for (int i = Size - Shift; i < Size; i++) {
ZeroableRight &= Zeroable[i];
}

if (ZeroableRight) {
bool ValidShiftRight1 = isSequential(Shift, 0, Size - Shift, 0, Mask);
bool ValidShiftRight2 = isSequential(Shift, 0, Size - Shift, Size, Mask);

if (ValidShiftRight1 || ValidShiftRight2) {
// Cast the inputs to v2i64 to match PSRLDQ.
SDValue &TargetV = ValidShiftRight1 ? V1 : V2;
SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
SDValue Shifted = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, V,
DAG.getConstant(ByteShift * 8, MVT::i8));
return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
}
}

// PSLLDQ : (little-endian) left byte shift
// [ zz, 0, 1, 2, 3, 4, 5, 6]
// [ zz, zz, -1, -1, 2, 3, 4, -1]
// [ zz, zz, zz, zz, zz, zz, -1, 1]
bool ZeroableLeft = true;
for (int i = 0; i < Shift; i++) {
ZeroableLeft &= Zeroable[i];
}

if (ZeroableLeft) {
bool ValidShiftLeft1 = isSequential(-Shift, Shift, Size, 0, Mask);
bool ValidShiftLeft2 = isSequential(-Shift, Shift, Size, Size, Mask);

if (ValidShiftLeft1 || ValidShiftLeft2) {
// Cast the inputs to v2i64 to match PSLLDQ.
SDValue &TargetV = ValidShiftLeft1 ? V1 : V2;
SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
SDValue Shifted = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, V,
DAG.getConstant(ByteShift * 8, MVT::i8));
return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
}
}
}

return SDValue();
}
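
Stripped of the SelectionDAG plumbing, the PSRLDQ half of the matcher reduces to a predicate like the following (an editor's sketch under the same conventions: -1 means undef, Zeroable marks elements that may be zero, MaskOffset is 0 for V1 and Size for V2):

#include <vector>

bool matchesRightShift(const std::vector<int> &Mask,
                       const std::vector<bool> &Zeroable,
                       int Shift, int MaskOffset) {
  int Size = Mask.size();
  for (int i = Size - Shift; i < Size; ++i)
    if (!Zeroable[i])                 // shifted-in positions must be zeroable
      return false;
  for (int i = 0; i < Size - Shift; ++i)
    if (Mask[i] >= 0 && Mask[i] - MaskOffset != i + Shift)
      return false;                   // the rest must be one sequential run
  return true;
}

The PSLLDQ case is symmetric: the first Shift positions must be zeroable and the remainder sequential starting from element 0 of the source.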

/// \brief Lower a vector shuffle as a zero or any extension.
///
/// Given a specific number of elements, element bit width, and extension
@@ -8090,10 +8193,16 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
Subtarget, DAG))
return Blend;

// Try to use rotation instructions if available.
// Try to use byte shift instructions.
if (SDValue Shift = lowerVectorShuffleAsByteShift(
DL, MVT::v2i64, V1, V2, Mask, DAG))
return Shift;

// Try to use byte rotation instructions.
// It's more profitable for pre-SSSE3 to use shuffles/unpacks.
if (Subtarget->hasSSSE3())
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
DL, MVT::v2i64, V1, V2, Mask, DAG))
DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
return Rotate;

// We implement this with SHUFPD which is pretty lame because it will likely
@@ -8366,10 +8475,16 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
Subtarget, DAG))
return Blend;

// Try to use rotation instructions if available.
// Try to use byte shift instructions.
if (SDValue Shift = lowerVectorShuffleAsByteShift(
DL, MVT::v4i32, V1, V2, Mask, DAG))
return Shift;

// Try to use byte rotation instructions.
// It's more profitable for pre-SSSE3 to use shuffles/unpacks.
if (Subtarget->hasSSSE3())
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
DL, MVT::v4i32, V1, V2, Mask, DAG))
DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
return Rotate;

// We implement this with SHUFPS because it can blend from two vectors.
@@ -8434,11 +8549,15 @@ static SDValue lowerV8I16SingleInputVectorShuffle(
if (isShuffleEquivalent(Mask, 4, 4, 5, 5, 6, 6, 7, 7))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V, V);

// Try to use rotation instructions if available.
if (Subtarget->hasSSSE3())
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
DL, MVT::v8i16, V, V, Mask, DAG))
return Rotate;
// Try to use byte shift instructions.
if (SDValue Shift = lowerVectorShuffleAsByteShift(
DL, MVT::v8i16, V, V, Mask, DAG))
return Shift;

// Try to use byte rotation instructions.
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
DL, MVT::v8i16, V, V, Mask, Subtarget, DAG))
return Rotate;

// Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
// such inputs we can swap two of the dwords across the half mark and end up
@@ -9058,11 +9177,15 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
Subtarget, DAG))
return Blend;

// Try to use rotation instructions if available.
if (Subtarget->hasSSSE3())
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
DL, MVT::v8i16, V1, V2, Mask, DAG))
return Rotate;
// Try to use byte shift instructions.
if (SDValue Shift = lowerVectorShuffleAsByteShift(
DL, MVT::v8i16, V1, V2, Mask, DAG))
return Shift;

// Try to use byte rotation instructions.
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
return Rotate;

if (NumV1Inputs + NumV2Inputs <= 4)
return lowerV8I16BasicBlendVectorShuffle(DL, V1, V2, Mask, Subtarget, DAG);
@@ -9193,11 +9316,15 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
ArrayRef<int> OrigMask = SVOp->getMask();
assert(OrigMask.size() == 16 && "Unexpected mask size for v16 shuffle!");

// Try to use rotation instructions if available.
if (Subtarget->hasSSSE3())
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
DL, MVT::v16i8, V1, V2, OrigMask, DAG))
return Rotate;
// Try to use byte shift instructions.
if (SDValue Shift = lowerVectorShuffleAsByteShift(
DL, MVT::v16i8, V1, V2, OrigMask, DAG))
return Shift;

// Try to use byte rotation instructions.
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG))
return Rotate;

// Try to use a zext lowering.
if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
65 changes: 14 additions & 51 deletions llvm/test/CodeGen/X86/palignr.ll
@@ -86,16 +86,9 @@ define <8 x i16> @test6(<8 x i16> %A, <8 x i16> %B) nounwind {
;
; CHECK-YONAH-LABEL: test6:
; CHECK-YONAH: # BB#0:
; CHECK-YONAH-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; CHECK-YONAH-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; CHECK-YONAH-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; CHECK-YONAH-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
; CHECK-YONAH-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-YONAH-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,0,1,2,4,5,6,7]
; CHECK-YONAH-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; CHECK-YONAH-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; CHECK-YONAH-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,0,2,1,4,5,6,7]
; CHECK-YONAH-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-YONAH-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; CHECK-YONAH-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5]
; CHECK-YONAH-NEXT: por %xmm1, %xmm0
; CHECK-YONAH-NEXT: retl
%C = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 3, i32 4, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10 >
ret <8 x i16> %C
@@ -110,15 +103,9 @@ define <8 x i16> @test7(<8 x i16> %A, <8 x i16> %B) nounwind {
;
; CHECK-YONAH-LABEL: test7:
; CHECK-YONAH: # BB#0:
; CHECK-YONAH-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; CHECK-YONAH-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; CHECK-YONAH-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-YONAH-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,1,4,5,6,7]
; CHECK-YONAH-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0]
; CHECK-YONAH-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
; CHECK-YONAH-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
; CHECK-YONAH-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7]
; CHECK-YONAH-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-YONAH-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-YONAH-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9]
; CHECK-YONAH-NEXT: por %xmm1, %xmm0
; CHECK-YONAH-NEXT: retl
%C = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 undef, i32 6, i32 undef, i32 8, i32 9, i32 10, i32 11, i32 12 >
ret <8 x i16> %C
@@ -133,33 +120,9 @@ define <16 x i8> @test8(<16 x i8> %A, <16 x i8> %B) nounwind {
;
; CHECK-YONAH-LABEL: test8:
; CHECK-YONAH: # BB#0:
; CHECK-YONAH-NEXT: pxor %xmm3, %xmm3
; CHECK-YONAH-NEXT: movdqa %xmm0, %xmm2
; CHECK-YONAH-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; CHECK-YONAH-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; CHECK-YONAH-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,3,4,5,6,7]
; CHECK-YONAH-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
; CHECK-YONAH-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,1,2,0]
; CHECK-YONAH-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7]
; CHECK-YONAH-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3]
; CHECK-YONAH-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,3,0,4,5,6,7]
; CHECK-YONAH-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
; CHECK-YONAH-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; CHECK-YONAH-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,3,4,5,6,7]
; CHECK-YONAH-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; CHECK-YONAH-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0]
; CHECK-YONAH-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
; CHECK-YONAH-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
; CHECK-YONAH-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7]
; CHECK-YONAH-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4]
; CHECK-YONAH-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; CHECK-YONAH-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; CHECK-YONAH-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
; CHECK-YONAH-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-YONAH-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; CHECK-YONAH-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-YONAH-NEXT: packuswb %xmm0, %xmm2
; CHECK-YONAH-NEXT: movdqa %xmm2, %xmm0
; CHECK-YONAH-NEXT: psrldq {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero
; CHECK-YONAH-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4]
; CHECK-YONAH-NEXT: por %xmm1, %xmm0
; CHECK-YONAH-NEXT: retl
%C = shufflevector <16 x i8> %A, <16 x i8> %B, <16 x i32> < i32 5, i32 6, i32 7, i32 undef, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20 >
ret <16 x i8> %C
@@ -178,11 +141,11 @@ define <8 x i16> @test9(<8 x i16> %A, <8 x i16> %B) nounwind {
;
; CHECK-YONAH-LABEL: test9:
; CHECK-YONAH: # BB#0:
; CHECK-YONAH-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,1,3]
; CHECK-YONAH-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
; CHECK-YONAH-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
; CHECK-YONAH-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,1,2,4,5,6,7]
; CHECK-YONAH-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4]
; CHECK-YONAH-NEXT: movdqa %xmm1, %xmm0
; CHECK-YONAH-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; CHECK-YONAH-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
; CHECK-YONAH-NEXT: por %xmm0, %xmm1
; CHECK-YONAH-NEXT: movdqa %xmm1, %xmm0
; CHECK-YONAH-NEXT: retl
%C = shufflevector <8 x i16> %B, <8 x i16> %A, <8 x i32> < i32 undef, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0 >
ret <8 x i16> %C
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/X86/sse3.ll
@@ -8,18 +8,18 @@
define void @t0(<8 x i16>* %dest, <8 x i16>* %old) nounwind {
; X64-LABEL: t0:
; X64: ## BB#0: ## %entry
; X64-NEXT: pxor %xmm0, %xmm0
; X64-NEXT: movl $1, %eax
; X64-NEXT: movd %eax, %xmm0
; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; X64-NEXT: movdqa %xmm0, (%rdi)
; X64-NEXT: retq
entry:
%tmp3 = load <8 x i16>* %old
%tmp6 = shufflevector <8 x i16> %tmp3,
<8 x i16> < i16 0, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef >,
<8 x i16> < i16 1, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef >,
<8 x i32> < i32 8, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef >
store <8 x i16> %tmp6, <8 x i16>* %dest
ret void

}

define <8 x i16> @t1(<8 x i16>* %A, <8 x i16>* %B) nounwind {
