[X86][SSE] Reapplied: Simplify vector LOAD + EXTEND on pre-SSE41 hardware

Improve extension of vectors on hardware without dedicated VSEXT/VZEXT instructions.

We already convert these to SIGN_EXTEND_VECTOR_INREG/ZERO_EXTEND_VECTOR_INREG, but we can improve this further by letting the legalizer split them instead of prematurely splitting into legal vectors in the combine, since that early split only really helps when lowering to VSEXT/VZEXT.

Removes a lot of unnecessary any_extend + mask patterns (fix for PR25718).
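For illustration, the kind of IR this targets is a plain extending vector load, as exercised by the load_zext_4i8_to_4i64 test in vector-zext.ll further down; the function below is a sketch of that shape rather than a copy of the test:

define <4 x i64> @load_zext_4i8_to_4i64(<4 x i8>* %ptr) {
entry:
  %x = load <4 x i8>, <4 x i8>* %ptr
  %y = zext <4 x i8> %x to <4 x i64>
  ret <4 x i64> %y
}

On SSE2 the old output masked each lane with pand (the removed lines in the test diff); the new output zero-extends by unpacking against a zeroed register.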

Reapplied with a fix for PR26953 (missing vector widening legalization).
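The widening path is the one exercised by convert_v3i8_to_v3f32 in widen_conv-3.ll (also updated below), where the <3 x i8> source must be widened during type legalization; the IR here is a sketch of that shape, not a copy of the test:

define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr) nounwind {
entry:
  %val = load <3 x i8>, <3 x i8>* %src.addr
  %ext = sitofp <3 x i8> %val to <3 x float>
  store <3 x float> %ext, <3 x float>* %dst.addr
  ret void
}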

Differential Revision: http://reviews.llvm.org/D17932

llvm-svn: 264062
RKSimon committed Mar 22, 2016
1 parent d83633f commit 25fb417
Showing 6 changed files with 210 additions and 77 deletions.
2 changes: 2 additions & 0 deletions llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -653,6 +653,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
void SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_ExtendOp(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_InregOp(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_ExtVecInRegOp(SDNode *N, SDValue &Lo, SDValue &Hi);

void SplitVecRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi);
@@ -713,6 +714,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
SDValue WidenVecRes_BUILD_VECTOR(SDNode* N);
SDValue WidenVecRes_CONCAT_VECTORS(SDNode* N);
SDValue WidenVecRes_CONVERT_RNDSAT(SDNode* N);
SDValue WidenVecRes_EXTEND_VECTOR_INREG(SDNode* N);
SDValue WidenVecRes_EXTRACT_SUBVECTOR(SDNode* N);
SDValue WidenVecRes_INSERT_VECTOR_ELT(SDNode* N);
SDValue WidenVecRes_LOAD(SDNode* N);
100 changes: 100 additions & 0 deletions llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -621,6 +621,12 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
    SplitVecRes_VECTOR_SHUFFLE(cast<ShuffleVectorSDNode>(N), Lo, Hi);
    break;

  case ISD::ANY_EXTEND_VECTOR_INREG:
  case ISD::SIGN_EXTEND_VECTOR_INREG:
  case ISD::ZERO_EXTEND_VECTOR_INREG:
    SplitVecRes_ExtVecInRegOp(N, Lo, Hi);
    break;

  case ISD::BITREVERSE:
  case ISD::BSWAP:
  case ISD::CONVERT_RNDSAT:
@@ -917,6 +923,39 @@ void DAGTypeLegalizer::SplitVecRes_InregOp(SDNode *N, SDValue &Lo,
                   DAG.getValueType(HiVT));
}

void DAGTypeLegalizer::SplitVecRes_ExtVecInRegOp(SDNode *N, SDValue &Lo,
                                                 SDValue &Hi) {
  unsigned Opcode = N->getOpcode();
  SDValue N0 = N->getOperand(0);

  SDLoc dl(N);
  SDValue InLo, InHi;
  GetSplitVector(N0, InLo, InHi);
  EVT InLoVT = InLo.getValueType();
  unsigned InNumElements = InLoVT.getVectorNumElements();

  EVT OutLoVT, OutHiVT;
  std::tie(OutLoVT, OutHiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
  unsigned OutNumElements = OutLoVT.getVectorNumElements();
  assert((2 * OutNumElements) <= InNumElements &&
         "Illegal extend vector in reg split");

  // *_EXTEND_VECTOR_INREG instructions extend the lowest elements of the
  // input vector (i.e. we only use InLo):
  // OutLo will extend the first OutNumElements from InLo.
  // OutHi will extend the next OutNumElements from InLo.

  // Shuffle the elements from InLo for OutHi into the bottom elements to
  // create a 'fake' InHi.
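  // For example (types chosen for illustration only): splitting a
  // zero_extend_vector_inreg from v32i8 to v8i32 gives OutLoVT = OutHiVT =
  // v4i32 and InLo/InHi = v16i8; OutHi needs InLo elements [4..7], so
  // SplitHi = <4,5,6,7,-1,...> moves them into the bottom lanes first.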
  SmallVector<int, 8> SplitHi(InNumElements, -1);
  for (unsigned i = 0; i != OutNumElements; ++i)
    SplitHi[i] = i + OutNumElements;
  InHi = DAG.getVectorShuffle(InLoVT, dl, InLo, DAG.getUNDEF(InLoVT), SplitHi);

  Lo = DAG.getNode(Opcode, dl, OutLoVT, InLo);
  Hi = DAG.getNode(Opcode, dl, OutHiVT, InHi);
}

void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo,
                                                     SDValue &Hi) {
  SDValue Vec = N->getOperand(0);
@@ -2069,6 +2108,12 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
    Res = WidenVecRes_Shift(N);
    break;

  case ISD::ANY_EXTEND_VECTOR_INREG:
  case ISD::SIGN_EXTEND_VECTOR_INREG:
  case ISD::ZERO_EXTEND_VECTOR_INREG:
    Res = WidenVecRes_EXTEND_VECTOR_INREG(N);
    break;

  case ISD::ANY_EXTEND:
  case ISD::FP_EXTEND:
  case ISD::FP_ROUND:
@@ -2355,6 +2400,61 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) {
  return DAG.getNode(ISD::BUILD_VECTOR, DL, WidenVT, Ops);
}

SDValue DAGTypeLegalizer::WidenVecRes_EXTEND_VECTOR_INREG(SDNode *N) {
  unsigned Opcode = N->getOpcode();
  SDValue InOp = N->getOperand(0);
  SDLoc DL(N);

  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
  EVT WidenSVT = WidenVT.getVectorElementType();
  unsigned WidenNumElts = WidenVT.getVectorNumElements();

  EVT InVT = InOp.getValueType();
  EVT InSVT = InVT.getVectorElementType();
  unsigned InVTNumElts = InVT.getVectorNumElements();

  if (getTypeAction(InVT) == TargetLowering::TypeWidenVector) {
    InOp = GetWidenedVector(InOp);
    InVT = InOp.getValueType();
    if (InVT.getSizeInBits() == WidenVT.getSizeInBits()) {
      switch (Opcode) {
      case ISD::ANY_EXTEND_VECTOR_INREG:
        return DAG.getAnyExtendVectorInReg(InOp, DL, WidenVT);
      case ISD::SIGN_EXTEND_VECTOR_INREG:
        return DAG.getSignExtendVectorInReg(InOp, DL, WidenVT);
      case ISD::ZERO_EXTEND_VECTOR_INREG:
        return DAG.getZeroExtendVectorInReg(InOp, DL, WidenVT);
      }
    }
  }

  // Unroll, extend the scalars and rebuild the vector.
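  // (This path is reached when the operand is not itself being widened to a
  // vector of the same overall size as the widened result, so a single
  // *_EXTEND_VECTOR_INREG of the widened type cannot be formed above.)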
  SmallVector<SDValue, 16> Ops;
  for (unsigned i = 0, e = std::min(InVTNumElts, WidenNumElts); i != e; ++i) {
    SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, InSVT, InOp,
        DAG.getConstant(i, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
    switch (Opcode) {
    case ISD::ANY_EXTEND_VECTOR_INREG:
      Val = DAG.getNode(ISD::ANY_EXTEND, DL, WidenSVT, Val);
      break;
    case ISD::SIGN_EXTEND_VECTOR_INREG:
      Val = DAG.getNode(ISD::SIGN_EXTEND, DL, WidenSVT, Val);
      break;
    case ISD::ZERO_EXTEND_VECTOR_INREG:
      Val = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenSVT, Val);
      break;
    default:
      llvm_unreachable("A *_EXTEND_VECTOR_INREG node was expected");
    }
    Ops.push_back(Val);
  }

  while (Ops.size() != WidenNumElts)
    Ops.push_back(DAG.getUNDEF(WidenSVT));

  return DAG.getNode(ISD::BUILD_VECTOR, DL, WidenVT, Ops);
}

SDValue DAGTypeLegalizer::WidenVecRes_FCOPYSIGN(SDNode *N) {
  // If this is an FCOPYSIGN with same input types, we can treat it as a
  // normal (can trap) binary op.
4 changes: 3 additions & 1 deletion llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -28634,7 +28634,9 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,

  // If target-size is 128-bits (or 256-bits on AVX2 target), then convert to
  // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT.
  if (VT.is128BitVector() || (VT.is256BitVector() && Subtarget.hasInt256())) {
  // Also use this if we don't have SSE41, to allow the legalizer to do its job.
  if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
      (VT.is256BitVector() && Subtarget.hasInt256())) {
    SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
    return Opcode == ISD::SIGN_EXTEND
               ? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
99 changes: 47 additions & 52 deletions llvm/test/CodeGen/X86/vector-zext.ll
@@ -544,23 +544,20 @@ define <4 x i64> @load_zext_4i8_to_4i64(<4 x i8> *%ptr) {
; SSE2-LABEL: load_zext_4i8_to_4i64:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_zext_4i8_to_4i64:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[8],zero,zero,zero,zero,zero,zero,zero,xmm1[12],zero,zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_zext_4i8_to_4i64:
@@ -625,22 +622,21 @@ define <8 x i32> @load_zext_8i8_to_8i32(<8 x i8> *%ptr) {
; SSE2-LABEL: load_zext_8i8_to_8i32:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_zext_8i8_to_8i32:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[6],zero,zero,zero
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[8],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[14],zero,zero,zero
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_zext_8i8_to_8i32:
@@ -674,34 +670,33 @@ entry:
define <8 x i64> @load_zext_8i8_to_8i64(<8 x i8> *%ptr) {
; SSE2-LABEL: load_zext_8i8_to_8i64:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movq {{.*#+}} xmm3 = mem[0],zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,3]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,6,7]
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,1,2,3]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,6,7]
; SSE2-NEXT: pand %xmm4, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,3,3]
; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,6,7]
; SSE2-NEXT: pand %xmm4, %xmm3
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_zext_8i8_to_8i64:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: movq {{.*#+}} xmm3 = mem[0],zero
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
; SSSE3-NEXT: movdqa %xmm3, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: movdqa %xmm3, %xmm1
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[4],zero,zero,zero,zero,zero,zero,zero,xmm1[6],zero,zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,128,128,128,128,128,128,128,1,128,128,128,128,128,128,128]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: pshufb %xmm4, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2,128,128,128,128,128,128,128,3,128,128,128,128,128,128,128]
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
; SSSE3-NEXT: pshufb %xmm5, %xmm1
; SSSE3-NEXT: movdqa %xmm3, %xmm2
; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[8],zero,zero,zero,zero,zero,zero,zero,xmm2[10],zero,zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: pshufb {{.*#+}} xmm3 = xmm3[12],zero,zero,zero,zero,zero,zero,zero,xmm3[14],zero,zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: pshufb %xmm4, %xmm2
; SSSE3-NEXT: pshufb %xmm5, %xmm3
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_zext_8i8_to_8i64:
@@ -851,21 +846,21 @@ define <4 x i64> @load_zext_4i16_to_4i64(<4 x i16> *%ptr) {
; SSE2-LABEL: load_zext_4i16_to_4i64:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,0,65535,0,0,0]
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_zext_4i16_to_4i64:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[4,5],zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[8,9],zero,zero,zero,zero,zero,zero,xmm1[12,13],zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_zext_4i16_to_4i64:
40 changes: 28 additions & 12 deletions llvm/test/CodeGen/X86/widen_conv-3.ll
@@ -49,18 +49,27 @@ entry:
define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr) nounwind {
; X86-SSE2-LABEL: convert_v3i8_to_v3f32:
; X86-SSE2: # BB#0: # %entry
; X86-SSE2-NEXT: pushl %eax
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT: pushl %ebp
; X86-SSE2-NEXT: movl %esp, %ebp
; X86-SSE2-NEXT: pushl %esi
; X86-SSE2-NEXT: andl $-16, %esp
; X86-SSE2-NEXT: subl $32, %esp
; X86-SSE2-NEXT: movl 8(%ebp), %eax
; X86-SSE2-NEXT: movl 12(%ebp), %ecx
; X86-SSE2-NEXT: movzwl (%ecx), %edx
; X86-SSE2-NEXT: movd %edx, %xmm0
; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; X86-SSE2-NEXT: movdqa %xmm0, (%esp)
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE2-NEXT: shll $8, %edx
; X86-SSE2-NEXT: movzbl (%esp), %esi
; X86-SSE2-NEXT: orl %edx, %esi
; X86-SSE2-NEXT: pinsrw $0, %esi, %xmm0
; X86-SSE2-NEXT: movzbl 2(%ecx), %ecx
; X86-SSE2-NEXT: movd %ecx, %xmm1
; X86-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; X86-SSE2-NEXT: pslld $24, %xmm0
; X86-SSE2-NEXT: pinsrw $1, %ecx, %xmm0
; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; X86-SSE2-NEXT: psrad $24, %xmm0
; X86-SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
; X86-SSE2-NEXT: movss %xmm0, (%eax)
@@ -69,7 +78,9 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr)
; X86-SSE2-NEXT: movss %xmm1, 8(%eax)
; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; X86-SSE2-NEXT: movss %xmm0, 4(%eax)
; X86-SSE2-NEXT: popl %eax
; X86-SSE2-NEXT: leal -4(%ebp), %esp
; X86-SSE2-NEXT: popl %esi
; X86-SSE2-NEXT: popl %ebp
; X86-SSE2-NEXT: retl
;
; X86-SSE42-LABEL: convert_v3i8_to_v3f32:
@@ -99,11 +110,16 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr)
; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; X64-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; X64-SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: movl -{{[0-9]+}}(%rsp), %eax
; X64-SSE2-NEXT: shll $8, %eax
; X64-SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
; X64-SSE2-NEXT: orl %eax, %ecx
; X64-SSE2-NEXT: pinsrw $0, %ecx, %xmm0
; X64-SSE2-NEXT: movzbl 2(%rsi), %eax
; X64-SSE2-NEXT: movd %eax, %xmm1
; X64-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; X64-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; X64-SSE2-NEXT: pslld $24, %xmm0
; X64-SSE2-NEXT: pinsrw $1, %eax, %xmm0
; X64-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; X64-SSE2-NEXT: psrad $24, %xmm0
; X64-SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
; X64-SSE2-NEXT: movlps %xmm0, (%rdi)
