[X86] combineMulToPMADDWD - handle any pow2 vector type and split to legal types

combineMulToPMADDWD is currently limited to legal types, but there's no reason why we can't handle any larger type that the existing SplitOpsAndApply code can use to split to legal X86ISD::VPMADDWD ops.
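
For illustration, here is the kind of case this now covers (a sketch only, not one of the changed tests: the function name is made up and the operand shape is modeled on the and_mulhuw_v16i16 test in pmulh.ll below). A v16i32 multiply is wider than any legal AVX2 type, so the combine used to bail out; it can now form the fold and let SplitOpsAndApply emit two 256-bit VPMADDWD ops (four 128-bit ops on SSE-only targets):

```llvm
; Sketch only: hypothetical function, modeled on and_mulhuw_v16i16 below.
; Masking both operands to 15 bits keeps each 32-bit lane within the low
; signed i16, which is what the PMADDWD fold requires.
define <16 x i32> @mul_masked_v16i32(<16 x i32> %a, <16 x i32> %b) {
  %ma = and <16 x i32> %a, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %mb = and <16 x i32> %b, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  ; v16i32 is wider than any legal AVX2 type, so the old code gave up here;
  ; the combine can now form the fold and SplitOpsAndApply splits it into
  ; legal-width VPMADDWD ops during lowering.
  %m = mul <16 x i32> %ma, %mb
  ret <16 x i32> %m
}
```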

This also exposed a missed opportunity for pre-SSE41 targets to handle SEXT ops from types smaller than vXi16 - without PMOVSX instructions these will always be expanded to unpack+shifts, so we can cheat and convert this into a ZEXT(SEXT()) sequence to make it a valid PMADDWD op.
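
A minimal sketch of that pre-SSE41 case (hypothetical function name; the operand shapes follow the updated mul_2xi8_sext_zext and pmaddubsw_bad_extend tests below): the operand sign extended from i8 has no PMOVSX available, so converting it into a ZEXT of a vXi16 SEXT keeps the upper half of each 32-bit lane zero and the multiply can still be turned into a PMADDWD:

```llvm
; Sketch only: hypothetical function with the same operand shapes as the
; updated mul_2xi8_sext_zext / pmaddubsw_bad_extend tests.
define <4 x i32> @mul_sext_zext_v4i8(<4 x i8> %a, <4 x i8> %b) {
  ; Without SSE4.1 there is no PMOVSX, so this sign extension is expanded with
  ; unpacks and shifts anyway; the combine can now rewrite it as a sext to
  ; <4 x i16> followed by a zext to <4 x i32>, leaving the upper 16 bits of
  ; each lane zero so the multiply is still a valid PMADDWD.
  %sa = sext <4 x i8> %a to <4 x i32>
  ; The zero-extended operand already has its upper 16 bits clear.
  %zb = zext <4 x i8> %b to <4 x i32>
  %m = mul <4 x i32> %sa, %zb
  ret <4 x i32> %m
}
```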

Differential Revision: https://reviews.llvm.org/D110995
RKSimon committed Nov 9, 2021
1 parent cba40c4 commit d510fd2
Showing 6 changed files with 227 additions and 255 deletions.
50 changes: 36 additions & 14 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -44441,29 +44441,45 @@ static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
EVT VT = N->getValueType(0);

// Only support vXi32 vectors.
// TODO: Can we support > 32-bit elements?
if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
return SDValue();

// Make sure the type is legal or will be widened to a legal type.
if (VT != MVT::v2i32 && !DAG.getTargetLoweringInfo().isTypeLegal(VT))
// Make sure the type is legal or can split/widen to a legal type.
// With AVX512 but without BWI, we would need to split v32i16.
unsigned NumElts = VT.getVectorNumElements();
if (NumElts == 1 || !isPowerOf2_32(NumElts))
return SDValue();

MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements());
EVT WVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, 2 * NumElts);

// Without BWI, we would need to split v32i16.
if (WVT == MVT::v32i16 && !Subtarget.hasBWI())
// With AVX512 but without BWI, we would need to split v32i16.
if (32 <= (2 * NumElts) && Subtarget.hasAVX512() && !Subtarget.hasBWI())
return SDValue();

SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);

// If we are zero extending two steps without SSE4.1, it's better to reduce
// If we are zero/sign extending two steps without SSE4.1, it's better to
// reduce the vmul width instead.
if (!Subtarget.hasSSE41() &&
(((N0.getOpcode() == ISD::ZERO_EXTEND &&
N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
(N1.getOpcode() == ISD::ZERO_EXTEND &&
N1.getOperand(0).getScalarValueSizeInBits() <= 8)) ||
((N0.getOpcode() == ISD::SIGN_EXTEND &&
N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
(N1.getOpcode() == ISD::SIGN_EXTEND &&
N1.getOperand(0).getScalarValueSizeInBits() <= 8))))
return SDValue();

// If we are sign extending a wide vector without SSE4.1, it's better to reduce
// the vmul width instead.
if (!Subtarget.hasSSE41() &&
(N0.getOpcode() == ISD::ZERO_EXTEND &&
N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
(N1.getOpcode() == ISD::ZERO_EXTEND &&
N1.getOperand(0).getScalarValueSizeInBits() <= 8))
(N0.getOpcode() == ISD::SIGN_EXTEND &&
N0.getOperand(0).getValueSizeInBits() > 128) &&
(N1.getOpcode() == ISD::SIGN_EXTEND &&
N1.getOperand(0).getValueSizeInBits() > 128))
return SDValue();

// Sign bits must extend down to the lowest i16.
@@ -44480,12 +44496,18 @@ static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode()))
return DAG.getNode(ISD::AND, SDLoc(N), VT, Op,
DAG.getConstant(0xFFFF, SDLoc(N), VT));
// Convert sext(vXi16) to zext(vXi16).
if (Op.getOpcode() == ISD::SIGN_EXTEND && VT.getSizeInBits() <= 128 &&
N->isOnlyUserOf(Op.getNode())) {
if (Op.getOpcode() == ISD::SIGN_EXTEND && N->isOnlyUserOf(Op.getNode())) {
SDValue Src = Op.getOperand(0);
if (Src.getScalarValueSizeInBits() == 16)
// Convert sext(vXi16) to zext(vXi16).
if (Src.getScalarValueSizeInBits() == 16 && VT.getSizeInBits() <= 128)
return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Src);
// Convert sext(vXi8) to zext(vXi16 sext(vXi8)) on pre-SSE41 targets
// which will expand the extension.
if (Src.getScalarValueSizeInBits() < 16 && !Subtarget.hasSSE41()) {
EVT ExtVT = VT.changeVectorElementType(MVT::i16);
Src = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), ExtVT, Src);
return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Src);
}
}
// Convert SIGN_EXTEND_VECTOR_INREG to ZERO_EXTEND_VECTOR_INREG.
if (Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
78 changes: 34 additions & 44 deletions llvm/test/CodeGen/X86/pmaddubsw.ll
@@ -296,29 +296,27 @@ define <8 x i16> @pmaddubsw_bad_extend(<16 x i8>* %Aptr, <16 x i8>* %Bptr) {
; SSE-LABEL: pmaddubsw_bad_extend:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm1
; SSE-NEXT: movdqa (%rsi), %xmm0
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT: pand %xmm0, %xmm2
; SSE-NEXT: movdqa %xmm1, %xmm3
; SSE-NEXT: psllw $8, %xmm3
; SSE-NEXT: psraw $8, %xmm3
; SSE-NEXT: movdqa %xmm3, %xmm4
; SSE-NEXT: pmulhw %xmm2, %xmm4
; SSE-NEXT: pmullw %xmm2, %xmm3
; SSE-NEXT: movdqa %xmm3, %xmm2
; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SSE-NEXT: psraw $8, %xmm0
; SSE-NEXT: psrlw $8, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm4
; SSE-NEXT: pmulhw %xmm0, %xmm4
; SSE-NEXT: pmullw %xmm0, %xmm1
; SSE-NEXT: movdqa (%rsi), %xmm2
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; SSE-NEXT: paddd %xmm2, %xmm0
; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
; SSE-NEXT: paddd %xmm3, %xmm1
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: psllw $8, %xmm0
; SSE-NEXT: psraw $8, %xmm0
; SSE-NEXT: movdqa %xmm2, %xmm3
; SSE-NEXT: psraw $8, %xmm3
; SSE-NEXT: movdqa %xmm2, %xmm4
; SSE-NEXT: pshufb {{.*#+}} xmm4 = xmm4[0],zero,xmm4[2],zero,xmm4[4],zero,xmm4[6],zero,xmm4[u,u,u,u,u,u,u,u]
; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; SSE-NEXT: movdqa %xmm1, %xmm5
; SSE-NEXT: psrlw $8, %xmm5
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,3,2,3]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
; SSE-NEXT: pmaddwd %xmm4, %xmm0
; SSE-NEXT: pshufb {{.*#+}} xmm2 = xmm2[8],zero,xmm2[10],zero,xmm2[12],zero,xmm2[14],zero,xmm2[u,u,u,u,u,u,u,u]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; SSE-NEXT: pshufb {{.*#+}} xmm1 = xmm1[9],zero,xmm1[11],zero,xmm1[13],zero,xmm1[15],zero,xmm1[u,u,u,u,u,u,u,u]
; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3]
; SSE-NEXT: pmaddwd %xmm2, %xmm6
; SSE-NEXT: packssdw %xmm6, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: pmaddubsw_bad_extend:
@@ -395,30 +393,22 @@ define <8 x i16> @pmaddubsw_bad_indices(<16 x i8>* %Aptr, <16 x i8>* %Bptr) {
; SSE-LABEL: pmaddubsw_bad_indices:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm1
; SSE-NEXT: movdqa (%rsi), %xmm0
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT: pand %xmm0, %xmm2
; SSE-NEXT: movdqa %xmm1, %xmm3
; SSE-NEXT: pshufb {{.*#+}} xmm3 = xmm3[u,1,u,2,u,5,u,6,u,9,u,10,u,13,u,14]
; SSE-NEXT: psraw $8, %xmm3
; SSE-NEXT: movdqa %xmm3, %xmm4
; SSE-NEXT: pmulhw %xmm2, %xmm4
; SSE-NEXT: pmullw %xmm2, %xmm3
; SSE-NEXT: movdqa %xmm3, %xmm2
; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SSE-NEXT: psrlw $8, %xmm0
; SSE-NEXT: movdqa (%rsi), %xmm2
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,1,u,2,u,5,u,6,u,9,u,10,u,13,u,14]
; SSE-NEXT: psraw $8, %xmm0
; SSE-NEXT: pshufb {{.*#+}} xmm1 = xmm1[u,0,u,3,u,4,u,7,u,8,u,11,u,12,u,15]
; SSE-NEXT: psraw $8, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm4
; SSE-NEXT: pmulhw %xmm0, %xmm4
; SSE-NEXT: pmullw %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; SSE-NEXT: paddd %xmm2, %xmm0
; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
; SSE-NEXT: paddd %xmm3, %xmm1
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: pxor %xmm3, %xmm3
; SSE-NEXT: movdqa %xmm2, %xmm4
; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
; SSE-NEXT: movdqa %xmm0, %xmm5
; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
; SSE-NEXT: pmaddwd %xmm4, %xmm5
; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: pmaddwd %xmm2, %xmm0
; SSE-NEXT: packssdw %xmm5, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: pmaddubsw_bad_indices:
55 changes: 25 additions & 30 deletions llvm/test/CodeGen/X86/pmulh.ll
@@ -326,32 +326,27 @@ define <16 x i16> @zext_mulhuw_v16i16(<16 x i16> %a, <16 x i16> %b) {
define <16 x i16> @and_mulhuw_v16i16(<16 x i32> %a, <16 x i32> %b) {
; SSE2-LABEL: and_mulhuw_v16i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767,32767,32767]
; SSE2-NEXT: pand %xmm8, %xmm3
; SSE2-NEXT: pand %xmm8, %xmm2
; SSE2-NEXT: packssdw %xmm3, %xmm2
; SSE2-NEXT: pand %xmm8, %xmm1
; SSE2-NEXT: pand %xmm8, %xmm0
; SSE2-NEXT: packssdw %xmm1, %xmm0
; SSE2-NEXT: pand %xmm8, %xmm7
; SSE2-NEXT: pand %xmm8, %xmm6
; SSE2-NEXT: packssdw %xmm7, %xmm6
; SSE2-NEXT: pmulhw %xmm2, %xmm6
; SSE2-NEXT: pand %xmm8, %xmm5
; SSE2-NEXT: pand %xmm8, %xmm4
; SSE2-NEXT: packssdw %xmm5, %xmm4
; SSE2-NEXT: pmulhw %xmm0, %xmm4
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm6, %xmm1
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
; SSE2-NEXT: packssdw %xmm1, %xmm6
; SSE2-NEXT: movdqa %xmm4, %xmm1
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
; SSE2-NEXT: packssdw %xmm1, %xmm4
; SSE2-NEXT: movdqa %xmm4, %xmm0
; SSE2-NEXT: movdqa %xmm6, %xmm1
; SSE2-NEXT: movdqa %xmm6, %xmm8
; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [32767,32767,32767,32767]
; SSE2-NEXT: pand %xmm6, %xmm3
; SSE2-NEXT: pand %xmm6, %xmm2
; SSE2-NEXT: pand %xmm6, %xmm1
; SSE2-NEXT: pand %xmm6, %xmm0
; SSE2-NEXT: pand %xmm6, %xmm7
; SSE2-NEXT: pmaddwd %xmm3, %xmm7
; SSE2-NEXT: pand %xmm6, %xmm8
; SSE2-NEXT: pmaddwd %xmm2, %xmm8
; SSE2-NEXT: pand %xmm6, %xmm5
; SSE2-NEXT: pmaddwd %xmm1, %xmm5
; SSE2-NEXT: pand %xmm4, %xmm6
; SSE2-NEXT: pmaddwd %xmm6, %xmm0
; SSE2-NEXT: psrld $16, %xmm7
; SSE2-NEXT: psrld $16, %xmm8
; SSE2-NEXT: packssdw %xmm7, %xmm8
; SSE2-NEXT: psrld $16, %xmm5
; SSE2-NEXT: psrld $16, %xmm0
; SSE2-NEXT: packssdw %xmm5, %xmm0
; SSE2-NEXT: movdqa %xmm8, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: and_mulhuw_v16i16:
@@ -382,12 +377,12 @@ define <16 x i16> @and_mulhuw_v16i16(<16 x i32> %a, <16 x i32> %b) {
; AVX2-LABEL: and_mulhuw_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32767,32767,32767,32767,32767,32767,32767,32767]
; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpmaddwd %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm2
; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX2-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
24 changes: 12 additions & 12 deletions llvm/test/CodeGen/X86/shrink_vmul.ll
@@ -985,16 +985,16 @@ define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonl
; X86-SSE-NEXT: movl c, %ecx
; X86-SSE-NEXT: movzwl (%esi,%eax), %esi
; X86-SSE-NEXT: movd %esi, %xmm0
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; X86-SSE-NEXT: psrad $24, %xmm0
; X86-SSE-NEXT: movzwl (%edx,%eax), %edx
; X86-SSE-NEXT: movd %edx, %xmm1
; X86-SSE-NEXT: pxor %xmm2, %xmm2
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-SSE-NEXT: pmaddwd %xmm0, %xmm1
; X86-SSE-NEXT: movq %xmm1, (%ecx,%eax,4)
; X86-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE-NEXT: psraw $8, %xmm0
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X86-SSE-NEXT: pmaddwd %xmm1, %xmm0
; X86-SSE-NEXT: movq %xmm0, (%ecx,%eax,4)
; X86-SSE-NEXT: popl %esi
; X86-SSE-NEXT: retl
;
@@ -1021,16 +1021,16 @@ define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonl
; X64-SSE-NEXT: movq c(%rip), %rax
; X64-SSE-NEXT: movzwl (%rdi,%rdx), %ecx
; X64-SSE-NEXT: movd %ecx, %xmm0
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; X64-SSE-NEXT: psrad $24, %xmm0
; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx
; X64-SSE-NEXT: movd %ecx, %xmm1
; X64-SSE-NEXT: pxor %xmm2, %xmm2
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X64-SSE-NEXT: pmaddwd %xmm0, %xmm1
; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4)
; X64-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-SSE-NEXT: psraw $8, %xmm0
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X64-SSE-NEXT: pmaddwd %xmm1, %xmm0
; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_2xi8_sext_zext:
