[X86] Fold ADD(VPMADDWD(X,Y),VPMADDWD(Z,W)) -> VPMADDWD(SHUFFLE(X,Z), SHUFFLE(Y,W))

Merge the addition of two VPMADDWD nodes if the upper element of each 16-bit pair is unused (i.e. it's zero) - we can generalize this to either element of the pair if we one day create VPMADDWD nodes with zero lower elements.

There are still a number of issues with extending/shuffling 256/512-bit VPMADDWD nodes, so this initially only works for the v2i32/v4i32 cases - I'm working on removing these limitations, but there's still a bit of yak shaving to go.
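As a quick sanity check on the identity behind the fold (not part of the patch; the helper and values are illustrative), here is a minimal scalar C++ sketch: once the odd/upper element of every 16-bit pair is zero, each 32-bit lane of VPMADDWD reduces to a single product, so the sum of the two nodes equals one VPMADDWD whose operands interleave the even elements of the original inputs.

#include <array>
#include <cassert>
#include <cstdint>

// Scalar model of VPMADDWD: each i32 result lane is
// a[2i]*b[2i] + a[2i+1]*b[2i+1] (signed i16 inputs, i32 accumulation).
static std::array<int32_t, 4> pmaddwd(const std::array<int16_t, 8> &A,
                                      const std::array<int16_t, 8> &B) {
  std::array<int32_t, 4> R{};
  for (int I = 0; I != 4; ++I)
    R[I] = int32_t(A[2 * I]) * B[2 * I] + int32_t(A[2 * I + 1]) * B[2 * I + 1];
  return R;
}

int main() {
  // Upper element of every i16 pair is zero, as the combine requires.
  std::array<int16_t, 8> X{1, 0, 2, 0, 3, 0, 4, 0}, Y{5, 0, 6, 0, 7, 0, 8, 0};
  std::array<int16_t, 8> Z{9, 0, 10, 0, 11, 0, 12, 0};
  std::array<int16_t, 8> W{13, 0, 14, 0, 15, 0, 16, 0};

  // ADD(VPMADDWD(X,Y), VPMADDWD(Z,W)).
  std::array<int32_t, 4> XY = pmaddwd(X, Y), ZW = pmaddwd(Z, W), Sum{};
  for (int I = 0; I != 4; ++I)
    Sum[I] = XY[I] + ZW[I];

  // VPMADDWD(SHUFFLE(X,Z), SHUFFLE(Y,W)): the interleave below mirrors the
  // {0,8, 2,10, 4,12, 6,14} shuffle mask built by the new combine.
  std::array<int16_t, 8> LHS{}, RHS{};
  for (int I = 0; I != 4; ++I) {
    LHS[2 * I] = X[2 * I];
    LHS[2 * I + 1] = Z[2 * I];
    RHS[2 * I] = Y[2 * I];
    RHS[2 * I + 1] = W[2 * I];
  }
  std::array<int32_t, 4> Folded = pmaddwd(LHS, RHS);

  for (int I = 0; I != 4; ++I)
    assert(Sum[I] == Folded[I]);
  return 0;
}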
RKSimon committed Sep 26, 2021
1 parent 175c1a3 commit 3fe9767
Showing 3 changed files with 104 additions and 82 deletions.
53 changes: 49 additions & 4 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -51098,6 +51098,50 @@ static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
PMADDBuilder);
}

// ADD(VPMADDWD(X,Y),VPMADDWD(Z,W)) -> VPMADDWD(SHUFFLE(X,Z), SHUFFLE(Y,W))
// If upper element in each pair of both VPMADDWD are zero then we can merge
// the operand elements and use the implicit add of VPMADDWD.
// TODO: Add support for VPMADDUBSW (which isn't commutable).
static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1,
const SDLoc &DL, EVT VT) {
if (N0.getOpcode() != N1.getOpcode() || N0.getOpcode() != X86ISD::VPMADDWD)
return SDValue();

// TODO: Add 256/512-bit support once VPMADDWD combines with shuffles.
if (VT.getSizeInBits() > 128)
return SDValue();

unsigned NumElts = VT.getVectorNumElements();
MVT OpVT = N0.getOperand(0).getSimpleValueType();
APInt DemandedBits = APInt::getAllOnes(OpVT.getScalarSizeInBits());
APInt DemandedHiElts = APInt::getSplat(2 * NumElts, APInt(2, 2));

bool Op0HiZero =
DAG.MaskedValueIsZero(N0.getOperand(0), DemandedBits, DemandedHiElts) ||
DAG.MaskedValueIsZero(N0.getOperand(1), DemandedBits, DemandedHiElts);
bool Op1HiZero =
DAG.MaskedValueIsZero(N1.getOperand(0), DemandedBits, DemandedHiElts) ||
DAG.MaskedValueIsZero(N1.getOperand(1), DemandedBits, DemandedHiElts);

// TODO: Check for zero lower elements once we have actual codegen that
// creates them.
if (!Op0HiZero || !Op1HiZero)
return SDValue();

// Create a shuffle mask packing the lower elements from each VPMADDWD.
SmallVector<int> Mask;
for (int i = 0; i != (int)NumElts; ++i) {
Mask.push_back(2 * i);
Mask.push_back(2 * (i + NumElts));
}

SDValue LHS =
DAG.getVectorShuffle(OpVT, DL, N0.getOperand(0), N1.getOperand(0), Mask);
SDValue RHS =
DAG.getVectorShuffle(OpVT, DL, N0.getOperand(1), N1.getOperand(1), Mask);
return DAG.getNode(X86ISD::VPMADDWD, DL, VT, LHS, RHS);
}
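An aside on the demanded-elements mask above (not part of the patch): APInt::getSplat(2 * NumElts, APInt(2, 2)) replicates the 2-bit pattern 0b10 across the 2 * NumElts i16 lanes, so the MaskedValueIsZero queries only ask about the odd (upper) element of each pair. A small standalone sketch, assuming NumElts == 4 (v4i32 result, v8i16 operands):

#include "llvm/ADT/APInt.h"
#include <cassert>

// Hypothetical check of the mask shape: for NumElts == 4 the splat of the
// 2-bit value 0b10 sets bits 1,3,5,7 and clears bits 0,2,4,6, i.e. only the
// upper i16 of each pair is demanded.
static void checkDemandedHiElts() {
  unsigned NumElts = 4;
  llvm::APInt DemandedHiElts =
      llvm::APInt::getSplat(2 * NumElts, llvm::APInt(2, 2));
  for (unsigned I = 0; I != 2 * NumElts; ++I)
    assert(DemandedHiElts[I] == (I % 2 == 1));
}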

/// CMOV of constants requires materializing constant operands in registers.
/// Try to fold those constants into an 'add' instruction to reduce instruction
/// count. We do this with CMOV rather the generic 'select' because there are
@@ -51167,13 +51211,16 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
EVT VT = N->getValueType(0);
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
SDLoc DL(N);

if (SDValue Select = pushAddIntoCmovOfConsts(N, DAG))
return Select;

if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, DL, VT, Subtarget))
return MAdd;
if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, DL, VT, Subtarget))
return MAdd;
if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
if (SDValue MAdd = combineAddOfPMADDWD(DAG, Op0, Op1, DL, VT))
return MAdd;

// Try to synthesize horizontal adds from adds of shuffles.
@@ -51190,15 +51237,13 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
if (Op0.getOpcode() == ISD::ZERO_EXTEND &&
Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
TLI.isTypeLegal(Op0.getOperand(0).getValueType())) {
SDLoc DL(N);
SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op0.getOperand(0));
return DAG.getNode(ISD::SUB, DL, VT, Op1, SExt);
}

if (Op1.getOpcode() == ISD::ZERO_EXTEND &&
Op1.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
TLI.isTypeLegal(Op1.getOperand(0).getValueType())) {
SDLoc DL(N);
SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op1.getOperand(0));
return DAG.getNode(ISD::SUB, DL, VT, Op0, SExt);
}
58 changes: 19 additions & 39 deletions llvm/test/CodeGen/X86/madd.ll
@@ -2573,37 +2573,31 @@ define <4 x i32> @pmaddwd_bad_indices(<8 x i16>* %Aptr, <8 x i16>* %Bptr) {
; SSE2-LABEL: pmaddwd_bad_indices:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm1
; SSE2-NEXT: movdqa (%rsi), %xmm0
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,1,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,0,65535,0,65535,0]
; SSE2-NEXT: pand %xmm0, %xmm4
; SSE2-NEXT: pmaddwd %xmm2, %xmm4
; SSE2-NEXT: movdqa (%rsi), %xmm2
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[2,1,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7]
; SSE2-NEXT: psrld $16, %xmm2
; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,7,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; SSE2-NEXT: psrld $16, %xmm0
; SSE2-NEXT: pmaddwd %xmm1, %xmm0
; SSE2-NEXT: paddd %xmm4, %xmm0
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: pmaddwd %xmm3, %xmm0
; SSE2-NEXT: retq
;
; AVX-LABEL: pmaddwd_bad_indices:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa (%rsi), %xmm1
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,3],zero,zero,xmm0[4,5],zero,zero,xmm0[10,11],zero,zero,xmm0[12,13],zero,zero
; AVX-NEXT: vpmaddwd %xmm2, %xmm3, %xmm2
; AVX-NEXT: vpsrld $16, %xmm1, %xmm1
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[6,7],zero,zero,xmm0[8,9],zero,zero,xmm0[14,15],zero,zero
; AVX-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpaddd %xmm0, %xmm2, %xmm0
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = mem[1,0,2,3,4,5,6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7]
; AVX-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0
; AVX-NEXT: retq
%A = load <8 x i16>, <8 x i16>* %Aptr
%B = load <8 x i16>, <8 x i16>* %Bptr
@@ -3096,28 +3090,14 @@ define <4 x i32> @output_size_mismatch_high_subvector(<16 x i16> %x, <16 x i16>
; AVX1-LABEL: output_size_mismatch_high_subvector:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX1-NEXT: vpmaddwd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX256-LABEL: output_size_mismatch_high_subvector:
; AVX256: # %bb.0:
; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX256-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX256-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; AVX256-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX256-NEXT: vpmaddwd %xmm3, %xmm2, %xmm2
; AVX256-NEXT: vpsrld $16, %xmm1, %xmm1
; AVX256-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX256-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
; AVX256-NEXT: vpaddd %xmm0, %xmm2, %xmm0
; AVX256-NEXT: vzeroupper
; AVX256-NEXT: retq
%x0 = shufflevector <16 x i16> %x, <16 x i16> undef, <4 x i32> <i32 8, i32 10, i32 12, i32 14>
75 changes: 36 additions & 39 deletions llvm/test/CodeGen/X86/pmaddubsw.ll
@@ -327,29 +327,29 @@ define <8 x i16> @pmaddubsw_bad_extend(<16 x i8>* %Aptr, <16 x i8>* %Bptr) {
; AVX1-NEXT: vmovdqa (%rsi), %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpmovsxbd %xmm3, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,1,1]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,1,1]
; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4
; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; AVX1-NEXT: vpmaddwd %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; AVX1-NEXT: vpmaddwd %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm5
; AVX1-NEXT: vpmaddwd %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
; AVX1-NEXT: vpmovsxbd %xmm5, %xmm5
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpackssdw %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpackssdw %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3]
; AVX1-NEXT: vpackssdw %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm6
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3]
; AVX1-NEXT: vpmaddwd %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpackssdw %xmm5, %xmm5, %xmm3
; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX1-NEXT: vpackssdw %xmm4, %xmm4, %xmm3
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[9],zero,xmm0[11],zero,xmm0[13],zero,xmm0[15],zero,xmm0[u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
; AVX1-NEXT: vpmaddwd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackssdw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX256-LABEL: pmaddubsw_bad_extend:
@@ -430,27 +430,24 @@ define <8 x i16> @pmaddubsw_bad_indices(<16 x i8>* %Aptr, <16 x i8>* %Bptr) {
; AVX1-NEXT: vmovdqa (%rsi), %xmm1
; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,2,5,6,9,10,13,14,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,3,4,7,8,11,12,15,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpmovsxbd %xmm2, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,1,1]
; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3
; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; AVX1-NEXT: vpmaddwd %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,1,1]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; AVX1-NEXT: vpmaddwd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; AVX1-NEXT: vpmaddwd %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpackssdw %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vpackssdw %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT: vpmaddwd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackssdw %xmm4, %xmm4, %xmm2
; AVX1-NEXT: vpackssdw %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
; AVX1-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX256-LABEL: pmaddubsw_bad_indices:
