Skip to content

Commit

Permalink
[X86] Replace support for vXi32 SMUL_LOHI/UMUL_LOHI with MULHS/MULHU …
Browse files Browse the repository at this point in the history
…support instead.

Summary:
The only time vector SMUL_LOHI/UMUL_LOHI nodes are created is during division/remainder lowering. If its created before op legalization, generic DAGCombine immediately turns that SMUL_LOHI/UMUL_LOHI into a MULHS/MULHU since only the upper half is used. That node will stick around through vector op legalization and will be turned back into UMUL_LOHI/SMUL_LOHI during op legalization. It will then be custom lowered by the X86 backend. Due to this two step lowering the vector shuffles created by the custom lowering get legalized after their inputs rather than before. This prevents the shuffles from being combined with any build_vector of constants.

This patch uses changes vXi32 to use MULHS/MULHU instead. This is what the later DAG combine did anyway. But by skipping the change back to UMUL_LOHI/SMUL_LOHI we lower it before any constant BUILD_VECTORS. This allows the vector_shuffle creation to constant fold with the build_vectors. This accounts for the test changes here.

Reviewers: RKSimon, spatel

Reviewed By: RKSimon

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D51254

llvm-svn: 340690
  • Loading branch information
topperc committed Aug 25, 2018
1 parent a11a3b3 commit ebec279
Show file tree
Hide file tree
Showing 9 changed files with 259 additions and 329 deletions.
183 changes: 75 additions & 108 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
Expand Up @@ -782,8 +782,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::MUL, MVT::v16i8, Custom);
setOperationAction(ISD::MUL, MVT::v4i32, Custom);
setOperationAction(ISD::MUL, MVT::v2i64, Custom);
setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
Expand Down Expand Up @@ -1087,9 +1087,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
setOperationAction(ISD::MUL, MVT::v32i8, Custom);

setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);

setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
Expand Down Expand Up @@ -1331,8 +1330,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::MUL, MVT::v8i64, Custom);
setOperationAction(ISD::MUL, MVT::v16i32, Legal);

setOperationAction(ISD::UMUL_LOHI, MVT::v16i32, Custom);
setOperationAction(ISD::SMUL_LOHI, MVT::v16i32, Custom);
setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
setOperationAction(ISD::MULHS, MVT::v16i32, Custom);

setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
Expand Down Expand Up @@ -22901,6 +22900,75 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
if (VT.is256BitVector() && !Subtarget.hasInt256())
return Lower256IntArith(Op, DAG);

if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
(VT == MVT::v8i32 && Subtarget.hasInt256()) ||
(VT == MVT::v16i32 && Subtarget.hasAVX512()));
SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);

int NumElts = VT.getVectorNumElements();

// PMULxD operations multiply each even value (starting at 0) of LHS with
// the related value of RHS and produce a widen result.
// E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
// => <2 x i64> <ae|cg>
//
// In other word, to have all the results, we need to perform two PMULxD:
// 1. one with the even values.
// 2. one with the odd values.
// To achieve #2, with need to place the odd values at an even position.
//
// Place the odd value at an even position (basically, shift all values 1
// step to the left):
const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
9, -1, 11, -1, 13, -1, 15, -1};
// <a|b|c|d> => <b|undef|d|undef>
SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
makeArrayRef(&Mask[0], NumElts));
// <e|f|g|h> => <f|undef|h|undef>
SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
makeArrayRef(&Mask[0], NumElts));

// Emit two multiplies, one for the lower 2 ints and one for the higher 2
// ints.
MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
bool IsSigned = Op->getOpcode() == ISD::MULHS;
unsigned Opcode =
(!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
// PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
// => <2 x i64> <ae|cg>
SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
DAG.getBitcast(MulVT, Op0),
DAG.getBitcast(MulVT, Op1)));
// PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
// => <2 x i64> <bf|dh>
SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
DAG.getBitcast(MulVT, Odd0),
DAG.getBitcast(MulVT, Odd1)));

// Shuffle it back into the right order.
SmallVector<int, 16> ShufMask(NumElts);
for (int i = 0; i != NumElts; ++i)
ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;

SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);

// If we have a signed multiply but no PMULDQ fix up the result of an
// unsigned multiply.
if (IsSigned && !Subtarget.hasSSE41()) {
SDValue ShAmt = DAG.getConstant(31, dl, VT);
SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);

SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
}

return Res;
}

// Only i8 vectors should need custom lowering after this.
assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
(VT == MVT::v64i8 && Subtarget.hasBWI())) &&
Expand Down Expand Up @@ -23084,105 +23152,6 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons
return DAG.getBitcast(VT, CallInfo.first);
}

static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
MVT VT = Op0.getSimpleValueType();
SDLoc dl(Op);

// Decompose 256-bit ops into smaller 128-bit ops.
if (VT.is256BitVector() && !Subtarget.hasInt256()) {
unsigned Opcode = Op.getOpcode();
unsigned NumElems = VT.getVectorNumElements();
MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2);
SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl);
SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl);
SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl);
SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl);
SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Lo0, Lo1);
SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Hi0, Hi1);
SDValue Ops[] = {
DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)),
DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1))
};
return DAG.getMergeValues(Ops, dl);
}

assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
(VT == MVT::v8i32 && Subtarget.hasInt256()) ||
(VT == MVT::v16i32 && Subtarget.hasAVX512()));

int NumElts = VT.getVectorNumElements();

// PMULxD operations multiply each even value (starting at 0) of LHS with
// the related value of RHS and produce a widen result.
// E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
// => <2 x i64> <ae|cg>
//
// In other word, to have all the results, we need to perform two PMULxD:
// 1. one with the even values.
// 2. one with the odd values.
// To achieve #2, with need to place the odd values at an even position.
//
// Place the odd value at an even position (basically, shift all values 1
// step to the left):
const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1, 9, -1, 11, -1, 13, -1, 15, -1};
// <a|b|c|d> => <b|undef|d|undef>
SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
makeArrayRef(&Mask[0], NumElts));
// <e|f|g|h> => <f|undef|h|undef>
SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
makeArrayRef(&Mask[0], NumElts));

// Emit two multiplies, one for the lower 2 ints and one for the higher 2
// ints.
MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
unsigned Opcode =
(!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
// PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
// => <2 x i64> <ae|cg>
SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
DAG.getBitcast(MulVT, Op0),
DAG.getBitcast(MulVT, Op1)));
// PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
// => <2 x i64> <bf|dh>
SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
DAG.getBitcast(MulVT, Odd0),
DAG.getBitcast(MulVT, Odd1)));

// Shuffle it back into the right order.
SmallVector<int, 16> HighMask(NumElts);
SmallVector<int, 16> LowMask(NumElts);
for (int i = 0; i != NumElts; ++i) {
HighMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
LowMask[i] = (i / 2) * 2 + ((i % 2) * NumElts);
}

SDValue Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
SDValue Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);

// If we have a signed multiply but no PMULDQ fix up the high parts of a
// unsigned multiply.
if (IsSigned && !Subtarget.hasSSE41()) {
SDValue ShAmt = DAG.getConstant(
31, dl,
DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);

SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
}

// The first result of MUL_LOHI is actually the low value, followed by the
// high value.
SDValue Ops[] = {Lows, Highs};
return DAG.getMergeValues(Ops, dl);
}

// Return true if the required (according to Opcode) shift-imm form is natively
// supported by the Subtarget
static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
Expand Down Expand Up @@ -25579,8 +25548,6 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
case ISD::MULHS:
case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
case ISD::UMUL_LOHI:
case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG);
case ISD::ROTL:
case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
case ISD::SRA:
Expand Down
96 changes: 44 additions & 52 deletions llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
Expand Up @@ -78,22 +78,19 @@ define <2 x i64> @test_div7_2i64(<2 x i64> %a) nounwind {
define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: test_div7_4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrad $31, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pmuludq %xmm1, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pmuludq %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm4, %xmm3
; SSE2-NEXT: pmuludq %xmm2, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE2-NEXT: psubd %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psrad $31, %xmm3
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: paddd %xmm0, %xmm3
; SSE2-NEXT: psubd %xmm3, %xmm1
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrld $31, %xmm0
Expand Down Expand Up @@ -134,13 +131,12 @@ define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind {
;
; AVX2-LABEL: test_div7_4i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
; AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
; AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpmuldq %xmm2, %xmm0, %xmm2
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
; AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpsrld $31, %xmm0, %xmm1
; AVX2-NEXT: vpsrad $2, %xmm0, %xmm0
Expand Down Expand Up @@ -384,33 +380,30 @@ define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrad $31, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pmuludq %xmm1, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm4, %xmm3
; SSE2-NEXT: pmuludq %xmm1, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE2-NEXT: psubd %xmm2, %xmm1
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psrld $31, %xmm2
; SSE2-NEXT: psrad $2, %xmm1
; SSE2-NEXT: paddd %xmm2, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [7,7,7,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT: psubd %xmm1, %xmm0
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psrad $31, %xmm3
; SSE2-NEXT: pand %xmm1, %xmm3
; SSE2-NEXT: paddd %xmm0, %xmm3
; SSE2-NEXT: psubd %xmm3, %xmm2
; SSE2-NEXT: paddd %xmm0, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: psrld $31, %xmm1
; SSE2-NEXT: psrad $2, %xmm2
; SSE2-NEXT: paddd %xmm1, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [7,7,7,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT: psubd %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_rem7_4i32:
Expand Down Expand Up @@ -448,13 +441,12 @@ define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
;
; AVX2-LABEL: test_rem7_4i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
; AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
; AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpmuldq %xmm2, %xmm0, %xmm2
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
; AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm1
; AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
; AVX2-NEXT: vpsrad $2, %xmm1, %xmm1
Expand Down

0 comments on commit ebec279

Please sign in to comment.