diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index f4f9812a402437..f7af01b2338b7e 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -45984,18 +45984,8 @@ static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS, "Unsupported vector type for horizontal add/sub"); unsigned NumElts = VT.getVectorNumElements(); - // TODO - can we make a general helper method that does all of this for us? auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1, SmallVectorImpl &ShuffleMask) { - if (Op.getOpcode() == ISD::VECTOR_SHUFFLE) { - if (!Op.getOperand(0).isUndef()) - N0 = Op.getOperand(0); - if (!Op.getOperand(1).isUndef()) - N1 = Op.getOperand(1); - ArrayRef Mask = cast(Op)->getMask(); - ShuffleMask.append(Mask.begin(), Mask.end()); - return; - } bool UseSubVector = false; if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR && Op.getOperand(0).getValueType().is256BitVector() && @@ -46004,23 +45994,24 @@ static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS, UseSubVector = true; } SmallVector SrcOps; - SmallVector SrcShuffleMask; + SmallVector SrcMask, ScaledMask; SDValue BC = peekThroughBitcasts(Op); - if (isTargetShuffle(BC.getOpcode()) && - getTargetShuffleMask(BC.getNode(), BC.getSimpleValueType(), false, - SrcOps, SrcShuffleMask)) { - if (!UseSubVector && SrcShuffleMask.size() == NumElts && - SrcOps.size() <= 2) { + if (getTargetShuffleInputs(BC, SrcOps, SrcMask, DAG) && + !isAnyZero(SrcMask) && all_of(SrcOps, [BC](SDValue Op) { + return Op.getValueSizeInBits() == BC.getValueSizeInBits(); + })) { + resolveTargetShuffleInputsAndMask(SrcOps, SrcMask); + if (!UseSubVector && SrcOps.size() <= 2 && + scaleShuffleElements(SrcMask, NumElts, ScaledMask)) { N0 = SrcOps.size() > 0 ? SrcOps[0] : SDValue(); N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue(); - ShuffleMask.append(SrcShuffleMask.begin(), SrcShuffleMask.end()); - } - if (UseSubVector && (SrcShuffleMask.size() == (NumElts * 2)) && - SrcOps.size() == 1) { - N0 = extract128BitVector(SrcOps[0], 0, DAG, SDLoc(Op)); - N1 = extract128BitVector(SrcOps[0], NumElts, DAG, SDLoc(Op)); - ArrayRef Mask = ArrayRef(SrcShuffleMask).slice(0, NumElts); - ShuffleMask.append(Mask.begin(), Mask.end()); + ShuffleMask.assign(ScaledMask.begin(), ScaledMask.end()); + } + if (UseSubVector && SrcOps.size() == 1 && + scaleShuffleElements(SrcMask, 2 * NumElts, ScaledMask)) { + std::tie(N0, N1) = DAG.SplitVector(SrcOps[0], SDLoc(Op)); + ArrayRef Mask = ArrayRef(ScaledMask).slice(0, NumElts); + ShuffleMask.assign(Mask.begin(), Mask.end()); } } }; diff --git a/llvm/test/CodeGen/X86/phaddsub-extract.ll b/llvm/test/CodeGen/X86/phaddsub-extract.ll index a80645f44d9d05..fba0016b232b44 100644 --- a/llvm/test/CodeGen/X86/phaddsub-extract.ll +++ b/llvm/test/CodeGen/X86/phaddsub-extract.ll @@ -1863,6 +1863,63 @@ define i32 @partial_reduction_sub_v16i32(<16 x i32> %x) { ret i32 %r } +; https://bugs.chromium.org/p/chromium/issues/detail?id=1195353 +define <2 x i64> @negative_extract_v16i16_v8i16(<4 x i64> %a0) { +; SSE3-LABEL: negative_extract_v16i16_v8i16: +; SSE3: # %bb.0: +; SSE3-NEXT: paddw %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; AVX1-SLOW-LABEL: negative_extract_v16i16_v8i16: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vzeroupper +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: negative_extract_v16i16_v8i16: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vzeroupper +; AVX1-FAST-NEXT: retq +; +; AVX2-SLOW-LABEL: negative_extract_v16i16_v8i16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: negative_extract_v16i16_v8i16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-SLOW-LABEL: negative_extract_v16i16_v8i16: +; AVX512-SLOW: # %bb.0: +; AVX512-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-SLOW-NEXT: vzeroupper +; AVX512-SLOW-NEXT: retq +; +; AVX512-FAST-LABEL: negative_extract_v16i16_v8i16: +; AVX512-FAST: # %bb.0: +; AVX512-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-FAST-NEXT: vzeroupper +; AVX512-FAST-NEXT: retq + %s = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> + %b = bitcast <4 x i64> %a0 to <16 x i16> + %c = bitcast <4 x i64> %s to <16 x i16> + %d = add <16 x i16> %b, %c + %e = bitcast <16 x i16> %d to <4 x i64> + %f = shufflevector <4 x i64> %e, <4 x i64> undef, <2 x i32> + ret <2 x i64> %f +} + ; PR42023 - https://bugs.llvm.org/show_bug.cgi?id=42023 define i16 @hadd16_8(<8 x i16> %x223) {