diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 46a9be5212c467..1a98c4653a31c6 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -41996,26 +41996,31 @@ static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
-static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG) {
+static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
+                                         const X86Subtarget &Subtarget) {
   unsigned Opcode = N->getOpcode();
-  assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
-         "Unexpected pack opcode");
+  assert((X86ISD::HADD == Opcode || X86ISD::FHADD == Opcode ||
+          X86ISD::HSUB == Opcode || X86ISD::FHSUB == Opcode ||
+          X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
+         "Unexpected hadd/hsub/pack opcode");
 
   EVT VT = N->getValueType(0);
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
-  unsigned NumDstElts = VT.getVectorNumElements();
+  EVT SrcVT = N0.getValueType();
 
   // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
   // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for
   // truncation trees that help us avoid lane crossing shuffles.
   // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
+  // TODO: We don't handle vXf64 shuffles yet.
   if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
       N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
       N0.getConstantOperandAPInt(1) == 0 &&
-      N1.getConstantOperandAPInt(1) == (NumDstElts / 2) &&
+      N1.getConstantOperandAPInt(1) == SrcVT.getVectorNumElements() &&
       N0.getOperand(0) == N1.getOperand(0) && VT.is128BitVector() &&
-      N0.getOperand(0).getValueType().is256BitVector()) {
+      N0.getOperand(0).getValueType().is256BitVector() &&
+      SrcVT.getScalarSizeInBits() <= 32) {
     // TODO - support target/faux shuffles.
     SDValue Vec = peekThroughBitcasts(N0.getOperand(0));
     if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Vec)) {
@@ -42026,12 +42031,13 @@ static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG) {
           scaleShuffleElements(SVN->getMask(), 4, ShuffleMask)) {
         SDLoc DL(N);
         SDValue Lo, Hi;
+        MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
         std::tie(Lo, Hi) = DAG.SplitVector(SVN->getOperand(0), DL);
         Lo = DAG.getBitcast(N0.getValueType(), Lo);
         Hi = DAG.getBitcast(N1.getValueType(), Hi);
         SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
-        Res = DAG.getBitcast(MVT::v4i32, Res);
-        Res = DAG.getVectorShuffle(MVT::v4i32, DL, Res, Res, ShuffleMask);
+        Res = DAG.getBitcast(ShufVT, Res);
+        Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
         return DAG.getBitcast(VT, Res);
       }
     }
@@ -42039,7 +42045,7 @@ static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG) {
 
   // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
   // TODO: Relax shuffle scaling to support sub-128-bit subvector shuffles.
-  if (VT.is256BitVector()) {
+  if (VT.is256BitVector() && Subtarget.hasInt256()) {
     if (auto *SVN0 = dyn_cast<ShuffleVectorSDNode>(N0)) {
       if (auto *SVN1 = dyn_cast<ShuffleVectorSDNode>(N1)) {
         SmallVector<int, 2> ShuffleMask0, ShuffleMask1;
@@ -42058,9 +42064,10 @@ static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG) {
           ShuffleMask.append(ShuffleMask0.begin(), ShuffleMask0.end());
           ShuffleMask.append(ShuffleMask1.begin(), ShuffleMask1.end());
           SDLoc DL(N);
+          MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
           SDValue Res = DAG.getNode(Opcode, DL, VT, Op00, Op01);
-          Res = DAG.getBitcast(MVT::v4i64, Res);
-          Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, ShuffleMask);
+          Res = DAG.getBitcast(ShufVT, Res);
+          Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
           return DAG.getBitcast(VT, Res);
         }
       }
@@ -42145,7 +42152,7 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
   }
 
   // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
-  if (SDValue V = combineHorizOpWithShuffle(N, DAG))
+  if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
     return V;
 
   // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
@@ -42197,6 +42204,21 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
+                                    TargetLowering::DAGCombinerInfo &DCI,
+                                    const X86Subtarget &Subtarget) {
+  unsigned Opcode = N->getOpcode();
+  assert((X86ISD::HADD == Opcode || X86ISD::FHADD == Opcode ||
+          X86ISD::HSUB == Opcode || X86ISD::FHSUB == Opcode) &&
+         "Unexpected horizontal add/sub opcode");
+
+  // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
+  if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
+    return V;
+
+  return SDValue();
+}
+
 static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const X86Subtarget &Subtarget) {
@@ -49087,6 +49109,10 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case X86ISD::BRCOND:      return combineBrCond(N, DAG, Subtarget);
   case X86ISD::PACKSS:
   case X86ISD::PACKUS:      return combineVectorPack(N, DAG, DCI, Subtarget);
+  case X86ISD::HADD:
+  case X86ISD::HSUB:
+  case X86ISD::FHADD:
+  case X86ISD::FHSUB:       return combineVectorHADDSUB(N, DAG, DCI, Subtarget);
   case X86ISD::VSHL:
   case X86ISD::VSRA:
   case X86ISD::VSRL:
diff --git a/llvm/test/CodeGen/X86/haddsub-shuf.ll b/llvm/test/CodeGen/X86/haddsub-shuf.ll
index 2741ba5af0da45..d7bacfe04be88a 100644
--- a/llvm/test/CodeGen/X86/haddsub-shuf.ll
+++ b/llvm/test/CodeGen/X86/haddsub-shuf.ll
@@ -1008,16 +1008,15 @@ define <4 x float> @PR34724_2(<4 x float> %a, <4 x float> %b) {
 define <4 x float> @hadd_4f32_v8f32_shuffle(<8 x float> %a0) {
 ; SSSE3-LABEL: hadd_4f32_v8f32_shuffle:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
-; SSSE3-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; SSSE3-NEXT:    haddps %xmm1, %xmm0
+; SSSE3-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; SSSE3-NEXT:    retq
 ;
 ; AVX-LABEL: hadd_4f32_v8f32_shuffle:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,3]
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
   %shuf256 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32>
@@ -1032,16 +1031,15 @@ define <4 x float> @hadd_4f32_v8f32_shuffle(<8 x float> %a0) {
 define <4 x float> @hsub_4f32_v8f32_shuffle(<8 x float> %a0) {
 ; SSSE3-LABEL: hsub_4f32_v8f32_shuffle:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
-; SSSE3-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; SSSE3-NEXT:    haddps %xmm1, %xmm0
+; SSSE3-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; SSSE3-NEXT:    retq
 ;
 ; AVX-LABEL: hsub_4f32_v8f32_shuffle:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,3]
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
   %shuf256 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32>
@@ -1056,24 +1054,23 @@ define <4 x float> @hsub_4f32_v8f32_shuffle(<8 x float> %a0) {
 define <4 x i32> @hadd_4i32_v8i32_shuffle(<8 x i32> %a0) {
 ; SSSE3-LABEL: hadd_4i32_v8i32_shuffle:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; SSSE3-NEXT:    phaddd %xmm1, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; SSSE3-NEXT:    retq
 ;
 ; AVX1-LABEL: hadd_4i32_v8i32_shuffle:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,3]
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: hadd_4i32_v8i32_shuffle:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
   %shuf256 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32>
@@ -1088,24 +1085,23 @@ define <4 x i32> @hadd_4i32_v8i32_shuffle(<8 x i32> %a0) {
 define <4 x i32> @hsub_4i32_v8i32_shuffle(<8 x i32> %a0) {
 ; SSSE3-LABEL: hsub_4i32_v8i32_shuffle:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; SSSE3-NEXT:    phaddd %xmm1, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; SSSE3-NEXT:    retq
 ;
 ; AVX1-LABEL: hsub_4i32_v8i32_shuffle:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,3]
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: hsub_4i32_v8i32_shuffle:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
   %shuf256 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32>
@@ -1129,12 +1125,18 @@ define <4 x double> @hadd_4f64_v4f64_shuffle(<4 x double> %a0, <4 x double> %a1) {
 ; SSSE3-NEXT:    movapd %xmm2, %xmm1
 ; SSSE3-NEXT:    retq
 ;
-; AVX-LABEL: hadd_4f64_v4f64_shuffle:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm2
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; AVX-NEXT:    vhaddpd %ymm0, %ymm2, %ymm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: hadd_4f64_v4f64_shuffle:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm2
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX1-NEXT:    vhaddpd %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: hadd_4f64_v4f64_shuffle:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    retq
   %shuf0 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32>
   %shuf1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32>
   %hadd0 = shufflevector <4 x double> %shuf0, <4 x double> %shuf1, <4 x i32>
@@ -1151,12 +1153,18 @@ define <4 x double> @hsub_4f64_v4f64_shuffle(<4 x double> %a0, <4 x double> %a1) {
 ; SSSE3-NEXT:    movapd %xmm2, %xmm1
 ; SSSE3-NEXT:    retq
 ;
-; AVX-LABEL: hsub_4f64_v4f64_shuffle:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm2
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; AVX-NEXT:    vhsubpd %ymm0, %ymm2, %ymm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: hsub_4f64_v4f64_shuffle:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm2
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX1-NEXT:    vhsubpd %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: hsub_4f64_v4f64_shuffle:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vhsubpd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    retq
   %shuf0 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32>
   %shuf1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32>
   %hadd0 = shufflevector <4 x double> %shuf0, <4 x double> %shuf1, <4 x i32>
@@ -1173,12 +1181,18 @@ define <8 x float> @hadd_8f32_v8f32_shuffle(<8 x float> %a0, <8 x float> %a1) {
 ; SSSE3-NEXT:    movaps %xmm2, %xmm1
 ; SSSE3-NEXT:    retq
 ;
-; AVX-LABEL: hadd_8f32_v8f32_shuffle:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm2
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; AVX-NEXT:    vhaddps %ymm0, %ymm2, %ymm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: hadd_8f32_v8f32_shuffle:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm2
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX1-NEXT:    vhaddps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: hadd_8f32_v8f32_shuffle:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    retq
   %shuf0 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32>
   %shuf1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32>
   %hadd0 = shufflevector <8 x float> %shuf0, <8 x float> %shuf1, <8 x i32>
@@ -1195,12 +1209,18 @@ define <8 x float> @hsub_8f32_v8f32_shuffle(<8 x float> %a0, <8 x float> %a1) {
 ; SSSE3-NEXT:    movaps %xmm2, %xmm1
 ; SSSE3-NEXT:    retq
 ;
-; AVX-LABEL: hsub_8f32_v8f32_shuffle:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm2
-; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; AVX-NEXT:    vhaddps %ymm0, %ymm2, %ymm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: hsub_8f32_v8f32_shuffle:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm2
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX1-NEXT:    vhaddps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: hsub_8f32_v8f32_shuffle:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    retq
   %shuf0 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32>
   %shuf1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32>
   %hsub0 = shufflevector <8 x float> %shuf0, <8 x float> %shuf1, <8 x i32>
@@ -1228,9 +1248,8 @@ define <8 x i32> @hadd_8i32_v8i32_shuffle(<8 x i32> %a0, <8 x i32> %a1) {
 ;
 ; AVX2-LABEL: hadd_8i32_v8i32_shuffle:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm2
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; AVX2-NEXT:    vphaddd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    retq
   %shuf0 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32>
   %shuf1 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32>
@@ -1259,9 +1278,8 @@ define <8 x i32> @hsub_8i32_v8i32_shuffle(<8 x i32> %a0, <8 x i32> %a1) {
 ;
 ; AVX2-LABEL: hsub_8i32_v8i32_shuffle:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm2
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; AVX2-NEXT:    vphsubd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vphsubd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    retq
   %shuf0 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32>
   %shuf1 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32>
@@ -1290,9 +1308,8 @@ define <16 x i16> @hadd_16i16_16i16_shuffle(<16 x i16> %a0, <16 x i16> %a1) {
 ;
 ; AVX2-LABEL: hadd_16i16_16i16_shuffle:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm2
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; AVX2-NEXT:    vphaddw %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    retq
   %shuf0 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32>
   %shuf1 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32>
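
Note (illustrative, not part of the patch): the HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X))) fold works because halving the vector width keeps the 4-element index space intact: the unary shuffle of the 256-bit source is scaled to a 4-element mask (64-bit granularity), and the identical mask is then replayed on the 128-bit HADD/HSUB result (32-bit granularity). That is why the two movhlps/pshufd ops in the tests above collapse into a single movshdup/pshufd after the horizontal op. Below is a minimal standalone C++ sanity check of that arithmetic; hadd128 is a made-up helper simulating HADDPS/PHADDD semantics, not an LLVM API.

#include <array>
#include <cassert>

using V4 = std::array<int, 4>;
using V8 = std::array<int, 8>;

// 128-bit HADDPS/PHADDD semantics: pairwise sums of A, then pairwise sums of B.
static V4 hadd128(const V4 &A, const V4 &B) {
  return {A[0] + A[1], A[2] + A[3], B[0] + B[1], B[2] + B[3]};
}

int main() {
  V8 X = {0, 1, 2, 3, 4, 5, 6, 7};

  // Old codegen: shuffle X with the 64-bit-scaled mask [1,1,3,3] first
  // (32-bit mask [2,3,2,3,6,7,6,7]), then HADD the two 128-bit halves.
  V4 OldLo = {X[2], X[3], X[2], X[3]};
  V4 OldHi = {X[6], X[7], X[6], X[7]};
  V4 Old = hadd128(OldLo, OldHi);

  // New codegen: HADD the unshuffled halves, then replay the same 4-element
  // mask [1,1,3,3] on the 4 x i32 result (a single movshdup/pshufd).
  V4 H = hadd128({X[0], X[1], X[2], X[3]}, {X[4], X[5], X[6], X[7]});
  V4 New = {H[1], H[1], H[3], H[3]};

  assert(Old == New);
  return 0;
}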
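
Similarly for the 256-bit HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)) fold: HADD/HSUB ymm instructions process the two 128-bit lanes independently, so hoisting the cross-lane shuffle out of the horizontal op only costs a 64-bit-element fixup with mask [0,2,1,3] afterwards, i.e. one vpermq/vpermpd instead of the old vinserti128 + vperm2i128 pair. The sketch below simulates VPHADDD per-lane semantics with plain arrays to check that identity; hadd256 and permq_0213 are illustrative names, not LLVM functions. Presumably this is also why the fold is gated on Subtarget.hasInt256(): the fixup needs a cross-lane 64-bit permute, which only AVX2 provides.

#include <array>
#include <cassert>

using V8 = std::array<int, 8>; // a 256-bit vector of 8 x i32

// VPHADDD ymm semantics: each 128-bit lane holds the pairwise sums of A's
// half of that lane followed by the pairwise sums of B's half.
static V8 hadd256(const V8 &A, const V8 &B) {
  return {A[0] + A[1], A[2] + A[3], B[0] + B[1], B[2] + B[3],
          A[4] + A[5], A[6] + A[7], B[4] + B[5], B[6] + B[7]};
}

// VPERMQ/VPERMPD with immediate [0,2,1,3]: permute 64-bit elements
// (adjacent pairs of i32) across lanes.
static V8 permq_0213(const V8 &A) {
  return {A[0], A[1], A[4], A[5], A[2], A[3], A[6], A[7]};
}

int main() {
  V8 X = {0, 1, 2, 3, 4, 5, 6, 7};
  V8 Y = {10, 11, 12, 13, 14, 15, 16, 17};

  // Old codegen: materialise concat(lo(X),lo(Y)) and concat(hi(X),hi(Y))
  // with vinserti128/vperm2i128, then one horizontal add.
  V8 Shuf0 = {X[0], X[1], X[2], X[3], Y[0], Y[1], Y[2], Y[3]};
  V8 Shuf1 = {X[4], X[5], X[6], X[7], Y[4], Y[5], Y[6], Y[7]};
  V8 Old = hadd256(Shuf0, Shuf1);

  // New codegen: horizontal add the original operands, then one lane fixup.
  V8 New = permq_0213(hadd256(X, Y));

  assert(Old == New);
  return 0;
}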