diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index fe9a598fb5611..84cca4a6eb269 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2668,6 +2668,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
                        ISD::AVGFLOORU,
                        ISD::BITREVERSE,
                        ISD::ADD,
+                       ISD::SADDSAT,
+                       ISD::SSUBSAT,
                        ISD::FADD,
                        ISD::FSUB,
                        ISD::FNEG,
@@ -8151,6 +8153,8 @@ static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl,
   case X86ISD::FHSUB:
   case X86ISD::HADD:
   case X86ISD::HSUB:
+  case X86ISD::HADDS:
+  case X86ISD::HSUBS:
     return true;
   }
   return false;
@@ -35121,6 +35125,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(BLENDV)
   NODE_NAME_CASE(HADD)
   NODE_NAME_CASE(HSUB)
+  NODE_NAME_CASE(HADDS)
+  NODE_NAME_CASE(HSUBS)
   NODE_NAME_CASE(FHADD)
   NODE_NAME_CASE(FHSUB)
   NODE_NAME_CASE(CONFLICT)
@@ -40897,8 +40903,9 @@ static SDValue canonicalizeShuffleMaskWithHorizOp(
       }))
     return SDValue();

-  bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
-                  Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
+  bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::FHSUB ||
+                  Opcode0 == X86ISD::HADD || Opcode0 == X86ISD::HSUB ||
+                  Opcode0 == X86ISD::HADDS || Opcode0 == X86ISD::HSUBS);
   bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
   if (!isHoriz && !isPack)
     return SDValue();
@@ -54231,7 +54238,9 @@ static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
                                          const X86Subtarget &Subtarget) {
   EVT VT = N->getValueType(0);
   unsigned Opcode = N->getOpcode();
-  bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
+  bool IsAdd =
+      (Opcode == ISD::FADD) || (Opcode == ISD::ADD) || (Opcode == ISD::SADDSAT);
+  bool IsSat = (Opcode == ISD::SADDSAT) || (Opcode == ISD::SSUBSAT);
   SmallVector<int, 8> PostShuffleMask;

   auto MergableHorizOp = [N](unsigned HorizOpcode) {
@@ -54261,11 +54270,17 @@ static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
     break;
   case ISD::ADD:
   case ISD::SUB:
-    if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
-                                 VT == MVT::v16i16 || VT == MVT::v8i32)) {
+  case ISD::SADDSAT:
+  case ISD::SSUBSAT:
+    if (!Subtarget.hasSSSE3())
+      break;
+    if (VT == MVT::v8i16 || VT == MVT::v16i16 ||
+        (!IsSat && (VT == MVT::v4i32 || VT == MVT::v8i32))) {
+
       SDValue LHS = N->getOperand(0);
       SDValue RHS = N->getOperand(1);
-      auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
+      auto HorizOpcode = IsSat ? (IsAdd ? X86ISD::HADDS : X86ISD::HSUBS)
+                               : (IsAdd ? X86ISD::HADD : X86ISD::HSUB);
       if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
                             PostShuffleMask, MergableHorizOp(HorizOpcode))) {
         auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
@@ -61052,6 +61067,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::SUB:            return combineSub(N, DAG, DCI, Subtarget);
   case X86ISD::ADD:
   case X86ISD::SUB:         return combineX86AddSub(N, DAG, DCI, Subtarget);
+  case ISD::SADDSAT:
+  case ISD::SSUBSAT:        return combineToHorizontalAddSub(N, DAG, Subtarget);
   case X86ISD::CLOAD:
   case X86ISD::CSTORE:      return combineX86CloadCstore(N, DAG);
   case X86ISD::SBB:         return combineSBB(N, DAG);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index b7151f65942b4..c5085299716ed 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -270,6 +270,10 @@ namespace llvm {
     HADD,
     HSUB,

+    /// Integer horizontal saturating add/sub.
+    HADDS,
+    HSUBS,
+
     /// Floating point horizontal add/sub.
     FHADD,
     FHSUB,
diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
index 5321ecf0c1b2c..0803a4946b379 100644
--- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -71,6 +71,8 @@ def X86fhadd : SDNode<"X86ISD::FHADD", SDTFPBinOp>;
 def X86fhsub : SDNode<"X86ISD::FHSUB", SDTFPBinOp>;
 def X86hadd : SDNode<"X86ISD::HADD", SDTIntBinOp>;
 def X86hsub : SDNode<"X86ISD::HSUB", SDTIntBinOp>;
+def X86hadds : SDNode<"X86ISD::HADDS", SDTIntBinOp>;
+def X86hsubs : SDNode<"X86ISD::HSUBS", SDTIntBinOp>;
 def X86comi : SDNode<"X86ISD::COMI", SDTX86FCmp>;
 def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86FCmp>;
 def X86comi512 : SDNode<"X86ISD::COMX", SDTX86FCmp>;
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 806b02b9f9359..e4aaa1e1b594a 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -4864,12 +4864,12 @@ let isCommutable = 0 in {
   defm VPSIGND : SS3I_binop_rm_int<0x0A, "vpsignd",
                                    int_x86_ssse3_psign_d_128,
                                    SchedWriteVecALU.XMM, load, 0>, VEX, VVVV, WIG;
-  defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw",
-                                    int_x86_ssse3_phadd_sw_128,
-                                    SchedWritePHAdd.XMM, load, 0>, VEX, VVVV, WIG;
-  defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw",
-                                    int_x86_ssse3_phsub_sw_128,
-                                    SchedWritePHAdd.XMM, load, 0>, VEX, VVVV, WIG;
+  defm VPHADDSW : SS3I_binop_rm<0x03, "vphaddsw", X86hadds, v8i16, v8i16, VR128,
+                                load, i128mem,
+                                SchedWritePHAdd.XMM, 0>, VEX, VVVV, WIG;
+  defm VPHSUBSW : SS3I_binop_rm<0x07, "vphsubsw", X86hsubs, v8i16, v8i16, VR128,
+                                load, i128mem,
+                                SchedWritePHAdd.XMM, 0>, VEX, VVVV, WIG;
 }
 }

@@ -4907,12 +4907,12 @@
                                        SchedWriteVecALU.YMM>, VEX, VVVV, VEX_L, WIG;
   defm VPSIGND : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d,
                                      SchedWriteVecALU.YMM>, VEX, VVVV, VEX_L, WIG;
-  defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw",
-                                      int_x86_avx2_phadd_sw,
-                                      SchedWritePHAdd.YMM>, VEX, VVVV, VEX_L, WIG;
-  defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw",
-                                      int_x86_avx2_phsub_sw,
-                                      SchedWritePHAdd.YMM>, VEX, VVVV, VEX_L, WIG;
+  defm VPHADDSWY : SS3I_binop_rm<0x03, "vphaddsw", X86hadds, v16i16, v16i16,
+                                 VR256, load, i256mem,
+                                 SchedWritePHAdd.YMM, 0>, VEX, VVVV, VEX_L, WIG;
+  defm VPHSUBSWY : SS3I_binop_rm<0x07, "vphsubsw", X86hsubs, v16i16, v16i16,
+                                 VR256, load, i256mem,
+                                 SchedWritePHAdd.YMM, 0>, VEX, VVVV, VEX_L, WIG;
 }
 }

@@ -4935,12 +4935,10 @@ let isCommutable = 0 in {
                                   SchedWriteVecALU.XMM, memop>;
   defm PSHUFB : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, v16i8, VR128,
                               memop, i128mem, SchedWriteVarShuffle.XMM>;
-  defm PHADDSW : SS3I_binop_rm_int<0x03, "phaddsw",
-                                   int_x86_ssse3_phadd_sw_128,
-                                   SchedWritePHAdd.XMM, memop>;
-  defm PHSUBSW : SS3I_binop_rm_int<0x07, "phsubsw",
-                                   int_x86_ssse3_phsub_sw_128,
-                                   SchedWritePHAdd.XMM, memop>;
+  defm PHADDSW : SS3I_binop_rm<0x03, "phaddsw", X86hadds, v8i16, v8i16, VR128,
+                               memop, i128mem, SchedWritePHAdd.XMM>;
+  defm PHSUBSW : SS3I_binop_rm<0x07, "phsubsw", X86hsubs, v8i16, v8i16, VR128,
+                               memop, i128mem, SchedWritePHAdd.XMM>;
   defm PMADDUBSW : SS3I_binop_rm<0x04, "pmaddubsw", X86vpmaddubsw, v8i16,
                                  v16i8, VR128, memop, i128mem,
                                  SchedWriteVecIMul.XMM>;
diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
index 0f725a8eb338b..99665b5872fe2 100644
--- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -724,8 +724,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
     X86_INTRINSIC_DATA(avx2_permd, VPERM_2OP, X86ISD::VPERMV, 0),
     X86_INTRINSIC_DATA(avx2_permps, VPERM_2OP, X86ISD::VPERMV, 0),
     X86_INTRINSIC_DATA(avx2_phadd_d, INTR_TYPE_2OP, X86ISD::HADD, 0),
+    X86_INTRINSIC_DATA(avx2_phadd_sw, INTR_TYPE_2OP, X86ISD::HADDS, 0),
     X86_INTRINSIC_DATA(avx2_phadd_w, INTR_TYPE_2OP, X86ISD::HADD, 0),
     X86_INTRINSIC_DATA(avx2_phsub_d, INTR_TYPE_2OP, X86ISD::HSUB, 0),
+    X86_INTRINSIC_DATA(avx2_phsub_sw, INTR_TYPE_2OP, X86ISD::HSUBS, 0),
     X86_INTRINSIC_DATA(avx2_phsub_w, INTR_TYPE_2OP, X86ISD::HSUB, 0),
     X86_INTRINSIC_DATA(avx2_pmadd_ub_sw, INTR_TYPE_2OP, X86ISD::VPMADDUBSW, 0),
     X86_INTRINSIC_DATA(avx2_pmadd_wd, INTR_TYPE_2OP, X86ISD::VPMADDWD, 0),
@@ -2017,11 +2019,13 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
     X86_INTRINSIC_DATA(ssse3_phadd_d, INTR_TYPE_CAST_MMX, 0, 0),
     X86_INTRINSIC_DATA(ssse3_phadd_d_128, INTR_TYPE_2OP, X86ISD::HADD, 0),
     X86_INTRINSIC_DATA(ssse3_phadd_sw, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(ssse3_phadd_sw_128, INTR_TYPE_2OP, X86ISD::HADDS, 0),
     X86_INTRINSIC_DATA(ssse3_phadd_w, INTR_TYPE_CAST_MMX, 0, 0),
     X86_INTRINSIC_DATA(ssse3_phadd_w_128, INTR_TYPE_2OP, X86ISD::HADD, 0),
     X86_INTRINSIC_DATA(ssse3_phsub_d, INTR_TYPE_CAST_MMX, 0, 0),
     X86_INTRINSIC_DATA(ssse3_phsub_d_128, INTR_TYPE_2OP, X86ISD::HSUB, 0),
     X86_INTRINSIC_DATA(ssse3_phsub_sw, INTR_TYPE_CAST_MMX, 0, 0),
+    X86_INTRINSIC_DATA(ssse3_phsub_sw_128, INTR_TYPE_2OP, X86ISD::HSUBS, 0),
     X86_INTRINSIC_DATA(ssse3_phsub_w, INTR_TYPE_CAST_MMX, 0, 0),
     X86_INTRINSIC_DATA(ssse3_phsub_w_128, INTR_TYPE_2OP, X86ISD::HSUB, 0),
     X86_INTRINSIC_DATA(ssse3_pmadd_ub_sw, INTR_TYPE_CAST_MMX, 0, 0),
diff --git a/llvm/test/CodeGen/X86/haddsubsat.ll b/llvm/test/CodeGen/X86/haddsubsat.ll
new file mode 100644
index 0000000000000..588f3383ec415
--- /dev/null
+++ b/llvm/test/CodeGen/X86/haddsubsat.ll
@@ -0,0 +1,101 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s -check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s -check-prefix=AVX2
+
+define <8 x i16> @phaddsw_v8i16_intrinsic(<8 x i16> %a, <8 x i16> %b) {
+; SSSE3-LABEL: phaddsw_v8i16_intrinsic:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    phaddsw %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; AVX2-LABEL: phaddsw_v8i16_intrinsic:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vphaddsw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+  %res = call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %a, <8 x i16> %b)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @phaddsw_v8i16_generic(<8 x i16> %a, <8 x i16> %b) {
+; SSSE3-LABEL: phaddsw_v8i16_generic:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    phaddsw %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; AVX2-LABEL: phaddsw_v8i16_generic:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vphaddsw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+  %even = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %odd = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %sum = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %even, <8 x i16> %odd)
+  ret <8 x i16> %sum
+}
+
+define <16 x i16> @phaddsw_v16i16_generic(<16 x i16> %a, <16 x i16> %b) {
+; SSSE3-LABEL: phaddsw_v16i16_generic:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    phaddsw %xmm1, %xmm0
+; SSSE3-NEXT:    phaddsw %xmm3, %xmm2
+; SSSE3-NEXT:    movdqa %xmm2, %xmm1
+; SSSE3-NEXT:    retq
+;
+; AVX2-LABEL: phaddsw_v16i16_generic:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vphaddsw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    retq
+  %even = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+  %odd = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+  %sum = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %even, <16 x i16> %odd)
+  ret <16 x i16> %sum
+}
+
+define <8 x i16> @phsubsw_v8i16_intrinsic(<8 x i16> %a, <8 x i16> %b) {
+; SSSE3-LABEL: phsubsw_v8i16_intrinsic:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    phsubsw %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; AVX2-LABEL: phsubsw_v8i16_intrinsic:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vphsubsw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+  %res = call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %a, <8 x i16> %b)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @phsubsw_v8i16_generic(<8 x i16> %a, <8 x i16> %b) {
+; SSSE3-LABEL: phsubsw_v8i16_generic:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    phsubsw %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; AVX2-LABEL: phsubsw_v8i16_generic:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vphsubsw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+  %even = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %odd = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %diff = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %even, <8 x i16> %odd)
+  ret <8 x i16> %diff
+}
+
+define <16 x i16> @phsubsw_v16i16_generic(<16 x i16> %a, <16 x i16> %b) {
+; SSSE3-LABEL: phsubsw_v16i16_generic:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    phsubsw %xmm1, %xmm0
+; SSSE3-NEXT:    phsubsw %xmm3, %xmm2
+; SSSE3-NEXT:    movdqa %xmm2, %xmm1
+; SSSE3-NEXT:    retq
+;
+; AVX2-LABEL: phsubsw_v16i16_generic:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vphsubsw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    retq
+  %even = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+  %odd = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+  %diff = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> %even, <16 x i16> %odd)
+  ret <16 x i16> %diff
+}
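Note (not part of the patch): a minimal C sketch of the two source-level shapes that are expected to reach the new X86ISD::HADDS/HSUBS lowering, mirroring the *_intrinsic and *_generic tests above. The intrinsic path is certain: _mm_hadds_epi16 and _mm_hsubs_epi16 lower through @llvm.x86.ssse3.phadd.sw.128 and @llvm.x86.ssse3.phsub.sw.128, which this patch maps to the new nodes. The scalar loop is an assumption: it only hits combineToHorizontalAddSub if the vectorizer turns it into the even/odd shufflevector + @llvm.sadd.sat form, which depends on the optimization level.

#include <immintrin.h>

/* Intrinsic form: clang emits @llvm.x86.ssse3.phadd.sw.128, which now
   selects phaddsw (SSSE3) / vphaddsw (AVX) via X86ISD::HADDS. */
__m128i hadds_intrinsic(__m128i a, __m128i b) {
  return _mm_hadds_epi16(a, b);
}

/* Saturating 16-bit add, clamped to [-32768, 32767]. */
static short sat_add16(int x, int y) {
  int s = x + y;
  return (short)(s > 32767 ? 32767 : s < -32768 ? -32768 : s);
}

/* Generic form: pairwise saturating add across two v8i16 inputs. If
   vectorized into even/odd shuffles feeding @llvm.sadd.sat.v8i16 (an
   assumption), the new combine folds it to a single HADDS node. */
void hadds_generic(const short a[8], const short b[8], short out[8]) {
  for (int i = 0; i < 4; ++i) {
    out[i] = sat_add16(a[2 * i], a[2 * i + 1]);     /* pairs from a */
    out[i + 4] = sat_add16(b[2 * i], b[2 * i + 1]); /* pairs from b */
  }
}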