Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 23 additions & 6 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2668,6 +2668,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
ISD::AVGFLOORU,
ISD::BITREVERSE,
ISD::ADD,
ISD::SADDSAT,
ISD::SSUBSAT,
ISD::FADD,
ISD::FSUB,
ISD::FNEG,
Expand Down Expand Up @@ -8151,6 +8153,8 @@ static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl,
case X86ISD::FHSUB:
case X86ISD::HADD:
case X86ISD::HSUB:
case X86ISD::HADDS:
case X86ISD::HSUBS:
return true;
}
return false;
Expand Down Expand Up @@ -35121,6 +35125,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(BLENDV)
NODE_NAME_CASE(HADD)
NODE_NAME_CASE(HSUB)
NODE_NAME_CASE(HADDS)
NODE_NAME_CASE(HSUBS)
NODE_NAME_CASE(FHADD)
NODE_NAME_CASE(FHSUB)
NODE_NAME_CASE(CONFLICT)
Expand Down Expand Up @@ -40897,8 +40903,9 @@ static SDValue canonicalizeShuffleMaskWithHorizOp(
}))
return SDValue();

bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::FHSUB ||
Opcode0 == X86ISD::HADD || Opcode0 == X86ISD::HSUB ||
Opcode0 == X86ISD::HADDS || Opcode0 == X86ISD::HSUBS);
bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
if (!isHoriz && !isPack)
return SDValue();
Expand Down Expand Up @@ -54231,7 +54238,9 @@ static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
unsigned Opcode = N->getOpcode();
bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
bool IsAdd =
(Opcode == ISD::FADD) || (Opcode == ISD::ADD) || (Opcode == ISD::SADDSAT);
bool IsSat = (Opcode == ISD::SADDSAT) || (Opcode == ISD::SSUBSAT);
SmallVector<int, 8> PostShuffleMask;

auto MergableHorizOp = [N](unsigned HorizOpcode) {
Expand Down Expand Up @@ -54261,11 +54270,17 @@ static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
break;
case ISD::ADD:
case ISD::SUB:
if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
VT == MVT::v16i16 || VT == MVT::v8i32)) {
case ISD::SADDSAT:
case ISD::SSUBSAT:
if (!Subtarget.hasSSSE3())
break;
if (VT == MVT::v8i16 || VT == MVT::v16i16 ||
(!IsSat && (VT == MVT::v4i32 || VT == MVT::v8i32))) {

SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
auto HorizOpcode = IsSat ? (IsAdd ? X86ISD::HADDS : X86ISD::HSUBS)
: (IsAdd ? X86ISD::HADD : X86ISD::HSUB);
if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
PostShuffleMask, MergableHorizOp(HorizOpcode))) {
auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
Expand Down Expand Up @@ -61052,6 +61067,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
case X86ISD::ADD:
case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI, Subtarget);
case ISD::SADDSAT:
case ISD::SSUBSAT: return combineToHorizontalAddSub(N, DAG, Subtarget);
case X86ISD::CLOAD:
case X86ISD::CSTORE: return combineX86CloadCstore(N, DAG);
case X86ISD::SBB: return combineSBB(N, DAG);
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/X86/X86ISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,10 @@ namespace llvm {
HADD,
HSUB,

/// Integer horizontal saturating add/sub.
HADDS,
HSUBS,

/// Floating point horizontal add/sub.
FHADD,
FHSUB,
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,8 @@ def X86fhadd : SDNode<"X86ISD::FHADD", SDTFPBinOp>;
def X86fhsub : SDNode<"X86ISD::FHSUB", SDTFPBinOp>;
def X86hadd : SDNode<"X86ISD::HADD", SDTIntBinOp>;
def X86hsub : SDNode<"X86ISD::HSUB", SDTIntBinOp>;
def X86hadds : SDNode<"X86ISD::HADDS", SDTIntBinOp>;
def X86hsubs : SDNode<"X86ISD::HSUBS", SDTIntBinOp>;
def X86comi : SDNode<"X86ISD::COMI", SDTX86FCmp>;
def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86FCmp>;
def X86comi512 : SDNode<"X86ISD::COMX", SDTX86FCmp>;
Expand Down
34 changes: 16 additions & 18 deletions llvm/lib/Target/X86/X86InstrSSE.td
Original file line number Diff line number Diff line change
Expand Up @@ -4864,12 +4864,12 @@ let isCommutable = 0 in {
defm VPSIGND : SS3I_binop_rm_int<0x0A, "vpsignd",
int_x86_ssse3_psign_d_128,
SchedWriteVecALU.XMM, load, 0>, VEX, VVVV, WIG;
defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw",
int_x86_ssse3_phadd_sw_128,
SchedWritePHAdd.XMM, load, 0>, VEX, VVVV, WIG;
defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw",
int_x86_ssse3_phsub_sw_128,
SchedWritePHAdd.XMM, load, 0>, VEX, VVVV, WIG;
defm VPHADDSW : SS3I_binop_rm<0x03, "vphaddsw", X86hadds, v8i16, v8i16, VR128,
load, i128mem,
SchedWritePHAdd.XMM, 0>, VEX, VVVV, WIG;
defm VPHSUBSW : SS3I_binop_rm<0x07, "vphsubsw", X86hsubs, v8i16, v8i16, VR128,
load, i128mem,
SchedWritePHAdd.XMM, 0>, VEX, VVVV, WIG;
}
}

Expand Down Expand Up @@ -4907,12 +4907,12 @@ let isCommutable = 0 in {
SchedWriteVecALU.YMM>, VEX, VVVV, VEX_L, WIG;
defm VPSIGND : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d,
SchedWriteVecALU.YMM>, VEX, VVVV, VEX_L, WIG;
defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw",
int_x86_avx2_phadd_sw,
SchedWritePHAdd.YMM>, VEX, VVVV, VEX_L, WIG;
defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw",
int_x86_avx2_phsub_sw,
SchedWritePHAdd.YMM>, VEX, VVVV, VEX_L, WIG;
defm VPHADDSWY : SS3I_binop_rm<0x03, "vphaddsw", X86hadds, v16i16, v16i16,
VR256, load, i256mem,
SchedWritePHAdd.YMM, 0>, VEX, VVVV, VEX_L, WIG;
defm VPHSUBSWY : SS3I_binop_rm<0x07, "vphsubsw", X86hsubs, v16i16, v16i16,
VR256, load, i256mem,
SchedWritePHAdd.YMM, 0>, VEX, VVVV, VEX_L, WIG;
}
}

Expand All @@ -4935,12 +4935,10 @@ let isCommutable = 0 in {
SchedWriteVecALU.XMM, memop>;
defm PSHUFB : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, v16i8, VR128,
memop, i128mem, SchedWriteVarShuffle.XMM>;
defm PHADDSW : SS3I_binop_rm_int<0x03, "phaddsw",
int_x86_ssse3_phadd_sw_128,
SchedWritePHAdd.XMM, memop>;
defm PHSUBSW : SS3I_binop_rm_int<0x07, "phsubsw",
int_x86_ssse3_phsub_sw_128,
SchedWritePHAdd.XMM, memop>;
defm PHADDSW : SS3I_binop_rm<0x03, "phaddsw", X86hadds, v8i16, v8i16, VR128,
memop, i128mem, SchedWritePHAdd.XMM>;
defm PHSUBSW : SS3I_binop_rm<0x07, "phsubsw", X86hsubs, v8i16, v8i16, VR128,
memop, i128mem, SchedWritePHAdd.XMM>;
defm PMADDUBSW : SS3I_binop_rm<0x04, "pmaddubsw", X86vpmaddubsw, v8i16,
v16i8, VR128, memop, i128mem,
SchedWriteVecIMul.XMM>;
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/X86/X86IntrinsicsInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -724,8 +724,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx2_permd, VPERM_2OP, X86ISD::VPERMV, 0),
X86_INTRINSIC_DATA(avx2_permps, VPERM_2OP, X86ISD::VPERMV, 0),
X86_INTRINSIC_DATA(avx2_phadd_d, INTR_TYPE_2OP, X86ISD::HADD, 0),
X86_INTRINSIC_DATA(avx2_phadd_sw, INTR_TYPE_2OP, X86ISD::HADDS, 0),
X86_INTRINSIC_DATA(avx2_phadd_w, INTR_TYPE_2OP, X86ISD::HADD, 0),
X86_INTRINSIC_DATA(avx2_phsub_d, INTR_TYPE_2OP, X86ISD::HSUB, 0),
X86_INTRINSIC_DATA(avx2_phsub_sw, INTR_TYPE_2OP, X86ISD::HSUBS, 0),
X86_INTRINSIC_DATA(avx2_phsub_w, INTR_TYPE_2OP, X86ISD::HSUB, 0),
X86_INTRINSIC_DATA(avx2_pmadd_ub_sw, INTR_TYPE_2OP, X86ISD::VPMADDUBSW, 0),
X86_INTRINSIC_DATA(avx2_pmadd_wd, INTR_TYPE_2OP, X86ISD::VPMADDWD, 0),
Expand Down Expand Up @@ -2017,11 +2019,13 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(ssse3_phadd_d, INTR_TYPE_CAST_MMX, 0, 0),
X86_INTRINSIC_DATA(ssse3_phadd_d_128, INTR_TYPE_2OP, X86ISD::HADD, 0),
X86_INTRINSIC_DATA(ssse3_phadd_sw, INTR_TYPE_CAST_MMX, 0, 0),
X86_INTRINSIC_DATA(ssse3_phadd_sw_128, INTR_TYPE_2OP, X86ISD::HADDS, 0),
X86_INTRINSIC_DATA(ssse3_phadd_w, INTR_TYPE_CAST_MMX, 0, 0),
X86_INTRINSIC_DATA(ssse3_phadd_w_128, INTR_TYPE_2OP, X86ISD::HADD, 0),
X86_INTRINSIC_DATA(ssse3_phsub_d, INTR_TYPE_CAST_MMX, 0, 0),
X86_INTRINSIC_DATA(ssse3_phsub_d_128, INTR_TYPE_2OP, X86ISD::HSUB, 0),
X86_INTRINSIC_DATA(ssse3_phsub_sw, INTR_TYPE_CAST_MMX, 0, 0),
X86_INTRINSIC_DATA(ssse3_phsub_sw_128, INTR_TYPE_2OP, X86ISD::HSUBS, 0),
X86_INTRINSIC_DATA(ssse3_phsub_w, INTR_TYPE_CAST_MMX, 0, 0),
X86_INTRINSIC_DATA(ssse3_phsub_w_128, INTR_TYPE_2OP, X86ISD::HSUB, 0),
X86_INTRINSIC_DATA(ssse3_pmadd_ub_sw, INTR_TYPE_CAST_MMX, 0, 0),
Expand Down
101 changes: 101 additions & 0 deletions llvm/test/CodeGen/X86/haddsubsat.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s -check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s -check-prefix=AVX2

; Direct call to the 128-bit target intrinsic llvm.x86.ssse3.phadd.sw.128.
; Both run lines expect exactly one horizontal saturating add instruction
; (phaddsw for the ssse3 run, vphaddsw for the avx2 run) followed by the return.
define <8 x i16> @phaddsw_v8i16_intrinsic(<8 x i16> %a, <8 x i16> %b) {
; SSSE3-LABEL: phaddsw_v8i16_intrinsic:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddsw %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX2-LABEL: phaddsw_v8i16_intrinsic:
; AVX2: # %bb.0:
; AVX2-NEXT: vphaddsw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
%res = call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %a, <8 x i16> %b)
ret <8 x i16> %res
}

; Target-independent form of the horizontal saturating add: the even-indexed
; and odd-indexed elements of the concatenation of %a and %b are gathered with
; two shufflevectors and combined with llvm.sadd.sat.v8i16.
; Both run lines expect this pattern to collapse into a single
; phaddsw / vphaddsw, i.e. the same code as the intrinsic version.
define <8 x i16> @phaddsw_v8i16_generic(<8 x i16> %a, <8 x i16> %b) {
; SSSE3-LABEL: phaddsw_v8i16_generic:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddsw %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX2-LABEL: phaddsw_v8i16_generic:
; AVX2: # %bb.0:
; AVX2-NEXT: vphaddsw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
%even = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
%odd = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
%sum = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %even, <8 x i16> %odd)
ret <8 x i16> %sum
}

; 256-bit variant of the generic even/odd + llvm.sadd.sat pattern.
; The ssse3 run expects two 128-bit phaddsw instructions plus a register move
; to place the second result in %xmm1.
; The avx2 run expects one 256-bit vphaddsw followed by a vpermq with mask
; [0,2,1,3]; presumably the permute reorders the per-128-bit-lane results of
; vphaddsw into the element order the shuffles request -- confirm against the
; instruction reference.
define <16 x i16> @phaddsw_v16i16_generic(<16 x i16> %a, <16 x i16> %b) {
; SSSE3-LABEL: phaddsw_v16i16_generic:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddsw %xmm1, %xmm0
; SSSE3-NEXT: phaddsw %xmm3, %xmm2
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: retq
;
; AVX2-LABEL: phaddsw_v16i16_generic:
; AVX2: # %bb.0:
; AVX2-NEXT: vphaddsw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: retq
%even = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
%odd = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
%sum = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %even, <16 x i16> %odd)
ret <16 x i16> %sum
}

; Direct call to the 128-bit target intrinsic llvm.x86.ssse3.phsub.sw.128.
; Both run lines expect exactly one horizontal saturating subtract instruction
; (phsubsw for the ssse3 run, vphsubsw for the avx2 run) followed by the return.
define <8 x i16> @phsubsw_v8i16_intrinsic(<8 x i16> %a, <8 x i16> %b) {
; SSSE3-LABEL: phsubsw_v8i16_intrinsic:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phsubsw %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX2-LABEL: phsubsw_v8i16_intrinsic:
; AVX2: # %bb.0:
; AVX2-NEXT: vphsubsw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
%res = call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %a, <8 x i16> %b)
ret <8 x i16> %res
}

; Target-independent form of the horizontal saturating subtract: the
; even-indexed elements of the concatenation of %a and %b are the minuend and
; the odd-indexed elements the subtrahend of llvm.ssub.sat.v8i16.
; Both run lines expect this pattern to collapse into a single
; phsubsw / vphsubsw, i.e. the same code as the intrinsic version.
define <8 x i16> @phsubsw_v8i16_generic(<8 x i16> %a, <8 x i16> %b) {
; SSSE3-LABEL: phsubsw_v8i16_generic:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phsubsw %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX2-LABEL: phsubsw_v8i16_generic:
; AVX2: # %bb.0:
; AVX2-NEXT: vphsubsw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
%even = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
%odd = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
%diff = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %even, <8 x i16> %odd)
ret <8 x i16> %diff
}

; 256-bit variant of the generic even/odd + llvm.ssub.sat pattern.
; The ssse3 run expects two 128-bit phsubsw instructions plus a register move
; to place the second result in %xmm1.
; The avx2 run expects one 256-bit vphsubsw followed by a vpermq with mask
; [0,2,1,3]; presumably the permute reorders the per-128-bit-lane results of
; vphsubsw into the element order the shuffles request -- confirm against the
; instruction reference.
define <16 x i16> @phsubsw_v16i16_generic(<16 x i16> %a, <16 x i16> %b) {
; SSSE3-LABEL: phsubsw_v16i16_generic:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phsubsw %xmm1, %xmm0
; SSSE3-NEXT: phsubsw %xmm3, %xmm2
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: retq
;
; AVX2-LABEL: phsubsw_v16i16_generic:
; AVX2: # %bb.0:
; AVX2-NEXT: vphsubsw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: retq
%even = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
%odd = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
%diff = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> %even, <16 x i16> %odd)
ret <16 x i16> %diff
}
Loading