Skip to content

Commit f8bae37

Browse files
committed
optimize ssse3 horizontal saturating add/sub
1 parent 2936852 commit f8bae37

File tree

5 files changed

+181
-1
lines changed

5 files changed

+181
-1
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2654,6 +2654,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
26542654
ISD::AVGFLOORU,
26552655
ISD::BITREVERSE,
26562656
ISD::ADD,
2657+
ISD::SADDSAT,
2658+
ISD::SSUBSAT,
26572659
ISD::FADD,
26582660
ISD::FSUB,
26592661
ISD::FNEG,
@@ -8114,6 +8116,8 @@ static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl,
81148116
case X86ISD::FHSUB:
81158117
case X86ISD::HADD:
81168118
case X86ISD::HSUB:
8119+
case X86ISD::HADDS:
8120+
case X86ISD::HSUBS:
81178121
return true;
81188122
}
81198123
return false;
@@ -34984,6 +34988,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
3498434988
NODE_NAME_CASE(BLENDV)
3498534989
NODE_NAME_CASE(HADD)
3498634990
NODE_NAME_CASE(HSUB)
34991+
NODE_NAME_CASE(HADDS)
34992+
NODE_NAME_CASE(HSUBS)
3498734993
NODE_NAME_CASE(FHADD)
3498834994
NODE_NAME_CASE(FHSUB)
3498934995
NODE_NAME_CASE(CONFLICT)
@@ -54034,7 +54040,7 @@ static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
5403454040
const X86Subtarget &Subtarget) {
5403554041
EVT VT = N->getValueType(0);
5403654042
unsigned Opcode = N->getOpcode();
54037-
bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
54043+
bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD) || (Opcode == ISD::SADDSAT);
5403854044
SmallVector<int, 8> PostShuffleMask;
5403954045

5404054046
auto MergableHorizOp = [N](unsigned HorizOpcode) {
@@ -54084,6 +54090,27 @@ static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
5408454090
}
5408554091
}
5408654092
break;
54093+
case ISD::SADDSAT:
54094+
case ISD::SSUBSAT:
54095+
if (Subtarget.hasSSSE3() && VT == MVT::v8i16) {
54096+
SDValue LHS = N->getOperand(0);
54097+
SDValue RHS = N->getOperand(1);
54098+
auto HorizOpcode = IsAdd ? X86ISD::HADDS : X86ISD::HSUBS;
54099+
if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
54100+
PostShuffleMask, MergableHorizOp(HorizOpcode))) {
54101+
auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
54102+
ArrayRef<SDValue> Ops) {
54103+
return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
54104+
};
54105+
SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
54106+
{LHS, RHS}, HOpBuilder);
54107+
if (!PostShuffleMask.empty())
54108+
HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
54109+
DAG.getUNDEF(VT), PostShuffleMask);
54110+
return HorizBinOp;
54111+
}
54112+
}
54113+
break;
5408754114
}
5408854115

5408954116
return SDValue();
@@ -60793,6 +60820,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
6079360820
case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
6079460821
case X86ISD::ADD:
6079560822
case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI, Subtarget);
60823+
case ISD::SADDSAT:
60824+
case ISD::SSUBSAT: return combineToHorizontalAddSub(N, DAG, Subtarget);
6079660825
case X86ISD::CLOAD:
6079760826
case X86ISD::CSTORE: return combineX86CloadCstore(N, DAG);
6079860827
case X86ISD::SBB: return combineSBB(N, DAG);

llvm/lib/Target/X86/X86ISelLowering.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,10 @@ namespace llvm {
270270
HADD,
271271
HSUB,
272272

273+
/// Integer horizontal saturating add/sub.
274+
HADDS,
275+
HSUBS,
276+
273277
/// Floating point horizontal add/sub.
274278
FHADD,
275279
FHSUB,

llvm/lib/Target/X86/X86InstrFragmentsSIMD.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,8 @@ def X86fhadd : SDNode<"X86ISD::FHADD", SDTFPBinOp>;
7171
def X86fhsub : SDNode<"X86ISD::FHSUB", SDTFPBinOp>;
7272
def X86hadd : SDNode<"X86ISD::HADD", SDTIntBinOp>;
7373
def X86hsub : SDNode<"X86ISD::HSUB", SDTIntBinOp>;
74+
// Integer horizontal saturating add/sub nodes (X86ISD::HADDS / X86ISD::HSUBS).
def X86hadds : SDNode<"X86ISD::HADDS", SDTIntBinOp>;
75+
def X86hsubs : SDNode<"X86ISD::HSUBS", SDTIntBinOp>;
7476
def X86comi : SDNode<"X86ISD::COMI", SDTX86FCmp>;
7577
def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86FCmp>;
7678
def X86comi512 : SDNode<"X86ISD::COMX", SDTX86FCmp>;

llvm/lib/Target/X86/X86InstrSSE.td

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4949,6 +4949,12 @@ defm PMULHRSW : SS3I_binop_rm<0x0B, "pmulhrsw", X86mulhrs, v8i16, v8i16,
49494949
VR128, memop, i128mem, SchedWriteVecIMul.XMM>;
49504950
}
49514951

4952+
// Instruction selection for the generic horizontal saturating add/sub nodes
// (X86ISD::HADDS / X86ISD::HSUBS) on v8i16.
//
// The patterns are guarded by subtarget predicates so that AVX-capable
// targets select the VEX-encoded three-operand instructions and only
// non-AVX SSSE3 targets select the legacy two-operand encodings. An
// unpredicated pattern would match on every subtarget and emit the legacy
// encoding even when AVX is enabled.
let Predicates = [HasAVX] in {
  def : Pat<(v8i16 (X86hadds VR128:$src1, VR128:$src2)),
            (VPHADDSWrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (X86hsubs VR128:$src1, VR128:$src2)),
            (VPHSUBSWrr VR128:$src1, VR128:$src2)>;
}

let Predicates = [UseSSSE3] in {
  def : Pat<(v8i16 (X86hadds VR128:$src1, VR128:$src2)),
            (PHADDSWrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (X86hsubs VR128:$src1, VR128:$src2)),
            (PHSUBSWrr VR128:$src1, VR128:$src2)>;
}
4957+
49524958
//===---------------------------------------------------------------------===//
49534959
// SSSE3 - Packed Align Instruction Patterns
49544960
//===---------------------------------------------------------------------===//
Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 -x86-asm-syntax=intel | FileCheck %s -check-prefix=SSSE3
3+
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 -x86-asm-syntax=intel | FileCheck %s -check-prefix=AVX2
4+
5+
; Baseline: calls the llvm.x86.ssse3.phadd.sw.128 intrinsic directly on %a, %b.
; The checks expect phaddsw for the SSSE3 run and the VEX form vphaddsw for
; the AVX2 run.
define <8 x i16> @phaddsw_v8i16_intrinsic(<8 x i16> %a, <8 x i16> %b) {
6+
; SSSE3-LABEL: phaddsw_v8i16_intrinsic:
7+
; SSSE3: # %bb.0: # %entry
8+
; SSSE3-NEXT: phaddsw xmm0, xmm1
9+
; SSSE3-NEXT: ret
10+
;
11+
; AVX2-LABEL: phaddsw_v8i16_intrinsic:
12+
; AVX2: # %bb.0: # %entry
13+
; AVX2-NEXT: vphaddsw xmm0, xmm0, xmm1
14+
; AVX2-NEXT: ret
15+
entry:
16+
%res = call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %a, <8 x i16> %b)
17+
ret <8 x i16> %res
18+
}
19+
20+
; Generic form: the even-indexed and odd-indexed elements of %a and %b are
; gathered with two shufflevectors and combined with llvm.sadd.sat. Both runs
; expect this to collapse into a single horizontal saturating add.
; NOTE(review): the AVX2 checks expect the legacy-encoded `phaddsw xmm0, xmm1`,
; whereas phaddsw_v8i16_intrinsic expects the VEX form `vphaddsw` under the
; same RUN line. Presumably the selection for this path has no AVX-specific
; variant; confirm this is intended.
define <8 x i16> @phaddsw_v8i16_generic(<8 x i16> %a, <8 x i16> %b) {
21+
; SSSE3-LABEL: phaddsw_v8i16_generic:
22+
; SSSE3: # %bb.0: # %entry
23+
; SSSE3-NEXT: phaddsw xmm0, xmm1
24+
; SSSE3-NEXT: ret
25+
;
26+
; AVX2-LABEL: phaddsw_v8i16_generic:
27+
; AVX2: # %bb.0: # %entry
28+
; AVX2-NEXT: phaddsw xmm0, xmm1
29+
; AVX2-NEXT: ret
30+
entry:
31+
%even = shufflevector <8 x i16> %a, <8 x i16> %b,
32+
<8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
33+
%odd = shufflevector <8 x i16> %a, <8 x i16> %b,
34+
<8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
35+
%sum = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %even, <8 x i16> %odd)
36+
ret <8 x i16> %sum
37+
}
38+
39+
; <16 x i16> variant of the generic even/odd shuffle + llvm.sadd.sat pattern.
; The SSSE3 checks expect two phaddsw instructions (xmm0/xmm1 and xmm2/xmm3)
; followed by a register move. The AVX2 checks expect an explicit shuffle
; sequence ending in vpaddsw on ymm registers, i.e. no horizontal instruction.
define <16 x i16> @phaddsw_v16i16_generic(<16 x i16> %a, <16 x i16> %b) {
40+
; SSSE3-LABEL: phaddsw_v16i16_generic:
41+
; SSSE3: # %bb.0: # %entry
42+
; SSSE3-NEXT: phaddsw xmm0, xmm1
43+
; SSSE3-NEXT: phaddsw xmm2, xmm3
44+
; SSSE3-NEXT: movdqa xmm1, xmm2
45+
; SSSE3-NEXT: ret
46+
;
47+
; AVX2-LABEL: phaddsw_v16i16_generic:
48+
; AVX2: # %bb.0: # %entry
49+
; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
50+
; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
51+
; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
52+
; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
53+
; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,2],ymm2[0,2],ymm3[4,6],ymm2[4,6]
54+
; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
55+
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,6,7,10,11,14,15,2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
56+
; AVX2-NEXT: vpshufb ymm1, ymm1, ymm3
57+
; AVX2-NEXT: vpshufb ymm0, ymm0, ymm3
58+
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
59+
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
60+
; AVX2-NEXT: vpaddsw ymm0, ymm2, ymm0
61+
; AVX2-NEXT: ret
62+
entry:
63+
%even = shufflevector <16 x i16> %a, <16 x i16> %b,
64+
<16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14,
65+
i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
66+
%odd = shufflevector <16 x i16> %a, <16 x i16> %b,
67+
<16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15,
68+
i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
69+
%sum = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %even, <16 x i16> %odd)
70+
ret <16 x i16> %sum
71+
}
72+
73+
; Baseline: calls the llvm.x86.ssse3.phsub.sw.128 intrinsic directly on %a, %b.
; The checks expect phsubsw for the SSSE3 run and the VEX form vphsubsw for
; the AVX2 run.
define <8 x i16> @phsubsw_v8i16_intrinsic(<8 x i16> %a, <8 x i16> %b) {
74+
; SSSE3-LABEL: phsubsw_v8i16_intrinsic:
75+
; SSSE3: # %bb.0: # %entry
76+
; SSSE3-NEXT: phsubsw xmm0, xmm1
77+
; SSSE3-NEXT: ret
78+
;
79+
; AVX2-LABEL: phsubsw_v8i16_intrinsic:
80+
; AVX2: # %bb.0: # %entry
81+
; AVX2-NEXT: vphsubsw xmm0, xmm0, xmm1
82+
; AVX2-NEXT: ret
83+
entry:
84+
%res = call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %a, <8 x i16> %b)
85+
ret <8 x i16> %res
86+
}
87+
88+
; Generic form: the even-indexed and odd-indexed elements of %a and %b are
; gathered with two shufflevectors and combined with llvm.ssub.sat (even minus
; odd). Both runs expect a single horizontal saturating subtract.
; NOTE(review): the AVX2 checks expect the legacy-encoded `phsubsw xmm0, xmm1`,
; whereas phsubsw_v8i16_intrinsic expects the VEX form `vphsubsw` under the
; same RUN line. Presumably the selection for this path has no AVX-specific
; variant; confirm this is intended.
define <8 x i16> @phsubsw_v8i16_generic(<8 x i16> %a, <8 x i16> %b) {
89+
; SSSE3-LABEL: phsubsw_v8i16_generic:
90+
; SSSE3: # %bb.0: # %entry
91+
; SSSE3-NEXT: phsubsw xmm0, xmm1
92+
; SSSE3-NEXT: ret
93+
;
94+
; AVX2-LABEL: phsubsw_v8i16_generic:
95+
; AVX2: # %bb.0: # %entry
96+
; AVX2-NEXT: phsubsw xmm0, xmm1
97+
; AVX2-NEXT: ret
98+
entry:
99+
%even = shufflevector <8 x i16> %a, <8 x i16> %b,
100+
<8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
101+
%odd = shufflevector <8 x i16> %a, <8 x i16> %b,
102+
<8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
103+
%diff = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %even, <8 x i16> %odd)
104+
ret <8 x i16> %diff
105+
}
106+
107+
; <16 x i16> variant of the generic even/odd shuffle + llvm.ssub.sat pattern.
; The SSSE3 checks expect two phsubsw instructions (xmm0/xmm1 and xmm2/xmm3)
; followed by a register move. The AVX2 checks expect an explicit shuffle
; sequence ending in vpsubsw on ymm registers, i.e. no horizontal instruction.
define <16 x i16> @phsubsw_v16i16_generic(<16 x i16> %a, <16 x i16> %b) {
108+
; SSSE3-LABEL: phsubsw_v16i16_generic:
109+
; SSSE3: # %bb.0: # %entry
110+
; SSSE3-NEXT: phsubsw xmm0, xmm1
111+
; SSSE3-NEXT: phsubsw xmm2, xmm3
112+
; SSSE3-NEXT: movdqa xmm1, xmm2
113+
; SSSE3-NEXT: ret
114+
;
115+
; AVX2-LABEL: phsubsw_v16i16_generic:
116+
; AVX2: # %bb.0: # %entry
117+
; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
118+
; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
119+
; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
120+
; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
121+
; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,2],ymm2[0,2],ymm3[4,6],ymm2[4,6]
122+
; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
123+
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,6,7,10,11,14,15,2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
124+
; AVX2-NEXT: vpshufb ymm1, ymm1, ymm3
125+
; AVX2-NEXT: vpshufb ymm0, ymm0, ymm3
126+
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
127+
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
128+
; AVX2-NEXT: vpsubsw ymm0, ymm2, ymm0
129+
; AVX2-NEXT: ret
130+
entry:
131+
%even = shufflevector <16 x i16> %a, <16 x i16> %b,
132+
<16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14,
133+
i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
134+
%odd = shufflevector <16 x i16> %a, <16 x i16> %b,
135+
<16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15,
136+
i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
137+
%diff = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> %even, <16 x i16> %odd)
138+
ret <16 x i16> %diff
139+
}

0 commit comments

Comments
 (0)