-
Notifications
You must be signed in to change notification settings - Fork 15.3k
[X86] optimize ssse3 horizontal saturating add/sub #169591
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
|
@llvm/pr-subscribers-backend-x86

Author: Folkert de Vries (folkertdev)

Changes

Currently LLVM fails to recognize a manual implementation of phaddsw: https://godbolt.org/z/zozrssaWb

declare <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16>, <8 x i16>)
declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16>, <8 x i16>)

define <8 x i16> @phaddsw_v8i16_intrinsic(<8 x i16> %a, <8 x i16> %b) {
entry:
  %res = call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %a, <8 x i16> %b)
  ret <8 x i16> %res
}

define <8 x i16> @phaddsw_v8i16_generic(<8 x i16> %a, <8 x i16> %b) {
entry:
  %even = shufflevector <8 x i16> %a, <8 x i16> %b,
                        <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %odd = shufflevector <8 x i16> %a, <8 x i16> %b,
                       <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %sum = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %even, <8 x i16> %odd)
  ret <8 x i16> %sum
}

phaddsw_v8i16_intrinsic: # @phaddsw_v8i16_intrinsic
  phaddsw xmm0, xmm1
  ret
phaddsw_v8i16_generic: # @phaddsw_v8i16_generic
movdqa xmm2, xmmword ptr [rip + .LCPI1_0] # xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
movdqa xmm3, xmm1
pshufb xmm3, xmm2
movdqa xmm4, xmm0
pshufb xmm4, xmm2
punpcklqdq xmm4, xmm3 # xmm4 = xmm4[0],xmm3[0]
psrad xmm1, 16
psrad xmm0, 16
packssdw xmm0, xmm1
paddsw xmm0, xmm4
  ret

This PR does recognize the pattern. What I haven't been able to figure out is how to also make this work for a v16i16 vector (using the AVX2 instruction).

Full diff: https://github.com/llvm/llvm-project/pull/169591.diff

5 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index d49f25a950e3a..3370e2de0dbbd 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2654,6 +2654,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
ISD::AVGFLOORU,
ISD::BITREVERSE,
ISD::ADD,
+ ISD::SADDSAT,
+ ISD::SSUBSAT,
ISD::FADD,
ISD::FSUB,
ISD::FNEG,
@@ -8114,6 +8116,8 @@ static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl,
case X86ISD::FHSUB:
case X86ISD::HADD:
case X86ISD::HSUB:
+ case X86ISD::HADDS:
+ case X86ISD::HSUBS:
return true;
}
return false;
@@ -34984,6 +34988,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(BLENDV)
NODE_NAME_CASE(HADD)
NODE_NAME_CASE(HSUB)
+ NODE_NAME_CASE(HADDS)
+ NODE_NAME_CASE(HSUBS)
NODE_NAME_CASE(FHADD)
NODE_NAME_CASE(FHSUB)
NODE_NAME_CASE(CONFLICT)
@@ -54034,7 +54040,7 @@ static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
unsigned Opcode = N->getOpcode();
- bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
+ bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD) || (Opcode == ISD::SADDSAT);
SmallVector<int, 8> PostShuffleMask;
auto MergableHorizOp = [N](unsigned HorizOpcode) {
@@ -54084,6 +54090,27 @@ static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
}
}
break;
+ case ISD::SADDSAT:
+ case ISD::SSUBSAT:
+ if (Subtarget.hasSSSE3() && VT == MVT::v8i16) {
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ auto HorizOpcode = IsAdd ? X86ISD::HADDS : X86ISD::HSUBS;
+ if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
+ PostShuffleMask, MergableHorizOp(HorizOpcode))) {
+ auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
+ ArrayRef<SDValue> Ops) {
+ return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
+ };
+ SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
+ {LHS, RHS}, HOpBuilder);
+ if (!PostShuffleMask.empty())
+ HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
+ DAG.getUNDEF(VT), PostShuffleMask);
+ return HorizBinOp;
+ }
+ }
+ break;
}
return SDValue();
@@ -60793,6 +60820,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
case X86ISD::ADD:
case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI, Subtarget);
+ case ISD::SADDSAT:
+ case ISD::SSUBSAT: return combineToHorizontalAddSub(N, DAG, Subtarget);
case X86ISD::CLOAD:
case X86ISD::CSTORE: return combineX86CloadCstore(N, DAG);
case X86ISD::SBB: return combineSBB(N, DAG);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index e28b9c11a04cd..8425e18d0b35e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -270,6 +270,10 @@ namespace llvm {
HADD,
HSUB,
+ /// Integer horizontal saturating add/sub.
+ HADDS,
+ HSUBS,
+
/// Floating point horizontal add/sub.
FHADD,
FHSUB,
diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
index 5321ecf0c1b2c..0803a4946b379 100644
--- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -71,6 +71,8 @@ def X86fhadd : SDNode<"X86ISD::FHADD", SDTFPBinOp>;
def X86fhsub : SDNode<"X86ISD::FHSUB", SDTFPBinOp>;
def X86hadd : SDNode<"X86ISD::HADD", SDTIntBinOp>;
def X86hsub : SDNode<"X86ISD::HSUB", SDTIntBinOp>;
+def X86hadds : SDNode<"X86ISD::HADDS", SDTIntBinOp>;
+def X86hsubs : SDNode<"X86ISD::HSUBS", SDTIntBinOp>;
def X86comi : SDNode<"X86ISD::COMI", SDTX86FCmp>;
def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86FCmp>;
def X86comi512 : SDNode<"X86ISD::COMX", SDTX86FCmp>;
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 806b02b9f9359..ee16eaa0462ea 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -4949,6 +4949,12 @@ defm PMULHRSW : SS3I_binop_rm<0x0B, "pmulhrsw", X86mulhrs, v8i16, v8i16,
VR128, memop, i128mem, SchedWriteVecIMul.XMM>;
}
+def : Pat<(v8i16 (X86hadds VR128:$src1, VR128:$src2)),
+ (PHADDSWrr VR128:$src1, VR128:$src2)>;
+
+def : Pat<(v8i16 (X86hsubs VR128:$src1, VR128:$src2)),
+ (PHSUBSWrr VR128:$src1, VR128:$src2)>;
+
//===---------------------------------------------------------------------===//
// SSSE3 - Packed Align Instruction Patterns
//===---------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/X86/haddsubsat.ll b/llvm/test/CodeGen/X86/haddsubsat.ll
new file mode 100644
index 0000000000000..d7fd38c623c41
--- /dev/null
+++ b/llvm/test/CodeGen/X86/haddsubsat.ll
@@ -0,0 +1,139 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 -x86-asm-syntax=intel | FileCheck %s -check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 -x86-asm-syntax=intel | FileCheck %s -check-prefix=AVX2
+
+define <8 x i16> @phaddsw_v8i16_intrinsic(<8 x i16> %a, <8 x i16> %b) {
+; SSSE3-LABEL: phaddsw_v8i16_intrinsic:
+; SSSE3: # %bb.0: # %entry
+; SSSE3-NEXT: phaddsw xmm0, xmm1
+; SSSE3-NEXT: ret
+;
+; AVX2-LABEL: phaddsw_v8i16_intrinsic:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vphaddsw xmm0, xmm0, xmm1
+; AVX2-NEXT: ret
+entry:
+ %res = call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %a, <8 x i16> %b)
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @phaddsw_v8i16_generic(<8 x i16> %a, <8 x i16> %b) {
+; SSSE3-LABEL: phaddsw_v8i16_generic:
+; SSSE3: # %bb.0: # %entry
+; SSSE3-NEXT: phaddsw xmm0, xmm1
+; SSSE3-NEXT: ret
+;
+; AVX2-LABEL: phaddsw_v8i16_generic:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: phaddsw xmm0, xmm1
+; AVX2-NEXT: ret
+entry:
+ %even = shufflevector <8 x i16> %a, <8 x i16> %b,
+ <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %odd = shufflevector <8 x i16> %a, <8 x i16> %b,
+ <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %sum = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %even, <8 x i16> %odd)
+ ret <8 x i16> %sum
+}
+
+define <16 x i16> @phaddsw_v16i16_generic(<16 x i16> %a, <16 x i16> %b) {
+; SSSE3-LABEL: phaddsw_v16i16_generic:
+; SSSE3: # %bb.0: # %entry
+; SSSE3-NEXT: phaddsw xmm0, xmm1
+; SSSE3-NEXT: phaddsw xmm2, xmm3
+; SSSE3-NEXT: movdqa xmm1, xmm2
+; SSSE3-NEXT: ret
+;
+; AVX2-LABEL: phaddsw_v16i16_generic:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,2],ymm2[0,2],ymm3[4,6],ymm2[4,6]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,6,7,10,11,14,15,2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
+; AVX2-NEXT: vpshufb ymm1, ymm1, ymm3
+; AVX2-NEXT: vpshufb ymm0, ymm0, ymm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vpaddsw ymm0, ymm2, ymm0
+; AVX2-NEXT: ret
+entry:
+ %even = shufflevector <16 x i16> %a, <16 x i16> %b,
+ <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14,
+ i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+ %odd = shufflevector <16 x i16> %a, <16 x i16> %b,
+ <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15,
+ i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+ %sum = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %even, <16 x i16> %odd)
+ ret <16 x i16> %sum
+}
+
+define <8 x i16> @phsubsw_v8i16_intrinsic(<8 x i16> %a, <8 x i16> %b) {
+; SSSE3-LABEL: phsubsw_v8i16_intrinsic:
+; SSSE3: # %bb.0: # %entry
+; SSSE3-NEXT: phsubsw xmm0, xmm1
+; SSSE3-NEXT: ret
+;
+; AVX2-LABEL: phsubsw_v8i16_intrinsic:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vphsubsw xmm0, xmm0, xmm1
+; AVX2-NEXT: ret
+entry:
+ %res = call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %a, <8 x i16> %b)
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @phsubsw_v8i16_generic(<8 x i16> %a, <8 x i16> %b) {
+; SSSE3-LABEL: phsubsw_v8i16_generic:
+; SSSE3: # %bb.0: # %entry
+; SSSE3-NEXT: phsubsw xmm0, xmm1
+; SSSE3-NEXT: ret
+;
+; AVX2-LABEL: phsubsw_v8i16_generic:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: phsubsw xmm0, xmm1
+; AVX2-NEXT: ret
+entry:
+ %even = shufflevector <8 x i16> %a, <8 x i16> %b,
+ <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %odd = shufflevector <8 x i16> %a, <8 x i16> %b,
+ <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %diff = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %even, <8 x i16> %odd)
+ ret <8 x i16> %diff
+}
+
+define <16 x i16> @phsubsw_v16i16_generic(<16 x i16> %a, <16 x i16> %b) {
+; SSSE3-LABEL: phsubsw_v16i16_generic:
+; SSSE3: # %bb.0: # %entry
+; SSSE3-NEXT: phsubsw xmm0, xmm1
+; SSSE3-NEXT: phsubsw xmm2, xmm3
+; SSSE3-NEXT: movdqa xmm1, xmm2
+; SSSE3-NEXT: ret
+;
+; AVX2-LABEL: phsubsw_v16i16_generic:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,2],ymm2[0,2],ymm3[4,6],ymm2[4,6]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,6,7,10,11,14,15,2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
+; AVX2-NEXT: vpshufb ymm1, ymm1, ymm3
+; AVX2-NEXT: vpshufb ymm0, ymm0, ymm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vpsubsw ymm0, ymm2, ymm0
+; AVX2-NEXT: ret
+entry:
+ %even = shufflevector <16 x i16> %a, <16 x i16> %b,
+ <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14,
+ i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+ %odd = shufflevector <16 x i16> %a, <16 x i16> %b,
+ <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15,
+ i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+ %diff = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> %even, <16 x i16> %odd)
+ ret <16 x i16> %diff
+}
|
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
f8bae37 to
46d83df
Compare
folkertdev
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think I figured it out, the avx2 case also works now.
ff70647 to
e4ef817
Compare
0484baa to
4eb0f3f
Compare
RKSimon
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM - cheers
Currently LLVM fails to recognize a manual implementation of phaddsw: https://godbolt.org/z/zozrssaWb

This PR does recognize the pattern.

What I haven't been able to figure out is how to also make this work for a v16i16 vector (using the AVX2 instruction). What would be the best way to go about that? My patterns give weird compile errors, and I haven't really been able to find a good analogue.