-
Notifications
You must be signed in to change notification settings - Fork 15.3k
[X86] optimize ssse3 horizontal saturating add/sub #169591
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
|
@llvm/pr-subscribers-backend-x86

Author: Folkert de Vries (folkertdev)

Changes

Currently LLVM fails to recognize a manual implementation of phaddsw: https://godbolt.org/z/zozrssaWb

declare <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16>, <8 x i16>)
declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16>, <8 x i16>)

define <8 x i16> @phaddsw_v8i16_intrinsic(<8 x i16> %a, <8 x i16> %b) {
entry:
  %res = call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %a, <8 x i16> %b)
  ret <8 x i16> %res
}

define <8 x i16> @phaddsw_v8i16_generic(<8 x i16> %a, <8 x i16> %b) {
entry:
  %even = shufflevector <8 x i16> %a, <8 x i16> %b,
                        <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %odd = shufflevector <8 x i16> %a, <8 x i16> %b,
                       <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %sum = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %even, <8 x i16> %odd)
  ret <8 x i16> %sum
}

phaddsw_v8i16_intrinsic: # @phaddsw_v8i16_intrinsic
  phaddsw xmm0, xmm1
  ret
phaddsw_v8i16_generic: # @phaddsw_v8i16_generic
movdqa xmm2, xmmword ptr [rip + .LCPI1_0] # xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
movdqa xmm3, xmm1
pshufb xmm3, xmm2
movdqa xmm4, xmm0
pshufb xmm4, xmm2
punpcklqdq xmm4, xmm3 # xmm4 = xmm4[0],xmm3[0]
psrad xmm1, 16
psrad xmm0, 16
packssdw xmm0, xmm1
paddsw xmm0, xmm4
  ret

This PR does recognize the pattern. What I haven't been able to figure out is how to also make this work for a v16i16 vector (using the AVX2 instruction).

Full diff: https://github.com/llvm/llvm-project/pull/169591.diff

5 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index d49f25a950e3a..3370e2de0dbbd 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2654,6 +2654,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
ISD::AVGFLOORU,
ISD::BITREVERSE,
ISD::ADD,
+ ISD::SADDSAT,
+ ISD::SSUBSAT,
ISD::FADD,
ISD::FSUB,
ISD::FNEG,
@@ -8114,6 +8116,8 @@ static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl,
case X86ISD::FHSUB:
case X86ISD::HADD:
case X86ISD::HSUB:
+ case X86ISD::HADDS:
+ case X86ISD::HSUBS:
return true;
}
return false;
@@ -34984,6 +34988,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(BLENDV)
NODE_NAME_CASE(HADD)
NODE_NAME_CASE(HSUB)
+ NODE_NAME_CASE(HADDS)
+ NODE_NAME_CASE(HSUBS)
NODE_NAME_CASE(FHADD)
NODE_NAME_CASE(FHSUB)
NODE_NAME_CASE(CONFLICT)
@@ -54034,7 +54040,7 @@ static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
unsigned Opcode = N->getOpcode();
- bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
+ bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD) || (Opcode == ISD::SADDSAT);
SmallVector<int, 8> PostShuffleMask;
auto MergableHorizOp = [N](unsigned HorizOpcode) {
@@ -54084,6 +54090,27 @@ static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
}
}
break;
+ case ISD::SADDSAT:
+ case ISD::SSUBSAT:
+ if (Subtarget.hasSSSE3() && VT == MVT::v8i16) {
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ auto HorizOpcode = IsAdd ? X86ISD::HADDS : X86ISD::HSUBS;
+ if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
+ PostShuffleMask, MergableHorizOp(HorizOpcode))) {
+ auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
+ ArrayRef<SDValue> Ops) {
+ return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
+ };
+ SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
+ {LHS, RHS}, HOpBuilder);
+ if (!PostShuffleMask.empty())
+ HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
+ DAG.getUNDEF(VT), PostShuffleMask);
+ return HorizBinOp;
+ }
+ }
+ break;
}
return SDValue();
@@ -60793,6 +60820,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
case X86ISD::ADD:
case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI, Subtarget);
+ case ISD::SADDSAT:
+ case ISD::SSUBSAT: return combineToHorizontalAddSub(N, DAG, Subtarget);
case X86ISD::CLOAD:
case X86ISD::CSTORE: return combineX86CloadCstore(N, DAG);
case X86ISD::SBB: return combineSBB(N, DAG);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index e28b9c11a04cd..8425e18d0b35e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -270,6 +270,10 @@ namespace llvm {
HADD,
HSUB,
+ /// Integer horizontal saturating add/sub.
+ HADDS,
+ HSUBS,
+
/// Floating point horizontal add/sub.
FHADD,
FHSUB,
diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
index 5321ecf0c1b2c..0803a4946b379 100644
--- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -71,6 +71,8 @@ def X86fhadd : SDNode<"X86ISD::FHADD", SDTFPBinOp>;
def X86fhsub : SDNode<"X86ISD::FHSUB", SDTFPBinOp>;
def X86hadd : SDNode<"X86ISD::HADD", SDTIntBinOp>;
def X86hsub : SDNode<"X86ISD::HSUB", SDTIntBinOp>;
+def X86hadds : SDNode<"X86ISD::HADDS", SDTIntBinOp>;
+def X86hsubs : SDNode<"X86ISD::HSUBS", SDTIntBinOp>;
def X86comi : SDNode<"X86ISD::COMI", SDTX86FCmp>;
def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86FCmp>;
def X86comi512 : SDNode<"X86ISD::COMX", SDTX86FCmp>;
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 806b02b9f9359..ee16eaa0462ea 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -4949,6 +4949,12 @@ defm PMULHRSW : SS3I_binop_rm<0x0B, "pmulhrsw", X86mulhrs, v8i16, v8i16,
VR128, memop, i128mem, SchedWriteVecIMul.XMM>;
}
+def : Pat<(v8i16 (X86hadds VR128:$src1, VR128:$src2)),
+ (PHADDSWrr VR128:$src1, VR128:$src2)>;
+
+def : Pat<(v8i16 (X86hsubs VR128:$src1, VR128:$src2)),
+ (PHSUBSWrr VR128:$src1, VR128:$src2)>;
+
//===---------------------------------------------------------------------===//
// SSSE3 - Packed Align Instruction Patterns
//===---------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/X86/haddsubsat.ll b/llvm/test/CodeGen/X86/haddsubsat.ll
new file mode 100644
index 0000000000000..d7fd38c623c41
--- /dev/null
+++ b/llvm/test/CodeGen/X86/haddsubsat.ll
@@ -0,0 +1,139 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 -x86-asm-syntax=intel | FileCheck %s -check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 -x86-asm-syntax=intel | FileCheck %s -check-prefix=AVX2
+
+define <8 x i16> @phaddsw_v8i16_intrinsic(<8 x i16> %a, <8 x i16> %b) {
+; SSSE3-LABEL: phaddsw_v8i16_intrinsic:
+; SSSE3: # %bb.0: # %entry
+; SSSE3-NEXT: phaddsw xmm0, xmm1
+; SSSE3-NEXT: ret
+;
+; AVX2-LABEL: phaddsw_v8i16_intrinsic:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vphaddsw xmm0, xmm0, xmm1
+; AVX2-NEXT: ret
+entry:
+ %res = call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %a, <8 x i16> %b)
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @phaddsw_v8i16_generic(<8 x i16> %a, <8 x i16> %b) {
+; SSSE3-LABEL: phaddsw_v8i16_generic:
+; SSSE3: # %bb.0: # %entry
+; SSSE3-NEXT: phaddsw xmm0, xmm1
+; SSSE3-NEXT: ret
+;
+; AVX2-LABEL: phaddsw_v8i16_generic:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: phaddsw xmm0, xmm1
+; AVX2-NEXT: ret
+entry:
+ %even = shufflevector <8 x i16> %a, <8 x i16> %b,
+ <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %odd = shufflevector <8 x i16> %a, <8 x i16> %b,
+ <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %sum = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %even, <8 x i16> %odd)
+ ret <8 x i16> %sum
+}
+
+define <16 x i16> @phaddsw_v16i16_generic(<16 x i16> %a, <16 x i16> %b) {
+; SSSE3-LABEL: phaddsw_v16i16_generic:
+; SSSE3: # %bb.0: # %entry
+; SSSE3-NEXT: phaddsw xmm0, xmm1
+; SSSE3-NEXT: phaddsw xmm2, xmm3
+; SSSE3-NEXT: movdqa xmm1, xmm2
+; SSSE3-NEXT: ret
+;
+; AVX2-LABEL: phaddsw_v16i16_generic:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,2],ymm2[0,2],ymm3[4,6],ymm2[4,6]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,6,7,10,11,14,15,2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
+; AVX2-NEXT: vpshufb ymm1, ymm1, ymm3
+; AVX2-NEXT: vpshufb ymm0, ymm0, ymm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vpaddsw ymm0, ymm2, ymm0
+; AVX2-NEXT: ret
+entry:
+ %even = shufflevector <16 x i16> %a, <16 x i16> %b,
+ <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14,
+ i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+ %odd = shufflevector <16 x i16> %a, <16 x i16> %b,
+ <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15,
+ i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+ %sum = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %even, <16 x i16> %odd)
+ ret <16 x i16> %sum
+}
+
+define <8 x i16> @phsubsw_v8i16_intrinsic(<8 x i16> %a, <8 x i16> %b) {
+; SSSE3-LABEL: phsubsw_v8i16_intrinsic:
+; SSSE3: # %bb.0: # %entry
+; SSSE3-NEXT: phsubsw xmm0, xmm1
+; SSSE3-NEXT: ret
+;
+; AVX2-LABEL: phsubsw_v8i16_intrinsic:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vphsubsw xmm0, xmm0, xmm1
+; AVX2-NEXT: ret
+entry:
+ %res = call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %a, <8 x i16> %b)
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @phsubsw_v8i16_generic(<8 x i16> %a, <8 x i16> %b) {
+; SSSE3-LABEL: phsubsw_v8i16_generic:
+; SSSE3: # %bb.0: # %entry
+; SSSE3-NEXT: phsubsw xmm0, xmm1
+; SSSE3-NEXT: ret
+;
+; AVX2-LABEL: phsubsw_v8i16_generic:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: phsubsw xmm0, xmm1
+; AVX2-NEXT: ret
+entry:
+ %even = shufflevector <8 x i16> %a, <8 x i16> %b,
+ <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %odd = shufflevector <8 x i16> %a, <8 x i16> %b,
+ <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %diff = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %even, <8 x i16> %odd)
+ ret <8 x i16> %diff
+}
+
+define <16 x i16> @phsubsw_v16i16_generic(<16 x i16> %a, <16 x i16> %b) {
+; SSSE3-LABEL: phsubsw_v16i16_generic:
+; SSSE3: # %bb.0: # %entry
+; SSSE3-NEXT: phsubsw xmm0, xmm1
+; SSSE3-NEXT: phsubsw xmm2, xmm3
+; SSSE3-NEXT: movdqa xmm1, xmm2
+; SSSE3-NEXT: ret
+;
+; AVX2-LABEL: phsubsw_v16i16_generic:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,2],ymm2[0,2],ymm3[4,6],ymm2[4,6]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,6,7,10,11,14,15,2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
+; AVX2-NEXT: vpshufb ymm1, ymm1, ymm3
+; AVX2-NEXT: vpshufb ymm0, ymm0, ymm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vpsubsw ymm0, ymm2, ymm0
+; AVX2-NEXT: ret
+entry:
+ %even = shufflevector <16 x i16> %a, <16 x i16> %b,
+ <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14,
+ i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+ %odd = shufflevector <16 x i16> %a, <16 x i16> %b,
+ <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15,
+ i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+ %diff = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> %even, <16 x i16> %odd)
+ ret <16 x i16> %diff
+}
|
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
f8bae37 to
46d83df
Compare
folkertdev
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think I figured it out, the avx2 case also works now.
ff70647 to
e4ef817
Compare
0484baa to
4eb0f3f
Compare
RKSimon
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM - cheers
Currently LLVM fails to recognize a manual implementation of phaddsw: https://godbolt.org/z/zozrssaWb

This PR does recognize the pattern.

What I haven't been able to figure out is how to also make this work for a v16i16 vector (using the AVX2 instruction). What would be the best way to go about that? My patterns give weird compile errors, and I haven't really been able to find a good analogue.