
[X86] computeKnownBitsForTargetNode - add INTRINSIC_WO_CHAIN handling for PSADBW intrinsics #83580

Closed
wants to merge 1 commit

Conversation

RKSimon (Collaborator) commented Mar 1, 2024

Waiting for intrinsics to be lowered to ISD target nodes is causing some poor combine decisions - at the very least we need better value tracking handling.

As an initial example I've added support for the PSADBW intrinsics (which will be further expanded along with the ISD node in #81765) as this is a good example of an intrinsic that we need to handle as early as possible.
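
As a minimal sketch (hypothetical value name SadVal) of what this enables: SelectionDAG::computeKnownBits dispatches INTRINSIC_WO_CHAIN nodes to computeKnownBitsForTargetNode, so any generic combine that queries known bits now sees the zeroed upper bits before lowering:

// Sketch only: SadVal is assumed to be an ISD::INTRINSIC_WO_CHAIN node
// wrapping llvm.x86.sse2.psad.bw. With this patch the query below reports
// bits 16..63 of each i64 element as known zero.
KnownBits Known = DAG.computeKnownBits(SadVal);
if (Known.countMinLeadingZeros() >= 48) {
  // Each i64 result is a small non-negative value, so e.g. uitofp can be
  // treated as sitofp and the conversion narrowed to use cvtdq2pd.
}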

llvmbot (Collaborator) commented Mar 1, 2024

@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)


Full diff: https://github.com/llvm/llvm-project/pull/83580.diff

3 Files Affected:

  • (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+12)
  • (modified) llvm/test/CodeGen/X86/psadbw.ll (+16-47)
  • (modified) llvm/test/CodeGen/X86/sad.ll (+3-13)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 0162bb65afe3b0..22e64385a3ef45 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -36980,6 +36980,18 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
     }
     break;
   }
+  case ISD::INTRINSIC_WO_CHAIN: {
+    switch (Op->getConstantOperandVal(0)) {
+    case Intrinsic::x86_sse2_psad_bw:
+    case Intrinsic::x86_avx2_psad_bw:
+    case Intrinsic::x86_avx512_psad_bw_512:
+      // PSADBW - fills low 16 bits and zeros upper 48 bits of each i64 result.
+      assert(VT.getScalarType() == MVT::i64 && "Unexpected PSADBW types");
+      Known.Zero.setBitsFrom(16);
+      break;
+    }
+    break;
+  }
   }
 
   // Handle target shuffles.
diff --git a/llvm/test/CodeGen/X86/psadbw.ll b/llvm/test/CodeGen/X86/psadbw.ll
index 8141b22d321f4d..daabb4d9ca4083 100644
--- a/llvm/test/CodeGen/X86/psadbw.ll
+++ b/llvm/test/CodeGen/X86/psadbw.ll
@@ -54,7 +54,6 @@ define <2 x i64> @combine_psadbw_cmp_knownbits(<16 x i8> %a0) nounwind {
 ; X86-SSE-NEXT:    pxor %xmm1, %xmm1
 ; X86-SSE-NEXT:    psadbw %xmm0, %xmm1
 ; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; X86-SSE-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE-NEXT:    retl
 ;
@@ -64,7 +63,6 @@ define <2 x i64> @combine_psadbw_cmp_knownbits(<16 x i8> %a0) nounwind {
 ; X64-SSE-NEXT:    pxor %xmm1, %xmm1
 ; X64-SSE-NEXT:    psadbw %xmm0, %xmm1
 ; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; X64-SSE-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; X64-SSE-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; X64-SSE-NEXT:    retq
 ;
@@ -82,28 +80,15 @@ define <2 x i64> @combine_psadbw_cmp_knownbits(<16 x i8> %a0) nounwind {
   ret <2 x i64> %ext
 }
 
-; TODO: No need to scalarize the sitofp as the PSADBW results are smaller than i32.
+; No need to scalarize the sitofp as the PSADBW results are smaller than i32.
 define <2 x double> @combine_psadbw_sitofp_knownbits(<16 x i8> %a0) nounwind {
 ; X86-SSE-LABEL: combine_psadbw_sitofp_knownbits:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    pushl %ebp
-; X86-SSE-NEXT:    movl %esp, %ebp
-; X86-SSE-NEXT:    andl $-8, %esp
-; X86-SSE-NEXT:    subl $32, %esp
 ; X86-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE-NEXT:    pxor %xmm1, %xmm1
 ; X86-SSE-NEXT:    psadbw %xmm0, %xmm1
-; X86-SSE-NEXT:    movq %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; X86-SSE-NEXT:    movq %xmm0, {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    fildll {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    fstpl {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    fildll {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    fstpl (%esp)
-; X86-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-SSE-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
-; X86-SSE-NEXT:    movl %ebp, %esp
-; X86-SSE-NEXT:    popl %ebp
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; X86-SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
 ; X86-SSE-NEXT:    retl
 ;
 ; X64-SSE-LABEL: combine_psadbw_sitofp_knownbits:
@@ -111,14 +96,8 @@ define <2 x double> @combine_psadbw_sitofp_knownbits(<16 x i8> %a0) nounwind {
 ; X64-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; X64-SSE-NEXT:    pxor %xmm1, %xmm1
 ; X64-SSE-NEXT:    psadbw %xmm0, %xmm1
-; X64-SSE-NEXT:    movd %xmm1, %eax
-; X64-SSE-NEXT:    xorps %xmm0, %xmm0
-; X64-SSE-NEXT:    cvtsi2sd %eax, %xmm0
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; X64-SSE-NEXT:    movd %xmm1, %eax
-; X64-SSE-NEXT:    xorps %xmm1, %xmm1
-; X64-SSE-NEXT:    cvtsi2sd %eax, %xmm1
-; X64-SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; X64-SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
 ; X64-SSE-NEXT:    retq
 ;
 ; AVX2-LABEL: combine_psadbw_sitofp_knownbits:
@@ -126,10 +105,8 @@ define <2 x double> @combine_psadbw_sitofp_knownbits(<16 x i8> %a0) nounwind {
 ; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vcvtdq2pd %xmm0, %xmm1
-; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX2-NEXT:    vcvtsi2sd %eax, %xmm2, %xmm0
-; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT:    vcvtdq2pd %xmm0, %xmm0
 ; AVX2-NEXT:    retq
   %mask = and <16 x i8> %a0, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
   %sad = tail call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %mask, <16 x i8> zeroinitializer)
@@ -137,28 +114,24 @@ define <2 x double> @combine_psadbw_sitofp_knownbits(<16 x i8> %a0) nounwind {
   ret <2 x double> %cvt
 }
 
-; TODO: Convert from uitofp to sitofp as the PSADBW results are zero-extended.
+; Convert from uitofp to sitofp as the PSADBW results are zero-extended.
 define <2 x double> @combine_psadbw_uitofp_knownbits(<16 x i8> %a0) nounwind {
 ; X86-SSE-LABEL: combine_psadbw_uitofp_knownbits:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE-NEXT:    pxor %xmm1, %xmm1
-; X86-SSE-NEXT:    psadbw %xmm1, %xmm0
-; X86-SSE-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT:    movapd {{.*#+}} xmm1 = [0,1160773632,0,1160773632]
-; X86-SSE-NEXT:    subpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE-NEXT:    addpd %xmm1, %xmm0
+; X86-SSE-NEXT:    psadbw %xmm0, %xmm1
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; X86-SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
 ; X86-SSE-NEXT:    retl
 ;
 ; X64-SSE-LABEL: combine_psadbw_uitofp_knownbits:
 ; X64-SSE:       # %bb.0:
 ; X64-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; X64-SSE-NEXT:    pxor %xmm1, %xmm1
-; X64-SSE-NEXT:    psadbw %xmm1, %xmm0
-; X64-SSE-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE-NEXT:    movapd {{.*#+}} xmm1 = [4985484787499139072,4985484787499139072]
-; X64-SSE-NEXT:    subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE-NEXT:    addpd %xmm1, %xmm0
+; X64-SSE-NEXT:    psadbw %xmm0, %xmm1
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; X64-SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
 ; X64-SSE-NEXT:    retq
 ;
 ; AVX2-LABEL: combine_psadbw_uitofp_knownbits:
@@ -166,12 +139,8 @@ define <2 x double> @combine_psadbw_uitofp_knownbits(<16 x i8> %a0) nounwind {
 ; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX2-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT:    vmovddup {{.*#+}} xmm1 = [4985484787499139072,4985484787499139072]
-; AVX2-NEXT:    # xmm1 = mem[0,0]
-; AVX2-NEXT:    vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT:    vcvtdq2pd %xmm0, %xmm0
 ; AVX2-NEXT:    retq
   %mask = and <16 x i8> %a0, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
   %sad = tail call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %mask, <16 x i8> zeroinitializer)
diff --git a/llvm/test/CodeGen/X86/sad.ll b/llvm/test/CodeGen/X86/sad.ll
index 2a33e75a8357c6..ca319687da54d4 100644
--- a/llvm/test/CodeGen/X86/sad.ll
+++ b/llvm/test/CodeGen/X86/sad.ll
@@ -989,9 +989,7 @@ define dso_local i32 @sad_unroll_nonzero_initial(ptr %arg, ptr %arg1, ptr %arg2,
 ; SSE2-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
 ; SSE2-NEXT:    paddd %xmm2, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; SSE2-NEXT:    paddd %xmm0, %xmm1
-; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    retq
 ;
 ; AVX-LABEL: sad_unroll_nonzero_initial:
@@ -1053,9 +1051,7 @@ define dso_local i32 @sad_double_reduction(ptr %arg, ptr %arg1, ptr %arg2, ptr %
 ; SSE2-NEXT:    paddd %xmm1, %xmm2
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
 ; SSE2-NEXT:    paddd %xmm2, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    retq
 ;
 ; AVX-LABEL: sad_double_reduction:
@@ -1067,8 +1063,6 @@ define dso_local i32 @sad_double_reduction(ptr %arg, ptr %arg1, ptr %arg2, ptr %
 ; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vmovd %xmm0, %eax
 ; AVX-NEXT:    retq
 bb:
@@ -1115,9 +1109,7 @@ define dso_local i32 @sad_double_reduction_abs(ptr %arg, ptr %arg1, ptr %arg2, p
 ; SSE2-NEXT:    paddd %xmm1, %xmm2
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
 ; SSE2-NEXT:    paddd %xmm2, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    retq
 ;
 ; AVX-LABEL: sad_double_reduction_abs:
@@ -1129,8 +1121,6 @@ define dso_local i32 @sad_double_reduction_abs(ptr %arg, ptr %arg1, ptr %arg2, p
 ; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vmovd %xmm0, %eax
 ; AVX-NEXT:    retq
 bb:

phoebewang (Contributor) left a comment

Waiting for intrinsics to be lowered to ISD target nodes is causing some poor combine decisions - at the very least we need better value tracking handling.

I don't quite understand the point. Shouldn't the first combine happen after the first round of lowering?

-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; SSE2-NEXT:    paddd %xmm0, %xmm1
-; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    movd %xmm0, %eax
Contributor

Why is it affected, given the file doesn't call the intrinsic directly?

Collaborator Author

The X86PartialReduction pass creates them prior to isel.

// PSADBW - fills low 16 bits and zeros upper 48 bits of each i64 result.
assert(VT.getScalarType() == MVT::i64 && "Unexpected PSADBW types");
Known.Zero.setBitsFrom(16);
break;
Contributor

This is an obvious improvement as is, but why did you drop your implementation:
#81765 (comment)

Collaborator Author

It's just a question of what order we put these patches in - if #81765 gets completed first, I'll just update this one to match.

Contributor

Why not just write a helper with the right logic now? Then your patch for #81765 would just call the same function for X86ISD::PSADBW as well (see the sketch below).

Don't feel strongly, just seems strange.
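
Such a helper might look like this minimal sketch (hypothetical name computeKnownBitsForPSADBW; the body mirrors the KnownBits logic from the diff above):

// Hypothetical shared helper: the ISD::INTRINSIC_WO_CHAIN case in this
// patch and a future X86ISD::PSADBW case could both forward to it.
static void computeKnownBitsForPSADBW(KnownBits &Known, EVT VT) {
  // PSADBW sums the absolute differences of 8 byte pairs into the low
  // 16 bits of each i64 element and zeros the remaining upper bits.
  assert(VT.getScalarType() == MVT::i64 && "Unexpected PSADBW types");
  Known.Zero.setBitsFrom(16);
}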

Collaborator

I know it's documented as being 16 bits, but it's worst case 11 bits, I think. 255*8 fits in 11 bits.
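
Checking the arithmetic: each i64 lane sums the absolute differences of 8 byte pairs, so the maximum result is 8 * 255 = 2040, and 2040 < 2^11 = 2048, so setBitsFrom(11) would also be sound.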

RKSimon (Collaborator Author) commented Mar 4, 2024

I can merge this with the WIP patch I have for #81765 as long as everyone is happy with me adding the precedent of supporting INTRINSIC_WO_CHAIN as part of it - I tend to prefer bite-sized patches where possible, so keeping them separate made more sense in my head.

… for PSADBW intrinsics

Waiting for intrinsics to be lowered to ISD target nodes is causing some poor combine decisions - at the very least we need better value tracking handling.

As an initial example I've added support for the PSADBW intrinsics (which can be expanded along with the ISD node in llvm#81765) as this is a good example of an intrinsic that we need to handle as early as possible.
goldsteinn (Contributor)

I can merge this with the WIP patch I have for #81765 as long as everyone is happy with me adding the precedent of supporting INTRINSIC_WO_CHAIN as part of it - I tend to prefer bite-sized patches where possible, so keeping them separate made more sense in my head.

Fair enough.

I'm strongly in favor of supporting INTRINSIC_WO_CHAIN.

goldsteinn added a commit to goldsteinn/llvm-project that referenced this pull request Mar 4, 2024
Just some improvements that should hopefully strengthen analysis.

Closes llvm#83580
RKSimon (Collaborator Author) commented Mar 5, 2024

Abandoning in favor of #83830

RKSimon closed this Mar 5, 2024
RKSimon deleted the knownbits-intrinsics branch March 5, 2024 14:59
goldsteinn added a commit to goldsteinn/llvm-project that referenced this pull request Mar 5, 2024
Just some improvements that should hopefully strengthen analysis.

Closes llvm#83580
goldsteinn added a commit that referenced this pull request Mar 5, 2024
Just some improvements that should hopefully strengthen analysis.

Closes #83580