-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[X86] computeKnownBitsForTargetNode - add INTRINSIC_WO_CHAIN handling for PSADBW intrinsics #83580
Conversation
@llvm/pr-subscribers-backend-x86 Author: Simon Pilgrim (RKSimon) ChangesWaiting for intrinsics to be lowered to ISD target nodes is causing some poor combine decisions - at the very least we need better value tracking handling. As an initial example I've added support for the PSADBW intrinsics (which will be further expanded along with the ISD node in #81765) as this is a good example of an intrinsic that we need to handle as early as possible. Full diff: https://github.com/llvm/llvm-project/pull/83580.diff 3 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 0162bb65afe3b0..22e64385a3ef45 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -36980,6 +36980,18 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
}
break;
}
+ case ISD::INTRINSIC_WO_CHAIN: {
+ switch (Op->getConstantOperandVal(0)) {
+ case Intrinsic::x86_sse2_psad_bw:
+ case Intrinsic::x86_avx2_psad_bw:
+ case Intrinsic::x86_avx512_psad_bw_512:
+ // PSADBW - fills low 16 bits and zeros upper 48 bits of each i64 result.
+ assert(VT.getScalarType() == MVT::i64 && "Unexpected PSADBW types");
+ Known.Zero.setBitsFrom(16);
+ break;
+ }
+ break;
+ }
}
// Handle target shuffles.
diff --git a/llvm/test/CodeGen/X86/psadbw.ll b/llvm/test/CodeGen/X86/psadbw.ll
index 8141b22d321f4d..daabb4d9ca4083 100644
--- a/llvm/test/CodeGen/X86/psadbw.ll
+++ b/llvm/test/CodeGen/X86/psadbw.ll
@@ -54,7 +54,6 @@ define <2 x i64> @combine_psadbw_cmp_knownbits(<16 x i8> %a0) nounwind {
; X86-SSE-NEXT: pxor %xmm1, %xmm1
; X86-SSE-NEXT: psadbw %xmm0, %xmm1
; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; X86-SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT: retl
;
@@ -64,7 +63,6 @@ define <2 x i64> @combine_psadbw_cmp_knownbits(<16 x i8> %a0) nounwind {
; X64-SSE-NEXT: pxor %xmm1, %xmm1
; X64-SSE-NEXT: psadbw %xmm0, %xmm1
; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; X64-SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT: retq
;
@@ -82,28 +80,15 @@ define <2 x i64> @combine_psadbw_cmp_knownbits(<16 x i8> %a0) nounwind {
ret <2 x i64> %ext
}
-; TODO: No need to scalarize the sitofp as the PSADBW results are smaller than i32.
+; No need to scalarize the sitofp as the PSADBW results are smaller than i32.
define <2 x double> @combine_psadbw_sitofp_knownbits(<16 x i8> %a0) nounwind {
; X86-SSE-LABEL: combine_psadbw_sitofp_knownbits:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: pushl %ebp
-; X86-SSE-NEXT: movl %esp, %ebp
-; X86-SSE-NEXT: andl $-8, %esp
-; X86-SSE-NEXT: subl $32, %esp
; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT: pxor %xmm1, %xmm1
; X86-SSE-NEXT: psadbw %xmm0, %xmm1
-; X86-SSE-NEXT: movq %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; X86-SSE-NEXT: movq %xmm0, {{[0-9]+}}(%esp)
-; X86-SSE-NEXT: fildll {{[0-9]+}}(%esp)
-; X86-SSE-NEXT: fstpl {{[0-9]+}}(%esp)
-; X86-SSE-NEXT: fildll {{[0-9]+}}(%esp)
-; X86-SSE-NEXT: fstpl (%esp)
-; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
-; X86-SSE-NEXT: movl %ebp, %esp
-; X86-SSE-NEXT: popl %ebp
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; X86-SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: combine_psadbw_sitofp_knownbits:
@@ -111,14 +96,8 @@ define <2 x double> @combine_psadbw_sitofp_knownbits(<16 x i8> %a0) nounwind {
; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT: pxor %xmm1, %xmm1
; X64-SSE-NEXT: psadbw %xmm0, %xmm1
-; X64-SSE-NEXT: movd %xmm1, %eax
-; X64-SSE-NEXT: xorps %xmm0, %xmm0
-; X64-SSE-NEXT: cvtsi2sd %eax, %xmm0
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; X64-SSE-NEXT: movd %xmm1, %eax
-; X64-SSE-NEXT: xorps %xmm1, %xmm1
-; X64-SSE-NEXT: cvtsi2sd %eax, %xmm1
-; X64-SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; X64-SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; X64-SSE-NEXT: retq
;
; AVX2-LABEL: combine_psadbw_sitofp_knownbits:
@@ -126,10 +105,8 @@ define <2 x double> @combine_psadbw_sitofp_knownbits(<16 x i8> %a0) nounwind {
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vcvtdq2pd %xmm0, %xmm1
-; AVX2-NEXT: vpextrq $1, %xmm0, %rax
-; AVX2-NEXT: vcvtsi2sd %eax, %xmm2, %xmm0
-; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT: vcvtdq2pd %xmm0, %xmm0
; AVX2-NEXT: retq
%mask = and <16 x i8> %a0, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
%sad = tail call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %mask, <16 x i8> zeroinitializer)
@@ -137,28 +114,24 @@ define <2 x double> @combine_psadbw_sitofp_knownbits(<16 x i8> %a0) nounwind {
ret <2 x double> %cvt
}
-; TODO: Convert from uitofp to sitofp as the PSADBW results are zero-extended.
+; Convert from uitofp to sitofp as the PSADBW results are zero-extended.
define <2 x double> @combine_psadbw_uitofp_knownbits(<16 x i8> %a0) nounwind {
; X86-SSE-LABEL: combine_psadbw_uitofp_knownbits:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT: pxor %xmm1, %xmm1
-; X86-SSE-NEXT: psadbw %xmm1, %xmm0
-; X86-SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT: movapd {{.*#+}} xmm1 = [0,1160773632,0,1160773632]
-; X86-SSE-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE-NEXT: addpd %xmm1, %xmm0
+; X86-SSE-NEXT: psadbw %xmm0, %xmm1
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; X86-SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: combine_psadbw_uitofp_knownbits:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT: pxor %xmm1, %xmm1
-; X64-SSE-NEXT: psadbw %xmm1, %xmm0
-; X64-SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE-NEXT: movapd {{.*#+}} xmm1 = [4985484787499139072,4985484787499139072]
-; X64-SSE-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE-NEXT: addpd %xmm1, %xmm0
+; X64-SSE-NEXT: psadbw %xmm0, %xmm1
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; X64-SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; X64-SSE-NEXT: retq
;
; AVX2-LABEL: combine_psadbw_uitofp_knownbits:
@@ -166,12 +139,8 @@ define <2 x double> @combine_psadbw_uitofp_knownbits(<16 x i8> %a0) nounwind {
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = [4985484787499139072,4985484787499139072]
-; AVX2-NEXT: # xmm1 = mem[0,0]
-; AVX2-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT: vcvtdq2pd %xmm0, %xmm0
; AVX2-NEXT: retq
%mask = and <16 x i8> %a0, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
%sad = tail call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %mask, <16 x i8> zeroinitializer)
diff --git a/llvm/test/CodeGen/X86/sad.ll b/llvm/test/CodeGen/X86/sad.ll
index 2a33e75a8357c6..ca319687da54d4 100644
--- a/llvm/test/CodeGen/X86/sad.ll
+++ b/llvm/test/CodeGen/X86/sad.ll
@@ -989,9 +989,7 @@ define dso_local i32 @sad_unroll_nonzero_initial(ptr %arg, ptr %arg1, ptr %arg2,
; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: paddd %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: retq
;
; AVX-LABEL: sad_unroll_nonzero_initial:
@@ -1053,9 +1051,7 @@ define dso_local i32 @sad_double_reduction(ptr %arg, ptr %arg1, ptr %arg2, ptr %
; SSE2-NEXT: paddd %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: paddd %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: retq
;
; AVX-LABEL: sad_double_reduction:
@@ -1067,8 +1063,6 @@ define dso_local i32 @sad_double_reduction(ptr %arg, ptr %arg1, ptr %arg2, ptr %
; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
bb:
@@ -1115,9 +1109,7 @@ define dso_local i32 @sad_double_reduction_abs(ptr %arg, ptr %arg1, ptr %arg2, p
; SSE2-NEXT: paddd %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: paddd %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: retq
;
; AVX-LABEL: sad_double_reduction_abs:
@@ -1129,8 +1121,6 @@ define dso_local i32 @sad_double_reduction_abs(ptr %arg, ptr %arg1, ptr %arg2, p
; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
bb:
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Waiting for intrinsics to be lowered to ISD target nodes is causing some poor combine decisions - at the very least we need better value tracking handling.
I don't quite understand the point. Shouldn't the first combine happens after first round lowering?
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] | ||
; SSE2-NEXT: paddd %xmm0, %xmm1 | ||
; SSE2-NEXT: movd %xmm1, %eax | ||
; SSE2-NEXT: movd %xmm0, %eax |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why it affected given the file doesn't call the intrinsic directly?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The X86PartialReduction pass is creating them prior to isel
// PSADBW - fills low 16 bits and zeros upper 48 bits of each i64 result. | ||
assert(VT.getScalarType() == MVT::i64 && "Unexpected PSADBW types"); | ||
Known.Zero.setBitsFrom(16); | ||
break; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is obv improvement as is, but why did you drop your impl:
#81765 (comment)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It just a question of what order we put these patches in - if #81765 gets completed first then I'll just update this one to match
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why not just write a helper that does the right logic now, then your patch for #81765 will just be calling the function for X86ISD::PSADBW
as well?
Don't feel strongly, just seems strange.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I know its documented as being 16 bits, but its worst case 11 bits I think. 255*8 fits in 11 bits.
I can merge this with the WIP patch I have for #81765 as long as everyone is happy with me adding the precendent of supporting INTRINSIC_WO_CHAIN as part of it - I tend to to prefer bitesize patches where possible, so keeping them separate made more sense in my head. |
… for PSADBW intrinsics Waiting for intrinsics to be lowered to ISD target nodes is causing some poor combine decisions - at the very least we need better value tracking handling. As an initial example I've added support for the PSADBW intrinsics (which can be expanded along with the ISD node in llvm#81765) as this is a good example of an intrinsic that we need to handle as early as possible.
4924d91
to
aa4d679
Compare
Fair enough. Im strongly in favor of supporting |
Just some improvements that should hopefully strengthen analysis. Closes llvm#83580
Just some improvements that should hopefully strengthen analysis. Closes llvm#83580
Abandoning in favor of #83830 |
Just some improvements that should hopefully strengthen analysis. Closes llvm#83580
Just some improvements that should hopefully strengthen analysis. Closes #83580
Waiting for intrinsics to be lowered to ISD target nodes is causing some poor combine decisions - at the very least we need better value tracking handling.
As an initial example I've added support for the PSADBW intrinsics (which will be further expanded along with the ISD node in #81765) as this is a good example of an intrinsic that we need to handle as early as possible.