-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[DAG] visitCTPOP - if only the upper half of the ctpop operand is zero then see if it's profitable to only count the lower half. #80473
Conversation
@llvm/pr-subscribers-llvm-selectiondag @llvm/pr-subscribers-backend-x86 Author: Simon Pilgrim (RKSimon). Changes: Full diff: https://github.com/llvm/llvm-project/pull/80473.diff (3 files affected):
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index b17724cd07209..3d296a5ed3425 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -11142,11 +11142,29 @@ SDValue DAGCombiner::visitCTTZ_ZERO_UNDEF(SDNode *N) {
SDValue DAGCombiner::visitCTPOP(SDNode *N) { // Try to simplify CTPOP node N; yields the replacement value, or an empty SDValue if nothing applies.
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
+ unsigned NumBits = VT.getScalarSizeInBits();
SDLoc DL(N);
// fold (ctpop c1) -> c2
if (SDValue C = DAG.FoldConstantArithmetic(ISD::CTPOP, DL, VT, {N0}))
return C;
+
+ // If the upper half of the operand is known to be zero, then see if it's
+ // profitable to count only the lower half: narrow to the half-width type, ctpop there, then widen the result back to VT.
+ if (VT.isScalarInteger() && NumBits > 8 && (NumBits & 1) == 0) { // scalar integers only, wider than 8 bits, with an even bit count so it halves exactly
+ EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(), NumBits / 2);
+ if (TLI.isTruncateFree(N0, HalfVT) && TLI.isZExtFree(HalfVT, VT) && // both conversions around the narrow ctpop must be reported as free
+ TLI.isTypeDesirableForOp(ISD::CTPOP, HalfVT) &&
+ hasOperation(ISD::CTPOP, HalfVT)) { // and CTPOP must be available on the half-width type
+ APInt UpperBits = APInt::getHighBitsSet(NumBits, NumBits / 2); // mask selecting the top NumBits / 2 bits
+ if (DAG.MaskedValueIsZero(N0, UpperBits)) {
+ SDValue PopCnt = DAG.getNode(ISD::CTPOP, DL, HalfVT,
+ DAG.getZExtOrTrunc(N0, DL, HalfVT)); // HalfVT is narrower than VT, so the operand is narrowed here
+ return DAG.getZExtOrTrunc(PopCnt, DL, VT); // bring the half-width count back to the original result type
+ }
+ }
+ }
+
return SDValue(); // neither fold applied
}
diff --git a/llvm/test/CodeGen/AMDGPU/ctpop64.ll b/llvm/test/CodeGen/AMDGPU/ctpop64.ll
index 1346678e51e3d..3b9c3e3ba1752 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop64.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop64.ll
@@ -452,12 +452,11 @@ define amdgpu_kernel void @s_ctpop_i65(ptr addrspace(1) noalias %out, i65 %val)
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s0, s4
+; SI-NEXT: s_and_b32 s4, s8, 0xff
; SI-NEXT: s_mov_b32 s1, s5
-; SI-NEXT: s_and_b32 s4, s8, 1
-; SI-NEXT: s_mov_b32 s5, 0
-; SI-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
-; SI-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
-; SI-NEXT: s_add_i32 s4, s6, s4
+; SI-NEXT: s_bcnt1_i32_b32 s4, s4
+; SI-NEXT: s_bcnt1_i32_b64 s5, s[6:7]
+; SI-NEXT: s_add_i32 s4, s5, s4
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
@@ -470,12 +469,11 @@ define amdgpu_kernel void @s_ctpop_i65(ptr addrspace(1) noalias %out, i65 %val)
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_and_b32 s4, s8, 0xff
; VI-NEXT: s_mov_b32 s1, s5
-; VI-NEXT: s_and_b32 s4, s8, 1
-; VI-NEXT: s_mov_b32 s5, 0
-; VI-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
-; VI-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
-; VI-NEXT: s_add_i32 s4, s6, s4
+; VI-NEXT: s_bcnt1_i32_b32 s4, s4
+; VI-NEXT: s_bcnt1_i32_b64 s5, s[6:7]
+; VI-NEXT: s_add_i32 s4, s5, s4
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/X86/ctpop-mask.ll b/llvm/test/CodeGen/X86/ctpop-mask.ll
index abbcf22f77e43..e0a96a9f98879 100644
--- a/llvm/test/CodeGen/X86/ctpop-mask.ll
+++ b/llvm/test/CodeGen/X86/ctpop-mask.ll
@@ -25,7 +25,7 @@ define i64 @ctpop_mask2(i64 %x) nounwind readnone {
; X64-POPCOUNT-LABEL: ctpop_mask2:
; X64-POPCOUNT: # %bb.0:
; X64-POPCOUNT-NEXT: andl $3, %edi
-; X64-POPCOUNT-NEXT: popcntq %rdi, %rax
+; X64-POPCOUNT-NEXT: popcntl %edi, %eax
; X64-POPCOUNT-NEXT: retq
;
; X86-NO-POPCOUNT-LABEL: ctpop_mask2:
@@ -189,7 +189,7 @@ define i64 @ctpop_mask4(i64 %x) nounwind readnone {
; X64-POPCOUNT-LABEL: ctpop_mask4:
; X64-POPCOUNT: # %bb.0:
; X64-POPCOUNT-NEXT: andl $15, %edi
-; X64-POPCOUNT-NEXT: popcntq %rdi, %rax
+; X64-POPCOUNT-NEXT: popcntl %edi, %eax
; X64-POPCOUNT-NEXT: retq
;
; X86-NO-POPCOUNT-LABEL: ctpop_mask4:
@@ -271,7 +271,7 @@ define i64 @ctpop_mask5(i64 %x) nounwind readnone {
; X64-POPCOUNT-LABEL: ctpop_mask5:
; X64-POPCOUNT: # %bb.0:
; X64-POPCOUNT-NEXT: andl $31, %edi
-; X64-POPCOUNT-NEXT: popcntq %rdi, %rax
+; X64-POPCOUNT-NEXT: popcntl %edi, %eax
; X64-POPCOUNT-NEXT: retq
;
; X86-NO-POPCOUNT-LABEL: ctpop_mask5:
@@ -392,7 +392,7 @@ define i64 @ctpop_shifted_mask6(i64 %x) nounwind readnone {
; X64-POPCOUNT-LABEL: ctpop_shifted_mask6:
; X64-POPCOUNT: # %bb.0:
; X64-POPCOUNT-NEXT: andl $26112, %edi # imm = 0x6600
-; X64-POPCOUNT-NEXT: popcntq %rdi, %rax
+; X64-POPCOUNT-NEXT: popcntl %edi, %eax
; X64-POPCOUNT-NEXT: retq
;
; X86-NO-POPCOUNT-LABEL: ctpop_shifted_mask6:
@@ -556,7 +556,7 @@ define i64 @ctpop_shifted_mask8(i64 %x) nounwind readnone {
; X64-POPCOUNT-LABEL: ctpop_shifted_mask8:
; X64-POPCOUNT: # %bb.0:
; X64-POPCOUNT-NEXT: andl $65280, %edi # imm = 0xFF00
-; X64-POPCOUNT-NEXT: popcntq %rdi, %rax
+; X64-POPCOUNT-NEXT: popcntl %edi, %eax
; X64-POPCOUNT-NEXT: retq
;
; X86-NO-POPCOUNT-LABEL: ctpop_shifted_mask8:
|
Do we still need 275729a with this change? |
Do you mean that, with 3a75807, we still need to customize i16/i32/i64, so it doesn't matter? |
…o then see if it's profitable to only count the lower half.
No description provided.