-
Notifications
You must be signed in to change notification settings - Fork 11.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SDAG] Simplify is-power-of-2 codegen #72275
Conversation
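@llvm/pr-subscribers-backend-aarch64 @llvm/pr-subscribers-llvm-selectiondag

Author: Tavian Barnes (tavianator)

Changes

When x is not known to be nonzero, ctpop(x) == 1 is expanded to

    x != 0 && (x & (x - 1)) == 0

resulting in codegen like

    leal -1(%rdi), %eax
    testl %eax, %edi
    sete %cl
    testl %edi, %edi
    setne %al
    andb %cl, %al

But another expression that works is

    (x ^ (x - 1)) > x - 1

which has nicer codegen:

    leal -1(%rdi), %eax
    xorl %eax, %edi
    cmpl %eax, %edi
    seta %al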
Patch is 104.71 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/72275.diff 6 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index ed352c86eca06e5..c47e2ad418a0e8e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -4237,9 +4237,7 @@ static SDValue simplifySetCCWithCTPOP(const TargetLowering &TLI, EVT VT,
return DAG.getSetCC(dl, VT, Result, DAG.getConstant(0, dl, CTVT), CC);
}
- // Expand a power-of-2 comparison based on ctpop:
- // (ctpop x) == 1 --> (x != 0) && ((x & x-1) == 0)
- // (ctpop x) != 1 --> (x == 0) || ((x & x-1) != 0)
+ // Expand a power-of-2 comparison based on ctpop
if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && C1 == 1) {
// Keep the CTPOP if it is cheap.
if (TLI.isCtpopFast(CTVT))
@@ -4248,17 +4246,22 @@ static SDValue simplifySetCCWithCTPOP(const TargetLowering &TLI, EVT VT,
SDValue Zero = DAG.getConstant(0, dl, CTVT);
SDValue NegOne = DAG.getAllOnesConstant(dl, CTVT);
assert(CTVT.isInteger());
- ISD::CondCode InvCond = ISD::getSetCCInverse(Cond, CTVT);
SDValue Add = DAG.getNode(ISD::ADD, dl, CTVT, CTOp, NegOne);
- SDValue And = DAG.getNode(ISD::AND, dl, CTVT, CTOp, Add);
- SDValue RHS = DAG.getSetCC(dl, VT, And, Zero, Cond);
// Its not uncommon for known-never-zero X to exist in (ctpop X) eq/ne 1, so
- // check before the emit a potentially unnecessary op.
- if (DAG.isKnownNeverZero(CTOp))
+ // check before emitting a potentially unnecessary op.
+ if (DAG.isKnownNeverZero(CTOp)) {
+ // (ctpop x) == 1 --> (x & x-1) == 0
+ // (ctpop x) != 1 --> (x & x-1) != 0
+ SDValue And = DAG.getNode(ISD::AND, dl, CTVT, CTOp, Add);
+ SDValue RHS = DAG.getSetCC(dl, VT, And, Zero, Cond);
return RHS;
- SDValue LHS = DAG.getSetCC(dl, VT, CTOp, Zero, InvCond);
- unsigned LogicOpcode = Cond == ISD::SETEQ ? ISD::AND : ISD::OR;
- return DAG.getNode(LogicOpcode, dl, VT, LHS, RHS);
+ } else {
+ // (ctpop x) == 1 --> (x ^ x-1) > x-1
+ // (ctpop x) != 1 --> (x ^ x-1) <= x-1
+ SDValue Xor = DAG.getNode(ISD::XOR, dl, CTVT, CTOp, Add);
+ ISD::CondCode CmpCond = Cond == ISD::SETEQ ? ISD::SETUGT : ISD::SETULE;
+ return DAG.getSetCC(dl, VT, Xor, Add, CmpCond);
+ }
}
return SDValue();
diff --git a/llvm/test/CodeGen/X86/ctpop-combine.ll b/llvm/test/CodeGen/X86/ctpop-combine.ll
index a33319e66d5f111..fba44218e05726b 100644
--- a/llvm/test/CodeGen/X86/ctpop-combine.ll
+++ b/llvm/test/CodeGen/X86/ctpop-combine.ll
@@ -120,13 +120,11 @@ define i32 @ctpop_eq_one(i64 %x) nounwind readnone {
;
; NO-POPCOUNT-LABEL: ctpop_eq_one:
; NO-POPCOUNT: # %bb.0:
-; NO-POPCOUNT-NEXT: leaq -1(%rdi), %rax
-; NO-POPCOUNT-NEXT: testq %rax, %rdi
-; NO-POPCOUNT-NEXT: sete %al
-; NO-POPCOUNT-NEXT: testq %rdi, %rdi
-; NO-POPCOUNT-NEXT: setne %cl
-; NO-POPCOUNT-NEXT: andb %al, %cl
-; NO-POPCOUNT-NEXT: movzbl %cl, %eax
+; NO-POPCOUNT-NEXT: leaq -1(%rdi), %rcx
+; NO-POPCOUNT-NEXT: xorq %rcx, %rdi
+; NO-POPCOUNT-NEXT: xorl %eax, %eax
+; NO-POPCOUNT-NEXT: cmpq %rcx, %rdi
+; NO-POPCOUNT-NEXT: seta %al
; NO-POPCOUNT-NEXT: retq
%count = tail call i64 @llvm.ctpop.i64(i64 %x)
%cmp = icmp eq i64 %count, 1
@@ -145,13 +143,11 @@ define i32 @ctpop_ne_one(i64 %x) nounwind readnone {
;
; NO-POPCOUNT-LABEL: ctpop_ne_one:
; NO-POPCOUNT: # %bb.0:
-; NO-POPCOUNT-NEXT: leaq -1(%rdi), %rax
-; NO-POPCOUNT-NEXT: testq %rax, %rdi
-; NO-POPCOUNT-NEXT: setne %al
-; NO-POPCOUNT-NEXT: testq %rdi, %rdi
-; NO-POPCOUNT-NEXT: sete %cl
-; NO-POPCOUNT-NEXT: orb %al, %cl
-; NO-POPCOUNT-NEXT: movzbl %cl, %eax
+; NO-POPCOUNT-NEXT: leaq -1(%rdi), %rcx
+; NO-POPCOUNT-NEXT: xorq %rcx, %rdi
+; NO-POPCOUNT-NEXT: xorl %eax, %eax
+; NO-POPCOUNT-NEXT: cmpq %rcx, %rdi
+; NO-POPCOUNT-NEXT: setbe %al
; NO-POPCOUNT-NEXT: retq
%count = tail call i64 @llvm.ctpop.i64(i64 %x)
%cmp = icmp ne i64 %count, 1
@@ -162,29 +158,26 @@ define i32 @ctpop_ne_one(i64 %x) nounwind readnone {
define i1 @ctpop_trunc_non_power2(i255 %x) nounwind {
; CHECK-LABEL: ctpop_trunc_non_power2:
; CHECK: # %bb.0:
-; CHECK-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
-; CHECK-NEXT: movq %rcx, %r8
-; CHECK-NEXT: andq %rax, %r8
-; CHECK-NEXT: movq %rdi, %r9
-; CHECK-NEXT: addq $-1, %r9
-; CHECK-NEXT: movq %rsi, %r10
-; CHECK-NEXT: adcq $-1, %r10
-; CHECK-NEXT: movq %rdx, %r11
-; CHECK-NEXT: adcq $-1, %r11
-; CHECK-NEXT: adcq %rax, %rcx
-; CHECK-NEXT: andq %rdi, %r9
-; CHECK-NEXT: andq %rdx, %r11
-; CHECK-NEXT: orq %r9, %r11
-; CHECK-NEXT: andq %r8, %rcx
-; CHECK-NEXT: andq %rsi, %r10
-; CHECK-NEXT: orq %rcx, %r10
-; CHECK-NEXT: orq %r11, %r10
-; CHECK-NEXT: sete %cl
-; CHECK-NEXT: orq %rdx, %rdi
-; CHECK-NEXT: orq %rsi, %r8
-; CHECK-NEXT: orq %rdi, %r8
-; CHECK-NEXT: setne %al
-; CHECK-NEXT: andb %cl, %al
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: addq $-1, %rax
+; CHECK-NEXT: movq %rsi, %r8
+; CHECK-NEXT: adcq $-1, %r8
+; CHECK-NEXT: movq %rdx, %r9
+; CHECK-NEXT: adcq $-1, %r9
+; CHECK-NEXT: movabsq $9223372036854775807, %r10 # imm = 0x7FFFFFFFFFFFFFFF
+; CHECK-NEXT: movq %rcx, %r11
+; CHECK-NEXT: adcq %r10, %r11
+; CHECK-NEXT: xorq %r11, %rcx
+; CHECK-NEXT: andq %r10, %r11
+; CHECK-NEXT: andq %r10, %rcx
+; CHECK-NEXT: xorq %r9, %rdx
+; CHECK-NEXT: xorq %r8, %rsi
+; CHECK-NEXT: xorq %rax, %rdi
+; CHECK-NEXT: cmpq %rdi, %rax
+; CHECK-NEXT: sbbq %rsi, %r8
+; CHECK-NEXT: sbbq %rdx, %r9
+; CHECK-NEXT: sbbq %rcx, %r11
+; CHECK-NEXT: setb %al
; CHECK-NEXT: retq
%a = call i255 @llvm.ctpop.i255(i255 %x)
%b = trunc i255 %a to i8 ; largest value from ctpop is 255, fits in 8 bits.
diff --git a/llvm/test/CodeGen/X86/ispow2.ll b/llvm/test/CodeGen/X86/ispow2.ll
index 4051e4d7f5b5dc0..8723432de8b6b00 100644
--- a/llvm/test/CodeGen/X86/ispow2.ll
+++ b/llvm/test/CodeGen/X86/ispow2.ll
@@ -28,25 +28,14 @@ define i1 @is_pow2_non_zero(i32 %xin) {
}
define i1 @is_pow2_non_zero_x_maybe_z(i32 %x) {
-; CHECK-NOBMI-LABEL: is_pow2_non_zero_x_maybe_z:
-; CHECK-NOBMI: # %bb.0:
-; CHECK-NOBMI-NEXT: # kill: def $edi killed $edi def $rdi
-; CHECK-NOBMI-NEXT: leal -1(%rdi), %eax
-; CHECK-NOBMI-NEXT: testl %eax, %edi
-; CHECK-NOBMI-NEXT: sete %cl
-; CHECK-NOBMI-NEXT: testl %edi, %edi
-; CHECK-NOBMI-NEXT: setne %al
-; CHECK-NOBMI-NEXT: andb %cl, %al
-; CHECK-NOBMI-NEXT: retq
-;
-; CHECK-BMI2-LABEL: is_pow2_non_zero_x_maybe_z:
-; CHECK-BMI2: # %bb.0:
-; CHECK-BMI2-NEXT: testl %edi, %edi
-; CHECK-BMI2-NEXT: setne %cl
-; CHECK-BMI2-NEXT: blsrl %edi, %eax
-; CHECK-BMI2-NEXT: sete %al
-; CHECK-BMI2-NEXT: andb %cl, %al
-; CHECK-BMI2-NEXT: retq
+; CHECK-LABEL: is_pow2_non_zero_x_maybe_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
+; CHECK-NEXT: leal -1(%rdi), %eax
+; CHECK-NEXT: xorl %eax, %edi
+; CHECK-NEXT: cmpl %eax, %edi
+; CHECK-NEXT: seta %al
+; CHECK-NEXT: retq
%cnt = call i32 @llvm.ctpop.i32(i32 %x)
%r = icmp eq i32 %cnt, 1
ret i1 %r
@@ -180,44 +169,40 @@ define <4 x i1> @neither_pow2_non_zero_4xv64(<4 x i64> %xin) {
define <4 x i1> @neither_pow2_non_zero_4xv64_x_maybe_z(<4 x i64> %x) {
; CHECK-NOBMI-LABEL: neither_pow2_non_zero_4xv64_x_maybe_z:
; CHECK-NOBMI: # %bb.0:
-; CHECK-NOBMI-NEXT: pxor %xmm2, %xmm2
-; CHECK-NOBMI-NEXT: pcmpeqd %xmm3, %xmm3
-; CHECK-NOBMI-NEXT: movdqa %xmm1, %xmm4
-; CHECK-NOBMI-NEXT: paddq %xmm3, %xmm4
-; CHECK-NOBMI-NEXT: pand %xmm1, %xmm4
-; CHECK-NOBMI-NEXT: pcmpeqd %xmm2, %xmm1
-; CHECK-NOBMI-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,0,3,2]
-; CHECK-NOBMI-NEXT: pand %xmm1, %xmm5
-; CHECK-NOBMI-NEXT: pcmpeqd %xmm2, %xmm4
-; CHECK-NOBMI-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,0,3,2]
-; CHECK-NOBMI-NEXT: pand %xmm4, %xmm1
+; CHECK-NOBMI-NEXT: pcmpeqd %xmm2, %xmm2
+; CHECK-NOBMI-NEXT: movdqa %xmm1, %xmm3
+; CHECK-NOBMI-NEXT: paddq %xmm2, %xmm3
+; CHECK-NOBMI-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
+; CHECK-NOBMI-NEXT: pxor %xmm4, %xmm3
; CHECK-NOBMI-NEXT: pxor %xmm3, %xmm1
-; CHECK-NOBMI-NEXT: por %xmm5, %xmm1
-; CHECK-NOBMI-NEXT: movdqa %xmm0, %xmm4
-; CHECK-NOBMI-NEXT: pcmpeqd %xmm2, %xmm4
-; CHECK-NOBMI-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,0,3,2]
-; CHECK-NOBMI-NEXT: pand %xmm4, %xmm5
+; CHECK-NOBMI-NEXT: movdqa %xmm1, %xmm5
+; CHECK-NOBMI-NEXT: pcmpgtd %xmm3, %xmm5
+; CHECK-NOBMI-NEXT: movdqa %xmm0, %xmm6
+; CHECK-NOBMI-NEXT: paddq %xmm2, %xmm6
+; CHECK-NOBMI-NEXT: pxor %xmm4, %xmm6
+; CHECK-NOBMI-NEXT: pxor %xmm6, %xmm0
; CHECK-NOBMI-NEXT: movdqa %xmm0, %xmm4
-; CHECK-NOBMI-NEXT: paddq %xmm3, %xmm4
-; CHECK-NOBMI-NEXT: pand %xmm4, %xmm0
-; CHECK-NOBMI-NEXT: pcmpeqd %xmm2, %xmm0
-; CHECK-NOBMI-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,0,3,2]
-; CHECK-NOBMI-NEXT: pand %xmm2, %xmm0
-; CHECK-NOBMI-NEXT: pxor %xmm3, %xmm0
-; CHECK-NOBMI-NEXT: por %xmm5, %xmm0
-; CHECK-NOBMI-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; CHECK-NOBMI-NEXT: pcmpgtd %xmm6, %xmm4
+; CHECK-NOBMI-NEXT: movdqa %xmm4, %xmm7
+; CHECK-NOBMI-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm5[0,2]
+; CHECK-NOBMI-NEXT: pcmpeqd %xmm3, %xmm1
+; CHECK-NOBMI-NEXT: pcmpeqd %xmm6, %xmm0
+; CHECK-NOBMI-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; CHECK-NOBMI-NEXT: andps %xmm7, %xmm0
+; CHECK-NOBMI-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm5[1,3]
+; CHECK-NOBMI-NEXT: orps %xmm4, %xmm0
+; CHECK-NOBMI-NEXT: xorps %xmm2, %xmm0
; CHECK-NOBMI-NEXT: retq
;
; CHECK-AVX2-LABEL: neither_pow2_non_zero_4xv64_x_maybe_z:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm2
-; CHECK-AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
-; CHECK-AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm4
-; CHECK-AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
-; CHECK-AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
-; CHECK-AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm0
-; CHECK-AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
+; CHECK-AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; CHECK-AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm2
+; CHECK-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; CHECK-AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2
+; CHECK-AVX2-NEXT: vpxor %ymm0, %ymm2, %ymm0
+; CHECK-AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm0
+; CHECK-AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; CHECK-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vzeroupper
@@ -235,5 +220,3 @@ define <4 x i1> @neither_pow2_non_zero_4xv64_x_maybe_z(<4 x i64> %x) {
%r = icmp ne <4 x i64> %cnt, <i64 1, i64 1, i64 1, i64 1>
ret <4 x i1> %r
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/X86/vector-popcnt-128.ll b/llvm/test/CodeGen/X86/vector-popcnt-128.ll
index 61f0885c55be438..58cacfb0485ec6a 100644
--- a/llvm/test/CodeGen/X86/vector-popcnt-128.ll
+++ b/llvm/test/CodeGen/X86/vector-popcnt-128.ll
@@ -720,85 +720,37 @@ define <16 x i8> @foldv16i8() nounwind {
}
define <2 x i64> @eq_1_v2i64(<2 x i64> %0) {
-; SSE2-LABEL: eq_1_v2i64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE2-NEXT: paddq %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2]
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE3-LABEL: eq_1_v2i64:
-; SSE3: # %bb.0:
-; SSE3-NEXT: pxor %xmm1, %xmm1
-; SSE3-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE3-NEXT: paddq %xmm0, %xmm2
-; SSE3-NEXT: pand %xmm0, %xmm2
-; SSE3-NEXT: movdqa %xmm0, %xmm3
-; SSE3-NEXT: pcmpeqd %xmm1, %xmm3
-; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2]
-; SSE3-NEXT: pand %xmm3, %xmm0
-; SSE3-NEXT: pcmpeqd %xmm1, %xmm2
-; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
-; SSE3-NEXT: pand %xmm2, %xmm1
-; SSE3-NEXT: pandn %xmm1, %xmm0
-; SSE3-NEXT: retq
-;
-; SSSE3-LABEL: eq_1_v2i64:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: pxor %xmm1, %xmm1
-; SSSE3-NEXT: pcmpeqd %xmm2, %xmm2
-; SSSE3-NEXT: paddq %xmm0, %xmm2
-; SSSE3-NEXT: pand %xmm0, %xmm2
-; SSSE3-NEXT: movdqa %xmm0, %xmm3
-; SSSE3-NEXT: pcmpeqd %xmm1, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2]
-; SSSE3-NEXT: pand %xmm3, %xmm0
-; SSSE3-NEXT: pcmpeqd %xmm1, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
-; SSSE3-NEXT: pand %xmm2, %xmm1
-; SSSE3-NEXT: pandn %xmm1, %xmm0
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: eq_1_v2i64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE41-NEXT: paddq %xmm0, %xmm2
-; SSE41-NEXT: pand %xmm0, %xmm2
-; SSE41-NEXT: pcmpeqq %xmm1, %xmm0
-; SSE41-NEXT: pcmpeqq %xmm1, %xmm2
-; SSE41-NEXT: pandn %xmm2, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: eq_1_v2i64:
+; SSE: # %bb.0:
+; SSE-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE-NEXT: paddq %xmm0, %xmm1
+; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE-NEXT: pxor %xmm1, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE-NEXT: pand %xmm3, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE-NEXT: por %xmm1, %xmm0
+; SSE-NEXT: retq
;
; AVX1OR2-LABEL: eq_1_v2i64:
; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1OR2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm2
-; AVX1OR2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1OR2-NEXT: vpaddq %xmm3, %xmm0, %xmm3
-; AVX1OR2-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX1OR2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
-; AVX1OR2-NEXT: vpandn %xmm0, %xmm2, %xmm0
+; AVX1OR2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1OR2-NEXT: vpaddq %xmm1, %xmm0, %xmm1
+; AVX1OR2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1OR2-NEXT: vpxor %xmm0, %xmm1, %xmm0
+; AVX1OR2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1OR2-NEXT: retq
;
; XOP-LABEL: eq_1_v2i64:
; XOP: # %bb.0:
-; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; XOP-NEXT: vpcomneqq %xmm1, %xmm0, %xmm2
-; XOP-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; XOP-NEXT: vpaddq %xmm3, %xmm0, %xmm3
-; XOP-NEXT: vpand %xmm3, %xmm0, %xmm0
-; XOP-NEXT: vpcomeqq %xmm1, %xmm0, %xmm0
-; XOP-NEXT: vpand %xmm0, %xmm2, %xmm0
+; XOP-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm1
+; XOP-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; XOP-NEXT: vpcomgtuq %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: eq_1_v2i64:
@@ -818,24 +770,24 @@ define <2 x i64> @eq_1_v2i64(<2 x i64> %0) {
;
; BITALG_NOVLX-LABEL: eq_1_v2i64:
; BITALG_NOVLX: # %bb.0:
-; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; BITALG_NOVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm2
-; BITALG_NOVLX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; BITALG_NOVLX-NEXT: vpaddq %xmm3, %xmm0, %xmm3
-; BITALG_NOVLX-NEXT: vpand %xmm3, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; BITALG_NOVLX-NEXT: vpaddq %xmm1, %xmm0, %xmm1
+; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpminuq %zmm1, %zmm0, %zmm1
; BITALG_NOVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vpandn %xmm0, %xmm2, %xmm0
+; BITALG_NOVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
+; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: eq_1_v2i64:
; BITALG: # %bb.0:
-; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; BITALG-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm2
-; BITALG-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; BITALG-NEXT: vpaddq %xmm3, %xmm0, %xmm3
-; BITALG-NEXT: vpand %xmm3, %xmm0, %xmm0
+; BITALG-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; BITALG-NEXT: vpaddq %xmm1, %xmm0, %xmm1
+; BITALG-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; BITALG-NEXT: vpminuq %xmm1, %xmm0, %xmm1
; BITALG-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpandn %xmm0, %xmm2, %xmm0
+; BITALG-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
; BITALG-NEXT: retq
%2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
%3 = icmp eq <2 x i64> %2, <i64 1, i64 1>
@@ -844,95 +796,40 @@ define <2 x i64> @eq_1_v2i64(<2 x i64> %0) {
}
define <2 x i64> @ne_1_v2i64(<2 x i64> %0) {
-; SSE2-LABEL: ne_1_v2i64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
-; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: paddq %xmm2, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE3-LABEL: ne_1_v2i64:
-; SSE3: # %bb.0:
-; SSE3-NEXT: pxor %xmm1, %xmm1
-; SSE3-NEXT: movdqa %xmm0, %xmm2
-; SSE3-NEXT: pcmpeqd %xmm1, %xmm2
-; SSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
-; SSE3-NEXT: pand %xmm2, %xmm3
-; SSE3-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE3-NEXT: movdqa %xmm0, %xmm4
-; SSE3-NEXT: paddq %xmm2, %xmm4
-; SSE3-NEXT: pand %xmm4, %xmm0
-; SSE3-NEXT: pcmpeqd %xmm1, %xmm0
-; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
-; SSE3-NEXT: pand %xmm1, %xmm0
-; SSE3-NEXT: pxor %xmm2, %xmm0
-; SSE3-NEXT: por %xmm3, %xmm0
-; SSE3-NEXT: retq
-;
-; SSSE3-LABEL: ne_1_v2i64:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: pxor %xmm1, %xmm1
-; SSSE3-NEXT: movdqa %xmm0, %xmm2
-; SSSE3-NEXT: pcmpeqd %xmm1, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
-; SSSE3-NEXT: pand %xmm2, %xmm3
-; SSSE3-NEXT: pcmpeqd %xmm2, %xmm2
-; SSSE3-NEXT: movdqa %xmm0, %xmm4
-; SSSE3-NEXT: paddq %xmm2, %xmm4
-; SSSE3-NEXT: pand %xmm4, %xmm0
-; SSSE3-NEXT: pcmpeqd %xmm1, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
-; SSSE3-NEXT: pand %xmm1, %xmm0
-; SSSE3-NEXT: pxor %xmm2, %xmm0
-; SSSE3-NEXT: por %xmm3, %xmm0
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: ne_1_v2i64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: pcmpeqq %xmm1, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: paddq %xmm3, %xmm4
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: pcmpeqq %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: por %xmm2, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: ne_1_v2i64:
+; SSE: # %bb.0:
+; SSE-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: paddq %xmm1, %xmm2
+; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE-NEXT: pxor %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: pcmpgtd %xmm2, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE-NEXT: pcmpeqd %xmm2, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE-NEXT: pand %xmm4, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
+; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm0
+; SSE-NEXT: retq
;
; AVX1OR2-LABEL: ne_1_v2i64:
; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1OR2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm2
-; AVX1OR2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1OR2-NEXT: vpaddq %xmm3, %xmm0, %xmm4
-; AVX1OR2-NEXT: vpand %xmm4, %xmm0, %xmm0
-; AVX1OR2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
-; AVX1OR2-NEXT: vpxor %xmm3, %xmm0, %xmm0
-; AVX1OR2-NEXT: vpor %xmm0, %xmm2, %xmm0
+; A...
[truncated]
|
365a4af
to
0a74138
Compare
Do you have an alive2 proof please? |
https://alive2.llvm.org/ce/z/kttm55
|
When x is not known to be nonzero, ctpop(x) == 1 is expanded to

    x != 0 && (x & (x - 1)) == 0

resulting in codegen like

    leal -1(%rdi), %eax
    testl %eax, %edi
    sete %cl
    testl %edi, %edi
    setne %al
    andb %cl, %al

But another expression that works is

    (x ^ (x - 1)) > x - 1

which has nicer codegen:

    leal -1(%rdi), %eax
    xorl %eax, %edi
    cmpl %eax, %edi
    seta %al
0a74138
to
2bbcec6
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could also try implementing this in GlobalISel
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM - cheers
Thanks for the reviews! It seems like most people merge their own PRs here, but I don't have write access so I'll need someone to do that for me. |
When x is not known to be nonzero, ctpop(x) == 1 is expanded to

    x != 0 && (x & (x - 1)) == 0

resulting in codegen like

    leal -1(%rdi), %eax
    testl %eax, %edi
    sete %cl
    testl %edi, %edi
    setne %al
    andb %cl, %al

But another expression that works is

    (x ^ (x - 1)) > x - 1

which has nicer codegen:

    leal -1(%rdi), %eax
    xorl %eax, %edi
    cmpl %eax, %edi
    seta %al
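When x is not known to be nonzero, ctpop(x) == 1 is expanded to

    x != 0 && (x & (x - 1)) == 0

resulting in codegen like

    leal -1(%rdi), %eax
    testl %eax, %edi
    sete %cl
    testl %edi, %edi
    setne %al
    andb %cl, %al

But another expression that works is

    (x ^ (x - 1)) > x - 1

which has nicer codegen:

    leal -1(%rdi), %eax
    xorl %eax, %edi
    cmpl %eax, %edi
    seta %al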