-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[DAG] Generalize setcc(setcc) fold to use known bits. #66503
Conversation
@llvm/pr-subscribers-backend-webassembly @llvm/pr-subscribers-backend-aarch64 ChangesIf we have a `SETCC (SETCC), 0, NE` and ZeroOrOneBooleanContent, we can remove the outer setcc as it will produce the same value as the inner. This can be generalized to anything where the top bits are known to be 0, as the value will remain as 1 or 0. -- Full diff: https://github.com//pull/66503.diff6 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index bd1940994a87f0f..a544cfbe833ddff 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -4652,21 +4652,25 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, DAG.getConstant(C1 & Imm, dl, ExtDstTy), Cond); } else if ((N1C->isZero() || N1C->isOne()) && (Cond == ISD::SETEQ || Cond == ISD::SETNE)) { - // SETCC (SETCC), [0|1], [EQ|NE] -> SETCC - if (N0.getOpcode() == ISD::SETCC && + // SETCC (X), [0|1], [EQ|NE] -> X if X is known 0/1 + if ((N0.getOpcode() == ISD::SETCC || VT != MVT::i1) && isTypeLegal(VT) && VT.bitsLE(N0.getValueType()) && (N0.getValueType() == MVT::i1 || - getBooleanContents(N0.getOperand(0).getValueType()) == - ZeroOrOneBooleanContent)) { + getBooleanContents(N0.getValueType()) == ZeroOrOneBooleanContent) && + DAG.MaskedValueIsZero( + N0, APInt::getHighBitsSet(N0.getValueSizeInBits(), + N0.getValueSizeInBits() - 1))) { bool TrueWhenTrue = (Cond == ISD::SETEQ) ^ (!N1C->isOne()); if (TrueWhenTrue) return DAG.getNode(ISD::TRUNCATE, dl, VT, N0); // Invert the condition. - ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get(); - CC = ISD::getSetCCInverse(CC, N0.getOperand(0).getValueType()); - if (DCI.isBeforeLegalizeOps() || - isCondCodeLegal(CC, N0.getOperand(0).getSimpleValueType())) - return DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC); + if (N0.getOpcode() == ISD::SETCC) { + ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get(); + CC = ISD::getSetCCInverse(CC, N0.getOperand(0).getValueType()); + if (DCI.isBeforeLegalizeOps() || + isCondCodeLegal(CC, N0.getOperand(0).getSimpleValueType())) + return DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC); + } } if ((N0.getOpcode() == ISD::XOR || diff --git a/llvm/test/CodeGen/AArch64/setcc_knownbits.ll b/llvm/test/CodeGen/AArch64/setcc_knownbits.ll new file mode 100644 index 000000000000000..9e9c814be0266ef --- /dev/null +++ b/llvm/test/CodeGen/AArch64/setcc_knownbits.ll @@ -0,0 +1,81 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc < %s -mtriple=aarch64-none-eabi | FileCheck %s + +define i1 @load_bv_v4i8(i1 zeroext %a) { +; CHECK-LABEL: load_bv_v4i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ret + %b = zext i1 %a to i32 + %c = icmp eq i32 %b, 1 + ret i1 %c +} + +define noundef i1 @logger(i32 noundef %logLevel, ptr %ea, ptr %pll) { +; CHECK-LABEL: logger: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr w8, [x2] +; CHECK-NEXT: cmp w8, w0 +; CHECK-NEXT: b.ls .LBB1_2 +; CHECK-NEXT: // %bb.1: +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB1_2: // %land.rhs +; CHECK-NEXT: ldr x8, [x1] +; CHECK-NEXT: ldrb w8, [x8] +; CHECK-NEXT: cmp w8, #0 +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret +entry: + %0 = load i32, ptr %pll, align 4 + %cmp.not = icmp ugt i32 %0, %logLevel + br i1 %cmp.not, label %land.end, label %land.rhs + +land.rhs: ; preds = %entry + %1 = load ptr, ptr %ea, align 8 + %2 = load i8, ptr %1, align 1, !range !14, !noundef !15 + %tobool.i = icmp ne i8 %2, 0 + br label %land.end + +land.end: ; preds = %land.rhs, %entry + %3 = phi i1 [ false, %entry ], [ %tobool.i, %land.rhs ] + ret i1 %3 +} +!14 = !{i8 0, i8 2} +!15 = !{} + + +declare i64 @llvm.ctlz.i64(i64 %in, i1) +define i1 @lshr_ctlz_undef_cmpeq_one_i64(i64 %in) { +; CHECK-LABEL: lshr_ctlz_undef_cmpeq_one_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: clz x8, x0 +; CHECK-NEXT: lsr x0, x8, #6 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret + %ctlz = call i64 @llvm.ctlz.i64(i64 %in, i1 -1) + %lshr = lshr i64 %ctlz, 6 + %icmp = icmp eq i64 %lshr, 1 + ret i1 %icmp +} + +define i32 @PR17487(i1 %tobool) { +; CHECK-LABEL: PR17487: +; CHECK: // %bb.0: +; CHECK-NEXT: dup v0.2s, w0 +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: bic v0.16b, v1.16b, v0.16b +; CHECK-NEXT: mov x8, v0.d[1] +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %tmp = insertelement <2 x i1> undef, i1 %tobool, i32 1 + %tmp1 = zext <2 x i1> %tmp to <2 x i64> + %tmp2 = xor <2 x i64> %tmp1, <i64 1, i64 1> + %tmp3 = extractelement <2 x i64> %tmp2, i32 1 + %add = add nsw i64 0, %tmp3 + %cmp6 = icmp ne i64 %add, 1 + %conv7 = zext i1 %cmp6 to i32 + ret i32 %conv7 +} diff --git a/llvm/test/CodeGen/WebAssembly/xor_reassociate.ll b/llvm/test/CodeGen/WebAssembly/xor_reassociate.ll index 9ef9c14084a7d7f..3dd8463d9dd1016 100644 --- a/llvm/test/CodeGen/WebAssembly/xor_reassociate.ll +++ b/llvm/test/CodeGen/WebAssembly/xor_reassociate.ll @@ -17,7 +17,7 @@ define i32 @reassociate_xor(float %x, float %y) { ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: f32.const 0x1p-23 ; CHECK-NEXT: f32.gt -; CHECK-NEXT: i32.ne +; CHECK-NEXT: i32.xor ; CHECK-NEXT: br_if 0 # 0: down to label0 ; CHECK-NEXT: # %bb.1: # %if.then.i ; CHECK-NEXT: i32.const 0 diff --git a/llvm/test/CodeGen/X86/lzcnt-cmp.ll b/llvm/test/CodeGen/X86/lzcnt-cmp.ll index c1cce6f5d8ca10d..6c8d5c9d55a6d55 100644 --- a/llvm/test/CodeGen/X86/lzcnt-cmp.ll +++ b/llvm/test/CodeGen/X86/lzcnt-cmp.ll @@ -50,35 +50,33 @@ define i1 @lshr_ctlz_undef_cmpeq_one_i64(i64 %in) nounwind { ; X86-BSR-NEXT: xorl $31, %eax ; X86-BSR-NEXT: addl $32, %eax ; X86-BSR-NEXT: .LBB1_2: -; X86-BSR-NEXT: testl $-64, %eax -; X86-BSR-NEXT: setne %al +; X86-BSR-NEXT: shrl $6, %eax +; X86-BSR-NEXT: # kill: def $al killed $al killed $eax ; X86-BSR-NEXT: retl ; ; X86-LZCNT-LABEL: lshr_ctlz_undef_cmpeq_one_i64: ; X86-LZCNT: # %bb.0: -; X86-LZCNT-NEXT: lzcntl {{[0-9]+}}(%esp), %eax -; X86-LZCNT-NEXT: addl $32, %eax -; X86-LZCNT-NEXT: xorl %ecx, %ecx +; X86-LZCNT-NEXT: lzcntl {{[0-9]+}}(%esp), %ecx +; X86-LZCNT-NEXT: addl $32, %ecx +; X86-LZCNT-NEXT: xorl %eax, %eax ; X86-LZCNT-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; X86-LZCNT-NEXT: cmovel %eax, %ecx -; X86-LZCNT-NEXT: testb $64, %cl -; X86-LZCNT-NEXT: setne %al +; X86-LZCNT-NEXT: cmovel %ecx, %eax +; X86-LZCNT-NEXT: shrl $6, %eax +; X86-LZCNT-NEXT: # kill: def $al killed $al killed $eax ; X86-LZCNT-NEXT: retl ; ; X64-BSR-LABEL: lshr_ctlz_undef_cmpeq_one_i64: ; X64-BSR: # %bb.0: ; X64-BSR-NEXT: bsrq %rdi, %rax ; X64-BSR-NEXT: shrl $6, %eax -; X64-BSR-NEXT: cmpl $1, %eax -; X64-BSR-NEXT: sete %al +; X64-BSR-NEXT: # kill: def $al killed $al killed $rax ; X64-BSR-NEXT: retq ; ; X64-LZCNT-LABEL: lshr_ctlz_undef_cmpeq_one_i64: ; X64-LZCNT: # %bb.0: ; X64-LZCNT-NEXT: lzcntq %rdi, %rax ; X64-LZCNT-NEXT: shrl $6, %eax -; X64-LZCNT-NEXT: cmpl $1, %eax -; X64-LZCNT-NEXT: sete %al +; X64-LZCNT-NEXT: # kill: def $al killed $al killed $rax ; X64-LZCNT-NEXT: retq %ctlz = call i64 @llvm.ctlz.i64(i64 %in, i1 -1) %lshr = lshr i64 %ctlz, 6 @@ -131,33 +129,33 @@ define i1 @lshr_ctlz_undef_cmpne_zero_i64(i64 %in) nounwind { ; X86-BSR-NEXT: xorl $31, %eax ; X86-BSR-NEXT: addl $32, %eax ; X86-BSR-NEXT: .LBB3_2: -; X86-BSR-NEXT: testl $-64, %eax -; X86-BSR-NEXT: setne %al +; X86-BSR-NEXT: shrl $6, %eax +; X86-BSR-NEXT: # kill: def $al killed $al killed $eax ; X86-BSR-NEXT: retl ; ; X86-LZCNT-LABEL: lshr_ctlz_undef_cmpne_zero_i64: ; X86-LZCNT: # %bb.0: -; X86-LZCNT-NEXT: lzcntl {{[0-9]+}}(%esp), %eax -; X86-LZCNT-NEXT: addl $32, %eax -; X86-LZCNT-NEXT: xorl %ecx, %ecx +; X86-LZCNT-NEXT: lzcntl {{[0-9]+}}(%esp), %ecx +; X86-LZCNT-NEXT: addl $32, %ecx +; X86-LZCNT-NEXT: xorl %eax, %eax ; X86-LZCNT-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; X86-LZCNT-NEXT: cmovel %eax, %ecx -; X86-LZCNT-NEXT: testb $64, %cl -; X86-LZCNT-NEXT: setne %al +; X86-LZCNT-NEXT: cmovel %ecx, %eax +; X86-LZCNT-NEXT: shrl $6, %eax +; X86-LZCNT-NEXT: # kill: def $al killed $al killed $eax ; X86-LZCNT-NEXT: retl ; ; X64-BSR-LABEL: lshr_ctlz_undef_cmpne_zero_i64: ; X64-BSR: # %bb.0: ; X64-BSR-NEXT: bsrq %rdi, %rax -; X64-BSR-NEXT: testl $-64, %eax -; X64-BSR-NEXT: setne %al +; X64-BSR-NEXT: shrl $6, %eax +; X64-BSR-NEXT: # kill: def $al killed $al killed $rax ; X64-BSR-NEXT: retq ; ; X64-LZCNT-LABEL: lshr_ctlz_undef_cmpne_zero_i64: ; X64-LZCNT: # %bb.0: ; X64-LZCNT-NEXT: lzcntq %rdi, %rax -; X64-LZCNT-NEXT: testb $64, %al -; X64-LZCNT-NEXT: setne %al +; X64-LZCNT-NEXT: shrl $6, %eax +; X64-LZCNT-NEXT: # kill: def $al killed $al killed $rax ; X64-LZCNT-NEXT: retq %ctlz = call i64 @llvm.ctlz.i64(i64 %in, i1 -1) %lshr = lshr i64 %ctlz, 6 diff --git a/llvm/test/CodeGen/X86/umul_fix_sat.ll b/llvm/test/CodeGen/X86/umul_fix_sat.ll index 33f43c75cad3d7c..6f10aae8909f8a3 100644 --- a/llvm/test/CodeGen/X86/umul_fix_sat.ll +++ b/llvm/test/CodeGen/X86/umul_fix_sat.ll @@ -517,15 +517,13 @@ define i64 @func8(i64 %x, i64 %y) nounwind { ; X86-NEXT: adcl $0, %ecx ; X86-NEXT: addl %ebp, %edx ; X86-NEXT: adcl $0, %ecx +; X86-NEXT: shldl $1, %edx, %ecx ; X86-NEXT: shrdl $31, %edx, %eax -; X86-NEXT: movl %edx, %esi -; X86-NEXT: shrl $31, %esi -; X86-NEXT: xorl %edi, %edi -; X86-NEXT: negl %esi -; X86-NEXT: sbbl %edi, %edi -; X86-NEXT: orl %edi, %eax -; X86-NEXT: shrdl $31, %ecx, %edx -; X86-NEXT: orl %edi, %edx +; X86-NEXT: testl $-2147483648, %edx # imm = 0x80000000 +; X86-NEXT: movl $-1, %edx +; X86-NEXT: cmovnel %edx, %eax +; X86-NEXT: cmovnel %edx, %ecx +; X86-NEXT: movl %ecx, %edx ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/xor.ll b/llvm/test/CodeGen/X86/xor.ll index 2072568b7ba75dd..8c8b7cc51610560 100644 --- a/llvm/test/CodeGen/X86/xor.ll +++ b/llvm/test/CodeGen/X86/xor.ll @@ -403,14 +403,19 @@ define i32 @PR17487(i1 %tobool) { ; ; X64-LIN-LABEL: PR17487: ; X64-LIN: # %bb.0: -; X64-LIN-NEXT: movl %edi, %eax -; X64-LIN-NEXT: andl $1, %eax +; X64-LIN-NEXT: movd %edi, %xmm0 +; X64-LIN-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; X64-LIN-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-LIN-NEXT: pextrw $4, %xmm0, %eax ; X64-LIN-NEXT: retq ; ; X64-WIN-LABEL: PR17487: ; X64-WIN: # %bb.0: -; X64-WIN-NEXT: andb $1, %cl ; X64-WIN-NEXT: movzbl %cl, %eax +; X64-WIN-NEXT: movd %eax, %xmm0 +; X64-WIN-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; X64-WIN-NEXT: pand __xmm@00000000000000010000000000000001(%rip), %xmm0 +; X64-WIN-NEXT: pextrw $4, %xmm0, %eax ; X64-WIN-NEXT: retq %tmp = insertelement <2 x i1> undef, i1 %tobool, i32 1 %tmp1 = zext <2 x i1> %tmp to <2 x i64> |
; X64-WIN-NEXT: movd %eax, %xmm0 | ||
; X64-WIN-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] | ||
; X64-WIN-NEXT: pand __xmm@00000000000000010000000000000001(%rip), %xmm0 | ||
; X64-WIN-NEXT: pextrw $4, %xmm0, %eax |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These look like regressions.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Interfering with scalarizeExtractedBinop ?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah these I wasn't sure about - how important they were. They get folded away under normal optimizations: https://godbolt.org/z/P4vMYd5oc, and it looked like the test was only added for correctness.
I think if the combines happened in topo order it might do better. The DAG looks like this:
t56: v4i32 = BUILD_VECTOR undef:i32, undef:i32, t2, undef:i32
t57: v2i64 = bitcast t56
t36: v2i64 = BUILD_VECTOR Constant:i64<1>, Constant:i64<1>
t39: v2i64 = and t57, t36
t50: v16i8 = bitcast t39
t51: i8 = extract_vector_elt t50, Constant:i64<8>
t45: i32 = zero_extend t51
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not really an authority on this, but the combine code makes sense so I wouldn't think its a dealbreaker, but would nice if we could fix it up.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So my hunch would be that scalarizeExtractedBinop can be extended to permit the fold:
extract_vector_elt(oneusebitcast(bitlogicop(x,y)))
->
bitlogicop(extract_vector_elt(bitcast(x)),extract_vector_elt(bitcast(y)))
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hmm, would imagine the 2x extract_vector_elt
would be slower. Why is that advantagious?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
scalarizeExtractedBinop only performs this if at least one of the operands can freely extract (i.e. its a constant)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@davemgreen Any luck with this?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hi. Unfortunately I think this might always been a bit too far down the todo list to get to any time soon. If you happen to have improvements for it that would useful.
I do think though, in my opinion, that this shouldn't be considered the kind of change that should block for the patch. After all it is optimized away by instcombine (https://godbolt.org/z/P4vMYd5oc), it was added to protect against a crash and still does not crash, we don't have any evidence of it coming up in practice, and most importantly it gets optimized away by instcombine. It sounds like you think otherwise though, and you think it might be more important.
// SETCC (SETCC), [0|1], [EQ|NE] -> SETCC | ||
if (N0.getOpcode() == ISD::SETCC && | ||
// SETCC (X), [0|1], [EQ|NE] -> X if X is known 0/1 | ||
if ((N0.getOpcode() == ISD::SETCC || VT != MVT::i1) && |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't quite understand the VT != MVT::i1
here. Doesn't that break things like setcc (setcc ...))
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It will do the transform if it is a setcc, or if the type isn't an i1. i1 types are trivially true for the known bits combine, and handled further below when the foldBools flag is set.
If we have a `SETCC (SETCC), 0, NE` and ZeroOrOneBooleanContent, we can remove the outer setcc as it will produce the same value as the inner. This can be generalized to anything where the top bits are known to be 0, as the value will remain as 1 or 0.
d723c95
to
243be99
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
lgtm. The small test regression is unrelated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
OK, no objection from me
If we have a
SETCC (SETCC), 0, NE
and ZeroOrOneBooleanContent, we can removethe outer setcc as it will produce the same value as the inner. This can be
generalized to anything where the top bits are known to be 0, as the value will
remain as 1 or 0.