Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[InstCombine] Extend foldICmpAddConstant to disjoint or. #75899

Open
wants to merge 2 commits into
base: main
Choose a base branch
from

Conversation

mgudim
Copy link
Contributor

@mgudim mgudim commented Dec 19, 2023

No description provided.

@mgudim mgudim requested a review from nikic as a code owner December 19, 2023 07:03
@mgudim mgudim marked this pull request as draft December 19, 2023 07:04
@llvmbot
Copy link
Collaborator

llvmbot commented Dec 19, 2023

@llvm/pr-subscribers-llvm-transforms

Author: Mikhail Gudim (mgudim)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/75899.diff

4 Files Affected:

  • (modified) llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp (+17-15)
  • (modified) llvm/lib/Transforms/InstCombine/InstCombineInternal.h (+1-1)
  • (modified) llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp (+1-1)
  • (modified) llvm/test/Transforms/InstCombine/icmp.ll (+10)
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 289976718e52f3..1cbcec3f21e8b2 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -2918,17 +2918,19 @@ static Value *createLogicFromTable(const std::bitset<4> &Table, Value *Op0,
 }
 
 /// Fold icmp (add X, Y), C.
-Instruction *InstCombinerImpl::foldICmpAddConstant(ICmpInst &Cmp,
-                                                   BinaryOperator *Add,
+Instruction *InstCombinerImpl::foldICmpAddLikeConstant(ICmpInst &Cmp,
+                                                   BinaryOperator *AddLike,
                                                    const APInt &C) {
-  Value *Y = Add->getOperand(1);
-  Value *X = Add->getOperand(0);
+  Value *X = nullptr;
+  Value *Y = nullptr;
+  if (!match(AddLike, m_AddLike(m_Value(X), m_Value(Y))))
+    return nullptr;
 
   Value *Op0, *Op1;
   Instruction *Ext0, *Ext1;
   const CmpInst::Predicate Pred = Cmp.getPredicate();
-  if (match(Add,
-            m_Add(m_CombineAnd(m_Instruction(Ext0), m_ZExtOrSExt(m_Value(Op0))),
+  if (match(AddLike,
+            m_AddLike(m_CombineAnd(m_Instruction(Ext0), m_ZExtOrSExt(m_Value(Op0))),
                   m_CombineAnd(m_Instruction(Ext1),
                                m_ZExtOrSExt(m_Value(Op1))))) &&
       Op0->getType()->isIntOrIntVectorTy(1) &&
@@ -2949,7 +2951,7 @@ Instruction *InstCombinerImpl::foldICmpAddConstant(ICmpInst &Cmp,
     Table[2] = ComputeTable(true, false);
     Table[3] = ComputeTable(true, true);
     if (auto *Cond =
-            createLogicFromTable(Table, Op0, Op1, Builder, Add->hasOneUse()))
+            createLogicFromTable(Table, Op0, Op1, Builder, AddLike->hasOneUse()))
       return replaceInstUsesWith(Cmp, Cond);
   }
   const APInt *C2;
@@ -2957,14 +2959,15 @@ Instruction *InstCombinerImpl::foldICmpAddConstant(ICmpInst &Cmp,
     return nullptr;
 
   // Fold icmp pred (add X, C2), C.
-  Type *Ty = Add->getType();
+  Type *Ty = AddLike->getType();
 
   // If the add does not wrap, we can always adjust the compare by subtracting
   // the constants. Equality comparisons are handled elsewhere. SGE/SLE/UGE/ULE
   // are canonicalized to SGT/SLT/UGT/ULT.
-  if ((Add->hasNoSignedWrap() &&
+  if (AddLike->getOpcode() == Instruction::Or ||
+    (AddLike->hasNoSignedWrap() &&
        (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLT)) ||
-      (Add->hasNoUnsignedWrap() &&
+      (AddLike->hasNoUnsignedWrap() &&
        (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_ULT))) {
     bool Overflow;
     APInt NewC =
@@ -3021,7 +3024,7 @@ Instruction *InstCombinerImpl::foldICmpAddConstant(ICmpInst &Cmp,
       return new ICmpInst(ICmpInst::ICMP_ULE, X, ConstantInt::get(Ty, C));
   }
 
-  if (!Add->hasOneUse())
+  if (!AddLike->hasOneUse())
     return nullptr;
 
   // X+C <u C2 -> (X & -C2) == C
@@ -3674,6 +3677,9 @@ InstCombinerImpl::foldICmpInstWithConstantAllowUndef(ICmpInst &Cmp,
 Instruction *InstCombinerImpl::foldICmpBinOpWithConstant(ICmpInst &Cmp,
                                                          BinaryOperator *BO,
                                                          const APInt &C) {
+  if (Instruction *I = foldICmpAddLikeConstant(Cmp, BO, C))
+    return I;
+
   switch (BO->getOpcode()) {
   case Instruction::Xor:
     if (Instruction *I = foldICmpXorConstant(Cmp, BO, C))
@@ -3716,10 +3722,6 @@ Instruction *InstCombinerImpl::foldICmpBinOpWithConstant(ICmpInst &Cmp,
     if (Instruction *I = foldICmpSubConstant(Cmp, BO, C))
       return I;
     break;
-  case Instruction::Add:
-    if (Instruction *I = foldICmpAddConstant(Cmp, BO, C))
-      return I;
-    break;
   default:
     break;
   }
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index f86db698ef8f12..756da48a1800e0 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -671,7 +671,7 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
                                    const APInt &C);
   Instruction *foldICmpSubConstant(ICmpInst &Cmp, BinaryOperator *Sub,
                                    const APInt &C);
-  Instruction *foldICmpAddConstant(ICmpInst &Cmp, BinaryOperator *Add,
+  Instruction *foldICmpAddLikeConstant(ICmpInst &Cmp, BinaryOperator *AddLike,
                                    const APInt &C);
   Instruction *foldICmpAndConstConst(ICmpInst &Cmp, BinaryOperator *And,
                                      const APInt &C1);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 20bf00344b144b..f5a91f5a84f012 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -1550,7 +1550,7 @@ tryToReuseConstantFromSelectInComparison(SelectInst &Sel, ICmpInst &Cmp,
   if (C0->getType() != Sel.getType())
     return nullptr;
 
-  // ULT with 'add' of a constant is canonical. See foldICmpAddConstant().
+  // ULT with 'add' of a constant is canonical. See foldICmpAddLikeConstant().
   // FIXME: Are there more magic icmp predicate+constant pairs we must avoid?
   //        Or should we just abandon this transform entirely?
   if (Pred == CmpInst::ICMP_ULT && match(X, m_Add(m_Value(), m_Constant())))
diff --git a/llvm/test/Transforms/InstCombine/icmp.ll b/llvm/test/Transforms/InstCombine/icmp.ll
index 1c7bb36f0d34c0..33d09bfad856a7 100644
--- a/llvm/test/Transforms/InstCombine/icmp.ll
+++ b/llvm/test/Transforms/InstCombine/icmp.ll
@@ -4912,3 +4912,13 @@ define i1 @or_positive_sgt_zero_multi_use(i8 %a) {
   %cmp = icmp sgt i8 %b, 0
   ret i1 %cmp
 }
+
+define i1 @icmp_disjoint_or(i32 %x) {
+; CHECK-LABEL: @icmp_disjoint_or(
+; CHECK-NEXT:    [[C:%.*]] = icmp sgt i32 [[X:%.*]], 35
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %or_ = or disjoint i32 %x, 6
+  %C = icmp sgt i32 %or_, 41
+  ret i1 %C
+}

Copy link

github-actions bot commented Dec 19, 2023

✅ With the latest revision this PR passed the C/C++ code formatter.

@dtcxzyw
Copy link
Member

dtcxzyw commented Dec 20, 2023

@mgudim Could you please have a look at dtcxzyw/llvm-opt-benchmark#10?
Our CI found that this patch introduced some unnecessary and %size, ~signmask patterns before calling operator new(size_t).
Some file changes:

bench/proxygen/optimized/HTTP2PriorityQueue.cpp.ll
bench/proxygen/optimized/Service.cpp.ll
bench/re2/optimized/compile.cc.ll
bench/re2/optimized/dfa.cc.ll
bench/re2/optimized/prefilter_tree.cc.ll
bench/re2/optimized/regexp.cc.ll
bench/velox/optimized/ArrayDistinct.cpp.ll
bench/velox/optimized/ArrayDuplicates.cpp.ll
bench/velox/optimized/ArrayIntersectExcept.cpp.ll
bench/velox/optimized/Filter.cpp.ll
...

@mgudim
Copy link
Contributor Author

mgudim commented Dec 20, 2023

@mgudim Could you please have a look at dtcxzyw/llvm-opt-benchmark#10? Our CI found that this patch introduced some unnecessary and %size, ~signmask patterns before calling operator new(size_t). Some file changes:

@dtcxzyw Sure, I'll take a look. But I didn't understand what file exactly the regressions are at. Can you paste the IR before and after the transformation. Preferably the smallest function which shows the regression.

@dtcxzyw
Copy link
Member

dtcxzyw commented Dec 20, 2023

@mgudim Could you please have a look at dtcxzyw/llvm-opt-benchmark#10? Our CI found that this patch introduced some unnecessary and %size, ~signmask patterns before calling operator new(size_t). Some file changes:

@dtcxzyw Sure, I'll take a look. But I didn't understand what file exactly the regressions are at. Can you paste the IR before and after the transformation. Preferably the smallest function which shows the regression.

One example:

diff --git a/bench/proxygen/optimized/HTTP2PriorityQueue.cpp.ll b/bench/proxygen/optimized/HTTP2PriorityQueue.cpp.ll
index 54ccb32b..9d7fe815 100644
--- a/bench/proxygen/optimized/HTTP2PriorityQueue.cpp.ll
+++ b/bench/proxygen/optimized/HTTP2PriorityQueue.cpp.ll
@@ -7152,7 +7152,7 @@ entry:
   %mul3.i31 = shl i64 %newChunkCount, 8
   %retval.0.i32 = select i1 %cmp.i28, i64 %add.i30, i64 %mul3.i31
   store i64 %retval.0.i32, ptr %newAllocSize, align 8
-  %cmp.i.i.i.i.i = icmp slt i64 %retval.0.i32, 0
+  %cmp.i.i.i.i.i = icmp slt i64 %retval.0.i32, -15
   br i1 %cmp.i.i.i.i.i, label %if.then.i.i.i.i.i, label %_ZN5folly3f146detail10BasePolicyImPN8proxygen18HTTP2PriorityQueue4NodeEvvvSt4pairIKmS6_EE12beforeRehashEmmmmRPh.exit
 
 if.then.i.i.i.i.i:                                ; preds = %entry
@@ -7160,7 +7160,8 @@ if.then.i.i.i.i.i:                                ; preds = %entry
   unreachable
 
 _ZN5folly3f146detail10BasePolicyImPN8proxygen18HTTP2PriorityQueue4NodeEvvvSt4pairIKmS6_EE12beforeRehashEmmmmRPh.exit: ; preds = %entry
-  %call5.i.i2.i.i1.i = tail call noalias noundef nonnull ptr @_Znwm(i64 noundef %retval.0.i32) #30
+  %div1.i.i.i = and i64 %retval.0.i32, 9223372036854775792
+  %call5.i.i2.i.i1.i = tail call noalias noundef nonnull ptr @_Znwm(i64 noundef %div1.i.i.i) #30
   store ptr %call5.i.i2.i.i1.i, ptr %rawAllocation, align 8
   store i8 0, ptr %undoState, align 1
   %cmp5.not.i = icmp ne i64 %newChunkCount, 0

I will post a reduced test case later :)

@dtcxzyw
Copy link
Member

dtcxzyw commented Dec 29, 2023

@mgudim Could you please have a look at dtcxzyw/llvm-opt-benchmark#10? Our CI found that this patch introduced some unnecessary and %size, ~signmask patterns before calling operator new(size_t). Some file changes:

@dtcxzyw Sure, I'll take a look. But I didn't understand what file exactly the regressions are at. Can you paste the IR before and after the transformation. Preferably the smallest function which shows the regression.

One example:

diff --git a/bench/proxygen/optimized/HTTP2PriorityQueue.cpp.ll b/bench/proxygen/optimized/HTTP2PriorityQueue.cpp.ll
index 54ccb32b..9d7fe815 100644
--- a/bench/proxygen/optimized/HTTP2PriorityQueue.cpp.ll
+++ b/bench/proxygen/optimized/HTTP2PriorityQueue.cpp.ll
@@ -7152,7 +7152,7 @@ entry:
   %mul3.i31 = shl i64 %newChunkCount, 8
   %retval.0.i32 = select i1 %cmp.i28, i64 %add.i30, i64 %mul3.i31
   store i64 %retval.0.i32, ptr %newAllocSize, align 8
-  %cmp.i.i.i.i.i = icmp slt i64 %retval.0.i32, 0
+  %cmp.i.i.i.i.i = icmp slt i64 %retval.0.i32, -15
   br i1 %cmp.i.i.i.i.i, label %if.then.i.i.i.i.i, label %_ZN5folly3f146detail10BasePolicyImPN8proxygen18HTTP2PriorityQueue4NodeEvvvSt4pairIKmS6_EE12beforeRehashEmmmmRPh.exit
 
 if.then.i.i.i.i.i:                                ; preds = %entry
@@ -7160,7 +7160,8 @@ if.then.i.i.i.i.i:                                ; preds = %entry
   unreachable
 
 _ZN5folly3f146detail10BasePolicyImPN8proxygen18HTTP2PriorityQueue4NodeEvvvSt4pairIKmS6_EE12beforeRehashEmmmmRPh.exit: ; preds = %entry
-  %call5.i.i2.i.i1.i = tail call noalias noundef nonnull ptr @_Znwm(i64 noundef %retval.0.i32) #30
+  %div1.i.i.i = and i64 %retval.0.i32, 9223372036854775792
+  %call5.i.i2.i.i1.i = tail call noalias noundef nonnull ptr @_Znwm(i64 noundef %div1.i.i.i) #30
   store ptr %call5.i.i2.i.i1.i, ptr %rawAllocation, align 8
   store i8 0, ptr %undoState, align 1
   %cmp5.not.i = icmp ne i64 %newChunkCount, 0

I will post a reduced test case later :)

Sorry, I cannot provide a reduced test since it is hard to do semantic-preserving reduction.
It seems that we lose the information %retval.0.i32 >= 0.

@mgudim
Copy link
Contributor Author

mgudim commented Dec 30, 2023

@dtcxzyw

Sorry, I cannot provide a reduced test since it is hard to do semantic-preserving reduction.
It seems that we lose the information %retval.0.i32 >= 0.

No worries. I'll take a look, but sometime later. I'm in the middle of something else right now.

@mgudim
Copy link
Contributor Author

mgudim commented Dec 30, 2023

@dtcxzyw Is the diff shown for output of the optimizer before / after my optimization. If so, where is the original source of the benchmark?

Basically, I am asking for steps to reproduce the regression.

@dtcxzyw
Copy link
Member

dtcxzyw commented Dec 31, 2023

@dtcxzyw Is the diff shown for output of the optimizer before / after my optimization. If so, where is the original source of the benchmark?

Basically, I am asking for steps to reproduce the regression.

wget https://raw.githubusercontent.com/dtcxzyw/llvm-opt-benchmark/main/bench/proxygen/original/HTTP2PriorityQueue.cpp.ll
opt -O3 HTTP2PriorityQueue.cpp.ll -S | grep -E "and i64.+9223372036854775792"

@mgudim
Copy link
Contributor Author

mgudim commented Jan 4, 2024

OK, I see what's going on. First, note that 9223372036854775792 has 0 in position 63, bits [0, 3] are zero and all other bits are 1. Below is the simplified test case:

define i64 @foo(i64 %x, i64 %y, i1 %c) {
  %shlx_ = shl i64 %x, 8
  %shly_ = shl i64 %y, 4
  %add2 = add i64 %shly_, 16
  %select_ = select i1 %c, i64 %shlx_, i64 %add2
  %add_ = add i64 %select_, 15
  %cmp_ = icmp slt i64 %add_, 0
  br i1 %cmp_, label %t, label %f

t:
	unreachable

f:
	%and_ = and i64 %add_, 9223372036854775792
	ret i64 %and_
}

Before my change, the code looks like this when we come to visitAnd in InstCombine:

define i64 @foo(i64 %x, i64 %y, i1 %c) {
  %shlx_ = shl i64 %x, 8
  %shly_ = shl i64 %y, 4
  %add2 = add i64 %shly_, 16
  %select_ = select i1 %c, i64 %shlx_, i64 %add2
  %cmp_ = icmp slt i64 %select_, 0
  br i1 %cmp_, label %t, label %f

t:                                                ; preds = %0
   unreachable

f:                                                ; preds = %0
  %add_ = or disjoint i64 %select_, 15
  %and = and i64 %select_, 9223372036854775792
  ret i64 %and
}

In other words, the cmp_ = icmp slt i64 %select_, 0 was NOT simplified, so we know that if we reach f, the highest bit of %select_ is zero. We can then see that the add and the and are not needed.

After my change:

define i64 @foo(i64 %x, i64 %y, i1 %c) {
  %shlx_ = shl i64 %x, 8
  %shly_ = shl i64 %y, 4
  %add2 = add i64 %shly_, 16
  %select_ = select i1 %c, i64 %shlx_, i64 %add2
  %cmp_ = icmp slt i64 %select_, -15
  br i1 %cmp_, label %t, label %f

t:                                                ; preds = %0
  unreachable

f:                                                ; preds = %0
  %add_ = or disjoint i64 %select_, 15
  %and = and i64 %add_, 9223372036854775792
  ret i64 %and
}

The compare instruction is now changed to %cmp_ = icmp slt i64 %select_, -15, so now when we analyze the and, we can no longer deduce that the highest bit is zero.

There are a couple of ways to fix this but I am not sure which one is the best. I was thinking we could add some capabilities to simplifyICmpWithZero. Or maybe foldICmpUsingKnownBits?

@nikic What do you think?

@nikic
Copy link
Contributor

nikic commented Jan 4, 2024

After my change:

I don't really get what your change does in that example. Why is it changing the comparison operand from 0 to -15? Where is the or + icmp pattern in this sample?

@mgudim
Copy link
Contributor Author

mgudim commented Jan 4, 2024

@nikic

I don't really get what your change does in that example. Why is it changing the comparison operand from 0 to -15? Where is the or + icmp pattern in this sample?

In the "simplified testcase" we have this:

  %add_ = add i64 %select_, 15
  %cmp_ = icmp slt i64 %add_, 0

The add instruction first changes to or. Then during visitICmp, due to my change it gets deleted, at the expense of replacing 0 with -15 in the comparison. BUT now, since it's -15, we can't deduce that the highest bit is zero if we come to the f block.

@nikic
Copy link
Contributor

nikic commented Jan 4, 2024

I see, thanks. A possible way to fix this would be to try to take into account already known bits when adding known bits from conditions, i.e. x sgt -15 implies x sgt 0 if the low 4 bits of x are known zero. But I don't think this is worth bothering with. (An alternative to avoid the problem in the first place would be a one-use limitation, but that can cause other problems.)

(It looks like the patch needs a rebase.)

@mgudim
Copy link
Contributor Author

mgudim commented Jan 6, 2024

@nikic @dtcxzyw is this good to merge?

@mgudim
Copy link
Contributor Author

mgudim commented Jan 6, 2024

I'll try to put in a fix for that degradation in the near future @dtcxzyw

@dtcxzyw
Copy link
Member

dtcxzyw commented Jan 6, 2024

I'll try to put in a fix for that degradation in the near future @dtcxzyw

You can file an issue to track the work.

@mgudim
Copy link
Contributor Author

mgudim commented Jan 8, 2024

ping

@mgudim
Copy link
Contributor Author

mgudim commented Jan 16, 2024

@nikic @dtcxzyw ping

@mgudim
Copy link
Contributor Author

mgudim commented Jan 20, 2024

@dtcxzyw @nikic Is this OK to merge?

@mgudim
Copy link
Contributor Author

mgudim commented Jan 25, 2024

I have created an MR to fix the degradation: #79405

@mgudim
Copy link
Contributor Author

mgudim commented Jan 27, 2024

@nikic @dtcxzyw @goldsteinn
ping

@mgudim
Copy link
Contributor Author

mgudim commented Jan 30, 2024

@nikic @dtcxzyw @goldsteinn
Ping.

@mgudim
Copy link
Contributor Author

mgudim commented Feb 6, 2024

@nikic @dtcxzyw @goldsteinn
Ping.

@dtcxzyw
Copy link
Member

dtcxzyw commented Feb 6, 2024

@mgudim Are you willing to block this patch until #79405 is landed? If not, I will give a quick review for this patch.

@mgudim
Copy link
Contributor Author

mgudim commented Feb 6, 2024

@mgudim Are you willing to block this patch until #79405 is landed? If not, I will give a quick review for this patch.

@dtcxzyw If you think it's better to wait for #79405 to be merged I am good with that. I just don't want the patches to become abandoned and waste my work.

@@ -3721,10 +3728,6 @@ Instruction *InstCombinerImpl::foldICmpBinOpWithConstant(ICmpInst &Cmp,
if (Instruction *I = foldICmpSubConstant(Cmp, BO, C))
return I;
break;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
break;
break;
case Instruction::Or:
if (Instruction *I = foldICmpOrConstant(Cmp, BO, C))
return I;
[[fallthrough]];

I prefer the switch+fallthrough approach.

%or_ = or disjoint i32 %x, 5
%C = icmp ne i32 %or_, 5
ret i1 %C
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please also add some negative tests.

@dtcxzyw
Copy link
Member

dtcxzyw commented Mar 25, 2024

Reverse ping :)

@mgudim
Copy link
Contributor Author

mgudim commented Mar 26, 2024

Reverse ping :)

I am busy working on other things now, but I plan to come back to this.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

None yet

5 participants