Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[BDCE] Handle multi-use and/or/xor on demanded bits #79688

Conversation

antoniofrighetto
Copy link
Contributor

@antoniofrighetto antoniofrighetto commented Jan 27, 2024

Simplify multi-use and or ors when these last do not affect the demanded bits being considered.

Fixes: #78596.

Proofs: https://alive2.llvm.org/ce/z/kmVqE-.

@llvmbot
Copy link
Collaborator

llvmbot commented Jan 27, 2024

@llvm/pr-subscribers-llvm-transforms

Author: Antonio Frighetto (antoniofrighetto)

Changes

Simplify multi-use select of ands when these last do not affect the demanded bits being considered.

Fixes: #78596.

Proofs: https://alive2.llvm.org/ce/z/BBdrCz.


Full diff: https://github.com/llvm/llvm-project/pull/79688.diff

2 Files Affected:

  • (modified) llvm/lib/Transforms/Scalar/BDCE.cpp (+27)
  • (added) llvm/test/Transforms/BDCE/select-multiuse.ll (+122)
diff --git a/llvm/lib/Transforms/Scalar/BDCE.cpp b/llvm/lib/Transforms/Scalar/BDCE.cpp
index 1fa2c75b0f42ac1..178b5a0da469cdd 100644
--- a/llvm/lib/Transforms/Scalar/BDCE.cpp
+++ b/llvm/lib/Transforms/Scalar/BDCE.cpp
@@ -125,6 +125,33 @@ static bool bitTrackingDCE(Function &F, DemandedBits &DB) {
       }
     }
 
+    // Simplify select of ands when the mask does not affect the demanded bits.
+    if (SelectInst *SI = dyn_cast<SelectInst>(&I)) {
+      APInt Demanded = DB.getDemandedBits(SI);
+      if (!Demanded.isAllOnes()) {
+        for (Use &U : I.operands()) {
+          auto *And = dyn_cast<BinaryOperator>(&U);
+          if (!And || !And->hasOneUse() || And->getOpcode() != Instruction::And)
+            continue;
+
+          auto *CV = dyn_cast<ConstantInt>(And->getOperand(1));
+          if (!CV)
+            continue;
+
+          APInt Mask = CV->getValue();
+          if ((Mask | ~Demanded).isAllOnes()) {
+            clearAssumptionsOfUsers(And, DB);
+            U.set(And->getOperand(0));
+            Worklist.push_back(And);
+            Changed = true;
+          }
+        }
+      }
+
+      if (Changed)
+        continue;
+    }
+
     for (Use &U : I.operands()) {
       // DemandedBits only detects dead integer uses.
       if (!U->getType()->isIntOrIntVectorTy())
diff --git a/llvm/test/Transforms/BDCE/select-multiuse.ll b/llvm/test/Transforms/BDCE/select-multiuse.ll
new file mode 100644
index 000000000000000..8ad91cc7458d114
--- /dev/null
+++ b/llvm/test/Transforms/BDCE/select-multiuse.ll
@@ -0,0 +1,122 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -passes=bdce < %s | FileCheck %s
+
+define void @select(i1 %c, i64 %a, i64 %b) {
+; CHECK-LABEL: define void @select(
+; CHECK-SAME: i1 [[C:%.*]], i64 [[A:%.*]], i64 [[B:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[S:%.*]] = select i1 [[C]], i64 [[A]], i64 [[B]]
+; CHECK-NEXT:    [[RET1:%.*]] = and i64 [[S]], 8
+; CHECK-NEXT:    [[RET2:%.*]] = and i64 [[S]], 16
+; CHECK-NEXT:    call void @use(i64 [[RET1]])
+; CHECK-NEXT:    call void @use(i64 [[RET2]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %and1 = and i64 %a, 24                          ; Mask:          0001 1000
+  %and2 = and i64 %b, 25                          ; Mask:          0001 1001
+  %s = select i1 %c, i64 %and1, i64 %and2
+  %ret1 = and i64 %s, 8                           ; Demanded bits: 0000 1000
+  %ret2 = and i64 %s, 16                          ; Demanded bits: 0001 0000
+  call void @use(i64 %ret1)
+  call void @use(i64 %ret2)
+  ret void
+}
+
+define void @select_2(i1 %c, i64 %a, i64 %b) {
+; CHECK-LABEL: define void @select_2(
+; CHECK-SAME: i1 [[C:%.*]], i64 [[A:%.*]], i64 [[B:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[AND2:%.*]] = and i64 [[B]], 23
+; CHECK-NEXT:    [[S:%.*]] = select i1 [[C]], i64 [[A]], i64 [[AND2]]
+; CHECK-NEXT:    [[RET1:%.*]] = and i64 [[S]], 8
+; CHECK-NEXT:    [[RET2:%.*]] = and i64 [[S]], 16
+; CHECK-NEXT:    call void @use(i64 [[RET1]])
+; CHECK-NEXT:    call void @use(i64 [[RET2]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %and1 = and i64 %a, 25                          ; Mask:          0001 1001
+  %and2 = and i64 %b, 23                          ; Mask:          0001 0111
+  %s = select i1 %c, i64 %and1, i64 %and2
+  %ret1 = and i64 %s, 8                           ; Demanded bits: 0000 1000
+  %ret2 = and i64 %s, 16                          ; Demanded bits: 0001 0000
+  call void @use(i64 %ret1)
+  call void @use(i64 %ret2)
+  ret void
+}
+
+define void @select_different_demanded(i1 %c, i64 %a, i64 %b) {
+; CHECK-LABEL: define void @select_different_demanded(
+; CHECK-SAME: i1 [[C:%.*]], i64 [[A:%.*]], i64 [[B:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[AND1:%.*]] = and i64 0, 24
+; CHECK-NEXT:    [[AND2:%.*]] = and i64 [[B]], 25
+; CHECK-NEXT:    [[S:%.*]] = select i1 [[C]], i64 [[AND1]], i64 [[AND2]]
+; CHECK-NEXT:    [[RET1:%.*]] = and i64 [[S]], 3
+; CHECK-NEXT:    [[RET2:%.*]] = and i64 [[S]], 7
+; CHECK-NEXT:    call void @use(i64 [[RET1]])
+; CHECK-NEXT:    call void @use(i64 [[RET2]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %and1 = and i64 %a, 24                          ; Mask:          0001 1000
+  %and2 = and i64 %b, 25                          ; Mask:          0001 1001
+  %s = select i1 %c, i64 %and1, i64 %and2
+  %ret1 = and i64 %s, 3                           ; Demanded bits: 0000 0011
+  %ret2 = and i64 %s, 7                           ; Demanded bits: 0000 0111
+  call void @use(i64 %ret1)
+  call void @use(i64 %ret2)
+  ret void
+}
+
+define void @and_multiuse(i1 %c, i64 %a, i64 %b) {
+; CHECK-LABEL: define void @and_multiuse(
+; CHECK-SAME: i1 [[C:%.*]], i64 [[A:%.*]], i64 [[B:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[AND2:%.*]] = and i64 [[B]], 25
+; CHECK-NEXT:    [[S:%.*]] = select i1 [[C]], i64 [[A]], i64 [[AND2]]
+; CHECK-NEXT:    [[RET1:%.*]] = and i64 [[S]], 8
+; CHECK-NEXT:    [[RET2:%.*]] = and i64 [[S]], 16
+; CHECK-NEXT:    call void @use(i64 [[RET1]])
+; CHECK-NEXT:    call void @use2(i64 [[RET2]], i64 [[AND2]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %and1 = and i64 %a, 24
+  %and2 = and i64 %b, 25
+  %s = select i1 %c, i64 %and1, i64 %and2
+  %ret1 = and i64 %s, 8
+  %ret2 = and i64 %s, 16
+  call void @use(i64 %ret1)
+  call void @use2(i64 %ret2, i64 %and2)
+  ret void
+}
+
+define void @select_vectorized(i1 %c, <2 x i8> %a, <2 x i8> %b) {
+; CHECK-LABEL: define void @select_vectorized(
+; CHECK-SAME: i1 [[C:%.*]], <2 x i8> [[A:%.*]], <2 x i8> [[B:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[AND1:%.*]] = and <2 x i8> [[A]], <i8 28, i8 28>
+; CHECK-NEXT:    [[AND2:%.*]] = and <2 x i8> [[B]], <i8 29, i8 29>
+; CHECK-NEXT:    [[S:%.*]] = select i1 [[C]], <2 x i8> [[AND1]], <2 x i8> [[AND2]]
+; CHECK-NEXT:    [[RET1:%.*]] = and <2 x i8> [[S]], <i8 4, i8 4>
+; CHECK-NEXT:    [[RET2:%.*]] = and <2 x i8> [[S]], <i8 12, i8 12>
+; CHECK-NEXT:    call void @use3(<2 x i8> [[RET1]])
+; CHECK-NEXT:    call void @use3(<2 x i8> [[RET2]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %and1 = and <2 x i8> %a, <i8 28, i8 28>
+  %and2 = and <2 x i8> %b, <i8 29, i8 29>
+  %s = select i1 %c, <2 x i8> %and1, <2 x i8> %and2
+  %ret1 = and <2 x i8> %s, <i8 4, i8 4>
+  %ret2 = and <2 x i8> %s, <i8 12, i8 12>
+  call void @use3(<2 x i8> %ret1)
+  call void @use3(<2 x i8> %ret2)
+  ret void
+}
+
+declare void @use(i64)
+declare void @use2(i64, i64)
+declare void @use3(<2 x i8>)

llvm/lib/Transforms/Scalar/BDCE.cpp Outdated Show resolved Hide resolved
llvm/lib/Transforms/Scalar/BDCE.cpp Outdated Show resolved Hide resolved
llvm/lib/Transforms/Scalar/BDCE.cpp Outdated Show resolved Hide resolved
llvm/lib/Transforms/Scalar/BDCE.cpp Outdated Show resolved Hide resolved
llvm/lib/Transforms/Scalar/BDCE.cpp Show resolved Hide resolved
dtcxzyw added a commit to dtcxzyw/llvm-opt-benchmark that referenced this pull request Jan 27, 2024
Copy link
Contributor

@nikic nikic left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

...wat? This should not be looking at selects. You should only be looking at the demanded bits of the and.

@antoniofrighetto
Copy link
Contributor Author

Could you kindly provide an example showing how not restricting this to the select may be beneficial? We don't need to specialize it for the select, but restricting the input space looked reasonable to me.

You should only be looking at the demanded bits of the and.

You mean we should only be looking at the demanded bits of the instruction itself when the operand analyzed is the and? I don't see how the optimization can take place if the instruction examined is not the multi-use instruction.

@nikic
Copy link
Contributor

nikic commented Jan 27, 2024

Could you kindly provide an example showing how not restricting this to the select may be beneficial? We don't need to specialize it for the select, but restricting the input space looked reasonable to me.

You can replace the select with pretty much any instruction that transfers demanded bits, e.g. https://alive2.llvm.org/ce/z/MF1Ry3.

You should only be looking at the demanded bits of the and.

You mean we should only be looking at the demanded bits of the instruction itself when the operand analyzed is the and? I don't see how the optimization can take place if the instruction examined is not the multi-use instruction.

I would expect you to query the demanded bits of the and instruction, similar to the code directly above that handles sext. Unless I'm missing something, that will cover the motivating pattern.

@XChy
Copy link
Member

XChy commented Jan 27, 2024

For a single BDCE pass, I think a minimal testcase can be like:

define void @src(i64 %a) {
entry:
  %and = and i64 %a, 24
  %ret1 = and i64 %and, 8
  %ret2 = and i64 %and, 16
  call void @use(i64 %ret1)
  call void @use(i64 %ret2)
  ret void
}

define void @tgt(i64 %a) {
entry:
  %ret1 = and i64 %a, 8
  %ret2 = and i64 %a, 16
  call void @use(i64 %ret1)
  call void @use(i64 %ret2)
  ret void
}

@antoniofrighetto
Copy link
Contributor Author

I would expect you to query the demanded bits of the and instruction, similar to the code directly above that handles sext. Unless I'm missing something, that will cover the motivating pattern.

Reviewed, thanks, somehow I thought it would be easier to handle the and/or as operands of the instruction being analyzed (and get its current demanded bits).

@antoniofrighetto antoniofrighetto force-pushed the feature/bdce-multiuse-select-of-ands branch from ac76f66 to c862fbb Compare January 27, 2024 18:38
@antoniofrighetto antoniofrighetto changed the title [BDCE] Handle multi-use select of ands on demanded bits [BDCE] Handle multi-use and/or on demanded bits Jan 27, 2024
@antoniofrighetto antoniofrighetto force-pushed the feature/bdce-multiuse-select-of-ands branch from c862fbb to 524bf49 Compare January 27, 2024 18:58
llvm/lib/Transforms/Scalar/BDCE.cpp Outdated Show resolved Hide resolved
llvm/lib/Transforms/Scalar/BDCE.cpp Outdated Show resolved Hide resolved
dtcxzyw added a commit to dtcxzyw/llvm-opt-benchmark that referenced this pull request Jan 28, 2024
Copy link
Contributor

@nikic nikic left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, this looks a lot more like what I had in mind.

entry:
%and1 = and i64 %a, 24 ; Mask: 0001 1000
%and2 = and i64 %b, 25 ; Mask: 0001 1001
%or = or i64 %and1, %and2
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you please add a test variant with or disjoint, to show that the flag gets dropped correctly?

@@ -9,8 +9,7 @@ declare <2 x i32> @llvm.fshr.v2i32(<2 x i32>, <2 x i32>, <2 x i32>)
; First fshr operand is dead.
define i32 @pr39771_fshr_multi_use_instr(i32 %a) {
; CHECK-LABEL: @pr39771_fshr_multi_use_instr(
; CHECK-NEXT: [[X:%.*]] = or i32 [[A:%.*]], 0
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you change the or operand to 1 (or something else that doesn't optimize away) instead? Dropping it is not really the point of this tests.

@antoniofrighetto antoniofrighetto force-pushed the feature/bdce-multiuse-select-of-ands branch from 524bf49 to f98b584 Compare January 29, 2024 07:23
@antoniofrighetto
Copy link
Contributor Author

Reviewed, thanks.

Copy link
Contributor

@nikic nikic left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM

@XChy XChy changed the title [BDCE] Handle multi-use and/or on demanded bits [BDCE] Handle multi-use and/or/xor on demanded bits Jan 29, 2024
Copy link
Member

@XChy XChy left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM.

@antoniofrighetto antoniofrighetto force-pushed the feature/bdce-multiuse-select-of-ands branch from f98b584 to 3b03a6a Compare January 29, 2024 17:59
Simplify multi-use `and`/`or`/`xor` when these last
do not affect the demanded bits being considered.

Fixes: llvm#78596.

Proofs: https://alive2.llvm.org/ce/z/EjuWHa.
@antoniofrighetto antoniofrighetto force-pushed the feature/bdce-multiuse-select-of-ands branch from 3b03a6a to 2073782 Compare January 29, 2024 18:03
@antoniofrighetto antoniofrighetto merged commit 2073782 into llvm:main Jan 29, 2024
3 of 4 checks passed
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

[BDCE] Missed optimization: simplify demanded bits on multi-use instructions
5 participants