-
Notifications
You must be signed in to change notification settings - Fork 15k
[InstCombine] Allow folding cross-lane operations into PHIs/selects #164388
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
Previously, cross-lane operations were disallowed here, but they are only problematic if the `select` condition is a vector, as the input of the operation is not simply one of the arms of the phi/select.
|
@llvm/pr-subscribers-llvm-transforms Author: Benjamin Maxwell (MacDue) ChangesPreviously, cross-lane operations were disallowed here, but they are only problematic if the Full diff: https://github.com/llvm/llvm-project/pull/164388.diff 7 Files Affected:
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index dab200d86d4a0..4a9128a3384f0 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -4003,18 +4003,28 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
// Try to fold intrinsic into select/phi operands. This is legal if:
// * The intrinsic is speculatable.
- // * The select condition is not a vector, or the intrinsic does not
- // perform cross-lane operations.
- if (isSafeToSpeculativelyExecuteWithVariableReplaced(&CI) &&
- isNotCrossLaneOperation(II))
+ // * The operand is one of the following:
+ // - a phi.
+ // - a select with a scalar condition.
+ // - a select with a vector condition and II is not a cross lane operation.
+ if (isSafeToSpeculativelyExecuteWithVariableReplaced(&CI)) {
for (Value *Op : II->args()) {
- if (auto *Sel = dyn_cast<SelectInst>(Op))
- if (Instruction *R = FoldOpIntoSelect(*II, Sel))
+ if (auto *Sel = dyn_cast<SelectInst>(Op)) {
+ bool IsVectorCond = Sel->getCondition()->getType()->isVectorTy();
+ if (IsVectorCond && !isNotCrossLaneOperation(II))
+ continue;
+ // Don't replace a scalar select with a more expensive vector select if
+ // we can't simplify both arms of the select.
+ bool SimplifyBothArms = !Op->getType()->isVectorTy() && II->getType()->isVectorTy();
+ if (Instruction *R = FoldOpIntoSelect(
+ *II, Sel, /*FoldWithMultiUse=*/false, SimplifyBothArms))
return R;
+ }
if (auto *Phi = dyn_cast<PHINode>(Op))
if (Instruction *R = foldOpIntoPhi(*II, Phi))
return R;
}
+ }
if (Instruction *Shuf = foldShuffledIntrinsicOperands(II))
return Shuf;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 943c223e516fb..ede73f86f1944 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -664,7 +664,8 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
/// This also works for Cast instructions, which obviously do not have a
/// second operand.
Instruction *FoldOpIntoSelect(Instruction &Op, SelectInst *SI,
- bool FoldWithMultiUse = false);
+ bool FoldWithMultiUse = false,
+ bool SimplifyBothArms = false);
/// This is a convenience wrapper function for the above two functions.
Instruction *foldBinOpIntoSelectOrPhi(BinaryOperator &I);
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 3f11cae143b81..c52d030efd229 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1777,7 +1777,8 @@ static Value *foldOperationIntoSelectOperand(Instruction &I, SelectInst *SI,
}
Instruction *InstCombinerImpl::FoldOpIntoSelect(Instruction &Op, SelectInst *SI,
- bool FoldWithMultiUse) {
+ bool FoldWithMultiUse,
+ bool SimplifyBothArms) {
// Don't modify shared select instructions unless set FoldWithMultiUse
if (!SI->hasOneUse() && !FoldWithMultiUse)
return nullptr;
@@ -1821,6 +1822,9 @@ Instruction *InstCombinerImpl::FoldOpIntoSelect(Instruction &Op, SelectInst *SI,
if (!NewTV && !NewFV)
return nullptr;
+ if (SimplifyBothArms)
+ return nullptr;
+
// Create an instruction for the arm that did not fold.
if (!NewTV)
NewTV = foldOperationIntoSelectOperand(Op, SI, TV, *this);
diff --git a/llvm/test/Transforms/InstCombine/intrinsic-select.ll b/llvm/test/Transforms/InstCombine/intrinsic-select.ll
index 2f1f9fc2b6f9e..18f068856951e 100644
--- a/llvm/test/Transforms/InstCombine/intrinsic-select.ll
+++ b/llvm/test/Transforms/InstCombine/intrinsic-select.ll
@@ -222,8 +222,7 @@ declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)
define i32 @vec_to_scalar_select_scalar(i1 %b) {
; CHECK-LABEL: @vec_to_scalar_select_scalar(
-; CHECK-NEXT: [[S:%.*]] = select i1 [[B:%.*]], <2 x i32> <i32 1, i32 2>, <2 x i32> <i32 3, i32 4>
-; CHECK-NEXT: [[C:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[S]])
+; CHECK-NEXT: [[C:%.*]] = select i1 [[B:%.*]], i32 3, i32 7
; CHECK-NEXT: ret i32 [[C]]
;
%s = select i1 %b, <2 x i32> <i32 1, i32 2>, <2 x i32> <i32 3, i32 4>
diff --git a/llvm/test/Transforms/InstCombine/phi.ll b/llvm/test/Transforms/InstCombine/phi.ll
index 3454835d3ad65..c8a1cf6efb742 100644
--- a/llvm/test/Transforms/InstCombine/phi.ll
+++ b/llvm/test/Transforms/InstCombine/phi.ll
@@ -3026,6 +3026,31 @@ join:
ret i32 %umax
}
+define i32 @cross_lane_intrinsic_over_phi(i1 %c, i1 %c2, <4 x i32> %a) {
+; CHECK-LABEL: @cross_lane_intrinsic_over_phi(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 [[C:%.*]], label [[IF:%.*]], label [[JOIN:%.*]]
+; CHECK: if:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[A:%.*]])
+; CHECK-NEXT: br label [[JOIN]]
+; CHECK: join:
+; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[TMP0]], [[IF]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: call void @may_exit()
+; CHECK-NEXT: ret i32 [[PHI]]
+;
+entry:
+ br i1 %c, label %if, label %join
+
+if:
+ br label %join
+
+join:
+ %phi = phi <4 x i32> [ %a, %if ], [ zeroinitializer, %entry ]
+ call void @may_exit()
+ %sum = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %phi)
+ ret i32 %sum
+}
+
define i32 @multiple_intrinsics_with_multiple_phi_uses(i1 %c, i32 %arg) {
; CHECK-LABEL: @multiple_intrinsics_with_multiple_phi_uses(
; CHECK-NEXT: entry:
diff --git a/llvm/test/Transforms/InstCombine/select_frexp.ll b/llvm/test/Transforms/InstCombine/select_frexp.ll
index d025aedda7170..ccfcca1fe7cb5 100644
--- a/llvm/test/Transforms/InstCombine/select_frexp.ll
+++ b/llvm/test/Transforms/InstCombine/select_frexp.ll
@@ -115,10 +115,10 @@ define float @test_select_frexp_no_const(float %x, float %y, i1 %cond) {
define i32 @test_select_frexp_extract_exp(float %x, i1 %cond) {
; CHECK-LABEL: define i32 @test_select_frexp_extract_exp(
; CHECK-SAME: float [[X:%.*]], i1 [[COND:%.*]]) {
-; CHECK-NEXT: [[SEL:%.*]] = select i1 [[COND]], float 1.000000e+00, float [[X]]
-; CHECK-NEXT: [[FREXP:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[SEL]])
+; CHECK-NEXT: [[FREXP:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[X]])
; CHECK-NEXT: [[FREXP_1:%.*]] = extractvalue { float, i32 } [[FREXP]], 1
-; CHECK-NEXT: ret i32 [[FREXP_1]]
+; CHECK-NEXT: [[FREXP_2:%.*]] = select i1 [[COND]], i32 1, i32 [[FREXP_1]]
+; CHECK-NEXT: ret i32 [[FREXP_2]]
;
%sel = select i1 %cond, float 1.000000e+00, float %x
%frexp = call { float, i32 } @llvm.frexp.f32.i32(float %sel)
@@ -132,7 +132,7 @@ define float @test_select_frexp_fast_math_select(float %x, i1 %cond) {
; CHECK-SAME: float [[X:%.*]], i1 [[COND:%.*]]) {
; CHECK-NEXT: [[FREXP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[X]])
; CHECK-NEXT: [[MANTISSA:%.*]] = extractvalue { float, i32 } [[FREXP1]], 0
-; CHECK-NEXT: [[SELECT_FREXP:%.*]] = select nnan ninf nsz i1 [[COND]], float 5.000000e-01, float [[MANTISSA]]
+; CHECK-NEXT: [[SELECT_FREXP:%.*]] = select i1 [[COND]], float 5.000000e-01, float [[MANTISSA]]
; CHECK-NEXT: ret float [[SELECT_FREXP]]
;
%sel = select nnan ninf nsz i1 %cond, float 1.000000e+00, float %x
diff --git a/llvm/test/Transforms/PhaseOrdering/always-inline-alloca-promotion.ll b/llvm/test/Transforms/PhaseOrdering/always-inline-alloca-promotion.ll
index 63279b04bda28..92ea2c608fe53 100644
--- a/llvm/test/Transforms/PhaseOrdering/always-inline-alloca-promotion.ll
+++ b/llvm/test/Transforms/PhaseOrdering/always-inline-alloca-promotion.ll
@@ -35,9 +35,8 @@ define void @ham() #1 {
; CHECK-NEXT: [[SNORK_EXIT:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr inttoptr (i64 48 to ptr), align 16
; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i64 [[TMP0]], 0
-; CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i64 0)
-; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[TMP1]], <vscale x 16 x float> [[TMP2]], <vscale x 16 x float> undef
-; CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[SPEC_SELECT]], i64 0)
+; CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> undef, i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP1]], <vscale x 4 x float> zeroinitializer, <vscale x 4 x float> [[TMP2]]
; CHECK-NEXT: tail call void @llvm.aarch64.sme.mopa.nxv4f32(i32 0, <vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, <vscale x 4 x float> zeroinitializer, <vscale x 4 x float> [[TMP3]])
; CHECK-NEXT: ret void
;
|
| ; CHECK-NEXT: [[FREXP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[X]]) | ||
| ; CHECK-NEXT: [[MANTISSA:%.*]] = extractvalue { float, i32 } [[FREXP1]], 0 | ||
| ; CHECK-NEXT: [[SELECT_FREXP:%.*]] = select nnan ninf nsz i1 [[COND]], float 5.000000e-01, float [[MANTISSA]] | ||
| ; CHECK-NEXT: [[SELECT_FREXP:%.*]] = select i1 [[COND]], float 5.000000e-01, float [[MANTISSA]] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The FMFs end up getting dropped here as this was previously handled by foldFrexpOfSelect, but now the more general FoldOpIntoSelect handles it first.
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
| // Don't replace a scalar select with a more expensive vector select if | ||
| // we can't simplify both arms of the select. | ||
| bool SimplifyBothArms = | ||
| !Op->getType()->isVectorTy() && II->getType()->isVectorTy(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you please point out the test coverage for this special case?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It was due to regressions in LoopVectorize/AArch64 as a get.active.lane.mask(%base, select %cond, %n, 0) was replaced with select between the mask and zeroinitializer, which is a more expensive operation.
I'll add a negative test in intrinsic-select.ll.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
| if (Instruction *R = FoldOpIntoSelect( | ||
| *II, Sel, /*FoldWithMultiUse=*/false, SimplifyBothArms)) | ||
| return R; | ||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Some overflow intrinsic calls (e.g., llvm.uadd.with.overflow.i64) now get folded into the PHI node. It looks unexpected. Can you add some tests to cover these cases (overflow intrinsic + select/PHI)?
See https://github.com/dtcxzyw/llvm-opt-benchmark/pull/2962/files.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added some test cases, I don't think it's entirely unexpected as isNotCrossLaneOperation is ~the same as isTriviallyVectorizable(), and the overflow intrinsics currently are not trivially vectorizable.
So now isNotCrossLaneOperation is only checked for vector select conditions, it means these can also get folded.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LG
|
Thanks for the reviews and pre-merge testing, it's a cool system 👍 |
…lvm#164388) Previously, cross-lane operations were disallowed here, but they are only problematic if the `select` condition is a vector, as the input of the operation is not simply one of the arms of the phi/select.
Previously, cross-lane operations were disallowed here, but they are only problematic if the
selectcondition is a vector, as the input of the operation is not simply one of the arms of the phi/select.