-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[VPlan] Merge fcmp uno feeding AnyOf.
#166823
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
|
@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-llvm-transforms Author: Florian Hahn (fhahn) ChangesFold This pattern is generated to check if any vector lane is NaN, and combining multiple compares is beneficial on architectures that have dedicated instructions. Alive2 Proof: https://alive2.llvm.org/ce/z/vA_aoM Combine suggested as part of #161735 Full diff: https://github.com/llvm/llvm-project/pull/166823.diff 5 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index b57c44872c1b6..8b2931637113f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -417,6 +417,10 @@ m_BranchOnCount(const Op0_t &Op0, const Op1_t &Op1) {
return m_VPInstruction<VPInstruction::BranchOnCount>(Op0, Op1);
}
+inline VPInstruction_match<VPInstruction::AnyOf> m_AnyOf() {
+ return m_VPInstruction<VPInstruction::AnyOf>();
+}
+
template <typename Op0_t>
inline VPInstruction_match<VPInstruction::AnyOf, Op0_t>
m_AnyOf(const Op0_t &Op0) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 48bd697397f41..937c420dcd6ee 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1221,6 +1221,29 @@ static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) {
}
}
+ // Fold any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ... ->
+ // any-of (fcmp uno %A, %B), ...
+ if (match(Def, m_AnyOf()) && Def->getNumOperands() % 2 == 0) {
+ SmallVector<VPValue *, 4> NewOps;
+ unsigned NumOps = Def->getNumOperands();
+ for (unsigned I = 0; I < NumOps; I += 2) {
+ VPValue *A, *B;
+ if (!match(
+ Def->getOperand(I),
+ m_SpecificCmp(CmpInst::FCMP_UNO, m_VPValue(A), m_Deferred(A))) ||
+ !match(Def->getOperand(I + 1),
+ m_SpecificCmp(CmpInst::FCMP_UNO, m_VPValue(B), m_Deferred(B))))
+ break;
+
+ NewOps.push_back(Builder.createFCmp(CmpInst::FCMP_UNO, A, B));
+ }
+
+ if (NewOps.size() == NumOps / 2) {
+ VPValue *NewAnyOf = Builder.createNaryOp(VPInstruction::AnyOf, NewOps);
+ return Def->replaceAllUsesWith(NewAnyOf);
+ }
+ }
+
// Remove redundant DerviedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
if ((match(Def, m_DerivedIV(m_ZeroInt(), m_VPValue(A), m_One())) ||
match(Def, m_DerivedIV(m_ZeroInt(), m_ZeroInt(), m_VPValue()))) &&
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll
index 3c83c01929aae..f4e5085c604f6 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll
@@ -60,12 +60,9 @@ define float @fmaxnum(ptr %src, i64 %n) {
; CHECK-NEXT: [[TMP8]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD2]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 8
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: [[TMP3:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD2]]
-; CHECK-NEXT: [[TMP18:%.*]] = freeze <4 x i1> [[TMP3]]
-; CHECK-NEXT: [[TMP15:%.*]] = freeze <4 x i1> [[TMP4]]
-; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i1> [[TMP18]], [[TMP15]]
-; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-NEXT: [[TMP18:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT: [[TMP19:%.*]] = freeze <4 x i1> [[TMP18]]
+; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP19]])
; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP6]], [[TMP9]]
; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll
index 711a9cd03ac15..89b5d1395a4a0 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll
@@ -60,12 +60,9 @@ define float @fminnum(ptr %src, i64 %n) {
; CHECK-NEXT: [[TMP8]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD2]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 8
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: [[TMP3:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD2]]
-; CHECK-NEXT: [[TMP15:%.*]] = freeze <4 x i1> [[TMP3]]
-; CHECK-NEXT: [[TMP18:%.*]] = freeze <4 x i1> [[TMP4]]
-; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i1> [[TMP15]], [[TMP18]]
-; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-NEXT: [[TMP15:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT: [[TMP19:%.*]] = freeze <4 x i1> [[TMP15]]
+; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP19]])
; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP6]], [[TMP9]]
; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll
index af648df9fc5c7..eab3492ebe2fa 100644
--- a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll
@@ -60,12 +60,9 @@ define float @fmaxnum(ptr %src, i64 %n) {
; CHECK-NEXT: [[TMP8]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD2]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 8
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: [[TMP3:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD2]]
-; CHECK-NEXT: [[TMP15:%.*]] = freeze <4 x i1> [[TMP3]]
-; CHECK-NEXT: [[TMP18:%.*]] = freeze <4 x i1> [[TMP4]]
-; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i1> [[TMP15]], [[TMP18]]
-; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-NEXT: [[TMP15:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT: [[TMP19:%.*]] = freeze <4 x i1> [[TMP15]]
+; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP19]])
; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP6]], [[TMP9]]
; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
|
Fold any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ... -> any-of (fcmp uno %A, %B), ... This pattern is generated to check if any vector lane is NaN, and combining multiple compares is beneficial on architectures that have dedicated instructions. Alive2 Proof: https://alive2.llvm.org/ce/z/vA_aoM
c707451 to
c83adad
Compare
| if (NewOps.size() == NumOps / 2) { | ||
| VPValue *NewAnyOf = Builder.createNaryOp(VPInstruction::AnyOf, NewOps); | ||
| return Def->replaceAllUsesWith(NewAnyOf); | ||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should we clean up the recipes in NewOps if NewOps.size() != NumOps / 2?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The latest update should always use the newly created recipes, thanks
| // Fold any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ... -> | ||
| // any-of (fcmp uno %A, %B), ... |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Perhaps better (than all-adjacent-pairs-or-nothing is) to search for a single pair of any two operands one doing (fcmp uno %A, %A) and the other (fcmp uno %B, %B) and replace them by one operand doing (fcmp uno %A, %B), to be repeated while found?
Tests below seem to cover the numOperands == 2 case only?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated to keep track of unpaired compares, and merge the next matching pair. Should be tested now with tests with IC = 3 and IC =5, thanks
This adds additional test coverage for folding FCMP uno (#166823)
…r fmaxnum. This adds additional test coverage for folding FCMP uno (llvm/llvm-project#166823)
Fold
any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ... ->
any-of (fcmp uno %A, %B), ...
This pattern is generated to check if any vector lane is NaN, and combining multiple compares is beneficial on architectures that have dedicated instructions.
Alive2 Proof: https://alive2.llvm.org/ce/z/vA_aoM
Combine suggested as part of #161735