-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[GlobalIsel] Combine logic of floating point compares #81886
Conversation
It is purely based on symmetry. Registers can be scalars, vectors, and non-constants. Examples: X < 5.0 || X > 5.0 -> X != 5.0; X < Y && X > Y -> FCMP_FALSE; X < Y && X < Y -> FCMP_TRUE. See InstCombinerImpl::foldLogicOfFCmps.
@llvm/pr-subscribers-backend-aarch64 @llvm/pr-subscribers-llvm-globalisel Author: Thorsten Schütt (tschuett) ChangesIt is purely based on symmetry. Registers can be scalars, vectors, and non-constants. X < 5.0 || X > 5.0 X < Y && X > Y X < Y && X < Y see InstCombinerImpl::foldLogicOfFCmps Full diff: https://github.com/llvm/llvm-project/pull/81886.diff 3 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 10eeafdd09a8ee..5d458240929289 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -931,6 +931,9 @@ class CombinerHelper {
/// into a single comparison using range-based reasoning.
bool tryFoldAndOrOrICmpsUsingRanges(GLogicalBinOp *Logic,
BuildFnTy &MatchInfo);
+
+ // Simplify (cmp cc0 x, y) (&& or ||) (cmp cc1 x, y) -> cmp cc2 x, y.
+ bool tryFoldLogicOfFCmps(GLogicalBinOp *Logic, BuildFnTy &MatchInfo);
};
} // namespace llvm
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 1b199cfd41d231..5c37095bb66e10 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -37,6 +37,7 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
#include <cmath>
+#include <llvm/Analysis/CmpInstAnalysis.h>
#include <optional>
#include <tuple>
@@ -6814,12 +6815,90 @@ bool CombinerHelper::tryFoldAndOrOrICmpsUsingRanges(GLogicalBinOp *Logic,
return true;
}
+bool CombinerHelper::tryFoldLogicOfFCmps(GLogicalBinOp *Logic,
+ BuildFnTy &MatchInfo) {
+  assert(Logic->getOpcode() != TargetOpcode::G_XOR && "unexpected xor");
+ Register DestReg = Logic->getReg(0);
+ Register LHS = Logic->getLHSReg();
+ Register RHS = Logic->getRHSReg();
+ bool IsAnd = Logic->getOpcode() == TargetOpcode::G_AND;
+
+ // We need a compare on the LHS register.
+ GFCmp *Cmp1 = getOpcodeDef<GFCmp>(LHS, MRI);
+ if (!Cmp1)
+ return false;
+
+ // We need a compare on the RHS register.
+ GFCmp *Cmp2 = getOpcodeDef<GFCmp>(RHS, MRI);
+ if (!Cmp2)
+ return false;
+
+ LLT CmpTy = MRI.getType(Cmp1->getReg(0));
+ LLT CmpOperandTy = MRI.getType(Cmp1->getLHSReg());
+
+ // We build one fcmp, want to fold the fcmps, replace the logic op,
+ // and the fcmps must have the same shape.
+ if (!isLegalOrBeforeLegalizer(
+ {TargetOpcode::G_FCMP, {CmpTy, CmpOperandTy}}) ||
+ !MRI.hasOneNonDBGUse(Logic->getReg(0)) ||
+ !MRI.hasOneNonDBGUse(Cmp1->getReg(0)) ||
+ !MRI.hasOneNonDBGUse(Cmp2->getReg(0)) ||
+ MRI.getType(Cmp1->getLHSReg()) != MRI.getType(Cmp2->getLHSReg()))
+ return false;
+
+ CmpInst::Predicate PredL = Cmp1->getCond();
+ CmpInst::Predicate PredR = Cmp2->getCond();
+ Register LHS0 = Cmp1->getLHSReg();
+ Register LHS1 = Cmp1->getRHSReg();
+ Register RHS0 = Cmp2->getLHSReg();
+ Register RHS1 = Cmp2->getRHSReg();
+
+ if (LHS0 == RHS1 && LHS1 == RHS0) {
+ // Swap RHS operands to match LHS.
+ PredR = CmpInst::getSwappedPredicate(PredR);
+ std::swap(RHS0, RHS1);
+ }
+
+ if (LHS0 == RHS0 && LHS1 == RHS1) {
+ // We determine the new predicate.
+ unsigned CmpCodeL = getFCmpCode(PredL);
+ unsigned CmpCodeR = getFCmpCode(PredR);
+ unsigned NewPred = IsAnd ? CmpCodeL & CmpCodeR : CmpCodeL | CmpCodeR;
+ unsigned Flags = Cmp1->getFlags() | Cmp2->getFlags();
+ MatchInfo = [=](MachineIRBuilder &B) {
+ // The fcmp predicates fill the lower part of the enum.
+ FCmpInst::Predicate Pred = static_cast<FCmpInst::Predicate>(NewPred);
+ if (Pred == FCmpInst::FCMP_FALSE &&
+ isConstantLegalOrBeforeLegalizer(CmpTy)) {
+ auto False = B.buildConstant(CmpTy, 0);
+ B.buildZExtOrTrunc(DestReg, False);
+ } else if (Pred == FCmpInst::FCMP_TRUE &&
+ isConstantLegalOrBeforeLegalizer(CmpTy)) {
+ auto True =
+ B.buildConstant(CmpTy, getICmpTrueVal(getTargetLowering(),
+ CmpTy.isVector() /*isVector*/,
+ true /*isFP*/));
+ B.buildZExtOrTrunc(DestReg, True);
+ } else { // We take the predicate without predicate optimizations.
+ auto Cmp = B.buildFCmp(Pred, CmpTy, LHS0, LHS1, Flags);
+ B.buildZExtOrTrunc(DestReg, Cmp);
+ }
+ };
+ return true;
+ }
+
+ return false;
+}
+
bool CombinerHelper::matchAnd(MachineInstr &MI, BuildFnTy &MatchInfo) {
GAnd *And = cast<GAnd>(&MI);
if (tryFoldAndOrOrICmpsUsingRanges(And, MatchInfo))
return true;
+ if (tryFoldLogicOfFCmps(And, MatchInfo))
+ return true;
+
return false;
}
@@ -6829,5 +6908,8 @@ bool CombinerHelper::matchOr(MachineInstr &MI, BuildFnTy &MatchInfo) {
if (tryFoldAndOrOrICmpsUsingRanges(Or, MatchInfo))
return true;
+ if (tryFoldLogicOfFCmps(Or, MatchInfo))
+ return true;
+
return false;
}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-logic-of-compare.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-logic-of-compare.mir
index f667a83bf21a8b..d050823e3b9494 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-logic-of-compare.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-logic-of-compare.mir
@@ -260,3 +260,149 @@ body: |
%zext:_(<2 x s64>) = G_ZEXT %and(<2 x s1>)
$q0 = COPY %zext
...
+---
+# fcmp (x, y) || fcmp (x, y) -> fcmp(x, y)
+name: test_fcmp_or_fcmp_with_x_y
+body: |
+ bb.1:
+ liveins: $x0, $x1
+ ; CHECK-LABEL: name: test_fcmp_or_fcmp_with_x_y
+ ; CHECK: liveins: $x0, $x1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
+ ; CHECK-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ueq), [[COPY]](s64), [[COPY1]]
+ ; CHECK-NEXT: %zext:_(s64) = G_ZEXT [[FCMP]](s1)
+ ; CHECK-NEXT: $x0 = COPY %zext(s64)
+ %0:_(s64) = COPY $x0
+ %1:_(s64) = COPY $x1
+ %cmp1:_(s1) = G_FCMP floatpred(oeq), %0(s64), %1
+ %cmp2:_(s1) = G_FCMP floatpred(uno), %0(s64), %1
+ %or:_(s1) = G_OR %cmp1, %cmp2
+ %zext:_(s64) = G_ZEXT %or(s1)
+ $x0 = COPY %zext
+...
+---
+# fcmp (x, 5.0) || fcmp (x, 5.0) -> fcmp (x, 5.0)
+name: test_fcmp_or_fcmp_with_5_y
+body: |
+ bb.1:
+ liveins: $x0, $x1
+ ; CHECK-LABEL: name: test_fcmp_or_fcmp_with_5_y
+ ; CHECK: liveins: $x0, $x1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 5.000000e+00
+ ; CHECK-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(une), [[COPY]](s64), [[C]]
+ ; CHECK-NEXT: %zext:_(s64) = G_ZEXT [[FCMP]](s1)
+ ; CHECK-NEXT: $x0 = COPY %zext(s64)
+ %0:_(s64) = COPY $x0
+ %1:_(s64) = COPY $x1
+ %2:_(s64) = G_FCONSTANT double 5.0
+ %cmp1:_(s1) = G_FCMP floatpred(one), %0(s64), %2
+ %cmp2:_(s1) = G_FCMP floatpred(uno), %0(s64), %2
+ %or:_(s1) = G_OR %cmp1, %cmp2
+ %zext:_(s64) = G_ZEXT %or(s1)
+ $x0 = COPY %zext
+...
+---
+# fcmp (x, y) || fcmp (y, x) -> fcmp(x, y)
+name: test_fcmp_or_fcmp_with_anti
+body: |
+ bb.1:
+ liveins: $x0, $x1
+ ; CHECK-LABEL: name: test_fcmp_or_fcmp_with_anti
+ ; CHECK: liveins: $x0, $x1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
+ ; CHECK-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(une), [[COPY1]](s64), [[COPY]]
+ ; CHECK-NEXT: %zext:_(s64) = G_ZEXT [[FCMP]](s1)
+ ; CHECK-NEXT: $x0 = COPY %zext(s64)
+ %0:_(s64) = COPY $x0
+ %1:_(s64) = COPY $x1
+ %cmp1:_(s1) = G_FCMP floatpred(one), %1(s64), %0
+ %cmp2:_(s1) = G_FCMP floatpred(uno), %0(s64), %1
+ %or:_(s1) = G_OR %cmp1, %cmp2
+ %zext:_(s64) = G_ZEXT %or(s1)
+ $x0 = COPY %zext
+...
+---
+# fcmp (x, y) && fcmp (x, y) -> fcmp(x, y)
+name: test_fcmp_and_fcmp_with_x_y
+body: |
+ bb.1:
+ liveins: $x0, $x1
+ ; CHECK-LABEL: name: test_fcmp_and_fcmp_with_x_y
+ ; CHECK: liveins: $x0, $x1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
+ ; CHECK-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(uno), [[COPY1]](s64), [[COPY]]
+ ; CHECK-NEXT: %zext:_(s64) = G_ZEXT [[FCMP]](s1)
+ ; CHECK-NEXT: $x0 = COPY %zext(s64)
+ %0:_(s64) = COPY $x0
+ %1:_(s64) = COPY $x1
+ %cmp1:_(s1) = G_FCMP floatpred(une), %1(s64), %0
+ %cmp2:_(s1) = G_FCMP floatpred(uno), %0(s64), %1
+ %and:_(s1) = G_AND %cmp1, %cmp2
+ %zext:_(s64) = G_ZEXT %and(s1)
+ $x0 = COPY %zext
+...
+---
+# fcmp (x, y) && fcmp (x, y) -> fcmp(x, y)
+name: test_fcmp_and_fcmp_with_x_y_multi_use
+body: |
+ bb.1:
+ liveins: $x0, $x1
+ ; CHECK-LABEL: name: test_fcmp_and_fcmp_with_x_y_multi_use
+ ; CHECK: liveins: $x0, $x1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
+ ; CHECK-NEXT: %cmp1:_(s1) = G_FCMP floatpred(ogt), [[COPY1]](s64), [[COPY]]
+ ; CHECK-NEXT: %cmp2:_(s1) = G_FCMP floatpred(ugt), [[COPY]](s64), [[COPY1]]
+ ; CHECK-NEXT: %and:_(s1) = G_AND %cmp1, %cmp2
+ ; CHECK-NEXT: %zext:_(s64) = G_ZEXT %and(s1)
+ ; CHECK-NEXT: %zext2:_(s64) = G_ZEXT %and(s1)
+ ; CHECK-NEXT: $x0 = COPY %zext(s64)
+ ; CHECK-NEXT: $x2 = COPY %zext2(s64)
+ %0:_(s64) = COPY $x0
+ %1:_(s64) = COPY $x1
+ %cmp1:_(s1) = G_FCMP floatpred(ogt), %1(s64), %0
+ %cmp2:_(s1) = G_FCMP floatpred(ugt), %0(s64), %1
+ %and:_(s1) = G_AND %cmp1, %cmp2
+ %zext:_(s64) = G_ZEXT %and(s1)
+ %zext2:_(s64) = G_ZEXT %and(s1)
+ $x0 = COPY %zext
+ $x2 = COPY %zext2
+...
+---
+# fcmp (x, y) && fcmp (x, y) -> fcmp(x, y)
+name: test_fcmp_and_fcmp_with_vectors
+body: |
+ bb.1:
+ liveins: $x0, $x1
+ ; CHECK-LABEL: name: test_fcmp_and_fcmp_with_vectors
+ ; CHECK: liveins: $x0, $x1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s1>) = G_BUILD_VECTOR [[C]](s1), [[C]](s1)
+ ; CHECK-NEXT: %zext:_(<2 x s64>) = G_ZEXT [[BUILD_VECTOR]](<2 x s1>)
+ ; CHECK-NEXT: $q0 = COPY %zext(<2 x s64>)
+ %0:_(s64) = COPY $x0
+ %1:_(s64) = COPY $x1
+ %2:_(s64) = COPY $x2
+ %3:_(s64) = COPY $x3
+ %4:_(s64) = COPY $x4
+ %5:_(s64) = COPY $x5
+ %6:_(s64) = COPY $x6
+ %7:_(s64) = COPY $x7
+ %v8:_(<2 x s64>) = G_BUILD_VECTOR %0(s64), %1(s64)
+ %v9:_(<2 x s64>) = G_BUILD_VECTOR %2(s64), %3(s64)
+ %cmp1:_(<2 x s1>) = G_FCMP floatpred(oeq), %v8(<2 x s64>), %v9
+ %cmp2:_(<2 x s1>) = G_FCMP floatpred(olt), %v8(<2 x s64>), %v9
+ %and:_(<2 x s1>) = G_AND %cmp1, %cmp2
+ %zext:_(<2 x s64>) = G_ZEXT %and(<2 x s1>)
+ $q0 = COPY %zext
+...
|
Is there a motivating real use case for this? My concern is if we're implementing IR InstCombines in the combiner purely for symmetry we add compile time cost for little benefit. |
The motivation was to fold more compares. The DAG combiner has the same combine in foldLogicOfSetCCs. |
I am not interested in feature parity with InstCombine. I am interested in making this combiner more aggressive (up to a point). For some opcodes, InstCombine has more aggressive combines than the DAG that take only about ten lines of code. Those ten-line combines are the ones I am interested in. |
For DAGCombine, the general policy is that it should only contain combines that are either target-dependent, or arise during legalization/lowering. Everything else should be handled (only) in InstCombine instead. I'd expect GlobalISel to follow the same approach... (I don't work on GlobalISel though, so do feel free to ignore me!) |
|
||
// We build one fcmp, want to fold the fcmps, replace the logic op, | ||
// and the fcmps must have the same shape. | ||
if (!isLegalOrBeforeLegalizer( |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The result of isLegalOrBeforeLegalizer
is target-dependent. Some targets support fcmp with the given types some do not, e.g., CmpOperandTy
could be a scalable vector.
I always thought DagCombine combines as much as possible, but checks whether the operations are legal on the current device. |
InstCombine combines the overflow intrinsics to the fullest. GlobalIsel and the Dag have named opcodes for overflow intrinsics. What is the policy for the GlobalIsel combiner:
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If this is trying to mirror DAGCombiner then I think it's ok, it'll likely be useful for GlobalISel too, although I generally prefer seeing some real IR showing benefits.
@@ -6814,12 +6815,90 @@ bool CombinerHelper::tryFoldAndOrOrICmpsUsingRanges(GLogicalBinOp *Logic, | |||
return true; | |||
} | |||
|
|||
bool CombinerHelper::tryFoldLogicOfFCmps(GLogicalBinOp *Logic, | |||
BuildFnTy &MatchInfo) { | |||
assert(Logic->getOpcode() != TargetOpcode::G_XOR && "unexpected xor"); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why checking for XOR? should be AND or OR?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The check uses `!=` — the assert ensures the opcode is not xor, because the parameter is a GLogicalBinOp, which includes xor.
Register DestReg = Logic->getReg(0); | ||
Register LHS = Logic->getLHSReg(); | ||
Register RHS = Logic->getRHSReg(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Register DestReg = Logic->getReg(0); | |
Register LHS = Logic->getLHSReg(); | |
Register RHS = Logic->getRHSReg(); | |
auto [DestReg, LHS, RHS] = Logic->getFirst3Regs(); |
// We determine the new predicate. | ||
unsigned CmpCodeL = getFCmpCode(PredL); | ||
unsigned CmpCodeR = getFCmpCode(PredR); | ||
unsigned NewPred = IsAnd ? CmpCodeL & CmpCodeR : CmpCodeL | CmpCodeR; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is interesting.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The DAG uses:
ISD::CondCode NewCC = IsAnd ? ISD::getSetCCAndOperation(CC0, CC1, OpVT)
: ISD::getSetCCOrOperation(CC0, CC1, OpVT);
unsigned NewPred = IsAnd ? FCmpCodeL & FCmpCodeR : FCmpCodeL | FCmpCodeR; |
unsigned CmpCodeL = getFCmpCode(PredL); | ||
unsigned CmpCodeR = getFCmpCode(PredR); | ||
unsigned NewPred = IsAnd ? CmpCodeL & CmpCodeR : CmpCodeL | CmpCodeR; | ||
unsigned Flags = Cmp1->getFlags() | Cmp2->getFlags(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Are you sure about this? I don't see any prior art in foldLogicOfSetCCs()
about flags.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The DAG is very lazy about flags.
// TODO: We can union the fast math flags unless this is a logical select. |
Thanks! |
It is purely based on symmetry. Registers can be scalars, vectors, and non-constants.
X < 5.0 || X > 5.0
->
X != 5.0
X < Y && X > Y
->
FCMP_FALSE
X < Y && X < Y
->
FCMP_TRUE
see InstCombinerImpl::foldLogicOfFCmps