
[GlobalIsel] Combine logic of floating point compares #81886

Merged
merged 2 commits into llvm:main from tschuett:gisel-logic-of-fcmps on Feb 20, 2024

Conversation

tschuett
Member

The combine is purely based on symmetry. The registers can be scalars or vectors, and they do not have to be constants.

X < 5.0 || X > 5.0
->
X != 5.0

X < Y && X > Y
->
FCMP_FALSE

X < Y || !(X < Y)
->
FCMP_TRUE

see InstCombinerImpl::foldLogicOfFCmps
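To make the predicate arithmetic concrete, here is a minimal standalone C++ sketch (illustrative only, not the LLVM sources; the enum values mirror FCmpInst::Predicate, whose low four bits record which of the four possible outcomes — equal, greater, less, unordered — a predicate accepts):

#include <cassert>

// Bit 0 = equal, bit 1 = greater, bit 2 = less, bit 3 = unordered (NaN).
enum FPred : unsigned {
  FCMP_FALSE = 0b0000, FCMP_OEQ = 0b0001, FCMP_OGT = 0b0010,
  FCMP_OLT   = 0b0100, FCMP_ONE = 0b0110, FCMP_UNO = 0b1000,
  FCMP_UGE   = 0b1011, FCMP_UNE = 0b1110, FCMP_TRUE = 0b1111,
};

// (x cc0 y) && (x cc1 y) holds only for outcomes both accept: intersect.
// (x cc0 y) || (x cc1 y) holds for outcomes either accepts: union.
unsigned combineFCmpCodes(bool IsAnd, unsigned L, unsigned R) {
  return IsAnd ? (L & R) : (L | R);
}

int main() {
  assert(combineFCmpCodes(false, FCMP_OLT, FCMP_OGT) == FCMP_ONE);   // x < y || x > y
  assert(combineFCmpCodes(true,  FCMP_OLT, FCMP_OGT) == FCMP_FALSE); // x < y && x > y
  assert(combineFCmpCodes(false, FCMP_OLT, FCMP_UGE) == FCMP_TRUE);  // x < y || !(x < y)
}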

@llvmbot
Collaborator

llvmbot commented Feb 15, 2024

@llvm/pr-subscribers-backend-aarch64

@llvm/pr-subscribers-llvm-globalisel

Author: Thorsten Schütt (tschuett)


Full diff: https://github.com/llvm/llvm-project/pull/81886.diff

3 Files Affected:

  • (modified) llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h (+3)
  • (modified) llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp (+82)
  • (modified) llvm/test/CodeGen/AArch64/GlobalISel/combine-logic-of-compare.mir (+146)
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 10eeafdd09a8ee..5d458240929289 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -931,6 +931,9 @@ class CombinerHelper {
   /// into a single comparison using range-based reasoning.
   bool tryFoldAndOrOrICmpsUsingRanges(GLogicalBinOp *Logic,
                                       BuildFnTy &MatchInfo);
+
+  // Simplify (cmp cc0 x, y) (&& or ||) (cmp cc1 x, y) -> cmp cc2 x, y.
+  bool tryFoldLogicOfFCmps(GLogicalBinOp *Logic, BuildFnTy &MatchInfo);
 };
 } // namespace llvm
 
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 1b199cfd41d231..5c37095bb66e10 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -37,6 +37,7 @@
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Target/TargetMachine.h"
 #include <cmath>
+#include "llvm/Analysis/CmpInstAnalysis.h"
 #include <optional>
 #include <tuple>
 
@@ -6814,12 +6815,90 @@ bool CombinerHelper::tryFoldAndOrOrICmpsUsingRanges(GLogicalBinOp *Logic,
   return true;
 }
 
+bool CombinerHelper::tryFoldLogicOfFCmps(GLogicalBinOp *Logic,
+                                         BuildFnTy &MatchInfo) {
+  assert(Logic->getOpcode() != TargetOpcode::G_XOR && "unexpected xor");
+  Register DestReg = Logic->getReg(0);
+  Register LHS = Logic->getLHSReg();
+  Register RHS = Logic->getRHSReg();
+  bool IsAnd = Logic->getOpcode() == TargetOpcode::G_AND;
+
+  // We need a compare on the LHS register.
+  GFCmp *Cmp1 = getOpcodeDef<GFCmp>(LHS, MRI);
+  if (!Cmp1)
+    return false;
+
+  // We need a compare on the RHS register.
+  GFCmp *Cmp2 = getOpcodeDef<GFCmp>(RHS, MRI);
+  if (!Cmp2)
+    return false;
+
+  LLT CmpTy = MRI.getType(Cmp1->getReg(0));
+  LLT CmpOperandTy = MRI.getType(Cmp1->getLHSReg());
+
+  // To fold the two fcmps into one new fcmp, it must be legal, the logic op
+  // and both fcmps must be single-use, and the fcmps must have the same shape.
+  if (!isLegalOrBeforeLegalizer(
+          {TargetOpcode::G_FCMP, {CmpTy, CmpOperandTy}}) ||
+      !MRI.hasOneNonDBGUse(Logic->getReg(0)) ||
+      !MRI.hasOneNonDBGUse(Cmp1->getReg(0)) ||
+      !MRI.hasOneNonDBGUse(Cmp2->getReg(0)) ||
+      MRI.getType(Cmp1->getLHSReg()) != MRI.getType(Cmp2->getLHSReg()))
+    return false;
+
+  CmpInst::Predicate PredL = Cmp1->getCond();
+  CmpInst::Predicate PredR = Cmp2->getCond();
+  Register LHS0 = Cmp1->getLHSReg();
+  Register LHS1 = Cmp1->getRHSReg();
+  Register RHS0 = Cmp2->getLHSReg();
+  Register RHS1 = Cmp2->getRHSReg();
+
+  if (LHS0 == RHS1 && LHS1 == RHS0) {
+    // Swap RHS operands to match LHS.
+    PredR = CmpInst::getSwappedPredicate(PredR);
+    std::swap(RHS0, RHS1);
+  }
+
+  if (LHS0 == RHS0 && LHS1 == RHS1) {
+    // We determine the new predicate.
+    unsigned CmpCodeL = getFCmpCode(PredL);
+    unsigned CmpCodeR = getFCmpCode(PredR);
+    unsigned NewPred = IsAnd ? CmpCodeL & CmpCodeR : CmpCodeL | CmpCodeR;
+    unsigned Flags = Cmp1->getFlags() | Cmp2->getFlags();
+    MatchInfo = [=](MachineIRBuilder &B) {
+      // The fcmp predicates fill the lower part of the enum.
+      FCmpInst::Predicate Pred = static_cast<FCmpInst::Predicate>(NewPred);
+      if (Pred == FCmpInst::FCMP_FALSE &&
+          isConstantLegalOrBeforeLegalizer(CmpTy)) {
+        auto False = B.buildConstant(CmpTy, 0);
+        B.buildZExtOrTrunc(DestReg, False);
+      } else if (Pred == FCmpInst::FCMP_TRUE &&
+                 isConstantLegalOrBeforeLegalizer(CmpTy)) {
+        auto True =
+            B.buildConstant(CmpTy, getICmpTrueVal(getTargetLowering(),
+                                                  CmpTy.isVector() /*isVector*/,
+                                                  true /*isFP*/));
+        B.buildZExtOrTrunc(DestReg, True);
+      } else { // We take the predicate without predicate optimizations.
+        auto Cmp = B.buildFCmp(Pred, CmpTy, LHS0, LHS1, Flags);
+        B.buildZExtOrTrunc(DestReg, Cmp);
+      }
+    };
+    return true;
+  }
+
+  return false;
+}
+
 bool CombinerHelper::matchAnd(MachineInstr &MI, BuildFnTy &MatchInfo) {
   GAnd *And = cast<GAnd>(&MI);
 
   if (tryFoldAndOrOrICmpsUsingRanges(And, MatchInfo))
     return true;
 
+  if (tryFoldLogicOfFCmps(And, MatchInfo))
+    return true;
+
   return false;
 }
 
@@ -6829,5 +6908,8 @@ bool CombinerHelper::matchOr(MachineInstr &MI, BuildFnTy &MatchInfo) {
   if (tryFoldAndOrOrICmpsUsingRanges(Or, MatchInfo))
     return true;
 
+  if (tryFoldLogicOfFCmps(Or, MatchInfo))
+    return true;
+
   return false;
 }
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-logic-of-compare.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-logic-of-compare.mir
index f667a83bf21a8b..d050823e3b9494 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-logic-of-compare.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-logic-of-compare.mir
@@ -260,3 +260,149 @@ body:             |
     %zext:_(<2 x s64>) = G_ZEXT %and(<2 x s1>)
     $q0 = COPY %zext
 ...
+---
+# fcmp (x, y) || fcmp (x, y) -> fcmp(x, y)
+name:            test_fcmp_or_fcmp_with_x_y
+body:             |
+  bb.1:
+    liveins: $x0, $x1
+    ; CHECK-LABEL: name: test_fcmp_or_fcmp_with_x_y
+    ; CHECK: liveins: $x0, $x1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
+    ; CHECK-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ueq), [[COPY]](s64), [[COPY1]]
+    ; CHECK-NEXT: %zext:_(s64) = G_ZEXT [[FCMP]](s1)
+    ; CHECK-NEXT: $x0 = COPY %zext(s64)
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %cmp1:_(s1) = G_FCMP floatpred(oeq), %0(s64), %1
+    %cmp2:_(s1) = G_FCMP floatpred(uno), %0(s64), %1
+    %or:_(s1) = G_OR %cmp1, %cmp2
+    %zext:_(s64) = G_ZEXT %or(s1)
+    $x0 = COPY %zext
+...
+---
+# fcmp (x, 5) || fcmp (x, 5) -> fcmp (x, 5)
+name:            test_fcmp_or_fcmp_with_5_y
+body:             |
+  bb.1:
+    liveins: $x0, $x1
+    ; CHECK-LABEL: name: test_fcmp_or_fcmp_with_5_y
+    ; CHECK: liveins: $x0, $x1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 5.000000e+00
+    ; CHECK-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(une), [[COPY]](s64), [[C]]
+    ; CHECK-NEXT: %zext:_(s64) = G_ZEXT [[FCMP]](s1)
+    ; CHECK-NEXT: $x0 = COPY %zext(s64)
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s64) = G_FCONSTANT double 5.0
+    %cmp1:_(s1) = G_FCMP floatpred(one), %0(s64), %2
+    %cmp2:_(s1) = G_FCMP floatpred(uno), %0(s64), %2
+    %or:_(s1) = G_OR %cmp1, %cmp2
+    %zext:_(s64) = G_ZEXT %or(s1)
+    $x0 = COPY %zext
+...
+---
+# fcmp (x, y) || fcmp (y, x) -> fcmp(x, y)
+name:            test_fcmp_or_fcmp_with_anti
+body:             |
+  bb.1:
+    liveins: $x0, $x1
+    ; CHECK-LABEL: name: test_fcmp_or_fcmp_with_anti
+    ; CHECK: liveins: $x0, $x1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
+    ; CHECK-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(une), [[COPY1]](s64), [[COPY]]
+    ; CHECK-NEXT: %zext:_(s64) = G_ZEXT [[FCMP]](s1)
+    ; CHECK-NEXT: $x0 = COPY %zext(s64)
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %cmp1:_(s1) = G_FCMP floatpred(one), %1(s64), %0
+    %cmp2:_(s1) = G_FCMP floatpred(uno), %0(s64), %1
+    %or:_(s1) = G_OR %cmp1, %cmp2
+    %zext:_(s64) = G_ZEXT %or(s1)
+    $x0 = COPY %zext
+...
+---
+# fcmp (x, y) && fcmp (x, y) -> fcmp(x, y)
+name:            test_fcmp_and_fcmp_with_x_y
+body:             |
+  bb.1:
+    liveins: $x0, $x1
+    ; CHECK-LABEL: name: test_fcmp_and_fcmp_with_x_y
+    ; CHECK: liveins: $x0, $x1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
+    ; CHECK-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(uno), [[COPY1]](s64), [[COPY]]
+    ; CHECK-NEXT: %zext:_(s64) = G_ZEXT [[FCMP]](s1)
+    ; CHECK-NEXT: $x0 = COPY %zext(s64)
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %cmp1:_(s1) = G_FCMP floatpred(une), %1(s64), %0
+    %cmp2:_(s1) = G_FCMP floatpred(uno), %0(s64), %1
+    %and:_(s1) = G_AND %cmp1, %cmp2
+    %zext:_(s64) = G_ZEXT %and(s1)
+    $x0 = COPY %zext
+...
+---
+# multi-use: fcmp (x, y) && fcmp (x, y) must not be combined
+name:            test_fcmp_and_fcmp_with_x_y_multi_use
+body:             |
+  bb.1:
+    liveins: $x0, $x1
+    ; CHECK-LABEL: name: test_fcmp_and_fcmp_with_x_y_multi_use
+    ; CHECK: liveins: $x0, $x1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
+    ; CHECK-NEXT: %cmp1:_(s1) = G_FCMP floatpred(ogt), [[COPY1]](s64), [[COPY]]
+    ; CHECK-NEXT: %cmp2:_(s1) = G_FCMP floatpred(ugt), [[COPY]](s64), [[COPY1]]
+    ; CHECK-NEXT: %and:_(s1) = G_AND %cmp1, %cmp2
+    ; CHECK-NEXT: %zext:_(s64) = G_ZEXT %and(s1)
+    ; CHECK-NEXT: %zext2:_(s64) = G_ZEXT %and(s1)
+    ; CHECK-NEXT: $x0 = COPY %zext(s64)
+    ; CHECK-NEXT: $x2 = COPY %zext2(s64)
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %cmp1:_(s1) = G_FCMP floatpred(ogt), %1(s64), %0
+    %cmp2:_(s1) = G_FCMP floatpred(ugt), %0(s64), %1
+    %and:_(s1) = G_AND %cmp1, %cmp2
+    %zext:_(s64) = G_ZEXT %and(s1)
+    %zext2:_(s64) = G_ZEXT %and(s1)
+    $x0 = COPY %zext
+    $x2 = COPY %zext2
+...
+---
+# vectors: fcmp oeq (x, y) && fcmp olt (x, y) -> false
+name:            test_fcmp_and_fcmp_with_vectors
+body:             |
+  bb.1:
+    liveins: $x0, $x1
+    ; CHECK-LABEL: name: test_fcmp_and_fcmp_with_vectors
+    ; CHECK: liveins: $x0, $x1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s1>) = G_BUILD_VECTOR [[C]](s1), [[C]](s1)
+    ; CHECK-NEXT: %zext:_(<2 x s64>) = G_ZEXT [[BUILD_VECTOR]](<2 x s1>)
+    ; CHECK-NEXT: $q0 = COPY %zext(<2 x s64>)
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s64) = COPY $x2
+    %3:_(s64) = COPY $x3
+    %4:_(s64) = COPY $x4
+    %5:_(s64) = COPY $x5
+    %6:_(s64) = COPY $x6
+    %7:_(s64) = COPY $x7
+    %v8:_(<2 x s64>) = G_BUILD_VECTOR %0(s64), %1(s64)
+    %v9:_(<2 x s64>) = G_BUILD_VECTOR %2(s64), %3(s64)
+    %cmp1:_(<2 x s1>) = G_FCMP floatpred(oeq), %v8(<2 x s64>), %v9
+    %cmp2:_(<2 x s1>) = G_FCMP floatpred(olt), %v8(<2 x s64>), %v9
+    %and:_(<2 x s1>) = G_AND %cmp1, %cmp2
+    %zext:_(<2 x s64>) = G_ZEXT %and(<2 x s1>)
+    $q0 = COPY %zext
+...

@aemerson
Contributor

Is there a motivating real use case for this? My concern is if we're implementing IR InstCombines in the combiner purely for symmetry we add compile time cost for little benefit.

@tschuett
Copy link
Member Author

The motivation was to fold more compares. The DAG combiner has the same combine in foldLogicOfSetCCs, but it is tied to the ISD namespace. InstCombine's method of calculating the new predicate is more portable.

@tschuett
Member Author

I am not interested in feature parity with InstCombine. I am interested in making this combiner more aggressive (up to a point).

For some opcodes, InstCombine has more aggressive combines than the DAG, and they amount to roughly 10 lines of code. Those 10-line combines are the ones I am interested in.

@nikic
Contributor

nikic commented Feb 16, 2024

Is there a motivating real use case for this? My concern is if we're implementing IR InstCombines in the combiner purely for symmetry we add compile time cost for little benefit.

For DAGCombine, the general policy is that it should only contain combines that are either target-dependent, or arise during legalization/lowering. Everything else should be handled (only) in InstCombine instead. I'd expect GlobalISel to follow the same approach...

(I don't work on GlobalISel though, so do feel free to ignore me!)


// To fold the two fcmps into one new fcmp, it must be legal, the logic op
// and both fcmps must be single-use, and the fcmps must have the same shape.
if (!isLegalOrBeforeLegalizer(
Member Author

The result of isLegalOrBeforeLegalizer is target-dependent. Some targets support fcmp with the given types, some do not; e.g., CmpOperandTy could be a scalable vector.
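For illustration, a hypothetical sketch of such a query (assuming the CombinerHelper member context as in the patch above; the scalable-vector LLT constructors are from LowLevelType.h). Whether it returns true depends entirely on the target's legalizer rules and on whether we run pre- or post-legalization:

// Hypothetical types: a scalable-vector fcmp producing <vscale x 2 x s1>
// from two <vscale x 2 x s64> operands.
LLT CmpTy = LLT::scalable_vector(2, LLT::scalar(1));
LLT CmpOperandTy = LLT::scalable_vector(2, LLT::scalar(64));

// Same query shape as in the combine; some targets accept it, others reject it.
bool Ok = isLegalOrBeforeLegalizer({TargetOpcode::G_FCMP, {CmpTy, CmpOperandTy}});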

@tschuett
Member Author

I always thought DAGCombine combines as much as possible but checks whether the operations are legal on the current target.

@tschuett
Member Author

tschuett commented Feb 16, 2024

InstCombine combines the overflow intrinsics to the fullest. GlobalISel and the DAG have named opcodes for overflow intrinsics; in GlobalISel, G_SADDO reads as signed addition with overflow (see the sketch below).

What is the policy for the GlobalISel combiner:

  • refuse to combine them,
  • combine them to the fullest, even with new tricks, or
  • something in the middle?
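For reference, a hedged sketch of building one of these ops through MachineIRBuilder (assuming a builder B and two hypothetical s64 registers Lhs and Rhs; buildSAddo is the MachineIRBuilder helper for G_SADDO):

LLT S64 = LLT::scalar(64);
LLT S1 = LLT::scalar(1);
// G_SADDO produces the (wrapping) sum plus an s1 flag that is set
// iff the signed addition overflowed.
auto SAddO = B.buildSAddo(S64, S1, Lhs, Rhs);
Register Sum = SAddO.getReg(0);      // the wrapped result
Register Overflow = SAddO.getReg(1); // the overflow bit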

@aemerson (Contributor) left a comment

If this is trying to mirror DAGCombiner then I think it's ok, it'll likely be useful for GlobalISel too, although I generally prefer seeing some real IR showing benefits.

@@ -6814,12 +6815,90 @@ bool CombinerHelper::tryFoldAndOrOrICmpsUsingRanges(GLogicalBinOp *Logic,
return true;
}

bool CombinerHelper::tryFoldLogicOfFCmps(GLogicalBinOp *Logic,
                                         BuildFnTy &MatchInfo) {
assert(Logic->getOpcode() != TargetOpcode::G_XOR && "unexpected xor");
Contributor

Why check for XOR? Shouldn't it be AND or OR?

Member Author

It is !=, not ==. The parameter is GLogicalBinOp, which also matches xor; see the sketch below.
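For context, a paraphrased sketch of why the assert is needed (GLogicalBinOp lives in llvm/CodeGen/GlobalISel/GenericMachineInstrs.h; this shows its shape, not the verbatim source):

// GLogicalBinOp matches all three bitwise logic opcodes, so a caller
// that only handles AND/OR must rule out XOR explicitly.
class GLogicalBinOp : public GBinOp {
public:
  static bool classof(const MachineInstr *MI) {
    switch (MI->getOpcode()) {
    case TargetOpcode::G_AND:
    case TargetOpcode::G_OR:
    case TargetOpcode::G_XOR:
      return true;
    default:
      return false;
    }
  }
};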

Comment on lines +6821 to +6823
Register DestReg = Logic->getReg(0);
Register LHS = Logic->getLHSReg();
Register RHS = Logic->getRHSReg();
Contributor

Suggested change
Register DestReg = Logic->getReg(0);
Register LHS = Logic->getLHSReg();
Register RHS = Logic->getRHSReg();
auto [DestReg, LHS, RHS] = Logic->getFirst3Regs();

// We determine the new predicate.
unsigned CmpCodeL = getFCmpCode(PredL);
unsigned CmpCodeR = getFCmpCode(PredR);
unsigned NewPred = IsAnd ? CmpCodeL & CmpCodeR : CmpCodeL | CmpCodeR;
Contributor

This is interesting.

Member Author

The DAG uses:

ISD::CondCode NewCC = IsAnd ? ISD::getSetCCAndOperation(CC0, CC1, OpVT)
                                : ISD::getSetCCOrOperation(CC0, CC1, OpVT);

while InstCombine uses:

unsigned NewPred = IsAnd ? FCmpCodeL & FCmpCodeR : FCmpCodeL | FCmpCodeR;

unsigned CmpCodeL = getFCmpCode(PredL);
unsigned CmpCodeR = getFCmpCode(PredR);
unsigned NewPred = IsAnd ? CmpCodeL & CmpCodeR : CmpCodeL | CmpCodeR;
unsigned Flags = Cmp1->getFlags() | Cmp2->getFlags();
Contributor

Are you sure about this? I don't see any prior art in foldLogicOfSetCCs() about flags.

Member Author

The DAG is very lazy about flags.

// TODO: We can union the fast math flags unless this is a logical select.

@tschuett
Member Author

Thanks!

tschuett merged commit 63a4b4f into llvm:main on Feb 20, 2024
4 checks passed
tschuett deleted the gisel-logic-of-fcmps branch on February 20, 2024 at 08:56