
[GISel][CombinerHelper] Combine op(trunc(x), trunc(y)) -> trunc(op(x, y)) #89023

Merged: 4 commits into llvm:main, Jun 3, 2024

Conversation

@dc03-work (Contributor) commented Apr 17, 2024

No description provided.

@dc03-work (Contributor, Author) commented:

Updates to test cases are in 46f65d2.

@llvmbot (Collaborator) commented Apr 17, 2024

@llvm/pr-subscribers-backend-amdgpu
@llvm/pr-subscribers-llvm-globalisel

@llvm/pr-subscribers-backend-aarch64

Author: Dhruv Chawla (dc03-work)

Changes

The "match_ands" pattern is also enabled in the
AArch64PostLegalizerCombiner.


Full diff: https://github.com/llvm/llvm-project/pull/89023.diff

6 Files Affected:

  • (modified) llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h (+3)
  • (modified) llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp (+60)
  • (modified) llvm/lib/Target/AArch64/AArch64Combine.td (+1-1)
  • (added) llvm/test/CodeGen/AArch64/GlobalISel/combine-and-trunc.mir (+167)
  • (modified) llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir (+8-9)
  • (modified) llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-narrow-binop-feeding-add.mir (+3-4)
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 3af32043391fec..0148911ae1ecc6 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -963,6 +963,9 @@ class CombinerHelper {
 
   // Simplify (cmp cc0 x, y) (&& or ||) (cmp cc1 x, y) -> cmp cc2 x, y.
   bool tryFoldLogicOfFCmps(GLogicalBinOp *Logic, BuildFnTy &MatchInfo);
+
+  // Simplify (trunc v1) && (trunc v2) -> trunc (v1 && v2)
+  bool tryFoldAndOfTruncs(GLogicalBinOp *Logical, BuildFnTy &MatchInfo);
 };
 } // namespace llvm
 
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 3829c33369b275..6933589d713fe6 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -6982,9 +6982,69 @@ bool CombinerHelper::tryFoldLogicOfFCmps(GLogicalBinOp *Logic,
   return false;
 }
 
+bool CombinerHelper::tryFoldAndOfTruncs(GLogicalBinOp *Logical,
+                                        BuildFnTy &MatchInfo) {
+  assert(Logical->getOpcode() == TargetOpcode::G_AND &&
+         "Expected to be called with G_AND!");
+  Register Dst = Logical->getOperand(0).getReg();
+  Register V1 = Logical->getOperand(1).getReg();
+  Register V2 = Logical->getOperand(2).getReg();
+
+  MachineInstr *V1MI = MRI.getUniqueVRegDef(V1);
+  MachineInstr *V2MI = MRI.getUniqueVRegDef(V2);
+  if (!V1MI || !V2MI)
+    return false;
+
+  bool V1Freeze = V1MI->getOpcode() == TargetOpcode::G_FREEZE;
+  bool V2Freeze = V2MI->getOpcode() == TargetOpcode::G_FREEZE;
+  if (V1Freeze)
+    V1 = V1MI->getOperand(1).getReg();
+  if (V2Freeze)
+    V2 = V2MI->getOperand(1).getReg();
+
+  Register V1Src, V2Src;
+  if (!mi_match(V1, MRI, m_GTrunc(m_Reg(V1Src))) ||
+      !mi_match(V2, MRI, m_GTrunc(m_Reg(V2Src))))
+    return false;
+  if (!MRI.hasOneNonDBGUse(V1) || !MRI.hasOneNonDBGUse(V2))
+    return false;
+
+  LLT V1Ty = MRI.getType(V1);
+  LLT V2Ty = MRI.getType(V2);
+  LLT V1SrcTy = MRI.getType(V1Src);
+  LLT V2SrcTy = MRI.getType(V2Src);
+
+  if (!isLegalOrBeforeLegalizer({TargetOpcode::G_AND, {V1SrcTy, V2SrcTy}}))
+    return false;
+
+  if (V1Ty != V2Ty || V1SrcTy != V2SrcTy)
+    return false;
+
+  MatchInfo = [=](MachineIRBuilder &B) {
+    Register Op0 = V1Src;
+    Register Op1 = V2Src;
+
+    if (V1Freeze)
+      Op0 = B.buildFreeze(V1SrcTy, V1Src).getReg(0);
+    if (V2Freeze)
+      Op1 = B.buildFreeze(V1SrcTy, V2Src).getReg(0);
+
+    auto And = B.buildAnd(V1SrcTy, Op0, Op1);
+    B.buildTrunc(Dst, And);
+
+    MRI.getUniqueVRegDef(V1)->eraseFromParent();
+    MRI.getUniqueVRegDef(V2)->eraseFromParent();
+  };
+
+  return true;
+}
+
 bool CombinerHelper::matchAnd(MachineInstr &MI, BuildFnTy &MatchInfo) {
   GAnd *And = cast<GAnd>(&MI);
 
+  if (tryFoldAndOfTruncs(And, MatchInfo))
+    return true;
+
   if (tryFoldAndOrOrICmpsUsingRanges(And, MatchInfo))
     return true;
 
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index 10cad6d1924407..eda7d925dade47 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -295,5 +295,5 @@ def AArch64PostLegalizerCombiner
                         ptr_add_immed_chain, overlapping_and,
                         split_store_zero_128, undef_combines,
                         select_to_minmax, or_to_bsp, combine_concat_vector,
-                        commute_constant_to_rhs]> {
+                        commute_constant_to_rhs, match_ands]> {
 }
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-and-trunc.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-and-trunc.mir
new file mode 100644
index 00000000000000..7f664850885836
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-and-trunc.mir
@@ -0,0 +1,167 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -o - -mtriple=aarch64-unknown-unknown -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs  %s | FileCheck %s
+
+---
+name:            and_trunc
+body:             |
+  bb.0:
+    liveins: $w0, $w1
+    ; CHECK-LABEL: name: and_trunc
+    ; CHECK: liveins: $w0, $w1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[COPY1]]
+    ; CHECK-NEXT: $w0 = COPY [[AND]](s32)
+    %0:_(s32) = COPY $w0
+    %1:_(s32) = COPY $w1
+    %2:_(s16) = G_TRUNC %0
+    %3:_(s16) = G_TRUNC %1
+    %4:_(s16) = G_AND %2, %3
+    %5:_(s32) = G_ANYEXT %4
+    $w0 = COPY %5
+...
+---
+name:            and_trunc_multiuse_1
+body:             |
+  bb.0:
+    liveins: $w0, $w1, $x2
+    ; CHECK-LABEL: name: and_trunc_multiuse_1
+    ; CHECK: liveins: $w0, $w1, $x2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(p0) = COPY $x2
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+    ; CHECK-NEXT: G_STORE [[TRUNC]](s16), [[COPY2]](p0) :: (store (s16))
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[TRUNC1]]
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[AND]](s16)
+    ; CHECK-NEXT: $w0 = COPY [[ANYEXT]](s32)
+    %0:_(s32) = COPY $w0
+    %1:_(s32) = COPY $w1
+    %5:_(p0) = COPY $x2
+    %2:_(s16) = G_TRUNC %0
+    %3:_(s16) = G_TRUNC %1
+    G_STORE %2, %5 :: (store (s16))
+    %4:_(s16) = G_AND %2, %3
+    %6:_(s32) = G_ANYEXT %4
+    $w0 = COPY %6
+...
+---
+name:            and_trunc_multiuse_2
+body:             |
+  bb.0:
+    liveins: $w0, $w1, $x2
+    ; CHECK-LABEL: name: and_trunc_multiuse_2
+    ; CHECK: liveins: $w0, $w1, $x2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(p0) = COPY $x2
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+    ; CHECK-NEXT: G_STORE [[TRUNC]](s16), [[COPY2]](p0) :: (store (s16))
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[TRUNC1]]
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[AND]](s16)
+    ; CHECK-NEXT: $w0 = COPY [[ANYEXT]](s32)
+    %0:_(s32) = COPY $w0
+    %1:_(s32) = COPY $w1
+    %5:_(p0) = COPY $x2
+    %2:_(s16) = G_TRUNC %0
+    %3:_(s16) = G_TRUNC %1
+    G_STORE %2, %5 :: (store (s16))
+    %4:_(s16) = G_AND %2, %3
+    %6:_(s32) = G_ANYEXT %4
+    $w0 = COPY %6
+...
+---
+name:            and_trunc_vector
+body:             |
+  bb.0:
+    liveins: $q0, $q1
+    ; CHECK-LABEL: name: and_trunc_vector
+    ; CHECK: liveins: $q0, $q1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $q1
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(<4 x s32>) = G_AND [[COPY]], [[COPY1]]
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[AND]](<4 x s32>)
+    ; CHECK-NEXT: $x0 = COPY [[TRUNC]](<4 x s16>)
+    %0:_(<4 x s32>) = COPY $q0
+    %1:_(<4 x s32>) = COPY $q1
+    %2:_(<4 x s16>) = G_TRUNC %0
+    %3:_(<4 x s16>) = G_TRUNC %1
+    %4:_(<4 x s16>) = G_AND %2, %3
+    $x0 = COPY %4
+...
+---
+name:            and_trunc_vector_multiuse
+body:             |
+  bb.0:
+    liveins: $q0, $q1, $x0
+    ; CHECK-LABEL: name: and_trunc_vector_multiuse
+    ; CHECK: liveins: $q0, $q1, $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $q1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(p0) = COPY $x2
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[COPY]](<4 x s32>)
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[COPY1]](<4 x s32>)
+    ; CHECK-NEXT: G_STORE [[TRUNC]](<4 x s16>), [[COPY2]](p0) :: (store (<4 x s16>))
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(<4 x s16>) = G_AND [[TRUNC]], [[TRUNC1]]
+    ; CHECK-NEXT: $x0 = COPY [[AND]](<4 x s16>)
+    %0:_(<4 x s32>) = COPY $q0
+    %1:_(<4 x s32>) = COPY $q1
+    %5:_(p0) = COPY $x2
+    %2:_(<4 x s16>) = G_TRUNC %0
+    %3:_(<4 x s16>) = G_TRUNC %1
+    G_STORE %2, %5 :: (store (<4 x s16>))
+    %4:_(<4 x s16>) = G_AND %2, %3
+    $x0 = COPY %4
+...
+---
+name:            and_trunc_freeze
+body:             |
+  bb.0:
+    liveins: $w0, $w1
+    ; CHECK-LABEL: name: and_trunc_freeze
+    ; CHECK: liveins: $w0, $w1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
+    ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s32) = G_FREEZE [[COPY]]
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[FREEZE]], [[COPY1]]
+    ; CHECK-NEXT: $w0 = COPY [[AND]](s32)
+    %0:_(s32) = COPY $w0
+    %1:_(s32) = COPY $w1
+    %2:_(s16) = G_TRUNC %0
+    %3:_(s16) = G_TRUNC %1
+    %6:_(s16) = G_FREEZE %2
+    %4:_(s16) = G_AND %6, %3
+    %5:_(s32) = G_ANYEXT %4
+    $w0 = COPY %5
+...
+---
+name:            and_trunc_freeze_both
+body:             |
+  bb.0:
+    liveins: $w0, $w1
+    ; CHECK-LABEL: name: and_trunc_freeze_both
+    ; CHECK: liveins: $w0, $w1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
+    ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s32) = G_FREEZE [[COPY]]
+    ; CHECK-NEXT: [[FREEZE1:%[0-9]+]]:_(s32) = G_FREEZE [[COPY1]]
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[FREEZE]], [[FREEZE1]]
+    ; CHECK-NEXT: $w0 = COPY [[AND]](s32)
+    %0:_(s32) = COPY $w0
+    %1:_(s32) = COPY $w1
+    %2:_(s16) = G_TRUNC %0
+    %3:_(s16) = G_TRUNC %1
+    %6:_(s16) = G_FREEZE %2
+    %7:_(s16) = G_FREEZE %3
+    %4:_(s16) = G_AND %6, %7
+    %5:_(s32) = G_ANYEXT %4
+    $w0 = COPY %5
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir
index 2bf7e84a379ba0..d4bca5dacf0a2b 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir
@@ -1,7 +1,8 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs -mtriple aarch64-unknown- --aarch64postlegalizercombiner-only-enable-rule="select_to_logical" %s -o - | FileCheck %s
+# RUN: llc -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs -mtriple aarch64-unknown-unknown %s -o - | FileCheck %s
 # RUN: llc -debugify-and-strip-all-safe -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs -mtriple aarch64-unknown-unknown %s -o - | FileCheck %s
 # REQUIRES: asserts
+
 ---
 # select (c, x, x) -> x
 name:            test_combine_select_same_res
@@ -200,10 +201,9 @@ body:             |
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
-    ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64)
-    ; CHECK-NEXT: %t:_(s1) = G_TRUNC [[COPY1]](s64)
-    ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s1) = G_FREEZE %t
-    ; CHECK-NEXT: %sel:_(s1) = G_AND %c, [[FREEZE]]
+    ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[COPY1]]
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[FREEZE]]
+    ; CHECK-NEXT: %sel:_(s1) = G_TRUNC [[AND]](s64)
     ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s1)
     ; CHECK-NEXT: $w0 = COPY %ext(s32)
     %0:_(s64) = COPY $x0
@@ -228,10 +228,9 @@ body:             |
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
-    ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64)
-    ; CHECK-NEXT: %t:_(s1) = G_TRUNC [[COPY1]](s64)
-    ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s1) = G_FREEZE %t
-    ; CHECK-NEXT: %sel:_(s1) = G_AND %c, [[FREEZE]]
+    ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[COPY1]]
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[FREEZE]]
+    ; CHECK-NEXT: %sel:_(s1) = G_TRUNC [[AND]](s64)
     ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s1)
     ; CHECK-NEXT: $w0 = COPY %ext(s32)
     %0:_(s64) = COPY $x0
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-narrow-binop-feeding-add.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-narrow-binop-feeding-add.mir
index fb19cda303d365..b2a9a802261252 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-narrow-binop-feeding-add.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-narrow-binop-feeding-add.mir
@@ -84,10 +84,9 @@ body:             |
     ; CHECK: liveins: $x0, $x1
     ; CHECK: %binop_lhs:_(s64) = COPY $x0
     ; CHECK: %binop_rhs:_(s64) = COPY $x1
-    ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %binop_lhs(s64)
-    ; CHECK: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC %binop_rhs(s64)
-    ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[TRUNC1]]
-    ; CHECK: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[AND]](s32)
+    ; CHECK: %binop:_(s64) = G_AND %binop_lhs, %binop_rhs
+    ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %binop(s64)
+    ; CHECK: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[TRUNC]](s32)
     ; CHECK: $x0 = COPY [[ZEXT]](s64)
     ; CHECK: RET_ReallyLR implicit $x0
     %binop_lhs:_(s64) = COPY $x0

@arsenm requested a review from Pierre-vh, April 17, 2024 06:56
@arsenm (Contributor) commented Apr 17, 2024

Should move this to tablegen patterns

@dc03-work (Contributor, Author) replied:

Should move this to tablegen patterns

@arsenm Which file would this be? I had tried searching for something like this originally but couldn't find anything.

@arsenm (Contributor) commented Apr 17, 2024

Should move this to tablegen patterns

@arsenm Which file would this be? I had tried searching for something like this originally but couldn't find anything.

https://llvm.org/docs/GlobalISel/MIRPatterns.html

It would go in the generic Combine.td, just with most of the C++ boilerplate replaced with tablegen
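For reference, a rough sketch of what such a rule might look like, going by the MIRPatterns docs (hypothetical and untested; note that the match side has no obvious way to require that $x and $y share a type, which is the limitation raised below):

  def and_of_truncs : GICombineRule<
    (defs root:$dst),
    (match (G_TRUNC $t1, $x),
           (G_TRUNC $t2, $y),
           (G_AND $dst, $t1, $t2)),
    (apply (G_AND GITypeOf<"$x">:$tmp, $x, $y),
           (G_TRUNC $dst, $tmp))>;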

@dc03-work (Contributor, Author) replied:

Should move this to tablegen patterns

@arsenm Which file would this be? I had tried searching for something like this originally but couldn't find anything.

https://llvm.org/docs/GlobalISel/MIRPatterns.html

It would go in the generic Combine.td, just with most of the C++ boilerplate replaced with tablegen

Hmm, one of the issues is that we need to match the types as well (the types of x and y in trunc(x) and trunc(y) need to be the same) so that the and(x, y) is well formed. I am not sure if this is possible through TableGen.

I am considering moving the code into https://github.com/llvm/llvm-project/blob/main/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp#L2960, which is where the folds for logic (hand x, ...), (hand y, ...) -> hand (logic x, y), ... currently live. I think this patch fits in quite neatly there, and the logic for matching is already written.

@jayfoad (Contributor) left a comment:

Why is this a good idea? Intuitively it means we'll do the and in a wider type than necessary, which could end up generating more code if the wider type is wider than a register. Is it a canonicalization? Does it enable other optimizations? Is there a general strategy to pull truncs out of other expressions?

@@ -963,6 +963,9 @@ class CombinerHelper {

// Simplify (cmp cc0 x, y) (&& or ||) (cmp cc1 x, y) -> cmp cc2 x, y.
bool tryFoldLogicOfFCmps(GLogicalBinOp *Logic, BuildFnTy &MatchInfo);

// Simplify (trunc v1) && (trunc v2) -> trunc (v1 && v2)
A reviewer (Contributor) commented:

&& seems wrong here?

Suggested change
// Simplify (trunc v1) && (trunc v2) -> trunc (v1 && v2)
// Simplify (trunc v1) & (trunc v2) -> trunc (v1 & v2)

@arsenm (Contributor) commented Apr 17, 2024

Hmm, one of the issues is that we need to match the types as well (the types of x and y in trunc(x) and trunc(y) need to be the same) so that the and(x, y) is well formed. I am not sure if this is possible through TableGen.

If we need to add something to support this, we should add the support to tablegen. This is not an uncommon type of scenario

@dc03-work (Contributor, Author) replied:

Why is this a good idea? Intuitively it means we'll do the and in a wider type than necessary, which could end up generating more code if the wider type is wider than a register. Is it a canonicalization? Does it enable other optimizations? Is there a general strategy to pull truncs out of other expressions?

  1. This fold is something I noticed while looking into the s1279 kernel in the TSVC benchmarks for AArch64. GlobalISel was generating two truncs (actually one trunc and the other with much worse codegen, which I am trying to address with [AArch64][GISel] Avoid scalarizing G_IMPLICIT_DEF and G_FREEZE in the Legalizer #88469), whereas SelectionDAG was generating only one.
  2. I agree that this can be an issue for wider types. I was only looking from the perspective of legal vector types, sorry.
  3. I don't know if it is a canonicalization, though I feel it should be when x and y have legal types that can be represented with one register, because of the following point:
  4. This does enable a separate optimization when ext(and(trunc, trunc)) occurs, where ext(trunc(and)) can generally be folded to a simpler form. This did occur in the s1279 kernel, and is one of the reasons I made this patch. (See the sketch after this list.)
  5. Not sure.

As in my comment above, I am planning on moving this code into matchHoistLogicOpWithSameOpcodeHands. Maybe some special handling for trunc can also be added to check for point 3.
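To make point 4 concrete, using the shape from the and_trunc test above (shown schematically, not as verbatim combiner output): the input

  %2:_(s16) = G_TRUNC %0(s32)
  %3:_(s16) = G_TRUNC %1(s32)
  %4:_(s16) = G_AND %2, %3
  %5:_(s32) = G_ANYEXT %4

first becomes G_ANYEXT(G_TRUNC(G_AND %0, %1)), and the ext(trunc) pair then folds away, leaving just

  %5:_(s32) = G_AND %0, %1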

Hmm, one of the issues is that we need to match the types as well (the types of x and y in trunc(x) and trunc(y) need to be the same) so that the and(x, y) is well formed. I am not sure if this is possible through TableGen.

If we need to add something to support this, we should add the support to tablegen. This is not an uncommon type of scenario

I think that is beyond the scope of this patch, and is probably not possible to do immediately. We also need to check if the node we are generating is legal if we are not in the pre-legalizer combiner, as well as checking if the nodes have one non-dbg use. Not sure if TableGen should be extended to handle these cases as well.

@tschuett (Member) commented:

The freeze looks unsafe. It should be a separate patch.

@tschuett (Member) commented:

You can still put the pattern in MIR and the business logic in the CombinerHelper.

@dc03-work requested a review from jayfoad, April 18, 2024 02:11
@dc03-work (Contributor, Author) commented:

I have moved the code to matchHoistLogicOpWithSameOpcodeHands. I think the patch makes more sense now, though I am not sure how to check if a given LLT will fit within one register or not.

; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND %8, [[COPY1]]
; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s32) = G_FREEZE [[COPY]]
A reviewer (Member) commented:

the freeze is dead; $w0 is not frozen.

; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND %9, %10
; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s32) = G_FREEZE [[COPY]]
; CHECK-NEXT: [[FREEZE1:%[0-9]+]]:_(s32) = G_FREEZE [[COPY1]]
A reviewer (Member) commented:

both freezes are dead; the registers are no longer frozen.

@dc03-work (Author) replied:

Ah, this looks to be an issue with using buildInstr in applyBuildInstructionSteps and then running the OperandFns. I believe this can be fixed by inserting the built instruction into the function after running the OperandFns instead of before.
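A sketch of that fix (simplified, not the actual applyBuildInstructionSteps code; buildInstrNoInsert and insertInstr are existing MachineIRBuilder APIs):

  for (InstructionBuildSteps &InstrToBuild : MatchInfo.InstrsToBuild) {
    // Build the new instruction without inserting it, so that anything the
    // operand callbacks create (e.g. the G_FREEZEs) gets inserted first.
    auto MIB = Builder.buildInstrNoInsert(InstrToBuild.Opcode);
    for (auto &OperandFn : InstrToBuild.OperandFns)
      OperandFn(MIB);
    Builder.insertInstr(MIB);
  }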

@tschuett (Member) left a comment:

Remove the freeze code. You cannot look through freeze. It is unsafe.

@dc03-work (Contributor, Author) commented Apr 23, 2024

Remove the freeze code. You cannot look through freeze. It is unsafe.

Hmm, the problem with the test case I'm looking at is that select gets optimized to an and with a freeze, so removing the freeze fold would make this patch unable to fix that test case. Is it still not OK to look through a freeze if it only has one use? I would really like the freeze to be optimized the way this patch does it... At the very least, if the semantics of freeze in GMIR match LLVM IR, then Alive2 has no issue with this optimization for a single-use freeze: https://alive2.llvm.org/ce/z/dDWS5u.
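For reference, the src/tgt pair has this shape (a sketch in the spirit of the linked Alive2 example, not a copy of it):

  define i1 @src(i64 %x) {
    %t = trunc i64 %x to i1
    %f = freeze i1 %t
    ret i1 %f
  }

  define i1 @tgt(i64 %x) {
    %f = freeze i64 %x
    %t = trunc i64 %f to i1
    ret i1 %t
  }

i.e. pushing the freeze above a single-use trunc, which Alive2 accepts.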

@tschuett (Member) commented:

Instcombine handles freeze separately. We are going to follow that pattern:

/// Given operands for a Freeze, see if we can fold the result.

@dc03-work (Contributor, Author) replied:

Instcombine handles freeze separately. We are going to follow that pattern:

/// Given operands for a Freeze, see if we can fold the result.

Okay, I guess that would be the visit here: InstCombinerImpl::visitFreeze(FreezeInst &I)? I believe my patch would require porting InstCombinerImpl::pushFreezeToPreventPoisonFromPropagating(FreezeInst &OrigFI) to the Combiner.

@tschuett (Member) commented:

#89363

@dc03-work (Contributor, Author) replied:

#89363

This fixes the case for G_FREEZE %x when %x is known non-undef non-poison. However, this doesn't handle the case we're looking at here, because I'm not eliminating the freeze, I am pushing it through the trunc. So, I am not sure how this PR is relevant here...

@dc03-work (Contributor, Author) commented:

Ping @jayfoad regarding my earlier comment above (#89023 (comment)).

Also ping @arsenm and @davemgreen, I'm interested in hearing your thoughts on this. I am currently leaning towards adding this transform that InstCombine does to GlobalISel, which will handle the case I'm matching with this patch.

@jayfoad (Contributor) commented Apr 24, 2024

ping @jayfoad

I don't have anything to add. I haven't been following this review closely.

@tschuett (Member) commented:

#89363 has the poison symbols. There is no combine on freeze yet.

@dc03-work (Contributor, Author) commented:

I have moved the freeze fold to #90618.

@arsenm (Contributor) left a comment:

Needs testing for all opcodes.

The GISel implementation seems to be missing some of the free-truncate/zext logic that the DAG version has. Arguably that would be decided by targets choosing to add the combine or not (provided we had some more useful type-predicate infrastructure).

%7:_(s16) = G_FREEZE %3
%4:_(s16) = G_AND %6, %7
%5:_(s32) = G_ANYEXT %4
$w0 = COPY %5
A reviewer (Contributor) commented:

The test only covers and, but this will also impact or and xor

@dc03-work (Author) replied:

I have moved the tests to combine-op-trunc.mir, and have added test cases for G_OR and G_XOR.

%1:_(s32) = COPY $w1
%2:_(s16) = G_TRUNC %0
%3:_(s16) = G_TRUNC %1
%6:_(s16) = G_FREEZE %2
A reviewer (Member) commented:

Why is there still a freeze?

@dc03-work (Author) replied:

I will be rebasing this PR on top of #90618 once it lands, then the folds through the freezes will occur.

%2:_(s16) = G_TRUNC %0
%3:_(s16) = G_TRUNC %1
%6:_(s16) = G_FREEZE %2
%7:_(s16) = G_FREEZE %3
A reviewer (Member) commented:

Same here.

$x0 = COPY %4
...

# Freezes should get pushed through truncs.
@dc03-work (Author) commented:

These folds will trigger once #90618 lands.

@dc03-work (Author) followed up:

This is done now.

@dc03-work changed the title from "[GISel][CombinerHelper] Combine and(trunc(x), trunc(y)) -> trunc(and(x, y))" to "[GISel][CombinerHelper] Combine op(trunc(x), trunc(y)) -> trunc(op(x, y))", May 14, 2024
@dc03-work requested a review from arsenm, May 14, 2024 06:14
@dc03-work (Contributor, Author) commented:

Looks like CI is failing because of flang causing an OOM and a stack overflow on Windows with MSVC. Fun.

// Match: logic (ext X), (ext Y) --> ext (logic X, Y)
// Match: logic (trunc X), (trunc Y) -> trunc (logic X, Y)
A reviewer (Contributor) commented:

The logic for the DAG combine treats the extend and truncate cases a bit differently. The truncate case considers whether the extend and truncate are free, and doesn't do this if they are. Arguably this could be handled by the target not adding the combine, but I don't think we have a way to add combines for specific types right now.
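For comparison, the truncate case in DAGCombiner::hoistLogicOpWithSameOpcodeHands bails out along these lines (paraphrased from memory; check the current source for the exact conditions):

  // If the truncate is free and the extend back would also be free, there is
  // no benefit in doing the logic op in the wider type.
  if (TLI.isZExtFree(VT, XVT) && TLI.isTruncateFree(XVT, VT))
    return SDValue();
  // Don't create a logic op on an illegal type.
  if (!TLI.isTypeLegal(XVT))
    return SDValue();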

@dc03-work (Author) replied:

I will look into this.

@dc03-work (Author) followed up:

I was able to do this using TLI. Please have a look.

@dc03-work (Contributor, Author) commented:

Ping @arsenm @tschuett. I would like to get this merged soon.

@dc03-work (Contributor, Author) commented:

Rebased on main.

Comment on lines 3163 to 3166
EVT DstEVT = getApproximateEVTForLLT(MRI.getType(Dst), MF->getDataLayout(),
                                     MF->getFunction().getContext());
EVT XEVT = getApproximateEVTForLLT(XTy, MF->getDataLayout(),
                                   MF->getFunction().getContext());
A reviewer (Contributor) commented:

We already have LLT-native isTruncateFree/isZExtFree overloads; you don't need to go through EVT here.
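A sketch of what that looks like (assuming the TargetLowering::isTruncateFree/isZExtFree overloads that take LLTs plus the DataLayout and LLVMContext; names follow the snippet above):

  const TargetLowering &TLI = getTargetLowering();
  const DataLayout &DL = MF->getDataLayout();
  LLVMContext &Ctx = MF->getFunction().getContext();
  LLT DstTy = MRI.getType(Dst);
  // Mirror the DAG bail-out: don't widen when the trunc/ext round-trip is free.
  if (TLI.isZExtFree(DstTy, XTy, DL, Ctx) && TLI.isTruncateFree(XTy, DstTy, DL, Ctx))
    return false;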

@dc03-work (Author) replied:

Okay, done.

Comment on lines +291 to +293
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 24
; GFX6-NEXT: v_readfirstlane_b32 s0, v0
A reviewer (Contributor) commented:

This is worse, but not the fault of this patch.

@dc03-work (Contributor, Author) commented:

@tschuett Any changes you would like to see on this patch? You had posted a review with changes requested; please let me know if it looks OK before I land this.

@dc03-work (Contributor, Author) commented:

Looks like no issues. I'm landing this patch.

@dc03-work merged commit e12bf36 into llvm:main, Jun 3, 2024
7 checks passed