
[VPlan] Simplify (X && Y) || (X && !Y) -> X. #89386

Merged
merged 15 commits into llvm:main from fhahn:vplan-simplify-or-and on May 19, 2024

Conversation

fhahn
Contributor

@fhahn fhahn commented Apr 19, 2024

Simplify a common pattern generated for masks when folding the tail.

Simplify a common pattern generated for masks when folding the tail.
@llvmbot

llvmbot commented Apr 19, 2024

@llvm/pr-subscribers-llvm-transforms

Author: Florian Hahn (fhahn)

Changes

Simplify a common pattern generated for masks when folding the tail.
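To make the pattern concrete, here is a minimal scalar sketch in LLVM IR of the mask computation this targets (value names are hypothetical; under tail folding, %x plays the role of the active-lane mask and %y of a block condition):

%y.not  = xor i1 %y, true                    ; !Y
%m.then = select i1 %x, i1 %y, i1 false      ; X && Y (logical and)
%m.else = select i1 %x, i1 %y.not, i1 false  ; X && !Y
%m.out  = or i1 %m.then, %m.else             ; (X && Y) || (X && !Y)

Every use of %m.out can be replaced by %x directly, which is what the test updates below show: the masked stores end up using the active-lane mask instead of the reconstructed or.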


Patch is 27.61 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/89386.diff

8 Files Affected:

  • (modified) llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h (+59)
  • (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (+12)
  • (modified) llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll (+6-12)
  • (modified) llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll (+1-2)
  • (modified) llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll (+1-2)
  • (modified) llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll (+4-16)
  • (modified) llvm/test/Transforms/LoopVectorize/uniform-blend.ll (+1-4)
  • (modified) llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll (+5-14)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index ae7717eb7cc97f..76bd25d939523b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -185,6 +185,47 @@ using AllBinaryRecipe_match =
     BinaryRecipe_match<Op0_t, Op1_t, Opcode, VPWidenRecipe, VPReplicateRecipe,
                        VPWidenCastRecipe, VPInstruction>;
 
+template <typename Op0_t, typename Op1_t, unsigned Opcode, bool Commutable,
+          typename... RecipeTys>
+struct LogicalRecipe_match {
+  Op0_t LHS;
+  Op1_t RHS;
+
+  LogicalRecipe_match(Op0_t LHS, Op1_t RHS) : LHS(LHS), RHS(RHS) {}
+
+  bool match(const VPValue *V) {
+    auto *DefR = V->getDefiningRecipe();
+    return DefR && match(DefR);
+  }
+
+  bool match(const VPSingleDefRecipe *R) {
+    return match(static_cast<const VPRecipeBase *>(R));
+  }
+
+  bool match(const VPRecipeBase *R) {
+    if (!detail::MatchRecipeAndOpcode<Opcode, RecipeTys...>::match(R)) {
+      if (!detail::MatchRecipeAndOpcode<Instruction::Select,
+                                        RecipeTys...>::match(R))
+        return false;
+      if (Opcode == Instruction::And) {
+        if (!m_SpecificInt(0).match(R->getOperand(2)))
+          return false;
+      } else if (Opcode == Instruction::Or) {
+        if (!m_SpecificInt(1).match(R->getOperand(1)))
+          return false;
+      } else {
+        llvm_unreachable("unsupported opcode");
+      }
+    } else {
+      assert(R->getNumOperands() == 2 &&
+             "recipe with matched opcode does not have 2 operands");
+    }
+    return (LHS.match(R->getOperand(0)) && RHS.match(R->getOperand(1))) ||
+           (Commutable && LHS.match(R->getOperand(1)) &&
+            RHS.match(R->getOperand(0)));
+  }
+};
+
 template <unsigned Opcode, typename Op0_t>
 inline UnaryVPInstruction_match<Op0_t, Opcode>
 m_VPInstruction(const Op0_t &Op0) {
@@ -266,6 +307,24 @@ inline AllBinaryRecipe_match<Op0_t, Op1_t, Instruction::Or>
 m_Or(const Op0_t &Op0, const Op1_t &Op1) {
   return m_Binary<Instruction::Or, Op0_t, Op1_t>(Op0, Op1);
 }
+
+template <typename Op0_t, typename Op1_t>
+inline LogicalRecipe_match<Op0_t, Op1_t, Instruction::Or, true, VPWidenRecipe,
+                           VPReplicateRecipe, VPWidenCastRecipe, VPInstruction>
+m_c_LogicalOr(const Op0_t &Op0, const Op1_t &Op1) {
+  return LogicalRecipe_match<Op0_t, Op1_t, Instruction::Or, true, VPWidenRecipe,
+                             VPReplicateRecipe, VPWidenCastRecipe,
+                             VPInstruction>(Op0, Op1);
+}
+
+template <typename Op0_t, typename Op1_t>
+inline LogicalRecipe_match<Op0_t, Op1_t, Instruction::And, true, VPWidenRecipe,
+                           VPReplicateRecipe, VPWidenCastRecipe, VPInstruction>
+m_c_LogicalAnd(const Op0_t &Op0, const Op1_t &Op1) {
+  return LogicalRecipe_match<Op0_t, Op1_t, Instruction::And, true,
+                             VPWidenRecipe, VPReplicateRecipe,
+                             VPWidenCastRecipe, VPInstruction>(Op0, Op1);
+}
 } // namespace VPlanPatternMatch
 } // namespace llvm
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 007dc3f89b3fb9..5453c47ecf7000 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -939,6 +939,18 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
 #endif
   }
 
+  VPValue *B;
+  VPValue *C;
+  VPValue *D;
+  // Simplify (X && Y) || (X && !Y) -> X.
+  if (match(&R,
+            m_c_LogicalOr(m_c_LogicalAnd(m_VPValue(A), m_VPValue(B)),
+                          m_c_LogicalAnd(m_VPValue(C), m_Not(m_VPValue(D))))) &&
+      A == C && B == D) {
+    R.getVPSingleValue()->replaceAllUsesWith(A);
+    return;
+  }
+
   if (match(&R, m_CombineOr(m_Mul(m_VPValue(A), m_SpecificInt(1)),
                             m_Mul(m_SpecificInt(1), m_VPValue(A)))))
     return R.getVPSingleValue()->replaceAllUsesWith(A);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
index b91579106261a6..200c2adcf0e666 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
@@ -223,10 +223,9 @@ define void @test_if_then(ptr noalias %a, ptr readnone %b) #4 {
 ; TFCOMMON-NEXT:    [[TMP10:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[TMP9]])
 ; TFCOMMON-NEXT:    [[TMP11:%.*]] = xor <vscale x 2 x i1> [[TMP8]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
 ; TFCOMMON-NEXT:    [[TMP12:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> [[TMP11]], <vscale x 2 x i1> zeroinitializer
-; TFCOMMON-NEXT:    [[TMP13:%.*]] = or <vscale x 2 x i1> [[TMP9]], [[TMP12]]
 ; TFCOMMON-NEXT:    [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP12]], <vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> [[TMP10]]
 ; TFCOMMON-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]]
-; TFCOMMON-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP14]], i32 8, <vscale x 2 x i1> [[TMP13]])
+; TFCOMMON-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP14]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
 ; TFCOMMON-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
 ; TFCOMMON-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025)
 ; TFCOMMON-NEXT:    [[TMP15:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
@@ -272,16 +271,14 @@ define void @test_if_then(ptr noalias %a, ptr readnone %b) #4 {
 ; TFA_INTERLEAVE-NEXT:    [[TMP20:%.*]] = xor <vscale x 2 x i1> [[TMP14]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
 ; TFA_INTERLEAVE-NEXT:    [[TMP21:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> [[TMP19]], <vscale x 2 x i1> zeroinitializer
 ; TFA_INTERLEAVE-NEXT:    [[TMP22:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 2 x i1> [[TMP20]], <vscale x 2 x i1> zeroinitializer
-; TFA_INTERLEAVE-NEXT:    [[TMP23:%.*]] = or <vscale x 2 x i1> [[TMP15]], [[TMP21]]
-; TFA_INTERLEAVE-NEXT:    [[TMP24:%.*]] = or <vscale x 2 x i1> [[TMP16]], [[TMP22]]
 ; TFA_INTERLEAVE-NEXT:    [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP21]], <vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> [[TMP17]]
 ; TFA_INTERLEAVE-NEXT:    [[PREDPHI4:%.*]] = select <vscale x 2 x i1> [[TMP22]], <vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> [[TMP18]]
 ; TFA_INTERLEAVE-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]]
 ; TFA_INTERLEAVE-NEXT:    [[TMP26:%.*]] = call i64 @llvm.vscale.i64()
 ; TFA_INTERLEAVE-NEXT:    [[TMP27:%.*]] = mul i64 [[TMP26]], 2
 ; TFA_INTERLEAVE-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[TMP25]], i64 [[TMP27]]
-; TFA_INTERLEAVE-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP25]], i32 8, <vscale x 2 x i1> [[TMP23]])
-; TFA_INTERLEAVE-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI4]], ptr [[TMP28]], i32 8, <vscale x 2 x i1> [[TMP24]])
+; TFA_INTERLEAVE-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP25]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; TFA_INTERLEAVE-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI4]], ptr [[TMP28]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]])
 ; TFA_INTERLEAVE-NEXT:    [[INDEX_NEXT:%.*]] = add i64 [[INDEX]], [[TMP6]]
 ; TFA_INTERLEAVE-NEXT:    [[TMP29:%.*]] = call i64 @llvm.vscale.i64()
 ; TFA_INTERLEAVE-NEXT:    [[TMP30:%.*]] = mul i64 [[TMP29]], 2
@@ -405,10 +402,9 @@ define void @test_widen_if_then_else(ptr noalias %a, ptr readnone %b) #4 {
 ; TFCOMMON-NEXT:    [[TMP11:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> [[TMP10]])
 ; TFCOMMON-NEXT:    [[TMP12:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> [[TMP8]], <vscale x 2 x i1> zeroinitializer
 ; TFCOMMON-NEXT:    [[TMP13:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[TMP12]])
-; TFCOMMON-NEXT:    [[TMP14:%.*]] = or <vscale x 2 x i1> [[TMP10]], [[TMP12]]
 ; TFCOMMON-NEXT:    [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP10]], <vscale x 2 x i64> [[TMP11]], <vscale x 2 x i64> [[TMP13]]
 ; TFCOMMON-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]]
-; TFCOMMON-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP15]], i32 8, <vscale x 2 x i1> [[TMP14]])
+; TFCOMMON-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP15]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
 ; TFCOMMON-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
 ; TFCOMMON-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025)
 ; TFCOMMON-NEXT:    [[TMP16:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
@@ -456,16 +452,14 @@ define void @test_widen_if_then_else(ptr noalias %a, ptr readnone %b) #4 {
 ; TFA_INTERLEAVE-NEXT:    [[TMP22:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 2 x i1> [[TMP14]], <vscale x 2 x i1> zeroinitializer
 ; TFA_INTERLEAVE-NEXT:    [[TMP23:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[TMP21]])
 ; TFA_INTERLEAVE-NEXT:    [[TMP24:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_MASKED_LOAD3]], <vscale x 2 x i1> [[TMP22]])
-; TFA_INTERLEAVE-NEXT:    [[TMP25:%.*]] = or <vscale x 2 x i1> [[TMP17]], [[TMP21]]
-; TFA_INTERLEAVE-NEXT:    [[TMP26:%.*]] = or <vscale x 2 x i1> [[TMP18]], [[TMP22]]
 ; TFA_INTERLEAVE-NEXT:    [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP17]], <vscale x 2 x i64> [[TMP19]], <vscale x 2 x i64> [[TMP23]]
 ; TFA_INTERLEAVE-NEXT:    [[PREDPHI4:%.*]] = select <vscale x 2 x i1> [[TMP18]], <vscale x 2 x i64> [[TMP20]], <vscale x 2 x i64> [[TMP24]]
 ; TFA_INTERLEAVE-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]]
 ; TFA_INTERLEAVE-NEXT:    [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
 ; TFA_INTERLEAVE-NEXT:    [[TMP29:%.*]] = mul i64 [[TMP28]], 2
 ; TFA_INTERLEAVE-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[TMP27]], i64 [[TMP29]]
-; TFA_INTERLEAVE-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP27]], i32 8, <vscale x 2 x i1> [[TMP25]])
-; TFA_INTERLEAVE-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI4]], ptr [[TMP30]], i32 8, <vscale x 2 x i1> [[TMP26]])
+; TFA_INTERLEAVE-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP27]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; TFA_INTERLEAVE-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI4]], ptr [[TMP30]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]])
 ; TFA_INTERLEAVE-NEXT:    [[INDEX_NEXT:%.*]] = add i64 [[INDEX]], [[TMP6]]
 ; TFA_INTERLEAVE-NEXT:    [[TMP31:%.*]] = call i64 @llvm.vscale.i64()
 ; TFA_INTERLEAVE-NEXT:    [[TMP32:%.*]] = mul i64 [[TMP31]], 2
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
index ad6e8534f31800..e22f70e436f330 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
@@ -1241,9 +1241,8 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no
 ; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP16]], i32 4, <vscale x 4 x i1> [[TMP15]], <vscale x 4 x float> poison)
 ; CHECK-ORDERED-TF-NEXT:    [[TMP17:%.*]] = xor <vscale x 4 x i1> [[TMP13]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
 ; CHECK-ORDERED-TF-NEXT:    [[TMP18:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP17]], <vscale x 4 x i1> zeroinitializer
-; CHECK-ORDERED-TF-NEXT:    [[TMP19:%.*]] = or <vscale x 4 x i1> [[TMP15]], [[TMP18]]
 ; CHECK-ORDERED-TF-NEXT:    [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP18]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> [[WIDE_MASKED_LOAD1]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP20:%.*]] = select <vscale x 4 x i1> [[TMP19]], <vscale x 4 x float> [[PREDPHI]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-ORDERED-TF-NEXT:    [[TMP20:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> [[PREDPHI]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
 ; CHECK-ORDERED-TF-NEXT:    [[TMP21]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP20]])
 ; CHECK-ORDERED-TF-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP23]]
 ; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
index 2b2742ca7ccbc3..63ad98b2d8ab2a 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
@@ -480,11 +480,10 @@ define void @cond_uniform_load(ptr noalias %dst, ptr noalias readonly %src, ptr
 ; CHECK-NEXT:    [[TMP15:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> zeroinitializer
 ; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[BROADCAST_SPLAT]], i32 4, <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> poison)
 ; CHECK-NEXT:    [[TMP16:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> zeroinitializer
-; CHECK-NEXT:    [[TMP18:%.*]] = or <vscale x 4 x i1> [[TMP15]], [[TMP16]]
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> [[WIDE_MASKED_GATHER]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[TMP10]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0
-; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[PREDPHI]], ptr [[TMP19]], i32 4, <vscale x 4 x i1> [[TMP18]])
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[PREDPHI]], ptr [[TMP19]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
 ; CHECK-NEXT:    [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP21]]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
 ; CHECK-NEXT:    [[TMP22:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
index 1ce4cb928e8085..ee70f4aa35850f 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
@@ -462,13 +462,10 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
 ; TF-SCALABLE-NEXT:    [[TMP12:%.*]] = icmp ugt <vscale x 2 x i64> [[VEC_IND]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 10, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
 ; TF-SCALABLE-NEXT:    [[TMP13:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> [[TMP12]], <vscale x 2 x i1> zeroinitializer
 ; TF-SCALABLE-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT]], i32 8, <vscale x 2 x i1> [[TMP13]], <vscale x 2 x i64> poison)
-; TF-SCALABLE-NEXT:    [[TMP14:%.*]] = xor <vscale x 2 x i1> [[TMP12]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
-; TF-SCALABLE-NEXT:    [[TMP15:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> [[TMP14]], <vscale x 2 x i1> zeroinitializer
-; TF-SCALABLE-NEXT:    [[TMP17:%.*]] = or <vscale x 2 x i1> [[TMP13]], [[TMP15]]
 ; TF-SCALABLE-NEXT:    [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP13]], <vscale x 2 x i64> [[WIDE_MASKED_GATHER]], <vscale x 2 x i64> zeroinitializer
 ; TF-SCALABLE-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP11]]
 ; TF-SCALABLE-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[TMP16]], i32 0
-; TF-SCALABLE-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP18]], i32 8, <vscale x 2 x i1> [[TMP17]])
+; TF-SCALABLE-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP18]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
 ; TF-SCALABLE-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP20]]
 ; TF-SCALABLE-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; TF-SCALABLE-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -510,13 +507,10 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
 ; TF-FIXEDLEN-NEXT:    [[TMP1:%.*]] = icmp ugt <4 x i64> [[VEC_IND]], <i64 10, i64 10, i64 10, i64 10>
 ; TF-FIXEDLEN-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> [[TMP1]], <4 x i1> zeroinitializer
 ; TF-FIXEDLEN-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[BROADCAST_SPLAT]], i32 8, <4 x i1> [[TMP2]], <4 x i64> poison)
-; TF-FIXEDLEN-NEXT:    [[TMP3:%.*]] = xor <4 x i1> [[TMP1]], <i1 true, i1 true, i1 true, i1 true>
-; TF-FIXEDLEN-NEXT:    [[TMP4:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> [[TMP3]], <4 x i1> zeroinitializer
-; TF-FIXEDLEN-NEXT:    [[TMP6:%.*]] = or <4 x i1> [[TMP2]], [[TMP4]]
 ; TF-FIXEDLEN-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> [[WIDE_MASKED_GATHER]], <4 x i64> zeroinitializer
 ; TF-FIXEDLEN-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]]
 ; TF-FIXEDLEN-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
-; TF-FIXEDLEN-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> [[PREDPHI]], ptr [[TMP7]], i32 8, <4 x i1> [[TMP6]])
+; TF-FIXEDLEN-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> [[PREDPHI]], ptr [[TMP7]], i32 8, <4 x i1> [[ACTIVE_LANE_MASK]])
 ; TF-FIXEDLEN-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
 ; TF-FIXEDLEN-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
 ; TF-FIXEDLEN-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1028
...
[truncated]

@fhahn
Copy link
Contributor Author

fhahn commented Apr 23, 2024

Updated after landing #89384. Note that for the logical AND/OR matching, we need to actually check the provided bit width instead of asserting.
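A small IR sketch of why (hypothetical names): once the matchers look through selects, the constant operand being tested is no longer guaranteed to be a single bit, so m_False()/m_True() must reject wider constants rather than assert on them:

%s1 = select i1 %c, i1 %a, i1 false  ; logical and: last operand is an i1 zero, should match
%s2 = select i1 %c, i32 %a, i32 0    ; unrelated select: an i32 zero, must be rejected, not asserted on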

@fhahn
Copy link
Contributor Author

fhahn commented Apr 25, 2024

ping :)

Collaborator

@ayalz ayalz left a comment

Long-term, optimizing masks may be addressed when generated by a VPlan2VPlan Predicator, rather than by a separate subsequent redundancy eliminator.

}

bool match(const VPSingleDefRecipe *R) {
return match(static_cast<const VPRecipeBase *>(R));
Collaborator

Is this upcast and overloaded method needed?

Contributor Author

Removed, thanks!

Collaborator

Ah, overloaded match(const VPSingleDefRecipe*) is needed because, having both match(const VPValue*) and match(const VPRecipeBase *), it's unclear which one to upcast to? Perhaps worth a comment.

Contributor Author

It's not needed for now; I removed it for real this time. (From the comment I thought I had already removed it, but apparently not.)

RecipeTys...>::match(R))
return false;
if (Opcode == Instruction::And) {
if (!m_False().match(R->getOperand(2)))
Collaborator

This use of m_False applies to a general bitwise AND, rather than logical AND of boolean mask operands. But the redundancy holds if the operand is all-false, rather than restricting to a single zero bit.

Contributor Author

Not sure if I am missing something, but this here doesn't match the opcode of the recipe (that is matched above to be a select), it matches the operation that the matcher was instantiated to match.

Collaborator

Trying to see why the above "Trying the match constant with unexpected bitwidth" assert needs to be relaxed into a check, when applied to m_False/m_True whose operand is still expected to be boolean?

Contributor Author

It would also match other selects where the last operand is 0, e.g. select i1 %c, i32 %a, i32 0

Collaborator

This stems from searching for select i1 %c, i1 %a, i1 0 as the poison-preserving form of and i1 %c, %a? Would it be clearer for VPlan to represent Poison-Preserving AND/OR directly, possibly via a flag, whose execute() emits the desired select patterns, rather than reverse-engineer them?
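For context, a minimal LLVM IR sketch (hypothetical names) of the semantic difference behind the select form discussed here:

%a = and i1 %c, %p                  ; poison whenever %p is poison, even if %c is false
%l = select i1 %c, i1 %p, i1 false  ; false (not poison) whenever %c is false

The select only exposes poison from %p when %c is true, which is what makes it the poison-preserving spelling of the edge-mask AND.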

Collaborator

@ayalz ayalz May 10, 2024

Would doing so earlier simplify this "Simplify (X && Y) || (X && !Y) -> X" patch, and potentially additional mask optimizations, by keeping it focused on AND/OR/NOT only, w/o having to reverse engineer selects, nor affect how false/true are matched?
When considering operands of AND/OR, in general, redundancy of constant false/true operands extends beyond i1 (of logical AND/OR) to all-zero/all-ones (of bitwise AND/OR) - in contrast to operands of select, whose matching to identify logical AND is restricted to i1.
Regarding handling (all) input - is that needed or expected, given that this matching tries to simplify mask computations that were generated earlier by LV. I.e., the select introduced by VPRecipeBuilder::createEdgeMask(). Representing LogicalAnd/LogicalOr is also consistent with the representation of Not, introduced there (rather than its xor-with-true form, simply because it was introduced before VPlan could represent constants including true).
(Another option is for VPlan to represent an AND-NOT recipe, which is admittedly a non-basic logic gate.)

Independently, should createBlockInMask() generate logical OR's using selects as well, possibly by recipes that target IRBuilder::CreateLogicalOr(), rather than plain bitwise OR's via IRBuilder's CreateBinOp()?

The only AND/OR VPInstructions currently in use are those generated by create[Edge,BlockIn]Mask(), which are logical(ly dealing with masks), right? Actually, VPBuilder::createAnd() appears to be useless, after createEdgeMask() switched to calling VPBuilder::createSelect() instead, so no AND VPInstructions appear to exist, neither bitwise nor logical.

Note that the distinction between logical and bitwise effectively involves type analysis: is the bitwidth 1 or more. Vectorization turns logical boolean operations on (non-uniform) i1 values into bitwise ones.

Contributor Author

Put up to add a logical-and opcode and use that when creating edge masks: #91897

Independently, should createBlockInMask() generate logical OR's using selects as well, possibly by recipes that target IRBuilder::CreateLogicalOr(), rather than plain bitwise OR's via IRBuilder's CreateBinOp()?

Hmm, I am not sure. Would probably warrant a separate look.

The only AND/OR VPInstructions currently in use are those generated by create[Edge,BlockIn]Mask(), which are logical(ly dealing with masks), right? Actually, VPBuilder::createAnd() appears to be useless, after createEdgeMask() switched to calling VPBuilder::createSelect() instead, so no AND VPInstructions appear to exist, neither bitwise nor logical.

I think so yes, let me clean this up separately.

Collaborator

Independently, should createBlockInMask() generate logical OR's using selects as well, possibly by recipes that target IRBuilder::CreateLogicalOr(), rather than plain bitwise OR's via IRBuilder's CreateBinOp()?

Hmm, I am not sure. Would probably warrant a separate look.

Perhaps Not should also become a LogicalNot, generating a select %c, false, true instead of the natural xor %c, true?

Contributor Author

Updated now to be based on LogicalAnd.

Perhaps Not should also become a LogicalNot, generating a select %c, false, true instead of the natural xor %c, true?

I am not entirely sure about changing the IR we generate here; the XOR variant appears to be the canonical form expected by other passes (i.e., that's what IRBuilder.CreateNot generates).

Collaborator

Independently, should createBlockInMask() generate logical OR's using selects as well, possibly by recipes that target IRBuilder::CreateLogicalOr(), rather than plain bitwise OR's via IRBuilder's CreateBinOp()?

Hmm, I am not sure. Would probably warrant a separate look.

Looking at the original issue https://bugs.llvm.org/show_bug.cgi?id=48832 and its fix, it appears that masks of unreachable edges can no longer convey poison, so ORing them to produce block-in-masks should continue to be bitwise rather than logical. OTOH, logical ORs require their operands to be suitably ordered, whereas predecessors are in general unordered (unlike employing a logical AND for edge masks where the mask of the block naturally precedes the condition of its terminal branch).

Perhaps Not should also become a LogicalNot, generating a select %c, false, true instead of the natural xor %c, true?

I am not entirely sure about changing the IR we generate here, the XOR variant appears to be the canonical for expected by other passes (i.e. that's what IRBuilder.CreateNot generates)

The condition of a terminal branch may be poison, so may its negating NOT, provided the edge mask uses a logical AND to avoid propagating it in case the block is unreachable (has a false block-in-mask).

Note: better eliminate dead edges and nodes from the CFG prior to applying predication, once this is done via VPlan-to-VPlan transformation. Doing so may allow using bitwise ANDs rather than logical ones, as before.
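A minimal LLVM IR sketch of the ordering point (hypothetical names): the select-based logical or shields only its second operand, so it is not symmetric under poison, unlike the bitwise or:

%o1 = select i1 %a, i1 true, i1 %b  ; poison in %b is masked whenever %a is true
%o2 = select i1 %b, i1 true, i1 %a  ; not equivalent to %o1 if either input may be poison
%o3 = or i1 %a, %b                  ; bitwise: poison if either operand is poison

This is why block-in-masks over unordered predecessors can keep using the bitwise form.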

VPValue *B;
VPValue *C;
VPValue *D;
// Simplify (X && Y) || (X && !Y) -> X.
Collaborator

Keep comments and code consistent: have both use A,B,C,D or both X,Y,Z.

Contributor Author

Updated to use A, B, C, D. The intention was to avoid A == C && B == D in the description, but it is probably more confusing.

Collaborator

How about keeping the description (X && Y) || (X && !Y) -> X and use X, Y, X1, Y1 instead of A, B, C, D in the code, respectively?

Contributor Author

Excellent, updated!

VPValue *C;
VPValue *D;
// Simplify (X && Y) || (X && !Y) -> X.
if (match(&R,
Collaborator

More general and modular: first simplify (A && B) || (C && D) where A == C into A && (B || D). Also for A == D, B == C, B == D. Followed by simplifying (X || !X).
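Spelled out on the pattern at hand, the suggested modular sequence would be (a worked derivation, not code):

(X && Y) || (X && !Y)
-> X && (Y || !Y)  ; factor out the common operand X
-> X && true       ; Y || !Y is a tautology
-> X

The patch as written fuses these steps into a single match.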

Contributor Author

Could do, but we would also need logic to simplify newly created recipes. Left as is for now.

Collaborator

ok, worth leaving behind a TODO?

Contributor Author

Done, thanks!

@fhahn
Contributor Author

fhahn commented May 3, 2024

ping :)


github-actions bot commented May 9, 2024

✅ With the latest revision this PR passed the C/C++ code formatter.


VPValue *Y;
VPValue *X1;
VPValue *Y1;
// Simplify (X && Y) || (X1 && !Y1) -> X.
Collaborator

Suggested change
-// Simplify (X && Y) || (X1 && !Y1) -> X.
+// Simplify (X && Y) || (X && !Y) -> X.

This now more closely matches the code, where repeated occurrences of X and Y are enumerated.

Contributor Author

Done, thanks!

fhahn added a commit to fhahn/llvm-project that referenced this pull request May 12, 2024
Add a new opcode to model non-poison propagating logical AND operations
used when generating edge masks. This follows the similar decision to
model Not as a dedicated opcode as well, to improve clarity.

This also helps to simplify the matchers for
llvm#89386.
"Trying the match constant with unexpected bitwidth.");
return APInt::isSameValue(CI->getValue(), Val);
return (BitWidth == 0 || CI->getBitWidth() == BitWidth) &&
APInt::isSameValue(CI->getValue(), Val);
Collaborator

Is this change still needed, now that Logical Ands are matched whose operands must be single bits, rather than Selects?

Contributor Author

Restored, thanks!


bool match(const VPRecipeBase *R) {
if (!detail::MatchRecipeAndOpcode<Opcode, RecipeTys...>::match(R)) {
if (!detail::MatchRecipeAndOpcode<Instruction::Select,
Collaborator

Still need to match Select here?

Contributor Author

No, I just updated the PR based on the new LogicalAnd.


}

template <typename Op0_t, typename Op1_t>
inline LogicalRecipe_match<Op0_t, Op1_t, Instruction::And, true, VPWidenRecipe,
Collaborator

Suggested change
-inline LogicalRecipe_match<Op0_t, Op1_t, Instruction::And, true, VPWidenRecipe,
+inline LogicalRecipe_match<Op0_t, Op1_t, VPInstruction::LogicalAnd, true, VPWidenRecipe,

based on #91897?

Contributor Author

Just updated the PR based on the new LogicalAnd, thanks!

@@ -273,6 +309,24 @@ inline AllBinaryRecipe_match<Op0_t, Op1_t, Instruction::Or>
m_Or(const Op0_t &Op0, const Op1_t &Op1) {
return m_Binary<Instruction::Or, Op0_t, Op1_t>(Op0, Op1);
}

template <typename Op0_t, typename Op1_t>
inline LogicalRecipe_match<Op0_t, Op1_t, Instruction::Or, true, VPWidenRecipe,
Collaborator

Suggested change
-inline LogicalRecipe_match<Op0_t, Op1_t, Instruction::Or, true, VPWidenRecipe,
+inline LogicalRecipe_match<Op0_t, Op1_t, VPInstruction::LogicalOr, true, VPWidenRecipe,

for consistency, and supporting createBlockInMask?

Worth commenting true /* Commutable */, here and/or below?

}

bool match(const VPRecipeBase *R) {
if (!detail::MatchRecipeAndOpcode<Opcode, RecipeTys...>::match(R)) {
Collaborator

Still need to consider R's with non matching Opcode?

Contributor Author

The whole matcher is now gone.

return false;
} else if (Opcode == Instruction::Or) {
if (!m_True().match(R->getOperand(1)))
return false;
Collaborator

Is this folding of And-with-true and Or-with-false redundancies an inherent part of folding the (X&&Y)||(X&&!Y)->X redundancy?


template <typename Op0_t, typename Op1_t>
inline LogicalRecipe_match<Op0_t, Op1_t, Instruction::And, true, VPWidenRecipe,
VPReplicateRecipe, VPWidenCastRecipe, VPInstruction>
Collaborator

Are all these recipes relevant?

Contributor Author

Not for the LogicalAnd, but in terms of generality we could have ANDs/ORs in any of those recipe types, which could potentially benefit.

fhahn added a commit that referenced this pull request May 14, 2024
…91897)

Add a new opcode to model non-poison propagating logical AND operations
used when generating edge masks. This follows the similar decision to
model Not as a dedicated opcode as well, to improve clarity.

This also helps to simplify the matchers for
#89386.


PR: #91897
Contributor Author

@fhahn fhahn left a comment

The latest version has been simplified and updated to use LogicalAnd.

"Trying the match constant with unexpected bitwidth.");
return APInt::isSameValue(CI->getValue(), Val);
return (BitWidth == 0 || CI->getBitWidth() == BitWidth) &&
APInt::isSameValue(CI->getValue(), Val);
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Restored, thanks!

RecipeTys...>::match(R))
return false;
if (Opcode == Instruction::And) {
if (!m_False().match(R->getOperand(2)))
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated now to be based on LogicalAnd.

Perhaps Not should also become a LogicalNot, generating a select %c, false, true instead of the natural xor %c, true?

I am not entirely sure about changing the IR we generate here, the XOR variant appears to be the canonical for expected by other passes (i.e. that's what IRBuilder.CreateNot generates)


bool match(const VPRecipeBase *R) {
if (!detail::MatchRecipeAndOpcode<Opcode, RecipeTys...>::match(R)) {
if (!detail::MatchRecipeAndOpcode<Instruction::Select,
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, I just updated the PR based on the new LogicalAnd.

}

template <typename Op0_t, typename Op1_t>
inline LogicalRecipe_match<Op0_t, Op1_t, Instruction::And, true, VPWidenRecipe,
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just updated the PR based on the new LogicalAnd, thanks!


template <typename Op0_t, typename Op1_t>
inline LogicalRecipe_match<Op0_t, Op1_t, Instruction::And, true, VPWidenRecipe,
VPReplicateRecipe, VPWidenCastRecipe, VPInstruction>
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not for the logicalAnd, but in terms of generality we could have ANDs/OR with any of those opcodes, which could potentially benefit.

}

bool match(const VPRecipeBase *R) {
if (!detail::MatchRecipeAndOpcode<Opcode, RecipeTys...>::match(R)) {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The whole matcher is now gone.

Contributor Author

@fhahn fhahn left a comment

The latest version is simplified by stripping out the logic to match commutative expressions and limiting the matching to the existing BinaryOr matcher (renamed to be more accurate) and the new LogicalAnd matcher.

I'll share a follow-up adding just the commutative matcher.

fhahn added a commit to fhahn/llvm-project that referenced this pull request May 17, 2024
fhahn added a commit to fhahn/llvm-project that referenced this pull request May 17, 2024
Split off from llvm#89386, this
extends the binary matcher to support matching commutative operations.
This is used for a new m_c_BinaryOr matcher, used in simplifyRecipe.
@fhahn
Contributor Author

fhahn commented May 17, 2024

The follow-up to add support for commutative matching is available now as well: #92539

@fhahn fhahn closed this May 17, 2024
@fhahn
Contributor Author

fhahn commented May 17, 2024

Reopened as I accidentally hit the close button...

@fhahn fhahn reopened this May 17, 2024
Collaborator

@ayalz ayalz left a comment

Thanks for simplifying and extracting the part that deals specifically with the X&&Y || X&&!Y pattern! This by itself results in various levels of subsequent cleanups. Adding a couple of nits.

@@ -270,9 +270,15 @@ m_Mul(const Op0_t &Op0, const Op1_t &Op1) {

template <typename Op0_t, typename Op1_t>
inline AllBinaryRecipe_match<Op0_t, Op1_t, Instruction::Or>
m_Or(const Op0_t &Op0, const Op1_t &Op1) {
m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1) {
Collaborator

Why/Is this distinction of Binary important, here - to contrast LogicalAnd below (i.e., IllogicalOr ;-), emphasizing Bitwise vs. Logical (i.e., those allowed to introduce UB vs. those that prevent it), or to accommodate Ternary and multi-ary Or's in the future. Worth updating recipe to print binary-or or bitwise-or rather than simply or?

Contributor Author

This is to clarify the matcher now that we have a separate one for logical AND, and it is in line with the naming of the IR pattern matchers.

For printing, we rely on Instruction::getOpcodeName to print Instruction opcodes, so I think it would probably be good to keep printing it as a regular or, to stay consistent with the original/underlying IR.

Comment on lines 938 to 945
VPValue *X;
VPValue *Y;
VPValue *X1;
VPValue *Y1;
// Simplify (X && Y) || (X && !Y) -> X.
// TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X
// && (Y || Z) and (X || !X) into true. This requires queuing newly created
// recipes to be visited during simplification.
Collaborator

Suggested change
-VPValue *X;
-VPValue *Y;
-VPValue *X1;
-VPValue *Y1;
-// Simplify (X && Y) || (X && !Y) -> X.
-// TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X
-// && (Y || Z) and (X || !X) into true. This requires queuing newly created
-// recipes to be visited during simplification.
+// Simplify (X && Y) || (X && !Y) -> X.
+// TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X
+// && (Y || Z) and (X || !X) into true. This requires queuing newly created
+// recipes to be visited during simplification.
+VPValue *X, *Y, *X1, *Y1;

nit: better place definitions together, and closer to their use.

Contributor Author

Done, thanks!

; CHECK-NEXT: BLEND ir<%p> = ir<0> vp<[[PRED]]>/vp<[[MASK2]]>
; CHECK-NEXT: EMIT vp<[[MASK5:%.+]]> = logical-and vp<[[MASK4]]>, ir<%c.0>
; CHECK-NEXT: EMIT vp<[[MASK3:%.+]]> = logical-and vp<[[MASK1]]>, ir<%c.0>
Collaborator

Just noting that X is MASK1, Y is ir<%c.0>; all of !Y, X&&Y, (X&&Y) || (X&&!Y) are eliminated, the latter replaced by X, while X&&Y (MASK2) is retained due to another use - by BRANCH-ON-COUNT.

Contributor Author

Yep

; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP6]], i32 0
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP8]], i32 0
; CHECK-NEXT: [[TMP11:%.*]] = or i1 [[TMP9]], [[TMP10]]
; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP11]], ptr [[B]], ptr poison
Collaborator

Nice clean-up once TMP11 is replaced by TMP4, through extractelements, eliminating both logical ANDs and code generating Y (TMP5).

Contributor Author

Yep

@@ -223,10 +223,9 @@ define void @test_if_then(ptr noalias %a, ptr readnone %b) #4 {
; TFCOMMON-NEXT: [[TMP10:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[TMP9]])
; TFCOMMON-NEXT: [[TMP11:%.*]] = xor <vscale x 2 x i1> [[TMP8]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
; TFCOMMON-NEXT: [[TMP12:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> [[TMP11]], <vscale x 2 x i1> zeroinitializer
; TFCOMMON-NEXT: [[TMP13:%.*]] = or <vscale x 2 x i1> [[TMP9]], [[TMP12]]
Collaborator

This eliminates only the OR, leaving the two logical ANDs intact, as they are used elsewhere.

Contributor Author

Yep

Collaborator

@ayalz ayalz left a comment

Ship it!

@fhahn fhahn merged commit b050048 into llvm:main May 19, 2024
4 checks passed
@fhahn fhahn deleted the vplan-simplify-or-and branch May 19, 2024 15:45
fhahn added a commit that referenced this pull request May 20, 2024
Split off from #89386, this
extends the binary matcher to support matching commutative operations.
This is used for a new m_c_BinaryOr matcher, used in simplifyRecipe.

PR: #92539