-
Notifications
You must be signed in to change notification settings - Fork 12k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[VPlan] Simplify (X && Y) || (X && !Y) -> X. #89386
Conversation
Simplify a common pattern generated for masks when folding the tail.
@llvm/pr-subscribers-llvm-transforms Author: Florian Hahn (fhahn) ChangesSimplify a common pattern generated for masks when folding the tail. Patch is 27.61 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/89386.diff 8 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index ae7717eb7cc97f..76bd25d939523b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -185,6 +185,47 @@ using AllBinaryRecipe_match =
BinaryRecipe_match<Op0_t, Op1_t, Opcode, VPWidenRecipe, VPReplicateRecipe,
VPWidenCastRecipe, VPInstruction>;
+template <typename Op0_t, typename Op1_t, unsigned Opcode, bool Commutable,
+ typename... RecipeTys>
+struct LogicalRecipe_match {
+ Op0_t LHS;
+ Op1_t RHS;
+
+ LogicalRecipe_match(Op0_t LHS, Op1_t RHS) : LHS(LHS), RHS(RHS) {}
+
+ bool match(const VPValue *V) {
+ auto *DefR = V->getDefiningRecipe();
+ return DefR && match(DefR);
+ }
+
+ bool match(const VPSingleDefRecipe *R) {
+ return match(static_cast<const VPRecipeBase *>(R));
+ }
+
+ bool match(const VPRecipeBase *R) {
+ if (!detail::MatchRecipeAndOpcode<Opcode, RecipeTys...>::match(R)) {
+ if (!detail::MatchRecipeAndOpcode<Instruction::Select,
+ RecipeTys...>::match(R))
+ return false;
+ if (Opcode == Instruction::And) {
+ if (!m_SpecificInt(0).match(R->getOperand(2)))
+ return false;
+ } else if (Opcode == Instruction::Or) {
+ if (!m_SpecificInt(1).match(R->getOperand(1)))
+ return false;
+ } else {
+ llvm_unreachable("unsupported opcode");
+ }
+ } else {
+ assert(R->getNumOperands() == 2 &&
+ "recipe with matched opcode does not have 2 operands");
+ }
+ return (LHS.match(R->getOperand(0)) && RHS.match(R->getOperand(1))) ||
+ (Commutable && LHS.match(R->getOperand(1)) &&
+ RHS.match(R->getOperand(0)));
+ }
+};
+
template <unsigned Opcode, typename Op0_t>
inline UnaryVPInstruction_match<Op0_t, Opcode>
m_VPInstruction(const Op0_t &Op0) {
@@ -266,6 +307,24 @@ inline AllBinaryRecipe_match<Op0_t, Op1_t, Instruction::Or>
m_Or(const Op0_t &Op0, const Op1_t &Op1) {
return m_Binary<Instruction::Or, Op0_t, Op1_t>(Op0, Op1);
}
+
+template <typename Op0_t, typename Op1_t>
+inline LogicalRecipe_match<Op0_t, Op1_t, Instruction::Or, true, VPWidenRecipe,
+ VPReplicateRecipe, VPWidenCastRecipe, VPInstruction>
+m_c_LogicalOr(const Op0_t &Op0, const Op1_t &Op1) {
+ return LogicalRecipe_match<Op0_t, Op1_t, Instruction::Or, true, VPWidenRecipe,
+ VPReplicateRecipe, VPWidenCastRecipe,
+ VPInstruction>(Op0, Op1);
+}
+
+template <typename Op0_t, typename Op1_t>
+inline LogicalRecipe_match<Op0_t, Op1_t, Instruction::And, true, VPWidenRecipe,
+ VPReplicateRecipe, VPWidenCastRecipe, VPInstruction>
+m_c_LogicalAnd(const Op0_t &Op0, const Op1_t &Op1) {
+ return LogicalRecipe_match<Op0_t, Op1_t, Instruction::And, true,
+ VPWidenRecipe, VPReplicateRecipe,
+ VPWidenCastRecipe, VPInstruction>(Op0, Op1);
+}
} // namespace VPlanPatternMatch
} // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 007dc3f89b3fb9..5453c47ecf7000 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -939,6 +939,18 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
#endif
}
+ VPValue *B;
+ VPValue *C;
+ VPValue *D;
+ // Simplify (X && Y) || (X && !Y) -> X.
+ if (match(&R,
+ m_c_LogicalOr(m_c_LogicalAnd(m_VPValue(A), m_VPValue(B)),
+ m_c_LogicalAnd(m_VPValue(C), m_Not(m_VPValue(D))))) &&
+ A == C && B == D) {
+ R.getVPSingleValue()->replaceAllUsesWith(A);
+ return;
+ }
+
if (match(&R, m_CombineOr(m_Mul(m_VPValue(A), m_SpecificInt(1)),
m_Mul(m_SpecificInt(1), m_VPValue(A)))))
return R.getVPSingleValue()->replaceAllUsesWith(A);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
index b91579106261a6..200c2adcf0e666 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
@@ -223,10 +223,9 @@ define void @test_if_then(ptr noalias %a, ptr readnone %b) #4 {
; TFCOMMON-NEXT: [[TMP10:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[TMP9]])
; TFCOMMON-NEXT: [[TMP11:%.*]] = xor <vscale x 2 x i1> [[TMP8]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
; TFCOMMON-NEXT: [[TMP12:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> [[TMP11]], <vscale x 2 x i1> zeroinitializer
-; TFCOMMON-NEXT: [[TMP13:%.*]] = or <vscale x 2 x i1> [[TMP9]], [[TMP12]]
; TFCOMMON-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP12]], <vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> [[TMP10]]
; TFCOMMON-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]]
-; TFCOMMON-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP14]], i32 8, <vscale x 2 x i1> [[TMP13]])
+; TFCOMMON-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP14]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
; TFCOMMON-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025)
; TFCOMMON-NEXT: [[TMP15:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
@@ -272,16 +271,14 @@ define void @test_if_then(ptr noalias %a, ptr readnone %b) #4 {
; TFA_INTERLEAVE-NEXT: [[TMP20:%.*]] = xor <vscale x 2 x i1> [[TMP14]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
; TFA_INTERLEAVE-NEXT: [[TMP21:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> [[TMP19]], <vscale x 2 x i1> zeroinitializer
; TFA_INTERLEAVE-NEXT: [[TMP22:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 2 x i1> [[TMP20]], <vscale x 2 x i1> zeroinitializer
-; TFA_INTERLEAVE-NEXT: [[TMP23:%.*]] = or <vscale x 2 x i1> [[TMP15]], [[TMP21]]
-; TFA_INTERLEAVE-NEXT: [[TMP24:%.*]] = or <vscale x 2 x i1> [[TMP16]], [[TMP22]]
; TFA_INTERLEAVE-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP21]], <vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> [[TMP17]]
; TFA_INTERLEAVE-NEXT: [[PREDPHI4:%.*]] = select <vscale x 2 x i1> [[TMP22]], <vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> [[TMP18]]
; TFA_INTERLEAVE-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]]
; TFA_INTERLEAVE-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64()
; TFA_INTERLEAVE-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 2
; TFA_INTERLEAVE-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[TMP25]], i64 [[TMP27]]
-; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP25]], i32 8, <vscale x 2 x i1> [[TMP23]])
-; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI4]], ptr [[TMP28]], i32 8, <vscale x 2 x i1> [[TMP24]])
+; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP25]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI4]], ptr [[TMP28]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]])
; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT:%.*]] = add i64 [[INDEX]], [[TMP6]]
; TFA_INTERLEAVE-NEXT: [[TMP29:%.*]] = call i64 @llvm.vscale.i64()
; TFA_INTERLEAVE-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], 2
@@ -405,10 +402,9 @@ define void @test_widen_if_then_else(ptr noalias %a, ptr readnone %b) #4 {
; TFCOMMON-NEXT: [[TMP11:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> [[TMP10]])
; TFCOMMON-NEXT: [[TMP12:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> [[TMP8]], <vscale x 2 x i1> zeroinitializer
; TFCOMMON-NEXT: [[TMP13:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[TMP12]])
-; TFCOMMON-NEXT: [[TMP14:%.*]] = or <vscale x 2 x i1> [[TMP10]], [[TMP12]]
; TFCOMMON-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP10]], <vscale x 2 x i64> [[TMP11]], <vscale x 2 x i64> [[TMP13]]
; TFCOMMON-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]]
-; TFCOMMON-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP15]], i32 8, <vscale x 2 x i1> [[TMP14]])
+; TFCOMMON-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP15]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
; TFCOMMON-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025)
; TFCOMMON-NEXT: [[TMP16:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
@@ -456,16 +452,14 @@ define void @test_widen_if_then_else(ptr noalias %a, ptr readnone %b) #4 {
; TFA_INTERLEAVE-NEXT: [[TMP22:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 2 x i1> [[TMP14]], <vscale x 2 x i1> zeroinitializer
; TFA_INTERLEAVE-NEXT: [[TMP23:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[TMP21]])
; TFA_INTERLEAVE-NEXT: [[TMP24:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_MASKED_LOAD3]], <vscale x 2 x i1> [[TMP22]])
-; TFA_INTERLEAVE-NEXT: [[TMP25:%.*]] = or <vscale x 2 x i1> [[TMP17]], [[TMP21]]
-; TFA_INTERLEAVE-NEXT: [[TMP26:%.*]] = or <vscale x 2 x i1> [[TMP18]], [[TMP22]]
; TFA_INTERLEAVE-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP17]], <vscale x 2 x i64> [[TMP19]], <vscale x 2 x i64> [[TMP23]]
; TFA_INTERLEAVE-NEXT: [[PREDPHI4:%.*]] = select <vscale x 2 x i1> [[TMP18]], <vscale x 2 x i64> [[TMP20]], <vscale x 2 x i64> [[TMP24]]
; TFA_INTERLEAVE-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]]
; TFA_INTERLEAVE-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
; TFA_INTERLEAVE-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 2
; TFA_INTERLEAVE-NEXT: [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[TMP27]], i64 [[TMP29]]
-; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP27]], i32 8, <vscale x 2 x i1> [[TMP25]])
-; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI4]], ptr [[TMP30]], i32 8, <vscale x 2 x i1> [[TMP26]])
+; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP27]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI4]], ptr [[TMP30]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]])
; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT:%.*]] = add i64 [[INDEX]], [[TMP6]]
; TFA_INTERLEAVE-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64()
; TFA_INTERLEAVE-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], 2
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
index ad6e8534f31800..e22f70e436f330 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
@@ -1241,9 +1241,8 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no
; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP16]], i32 4, <vscale x 4 x i1> [[TMP15]], <vscale x 4 x float> poison)
; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = xor <vscale x 4 x i1> [[TMP13]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP17]], <vscale x 4 x i1> zeroinitializer
-; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = or <vscale x 4 x i1> [[TMP15]], [[TMP18]]
; CHECK-ORDERED-TF-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP18]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> [[WIDE_MASKED_LOAD1]]
-; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = select <vscale x 4 x i1> [[TMP19]], <vscale x 4 x float> [[PREDPHI]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> [[PREDPHI]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-ORDERED-TF-NEXT: [[TMP21]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP20]])
; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP23]]
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
index 2b2742ca7ccbc3..63ad98b2d8ab2a 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
@@ -480,11 +480,10 @@ define void @cond_uniform_load(ptr noalias %dst, ptr noalias readonly %src, ptr
; CHECK-NEXT: [[TMP15:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> zeroinitializer
; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[BROADCAST_SPLAT]], i32 4, <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[TMP16:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> zeroinitializer
-; CHECK-NEXT: [[TMP18:%.*]] = or <vscale x 4 x i1> [[TMP15]], [[TMP16]]
; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> [[WIDE_MASKED_GATHER]]
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[TMP10]]
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[PREDPHI]], ptr [[TMP19]], i32 4, <vscale x 4 x i1> [[TMP18]])
+; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[PREDPHI]], ptr [[TMP19]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP21]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP22:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
index 1ce4cb928e8085..ee70f4aa35850f 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
@@ -462,13 +462,10 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
; TF-SCALABLE-NEXT: [[TMP12:%.*]] = icmp ugt <vscale x 2 x i64> [[VEC_IND]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 10, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
; TF-SCALABLE-NEXT: [[TMP13:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> [[TMP12]], <vscale x 2 x i1> zeroinitializer
; TF-SCALABLE-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT]], i32 8, <vscale x 2 x i1> [[TMP13]], <vscale x 2 x i64> poison)
-; TF-SCALABLE-NEXT: [[TMP14:%.*]] = xor <vscale x 2 x i1> [[TMP12]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
-; TF-SCALABLE-NEXT: [[TMP15:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> [[TMP14]], <vscale x 2 x i1> zeroinitializer
-; TF-SCALABLE-NEXT: [[TMP17:%.*]] = or <vscale x 2 x i1> [[TMP13]], [[TMP15]]
; TF-SCALABLE-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP13]], <vscale x 2 x i64> [[WIDE_MASKED_GATHER]], <vscale x 2 x i64> zeroinitializer
; TF-SCALABLE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP11]]
; TF-SCALABLE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[TMP16]], i32 0
-; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP18]], i32 8, <vscale x 2 x i1> [[TMP17]])
+; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP18]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP20]]
; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
; TF-SCALABLE-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -510,13 +507,10 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
; TF-FIXEDLEN-NEXT: [[TMP1:%.*]] = icmp ugt <4 x i64> [[VEC_IND]], <i64 10, i64 10, i64 10, i64 10>
; TF-FIXEDLEN-NEXT: [[TMP2:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> [[TMP1]], <4 x i1> zeroinitializer
; TF-FIXEDLEN-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[BROADCAST_SPLAT]], i32 8, <4 x i1> [[TMP2]], <4 x i64> poison)
-; TF-FIXEDLEN-NEXT: [[TMP3:%.*]] = xor <4 x i1> [[TMP1]], <i1 true, i1 true, i1 true, i1 true>
-; TF-FIXEDLEN-NEXT: [[TMP4:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> [[TMP3]], <4 x i1> zeroinitializer
-; TF-FIXEDLEN-NEXT: [[TMP6:%.*]] = or <4 x i1> [[TMP2]], [[TMP4]]
; TF-FIXEDLEN-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> [[WIDE_MASKED_GATHER]], <4 x i64> zeroinitializer
; TF-FIXEDLEN-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]]
; TF-FIXEDLEN-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
-; TF-FIXEDLEN-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[PREDPHI]], ptr [[TMP7]], i32 8, <4 x i1> [[TMP6]])
+; TF-FIXEDLEN-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[PREDPHI]], ptr [[TMP7]], i32 8, <4 x i1> [[ACTIVE_LANE_MASK]])
; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; TF-FIXEDLEN-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
; TF-FIXEDLEN-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1028
...
[truncated]
|
Updated after landing #89384. Note that for the logical AND/OR matching, we need to actually check the provided bit width instead of asserting. |
ping :) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Long-term, optimizing masks may be addressed when generated by a VPlan2VPlan Predicator, rather than by a separate subsequent redundancy eliminator.
} | ||
|
||
bool match(const VPSingleDefRecipe *R) { | ||
return match(static_cast<const VPRecipeBase *>(R)); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this upcast and overloaded method needed?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Removed, thanks!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah, overloaded match(const VPSingleDefRecipe*) is needed because having both match(const VPValue*) and match(const VPRecipeBase *) its unclear which one to upcast to? Perhaps worth a comment.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's not needed for now, I removed it for real this time. (From the comment I thought I already removed it , but apparently not)
RecipeTys...>::match(R)) | ||
return false; | ||
if (Opcode == Instruction::And) { | ||
if (!m_False().match(R->getOperand(2))) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This use of m_False applies to a general bitwise AND, rather than logical AND of boolean mask operands. But the redundancy holds if the operand is all-false, rather than restricting to a single zero bit.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not sure if I am missing something, but this here doesn't match the opcode of the recipe (that is matched above to be a select), it matches the operation that the matcher was instantiated to match.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Trying to see why the above "Trying the match constant with unexpected bitwidth" assert needs to be relaxed into a check, when applied to m_False/m_True whose operand is still expected to be boolean?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It would also match other selects where the last operand is 0, e.g. select i1 %c, i32 %a, i32 0
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This stems from searching for `select i1 %c, i1 %a, i1 0` as the poison-preserving form of `and i1 %c, %a`? Would it be clearer for VPlan to represent poison-preserving AND/OR directly, possibly via a flag, whose execute() emits the desired select patterns, rather than reverse-engineer them?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Would doing so earlier simplify this "Simplify (X && Y) || (X && !Y) -> X" patch, and potentially additional mask optimizations, by keeping it focused on AND/OR/NOT only, w/o having to reverse engineer selects, nor affect how false/true are matched?
When considering operands of AND/OR, in general, redundancy of constant false/true operands extends beyond i1 (of logical AND/OR) to all-zero/all-ones (of bitwise AND/OR) - in contrast to operands of select, whose matching to identify logical AND is restricted to i1.
Regarding handling (all) input - is that needed or expected, given that this matching tries to simplify mask computations that were generated earlier by LV. I.e., the select introduced by VPRecipeBuilder::createEdgeMask(). Representing LogicalAnd/LogicalOr is also consistent with the representation of `Not`, introduced there (rather than its xor-with-true form, simply because it was introduced before VPlan could represent constants including `true`).
(Another option is for VPlan to represent an AND-NOT recipe, which is admittedly a non-basic logic gate.)
Independently, should createBlockInMask() generate logical OR's using selects as well, possibly by recipes that target IRBuilder::CreateLogicalOr(), rather than plain bitwise OR's via IRBuilder's CreateBinOp()?
The only AND/OR VPInstructions currently in use are those generated by create[Edge,BlockIn]Mask(), which are logical(ly dealing with masks), right? Actually, VPBuilder::createAnd() appears to be useless, after createEdgeMask() switched to calling VPBuilder::createSelect() instead, so no AND VPInstructions appear to exist, neither bitwise nor logical.
Note that the distinction between logical and bitwise effectively involves type analysis: is the bitwidth 1 or more. Vectorization turns logical boolean operations on (non-uniform) i1 values into bitwise ones.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Put up to add a logical-and opcode and use that when creating edge masks: #91897
Independently, should createBlockInMask() generate logical OR's using selects as well, possibly by recipes that target IRBuilder::CreateLogicalOr(), rather than plain bitwise OR's via IRBuilder's CreateBinOp()?
Hmm, I am not sure. Would probably warrant a separate look.
The only AND/OR VPInstructions currently in use are those generated by create[Edge,BlockIn]Mask(), which are logical(ly dealing with masks), right? Actually, VPBuilder::createAnd() appears to be useless, after createEdgeMask() switched to calling VPBuilder::createSelect() instead, so no AND VPInstructions appear to exist, neither bitwise nor logical.
I think so yes, let me clean this up separately.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Independently, should createBlockInMask() generate logical OR's using selects as well, possibly by recipes that target IRBuilder::CreateLogicalOr(), rather than plain bitwise OR's via IRBuilder's CreateBinOp()?
Hmm, I am not sure. Would probably warrant a separate look.
Perhaps `Not` should also become a `LogicalNot`, generating a `select %c, false, true` instead of the natural `xor %c, true`?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated now to be based on LogicalAnd.
Perhaps Not should also become a LogicalNot, generating a select %c, false, true instead of the natural xor %c, true?
I am not entirely sure about changing the IR we generate here; the XOR variant appears to be the canonical form expected by other passes (i.e. that's what IRBuilder.CreateNot generates)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Independently, should createBlockInMask() generate logical OR's using selects as well, possibly by recipes that target IRBuilder::CreateLogicalOr(), rather than plain bitwise OR's via IRBuilder's CreateBinOp()?
Hmm, I am not sure. Would probably warrant a separate look.
Looking at the original issue https://bugs.llvm.org/show_bug.cgi?id=48832 and its fix, it appears that masks of unreachable edges can no longer convey poison, so ORing them to produce block-in-masks should continue to be bitwise rather than logical. OTOH, logical ORs require their operands to be suitably ordered, whereas predecessors are in general unordered (unlike employing a logical AND for edge masks where the mask of the block naturally precedes the condition of its terminal branch).
Perhaps Not should also become a LogicalNot, generating a select %c, false, true instead of the natural xor %c, true?
I am not entirely sure about changing the IR we generate here, the XOR variant appears to be the canonical for expected by other passes (i.e. that's what IRBuilder.CreateNot generates)
The condition of a terminal branch may be poison, so may its negating NOT, provided the edge mask uses a logical AND to avoid propagating it in case the block is unreachable (has a false block-in-mask).
Note: better eliminate dead edges and nodes from the CFG prior to applying predication, once this is done via VPlan-to-VPlan transformation. Doing so may allow to use bitwise ANDs rather than logical ones, as before.
VPValue *B; | ||
VPValue *C; | ||
VPValue *D; | ||
// Simplify (X && Y) || (X && !Y) -> X. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Keep comments and code consistent: have both use A,B,C,D or both X,Y,Z.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated to use A, B, C, D. The intention was to avoid A == C && B == D in the description, but it is probably more confusing.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How about keeping the description (X && Y) || (X && !Y) -> X and use X, Y, X1, Y1 instead of A, B, C, D in the code, respectively?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Excellent, updated!
VPValue *C; | ||
VPValue *D; | ||
// Simplify (X && Y) || (X && !Y) -> X. | ||
if (match(&R, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
More general and modular: first simplify (A && B) || (C && D) where A == C into A && (B || D). Also for A == D, B == C, B == D. Followed by simplifying (X || !X).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could do, but we will also add logic to simplify newly created recipes. Left as is for now.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ok, worth leaving behind a TODO?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done, thanks!
ping :) |
✅ With the latest revision this PR passed the C/C++ code formatter. |
RecipeTys...>::match(R)) | ||
return false; | ||
if (Opcode == Instruction::And) { | ||
if (!m_False().match(R->getOperand(2))) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This stems from searching for `select i1 %c, i1 %a, i1 0` as the poison-preserving form of `and i1 %c, %a`? Would it be clearer for VPlan to represent poison-preserving AND/OR directly, possibly via a flag, whose execute() emits the desired select patterns, rather than reverse-engineer them?
VPValue *Y; | ||
VPValue *X1; | ||
VPValue *Y1; | ||
// Simplify (X && Y) || (X1 && !Y1) -> X. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
// Simplify (X && Y) || (X1 && !Y1) -> X. | |
// Simplify (X && Y) || (X && !Y) -> X. |
This now more closely matches the code, where repeated occurrences of X and Y are enumerated.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done, thanks!
Add a new opcode to model non-poison propagating logical AND operations used when generating edge masks. This follows the similar decision to model Not as dedicated opcode as well, to improve clarity. This also helps to simplify the matchers for llvm#89386.
"Trying the match constant with unexpected bitwidth."); | ||
return APInt::isSameValue(CI->getValue(), Val); | ||
return (BitWidth == 0 || CI->getBitWidth() == BitWidth) && | ||
APInt::isSameValue(CI->getValue(), Val); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this change still needed, now that Logical Ands are matched whose operands must be single bits, rather than Selects?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Restored, thanks!
|
||
bool match(const VPRecipeBase *R) { | ||
if (!detail::MatchRecipeAndOpcode<Opcode, RecipeTys...>::match(R)) { | ||
if (!detail::MatchRecipeAndOpcode<Instruction::Select, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Still need to match Select here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No, I just updated the PR based on the new LogicalAnd.
RecipeTys...>::match(R)) | ||
return false; | ||
if (Opcode == Instruction::And) { | ||
if (!m_False().match(R->getOperand(2))) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Independently, should createBlockInMask() generate logical OR's using selects as well, possibly by recipes that target IRBuilder::CreateLogicalOr(), rather than plain bitwise OR's via IRBuilder's CreateBinOp()?
Hmm, I am not sure. Would probably warrant a separate look.
Perhaps Not
should also become a LogicalNot
, generating a select %c, false, true
instead of the natural xor %c, true
?
} | ||
|
||
template <typename Op0_t, typename Op1_t> | ||
inline LogicalRecipe_match<Op0_t, Op1_t, Instruction::And, true, VPWidenRecipe, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
inline LogicalRecipe_match<Op0_t, Op1_t, Instruction::And, true, VPWidenRecipe, | |
inline LogicalRecipe_match<Op0_t, Op1_t, VPInstruction::LogicalAnd, true, VPWidenRecipe, |
based on #91897?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just updated the PR based on the new LogicalAnd, thanks!
@@ -273,6 +309,24 @@ inline AllBinaryRecipe_match<Op0_t, Op1_t, Instruction::Or> | |||
m_Or(const Op0_t &Op0, const Op1_t &Op1) { | |||
return m_Binary<Instruction::Or, Op0_t, Op1_t>(Op0, Op1); | |||
} | |||
|
|||
template <typename Op0_t, typename Op1_t> | |||
inline LogicalRecipe_match<Op0_t, Op1_t, Instruction::Or, true, VPWidenRecipe, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
inline LogicalRecipe_match<Op0_t, Op1_t, Instruction::Or, true, VPWidenRecipe, | |
inline LogicalRecipe_match<Op0_t, Op1_t, VPInstruction::LogicalOr, true, VPWidenRecipe, |
for consistency, and supporting createBlockInMask?
Worth commenting true /* Commutable */
, here and/or below?
} | ||
|
||
bool match(const VPRecipeBase *R) { | ||
if (!detail::MatchRecipeAndOpcode<Opcode, RecipeTys...>::match(R)) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Still need to consider R's with non matching Opcode?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The whole matcher is now gone.
return false; | ||
} else if (Opcode == Instruction::Or) { | ||
if (!m_True().match(R->getOperand(1))) | ||
return false; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this folding of And-with-true and Or-with-false redundancies an inherent part of folding the (X&&Y)||(X&&!Y)->X redundancy?
|
||
template <typename Op0_t, typename Op1_t> | ||
inline LogicalRecipe_match<Op0_t, Op1_t, Instruction::And, true, VPWidenRecipe, | ||
VPReplicateRecipe, VPWidenCastRecipe, VPInstruction> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Are all these recipes relevant?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not for the logicalAnd, but in terms of generality we could have ANDs/OR with any of those opcodes, which could potentially benefit.
Conflicts: llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The latest version has been simplified and updated to use LogicalAnd.
"Trying the match constant with unexpected bitwidth."); | ||
return APInt::isSameValue(CI->getValue(), Val); | ||
return (BitWidth == 0 || CI->getBitWidth() == BitWidth) && | ||
APInt::isSameValue(CI->getValue(), Val); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Restored, thanks!
RecipeTys...>::match(R)) | ||
return false; | ||
if (Opcode == Instruction::And) { | ||
if (!m_False().match(R->getOperand(2))) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated now to be based on LogicalAnd.
Perhaps Not should also become a LogicalNot, generating a select %c, false, true instead of the natural xor %c, true?
I am not entirely sure about changing the IR we generate here; the XOR variant appears to be the canonical form expected by other passes (i.e. that's what IRBuilder.CreateNot generates).
|
||
bool match(const VPRecipeBase *R) { | ||
if (!detail::MatchRecipeAndOpcode<Opcode, RecipeTys...>::match(R)) { | ||
if (!detail::MatchRecipeAndOpcode<Instruction::Select, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No, I just updated the PR based on the new LogicalAnd.
} | ||
|
||
template <typename Op0_t, typename Op1_t> | ||
inline LogicalRecipe_match<Op0_t, Op1_t, Instruction::And, true, VPWidenRecipe, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just updated the PR based on the new LogicalAnd, thanks!
|
||
template <typename Op0_t, typename Op1_t> | ||
inline LogicalRecipe_match<Op0_t, Op1_t, Instruction::And, true, VPWidenRecipe, | ||
VPReplicateRecipe, VPWidenCastRecipe, VPInstruction> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not for the logicalAnd, but in terms of generality we could have ANDs/OR with any of those opcodes, which could potentially benefit.
} | ||
|
||
bool match(const VPRecipeBase *R) { | ||
if (!detail::MatchRecipeAndOpcode<Opcode, RecipeTys...>::match(R)) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The whole matcher is now gone.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The latest version is simplified by stripping out the logic to match commutative expressions and limits the matching to the existing BinaryOr (renamed to be more accurate) and the new LogicalAnd matchers.
I'll share a follow-up adding just the commutative matcher
Split off from llvm#89386, this extends the binary matcher to support matching commutative operations. This is used for a new m_c_BinaryOr matcher, used in simplifyRecipe.
The follow-up to add support for commutative matching is available now as well: #92539 |
Reopen as I accidentally hit the close button...... |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for simplifying and extracting the part that deals specifically with the X&&Y || X&&!Y pattern! This by itself results in various levels of subsequent cleanups. Adding a couple of nits.
@@ -270,9 +270,15 @@ m_Mul(const Op0_t &Op0, const Op1_t &Op1) { | |||
|
|||
template <typename Op0_t, typename Op1_t> | |||
inline AllBinaryRecipe_match<Op0_t, Op1_t, Instruction::Or> | |||
m_Or(const Op0_t &Op0, const Op1_t &Op1) { | |||
m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why/Is this distinction of Binary
important, here - to contrast LogicalAnd below (i.e., IllogicalOr ;-), emphasizing Bitwise
vs. Logical (i.e., those allowed to introduce UB vs. those that prevent it), or to accommodate Ternary and multi-ary Or's in the future. Worth updating recipe to print binary-or
or bitwise-or
rather than simply or
?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is to clarify the matcher now that we have separate ones for logical AND and in line with naming of the IR pattern matcher.
For printing, we rely on Instruction::getName to print Instruction opcodes, so I think it would probably be good to keep printing as regular or
to stay consistent with the original/underlying IR
VPValue *X; | ||
VPValue *Y; | ||
VPValue *X1; | ||
VPValue *Y1; | ||
// Simplify (X && Y) || (X && !Y) -> X. | ||
// TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X | ||
// && (Y || Z) and (X || !X) into true. This requires queuing newly created | ||
// recipes to be visited during simplification. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
VPValue *X; | |
VPValue *Y; | |
VPValue *X1; | |
VPValue *Y1; | |
// Simplify (X && Y) || (X && !Y) -> X. | |
// TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X | |
// && (Y || Z) and (X || !X) into true. This requires queuing newly created | |
// recipes to be visited during simplification. | |
// Simplify (X && Y) || (X && !Y) -> X. | |
// TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X | |
// && (Y || Z) and (X || !X) into true. This requires queuing newly created | |
// recipes to be visited during simplification. | |
VPValue *X, Y, X1, Y1; |
nit: better to place definitions together, and closer to their use.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done, thanks!
; CHECK-NEXT: BLEND ir<%p> = ir<0> vp<[[PRED]]>/vp<[[MASK2]]> | ||
; CHECK-NEXT: EMIT vp<[[MASK5:%.+]]> = logical-and vp<[[MASK4]]>, ir<%c.0> | ||
; CHECK-NEXT: EMIT vp<[[MASK3:%.+]]> = logical-and vp<[[MASK1]]>, ir<%c.0> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just noting that X is MASK1, Y is ir<%c.0>; all of !Y, X&&Y, (X&&Y) || (X&&!Y) are eliminated, the latter replaced by X, while X&&Y (MASK2) is retained due to another use - by BRANCH-ON-COUNT.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yep
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP6]], i32 0 | ||
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP8]], i32 0 | ||
; CHECK-NEXT: [[TMP11:%.*]] = or i1 [[TMP9]], [[TMP10]] | ||
; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP11]], ptr [[B]], ptr poison |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nice clean-up once TMP11 is replaced by TMP4, through extractelements, eliminating both logical ANDs and code generating Y (TMP5).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yep
@@ -223,10 +223,9 @@ define void @test_if_then(ptr noalias %a, ptr readnone %b) #4 { | |||
; TFCOMMON-NEXT: [[TMP10:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[TMP9]]) | |||
; TFCOMMON-NEXT: [[TMP11:%.*]] = xor <vscale x 2 x i1> [[TMP8]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer) | |||
; TFCOMMON-NEXT: [[TMP12:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> [[TMP11]], <vscale x 2 x i1> zeroinitializer | |||
; TFCOMMON-NEXT: [[TMP13:%.*]] = or <vscale x 2 x i1> [[TMP9]], [[TMP12]] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This eliminates only the OR, leaving the two logical ANDs intact, as they are used elsewhere.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yep
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ship it!
Simplify a common pattern generated for masks when folding the tail.