-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[VPlan] Remove unused first mask op from VPBlendRecipe. #87770
[VPlan] Remove unused first mask op from VPBlendRecipe. #87770
Conversation
VPBlendRecipe does not use the first mask operand. Removing it allows VPlan-based DCE to remove unused mask computations. This also fixes llvm#87410, where unused Not VPInstructions are considered having only their first lane demanded, but some of their operands providing a vector value due to other users. Fixes llvm#87410
@llvm/pr-subscribers-llvm-transforms Author: Florian Hahn (fhahn) ChangesVPBlendRecipe does not use the first mask operand. Removing it allows VPlan-based DCE to remove unused mask computations. This also fixes #87410, where unused Not VPInstructions are considered having only their first lane demanded, but some of their operands providing a vector value due to other users. Fixes #87410 Patch is 73.62 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/87770.diff 21 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index cb0fd06554e6c3..12d38d4eac9605 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8245,6 +8245,8 @@ VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi,
"Distinct incoming values with one having a full mask");
break;
}
+ if (In == 0)
+ continue;
OperandsWithMask.push_back(EdgeMask);
}
return new VPBlendRecipe(Phi, OperandsWithMask);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 77577b516ae274..e86467202251e5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1932,12 +1932,12 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
class VPBlendRecipe : public VPSingleDefRecipe {
public:
/// The blend operation is a User of the incoming values and of their
- /// respective masks, ordered [I0, M0, I1, M1, ...]. Note that a single value
- /// might be incoming with a full mask for which there is no VPValue.
+ /// respective masks, ordered [I0, I1, M1, ...]. Note that the first incoming
+ /// value does not have a mask associated.
VPBlendRecipe(PHINode *Phi, ArrayRef<VPValue *> Operands)
: VPSingleDefRecipe(VPDef::VPBlendSC, Operands, Phi, Phi->getDebugLoc()) {
assert(Operands.size() > 0 &&
- ((Operands.size() == 1) || (Operands.size() % 2 == 0)) &&
+ ((Operands.size() == 1) || ((Operands.size() + 1) % 2 == 0)) &&
"Expected either a single incoming value or a positive even number "
"of operands");
}
@@ -1951,13 +1951,18 @@ class VPBlendRecipe : public VPSingleDefRecipe {
/// Return the number of incoming values, taking into account that a single
/// incoming value has no mask.
- unsigned getNumIncomingValues() const { return (getNumOperands() + 1) / 2; }
+ unsigned getNumIncomingValues() const { return (getNumOperands() + 2) / 2; }
/// Return incoming value number \p Idx.
- VPValue *getIncomingValue(unsigned Idx) const { return getOperand(Idx * 2); }
+ VPValue *getIncomingValue(unsigned Idx) const {
+ return Idx == 0 ? getOperand(0) : getOperand(Idx * 2 - 1);
+ }
/// Return mask number \p Idx.
- VPValue *getMask(unsigned Idx) const { return getOperand(Idx * 2 + 1); }
+ VPValue *getMask(unsigned Idx) const {
+ assert(Idx > 0);
+ return getOperand(Idx * 2);
+ }
/// Generate the phi/select nodes.
void execute(VPTransformState &State) override;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 1be0287ce7c9e1..4199869f1ffad2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1514,6 +1514,8 @@ void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
O << " ";
getIncomingValue(I)->printAsOperand(O, SlotTracker);
+ if (I == 0)
+ continue;
O << "/";
getMask(I)->printAsOperand(O, SlotTracker);
}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll
index 9dcc751db7cf0e..c544d2a92e6397 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll
@@ -304,8 +304,6 @@ define i32 @cond_xor_reduction(ptr noalias %a, ptr noalias %cond, i64 %N) #0 {
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP14]], i32 0
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP16]], i32 4, <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[TMP17:%.*]] = xor <vscale x 4 x i32> [[VEC_PHI]], [[WIDE_MASKED_LOAD1]]
-; CHECK-NEXT: [[TMP18:%.*]] = xor <vscale x 4 x i1> [[TMP13]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP19:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP18]], <vscale x 4 x i1> zeroinitializer
; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> [[TMP17]], <vscale x 4 x i32> [[VEC_PHI]]
; CHECK-NEXT: [[TMP20]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[PREDPHI]], <vscale x 4 x i32> [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP22]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
index dcd78aa7f1e3d9..7ca1b5395dd013 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
@@ -449,7 +449,6 @@ define void @predicated_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <vscale x 2 x i64> [[BROADCAST_SPLAT]], zeroinitializer
; CHECK-NEXT: [[TMP10:%.*]] = select <vscale x 2 x i1> [[TMP9]], <vscale x 2 x i64> [[BROADCAST_SPLAT]], <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP11:%.*]] = udiv <vscale x 2 x i64> [[WIDE_LOAD]], [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = xor <vscale x 2 x i1> [[TMP9]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP9]], <vscale x 2 x i64> [[TMP11]], <vscale x 2 x i64> [[WIDE_LOAD]]
; CHECK-NEXT: store <vscale x 2 x i64> [[PREDPHI]], ptr [[TMP8]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
@@ -502,8 +501,6 @@ define void @predicated_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
; FIXED-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP7]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> <i64 1, i64 1, i64 1, i64 1>
; FIXED-NEXT: [[TMP10:%.*]] = udiv <4 x i64> [[WIDE_LOAD]], [[TMP8]]
; FIXED-NEXT: [[TMP11:%.*]] = udiv <4 x i64> [[WIDE_LOAD1]], [[TMP9]]
-; FIXED-NEXT: [[TMP12:%.*]] = xor <4 x i1> [[TMP6]], <i1 true, i1 true, i1 true, i1 true>
-; FIXED-NEXT: [[TMP13:%.*]] = xor <4 x i1> [[TMP7]], <i1 true, i1 true, i1 true, i1 true>
; FIXED-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[TMP10]], <4 x i64> [[WIDE_LOAD]]
; FIXED-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[TMP7]], <4 x i64> [[TMP11]], <4 x i64> [[WIDE_LOAD1]]
; FIXED-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP4]], align 8
@@ -583,7 +580,6 @@ define void @predicated_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <vscale x 2 x i64> [[BROADCAST_SPLAT]], zeroinitializer
; CHECK-NEXT: [[TMP10:%.*]] = select <vscale x 2 x i1> [[TMP9]], <vscale x 2 x i64> [[BROADCAST_SPLAT]], <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP11:%.*]] = sdiv <vscale x 2 x i64> [[WIDE_LOAD]], [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = xor <vscale x 2 x i1> [[TMP9]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP9]], <vscale x 2 x i64> [[TMP11]], <vscale x 2 x i64> [[WIDE_LOAD]]
; CHECK-NEXT: store <vscale x 2 x i64> [[PREDPHI]], ptr [[TMP8]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
@@ -636,8 +632,6 @@ define void @predicated_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
; FIXED-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP7]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> <i64 1, i64 1, i64 1, i64 1>
; FIXED-NEXT: [[TMP10:%.*]] = sdiv <4 x i64> [[WIDE_LOAD]], [[TMP8]]
; FIXED-NEXT: [[TMP11:%.*]] = sdiv <4 x i64> [[WIDE_LOAD1]], [[TMP9]]
-; FIXED-NEXT: [[TMP12:%.*]] = xor <4 x i1> [[TMP6]], <i1 true, i1 true, i1 true, i1 true>
-; FIXED-NEXT: [[TMP13:%.*]] = xor <4 x i1> [[TMP7]], <i1 true, i1 true, i1 true, i1 true>
; FIXED-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[TMP10]], <4 x i64> [[WIDE_LOAD]]
; FIXED-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[TMP7]], <4 x i64> [[TMP11]], <4 x i64> [[WIDE_LOAD1]]
; FIXED-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP4]], align 8
@@ -714,7 +708,6 @@ define void @predicated_udiv_by_constant(ptr noalias nocapture %a, i64 %n) {
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP8]], align 8
; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <vscale x 2 x i64> [[WIDE_LOAD]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 42, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP10:%.*]] = udiv <vscale x 2 x i64> [[WIDE_LOAD]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 27, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP11:%.*]] = xor <vscale x 2 x i1> [[TMP9]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP9]], <vscale x 2 x i64> [[TMP10]], <vscale x 2 x i64> [[WIDE_LOAD]]
; CHECK-NEXT: store <vscale x 2 x i64> [[PREDPHI]], ptr [[TMP8]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
@@ -763,8 +756,6 @@ define void @predicated_udiv_by_constant(ptr noalias nocapture %a, i64 %n) {
; FIXED-NEXT: [[TMP7:%.*]] = icmp ne <4 x i64> [[WIDE_LOAD1]], <i64 42, i64 42, i64 42, i64 42>
; FIXED-NEXT: [[TMP8:%.*]] = udiv <4 x i64> [[WIDE_LOAD]], <i64 27, i64 27, i64 27, i64 27>
; FIXED-NEXT: [[TMP9:%.*]] = udiv <4 x i64> [[WIDE_LOAD1]], <i64 27, i64 27, i64 27, i64 27>
-; FIXED-NEXT: [[TMP10:%.*]] = xor <4 x i1> [[TMP6]], <i1 true, i1 true, i1 true, i1 true>
-; FIXED-NEXT: [[TMP11:%.*]] = xor <4 x i1> [[TMP7]], <i1 true, i1 true, i1 true, i1 true>
; FIXED-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[TMP8]], <4 x i64> [[WIDE_LOAD]]
; FIXED-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[TMP7]], <4 x i64> [[TMP9]], <4 x i64> [[WIDE_LOAD1]]
; FIXED-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP4]], align 8
@@ -841,7 +832,6 @@ define void @predicated_sdiv_by_constant(ptr noalias nocapture %a, i64 %n) {
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP8]], align 8
; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <vscale x 2 x i64> [[WIDE_LOAD]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 42, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP10:%.*]] = sdiv <vscale x 2 x i64> [[WIDE_LOAD]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 27, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP11:%.*]] = xor <vscale x 2 x i1> [[TMP9]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP9]], <vscale x 2 x i64> [[TMP10]], <vscale x 2 x i64> [[WIDE_LOAD]]
; CHECK-NEXT: store <vscale x 2 x i64> [[PREDPHI]], ptr [[TMP8]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
@@ -890,8 +880,6 @@ define void @predicated_sdiv_by_constant(ptr noalias nocapture %a, i64 %n) {
; FIXED-NEXT: [[TMP7:%.*]] = icmp ne <4 x i64> [[WIDE_LOAD1]], <i64 42, i64 42, i64 42, i64 42>
; FIXED-NEXT: [[TMP8:%.*]] = sdiv <4 x i64> [[WIDE_LOAD]], <i64 27, i64 27, i64 27, i64 27>
; FIXED-NEXT: [[TMP9:%.*]] = sdiv <4 x i64> [[WIDE_LOAD1]], <i64 27, i64 27, i64 27, i64 27>
-; FIXED-NEXT: [[TMP10:%.*]] = xor <4 x i1> [[TMP6]], <i1 true, i1 true, i1 true, i1 true>
-; FIXED-NEXT: [[TMP11:%.*]] = xor <4 x i1> [[TMP7]], <i1 true, i1 true, i1 true, i1 true>
; FIXED-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[TMP8]], <4 x i64> [[WIDE_LOAD]]
; FIXED-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[TMP7]], <4 x i64> [[TMP9]], <4 x i64> [[WIDE_LOAD1]]
; FIXED-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP4]], align 8
@@ -969,7 +957,6 @@ define void @predicated_sdiv_by_minus_one(ptr noalias nocapture %a, i64 %n) {
; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <vscale x 16 x i8> [[WIDE_LOAD]], shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 -128, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP10:%.*]] = select <vscale x 16 x i1> [[TMP9]], <vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 -1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP11:%.*]] = sdiv <vscale x 16 x i8> [[WIDE_LOAD]], [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = xor <vscale x 16 x i1> [[TMP9]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 16 x i1> [[TMP9]], <vscale x 16 x i8> [[TMP11]], <vscale x 16 x i8> [[WIDE_LOAD]]
; CHECK-NEXT: store <vscale x 16 x i8> [[PREDPHI]], ptr [[TMP8]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
@@ -1020,8 +1007,6 @@ define void @predicated_sdiv_by_minus_one(ptr noalias nocapture %a, i64 %n) {
; FIXED-NEXT: [[TMP9:%.*]] = select <32 x i1> [[TMP7]], <32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <32 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
; FIXED-NEXT: [[TMP10:%.*]] = sdiv <32 x i8> [[WIDE_LOAD]], [[TMP8]]
; FIXED-NEXT: [[TMP11:%.*]] = sdiv <32 x i8> [[WIDE_LOAD1]], [[TMP9]]
-; FIXED-NEXT: [[TMP12:%.*]] = xor <32 x i1> [[TMP6]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
-; FIXED-NEXT: [[TMP13:%.*]] = xor <32 x i1> [[TMP7]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
; FIXED-NEXT: [[PREDPHI:%.*]] = select <32 x i1> [[TMP6]], <32 x i8> [[TMP10]], <32 x i8> [[WIDE_LOAD]]
; FIXED-NEXT: [[PREDPHI2:%.*]] = select <32 x i1> [[TMP7]], <32 x i8> [[TMP11]], <32 x i8> [[WIDE_LOAD1]]
; FIXED-NEXT: store <32 x i8> [[PREDPHI]], ptr [[TMP4]], align 1
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll
index 34a7987bb40abe..19e19a1a31effd 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll
@@ -412,7 +412,6 @@ define i32 @pred_select_const_i32_from_icmp(ptr noalias nocapture readonly %src1
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP6]], i32 4, <4 x i1> [[TMP4]], <4 x i32> poison)
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <4 x i32> [[WIDE_MASKED_LOAD]], <i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP8]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32> [[VEC_PHI]]
-; CHECK-NEXT: [[TMP10:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
; CHECK-NEXT: [[PREDPHI]] = select <4 x i1> [[TMP4]], <4 x i32> [[TMP9]], <4 x i32> [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -444,7 +443,6 @@ define i32 @pred_select_const_i32_from_icmp(ptr noalias nocapture readonly %src1
; SCALABLE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP10]], i32 4, <vscale x 4 x i1> [[TMP8]], <vscale x 4 x i32> poison)
; SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
; SCALABLE-NEXT: [[TMP13:%.*]] = select <vscale x 4 x i1> [[TMP12]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> [[VEC_PHI]]
-; SCALABLE-NEXT: [[TMP14:%.*]] = xor <vscale x 4 x i1> [[TMP8]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
; SCALABLE-NEXT: [[PREDPHI]] = select <vscale x 4 x i1> [[TMP8]], <vscale x 4 x i32> [[TMP13]], <vscale x 4 x i32> [[VEC_PHI]]
; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]]
; SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
index dcfa9bb105b626..1ce4cb928e8085 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
@@ -346,7 +346,6 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
; SCALABLE-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0
; SCALABLE-NEXT: [[TMP11:%.*]] = icmp ugt <vscale x 2 x i64> [[VEC_IND]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 10, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
; SCALABLE-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT]], i32 8, <vscale x 2 x i1> [[TMP11]], <vscale x 2 x i64> poison)
-; SCALABLE-NEXT: [[TMP12:%.*]] = xor <vscale x 2 x i1> [[TMP11]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
; SCALABLE-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP11]], <vscale x 2 x i64> [[WIDE_MASKED_GATHER]], <vscale x 2 x i64> zeroinitializer
; SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP10]]
; SCALABLE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i32 0
@@ -395,8 +394,6 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
; FIXEDLEN-NEXT: [[TMP3:%.*]] = icmp ugt <4 x i64> [[STEP_ADD]], <i64 10, i64 10, i64 10, i64 10>
; FIXEDLEN-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[BROADCAST_SPLAT]], i32 8, <4 x i1> [[TMP2]], <4 x i64> poison)
; FIXEDLEN-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[BROADCAST_SPLAT]], i32 8, <4 x i1> [[TMP3]], <4 x i64> poison)
-; FIXEDLEN-NEXT: [[TMP4:%.*]] = xor <4 x i1> [[TMP2]], <i1 true, i1 true, i1 true, i1 true>
-; FIXEDLEN-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP3]], <i1 true, i1 ...
[truncated]
|
Nice clean-up of a redundant operand! Adding minor nits. In general, it may be worth selecting which mask operand to drop, e.g., the one most expensive to compute, rather than picking the one listed first arbitrarily. Perhaps worth a TODO somewhere.
This appeases the specific reproducer provided by #87410, but does it fix the underlying issue - which is teaching mask generation operations (select, or, blend) to support only-first-lane-used cases? Worth retaining a(nother) reproducer, e.g., with the problematic mask operand being non-first (which could be worked-around by carefully selecting it for omission), and/or with multiple problematic mask operands. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks good to me, thanks for cleaning this up!
Added minor nits.
@@ -8245,6 +8245,8 @@ VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, | |||
"Distinct incoming values with one having a full mask"); | |||
break; | |||
} | |||
if (In == 0) | |||
continue; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: may be better to early-continue earlier - before creating the edge mask, which for first operand is needed only to assert, and for other operands can simplify the assert. Can also simplify simplifyRecipe() to look for single-operand Blends, by doing, e.g.,
OperandsWithMask.push_back(Operands[0]);
if (all_equal(Operands)) /* This case is optimized away later (can be simplified). */
return new VPBlendRecipe(Phi, OperandsWithMask);
/* First operand must have a non-full mask, which is ignored. */
assert(createEdgeMask(Phi->getIncomingBlock(0), Phi->getParent()) &&
"Distinct incoming values with one having a full mask");
for (unsigned In = 1; In < NumIncoming; In++) {
OperandsWithMask.push_back(Operands[In]);
VPValue *EdgeMask =
createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent());
assert(EdgeMask && "Both null and non-null edge masks found");
OperandsWithMask.push_back(EdgeMask);
}
return new VPBlendRecipe(Phi, OperandsWithMask);
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Unfortunately we can't use an early continue at the top, as null masks cannot be added as operands due to them not being proper VPValues. The early continue would mean we break after adding 2 incoming values but no mask. Left as is for now, but pushed 9430a4b to replace createEdgeMask
to with getEdgeMask
to make it clear that no new edge-mask is created, only the existing one retrieved.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
pushed 9430a4b to replace createEdgeMask to with getEdgeMask to make it clear that no new edge-mask is created, only the existing one retrieved.
Good catch!
Unfortunately we can't use an early continue at the top ...
Hence the restructuring proposed above, which may look clearer, early returning after inserting the first operand (w/o its mask) if all operands are the same. Otherwise the masks must be non-null. This takes care of simplifyRecipe(), leaving it to do copy elimination (remove blends of a single element).
@@ -1932,12 +1932,12 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe { | |||
class VPBlendRecipe : public VPSingleDefRecipe { | |||
public: | |||
/// The blend operation is a User of the incoming values and of their | |||
/// respective masks, ordered [I0, M0, I1, M1, ...]. Note that a single value | |||
/// might be incoming with a full mask for which there is no VPValue. | |||
/// respective masks, ordered [I0, I1, M1, ...]. Note that the first incoming |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
/// respective masks, ordered [I0, I1, M1, ...]. Note that the first incoming | |
/// respective masks, ordered [I0, I1, M1, I2, M2, ...]. Note that the first incoming |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added, thanks!
((Operands.size() == 1) || ((Operands.size() + 1) % 2 == 0)) && | ||
"Expected either a single incoming value or a positive even number " | ||
"of operands"); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
((Operands.size() == 1) || ((Operands.size() + 1) % 2 == 0)) && | |
"Expected either a single incoming value or a positive even number " | |
"of operands"); | |
((Operands.size() + 1) % 2 == 0)) && "Expected an odd number of operands"); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated, thanks!
/// Return the number of incoming values, taking into account that a single | ||
/// incoming value has no mask. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
/// Return the number of incoming values, taking into account that a single | |
/// incoming value has no mask. | |
/// Return the number of incoming values, taking into account that the first | |
/// incoming value has no mask. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
updated, thanks!
@@ -1951,13 +1951,18 @@ class VPBlendRecipe : public VPSingleDefRecipe { | |||
|
|||
/// Return the number of incoming values, taking into account that a single | |||
/// incoming value has no mask. | |||
unsigned getNumIncomingValues() const { return (getNumOperands() + 1) / 2; } | |||
unsigned getNumIncomingValues() const { return (getNumOperands() + 2) / 2; } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Original code that rounds-up a division by 2 should still be ok?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, restored, thanks!
|
||
/// Return mask number \p Idx. | ||
VPValue *getMask(unsigned Idx) const { return getOperand(Idx * 2 + 1); } | ||
VPValue *getMask(unsigned Idx) const { | ||
assert(Idx > 0); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
assert(Idx > 0); | |
assert(Idx > 0 && "First index has no mask associated"); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added, thanks!
br label %loop.latch | ||
|
||
loop.latch: | ||
%merge = phi ptr [ %B, %else.2 ], [ poison, %loop.header ] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This still breaks if the operands are swapped:
%merge = phi ptr [ poison, %loop.header ], [ %B, %else.2 ]
so that the mask associated with the %loop.header predecessor gets dropped,
but the mask of the problematic %else.2 predecessor is still used to compute the blend.
br label %loop.latch | ||
|
||
loop.latch: | ||
%merge = phi ptr [ %B, %else.2 ], [ poison, %loop.header ] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This still breaks if the operands are swapped:
%merge = phi ptr [ poison, %loop.header ], [ %B, %else.2 ]
so that the mask associated with the %loop.header predecessor gets dropped,
but the mask of the problematic %else.2 predecessor is still used to compute the blend.
…irst-mask-for-blend Conflicts: llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll llvm/test/Transforms/LoopVectorize/select-cmp-predicated.ll
…irst-mask-for-blend
assert(Operands.size() > 0 && (Operands.size() + 1) % 2 == 0 && | ||
"Expected an odd number of operands"); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
assert(Operands.size() > 0 && (Operands.size() + 1) % 2 == 0 && | |
"Expected an odd number of operands"); | |
assert(Operands.size() % 2 == 1 && "Expected an odd number of operands"); |
?
✅ With the latest revision this PR passed the C/C++ code formatter. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Addressed comments, thanks!
VPBlendRecipe does not use the first mask operand. Removing it allows VPlan-based DCE to remove unused mask computations.
Nice clean-up of a redundant operand! Adding minor nits. In general, it may be worth selecting which mask operand to drop, e.g., the one most expensive to compute, rather than picking the one listed first arbitrarily. Perhaps worth a TODO somewhere.
Updated and added TODO where VPlendRecipes are created, thanks!
This also fixes #87410, where unused Not VPInstructions are considered having only their first lane demanded, but some of their operands providing a vector value due to other users.
Fixes #87410This appeases the specific reproducer provided by #87410, but does it fix the underlying issue - which is teaching mask generation operations (select, or, blend) to support only-first-lane-used cases? Worth retaining a(nother) reproducer, e.g., with the problematic mask operand being non-first (which could be worked-around by carefully selecting it for omission), and/or with multiple problematic mask operands.
Ah yes, some cases remain. Will follow up.
@@ -8245,6 +8245,8 @@ VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, | |||
"Distinct incoming values with one having a full mask"); | |||
break; | |||
} | |||
if (In == 0) | |||
continue; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Unfortunately we can't use an early continue at the top, as null masks cannot be added as operands due to them not being proper VPValues. The early continue would mean we break after adding 2 incoming values but no mask. Left as is for now, but pushed 9430a4b to replace createEdgeMask
to with getEdgeMask
to make it clear that no new edge-mask is created, only the existing one retrieved.
@@ -1932,12 +1932,12 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe { | |||
class VPBlendRecipe : public VPSingleDefRecipe { | |||
public: | |||
/// The blend operation is a User of the incoming values and of their | |||
/// respective masks, ordered [I0, M0, I1, M1, ...]. Note that a single value | |||
/// might be incoming with a full mask for which there is no VPValue. | |||
/// respective masks, ordered [I0, I1, M1, ...]. Note that the first incoming |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added, thanks!
((Operands.size() == 1) || ((Operands.size() + 1) % 2 == 0)) && | ||
"Expected either a single incoming value or a positive even number " | ||
"of operands"); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated, thanks!
/// Return the number of incoming values, taking into account that a single | ||
/// incoming value has no mask. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
updated, thanks!
@@ -1951,13 +1951,18 @@ class VPBlendRecipe : public VPSingleDefRecipe { | |||
|
|||
/// Return the number of incoming values, taking into account that a single | |||
/// incoming value has no mask. | |||
unsigned getNumIncomingValues() const { return (getNumOperands() + 1) / 2; } | |||
unsigned getNumIncomingValues() const { return (getNumOperands() + 2) / 2; } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, restored, thanks!
|
||
/// Return mask number \p Idx. | ||
VPValue *getMask(unsigned Idx) const { return getOperand(Idx * 2 + 1); } | ||
VPValue *getMask(unsigned Idx) const { | ||
assert(Idx > 0); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added, thanks!
((Operands.size() == 1) || (Operands.size() % 2 == 0)) && | ||
"Expected either a single incoming value or a positive even number " | ||
"of operands"); | ||
assert((Operands.size() + 1) % 2 == 0 && |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: is this better than asserting that the number of operands is odd by Operands.size() % 2 == 1
?
VPBlendRecipe does not use the first mask operand. Removing it allows VPlan-based DCE to remove unused mask computations.
This also fixes #87410, where unused Not VPInstructions are considered having only their first lane demanded, but some of their operands providing a vector value due to other users.
Fixes #87410