Reland [VPlan] Expand WidenInt inductions with nuw/nsw #168354
Conversation
Changes: The previous patch had to be reverted due to a mismatching-OpType assert in CSE. A reduced test corresponding to an RVV pointer induction has now been added, and the pointer-induction case has been updated to use createOverflowingOp. While at it, record VPIRFlags in VPWidenInductionRecipe.
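As a quick illustration of the intended effect, consider a minimal sketch (made-up function and value names, not a test from this patch): given a scalar loop whose induction increment already carries nuw/nsw, the widened vector induction update emitted by the loop vectorizer can now inherit those flags, e.g. add nuw nsw <4 x i64> %vec.ind, splat (i64 4) instead of a flagless add.

define void @sketch(ptr %dst) {
entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  %gep = getelementptr inbounds i64, ptr %dst, i64 %iv
  store i64 %iv, ptr %gep, align 8
  ; The scalar increment cannot wrap; after this patch the widened
  ; induction update is expected to carry the same nuw/nsw flags.
  %iv.next = add nuw nsw i64 %iv, 1
  %ec = icmp eq i64 %iv.next, 1000
  br i1 %ec, label %exit, label %loop

exit:
  ret void
}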
@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-backend-risc-v
Author: Ramkumar Ramachandra (artagnon)
Patch is 487.72 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/168354.diff
121 Files Affected:
diff --git a/flang/test/Integration/unroll-loops.f90 b/flang/test/Integration/unroll-loops.f90
index 87ab9efeb703b..2c4a3495eb0b7 100644
--- a/flang/test/Integration/unroll-loops.f90
+++ b/flang/test/Integration/unroll-loops.f90
@@ -25,7 +25,7 @@ subroutine unroll(a)
! NO-UNROLL-NEXT: %[[GEP:.*]] = getelementptr i64, ptr %[[ARG0]], i64 %[[IND]]
! NO-UNROLL-NEXT: store <2 x i64> %[[VIND]], ptr %[[GEP]]
! NO-UNROLL-NEXT: %[[NIV:.*]] = add nuw i64 %{{.*}}, 2
- ! NO-UNROLL-NEXT: %[[NVIND]] = add <2 x i64> %[[VIND]], splat (i64 2)
+ ! NO-UNROLL-NEXT: %[[NVIND]] = add nuw nsw <2 x i64> %[[VIND]], splat (i64 2)
!
! UNROLL-NEXT: %[[VIND1:.*]] = add <2 x i64> %[[VIND]], splat (i64 2)
! UNROLL-NEXT: %[[GEP0:.*]] = getelementptr i64, ptr %[[ARG0]], i64 %[[IND]]
diff --git a/flang/test/Lower/HLFIR/unroll-loops.fir b/flang/test/Lower/HLFIR/unroll-loops.fir
index 89e8ce82d6f3f..1ccb6b1bd0315 100644
--- a/flang/test/Lower/HLFIR/unroll-loops.fir
+++ b/flang/test/Lower/HLFIR/unroll-loops.fir
@@ -27,7 +27,7 @@ func.func @unroll(%arg0: !fir.ref<!fir.array<1000 x index>> {fir.bindc_name = "a
// NO-UNROLL-NEXT: %[[GEP:.*]] = getelementptr i64, ptr %[[ARG0]], i64 %[[IND]]
// NO-UNROLL-NEXT: store <2 x i64> %[[VIND]], ptr %[[GEP]]
// NO-UNROLL-NEXT: %[[NIV:.*]] = add nuw i64 %{{.*}}, 2
- // NO-UNROLL-NEXT: %[[NVIND]] = add <2 x i64> %[[VIND]], splat (i64 2)
+ // NO-UNROLL-NEXT: %[[NVIND]] = add nuw nsw <2 x i64> %[[VIND]], splat (i64 2)
// UNROLL-NEXT: %[[VIND1:.*]] = add <2 x i64> %[[VIND]], splat (i64 2)
// UNROLL-NEXT: %[[GEP0:.*]] = getelementptr i64, ptr %[[ARG0]], i64 %[[IND]]
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 04b05627fa769..5dc3175382254 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -178,11 +178,10 @@ class VPBuilder {
new VPInstructionWithType(Opcode, Operands, ResultTy, Flags, DL, Name));
}
- VPInstruction *createOverflowingOp(unsigned Opcode,
- ArrayRef<VPValue *> Operands,
- VPRecipeWithIRFlags::WrapFlagsTy WrapFlags,
- DebugLoc DL = DebugLoc::getUnknown(),
- const Twine &Name = "") {
+ VPInstruction *createOverflowingOp(
+ unsigned Opcode, ArrayRef<VPValue *> Operands,
+ VPRecipeWithIRFlags::WrapFlagsTy WrapFlags = {false, false},
+ DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") {
return tryInsertInstruction(
new VPInstruction(Opcode, Operands, WrapFlags, {}, DL, Name));
}
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index cbfbc29360b0b..10bd6cd471152 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7639,6 +7639,10 @@ createWidenInductionRecipes(VPInstruction *PhiR,
assert(Plan.getLiveIn(IndDesc.getStartValue()) == Start &&
"Start VPValue must match IndDesc's start value");
+ // It is always safe to copy over the NoWrap and FastMath flags. In
+ // particular, when folding tail by masking, the masked-off lanes are never
+ // used, so it is safe.
+ VPIRFlags Flags = vputils::getFlagsFromIndDesc(IndDesc);
VPValue *Step =
vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep());
@@ -7651,7 +7655,7 @@ createWidenInductionRecipes(VPInstruction *PhiR,
PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingInstr());
return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
- IndDesc, PhiR->getDebugLoc());
+ IndDesc, Flags, PhiR->getDebugLoc());
}
VPHeaderPHIRecipe *
@@ -7705,10 +7709,15 @@ VPRecipeBuilder::tryToOptimizeInductionTruncate(VPInstruction *VPI,
PHINode *Phi = WidenIV->getPHINode();
VPValue *Start = WidenIV->getStartValue();
const InductionDescriptor &IndDesc = WidenIV->getInductionDescriptor();
+
+ // It is always safe to copy over the NoWrap and FastMath flags. In
+ // particular, when folding tail by masking, the masked-off lanes are never
+ // used, so it is safe.
+ VPIRFlags Flags = vputils::getFlagsFromIndDesc(IndDesc);
VPValue *Step =
vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep());
- return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
- IndDesc, I, VPI->getDebugLoc());
+ return new VPWidenIntOrFpInductionRecipe(
+ Phi, Start, Step, &Plan.getVF(), IndDesc, I, Flags, VPI->getDebugLoc());
}
VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(VPInstruction *VPI,
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 754c6b50ae028..91b78044c4d17 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2123,7 +2123,8 @@ class VPWidenInductionRecipe : public VPHeaderPHIRecipe {
/// A recipe for handling phi nodes of integer and floating-point inductions,
/// producing their vector values. This is an abstract recipe and must be
/// converted to concrete recipes before executing.
-class VPWidenIntOrFpInductionRecipe : public VPWidenInductionRecipe {
+class VPWidenIntOrFpInductionRecipe : public VPWidenInductionRecipe,
+ public VPIRFlags {
TruncInst *Trunc;
// If this recipe is unrolled it will have 2 additional operands.
@@ -2132,19 +2133,20 @@ class VPWidenIntOrFpInductionRecipe : public VPWidenInductionRecipe {
public:
VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step,
VPValue *VF, const InductionDescriptor &IndDesc,
- DebugLoc DL)
+ const VPIRFlags &Flags, DebugLoc DL)
: VPWidenInductionRecipe(VPDef::VPWidenIntOrFpInductionSC, IV, Start,
Step, IndDesc, DL),
- Trunc(nullptr) {
+ VPIRFlags(Flags), Trunc(nullptr) {
addOperand(VF);
}
VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step,
VPValue *VF, const InductionDescriptor &IndDesc,
- TruncInst *Trunc, DebugLoc DL)
+ TruncInst *Trunc, const VPIRFlags &Flags,
+ DebugLoc DL)
: VPWidenInductionRecipe(VPDef::VPWidenIntOrFpInductionSC, IV, Start,
Step, IndDesc, DL),
- Trunc(Trunc) {
+ VPIRFlags(Flags), Trunc(Trunc) {
addOperand(VF);
SmallVector<std::pair<unsigned, MDNode *>> Metadata;
(void)Metadata;
@@ -2158,7 +2160,7 @@ class VPWidenIntOrFpInductionRecipe : public VPWidenInductionRecipe {
VPWidenIntOrFpInductionRecipe *clone() override {
return new VPWidenIntOrFpInductionRecipe(
getPHINode(), getStartValue(), getStepValue(), getVFValue(),
- getInductionDescriptor(), Trunc, getDebugLoc());
+ getInductionDescriptor(), Trunc, *this, getDebugLoc());
}
VP_CLASSOF_IMPL(VPDef::VPWidenIntOrFpInductionSC)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 84bf0d525b86e..8702dc840e67c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2382,7 +2382,9 @@ void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent;
printAsOperand(O, SlotTracker);
- O << " = WIDEN-INDUCTION ";
+ O << " = WIDEN-INDUCTION";
+ printFlags(O);
+ O << " ";
printOperands(O, SlotTracker);
if (auto *TI = getTruncInst())
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 9bb61308cb7d9..bbeb447de45cb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -76,8 +76,13 @@ bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
VPValue *Start = Plan.getOrAddLiveIn(II->getStartValue());
VPValue *Step =
vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep());
+ // It is always safe to copy over the NoWrap and FastMath flags. In
+ // particular, when folding tail by masking, the masked-off lanes are
+ // never used, so it is safe.
+ VPIRFlags Flags = vputils::getFlagsFromIndDesc(*II);
NewRecipe = new VPWidenIntOrFpInductionRecipe(
- Phi, Start, Step, &Plan.getVF(), *II, Ingredient.getDebugLoc());
+ Phi, Start, Step, &Plan.getVF(), *II, Flags,
+ Ingredient.getDebugLoc());
}
} else {
assert(isa<VPInstruction>(&Ingredient) &&
@@ -542,6 +547,11 @@ static void removeRedundantCanonicalIVs(VPlan &Plan) {
// only.
if (!vputils::onlyScalarValuesUsed(WidenOriginalIV) ||
vputils::onlyFirstLaneUsed(WidenNewIV)) {
+ // We are replacing a wide canonical iv with a suitable wide induction.
+ // This is used to compute header mask, hence all lanes will be used and
+ // we need to drop wrap flags only applying to lanes guaranteed to execute
+ // in the original scalar loop.
+ WidenOriginalIV->dropPoisonGeneratingFlags();
WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
WidenNewIV->eraseFromParent();
return;
@@ -3285,16 +3295,13 @@ expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR,
const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
Instruction::BinaryOps AddOp;
Instruction::BinaryOps MulOp;
- // FIXME: The newly created binary instructions should contain nsw/nuw
- // flags, which can be found from the original scalar operations.
- VPIRFlags Flags;
+ VPIRFlags Flags = *WidenIVR;
if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
AddOp = Instruction::Add;
MulOp = Instruction::Mul;
} else {
AddOp = ID.getInductionOpcode();
MulOp = Instruction::FMul;
- Flags = ID.getInductionBinOp()->getFastMathFlags();
}
// If the phi is truncated, truncate the start and step values.
@@ -3406,7 +3413,7 @@ static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R,
Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
Type *StepTy = TypeInfo.inferScalarType(Step);
VPValue *Offset = Builder.createNaryOp(VPInstruction::StepVector, {}, StepTy);
- Offset = Builder.createNaryOp(Instruction::Mul, {Offset, Step});
+ Offset = Builder.createOverflowingOp(Instruction::Mul, {Offset, Step});
VPValue *PtrAdd = Builder.createNaryOp(
VPInstruction::WidePtrAdd, {ScalarPtrPhi, Offset}, DL, "vector.gep");
R->replaceAllUsesWith(PtrAdd);
@@ -3416,7 +3423,7 @@ static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R,
Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
VF = Builder.createScalarZExtOrTrunc(VF, StepTy, TypeInfo.inferScalarType(VF),
DL);
- VPValue *Inc = Builder.createNaryOp(Instruction::Mul, {Step, VF});
+ VPValue *Inc = Builder.createOverflowingOp(Instruction::Mul, {Step, VF});
VPValue *InductionGEP =
Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
index df1613d760a04..51bafe0846141 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -73,6 +73,19 @@ std::optional<VPValue *>
getRecipesForUncountableExit(VPlan &Plan,
SmallVectorImpl<VPRecipeBase *> &Recipes,
SmallVectorImpl<VPRecipeBase *> &GEPs);
+
+/// Extracts and returns NoWrap and FastMath flags from the induction binop in
+/// \p ID.
+inline VPIRFlags getFlagsFromIndDesc(const InductionDescriptor &ID) {
+ if (ID.getKind() == InductionDescriptor::IK_FpInduction)
+ return ID.getInductionBinOp()->getFastMathFlags();
+
+ if (auto *OBO = dyn_cast_if_present<OverflowingBinaryOperator>(
+ ID.getInductionBinOp()))
+ return VPIRFlags::WrapFlagsTy(OBO->hasNoUnsignedWrap(),
+ OBO->hasNoSignedWrap());
+ return {};
+}
} // namespace vputils
//===----------------------------------------------------------------------===//
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
index 0415b01d78b46..ac8095ae5c3e7 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
@@ -14,8 +14,8 @@ define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range(1,1
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[VAL]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
-; CHECK-NEXT: [[TMP7:%.*]] = mul <vscale x 8 x i64> [[TMP8]], splat (i64 1)
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 8 x i64> zeroinitializer, [[TMP7]]
+; CHECK-NEXT: [[TMP3:%.*]] = mul <vscale x 8 x i64> [[TMP8]], splat (i64 1)
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 8 x i64> zeroinitializer, [[TMP3]]
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[TMP1]], i64 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[DOTSPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
@@ -76,8 +76,8 @@ define void @clamped_tc_max_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[VAL]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
-; CHECK-NEXT: [[TMP7:%.*]] = mul <vscale x 8 x i64> [[TMP8]], splat (i64 1)
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 8 x i64> zeroinitializer, [[TMP7]]
+; CHECK-NEXT: [[TMP3:%.*]] = mul <vscale x 8 x i64> [[TMP8]], splat (i64 1)
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 8 x i64> zeroinitializer, [[TMP3]]
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[TMP1]], i64 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[DOTSPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
index 2f7e3568d5654..cb4bd793013b1 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
@@ -1052,7 +1052,7 @@ define void @redundant_branch_and_tail_folding(ptr %dst, i1 %c) {
; DEFAULT-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
; DEFAULT-NEXT: store i32 [[TMP2]], ptr [[DST]], align 4
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], splat (i64 4)
+; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[STEP_ADD]], splat (i64 4)
; DEFAULT-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
; DEFAULT-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
; DEFAULT: [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll
index a49f089bd2085..2180f18750bf2 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll
@@ -35,7 +35,7 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) {
; CHECK-NEXT: [[TMP10]] = select <16 x i1> [[TMP17]], <16 x i8> [[VEC_IND]], <16 x i8> [[VEC_PHI]]
; CHECK-NEXT: [[TMP11]] = select <16 x i1> [[TMP23]], <16 x i8> [[STEP_ADD]], <16 x i8> [[VEC_PHI2]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 32
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <16 x i8> [[STEP_ADD]], splat (i8 16)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <16 x i8> [[STEP_ADD]], splat (i8 16)
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
@@ -48,7 +48,7 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) {
; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
; CHECK-NEXT: [[IND_END:%.*]] = trunc i32 [[N_VEC]] to i8
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i32 [[N_MOD_VF]], 8
-; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]]
+; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
; CHECK: [[VEC_EPILOG_PH]]:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[TMP3]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -62,11 +62,11 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) {
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i8> [[DOTSPLATINSERT]], <8 x i8> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: [[DOTSPLATINSERT10:%.*]] = insertelement <8 x i8> poison, i8 [[BC_RESUME_VAL]], i64 0
; CHECK-NEXT: [[DOTSPLAT11:%.*]] = shufflevector <8 x i8> [[DOTSPLATINSERT10]], <8 x i8> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <8 x i8> [[DOTSPLAT11]], <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
+; CHECK-NEXT: [[INDUCTION:%.*]] = add nuw nsw <8 x i8> [[DOTSPLAT11]], <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
; CHECK: [[VEC_EPILOG_VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX6:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT13:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND7:%.*]] = phi <8 x i8> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT8:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND7:%.*]] = phi <8 x i8> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT13:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI9:%.*]] = phi <8 x i8> [ [[DOTSPLAT]], %[[VEC_EPILOG_PH]] ], [ [[TMP20:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[IV:%.*]] = trunc i32 [[INDEX6]] to i8
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[A]], i8 [[IV]]
@@ -74,9 +74,9 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) {
; CHECK-NEXT: [[TMP19:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD12]], splat (i8 3)
; CHECK-NEXT: [[TMP20]] = select <8 x i1> [[TMP19]], <8 x i8> [[VEC_IND7]], <8 x i8> [[VEC_PHI9]]
; CHECK-NEXT: [[INDEX_NEXT13]] = add nuw i32 [[INDEX6]], 8
-; CHECK-NEXT: [[VEC_IND_NEXT8]] = add <8 x i8> [[VEC_IND7]], splat (i8 8)
+; CHECK-NEXT: [[VEC_IND_NEXT13]] = add nuw nsw <8 x i8> [[VEC_IND7]], splat (i8 8)
; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i32 [[INDEX_NEXT13]], [[N_VEC5]]
-; CHECK-NEXT: br i1 [[TMP21]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP21]], label %[[VEC_EPIL...
[truncated]
fhahn left a comment:
LGTM, thanks
Looks like it still crashes the compiler in some cases: opt -S --passes=loop-vectorize ./repro.ll
Thanks, will fix in a few moments.
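For reference, a minimal sketch of the shape of reduced test described above for the RVV pointer-induction case; the function name, RUN line, and loop body here are assumptions for illustration, not the actual test added by the reland:

; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v -S %s
define void @ptr_ind_sketch(ptr %start, i64 %n) {
entry:
  br label %loop

loop:
  ; A pointer induction: its expansion multiplies a step vector by the
  ; step, which the reland now creates via createOverflowingOp (no wrap
  ; flags by default) so CSE sees a consistent OpType.
  %ptr = phi ptr [ %start, %entry ], [ %ptr.next, %loop ]
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  store i8 0, ptr %ptr, align 1
  %ptr.next = getelementptr inbounds i8, ptr %ptr, i64 1
  %iv.next = add nuw nsw i64 %iv, 1
  %ec = icmp eq i64 %iv.next, %n
  br i1 %ec, label %exit, label %loop

exit:
  ret void
}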