diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index a25061f06c148d..b09b1e360c10e6 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1478,6 +1478,10 @@ class TargetTransformInfo { bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, ReductionFlags Flags) const; + /// Return true if the loop vectorizer should consider vectorizing an + /// otherwise scalar epilogue loop. + bool preferEpilogueVectorization() const; + /// \returns True if the target wants to expand the given reduction intrinsic /// into a shuffle sequence. bool shouldExpandReduction(const IntrinsicInst *II) const; @@ -1881,6 +1885,8 @@ class TargetTransformInfo::Concept { ReductionFlags) const = 0; virtual bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, ReductionFlags) const = 0; + virtual bool preferEpilogueVectorization() const = 0; + virtual bool shouldExpandReduction(const IntrinsicInst *II) const = 0; virtual unsigned getGISelRematGlobalCost() const = 0; virtual unsigned getMinTripCountTailFoldingThreshold() const = 0; @@ -2526,6 +2532,10 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { ReductionFlags Flags) const override { return Impl.preferPredicatedReductionSelect(Opcode, Ty, Flags); } + bool preferEpilogueVectorization() const override { + return Impl.preferEpilogueVectorization(); + } + bool shouldExpandReduction(const IntrinsicInst *II) const override { return Impl.shouldExpandReduction(II); } diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 36e363ad9cb73d..8fcaa52a7bf518 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -828,6 +828,10 @@ class TargetTransformInfoImplBase { return false; } + bool preferEpilogueVectorization() const { + return true; + } + bool shouldExpandReduction(const IntrinsicInst *II) const { return true; } unsigned getGISelRematGlobalCost() const { return 1; } diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 0d3058512e05e7..2c33f2d59ca124 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1145,6 +1145,10 @@ bool TargetTransformInfo::preferPredicatedReductionSelect( return TTIImpl->preferPredicatedReductionSelect(Opcode, Ty, Flags); } +bool TargetTransformInfo::preferEpilogueVectorization() const { + return TTIImpl->preferEpilogueVectorization(); +} + TargetTransformInfo::VPLegalization TargetTransformInfo::getVPLegalizationStrategy(const VPIntrinsic &VPI) const { return TTIImpl->getVPLegalizationStrategy(VPI); diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index 704d0dbdffaff5..6f9c958ecef9bf 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -84,6 +84,13 @@ class RISCVTTIImpl : public BasicTTIImplBase { unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const; + bool preferEpilogueVectorization() const { + // Epilogue vectorization is usually unprofitable - tail folding or + // a smaller VF would have been better. This a blunt hammer - we + // should re-examine this once vectorization is better tuned. + return false; + } + InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 108cf6f17f52ac..7983165f09842d 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5506,6 +5506,11 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( // as register pressure, code size increase and cost of extra branches into // account. For now we apply a very crude heuristic and only consider loops // with vectorization factors larger than a certain value. + + // Allow the target to opt out entirely. + if (!TTI.preferEpilogueVectorization()) + return false; + // We also consider epilogue vectorization unprofitable for targets that don't // consider interleaving beneficial (eg. MVE). if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll index 5997b64c9f88a2..5c7f5be5729285 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll @@ -1015,10 +1015,8 @@ define void @predicated_sdiv_by_minus_one(ptr noalias nocapture %a, i64 %n) { ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @predicated_sdiv_by_minus_one( -; FIXED-NEXT: iter.check: -; FIXED-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] -; FIXED: vector.main.loop.iter.check: -; FIXED-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: entry: +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; FIXED: vector.ph: ; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] ; FIXED: vector.body: @@ -1048,35 +1046,12 @@ define void @predicated_sdiv_by_minus_one(ptr noalias nocapture %a, i64 %n) { ; FIXED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1024 -; FIXED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] -; FIXED: vec.epilog.iter.check: -; FIXED-NEXT: br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] -; FIXED: vec.epilog.ph: -; FIXED-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; FIXED-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] -; FIXED: vec.epilog.vector.body: -; FIXED-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT7:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 0 -; FIXED-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP15]] -; FIXED-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0 -; FIXED-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i8>, ptr [[TMP17]], align 1 -; FIXED-NEXT: [[TMP18:%.*]] = icmp ne <8 x i8> [[WIDE_LOAD5]], -; FIXED-NEXT: [[TMP19:%.*]] = select <8 x i1> [[TMP18]], <8 x i8> , <8 x i8> -; FIXED-NEXT: [[TMP20:%.*]] = sdiv <8 x i8> [[WIDE_LOAD5]], [[TMP19]] -; FIXED-NEXT: [[TMP21:%.*]] = xor <8 x i1> [[TMP18]], -; FIXED-NEXT: [[PREDPHI6:%.*]] = select <8 x i1> [[TMP18]], <8 x i8> [[TMP20]], <8 x i8> [[WIDE_LOAD5]] -; FIXED-NEXT: store <8 x i8> [[PREDPHI6]], ptr [[TMP17]], align 1 -; FIXED-NEXT: [[INDEX_NEXT7]] = add nuw i64 [[OFFSET_IDX]], 8 -; FIXED-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT7]], 1024 -; FIXED-NEXT: br i1 [[TMP22]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] -; FIXED: vec.epilog.middle.block: -; FIXED-NEXT: [[CMP_N3:%.*]] = icmp eq i64 1024, 1024 -; FIXED-NEXT: br i1 [[CMP_N3]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] -; FIXED: vec.epilog.scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 1024, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; FIXED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; FIXED: scalar.ph: +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; FIXED-NEXT: br label [[FOR_BODY:%.*]] ; FIXED: for.body: -; FIXED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] +; FIXED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] ; FIXED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]] ; FIXED-NEXT: [[ELEM:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; FIXED-NEXT: [[C:%.*]] = icmp ne i8 [[ELEM]], -128 @@ -1089,7 +1064,7 @@ define void @predicated_sdiv_by_minus_one(ptr noalias nocapture %a, i64 %n) { ; FIXED-NEXT: store i8 [[PHI]], ptr [[ARRAYIDX]], align 1 ; FIXED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; FIXED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; FIXED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; FIXED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; FIXED: for.end: ; FIXED-NEXT: ret void ;