Skip to content

Commit

Permalink
[Passes] Only run extra vector passes if loops have been vectorized.
Browse files Browse the repository at this point in the history
This patch uses a similar trick as in D113947 to only run the extra
passes after vectorization on functions where loops have been
vectorized.

The reason for running the 'extra vector passes' is
simplification/unswitching of the runtime checks created by LV, so there
should be no need to run them if nothing got vectorized.

To do that, a new dummy analysis ShouldRunExtraVectorPasses has been
added. If loops have been vectorized for a function, LV will cache the
analysis. At the moment it uses MadeCFGChange as a proxy for whether a
loop has been vectorized, which isn't perfect (it could be too aggressive,
e.g. because no runtime checks have been added), but should be good enough
for now.

The extra passes are now managed by a new FunctionPassManager that
runs its passes only if ShouldRunExtraVectorPasses has been cached.

Without this patch, `-extra-vectorizer-passes` has the following
compile-time impact:

NewPM-O3: +4.86%
NewPM-ReleaseThinLTO: +3.56%
NewPM-ReleaseLTO-g: +7.17%

http://llvm-compile-time-tracker.com/compare.php?from=ead3979a92fc33add4710c4510d6906260dcb4ad&to=c292da649e2c6e88a31e702fdc474727d09c72bc&stat=instructions

With this patch, that gets reduced to

NewPM-O3: +1.43%
NewPM-ReleaseThinLTO: +1.00%
NewPM-ReleaseLTO-g: +1.58%

http://llvm-compile-time-tracker.com/compare.php?from=ead3979a92fc33add4710c4510d6906260dcb4ad&to=e67d86b57810011cf285eb9aa1944781be6096f0&stat=instructions

It is probably still too high to enable by default, but much better.

Reviewed By: aeubanks

Differential Revision: https://reviews.llvm.org/D115052
  • Loading branch information
fhahn committed Dec 10, 2021
1 parent eef8f3f commit acea6e9
Show file tree
Hide file tree
Showing 5 changed files with 57 additions and 8 deletions.
32 changes: 32 additions & 0 deletions llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h
Expand Up @@ -80,6 +80,38 @@ class TargetTransformInfo;
extern cl::opt<bool> EnableLoopInterleaving;
extern cl::opt<bool> EnableLoopVectorization;

/// Marker analysis used to signal that the extra simplification passes
/// scheduled after loop vectorization should be run for a function. It
/// computes nothing; only the presence of a cached result is meaningful.
struct ShouldRunExtraVectorPasses
    : public AnalysisInfoMixin<ShouldRunExtraVectorPasses> {
  static AnalysisKey Key;

  /// Empty result type; it carries no data.
  struct Result {
    /// Returns true (i.e. the result must be dropped) unless the preserved
    /// set \p PA still reports this stateless marker as preserved.
    bool invalidate(Function &, const PreservedAnalyses &PA,
                    FunctionAnalysisManager::Invalidator &) {
      // The marker stays cached until it has been explicitly invalidated.
      const bool StillPreserved =
          PA.getChecker<ShouldRunExtraVectorPasses>().preservedWhenStateless();
      return !StillPreserved;
    }
  };

  /// Produces the (empty) marker result for a function.
  Result run(Function &, FunctionAnalysisManager &) { return Result{}; }
};

/// Function pass manager holding the extra simplification passes that follow
/// vectorization. The contained passes are executed only when a
/// ShouldRunExtraVectorPasses result is already cached for the function;
/// LoopVectorize caches that marker to request the extra simplifications.
struct ExtraVectorPassManager : public FunctionPassManager {
  /// Runs the nested pipeline on \p F if the marker analysis is cached, and
  /// always abandons the marker in the returned preserved set.
  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM) {
    PreservedAnalyses Preserved = PreservedAnalyses::all();

    // Only a cached marker triggers the extra passes; nothing is computed here.
    const bool ExtraPassesRequested =
        AM.getCachedResult<ShouldRunExtraVectorPasses>(F) != nullptr;
    if (ExtraPassesRequested)
      Preserved.intersect(FunctionPassManager::run(F, AM));

    // Abandon the marker whether or not the nested passes were run.
    Preserved.abandon<ShouldRunExtraVectorPasses>();
    return Preserved;
  }
};

struct LoopVectorizeOptions {
/// If false, consider all loops for interleaving.
/// If true, only loops that explicitly request interleaving are considered.
Expand Down
16 changes: 9 additions & 7 deletions llvm/lib/Passes/PassBuilderPipelines.cpp
Expand Up @@ -995,26 +995,28 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
FPM.addPass(InstCombinePass());

if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) {
ExtraVectorPassManager ExtraPasses;
// At higher optimization levels, try to clean up any runtime overlap and
// alignment checks inserted by the vectorizer. We want to track correlated
// runtime checks for two inner loops in the same outer loop, fold any
// common computations, hoist loop-invariant aspects out of any outer loop,
// and unswitch the runtime checks if possible. Once hoisted, we may have
// dead (or speculatable) control flows or more combining opportunities.
FPM.addPass(EarlyCSEPass());
FPM.addPass(CorrelatedValuePropagationPass());
FPM.addPass(InstCombinePass());
ExtraPasses.addPass(EarlyCSEPass());
ExtraPasses.addPass(CorrelatedValuePropagationPass());
ExtraPasses.addPass(InstCombinePass());
LoopPassManager LPM;
LPM.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap));
LPM.addPass(SimpleLoopUnswitchPass(/* NonTrivial */ Level ==
OptimizationLevel::O3));
FPM.addPass(
ExtraPasses.addPass(
RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
FPM.addPass(
ExtraPasses.addPass(
createFunctionToLoopPassAdaptor(std::move(LPM), /*UseMemorySSA=*/true,
/*UseBlockFrequencyInfo=*/true));
FPM.addPass(SimplifyCFGPass());
FPM.addPass(InstCombinePass());
ExtraPasses.addPass(SimplifyCFGPass());
ExtraPasses.addPass(InstCombinePass());
FPM.addPass(std::move(ExtraPasses));
}

// Now that we've formed fast to execute loop structures, we do further
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Passes/PassRegistry.def
Expand Up @@ -203,6 +203,7 @@ FUNCTION_ANALYSIS("no-op-function", NoOpFunctionAnalysis())
FUNCTION_ANALYSIS("opt-remark-emit", OptimizationRemarkEmitterAnalysis())
FUNCTION_ANALYSIS("scalar-evolution", ScalarEvolutionAnalysis())
FUNCTION_ANALYSIS("should-not-run-function-passes", ShouldNotRunFunctionPassesAnalysis())
FUNCTION_ANALYSIS("should-run-extra-vector-passes", ShouldRunExtraVectorPasses())
FUNCTION_ANALYSIS("stack-safety-local", StackSafetyAnalysis())
FUNCTION_ANALYSIS("targetlibinfo", TargetLibraryAnalysis())
FUNCTION_ANALYSIS("targetir",
Expand Down
13 changes: 12 additions & 1 deletion llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Expand Up @@ -428,6 +428,8 @@ class GeneratedRTChecks;

namespace llvm {

// Out-of-class definition of the static Key member declared in
// ShouldRunExtraVectorPasses.
AnalysisKey ShouldRunExtraVectorPasses::Key;

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
Expand Down Expand Up @@ -10746,8 +10748,17 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,
PA.preserve<LoopAnalysis>();
PA.preserve<DominatorTreeAnalysis>();
}
if (!Result.MadeCFGChange)

if (Result.MadeCFGChange) {
// Making CFG changes likely means a loop got vectorized. Indicate that
// extra simplification passes should be run.
// TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
// be run if runtime checks have been added.
AM.getResult<ShouldRunExtraVectorPasses>(F);
PA.preserve<ShouldRunExtraVectorPasses>();
} else {
PA.preserveSet<CFGAnalyses>();
}
return PA;
}

Expand Down
3 changes: 3 additions & 0 deletions llvm/test/Other/opt-pipeline-vector-passes.ll
Expand Up @@ -2,6 +2,9 @@
; RUN: opt -disable-verify -debug-pass-manager -passes='default<O2>' -force-vector-width=4 -S %s 2>&1 | FileCheck %s --check-prefixes=O2
; RUN: opt -disable-verify -debug-pass-manager -passes='default<O2>' -force-vector-width=4 -extra-vectorizer-passes -S %s 2>&1 | FileCheck %s --check-prefixes=O2_EXTRA

; When the loop doesn't get vectorized, no extra vector passes should run.
; RUN: opt -disable-verify -debug-pass-manager -passes='default<O2>' -force-vector-width=0 -extra-vectorizer-passes -S %s 2>&1 | FileCheck %s --check-prefixes=O2

; REQUIRES: asserts

; The loop vectorizer still runs at both -O1/-O2 even with the
Expand Down

0 comments on commit acea6e9

Please sign in to comment.