diff --git a/llvm/include/llvm/Passes/PassBuilder.h b/llvm/include/llvm/Passes/PassBuilder.h
index 5209b57f7f4102..1d63437ee2c11c 100644
--- a/llvm/include/llvm/Passes/PassBuilder.h
+++ b/llvm/include/llvm/Passes/PassBuilder.h
@@ -710,6 +710,9 @@ class PassBuilder {
 
   void addRequiredLTOPreLinkPasses(ModulePassManager &MPM);
 
+  void addVectorPasses(OptimizationLevel Level, FunctionPassManager &FPM,
+                       bool IsLTO);
+
   static Optional<std::vector<PipelineElement>>
   parsePipelineText(StringRef Text);
 
diff --git a/llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h b/llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h
index a9928c3f5a40da..76d5e8ff8ed0fb 100644
--- a/llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h
+++ b/llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h
@@ -218,6 +218,7 @@ class PassManagerBuilder {
   void addLateLTOOptimizationPasses(legacy::PassManagerBase &PM);
   void addPGOInstrPasses(legacy::PassManagerBase &MPM, bool IsCS);
   void addFunctionSimplificationPasses(legacy::PassManagerBase &MPM);
+  void addVectorPasses(legacy::PassManagerBase &PM, bool IsLTO);
 
 public:
   /// populateFunctionPassManager - This fills in the function pass manager,
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index f0963d4292dee9..3e2973937ed7db 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -1201,6 +1201,126 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
   return MPM;
 }
 
+/// Add the vectorization pipeline to \p FPM: the loop vectorizer, the SLP
+/// vectorizer (when PTO.SLPVectorization is set), VectorCombine, and the
+/// cleanup passes that run between and after them.
+///
+/// When \p IsLTO is true (as passed by buildLTODefaultPipeline), unrolling runs
+/// right after the loop vectorizer instead of at the end, LoopLoadElimination
+/// is skipped, and SCCP, BDCE and AlignmentFromAssumptions are added.
+///
+/// FIXME: Should LTO cause any differences to this set of passes?
+void PassBuilder::addVectorPasses(OptimizationLevel Level,
+                                  FunctionPassManager &FPM, bool IsLTO) {
+  FPM.addPass(LoopVectorizePass(
+      LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization)));
+
+  if (IsLTO) {
+    // The vectorizer may have significantly shortened a loop body; unroll
+    // again. Unroll small loops to hide loop backedge latency and saturate any
+    // parallel execution resources of an out-of-order processor. We also then
+    // need to clean up redundancies and loop invariant code.
+    // FIXME: It would be really good to use a loop-integrated instruction
+    // combiner for cleanup here so that the unrolling and LICM can be pipelined
+    // across the loop nests.
+    // We do UnrollAndJam in a separate LPM to ensure it happens before unroll
+    if (EnableUnrollAndJam && PTO.LoopUnrolling)
+      FPM.addPass(LoopUnrollAndJamPass(Level.getSpeedupLevel()));
+    FPM.addPass(LoopUnrollPass(LoopUnrollOptions(
+        Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling,
+        PTO.ForgetAllSCEVInLoopUnroll)));
+    FPM.addPass(WarnMissedTransformationsPass());
+  }
+
+  if (!IsLTO) {
+    // Eliminate loads by forwarding stores from the previous iteration to loads
+    // of the current iteration.
+    FPM.addPass(LoopLoadEliminationPass());
+  }
+  // Cleanup after the loop optimization passes.
+  FPM.addPass(InstCombinePass());
+
+  if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) {
+    // At higher optimization levels, try to clean up any runtime overlap and
+    // alignment checks inserted by the vectorizer. We want to track correlated
+    // runtime checks for two inner loops in the same outer loop, fold any
+    // common computations, hoist loop-invariant aspects out of any outer loop,
+    // and unswitch the runtime checks if possible. Once hoisted, we may have
+    // dead (or speculatable) control flows or more combining opportunities.
+    FPM.addPass(EarlyCSEPass());
+    FPM.addPass(CorrelatedValuePropagationPass());
+    FPM.addPass(InstCombinePass());
+    LoopPassManager LPM(DebugLogging);
+    LPM.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap));
+    LPM.addPass(SimpleLoopUnswitchPass(/* NonTrivial */ Level ==
+                                       OptimizationLevel::O3));
+    FPM.addPass(
+        RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
+    FPM.addPass(createFunctionToLoopPassAdaptor(
+        std::move(LPM), EnableMSSALoopDependency,
+        /*UseBlockFrequencyInfo=*/true, DebugLogging));
+    FPM.addPass(SimplifyCFGPass());
+    FPM.addPass(InstCombinePass());
+  }
+
+  // Now that we've formed fast to execute loop structures, we do further
+  // optimizations. These are run afterward as they might block doing complex
+  // analyses and transforms such as what are needed for loop vectorization.
+
+  // Cleanup after loop vectorization, etc. Simplification passes like CVP and
+  // GVN, loop transforms, and others have already run, so it's now better to
+  // convert to more optimized IR using more aggressive simplify CFG options.
+  // The extra sinking transform can create larger basic blocks, so do this
+  // before SLP vectorization.
+  FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions()
+                                  .forwardSwitchCondToPhi(true)
+                                  .convertSwitchToLookupTable(true)
+                                  .needCanonicalLoops(false)
+                                  .hoistCommonInsts(true)
+                                  .sinkCommonInsts(true)));
+  if (IsLTO) {
+    FPM.addPass(SCCPPass());
+    FPM.addPass(InstCombinePass());
+    FPM.addPass(BDCEPass());
+  }
+
+  // Optimize parallel scalar instruction chains into SIMD instructions.
+  if (PTO.SLPVectorization) {
+    FPM.addPass(SLPVectorizerPass());
+    if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) {
+      FPM.addPass(EarlyCSEPass());
+    }
+  }
+
+  // Enhance/cleanup vector code.
+  FPM.addPass(VectorCombinePass());
+  if (IsLTO) {
+    // After vectorization, assume intrinsics may tell us more about pointer
+    // alignments.
+    FPM.addPass(AlignmentFromAssumptionsPass());
+  }
+
+  FPM.addPass(InstCombinePass());
+
+  if (!IsLTO) {
+    // The vectorizer may have significantly shortened a loop body; unroll
+    // again. Unroll small loops to hide loop backedge latency and saturate any
+    // parallel execution resources of an out-of-order processor. We also then
+    // need to clean up redundancies and loop invariant code.
+    // FIXME: It would be really good to use a loop-integrated instruction
+    // combiner for cleanup here so that the unrolling and LICM can be pipelined
+    // across the loop nests.
+    // We do UnrollAndJam in a separate LPM to ensure it happens before unroll
+    if (EnableUnrollAndJam && PTO.LoopUnrolling)
+      FPM.addPass(LoopUnrollAndJamPass(Level.getSpeedupLevel()));
+    FPM.addPass(LoopUnrollPass(LoopUnrollOptions(
+        Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling,
+        PTO.ForgetAllSCEVInLoopUnroll)));
+    FPM.addPass(WarnMissedTransformationsPass());
+    FPM.addPass(InstCombinePass());
+  }
+}
+
 ModulePassManager
 PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,
                                              bool LTOPreLink) {
@@ -1295,82 +1415,8 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,
   // from the TargetLibraryInfo.
   OptimizePM.addPass(InjectTLIMappings());
 
-  // Now run the core loop vectorizer.
-  OptimizePM.addPass(LoopVectorizePass(
-      LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization)));
-
-  // Eliminate loads by forwarding stores from the previous iteration to loads
-  // of the current iteration.
-  OptimizePM.addPass(LoopLoadEliminationPass());
-
-  // Cleanup after the loop optimization passes.
-  OptimizePM.addPass(InstCombinePass());
-
-  if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) {
-    // At higher optimization levels, try to clean up any runtime overlap and
-    // alignment checks inserted by the vectorizer. 
We want to track correlated - // runtime checks for two inner loops in the same outer loop, fold any - // common computations, hoist loop-invariant aspects out of any outer loop, - // and unswitch the runtime checks if possible. Once hoisted, we may have - // dead (or speculatable) control flows or more combining opportunities. - OptimizePM.addPass(EarlyCSEPass()); - OptimizePM.addPass(CorrelatedValuePropagationPass()); - OptimizePM.addPass(InstCombinePass()); - LoopPassManager LPM(DebugLogging); - LPM.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap)); - LPM.addPass( - SimpleLoopUnswitchPass(/* NonTrivial */ Level == OptimizationLevel::O3)); - OptimizePM.addPass(RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>()); - OptimizePM.addPass(createFunctionToLoopPassAdaptor( - std::move(LPM), EnableMSSALoopDependency, /*UseBlockFrequencyInfo=*/true, - DebugLogging)); - OptimizePM.addPass(SimplifyCFGPass()); - OptimizePM.addPass(InstCombinePass()); - } + addVectorPasses(Level, OptimizePM, /* IsLTO */ false); - // Now that we've formed fast to execute loop structures, we do further - // optimizations. These are run afterward as they might block doing complex - // analyses and transforms such as what are needed for loop vectorization. - - // Cleanup after loop vectorization, etc. Simplification passes like CVP and - // GVN, loop transforms, and others have already run, so it's now better to - // convert to more optimized IR using more aggressive simplify CFG options. - // The extra sinking transform can create larger basic blocks, so do this - // before SLP vectorization. - OptimizePM.addPass(SimplifyCFGPass(SimplifyCFGOptions() - .forwardSwitchCondToPhi(true) - .convertSwitchToLookupTable(true) - .needCanonicalLoops(false) - .hoistCommonInsts(true) - .sinkCommonInsts(true))); - - // Optimize parallel scalar instruction chains into SIMD instructions. 
- if (PTO.SLPVectorization) { - OptimizePM.addPass(SLPVectorizerPass()); - if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) { - OptimizePM.addPass(EarlyCSEPass()); - } - } - - // Enhance/cleanup vector code. - OptimizePM.addPass(VectorCombinePass()); - OptimizePM.addPass(InstCombinePass()); - - // Unroll small loops to hide loop backedge latency and saturate any parallel - // execution resources of an out-of-order processor. We also then need to - // clean up redundancies and loop invariant code. - // FIXME: It would be really good to use a loop-integrated instruction - // combiner for cleanup here so that the unrolling and LICM can be pipelined - // across the loop nests. - // We do UnrollAndJam in a separate LPM to ensure it happens before unroll - if (EnableUnrollAndJam && PTO.LoopUnrolling) { - OptimizePM.addPass(LoopUnrollAndJamPass(Level.getSpeedupLevel())); - } - OptimizePM.addPass(LoopUnrollPass(LoopUnrollOptions( - Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling, - PTO.ForgetAllSCEVInLoopUnroll))); - OptimizePM.addPass(WarnMissedTransformationsPass()); - OptimizePM.addPass(InstCombinePass()); OptimizePM.addPass(RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>()); OptimizePM.addPass(createFunctionToLoopPassAdaptor( LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap), @@ -1826,39 +1864,9 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, DebugLogging)); MainFPM.addPass(LoopDistributePass()); - MainFPM.addPass(LoopVectorizePass( - LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization))); - // The vectorizer may have significantly shortened a loop body; unroll again. 
- MainFPM.addPass(LoopUnrollPass(LoopUnrollOptions( - Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling, - PTO.ForgetAllSCEVInLoopUnroll))); - - MainFPM.addPass(WarnMissedTransformationsPass()); - - MainFPM.addPass(InstCombinePass()); - MainFPM.addPass(SimplifyCFGPass(SimplifyCFGOptions().hoistCommonInsts(true))); - MainFPM.addPass(SCCPPass()); - MainFPM.addPass(InstCombinePass()); - MainFPM.addPass(BDCEPass()); - - // More scalar chains could be vectorized due to more alias information - if (PTO.SLPVectorization) { - MainFPM.addPass(SLPVectorizerPass()); - if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) { - MainFPM.addPass(EarlyCSEPass()); - } - } - - MainFPM.addPass(VectorCombinePass()); // Clean up partial vectorization. - - // After vectorization, assume intrinsics may tell us more about pointer - // alignments. - MainFPM.addPass(AlignmentFromAssumptionsPass()); - // FIXME: Conditionally run LoadCombine here, after it's ported - // (in case we still have this pass, given its questionable usefulness). + addVectorPasses(Level, MainFPM, /* IsLTO */ true); - MainFPM.addPass(InstCombinePass()); invokePeepholeEPCallbacks(MainFPM, Level); MainFPM.addPass(JumpThreadingPass(/*InsertFreezeWhenUnfoldingSelect*/ true)); MPM.addPass(createModuleToFunctionPassAdaptor(std::move(MainFPM)));
diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
index 1e752462da8190..1de8cfa7744575 100644
--- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -521,6 +521,122 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
     MPM.add(createControlHeightReductionLegacyPass());
 }
 
+/// Add the vectorization pipeline to \p PM: the loop vectorizer, the SLP
+/// vectorizer (when SLPVectorize is set), VectorCombine, and the cleanup
+/// passes that run between and after them.
+///
+/// When \p IsLTO is true (as passed by addLTOOptimizationPasses), unrolling
+/// runs right after the loop vectorizer instead of at the end,
+/// LoopLoadElimination is skipped, and SCCP, BDCE and AlignmentFromAssumptions
+/// are added. The EP_Peephole extensions are only added here for the non-LTO
+/// pipeline.
+///
+/// FIXME: Should LTO cause any differences to this set of passes?
+void PassManagerBuilder::addVectorPasses(legacy::PassManagerBase &PM,
+                                         bool IsLTO) {
+  PM.add(createLoopVectorizePass(!LoopsInterleaved, !LoopVectorize));
+
+  if (IsLTO) {
+    // The vectorizer may have significantly shortened a loop body; unroll
+    // again. Unroll small loops to hide loop backedge latency and saturate any
+    // parallel execution resources of an out-of-order processor. We also then
+    // need to clean up redundancies and loop invariant code.
+    // FIXME: It would be really good to use a loop-integrated instruction
+    // combiner for cleanup here so that the unrolling and LICM can be pipelined
+    // across the loop nests.
+    // We do UnrollAndJam in a separate LPM to ensure it happens before unroll
+    if (EnableUnrollAndJam && !DisableUnrollLoops)
+      PM.add(createLoopUnrollAndJamPass(OptLevel));
+    PM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops,
+                                ForgetAllSCEVInLoopUnroll));
+    PM.add(createWarnMissedTransformationsPass());
+  }
+
+  if (!IsLTO) {
+    // Eliminate loads by forwarding stores from the previous iteration to loads
+    // of the current iteration.
+    PM.add(createLoopLoadEliminationPass());
+  }
+  // Cleanup after the loop optimization passes.
+  PM.add(createInstructionCombiningPass());
+
+  if (OptLevel > 1 && ExtraVectorizerPasses) {
+    // At higher optimization levels, try to clean up any runtime overlap and
+    // alignment checks inserted by the vectorizer. We want to track correlated
+    // runtime checks for two inner loops in the same outer loop, fold any
+    // common computations, hoist loop-invariant aspects out of any outer loop,
+    // and unswitch the runtime checks if possible. Once hoisted, we may have
+    // dead (or speculatable) control flows or more combining opportunities.
+    PM.add(createEarlyCSEPass());
+    PM.add(createCorrelatedValuePropagationPass());
+    PM.add(createInstructionCombiningPass());
+    PM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
+    PM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget));
+    PM.add(createCFGSimplificationPass());
+    PM.add(createInstructionCombiningPass());
+  }
+
+  // Now that we've formed fast to execute loop structures, we do further
+  // optimizations. These are run afterward as they might block doing complex
+  // analyses and transforms such as what are needed for loop vectorization.
+
+  // Cleanup after loop vectorization, etc. Simplification passes like CVP and
+  // GVN, loop transforms, and others have already run, so it's now better to
+  // convert to more optimized IR using more aggressive simplify CFG options.
+  // The extra sinking transform can create larger basic blocks, so do this
+  // before SLP vectorization.
+  PM.add(createCFGSimplificationPass(SimplifyCFGOptions()
+                                         .forwardSwitchCondToPhi(true)
+                                         .convertSwitchToLookupTable(true)
+                                         .needCanonicalLoops(false)
+                                         .hoistCommonInsts(true)
+                                         .sinkCommonInsts(true)));
+
+  if (IsLTO) {
+    PM.add(createSCCPPass());                 // Propagate exposed constants
+    PM.add(createInstructionCombiningPass()); // Clean up again
+    PM.add(createBitTrackingDCEPass());
+  }
+
+  // Optimize parallel scalar instruction chains into SIMD instructions.
+  if (SLPVectorize) {
+    PM.add(createSLPVectorizerPass());
+    if (OptLevel > 1 && ExtraVectorizerPasses)
+      PM.add(createEarlyCSEPass());
+  }
+
+  // Enhance/cleanup vector code.
+  PM.add(createVectorCombinePass());
+
+  if (IsLTO) {
+    // After vectorization, assume intrinsics may tell us more about pointer
+    // alignments.
+    PM.add(createAlignmentFromAssumptionsPass());
+  } else {
+    // addLTOOptimizationPasses adds the EP_Peephole extensions right after
+    // calling this function; adding them here too would run them twice.
+    addExtensionsToPM(EP_Peephole, PM);
+  }
+  PM.add(createInstructionCombiningPass());
+
+  if (!IsLTO) {
+    // The vectorizer may have significantly shortened a loop body; unroll
+    // again. Unroll small loops to hide loop backedge latency and saturate any
+    // parallel execution resources of an out-of-order processor. We also then
+    // need to clean up redundancies and loop invariant code.
+    // FIXME: It would be really good to use a loop-integrated instruction
+    // combiner for cleanup here so that the unrolling and LICM can be pipelined
+    // across the loop nests.
+    // We do UnrollAndJam in a separate LPM to ensure it happens before unroll
+    if (EnableUnrollAndJam && !DisableUnrollLoops)
+      PM.add(createLoopUnrollAndJamPass(OptLevel));
+    PM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops,
+                                ForgetAllSCEVInLoopUnroll));
+    if (!DisableUnrollLoops)
+      PM.add(createInstructionCombiningPass());
+  }
+}
+
 void PassManagerBuilder::populateModulePassManager(
     legacy::PassManagerBase &MPM) {
   // Whether this is a default or *LTO pre-link pipeline. The FullLTO post-link
@@ -792,74 +908,9 @@ void PassManagerBuilder::populateModulePassManager(
   // llvm.loop.distribute=true or when -enable-loop-distribute is specified.
   MPM.add(createLoopDistributePass());
 
-  MPM.add(createLoopVectorizePass(!LoopsInterleaved, !LoopVectorize));
-
-  // Eliminate loads by forwarding stores from the previous iteration to loads
-  // of the current iteration.
-  MPM.add(createLoopLoadEliminationPass());
-
-  // FIXME: Because of #pragma vectorize enable, the passes below are always
-  // inserted in the pipeline, even when the vectorizer doesn't run (ex. when
-  // on -O1 and no #pragma is found). Would be good to have these two passes
-  // as function calls, so that we can only pass them when the vectorizer
-  // changed the code.
-  MPM.add(createInstructionCombiningPass());
-  if (OptLevel > 1 && ExtraVectorizerPasses) {
-    // At higher optimization levels, try to clean up any runtime overlap and
-    // alignment checks inserted by the vectorizer. 
We want to track correllated - // runtime checks for two inner loops in the same outer loop, fold any - // common computations, hoist loop-invariant aspects out of any outer loop, - // and unswitch the runtime checks if possible. Once hoisted, we may have - // dead (or speculatable) control flows or more combining opportunities. - MPM.add(createEarlyCSEPass()); - MPM.add(createCorrelatedValuePropagationPass()); - MPM.add(createInstructionCombiningPass()); - MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); - MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget)); - MPM.add(createCFGSimplificationPass()); - MPM.add(createInstructionCombiningPass()); - } - - // Cleanup after loop vectorization, etc. Simplification passes like CVP and - // GVN, loop transforms, and others have already run, so it's now better to - // convert to more optimized IR using more aggressive simplify CFG options. - // The extra sinking transform can create larger basic blocks, so do this - // before SLP vectorization. - MPM.add(createCFGSimplificationPass(SimplifyCFGOptions() - .forwardSwitchCondToPhi(true) - .convertSwitchToLookupTable(true) - .needCanonicalLoops(false) - .hoistCommonInsts(true) - .sinkCommonInsts(true))); - - if (SLPVectorize) { - MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains. - if (OptLevel > 1 && ExtraVectorizerPasses) { - MPM.add(createEarlyCSEPass()); - } - } - - // Enhance/cleanup vector code. - MPM.add(createVectorCombinePass()); - - addExtensionsToPM(EP_Peephole, MPM); - MPM.add(createInstructionCombiningPass()); - - if (EnableUnrollAndJam && !DisableUnrollLoops) { - // Unroll and Jam. We do this before unroll but need to be in a separate - // loop pass manager in order for the outer loop to be processed by - // unroll and jam before the inner loop is unrolled. 
- MPM.add(createLoopUnrollAndJamPass(OptLevel)); - } - - // Unroll small loops - MPM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops, - ForgetAllSCEVInLoopUnroll)); + addVectorPasses(MPM, /* IsLTO */ false); if (!DisableUnrollLoops) { - // LoopUnroll may generate some redundency to cleanup. - MPM.add(createInstructionCombiningPass()); - // Runtime unrolling will introduce runtime check in loop prologue. If the // unrolled loop is a inner loop, then the prologue will be inside the // outer loop. LICM pass can help to promote the runtime check out if the @@ -1081,35 +1119,9 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { PM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops, ForgetAllSCEVInLoopUnroll)); PM.add(createLoopDistributePass()); - PM.add(createLoopVectorizePass(true, !LoopVectorize)); - // The vectorizer may have significantly shortened a loop body; unroll again. - PM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops, - ForgetAllSCEVInLoopUnroll)); - - PM.add(createWarnMissedTransformationsPass()); - - // Now that we've optimized loops (in particular loop induction variables), - // we may have exposed more scalar opportunities. Run parts of the scalar - // optimizer again at this point. - PM.add(createInstructionCombiningPass()); // Initial cleanup - PM.add(createCFGSimplificationPass(SimplifyCFGOptions() // if-convert - .hoistCommonInsts(true))); - PM.add(createSCCPPass()); // Propagate exposed constants - PM.add(createInstructionCombiningPass()); // Clean up again - PM.add(createBitTrackingDCEPass()); - - // More scalar chains could be vectorized due to more alias information - if (SLPVectorize) - PM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains. - - PM.add(createVectorCombinePass()); // Clean up partial vectorization. - - // After vectorization, assume intrinsics may tell us more about pointer - // alignments. 
- PM.add(createAlignmentFromAssumptionsPass()); - - // Cleanup and simplify the code after the scalar optimizations. - PM.add(createInstructionCombiningPass()); + + addVectorPasses(PM, /* IsLTO */ true); + addExtensionsToPM(EP_Peephole, PM); PM.add(createJumpThreadingPass(/*FreezeSelectCond*/ true));