diff --git a/llvm/include/llvm/Passes/PassBuilder.h b/llvm/include/llvm/Passes/PassBuilder.h index f8252b9583677f..339a2b7d424121 100644 --- a/llvm/include/llvm/Passes/PassBuilder.h +++ b/llvm/include/llvm/Passes/PassBuilder.h @@ -709,9 +709,6 @@ class PassBuilder { void addRequiredLTOPreLinkPasses(ModulePassManager &MPM); - void addVectorPasses(OptimizationLevel Level, FunctionPassManager &FPM, - bool IsLTO); - static Optional> parsePipelineText(StringRef Text); diff --git a/llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h b/llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h index 76d5e8ff8ed0fb..a9928c3f5a40da 100644 --- a/llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h +++ b/llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h @@ -218,7 +218,6 @@ class PassManagerBuilder { void addLateLTOOptimizationPasses(legacy::PassManagerBase &PM); void addPGOInstrPasses(legacy::PassManagerBase &MPM, bool IsCS); void addFunctionSimplificationPasses(legacy::PassManagerBase &MPM); - void addVectorPasses(legacy::PassManagerBase &PM, bool IsLTO); public: /// populateFunctionPassManager - This fills in the function pass manager, diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index c32104df0ab4a1..e6554f6106ed73 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -1201,118 +1201,6 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, return MPM; } -/// FIXME: Should LTO cause any differences to this set of passes? -void PassBuilder::addVectorPasses(OptimizationLevel Level, - FunctionPassManager &FPM, bool IsLTO) { - FPM.addPass(LoopVectorizePass( - LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization))); - - if (IsLTO) { - // The vectorizer may have significantly shortened a loop body; unroll - // again. Unroll small loops to hide loop backedge latency and saturate any - // parallel execution resources of an out-of-order processor. 
We also then - // need to clean up redundancies and loop invariant code. - // FIXME: It would be really good to use a loop-integrated instruction - // combiner for cleanup here so that the unrolling and LICM can be pipelined - // across the loop nests. - // We do UnrollAndJam in a separate LPM to ensure it happens before unroll - if (EnableUnrollAndJam && PTO.LoopUnrolling) - FPM.addPass(LoopUnrollAndJamPass(Level.getSpeedupLevel())); - FPM.addPass(LoopUnrollPass(LoopUnrollOptions( - Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling, - PTO.ForgetAllSCEVInLoopUnroll))); - FPM.addPass(WarnMissedTransformationsPass()); - } - - if (!IsLTO) { - // Eliminate loads by forwarding stores from the previous iteration to loads - // of the current iteration. - FPM.addPass(LoopLoadEliminationPass()); - } - // Cleanup after the loop optimization passes. - FPM.addPass(InstCombinePass()); - - if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) { - // At higher optimization levels, try to clean up any runtime overlap and - // alignment checks inserted by the vectorizer. We want to track correlated - // runtime checks for two inner loops in the same outer loop, fold any - // common computations, hoist loop-invariant aspects out of any outer loop, - // and unswitch the runtime checks if possible. Once hoisted, we may have - // dead (or speculatable) control flows or more combining opportunities. 
- FPM.addPass(EarlyCSEPass()); - FPM.addPass(CorrelatedValuePropagationPass()); - FPM.addPass(InstCombinePass()); - LoopPassManager LPM; - LPM.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap)); - LPM.addPass(SimpleLoopUnswitchPass(/* NonTrivial */ Level == - OptimizationLevel::O3)); - FPM.addPass( - RequireAnalysisPass()); - FPM.addPass(createFunctionToLoopPassAdaptor( - std::move(LPM), EnableMSSALoopDependency, - /*UseBlockFrequencyInfo=*/true)); - FPM.addPass(SimplifyCFGPass()); - FPM.addPass(InstCombinePass()); - } - - // Now that we've formed fast to execute loop structures, we do further - // optimizations. These are run afterward as they might block doing complex - // analyses and transforms such as what are needed for loop vectorization. - - // Cleanup after loop vectorization, etc. Simplification passes like CVP and - // GVN, loop transforms, and others have already run, so it's now better to - // convert to more optimized IR using more aggressive simplify CFG options. - // The extra sinking transform can create larger basic blocks, so do this - // before SLP vectorization. - FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions() - .forwardSwitchCondToPhi(true) - .convertSwitchToLookupTable(true) - .needCanonicalLoops(false) - .hoistCommonInsts(true) - .sinkCommonInsts(true))); - if (IsLTO) { - FPM.addPass(SCCPPass()); - FPM.addPass(InstCombinePass()); - FPM.addPass(BDCEPass()); - } - - // Optimize parallel scalar instruction chains into SIMD instructions. - if (PTO.SLPVectorization) { - FPM.addPass(SLPVectorizerPass()); - if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) { - FPM.addPass(EarlyCSEPass()); - } - } - - // Enhance/cleanup vector code. - FPM.addPass(VectorCombinePass()); - if (IsLTO) { - // After vectorization, assume intrinsics may tell us more about pointer - // alignments. 
- FPM.addPass(AlignmentFromAssumptionsPass()); - } - - FPM.addPass(InstCombinePass()); - - if (!IsLTO) { - // The vectorizer may have significantly shortened a loop body; unroll - // again. Unroll small loops to hide loop backedge latency and saturate any - // parallel execution resources of an out-of-order processor. We also then - // need to clean up redundancies and loop invariant code. - // FIXME: It would be really good to use a loop-integrated instruction - // combiner for cleanup here so that the unrolling and LICM can be pipelined - // across the loop nests. - // We do UnrollAndJam in a separate LPM to ensure it happens before unroll - if (EnableUnrollAndJam && PTO.LoopUnrolling) - FPM.addPass(LoopUnrollAndJamPass(Level.getSpeedupLevel())); - FPM.addPass(LoopUnrollPass(LoopUnrollOptions( - Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling, - PTO.ForgetAllSCEVInLoopUnroll))); - FPM.addPass(WarnMissedTransformationsPass()); - FPM.addPass(InstCombinePass()); - } -} - ModulePassManager PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, bool LTOPreLink) { @@ -1407,8 +1295,83 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, // from the TargetLibraryInfo. OptimizePM.addPass(InjectTLIMappings()); - addVectorPasses(Level, OptimizePM, /* IsLTO */ false); + // Now run the core loop vectorizer. + OptimizePM.addPass(LoopVectorizePass( + LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization))); + + // Eliminate loads by forwarding stores from the previous iteration to loads + // of the current iteration. + OptimizePM.addPass(LoopLoadEliminationPass()); + + // Cleanup after the loop optimization passes. + OptimizePM.addPass(InstCombinePass()); + + if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) { + // At higher optimization levels, try to clean up any runtime overlap and + // alignment checks inserted by the vectorizer. 
We want to track correlated + // runtime checks for two inner loops in the same outer loop, fold any + // common computations, hoist loop-invariant aspects out of any outer loop, + // and unswitch the runtime checks if possible. Once hoisted, we may have + // dead (or speculatable) control flows or more combining opportunities. + OptimizePM.addPass(EarlyCSEPass()); + OptimizePM.addPass(CorrelatedValuePropagationPass()); + OptimizePM.addPass(InstCombinePass()); + LoopPassManager LPM; + LPM.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap)); + LPM.addPass(SimpleLoopUnswitchPass(/* NonTrivial */ Level == + OptimizationLevel::O3)); + OptimizePM.addPass( + RequireAnalysisPass()); + OptimizePM.addPass(createFunctionToLoopPassAdaptor( + std::move(LPM), EnableMSSALoopDependency, + /*UseBlockFrequencyInfo=*/true)); + OptimizePM.addPass(SimplifyCFGPass()); + OptimizePM.addPass(InstCombinePass()); + } + + // Now that we've formed fast to execute loop structures, we do further + // optimizations. These are run afterward as they might block doing complex + // analyses and transforms such as what are needed for loop vectorization. + + // Cleanup after loop vectorization, etc. Simplification passes like CVP and + // GVN, loop transforms, and others have already run, so it's now better to + // convert to more optimized IR using more aggressive simplify CFG options. + // The extra sinking transform can create larger basic blocks, so do this + // before SLP vectorization. + OptimizePM.addPass(SimplifyCFGPass(SimplifyCFGOptions() + .forwardSwitchCondToPhi(true) + .convertSwitchToLookupTable(true) + .needCanonicalLoops(false) + .hoistCommonInsts(true) + .sinkCommonInsts(true))); + + // Optimize parallel scalar instruction chains into SIMD instructions. + if (PTO.SLPVectorization) { + OptimizePM.addPass(SLPVectorizerPass()); + if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) { + OptimizePM.addPass(EarlyCSEPass()); + } + } + // Enhance/cleanup vector code. 
+ OptimizePM.addPass(VectorCombinePass()); + OptimizePM.addPass(InstCombinePass()); + + // Unroll small loops to hide loop backedge latency and saturate any parallel + // execution resources of an out-of-order processor. We also then need to + // clean up redundancies and loop invariant code. + // FIXME: It would be really good to use a loop-integrated instruction + // combiner for cleanup here so that the unrolling and LICM can be pipelined + // across the loop nests. + // We do UnrollAndJam in a separate LPM to ensure it happens before unroll + if (EnableUnrollAndJam && PTO.LoopUnrolling) { + OptimizePM.addPass(LoopUnrollAndJamPass(Level.getSpeedupLevel())); + } + OptimizePM.addPass(LoopUnrollPass(LoopUnrollOptions( + Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling, + PTO.ForgetAllSCEVInLoopUnroll))); + OptimizePM.addPass(WarnMissedTransformationsPass()); + OptimizePM.addPass(InstCombinePass()); OptimizePM.addPass(RequireAnalysisPass()); OptimizePM.addPass(createFunctionToLoopPassAdaptor( LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap), @@ -1862,9 +1825,39 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, std::move(LPM), /*UseMemorySSA=*/false, /*UseBlockFrequencyInfo=*/true)); MainFPM.addPass(LoopDistributePass()); + MainFPM.addPass(LoopVectorizePass( + LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization))); + // The vectorizer may have significantly shortened a loop body; unroll again. 
+ MainFPM.addPass(LoopUnrollPass(LoopUnrollOptions( + Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling, + PTO.ForgetAllSCEVInLoopUnroll))); + + MainFPM.addPass(WarnMissedTransformationsPass()); + + MainFPM.addPass(InstCombinePass()); + MainFPM.addPass(SimplifyCFGPass(SimplifyCFGOptions().hoistCommonInsts(true))); + MainFPM.addPass(SCCPPass()); + MainFPM.addPass(InstCombinePass()); + MainFPM.addPass(BDCEPass()); + + // More scalar chains could be vectorized due to more alias information + if (PTO.SLPVectorization) { + MainFPM.addPass(SLPVectorizerPass()); + if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) { + MainFPM.addPass(EarlyCSEPass()); + } + } + + MainFPM.addPass(VectorCombinePass()); // Clean up partial vectorization. + + // After vectorization, assume intrinsics may tell us more about pointer + // alignments. + MainFPM.addPass(AlignmentFromAssumptionsPass()); - addVectorPasses(Level, MainFPM, /* IsLTO */ true); + // FIXME: Conditionally run LoadCombine here, after it's ported + // (in case we still have this pass, given its questionable usefulness). + MainFPM.addPass(InstCombinePass()); invokePeepholeEPCallbacks(MainFPM, Level); MainFPM.addPass(JumpThreadingPass(/*InsertFreezeWhenUnfoldingSelect*/ true)); MPM.addPass(createModuleToFunctionPassAdaptor(std::move(MainFPM))); diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp index 6e74f4c929bd02..87731e5eb12072 100644 --- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -523,109 +523,6 @@ void PassManagerBuilder::addFunctionSimplificationPasses( MPM.add(createControlHeightReductionLegacyPass()); } -/// FIXME: Should LTO cause any differences to this set of passes? 
-void PassManagerBuilder::addVectorPasses(legacy::PassManagerBase &PM, - bool IsLTO) { - PM.add(createLoopVectorizePass(!LoopsInterleaved, !LoopVectorize)); - - if (IsLTO) { - // The vectorizer may have significantly shortened a loop body; unroll - // again. Unroll small loops to hide loop backedge latency and saturate any - // parallel execution resources of an out-of-order processor. We also then - // need to clean up redundancies and loop invariant code. - // FIXME: It would be really good to use a loop-integrated instruction - // combiner for cleanup here so that the unrolling and LICM can be pipelined - // across the loop nests. - // We do UnrollAndJam in a separate LPM to ensure it happens before unroll - if (EnableUnrollAndJam && !DisableUnrollLoops) - PM.add(createLoopUnrollAndJamPass(OptLevel)); - PM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops, - ForgetAllSCEVInLoopUnroll)); - PM.add(createWarnMissedTransformationsPass()); - } - - if (!IsLTO) { - // Eliminate loads by forwarding stores from the previous iteration to loads - // of the current iteration. - PM.add(createLoopLoadEliminationPass()); - } - // Cleanup after the loop optimization passes. - PM.add(createInstructionCombiningPass()); - - if (OptLevel > 1 && ExtraVectorizerPasses) { - // At higher optimization levels, try to clean up any runtime overlap and - // alignment checks inserted by the vectorizer. We want to track correlated - // runtime checks for two inner loops in the same outer loop, fold any - // common computations, hoist loop-invariant aspects out of any outer loop, - // and unswitch the runtime checks if possible. Once hoisted, we may have - // dead (or speculatable) control flows or more combining opportunities. 
- PM.add(createEarlyCSEPass()); - PM.add(createCorrelatedValuePropagationPass()); - PM.add(createInstructionCombiningPass()); - PM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); - PM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget)); - PM.add(createCFGSimplificationPass()); - PM.add(createInstructionCombiningPass()); - } - - // Now that we've formed fast to execute loop structures, we do further - // optimizations. These are run afterward as they might block doing complex - // analyses and transforms such as what are needed for loop vectorization. - - // Cleanup after loop vectorization, etc. Simplification passes like CVP and - // GVN, loop transforms, and others have already run, so it's now better to - // convert to more optimized IR using more aggressive simplify CFG options. - // The extra sinking transform can create larger basic blocks, so do this - // before SLP vectorization. - PM.add(createCFGSimplificationPass(SimplifyCFGOptions() - .forwardSwitchCondToPhi(true) - .convertSwitchToLookupTable(true) - .needCanonicalLoops(false) - .hoistCommonInsts(true) - .sinkCommonInsts(true))); - - if (IsLTO) { - PM.add(createSCCPPass()); // Propagate exposed constants - PM.add(createInstructionCombiningPass()); // Clean up again - PM.add(createBitTrackingDCEPass()); - } - - // Optimize parallel scalar instruction chains into SIMD instructions. - if (SLPVectorize) { - PM.add(createSLPVectorizerPass()); - if (OptLevel > 1 && ExtraVectorizerPasses) - PM.add(createEarlyCSEPass()); - } - - // Enhance/cleanup vector code. - PM.add(createVectorCombinePass()); - - if (IsLTO) { - // After vectorization, assume intrinsics may tell us more about pointer - // alignments. - PM.add(createAlignmentFromAssumptionsPass()); - } - addExtensionsToPM(EP_Peephole, PM); - PM.add(createInstructionCombiningPass()); - - if (!IsLTO) { - // The vectorizer may have significantly shortened a loop body; unroll - // again. 
Unroll small loops to hide loop backedge latency and saturate any - // parallel execution resources of an out-of-order processor. We also then - // need to clean up redundancies and loop invariant code. - // FIXME: It would be really good to use a loop-integrated instruction - // combiner for cleanup here so that the unrolling and LICM can be pipelined - // across the loop nests. - // We do UnrollAndJam in a separate LPM to ensure it happens before unroll - if (EnableUnrollAndJam && !DisableUnrollLoops) - PM.add(createLoopUnrollAndJamPass(OptLevel)); - PM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops, - ForgetAllSCEVInLoopUnroll)); - if (!DisableUnrollLoops) - PM.add(createInstructionCombiningPass()); - } -} - void PassManagerBuilder::populateModulePassManager( legacy::PassManagerBase &MPM) { // Whether this is a default or *LTO pre-link pipeline. The FullLTO post-link @@ -897,9 +794,74 @@ void PassManagerBuilder::populateModulePassManager( // llvm.loop.distribute=true or when -enable-loop-distribute is specified. MPM.add(createLoopDistributePass()); - addVectorPasses(MPM, /* IsLTO */ false); + MPM.add(createLoopVectorizePass(!LoopsInterleaved, !LoopVectorize)); + + // Eliminate loads by forwarding stores from the previous iteration to loads + // of the current iteration. + MPM.add(createLoopLoadEliminationPass()); + + // FIXME: Because of #pragma vectorize enable, the passes below are always + // inserted in the pipeline, even when the vectorizer doesn't run (ex. when + // on -O1 and no #pragma is found). Would be good to have these two passes + // as function calls, so that we can only pass them when the vectorizer + // changed the code. + MPM.add(createInstructionCombiningPass()); + if (OptLevel > 1 && ExtraVectorizerPasses) { + // At higher optimization levels, try to clean up any runtime overlap and + // alignment checks inserted by the vectorizer. 
We want to track correlated
+    // runtime checks for two inner loops in the same outer loop, fold any
+    // common computations, hoist loop-invariant aspects out of any outer loop,
+    // and unswitch the runtime checks if possible. Once hoisted, we may have
+    // dead (or speculatable) control flows or more combining opportunities.
+    MPM.add(createEarlyCSEPass());
+    MPM.add(createCorrelatedValuePropagationPass());
+    MPM.add(createInstructionCombiningPass());
+    MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
+    MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget));
+    MPM.add(createCFGSimplificationPass());
+    MPM.add(createInstructionCombiningPass());
+  }
+
+  // Cleanup after loop vectorization, etc. Simplification passes like CVP and
+  // GVN, loop transforms, and others have already run, so it's now better to
+  // convert to more optimized IR using more aggressive simplify CFG options.
+  // The extra sinking transform can create larger basic blocks, so do this
+  // before SLP vectorization.
+  MPM.add(createCFGSimplificationPass(SimplifyCFGOptions()
+                                          .forwardSwitchCondToPhi(true)
+                                          .convertSwitchToLookupTable(true)
+                                          .needCanonicalLoops(false)
+                                          .hoistCommonInsts(true)
+                                          .sinkCommonInsts(true)));
+
+  if (SLPVectorize) {
+    MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains.
+    if (OptLevel > 1 && ExtraVectorizerPasses) {
+      MPM.add(createEarlyCSEPass());
+    }
+  }
+
+  // Enhance/cleanup vector code.
+  MPM.add(createVectorCombinePass());
+
+  addExtensionsToPM(EP_Peephole, MPM);
+  MPM.add(createInstructionCombiningPass());
+
+  if (EnableUnrollAndJam && !DisableUnrollLoops) {
+    // Unroll and Jam. We do this before unroll but need to be in a separate
+    // loop pass manager in order for the outer loop to be processed by
+    // unroll and jam before the inner loop is unrolled.
+ MPM.add(createLoopUnrollAndJamPass(OptLevel)); + } + + // Unroll small loops + MPM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops, + ForgetAllSCEVInLoopUnroll)); if (!DisableUnrollLoops) { + // LoopUnroll may generate some redundency to cleanup. + MPM.add(createInstructionCombiningPass()); + // Runtime unrolling will introduce runtime check in loop prologue. If the // unrolled loop is a inner loop, then the prologue will be inside the // outer loop. LICM pass can help to promote the runtime check out if the @@ -1121,9 +1083,35 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { PM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops, ForgetAllSCEVInLoopUnroll)); PM.add(createLoopDistributePass()); - - addVectorPasses(PM, /* IsLTO */ true); - + PM.add(createLoopVectorizePass(true, !LoopVectorize)); + // The vectorizer may have significantly shortened a loop body; unroll again. + PM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops, + ForgetAllSCEVInLoopUnroll)); + + PM.add(createWarnMissedTransformationsPass()); + + // Now that we've optimized loops (in particular loop induction variables), + // we may have exposed more scalar opportunities. Run parts of the scalar + // optimizer again at this point. + PM.add(createInstructionCombiningPass()); // Initial cleanup + PM.add(createCFGSimplificationPass(SimplifyCFGOptions() // if-convert + .hoistCommonInsts(true))); + PM.add(createSCCPPass()); // Propagate exposed constants + PM.add(createInstructionCombiningPass()); // Clean up again + PM.add(createBitTrackingDCEPass()); + + // More scalar chains could be vectorized due to more alias information + if (SLPVectorize) + PM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains. + + PM.add(createVectorCombinePass()); // Clean up partial vectorization. + + // After vectorization, assume intrinsics may tell us more about pointer + // alignments. 
+ PM.add(createAlignmentFromAssumptionsPass()); + + // Cleanup and simplify the code after the scalar optimizations. + PM.add(createInstructionCombiningPass()); addExtensionsToPM(EP_Peephole, PM); PM.add(createJumpThreadingPass(/*FreezeSelectCond*/ true));