36 changes: 30 additions & 6 deletions llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,14 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LazyBlockFrequencyInfo.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
Expand All @@ -54,6 +57,7 @@
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include <algorithm>
#include <cassert>
#include <forward_list>
Expand Down Expand Up @@ -159,8 +163,9 @@ namespace {
class LoadEliminationForLoop {
public:
LoadEliminationForLoop(Loop *L, LoopInfo *LI, const LoopAccessInfo &LAI,
DominatorTree *DT)
: L(L), LI(LI), LAI(LAI), DT(DT), PSE(LAI.getPSE()) {}
DominatorTree *DT, BlockFrequencyInfo *BFI,
ProfileSummaryInfo* PSI)
: L(L), LI(LI), LAI(LAI), DT(DT), BFI(BFI), PSI(PSI), PSE(LAI.getPSE()) {}

/// Look through the loop-carried and loop-independent dependences in
/// this loop and find store->load dependences.
Expand Down Expand Up @@ -529,7 +534,11 @@ class LoadEliminationForLoop {
}

if (!Checks.empty() || !LAI.getPSE().getUnionPredicate().isAlwaysTrue()) {
if (L->getHeader()->getParent()->hasOptSize()) {
auto *HeaderBB = L->getHeader();
auto *F = HeaderBB->getParent();
bool OptForSize = F->hasOptSize() ||
llvm::shouldOptimizeForSize(HeaderBB, PSI, BFI);
if (OptForSize) {
LLVM_DEBUG(
dbgs() << "Versioning is needed but not allowed when optimizing "
"for size.\n");
Expand Down Expand Up @@ -572,13 +581,16 @@ class LoadEliminationForLoop {
LoopInfo *LI;
const LoopAccessInfo &LAI;
DominatorTree *DT;
BlockFrequencyInfo *BFI;
ProfileSummaryInfo *PSI;
PredicatedScalarEvolution PSE;
};

} // end anonymous namespace

static bool
eliminateLoadsAcrossLoops(Function &F, LoopInfo &LI, DominatorTree &DT,
BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
function_ref<const LoopAccessInfo &(Loop &)> GetLAI) {
// Build up a worklist of inner-loops to transform to avoid iterator
// invalidation.
Expand All @@ -597,7 +609,7 @@ eliminateLoadsAcrossLoops(Function &F, LoopInfo &LI, DominatorTree &DT,
bool Changed = false;
for (Loop *L : Worklist) {
// The actual work is performed by LoadEliminationForLoop.
LoadEliminationForLoop LEL(L, &LI, GetLAI(*L), &DT);
LoadEliminationForLoop LEL(L, &LI, GetLAI(*L), &DT, BFI, PSI);
Changed |= LEL.processLoop();
}
return Changed;
Expand All @@ -622,10 +634,14 @@ class LoopLoadElimination : public FunctionPass {
auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
auto &LAA = getAnalysis<LoopAccessLegacyAnalysis>();
auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
auto *BFI = (PSI && PSI->hasProfileSummary()) ?
&getAnalysis<LazyBlockFrequencyInfoPass>().getBFI() :
nullptr;

// Process each loop nest in the function.
return eliminateLoadsAcrossLoops(
F, LI, DT,
F, LI, DT, BFI, PSI,
[&LAA](Loop &L) -> const LoopAccessInfo & { return LAA.getInfo(&L); });
}

Expand All @@ -638,6 +654,8 @@ class LoopLoadElimination : public FunctionPass {
AU.addRequired<DominatorTreeWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
AU.addRequired<ProfileSummaryInfoWrapperPass>();
LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU);
}
};

Expand All @@ -653,6 +671,8 @@ INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LazyBlockFrequencyInfoPass)
INITIALIZE_PASS_END(LoopLoadElimination, LLE_OPTION, LLE_name, false, false)

FunctionPass *llvm::createLoopLoadEliminationPass() {
Expand All @@ -668,13 +688,17 @@ PreservedAnalyses LoopLoadEliminationPass::run(Function &F,
auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
auto &AA = AM.getResult<AAManager>(F);
auto &AC = AM.getResult<AssumptionAnalysis>(F);
auto &MAM = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
auto *PSI = MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
auto *BFI = (PSI && PSI->hasProfileSummary()) ?
&AM.getResult<BlockFrequencyAnalysis>(F) : nullptr;
MemorySSA *MSSA = EnableMSSALoopDependency
? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
: nullptr;

auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
bool Changed = eliminateLoadsAcrossLoops(
F, LI, DT, [&](Loop &L) -> const LoopAccessInfo & {
F, LI, DT, BFI, PSI, [&](Loop &L) -> const LoopAccessInfo & {
LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
return LAM.getResult<LoopAccessAnalysis>(L, AR);
});
Expand Down
3 changes: 2 additions & 1 deletion llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -294,7 +294,8 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
return LoopUnrollResult::Unmodified;

TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
L, SE, TTI, OptLevel, None, None, None, None, None, None);
L, SE, TTI, nullptr, nullptr, OptLevel,
None, None, None, None, None, None);
if (AllowUnrollAndJam.getNumOccurrences() > 0)
UP.UnrollAndJam = AllowUnrollAndJam;
if (UnrollAndJamThreshold.getNumOccurrences() > 0)
Expand Down
24 changes: 18 additions & 6 deletions llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/LazyBlockFrequencyInfo.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
Expand Down Expand Up @@ -55,6 +57,7 @@
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"
#include <algorithm>
#include <cassert>
Expand Down Expand Up @@ -165,7 +168,8 @@ static const unsigned NoThreshold = std::numeric_limits<unsigned>::max();
/// Gather the various unrolling parameters based on the defaults, compiler
/// flags, TTI overrides and user specified parameters.
TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences(
Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, int OptLevel,
Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI,
BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, int OptLevel,
Optional<unsigned> UserThreshold, Optional<unsigned> UserCount,
Optional<bool> UserAllowPartial, Optional<bool> UserRuntime,
Optional<bool> UserUpperBound, Optional<bool> UserAllowPeeling) {
Expand Down Expand Up @@ -198,7 +202,9 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences(
TTI.getUnrollingPreferences(L, SE, UP);

// Apply size attributes
if (L->getHeader()->getParent()->hasOptSize()) {
bool OptForSize = L->getHeader()->getParent()->hasOptSize() ||
llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI);
if (OptForSize) {
UP.Threshold = UP.OptSizeThreshold;
UP.PartialThreshold = UP.PartialOptSizeThreshold;
}
Expand Down Expand Up @@ -963,7 +969,9 @@ bool llvm::computeUnrollCount(
static LoopUnrollResult tryToUnrollLoop(
Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
const TargetTransformInfo &TTI, AssumptionCache &AC,
OptimizationRemarkEmitter &ORE, bool PreserveLCSSA, int OptLevel,
OptimizationRemarkEmitter &ORE,
BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
bool PreserveLCSSA, int OptLevel,
bool OnlyWhenForced, bool ForgetAllSCEV, Optional<unsigned> ProvidedCount,
Optional<unsigned> ProvidedThreshold, Optional<bool> ProvidedAllowPartial,
Optional<bool> ProvidedRuntime, Optional<bool> ProvidedUpperBound,
Expand All @@ -989,7 +997,7 @@ static LoopUnrollResult tryToUnrollLoop(
bool NotDuplicatable;
bool Convergent;
TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
L, SE, TTI, OptLevel, ProvidedThreshold, ProvidedCount,
L, SE, TTI, BFI, PSI, OptLevel, ProvidedThreshold, ProvidedCount,
ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound,
ProvidedAllowPeeling);
// Exit early if unrolling is disabled.
Expand Down Expand Up @@ -1176,7 +1184,8 @@ class LoopUnroll : public LoopPass {
bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);

LoopUnrollResult Result = tryToUnrollLoop(
L, DT, LI, SE, TTI, AC, ORE, PreserveLCSSA, OptLevel, OnlyWhenForced,
L, DT, LI, SE, TTI, AC, ORE, nullptr, nullptr,
PreserveLCSSA, OptLevel, OnlyWhenForced,
ForgetAllSCEV, ProvidedCount, ProvidedThreshold, ProvidedAllowPartial,
ProvidedRuntime, ProvidedUpperBound, ProvidedAllowPeeling);

Expand Down Expand Up @@ -1257,6 +1266,7 @@ PreservedAnalyses LoopFullUnrollPass::run(Loop &L, LoopAnalysisManager &AM,

bool Changed =
tryToUnrollLoop(&L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, *ORE,
/*BFI*/ nullptr, /*PSI*/ nullptr,
/*PreserveLCSSA*/ true, OptLevel, OnlyWhenForced,
/*ForgetAllSCEV*/ false, /*Count*/ None,
/*Threshold*/ None, /*AllowPartial*/ false,
Expand Down Expand Up @@ -1359,6 +1369,8 @@ PreservedAnalyses LoopUnrollPass::run(Function &F,
AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
ProfileSummaryInfo *PSI =
MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
auto *BFI = (PSI && PSI->hasProfileSummary()) ?
&AM.getResult<BlockFrequencyAnalysis>(F) : nullptr;

bool Changed = false;

Expand Down Expand Up @@ -1394,7 +1406,7 @@ PreservedAnalyses LoopUnrollPass::run(Function &F,
// The API here is quite complex to call and we allow to select some
// flavors of unrolling during construction time (by setting UnrollOpts).
LoopUnrollResult Result = tryToUnrollLoop(
&L, DT, &LI, SE, TTI, AC, ORE,
&L, DT, &LI, SE, TTI, AC, ORE, BFI, PSI,
/*PreserveLCSSA*/ true, UnrollOpts.OptLevel, UnrollOpts.OnlyWhenForced,
/*ForgetAllSCEV*/ false, /*Count*/ None,
/*Threshold*/ None, UnrollOpts.AllowPartial, UnrollOpts.AllowRuntime,
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Transforms/Utils/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ add_llvm_library(LLVMTransformUtils
SimplifyCFG.cpp
SimplifyIndVar.cpp
SimplifyLibCalls.cpp
SizeOpts.cpp
SplitModule.cpp
StripNonLineTableDebugInfo.cpp
SymbolRewriter.cpp
Expand Down
10 changes: 8 additions & 2 deletions llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,10 @@
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
Expand All @@ -34,6 +36,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"
#include "llvm/Transforms/Utils/SizeOpts.h"

using namespace llvm;
using namespace PatternMatch;
Expand Down Expand Up @@ -2375,7 +2378,9 @@ Value *LibCallSimplifier::optimizeFPuts(CallInst *CI, IRBuilder<> &B) {

// Don't rewrite fputs to fwrite when optimising for size because fwrite
// requires more arguments and thus extra MOVs are required.
if (CI->getFunction()->hasOptSize())
bool OptForSize = CI->getFunction()->hasOptSize() ||
llvm::shouldOptimizeForSize(CI->getParent(), PSI, BFI);
if (OptForSize)
return nullptr;

// Check if has any use
Expand Down Expand Up @@ -2750,9 +2755,10 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) {
LibCallSimplifier::LibCallSimplifier(
const DataLayout &DL, const TargetLibraryInfo *TLI,
OptimizationRemarkEmitter &ORE,
BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
function_ref<void(Instruction *, Value *)> Replacer,
function_ref<void(Instruction *)> Eraser)
: FortifiedSimplifier(TLI), DL(DL), TLI(TLI), ORE(ORE),
: FortifiedSimplifier(TLI), DL(DL), TLI(TLI), ORE(ORE), BFI(BFI), PSI(PSI),
UnsafeFPShrink(false), Replacer(Replacer), Eraser(Eraser) {}

void LibCallSimplifier::replaceAllUsesWith(Instruction *I, Value *With) {
Expand Down
37 changes: 37 additions & 0 deletions llvm/lib/Transforms/Utils/SizeOpts.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
//===-- SizeOpts.cpp - code size optimization related code ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains shared helpers for deciding when to optimize for code size.
//
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
using namespace llvm;

// Command-line switch (-pgso) that gates the profile-based answers given by
// the shouldOptimizeForSize() overloads in this file. The option is hidden
// and initialized to true, so it only needs to be spelled out (-pgso=false)
// to turn the profile-guided behavior off.
static cl::opt<bool> ProfileGuidedSizeOpt(
"pgso", cl::Hidden, cl::init(true),
cl::desc("Enable the profile guided size optimization. "));

/// Decide from profile data whether function \p F should be optimized for
/// size.
///
/// \param F   Function being queried; must not be null.
/// \param PSI Profile summary; may be null.
/// \param BFI Block frequencies for \p F; may be null.
/// \returns false when the -pgso switch is off, when \p PSI or \p BFI is
///          null, or when \p PSI has no profile summary. Otherwise returns
///          the result of PSI->isFunctionColdInCallGraph(F, *BFI).
///
/// This only reports the profile-based answer; callers in this change combine
/// it with the function's own hasOptSize() attribute check.
bool llvm::shouldOptimizeForSize(Function *F, ProfileSummaryInfo *PSI,
                                 BlockFrequencyInfo *BFI) {
  assert(F && "shouldOptimizeForSize requires a non-null Function");
  // Consult the command-line switch first: when it is off the answer is
  // always false, so there is no reason to touch the analyses.
  if (!ProfileGuidedSizeOpt)
    return false;
  // Without both analyses and an actual profile summary there is nothing to
  // base a coldness decision on.
  if (!PSI || !BFI || !PSI->hasProfileSummary())
    return false;
  return PSI->isFunctionColdInCallGraph(F, *BFI);
}

/// Decide from profile data whether basic block \p BB should be optimized for
/// size.
///
/// \param BB  Block being queried; must not be null.
/// \param PSI Profile summary; may be null.
/// \param BFI Block frequencies for the function containing \p BB; may be
///            null.
/// \returns false when the -pgso switch is off, when \p PSI or \p BFI is
///          null, or when \p PSI has no profile summary. Otherwise returns
///          the result of PSI->isColdBlock(BB, BFI).
///
/// This only reports the profile-based answer; callers in this change combine
/// it with the enclosing function's hasOptSize() attribute check.
bool llvm::shouldOptimizeForSize(BasicBlock *BB, ProfileSummaryInfo *PSI,
                                 BlockFrequencyInfo *BFI) {
  assert(BB && "shouldOptimizeForSize requires a non-null BasicBlock");
  // Consult the command-line switch first: when it is off the answer is
  // always false, so there is no reason to touch the analyses.
  if (!ProfileGuidedSizeOpt)
    return false;
  // Without both analyses and an actual profile summary there is nothing to
  // base a coldness decision on.
  if (!PSI || !BFI || !PSI->hasProfileSummary())
    return false;
  return PSI->isColdBlock(BB, BFI);
}
37 changes: 26 additions & 11 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
Expand Down Expand Up @@ -134,6 +135,7 @@
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
Expand Down Expand Up @@ -1452,12 +1454,13 @@ struct LoopVectorize : public FunctionPass {
auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();

std::function<const LoopAccessInfo &(Loop &)> GetLAA =
[&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };

return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
GetLAA, *ORE);
GetLAA, *ORE, PSI);
}

void getAnalysisUsage(AnalysisUsage &AU) const override {
Expand All @@ -1483,6 +1486,7 @@ struct LoopVectorize : public FunctionPass {

AU.addPreserved<BasicAAWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
AU.addRequired<ProfileSummaryInfoWrapperPass>();
}
};

Expand Down Expand Up @@ -6054,6 +6058,7 @@ INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)

namespace llvm {
Expand Down Expand Up @@ -7147,7 +7152,8 @@ static bool processLoopInVPlanNativePath(
Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
OptimizationRemarkEmitter *ORE, LoopVectorizeHints &Hints) {
OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {

assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
Function *F = L->getHeader()->getParent();
Expand All @@ -7162,10 +7168,12 @@ static bool processLoopInVPlanNativePath(
// Get user vectorization factor.
const unsigned UserVF = Hints.getWidth();

// Check the function attributes to find out if this function should be
// optimized for size.
// Check the function attributes and profiles to find out if this function
// should be optimized for size.
bool OptForSize =
Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->hasOptSize();
Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
(F->hasOptSize() ||
llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI));

// Plan how to best vectorize, return the best VF and its cost.
const VectorizationFactor VF = LVP.planInVPlanNativePath(OptForSize, UserVF);
Expand Down Expand Up @@ -7245,10 +7253,12 @@ bool LoopVectorizePass::processLoop(Loop *L) {
return false;
}

// Check the function attributes to find out if this function should be
// optimized for size.
// Check the function attributes and profiles to find out if this function
// should be optimized for size.
bool OptForSize =
Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->hasOptSize();
Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
(F->hasOptSize() ||
llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI));

// Entrance to the VPlan-native vectorization path. Outer loops are processed
// here. They may require CFG and instruction level transformations before
Expand All @@ -7257,7 +7267,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// pipeline.
if (!L->empty())
return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
ORE, Hints);
ORE, BFI, PSI, Hints);

assert(L->empty() && "Inner loop expected.");
// Check the loop for a trip count threshold: vectorize loops with a tiny trip
Expand Down Expand Up @@ -7523,7 +7533,7 @@ bool LoopVectorizePass::runImpl(
DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
OptimizationRemarkEmitter &ORE_) {
OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
SE = &SE_;
LI = &LI_;
TTI = &TTI_;
Expand All @@ -7535,6 +7545,7 @@ bool LoopVectorizePass::runImpl(
GetLAA = &GetLAA_;
DB = &DB_;
ORE = &ORE_;
PSI = PSI_;

// Don't attempt if
// 1. the target claims to have no vector registers, and
Expand Down Expand Up @@ -7603,8 +7614,12 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,
LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
return LAM.getResult<LoopAccessAnalysis>(L, AR);
};
const ModuleAnalysisManager &MAM =
AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
ProfileSummaryInfo *PSI =
MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
bool Changed =
runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE);
runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
if (!Changed)
return PreservedAnalyses::all();
PreservedAnalyses PA;
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/Other/new-pm-defaults.ll
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@
; CHECK-O-NEXT: Running pass: InstCombinePass
; CHECK-O-NEXT: Running analysis: OptimizationRemarkEmitterAnalysis
; CHECK-O-NEXT: Running analysis: AAManager
; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy
; CHECK-EP-PEEPHOLE-NEXT: Running pass: NoOpFunctionPass
; CHECK-O-NEXT: Running pass: SimplifyCFGPass
; CHECK-O-NEXT: Finished llvm::Function pass manager run.
Expand Down Expand Up @@ -245,7 +246,6 @@
; CHECK-O-NEXT: Running pass: SLPVectorizerPass
; CHECK-O-NEXT: Running pass: InstCombinePass
; CHECK-O-NEXT: Running pass: LoopUnrollPass
; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy
; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass
; CHECK-O-NEXT: Running pass: InstCombinePass
; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}OptimizationRemarkEmitterAnalysis
Expand Down
1 change: 1 addition & 0 deletions llvm/test/Other/new-pm-lto-defaults.ll
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@
; CHECK-O2-NEXT: Starting llvm::Function pass manager run.
; CHECK-O3-NEXT: Running pass: AggressiveInstCombinePass
; CHECK-O2-NEXT: Running pass: InstCombinePass
; CHECK-O2-NEXT: Running analysis: OuterAnalysisManagerProxy
; CHECK-EP-Peephole-NEXT: Running pass: NoOpFunctionPass
; CHECK-O2-NEXT: Finished llvm::Function pass manager run.
; CHECK-O2-NEXT: Running pass: ModuleToPostOrderCGSCCPassAdaptor<{{.*}}InlinerPass>
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/Other/new-pm-thinlto-defaults.ll
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@
; CHECK-O-NEXT: Running pass: InstCombinePass
; CHECK-PRELINK-O-NEXT: Running analysis: OptimizationRemarkEmitterAnalysis
; CHECK-O-NEXT: Running analysis: AAManager
; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy
; CHECK-O-NEXT: Running pass: SimplifyCFGPass
; CHECK-O-NEXT: Finished llvm::Function pass manager run.
; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}GlobalsAA
Expand Down Expand Up @@ -219,7 +220,6 @@
; CHECK-POSTLINK-O-NEXT: Running pass: SLPVectorizerPass
; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass
; CHECK-POSTLINK-O-NEXT: Running pass: LoopUnrollPass
; CHECK-POSTLINK-O-NEXT: Running analysis: OuterAnalysisManagerProxy
; CHECK-POSTLINK-O-NEXT: Running pass: WarnMissedTransformationsPass
; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass
; CHECK-POSTLINK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}OptimizationRemarkEmitterAnalysis
Expand Down
2 changes: 2 additions & 0 deletions llvm/test/Other/opt-O2-pipeline.ll
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,8 @@
; CHECK-NEXT: Scalar Evolution Analysis
; CHECK-NEXT: Function Alias Analysis Results
; CHECK-NEXT: Loop Access Analysis
; CHECK-NEXT: Lazy Branch Probability Analysis
; CHECK-NEXT: Lazy Block Frequency Analysis
; CHECK-NEXT: Loop Load Elimination
; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
; CHECK-NEXT: Function Alias Analysis Results
Expand Down
2 changes: 2 additions & 0 deletions llvm/test/Other/opt-O3-pipeline.ll
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,8 @@
; CHECK-NEXT: Scalar Evolution Analysis
; CHECK-NEXT: Function Alias Analysis Results
; CHECK-NEXT: Loop Access Analysis
; CHECK-NEXT: Lazy Branch Probability Analysis
; CHECK-NEXT: Lazy Block Frequency Analysis
; CHECK-NEXT: Loop Load Elimination
; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
; CHECK-NEXT: Function Alias Analysis Results
Expand Down
2 changes: 2 additions & 0 deletions llvm/test/Other/opt-Os-pipeline.ll
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,8 @@
; CHECK-NEXT: Scalar Evolution Analysis
; CHECK-NEXT: Function Alias Analysis Results
; CHECK-NEXT: Loop Access Analysis
; CHECK-NEXT: Lazy Branch Probability Analysis
; CHECK-NEXT: Lazy Block Frequency Analysis
; CHECK-NEXT: Loop Load Elimination
; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
; CHECK-NEXT: Function Alias Analysis Results
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
; RUN: opt -mtriple=arm-arm-none-eabi -consthoist -S < %s | FileCheck %s
; RUN: opt -mtriple=arm-arm-none-eabi -consthoist -pgso -S < %s | FileCheck %s -check-prefix=PGSO
; RUN: opt -mtriple=arm-arm-none-eabi -consthoist -pgso=false -S < %s | FileCheck %s -check-prefix=NPGSO

; There are different candidates here for the base constant: 1073876992 and
; 1073876996. But we don't want to see the latter because it results in
Expand All @@ -8,6 +10,7 @@ define void @foo() #0 {
entry:
; CHECK-LABEL: @foo
; CHECK-NOT: [[CONST1:%const_mat[0-9]*]] = add i32 %const, -4
; CHECK-LABEL: @foo_pgso
%0 = load volatile i32, i32* inttoptr (i32 1073876992 to i32*), align 4096
%or = or i32 %0, 1
store volatile i32 %or, i32* inttoptr (i32 1073876992 to i32*), align 4096
Expand Down Expand Up @@ -40,3 +43,59 @@ entry:
}

attributes #0 = { minsize norecurse nounwind optsize readnone uwtable }

; Profile-driven variant of the constant-hoisting test. The function uses
; attribute group #1 (no optsize/minsize) and is tagged with !prof !14, i.e. a
; function_entry_count of 0. The PGSO prefix (run with -pgso) expects that no
; "add i32 %const, -4" materialization appears; the NPGSO prefix (run with
; -pgso=false) expects that one does.
define void @foo_pgso() #1 !prof !14 {
entry:
; PGSO-LABEL: @foo_pgso
; PGSO-NOT: [[CONST2:%const_mat[0-9]*]] = add i32 %const, -4
; NPGSO-LABEL: @foo_pgso
; NPGSO: [[CONST2:%const_mat[0-9]*]] = add i32 %const, -4
%0 = load volatile i32, i32* inttoptr (i32 1073876992 to i32*), align 4096
%or = or i32 %0, 1
store volatile i32 %or, i32* inttoptr (i32 1073876992 to i32*), align 4096
%1 = load volatile i32, i32* inttoptr (i32 1073876996 to i32*), align 4
%and = and i32 %1, -117506048
store volatile i32 %and, i32* inttoptr (i32 1073876996 to i32*), align 4
%2 = load volatile i32, i32* inttoptr (i32 1073876992 to i32*), align 4096
%and1 = and i32 %2, -17367041
store volatile i32 %and1, i32* inttoptr (i32 1073876996 to i32*), align 4096
%3 = load volatile i32, i32* inttoptr (i32 1073876992 to i32*), align 4096
%and2 = and i32 %3, -262145
store volatile i32 %and2, i32* inttoptr (i32 1073876992 to i32*), align 4096
%4 = load volatile i32, i32* inttoptr (i32 1073876996 to i32*), align 4
%and3 = and i32 %4, -8323073
store volatile i32 %and3, i32* inttoptr (i32 1073876996 to i32*), align 4
store volatile i32 10420224, i32* inttoptr (i32 1073877000 to i32*), align 8
%5 = load volatile i32, i32* inttoptr (i32 1073876996 to i32*), align 4096
%or4 = or i32 %5, 65536
store volatile i32 %or4, i32* inttoptr (i32 1073876996 to i32*), align 4096
%6 = load volatile i32, i32* inttoptr (i32 1073881088 to i32*), align 8192
%or6.i.i = or i32 %6, 16
store volatile i32 %or6.i.i, i32* inttoptr (i32 1073881088 to i32*), align 8192
%7 = load volatile i32, i32* inttoptr (i32 1073881088 to i32*), align 8192
%and7.i.i = and i32 %7, -4
store volatile i32 %and7.i.i, i32* inttoptr (i32 1073881088 to i32*), align 8192
%8 = load volatile i32, i32* inttoptr (i32 1073881088 to i32*), align 8192
%or8.i.i = or i32 %8, 2
store volatile i32 %or8.i.i, i32* inttoptr (i32 1073881088 to i32*), align 8192
ret void
}

attributes #1 = { norecurse nounwind readnone uwtable } ; no optsize or minsize

!llvm.module.flags = !{!0}
!0 = !{i32 1, !"ProfileSummary", !1}
!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
!2 = !{!"ProfileFormat", !"InstrProf"}
!3 = !{!"TotalCount", i64 10000}
!4 = !{!"MaxCount", i64 10}
!5 = !{!"MaxInternalCount", i64 1}
!6 = !{!"MaxFunctionCount", i64 1000}
!7 = !{!"NumCounts", i64 3}
!8 = !{!"NumFunctions", i64 3}
!9 = !{!"DetailedSummary", !10}
!10 = !{!11, !12, !13}
!11 = !{i32 10000, i64 100, i32 1}
!12 = !{i32 999000, i64 100, i32 1}
!13 = !{i32 999999, i64 1, i32 2}
!14 = !{!"function_entry_count", i64 0}
33 changes: 33 additions & 0 deletions llvm/test/Transforms/InstCombine/fputs-opt-size.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
; because it requires more arguments and thus extra MOVs are required.
;
; RUN: opt < %s -instcombine -S | FileCheck %s
; RUN: opt < %s -instcombine -pgso -S | FileCheck %s -check-prefix=PGSO
; RUN: opt < %s -instcombine -pgso=false -S | FileCheck %s -check-prefix=NPGSO

%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i32, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i32, i32, [40 x i8] }
%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 }
Expand All @@ -26,3 +28,34 @@ declare i32 @fputs(i8* nocapture readonly, %struct._IO_FILE* nocapture) local_un

attributes #0 = { nounwind optsize }
attributes #1 = { nounwind optsize }

; Profile-driven variant of the fputs test. The definition line carries no
; attribute group; the function is tagged with !prof !14, i.e. a
; function_entry_count of 0. The PGSO prefix (run with -pgso) expects the
; fputs call to stay and no fwrite call to appear; the NPGSO prefix (run with
; -pgso=false) expects the opposite.
define i32 @main_pgso() local_unnamed_addr !prof !14 {
entry:
; PGSO-LABEL: @main_pgso(
; PGSO-NOT: call i64 @fwrite
; PGSO: call i32 @fputs
; NPGSO-LABEL: @main_pgso(
; NPGSO: call i64 @fwrite
; NPGSO-NOT: call i32 @fputs

%call = tail call %struct._IO_FILE* @fopen(i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.1, i32 0, i32 0)) #2
%call1 = tail call i32 @fputs(i8* getelementptr inbounds ([27 x i8], [27 x i8]* @.str.2, i32 0, i32 0), %struct._IO_FILE* %call) #2
ret i32 0
}

!llvm.module.flags = !{!0}
!0 = !{i32 1, !"ProfileSummary", !1}
!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
!2 = !{!"ProfileFormat", !"InstrProf"}
!3 = !{!"TotalCount", i64 10000}
!4 = !{!"MaxCount", i64 10}
!5 = !{!"MaxInternalCount", i64 1}
!6 = !{!"MaxFunctionCount", i64 1000}
!7 = !{!"NumCounts", i64 3}
!8 = !{!"NumFunctions", i64 3}
!9 = !{!"DetailedSummary", !10}
!10 = !{!11, !12, !13}
!11 = !{i32 10000, i64 100, i32 1}
!12 = !{i32 999000, i64 100, i32 1}
!13 = !{i32 999999, i64 1, i32 2}
!14 = !{!"function_entry_count", i64 0}
53 changes: 53 additions & 0 deletions llvm/test/Transforms/LoopLoadElim/opt-size.ll
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
; RUN: opt -basicaa -loop-load-elim -S < %s | FileCheck %s
; RUN: opt -basicaa -loop-load-elim -pgso -S < %s | FileCheck %s -check-prefix=PGSO
; RUN: opt -basicaa -loop-load-elim -pgso=false -S < %s | FileCheck %s -check-prefix=NPGSO

; When optimizing for size don't eliminate in this loop because the loop would
; have to be versioned first because A and C may alias.
Expand Down Expand Up @@ -74,3 +76,54 @@ for.body: ; preds = %for.body, %entry
for.end: ; preds = %for.body
ret void
}


; PGSO-LABEL: @f_pgso(
; NPGSO-LABEL: @f_pgso(
; Profile-driven variant of the loop-load-elimination size test. The function
; has no size attribute and is tagged with !prof !14, i.e. a
; function_entry_count of 0. Each iteration stores to A[i+1] and then loads
; A[i]. The PGSO prefix (run with -pgso) expects the multiply to keep using
; the loaded %a; the NPGSO prefix (run with -pgso=false) expects that exact
; multiply to be gone.
define void @f_pgso(i32* %A, i32* %B, i32* %C, i64 %N) !prof !14 {

entry:
br label %for.body

for.body: ; preds = %for.body, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1

%Aidx_next = getelementptr inbounds i32, i32* %A, i64 %indvars.iv.next
%Bidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
%Cidx = getelementptr inbounds i32, i32* %C, i64 %indvars.iv
%Aidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv

%b = load i32, i32* %Bidx, align 4
%a_p1 = add i32 %b, 2
store i32 %a_p1, i32* %Aidx_next, align 4

%a = load i32, i32* %Aidx, align 4
; PGSO: %c = mul i32 %a, 2
; NPGSO-NOT: %c = mul i32 %a, 2
%c = mul i32 %a, 2
store i32 %c, i32* %Cidx, align 4

%exitcond = icmp eq i64 %indvars.iv.next, %N
br i1 %exitcond, label %for.end, label %for.body

for.end: ; preds = %for.body
ret void
}

!llvm.module.flags = !{!0}
!0 = !{i32 1, !"ProfileSummary", !1}
!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
!2 = !{!"ProfileFormat", !"InstrProf"}
!3 = !{!"TotalCount", i64 10000}
!4 = !{!"MaxCount", i64 10}
!5 = !{!"MaxInternalCount", i64 1}
!6 = !{!"MaxFunctionCount", i64 1000}
!7 = !{!"NumCounts", i64 3}
!8 = !{!"NumFunctions", i64 3}
!9 = !{!"DetailedSummary", !10}
!10 = !{!11, !12, !13}
!11 = !{i32 10000, i64 100, i32 1}
!12 = !{i32 999000, i64 100, i32 1}
!13 = !{i32 999999, i64 1, i32 2}
!14 = !{!"function_entry_count", i64 0}
46 changes: 46 additions & 0 deletions llvm/test/Transforms/LoopUnroll/unroll-opt-attribute.ll
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
; RUN: opt < %s -S -loop-unroll -unroll-count=4 | FileCheck -check-prefix=CHECK_COUNT4 %s
; RUN: opt < %s -S -loop-unroll | FileCheck -check-prefix=CHECK_NOCOUNT %s
; RUN: opt < %s -S -passes='require<profile-summary>,function(unroll)' -pgso | FileCheck -check-prefix=PGSO %s
; RUN: opt < %s -S -passes='require<profile-summary>,function(unroll)' -pgso=false | FileCheck -check-prefix=NPGSO %s


;///////////////////// TEST 1 //////////////////////////////
Expand Down Expand Up @@ -128,3 +130,47 @@ for.end: ; preds = %for.body
; CHECK_NOCOUNT-LABEL: @Test4
; CHECK_NOCOUNT: phi
; CHECK_NOCOUNT: icmp

;///////////////////// TEST 5 //////////////////////////////

; This test shows that with PGO, this loop is cold and not unrolled.
; @Test5 has no size attribute; it only carries !prof !14 (a
; function_entry_count of 0). The checks that follow the function expect the
; loop's phi and icmp to remain in the -pgso run and to be absent in the
; -pgso=false run.

define i32 @Test5() !prof !14 {
entry:
br label %for.body

for.body: ; preds = %for.body, %entry
; Stores i into element i of @tab (indexed as [24 x i32]) for i = 0, 1, ...
; and leaves the loop once the incremented counter equals 24.
%i.05 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
%arrayidx = getelementptr inbounds [24 x i32], [24 x i32]* @tab, i32 0, i32 %i.05
store i32 %i.05, i32* %arrayidx, align 4
%inc = add nuw nsw i32 %i.05, 1
%exitcond = icmp eq i32 %inc, 24
br i1 %exitcond, label %for.end, label %for.body

for.end: ; preds = %for.body
ret i32 42
}

; PGSO-LABEL: @Test5
; PGSO: phi
; PGSO: icmp
; NPGSO-LABEL: @Test5
; NPGSO-NOT: phi
; NPGSO-NOT: icmp

; Synthetic profile summary, attached to the module through the
; "ProfileSummary" module flag.
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"ProfileSummary", !1}
!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
!2 = !{!"ProfileFormat", !"InstrProf"}
!3 = !{!"TotalCount", i64 10000}
!4 = !{!"MaxCount", i64 10}
!5 = !{!"MaxInternalCount", i64 1}
!6 = !{!"MaxFunctionCount", i64 1000}
!7 = !{!"NumCounts", i64 3}
!8 = !{!"NumFunctions", i64 3}
!9 = !{!"DetailedSummary", !10}
; NOTE(review): each detailed-summary entry is presumably
; (cutoff, min count, number of counts) - confirm against the LLVM docs.
!10 = !{!11, !12, !13}
!11 = !{i32 10000, i64 100, i32 1}
!12 = !{i32 999000, i64 100, i32 1}
!13 = !{i32 999999, i64 1, i32 2}
; Referenced as !prof !14 on @Test5: a function entry count of 0.
!14 = !{!"function_entry_count", i64 0}
43 changes: 43 additions & 0 deletions llvm/test/Transforms/LoopVectorize/optsize.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
; loop with the optimize for size or the minimize size attributes.
; REQUIRES: asserts
; RUN: opt < %s -loop-vectorize -S | FileCheck %s
; RUN: opt < %s -loop-vectorize -pgso -S | FileCheck %s -check-prefix=PGSO
; RUN: opt < %s -loop-vectorize -pgso=false -S | FileCheck %s -check-prefix=NPGSO

target datalayout = "E-m:e-p:32:32-i64:32-f64:32:64-a:0:32-n32-S128"

Expand Down Expand Up @@ -36,6 +38,7 @@ define i32 @foo_minsize() #1 {
; CHECK-LABEL: @foo_minsize(
; CHECK-NOT: <2 x i8>
; CHECK-NOT: <4 x i8>
; CHECK-LABEL: @foo_pgso(

entry:
br label %for.body
Expand All @@ -57,3 +60,43 @@ for.end: ; preds = %for.body

attributes #1 = { minsize }

; Profile-guided counterpart of the size-attribute tests: @foo_pgso has no
; attribute group and only carries !prof !14 (a function_entry_count of 0).
; The checks below expect no <N x i8> vector type inside this function in the
; -pgso run, and at least one such type in the -pgso=false run.
define i32 @foo_pgso() !prof !14 {
; PGSO-LABEL: @foo_pgso(
; PGSO-NOT: <{{[0-9]+}} x i8>
; NPGSO-LABEL: @foo_pgso(
; NPGSO: <{{[0-9]+}} x i8>

entry:
br label %for.body

for.body: ; preds = %for.body, %entry
; For each byte of @tab: overwrite it with 2 if it was 0, otherwise with 1.
%i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
%arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
%0 = load i8, i8* %arrayidx, align 1
%cmp1 = icmp eq i8 %0, 0
%. = select i1 %cmp1, i8 2, i8 1
store i8 %., i8* %arrayidx, align 1
%inc = add nsw i32 %i.08, 1
; NOTE(review): the exit test compares the pre-increment counter with 202
; although @tab is indexed as [32 x i8]; presumably carried over from the
; other functions in this file - confirm it is intentional.
%exitcond = icmp eq i32 %i.08, 202
br i1 %exitcond, label %for.end, label %for.body

for.end: ; preds = %for.body
ret i32 0
}

; Synthetic profile summary, attached to the module through the
; "ProfileSummary" module flag.
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"ProfileSummary", !1}
!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
!2 = !{!"ProfileFormat", !"InstrProf"}
!3 = !{!"TotalCount", i64 10000}
!4 = !{!"MaxCount", i64 10}
!5 = !{!"MaxInternalCount", i64 1}
!6 = !{!"MaxFunctionCount", i64 1000}
!7 = !{!"NumCounts", i64 3}
!8 = !{!"NumFunctions", i64 3}
!9 = !{!"DetailedSummary", !10}
; NOTE(review): each detailed-summary entry is presumably
; (cutoff, min count, number of counts) - confirm against the LLVM docs.
!10 = !{!11, !12, !13}
!11 = !{i32 10000, i64 100, i32 1}
!12 = !{i32 999000, i64 100, i32 1}
!13 = !{i32 999999, i64 1, i32 2}
; Referenced as !prof !14 on @foo_pgso: a function entry count of 0.
!14 = !{!"function_entry_count", i64 0}