45 changes: 44 additions & 1 deletion llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,11 @@
#ifndef LLVM_TRANSFORMS_IPO_MEMPROF_CONTEXT_DISAMBIGUATION_H
#define LLVM_TRANSFORMS_IPO_MEMPROF_CONTEXT_DISAMBIGUATION_H

#include "llvm/Analysis/IndirectCallPromotionAnalysis.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/ModuleSummaryIndex.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <functional>

namespace llvm {
Expand All @@ -36,15 +38,56 @@ class MemProfContextDisambiguation
/// the IR.
bool applyImport(Module &M);

// Builds the symtab and analysis used for ICP during ThinLTO backends.
bool initializeIndirectCallPromotionInfo(Module &M);

// Data structure for saving indirect call profile info for use in ICP with
// cloning. Entries are appended to the ICallAnalysisInfo list by recordICPInfo
// and later consumed by performICP.
struct ICallAnalysisData {
// The indirect call this entry describes.
CallBase *CB;
// Profiled promotion candidates for CB, held by value in this entry.
// NOTE(review): presumably copied from the result of
// ICallPromotionAnalysis::getPromotionCandidatesForInstruction - confirm in
// the implementation file.
std::vector<InstrProfValueData> CandidateProfileData;
// NOTE(review): presumably the number of promotion candidates reported by
// the ICP analysis for CB - confirm.
uint32_t NumCandidates;
// NOTE(review): presumably the total profile count reported by the ICP
// analysis for CB - confirm.
uint64_t TotalCount;
// NOTE(review): presumably the index into the function's AllCallsites array
// of the first CallsiteInfo record belonging to CB - confirm.
size_t CallsiteInfoStartIndex;
};

// Record information needed for ICP of an indirect call, depending on its
// profile information and the clone information recorded in the corresponding
// CallsiteInfo records. The SI iterator points to the current iteration point
// through AllCallsites in this function, and will be updated in this method
// as we iterate through profiled targets. The number of clones recorded for
// this indirect call is returned. The necessary information is recorded in
// the ICallAnalysisInfo list for later ICP.
unsigned recordICPInfo(CallBase *CB, ArrayRef<CallsiteInfo> AllCallsites,
ArrayRef<CallsiteInfo>::iterator &SI,
SmallVector<ICallAnalysisData> &ICallAnalysisInfo);

// Actually performs any needed ICP in the function, using the information
// recorded in the ICallAnalysisInfo list.
void performICP(Module &M, ArrayRef<CallsiteInfo> AllCallsites,
ArrayRef<std::unique_ptr<ValueToValueMapTy>> VMaps,
ArrayRef<ICallAnalysisData> ICallAnalysisInfo,
OptimizationRemarkEmitter &ORE);

/// Import summary containing cloning decisions for the ThinLTO backend.
const ModuleSummaryIndex *ImportSummary;

// Owns the import summary specified by internal options for testing the
// ThinLTO backend via opt (to simulate distributed ThinLTO).
std::unique_ptr<ModuleSummaryIndex> ImportSummaryForTesting;

// Whether we are building with SamplePGO. This is needed for correctly
// updating profile metadata on speculatively promoted calls.
bool isSamplePGO;

// Used when performing indirect call analysis and promotion when cloning in
// the ThinLTO backend during applyImport.
std::unique_ptr<InstrProfSymtab> Symtab;
std::unique_ptr<ICallPromotionAnalysis> ICallAnalysis;

public:
MemProfContextDisambiguation(const ModuleSummaryIndex *Summary = nullptr);
MemProfContextDisambiguation(const ModuleSummaryIndex *Summary = nullptr,
bool isSamplePGO = false);

PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);

Expand Down
59 changes: 36 additions & 23 deletions llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,11 @@ static cl::opt<std::string> ModuleSummaryDotFile(
"module-summary-dot-file", cl::Hidden, cl::value_desc("filename"),
cl::desc("File to emit dot graph of new summary into"));

// Hidden flag, on by default. When set, computeFunctionSummary synthesizes one
// MemProf Callsite record per profiled candidate target of an indirect call;
// when cleared, no Callsite records are created for indirect calls.
static cl::opt<bool> EnableMemProfIndirectCallSupport(
"enable-memprof-indirect-call-support", cl::init(true), cl::Hidden,
cl::desc(
"Enable MemProf support for summarizing and cloning indirect calls"));

extern cl::opt<bool> ScalePartialSampleProfileWorkingSetSize;

extern cl::opt<unsigned> MaxNumVTableAnnotations;
Expand Down Expand Up @@ -404,6 +409,11 @@ static void computeFunctionSummary(
if (HasLocalsInUsedOrAsm && CI && CI->isInlineAsm())
HasInlineAsmMaybeReferencingInternal = true;

// Compute this once per indirect call.
uint32_t NumCandidates = 0;
uint64_t TotalCount = 0;
MutableArrayRef<InstrProfValueData> CandidateProfileData;

auto *CalledValue = CB->getCalledOperand();
auto *CalledFunction = CB->getCalledFunction();
if (CalledValue && !CalledFunction) {
Expand Down Expand Up @@ -481,9 +491,7 @@ static void computeFunctionSummary(
}
}

uint32_t NumCandidates;
uint64_t TotalCount;
auto CandidateProfileData =
CandidateProfileData =
ICallAnalysis.getPromotionCandidatesForInstruction(&I, TotalCount,
NumCandidates);
for (const auto &Candidate : CandidateProfileData)
Expand All @@ -495,16 +503,6 @@ static void computeFunctionSummary(
if (!IsThinLTO)
continue;

// TODO: Skip indirect calls for now. Need to handle these better, likely
// by creating multiple Callsites, one per target, then speculatively
// devirtualize while applying clone info in the ThinLTO backends. This
// will also be important because we will have a different set of clone
// versions per target. This handling needs to match that in the ThinLTO
// backend so we handle things consistently for matching of callsite
// summaries to instructions.
if (!CalledFunction)
continue;

// Ensure we keep this analysis in sync with the handling in the ThinLTO
// backend (see MemProfContextDisambiguation::applyImport). Save this call
// so that we can skip it in checking the reverse case later.
Expand Down Expand Up @@ -555,13 +553,24 @@ static void computeFunctionSummary(
SmallVector<unsigned> StackIdIndices;
for (auto StackId : InstCallsite)
StackIdIndices.push_back(Index.addOrGetStackIdIndex(StackId));
// Use the original CalledValue, in case it was an alias. We want
// to record the call edge to the alias in that case. Eventually
// an alias summary will be created to associate the alias and
// aliasee.
auto CalleeValueInfo =
Index.getOrInsertValueInfo(cast<GlobalValue>(CalledValue));
Callsites.push_back({CalleeValueInfo, StackIdIndices});
if (CalledFunction) {
// Use the original CalledValue, in case it was an alias. We want
// to record the call edge to the alias in that case. Eventually
// an alias summary will be created to associate the alias and
// aliasee.
auto CalleeValueInfo =
Index.getOrInsertValueInfo(cast<GlobalValue>(CalledValue));
Callsites.push_back({CalleeValueInfo, StackIdIndices});
} else if (EnableMemProfIndirectCallSupport) {
// For indirect callsites, create multiple Callsites, one per target.
// This enables having a different set of clone versions per target,
// and we will apply the cloning decisions while speculatively
// devirtualizing in the ThinLTO backends.
for (const auto &Candidate : CandidateProfileData) {
auto CalleeValueInfo = Index.getOrInsertValueInfo(Candidate.Value);
Callsites.push_back({CalleeValueInfo, StackIdIndices});
}
}
}
}
}
Expand Down Expand Up @@ -1214,9 +1223,13 @@ bool llvm::mayHaveMemprofSummary(const CallBase *CB) {
if (CI && CalledFunction->isIntrinsic())
return false;
} else {
// TODO: For now skip indirect calls. See comments in
// computeFunctionSummary for what is needed to handle this.
return false;
// Skip inline assembly calls.
if (CI && CI->isInlineAsm())
return false;
// Skip direct calls via Constant.
if (!CalledValue || isa<Constant>(CalledValue))
return false;
return true;
}
return true;
}
4 changes: 2 additions & 2 deletions llvm/lib/IR/AsmWriter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3609,7 +3609,7 @@ void AssemblyWriter::printSummary(const GlobalValueSummary &Summary) {

void AssemblyWriter::printSummaryInfo(unsigned Slot, const ValueInfo &VI) {
Out << "^" << Slot << " = gv: (";
if (!VI.name().empty())
if (VI.hasName() && !VI.name().empty())
Out << "name: \"" << VI.name() << "\"";
else
Out << "guid: " << VI.getGUID();
Expand All @@ -3623,7 +3623,7 @@ void AssemblyWriter::printSummaryInfo(unsigned Slot, const ValueInfo &VI) {
Out << ")";
}
Out << ")";
if (!VI.name().empty())
if (VI.hasName() && !VI.name().empty())
Out << " ; guid = " << VI.getGUID();
Out << "\n";
}
Expand Down
7 changes: 5 additions & 2 deletions llvm/lib/Passes/PassBuilderPipelines.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1710,7 +1710,8 @@ ModulePassManager PassBuilder::buildThinLTODefaultPipeline(
// For ThinLTO we must apply the context disambiguation decisions early, to
// ensure we can correctly match the callsites to summary data.
if (EnableMemProfContextDisambiguation)
MPM.addPass(MemProfContextDisambiguation(ImportSummary));
MPM.addPass(MemProfContextDisambiguation(
ImportSummary, PGOOpt && PGOOpt->Action == PGOOptions::SampleUse));

// These passes import type identifier resolutions for whole-program
// devirtualization and CFI. They must run early because other passes may
Expand Down Expand Up @@ -1923,7 +1924,9 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
// amount of additional cloning required to distinguish the allocation
// contexts.
if (EnableMemProfContextDisambiguation)
MPM.addPass(MemProfContextDisambiguation());
MPM.addPass(MemProfContextDisambiguation(
/*Summary=*/nullptr,
PGOOpt && PGOOpt->Action == PGOOptions::SampleUse));

// Optimize globals again after we ran the inliner.
MPM.addPass(GlobalOptPass());
Expand Down
530 changes: 505 additions & 25 deletions llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6866,7 +6866,8 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
return Results;
};
auto ProcessGatheredLoads =
[&](ArrayRef<SmallVector<std::pair<LoadInst *, int>>> GatheredLoads,
[&, &TTI = *TTI](
ArrayRef<SmallVector<std::pair<LoadInst *, int>>> GatheredLoads,
bool Final = false) {
SmallVector<LoadInst *> NonVectorized;
for (ArrayRef<std::pair<LoadInst *, int>> LoadsDists : GatheredLoads) {
Expand Down Expand Up @@ -7009,7 +7010,7 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
OrdersType Order;
SmallVector<Value *> PointerOps;
// Segmented load detected - vectorize at maximum vector factor.
if (TTI->isLegalInterleavedAccessType(
if (TTI.isLegalInterleavedAccessType(
getWidenedType(Slice.front()->getType(), VF),
InterleaveFactor,
cast<LoadInst>(Slice.front())->getAlign(),
Expand Down
322 changes: 322 additions & 0 deletions llvm/test/ThinLTO/X86/memprof-icp.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,322 @@
;; Test that cloning of an indirect call works. We should perform ICP and update
;; the promoted call to the correct clone.

;; This was created from the following source code, for which both memprof and
;; instrumentation PGO profiles were collected and then applied, then the IR was
;; reduced using llvm-reduce with the expected FileCheck input.
;; TODO: Consider adding a sample PGO based test; however, that uses the same VP
;; metadata and should behave the same.

;; -- virtfunc.h: --
;; void external(int *x);
;;
;; class B0 {
;; public:
;; virtual int bar(unsigned s);
;; };
;;
;; class B : public B0 {
;; public:
;; int bar(unsigned s) override;
;; };
;;
;; int foo(B0 &b, unsigned s);

;; -- virtfunc.cc: --
;; #include "virtfunc.h"
;;
;; int foo(B0 &b, unsigned s) {
;; return b.bar(s);
;; }

;; -- virtfunc_main.cc: --
;; #include "virtfunc.h"
;; #include <stdio.h>
;; #include <unistd.h>
;;
;; int main() {
;; B b;
;; int x = foo(b, 1);
;; printf("%d\n", x);
;; int y = foo(b, 10);
;; printf("%d\n", y);
;; B0 b0;
;; x = foo(b0, 1);
;; printf("%d\n", x);
;; y = foo(b0, 10);
;; printf("%d\n", y);
;; return 0;
;; }
;;
;; int B0::bar(unsigned s) {
;; int *x = new int;
;; sleep(s);
;; external(x);
;; delete x;
;; return 1;
;; }
;;
;; int B::bar(unsigned s) {
;; int *x = new int;
;; sleep(s);
;; external(x);
;; delete x;
;; return 2;
;; }

;; -stats requires asserts
; REQUIRES: asserts

; RUN: split-file %s %t

; RUN: opt -thinlto-bc %t/main.ll >%t/main.o
; RUN: opt -thinlto-bc %t/foo.ll >%t/foo.o

;; Check that we get the synthesized callsite records. There should be 2, one
;; for each profiled target in the VP metadata. They will have the same stackIds
;; since the debug information for the callsite is the same.
; RUN: llvm-dis %t/foo.o -o - | FileCheck %s --check-prefix=CALLSITES
; CALLSITES: gv: (name: "_Z3fooR2B0j", {{.*}} callsites: ((callee: ^{{[0-9]+}}, clones: (0), stackIds: (16345663650247127235)), (callee: ^{{[0-9]+}}, clones: (0), stackIds: (16345663650247127235)))

;; Make sure that we don't get the synthesized callsite records if the
;; -enable-memprof-indirect-call-support flag is false.
; RUN: opt -thinlto-bc %t/foo.ll -enable-memprof-indirect-call-support=false -o - \
; RUN: | llvm-dis -o - | FileCheck %s --implicit-check-not callsites

;; First perform in-process ThinLTO
; RUN: llvm-lto2 run %t/main.o %t/foo.o -enable-memprof-context-disambiguation \
; RUN: -supports-hot-cold-new \
; RUN: -r=%t/foo.o,_Z3fooR2B0j,plx \
; RUN: -r=%t/main.o,_Z3fooR2B0j, \
; RUN: -r=%t/main.o,_Znwm, \
; RUN: -r=%t/main.o,_ZdlPvm, \
; RUN: -r=%t/main.o,_Z8externalPi, \
; RUN: -r=%t/main.o,main,plx \
; RUN: -r=%t/main.o,_ZN2B03barEj,plx \
; RUN: -r=%t/main.o,_ZN1B3barEj,plx \
; RUN: -r=%t/main.o,_ZTV1B,plx \
; RUN: -r=%t/main.o,_ZTVN10__cxxabiv120__si_class_type_infoE,plx \
; RUN: -r=%t/main.o,_ZTS1B,plx \
; RUN: -r=%t/main.o,_ZTVN10__cxxabiv117__class_type_infoE,plx \
; RUN: -r=%t/main.o,_ZTS2B0,plx \
; RUN: -r=%t/main.o,_ZTI2B0,plx \
; RUN: -r=%t/main.o,_ZTI1B,plx \
; RUN: -r=%t/main.o,_ZTV2B0,plx \
; RUN: -thinlto-threads=1 \
; RUN: -memprof-verify-ccg -memprof-verify-nodes -stats \
; RUN: -pass-remarks=. -save-temps \
; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=STATS \
; RUN: --check-prefix=STATS-BE --check-prefix=REMARKS-MAIN \
; RUN: --check-prefix=REMARKS-FOO

; RUN: llvm-dis %t.out.2.4.opt.bc -o - | FileCheck %s --check-prefix=IR

;; Try again but with distributed ThinLTO
; RUN: llvm-lto2 run %t/main.o %t/foo.o -enable-memprof-context-disambiguation \
; RUN: -supports-hot-cold-new \
; RUN: -thinlto-distributed-indexes \
; RUN: -r=%t/foo.o,_Z3fooR2B0j,plx \
; RUN: -r=%t/main.o,_Z3fooR2B0j, \
; RUN: -r=%t/main.o,_Znwm, \
; RUN: -r=%t/main.o,_ZdlPvm, \
; RUN: -r=%t/main.o,_Z8externalPi, \
; RUN: -r=%t/main.o,main,plx \
; RUN: -r=%t/main.o,_ZN2B03barEj,plx \
; RUN: -r=%t/main.o,_ZN1B3barEj,plx \
; RUN: -r=%t/main.o,_ZTV1B,plx \
; RUN: -r=%t/main.o,_ZTVN10__cxxabiv120__si_class_type_infoE,plx \
; RUN: -r=%t/main.o,_ZTS1B,plx \
; RUN: -r=%t/main.o,_ZTVN10__cxxabiv117__class_type_infoE,plx \
; RUN: -r=%t/main.o,_ZTS2B0,plx \
; RUN: -r=%t/main.o,_ZTI2B0,plx \
; RUN: -r=%t/main.o,_ZTI1B,plx \
; RUN: -r=%t/main.o,_ZTV2B0,plx \
; RUN: -memprof-verify-ccg -memprof-verify-nodes -stats \
; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=STATS

;; Run ThinLTO backend
; RUN: opt -import-all-index -passes=function-import,memprof-context-disambiguation,inline \
; RUN: -summary-file=%t/foo.o.thinlto.bc -memprof-import-summary=%t/foo.o.thinlto.bc \
; RUN: -enable-import-metadata -stats -pass-remarks=. \
; RUN: %t/foo.o -S 2>&1 | FileCheck %s --check-prefix=IR \
; RUN: --check-prefix=STATS-BE-DISTRIB --check-prefix=REMARKS-FOO

; REMARKS-MAIN: call in clone main assigned to call function clone _Z3fooR2B0j.memprof.1
; REMARKS-MAIN: call in clone main assigned to call function clone _Z3fooR2B0j.memprof.1
; REMARKS-MAIN: created clone _ZN2B03barEj.memprof.1
; REMARKS-MAIN: call in clone _ZN2B03barEj marked with memprof allocation attribute notcold
; REMARKS-MAIN: call in clone _ZN2B03barEj.memprof.1 marked with memprof allocation attribute cold
; REMARKS-MAIN: created clone _ZN1B3barEj.memprof.1
; REMARKS-MAIN: call in clone _ZN1B3barEj marked with memprof allocation attribute notcold
; REMARKS-MAIN: call in clone _ZN1B3barEj.memprof.1 marked with memprof allocation attribute cold
; REMARKS-FOO: created clone _Z3fooR2B0j.memprof.1
;; In each version of foo we should have promoted the indirect call to two conditional
;; direct calls, one to B::bar and one to B0::bar. The cloned version of foo should call
;; the cloned versions of bar for both promotions.
; REMARKS-FOO: Promote indirect call to _ZN1B3barEj with count 2 out of 4
; REMARKS-FOO: call in clone _Z3fooR2B0j promoted and assigned to call function clone _ZN1B3barEj
; REMARKS-FOO: Promote indirect call to _ZN1B3barEj with count 2 out of 4
; REMARKS-FOO: call in clone _Z3fooR2B0j.memprof.1 promoted and assigned to call function clone _ZN1B3barEj.memprof.1
; REMARKS-FOO: Promote indirect call to _ZN2B03barEj with count 2 out of 2
; REMARKS-FOO: call in clone _Z3fooR2B0j promoted and assigned to call function clone _ZN2B03barEj
; REMARKS-FOO: Promote indirect call to _ZN2B03barEj with count 2 out of 2
; REMARKS-FOO: call in clone _Z3fooR2B0j.memprof.1 promoted and assigned to call function clone _ZN2B03barEj.memprof.1
; REMARKS-FOO: created clone _ZN2B03barEj.memprof.1
; REMARKS-FOO: call in clone _ZN2B03barEj marked with memprof allocation attribute notcold
; REMARKS-FOO: call in clone _ZN2B03barEj.memprof.1 marked with memprof allocation attribute cold
; REMARKS-FOO: created clone _ZN1B3barEj.memprof.1
; REMARKS-FOO: call in clone _ZN1B3barEj marked with memprof allocation attribute notcold
; REMARKS-FOO: call in clone _ZN1B3barEj.memprof.1 marked with memprof allocation attribute cold

; STATS: 2 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during whole program analysis
; STATS-BE: 4 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend
; STATS: 2 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) during whole program analysis
; STATS-BE: 4 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) during ThinLTO backend
; STATS: 3 memprof-context-disambiguation - Number of function clones created during whole program analysis
; STATS-BE: 5 memprof-context-disambiguation - Number of function clones created during ThinLTO backend

; IR: define {{.*}} @_Z3fooR2B0j(
; IR: %1 = icmp eq ptr %0, @_ZN1B3barEj
; IR: br i1 %1, label %if.true.direct_targ, label %if.false.orig_indirect
; IR: if.true.direct_targ:
; IR: call {{.*}} @_Znwm(i64 noundef 4) #[[NOTCOLD:[0-9]+]]
; IR: if.false.orig_indirect:
; IR: %2 = icmp eq ptr %0, @_ZN2B03barEj
; IR: br i1 %2, label %if.true.direct_targ1, label %if.false.orig_indirect2
; IR: if.true.direct_targ1:
; IR: call {{.*}} @_Znwm(i64 noundef 4) #[[NOTCOLD]]
; IR: if.false.orig_indirect2:
; IR: call {{.*}} %0

; IR: define {{.*}} @_Z3fooR2B0j.memprof.1(
;; We should still compare against the original versions of bar since that is
;; what is in the vtable. However, we should have called the cloned versions
;; that perform cold allocations, which were subsequently inlined.
; IR: %1 = icmp eq ptr %0, @_ZN1B3barEj
; IR: br i1 %1, label %if.true.direct_targ, label %if.false.orig_indirect
; IR: if.true.direct_targ:
; IR: call {{.*}} @_Znwm(i64 noundef 4) #[[COLD:[0-9]+]]
; IR: if.false.orig_indirect:
; IR: %2 = icmp eq ptr %0, @_ZN2B03barEj
; IR: br i1 %2, label %if.true.direct_targ1, label %if.false.orig_indirect2
; IR: if.true.direct_targ1:
; IR: call {{.*}} @_Znwm(i64 noundef 4) #[[COLD]]
; IR: if.false.orig_indirect2:
; IR: call {{.*}} %0

; IR: attributes #[[NOTCOLD]] = {{.*}} "memprof"="notcold"
; IR: attributes #[[COLD]] = {{.*}} "memprof"="cold"

; STATS-BE-DISTRIB: 2 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend
; STATS-BE-DISTRIB: 2 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) during ThinLTO backend
; STATS-BE-DISTRIB: 3 memprof-context-disambiguation - Number of function clones created during ThinLTO backend

;--- foo.ll
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

; Reduced form of foo(B0 &, unsigned). Loads a pointer from %b and makes an
; indirect tail call through it. The call carries value-profile metadata
; (!prof !0, two profiled targets with count 2 each out of a total of 4) and a
; single stack id in !callsite !1.
define i32 @_Z3fooR2B0j(ptr %b) {
entry:
%0 = load ptr, ptr %b, align 8
%call = tail call i32 %0(ptr null, i32 0), !prof !0, !callsite !1
ret i32 0
}

!0 = !{!"VP", i32 0, i64 4, i64 4445083295448962937, i64 2, i64 -2718743882639408571, i64 2}
!1 = !{i64 -2101080423462424381}

;--- main.ll
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

@_ZTV1B = external constant { [3 x ptr] }
@_ZTVN10__cxxabiv120__si_class_type_infoE = external global [0 x ptr]
@_ZTS1B = external constant [3 x i8]
@_ZTVN10__cxxabiv117__class_type_infoE = external global [0 x ptr]
@_ZTS2B0 = external constant [4 x i8]
@_ZTI2B0 = external constant { ptr, ptr }
@_ZTI1B = external constant { ptr, ptr, ptr }
@_ZTV2B0 = external constant { [3 x ptr] }

; Reduced form of main(). Makes three direct calls to foo, each tagged with its
; own !callsite stack id (!30, !31, !32). Those ids are also the last stack id
; of, respectively, the cold B::bar context (!43), the notcold B0::bar context
; (!35) and the cold B0::bar context (!37).
define i32 @main() !prof !29 {
entry:
%call2 = call i32 @_Z3fooR2B0j(ptr null, i32 0), !callsite !30
%call4 = call i32 @_Z3fooR2B0j(ptr null, i32 0), !callsite !31
%call6 = call i32 @_Z3fooR2B0j(ptr null, i32 0), !callsite !32
ret i32 0
}

declare i32 @_Z3fooR2B0j(ptr, i32)

; Reduced form of B0::bar(unsigned). Allocates 4 bytes through @_Znwm; the
; allocation call carries !memprof !33, which lists one notcold context (!34)
; and one cold context (!36), plus its own !callsite !38. A volatile store of 0
; to the new memory follows.
define i32 @_ZN2B03barEj(ptr %this, i32 %s) {
entry:
%call = tail call ptr @_Znwm(i64 noundef 4) #0, !memprof !33, !callsite !38
store volatile i32 0, ptr %call, align 4
ret i32 0
}

declare ptr @_Znwm(i64)

declare void @_Z8externalPi()

declare void @_ZdlPvm()

; Reduced form of B::bar(unsigned). Allocates 4 bytes through @_Znwm; the
; allocation call carries !memprof !39, which lists one notcold context (!40)
; and one cold context (!42), plus its own !callsite !44. A volatile store of 0
; to the new memory follows.
define i32 @_ZN1B3barEj(ptr %this, i32 %s) {
entry:
%call = tail call ptr @_Znwm(i64 noundef 4) #0, !memprof !39, !callsite !44
store volatile i32 0, ptr %call, align 4
ret i32 0
}

; uselistorder directives
uselistorder ptr @_Z3fooR2B0j, { 2, 1, 0 }

attributes #0 = { builtin allocsize(0) }

!llvm.module.flags = !{!0}

!0 = !{i32 1, !"ProfileSummary", !1}
!1 = !{!2, !3, !4, !5, !6, !7, !8, !9, !10, !11}
!2 = !{!"ProfileFormat", !"InstrProf"}
!3 = !{!"TotalCount", i64 13}
!4 = !{!"MaxCount", i64 4}
!5 = !{!"MaxInternalCount", i64 0}
!6 = !{!"MaxFunctionCount", i64 4}
!7 = !{!"NumCounts", i64 5}
!8 = !{!"NumFunctions", i64 5}
!9 = !{!"IsPartialProfile", i64 0}
!10 = !{!"PartialProfileRatio", double 0.000000e+00}
!11 = !{!"DetailedSummary", !12}
!12 = !{!13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28}
!13 = !{i32 10000, i64 0, i32 0}
!14 = !{i32 100000, i64 4, i32 2}
!15 = !{i32 200000, i64 4, i32 2}
!16 = !{i32 300000, i64 4, i32 2}
!17 = !{i32 400000, i64 4, i32 2}
!18 = !{i32 500000, i64 4, i32 2}
!19 = !{i32 600000, i64 4, i32 2}
!20 = !{i32 700000, i64 2, i32 4}
!21 = !{i32 800000, i64 2, i32 4}
!22 = !{i32 900000, i64 2, i32 4}
!23 = !{i32 950000, i64 2, i32 4}
!24 = !{i32 990000, i64 2, i32 4}
!25 = !{i32 999000, i64 2, i32 4}
!26 = !{i32 999900, i64 2, i32 4}
!27 = !{i32 999990, i64 2, i32 4}
!28 = !{i32 999999, i64 2, i32 4}
!29 = !{!"function_entry_count", i64 1}
!30 = !{i64 -6490791336773930154}
!31 = !{i64 5188446645037944434}
!32 = !{i64 5583420417449503557}
!33 = !{!34, !36}
!34 = !{!35, !"notcold"}
!35 = !{i64 -852997907418798798, i64 -2101080423462424381, i64 5188446645037944434}
!36 = !{!37, !"cold"}
!37 = !{i64 -852997907418798798, i64 -2101080423462424381, i64 5583420417449503557}
!38 = !{i64 -852997907418798798}
!39 = !{!40, !42}
!40 = !{!41, !"notcold"}
!41 = !{i64 4457553070050523782, i64 -2101080423462424381, i64 132626519179914298}
!42 = !{!43, !"cold"}
!43 = !{i64 4457553070050523782, i64 -2101080423462424381, i64 -6490791336773930154}
!44 = !{i64 4457553070050523782}