diff --git a/llvm/test/tools/llvm-profgen/Inputs/external-address.perfscript b/llvm/test/tools/llvm-profgen/Inputs/external-address.perfscript new file mode 100644 index 00000000000000..8e69d46157e9e1 --- /dev/null +++ b/llvm/test/tools/llvm-profgen/Inputs/external-address.perfscript @@ -0,0 +1,39 @@ +PERF_RECORD_MMAP2 2854748/2854748: [0x400000(0x1000) @ 0 00:1d 123291722 526021]: r-xp /home/inline-cs-noprobe.perfbin + +; Test for an external top address, should only ignore the call stack and keep unwinding the LBR + +; Valid LBR. The first 4006d7 will be kept for unwinding, the second will be truncated. + + ffffffff + ffffffff + 4006d7 + ffffffff + 4006d7 + ffffffff + 0x4006c8/0x40067e/P/-/-/0 0x40069b/0x400670/M/-/-/0 + + ; Valid LBR + ffffffff + 0x4006c8/0x40067e/P/-/-/0 0x40069b/0x400670/M/-/-/0 + +; Valid LBR + ffffffff + 0x4006c8/0xffffffff/P/-/-/0 0x40069b/0x400670/M/-/-/0 + +; Valid LBR + 40067e + 0x4006c8/0xffffffff/P/-/-/0 0x40069b/0x400670/M/-/-/0 + +; Valid LBR + ffffffff + 5541f689495641d7 + 0xffffffff/0xffffffff/P/-/-/0 0x4006c8/0xffffffff/P/-/-/0 0x40069b/0x400670/M/-/-/0 + +; Empty sample + ffffffff + 5541f689495641d7 + 0xffffffff/0xffffffff/P/-/-/0 0xffffffff/0xffffffff/P/-/-/0 + +; Invalid LBR + ffffffff + 0xffffffff/0xffffffff/P/-/-/0 0x40069b/0x400670/M/-/-/0 diff --git a/llvm/test/tools/llvm-profgen/Inputs/inline-cs-noprobe.perfscript b/llvm/test/tools/llvm-profgen/Inputs/inline-cs-noprobe.perfscript index 116bd0a2c4c103..e32fadd2eb2424 100644 --- a/llvm/test/tools/llvm-profgen/Inputs/inline-cs-noprobe.perfscript +++ b/llvm/test/tools/llvm-profgen/Inputs/inline-cs-noprobe.perfscript @@ -1,12 +1,5 @@ PERF_RECORD_MMAP2 2854748/2854748: [0x400000(0x1000) @ 0 00:1d 123291722 526021]: r-xp /home/inline-cs-noprobe.perfbin -; test for an external or invalid top address, should skip the whole sample - - ffffffff - 40067e - 5541f689495641d7 - 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 
0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x40069b/0x400670/M/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 - 40067e 5541f689495641d7 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x40069b/0x400670/M/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 0x4006c8/0x40067e/P/-/-/0 diff --git a/llvm/test/tools/llvm-profgen/cs-external-address.test b/llvm/test/tools/llvm-profgen/cs-external-address.test new file mode 100644 index 00000000000000..83c6ff2005a3f0 --- /dev/null +++ b/llvm/test/tools/llvm-profgen/cs-external-address.test @@ -0,0 +1,28 @@ +; RUN: llvm-profgen --format=text --perfscript=%S/Inputs/external-address.perfscript --binary=%S/Inputs/inline-cs-noprobe.perfbin --output=%t --skip-symbolization --profile-summary-hot-count=0 --compress-recursion=0 +; RUN: FileCheck %s --input-file %t --check-prefix=CHECK-UNWINDER + + +; CHECK-UNWINDER: [main:1 @ foo] +; CHECK-UNWINDER: 2 +; CHECK-UNWINDER: 670-6ad:4 +; CHECK-UNWINDER: 6bd-6c8:4 +; CHECK-UNWINDER: 2 +; CHECK-UNWINDER: 69b->670:5 +; CHECK-UNWINDER: 6c8->67e:1 +; CHECK-UNWINDER: [main:1 @ foo:3.1 @ bar] +; CHECK-UNWINDER: 1 +; CHECK-UNWINDER: 6af-6bb:4 +; CHECK-UNWINDER: 0 + +; Manually created to test if remaining call stack can be correctly unwinded. 
+; CHECK-UNWINDER: [main:1 @ foo:4 @ main:1 @ foo] +; CHECK-UNWINDER: 2 +; CHECK-UNWINDER: 670-6ad:1 +; CHECK-UNWINDER: 6bd-6c8:1 +; CHECK-UNWINDER: 2 +; CHECK-UNWINDER: 69b->670:1 +; CHECK-UNWINDER: 6c8->67e:1 +; CHECK-UNWINDER: [main:1 @ foo:4 @ main:1 @ foo:3.1 @ bar] +; CHECK-UNWINDER: 1 +; CHECK-UNWINDER: 6af-6bb:1 +; CHECK-UNWINDER: 0 diff --git a/llvm/test/tools/llvm-profgen/cs-interrupt.test b/llvm/test/tools/llvm-profgen/cs-interrupt.test index 95f38376e28d90..4541434b46dfb2 100644 --- a/llvm/test/tools/llvm-profgen/cs-interrupt.test +++ b/llvm/test/tools/llvm-profgen/cs-interrupt.test @@ -3,7 +3,6 @@ ; RUN: llvm-profgen --format=text --perfscript=%S/Inputs/cs-interrupt.perfscript --binary=%S/Inputs/noinline-cs-noprobe.perfbin --output=%t --skip-symbolization --profile-summary-cold-count=0 ; RUN: FileCheck %s --input-file %t --check-prefix=CHECK-UNWINDER ; RUN: llvm-profgen --format=text --perfscript=%S/Inputs/cs-interrupt.perfscript --binary=%S/Inputs/noinline-cs-noprobe.perfbin --output=%t --profile-summary-cold-count=0 ->>>>>>> 02ea7084c370 ([llvm-profgen] Support LBR only perf script) ; RUN: FileCheck %s --input-file %t ; CHECK:[main:1 @ foo]:88:0 diff --git a/llvm/test/tools/llvm-profgen/inline-noprobe2.test b/llvm/test/tools/llvm-profgen/inline-noprobe2.test index 70842f7b1ec57a..3c110f2f43d6ca 100644 --- a/llvm/test/tools/llvm-profgen/inline-noprobe2.test +++ b/llvm/test/tools/llvm-profgen/inline-noprobe2.test @@ -73,22 +73,30 @@ ;CHECK: 1: 5 ;CHECK: 2: 5 ;CHECK: 3: 5 -;CHECK: main:486:0 +;CHECK: main:906:0 ;CHECK: 0: 0 ;CHECK: 3: 0 ;CHECK: 4.1: 0 ;CHECK: 4.3: 0 -;CHECK: 5.1: 6 -;CHECK: 5.3: 6 -;CHECK: 6: 6 -;CHECK: 6.1: 6 -;CHECK: 6.3: 6 +;CHECK: 5.1: 11 +;CHECK: 5.3: 11 +;CHECK: 6: 11 +;CHECK: 6.1: 14 +;CHECK: 6.3: 11 ;CHECK: 7: 0 ;CHECK: 8: 0 quick_sort:1 ;CHECK: 9: 0 ;CHECK: 11: 0 ;CHECK: 14: 0 ;CHECK: 65499: 0 +;CHECK: quick_sort:903:25 +;CHECK: 1: 24 +;CHECK: 2: 12 partition_pivot_last:7 partition_pivot_first:5 +;CHECK: 3: 11 quick_sort:12 
+;CHECK: 4: 12 quick_sort:12 +;CHECK: 6: 24 +;CHECK: 65507: 12 + ; original code: diff --git a/llvm/tools/llvm-profgen/PerfReader.cpp b/llvm/tools/llvm-profgen/PerfReader.cpp index 1e9f0f42f683ce..5d7b732d41a3e6 100644 --- a/llvm/tools/llvm-profgen/PerfReader.cpp +++ b/llvm/tools/llvm-profgen/PerfReader.cpp @@ -59,6 +59,7 @@ void VirtualUnwinder::unwindCall(UnwindState &State) { // pro/epi tracker(Dwarf CFI) for the precise check. uint64_t Source = State.getCurrentLBRSource(); auto *ParentFrame = State.getParentFrame(); + if (ParentFrame == State.getDummyRootPtr() || ParentFrame->Address != Source) { State.switchToFrame(Source); @@ -121,7 +122,7 @@ void VirtualUnwinder::unwindReturn(UnwindState &State) { State.InstPtr.update(LBR.Source); } -void VirtualUnwinder::unwindBranchWithinFrame(UnwindState &State) { +void VirtualUnwinder::unwindBranch(UnwindState &State) { // TODO: Tolerate tail call for now, as we may see tail call from libraries. // This is only for intra function branches, excluding tail calls. uint64_t Source = State.getCurrentLBRSource(); @@ -219,7 +220,7 @@ void VirtualUnwinder::collectSamplesFromFrameTrie( void VirtualUnwinder::recordBranchCount(const LBREntry &Branch, UnwindState &State, uint64_t Repeat) { - if (Branch.IsArtificial) + if (Branch.IsArtificial || Branch.Target == ExternalAddr) return; if (Binary->usePseudoProbes()) { @@ -242,21 +243,18 @@ bool VirtualUnwinder::unwind(const PerfSample *Sample, uint64_t Repeat) { if (!State.validateInitialState()) return false; - // Also do not attempt linear unwind for the leaf range as it's incomplete. - bool IsLeaf = true; - // Now process the LBR samples in parrallel with stack sample // Note that we do not reverse the LBR entry order so we can // unwind the sample stack as we walk through LBR entries. 
while (State.hasNextLBR()) { State.checkStateConsistency(); - // Unwind implicit calls/returns from inlining, along the linear path, - // break into smaller sub section each with its own calling context. - if (!IsLeaf) { + // Do not attempt linear unwind for the leaf range as it's incomplete. + if (!State.IsLastLBR()) { + // Unwind implicit calls/returns from inlining, along the linear path, + // break into smaller sub section each with its own calling context. unwindLinear(State, Repeat); } - IsLeaf = false; // Save the LBR branch before it gets unwound. const LBREntry &Branch = State.getCurrentLBR(); @@ -271,9 +269,15 @@ bool VirtualUnwinder::unwind(const PerfSample *Sample, uint64_t Repeat) { // Unwind returns - check whether the IP is indeed at a return instruction unwindReturn(State); } else { - // Unwind branches - for regular intra function branches, we only - // need to record branch with context. - unwindBranchWithinFrame(State); + // Unwind branches + // For regular intra function branches, we only need to record branch with + // context. For an artificial branch crossing function boundaries, we got an + // issue with returning to external code. Take the two LBR entries for + // example: [foo:8(RETURN), ext:1] [ext:3(CALL), bar:1] After perf reader, + // we only get [foo:8(RETURN), bar:1], unwinder will be confused like foo + // return to bar. Here we detect and treat this case as BRANCH instead of + // RETURN which only updates the source address. + unwindBranch(State); } State.advanceLBR(); // Record `branch` with calling context after unwinding. 
@@ -432,9 +436,9 @@ void HybridPerfReader::unwindSamples() { if (Binary->useFSDiscriminator()) exitWithError("FS discriminator is not supported in CS profile."); std::set AllUntrackedCallsites; + VirtualUnwinder Unwinder(&SampleCounters, Binary); for (const auto &Item : AggregatedSamples) { const PerfSample *Sample = Item.first.getPtr(); - VirtualUnwinder Unwinder(&SampleCounters, Binary); Unwinder.unwind(Sample, Item.second); auto &CurrUntrackedCallsites = Unwinder.getUntrackedCallsites(); AllUntrackedCallsites.insert(CurrUntrackedCallsites.begin(), @@ -508,26 +512,32 @@ bool PerfScriptReader::extractLBRStack(TraceStream &TraceIt, bool IsOutgoing = SrcIsInternal && !DstIsInternal; bool IsArtificial = false; - // Ignore branches outside the current binary. Ignore all remaining branches - // if there's no incoming branch before the external branch in reverse - // order. + // Ignore branches outside the current binary. if (IsExternal) { - if (PrevTrDst) - continue; - if (!LBRStack.empty()) { + if (!PrevTrDst && !LBRStack.empty()) { WithColor::warning() << "Invalid transfer to external code in LBR record at line " << TraceIt.getLineNumber() << ": " << TraceIt.getCurrentLine() << "\n"; } - break; + // Do not ignore the entire sample, the remaining LBR can still be + // unwound using a context-less stack. + continue; } if (IsOutgoing) { if (!PrevTrDst) { - // This is unpaired outgoing jump which is likely due to interrupt or - // incomplete LBR trace. Ignore current and subsequent entries since - // they are likely in different contexts. + // This is a leading outgoing LBR, we should keep processing the LBRs. + if (LBRStack.empty()) { + NumLeadingOutgoingLBR++; + // Record this LBR since current source and next LBR's target is still + // a valid range. + LBRStack.emplace_back(LBREntry(Src, ExternalAddr, false)); + continue; + } + // This is middle unpaired outgoing jump which is likely due to + // interrupt or incomplete LBR trace. 
Ignore current and subsequent + // entries since they are likely in different contexts. break; } @@ -593,9 +603,17 @@ bool PerfScriptReader::extractCallstack(TraceStream &TraceIt, } TraceIt.advance(); // Currently intermixed frame from different binaries is not supported. - // Ignore caller frames not from binary of interest. - if (!Binary->addressIsCode(FrameAddr)) - break; + if (!Binary->addressIsCode(FrameAddr)) { + if (CallStack.empty()) + NumLeafExternalFrame++; + // Push a special value(ExternalAddr) for the external frames so that + // unwinder can still work on this with artificial Call/Return branch. + // After unwinding, the context will be truncated for external frame. + // Also deduplicate the consecutive external addresses. + if (CallStack.empty() || CallStack.back() != ExternalAddr) + CallStack.emplace_back(ExternalAddr); + continue; + } // We need to translate return address to call address for non-leaf frames. if (!CallStack.empty()) { @@ -613,6 +631,10 @@ bool PerfScriptReader::extractCallstack(TraceStream &TraceIt, CallStack.emplace_back(FrameAddr); } + // Strip out the bottom external addr. + if (CallStack.size() > 1 && CallStack.back() == ExternalAddr) + CallStack.pop_back(); + // Skip other unrelated line, find the next valid LBR line // Note that even for empty call stack, we should skip the address at the // bottom, otherwise the following pass may generate a truncated callstack @@ -885,6 +907,7 @@ uint64_t PerfScriptReader::parseAggregatedCount(TraceStream &TraceIt) { } void PerfScriptReader::parseSample(TraceStream &TraceIt) { + NumTotalSample++; uint64_t Count = parseAggregatedCount(TraceIt); assert(Count >= 1 && "Aggregated count should be >= 1!"); parseSample(TraceIt, Count); @@ -1131,6 +1154,11 @@ void PerfScriptReader::parsePerfTraces() { // Parse perf traces and do aggregation. 
parseAndAggregateTrace(); + emitWarningSummary(NumLeafExternalFrame, NumTotalSample, + "of samples have leaf external frame in call stack."); + emitWarningSummary(NumLeadingOutgoingLBR, NumTotalSample, + "of samples have leading external LBR."); + // Generate unsymbolized profile. warnTruncatedStack(); warnInvalidRange(); diff --git a/llvm/tools/llvm-profgen/PerfReader.h b/llvm/tools/llvm-profgen/PerfReader.h index 50177b281422e5..a133001a6ae200 100644 --- a/llvm/tools/llvm-profgen/PerfReader.h +++ b/llvm/tools/llvm-profgen/PerfReader.h @@ -213,6 +213,15 @@ using AggregatedCounter = Hashable::Hash, Hashable::Equal>; using SampleVector = SmallVector, 16>; + +// The special frame addresses. +enum SpecialFrameAddr { + // Dummy root of frame trie. + DummyRoot = 0, + // Represent all the addresses outside of current binary. + ExternalAddr = 1, +}; + // The state for the unwinder, it doesn't hold the data but only keep the // pointer/index of the data, While unwinding, the CallStack is changed // dynamicially and will be recorded as the context of the sample @@ -221,7 +230,7 @@ struct UnwindState { const ProfiledBinary *Binary; // Call stack trie node struct ProfiledFrame { - const uint64_t Address = 0; + const uint64_t Address = DummyRoot; ProfiledFrame *Parent; SampleVector RangeSamples; SampleVector BranchSamples; @@ -241,7 +250,8 @@ struct UnwindState { void recordBranchCount(uint64_t Source, uint64_t Target, uint64_t Count) { BranchSamples.emplace_back(std::make_tuple(Source, Target, Count)); } - bool isDummyRoot() { return Address == 0; } + bool isDummyRoot() { return Address == DummyRoot; } + bool isExternalFrame() { return Address == ExternalAddr; } bool isLeafFrame() { return Children.empty(); } }; @@ -262,6 +272,9 @@ struct UnwindState { bool validateInitialState() { uint64_t LBRLeaf = LBRStack[LBRIndex].Target; uint64_t LeafAddr = CurrentLeafFrame->Address; + assert((LBRLeaf != ExternalAddr || LBRLeaf == LeafAddr) && + "External leading LBR should match the 
leaf frame."); + // When we take a stack sample, ideally the sampling distance between the // leaf IP of stack and the last LBR target shouldn't be very large. // Use a heuristic size (0x100) to filter out broken records. @@ -283,8 +296,9 @@ struct UnwindState { uint64_t getCurrentLBRSource() const { return LBRStack[LBRIndex].Source; } uint64_t getCurrentLBRTarget() const { return LBRStack[LBRIndex].Target; } const LBREntry &getCurrentLBR() const { return LBRStack[LBRIndex]; } + bool IsLastLBR() const { return LBRIndex == 0; } + size_t getLBRStackSize() const { return LBRStack.size(); } void advanceLBR() { LBRIndex++; } - ProfiledFrame *getParentFrame() { return CurrentLeafFrame->Parent; } void pushFrame(uint64_t Address) { @@ -412,6 +426,10 @@ struct FrameStack { ProfiledBinary *Binary; FrameStack(ProfiledBinary *B) : Binary(B) {} bool pushFrame(UnwindState::ProfiledFrame *Cur) { + // Truncate the context for external frame since this isn't a real call + // context the compiler will see + if (Cur->isExternalFrame()) + return false; Stack.push_back(Cur->Address); return true; } @@ -428,6 +446,10 @@ struct ProbeStack { ProfiledBinary *Binary; ProbeStack(ProfiledBinary *B) : Binary(B) {} bool pushFrame(UnwindState::ProfiledFrame *Cur) { + // Truncate the context for external frame since this isn't a real call + // context the compiler will see + if (Cur->isExternalFrame()) + return false; const MCDecodedPseudoProbe *CallProbe = Binary->getCallProbeForAddr(Cur->Address); // We may not find a probe for a merged or external callsite. 
@@ -500,7 +522,7 @@ class VirtualUnwinder { void unwindCall(UnwindState &State); void unwindLinear(UnwindState &State, uint64_t Repeat); void unwindReturn(UnwindState &State); - void unwindBranchWithinFrame(UnwindState &State); + void unwindBranch(UnwindState &State); template void collectSamplesFromFrame(UnwindState::ProfiledFrame *Cur, T &Stack); @@ -546,6 +568,10 @@ class PerfReaderBase { ContextSampleCounterMap SampleCounters; bool ProfileIsCSFlat = false; + + uint64_t NumTotalSample = 0; + uint64_t NumLeafExternalFrame = 0; + uint64_t NumLeadingOutgoingLBR = 0; }; // Read perf script to parse the events and samples.