Skip to content

Commit

Permalink
[CodeGen] Fine tune MachineFunctionSplitPass (MFS) for FSAFDO.
Browse files Browse the repository at this point in the history
The original MFS work D85368 shows good performance improvement with
Instrumented FDO. However, AutoFDO or Flow-Sensitive AutoFDO (FSAFDO)
does not show performance gain. This is mainly caused by a less
accurate profile compared to the iFDO profile.

For the past few months, we have been working to improve FSAFDO
quality, like in D145171. Taking advantage of this improvement, MFS
now shows performance improvements over FSAFDO profiles.

That being said, 2 minor changes need to be made, 1) An FS-AutoFDO
profile generation pass needs to be added right before MFS pass and an
FSAFDO profile load pass is needed when FS-AutoFDO is enabled and the
MFS flag is present. 2) MFS only applies to hot functions, because we
believe (and experiment also shows) FS-AutoFDO is more accurate about
functions that have plenty of samples than those with no or very few
samples.

With this improvement, we see a 1.2% performance improvement in clang
benchmark, 0.9% QPS improvement in our internal search benchmark, and
3%-5% improvement in internal storage benchmark.

This is #1 of the two patches that enables the improvement.

Reviewed By: wenlei, snehasish, xur

Differential Revision: https://reviews.llvm.org/D152399
  • Loading branch information
shenhanc78 committed Jul 10, 2023
1 parent 7c7b191 commit 8df7596
Show file tree
Hide file tree
Showing 2 changed files with 109 additions and 11 deletions.
52 changes: 41 additions & 11 deletions llvm/lib/CodeGen/MachineFunctionSplitter.cpp
Expand Up @@ -24,6 +24,8 @@
//===----------------------------------------------------------------------===//

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/EHUtils.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/CodeGen/BasicBlockSectionUtils.h"
Expand Down Expand Up @@ -94,16 +96,26 @@ static void setDescendantEHBlocksCold(MachineFunction &MF) {
}
}

static void finishAdjustingBasicBlocksAndLandingPads(MachineFunction &MF) {
auto Comparator = [](const MachineBasicBlock &X, const MachineBasicBlock &Y) {
return X.getSectionID().Type < Y.getSectionID().Type;
};
llvm::sortBasicBlocksAndUpdateBranches(MF, Comparator);
llvm::avoidZeroOffsetLandingPad(MF);
}

static bool isColdBlock(const MachineBasicBlock &MBB,
const MachineBlockFrequencyInfo *MBFI,
ProfileSummaryInfo *PSI) {
ProfileSummaryInfo *PSI, bool HasAccurateProfile) {
std::optional<uint64_t> Count = MBFI->getBlockProfileCount(&MBB);
// If using accurate profile, no count means cold.
// If no accurate profile, no count means "do not judge
// coldness".
if (!Count)
return true;
return HasAccurateProfile;

if (PercentileCutoff > 0) {
if (PercentileCutoff > 0)
return PSI->isColdCountNthPercentile(PercentileCutoff, *Count);
}
return (*Count < ColdCountThreshold);
}

Expand Down Expand Up @@ -140,9 +152,28 @@ bool MachineFunctionSplitter::runOnMachineFunction(MachineFunction &MF) {

MachineBlockFrequencyInfo *MBFI = nullptr;
ProfileSummaryInfo *PSI = nullptr;
// Whether this pass is using FSAFDO profile (not accurate) or IRPGO
// (accurate). HasAccurateProfile is only used when UseProfileData is true,
// but giving it a default value to silent any possible warning.
bool HasAccurateProfile = false;
if (UseProfileData) {
MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
// "HasAccurateProfile" is false for FSAFDO, true when using IRPGO
// (traditional instrumented FDO) or CSPGO profiles.
HasAccurateProfile =
PSI->hasInstrumentationProfile() || PSI->hasCSInstrumentationProfile();
// If HasAccurateProfile is false, we only trust hot functions,
// which have many samples, and consider them as split
// candidates. On the other hand, if HasAccurateProfile (likeIRPGO), we
// trust both cold and hot functions.
if (!HasAccurateProfile && !PSI->isFunctionHotInCallGraph(&MF, *MBFI)) {
// Split all EH code and it's descendant statically by default.
if (SplitAllEHCode)
setDescendantEHBlocksCold(MF);
finishAdjustingBasicBlocksAndLandingPads(MF);
return true;
}
}

SmallVector<MachineBasicBlock *, 2> LandingPads;
Expand All @@ -152,7 +183,8 @@ bool MachineFunctionSplitter::runOnMachineFunction(MachineFunction &MF) {

if (MBB.isEHPad())
LandingPads.push_back(&MBB);
else if (UseProfileData && isColdBlock(MBB, MBFI, PSI) && !SplitAllEHCode)
else if (UseProfileData &&
isColdBlock(MBB, MBFI, PSI, HasAccurateProfile) && !SplitAllEHCode)
MBB.setSectionID(MBBSectionID::ColdSectionID);
}

Expand All @@ -161,21 +193,19 @@ bool MachineFunctionSplitter::runOnMachineFunction(MachineFunction &MF) {
setDescendantEHBlocksCold(MF);
// We only split out eh pads if all of them are cold.
else {
// Here we have UseProfileData == true.
bool HasHotLandingPads = false;
for (const MachineBasicBlock *LP : LandingPads) {
if (!isColdBlock(*LP, MBFI, PSI))
if (!isColdBlock(*LP, MBFI, PSI, HasAccurateProfile))
HasHotLandingPads = true;
}
if (!HasHotLandingPads) {
for (MachineBasicBlock *LP : LandingPads)
LP->setSectionID(MBBSectionID::ColdSectionID);
}
}
auto Comparator = [](const MachineBasicBlock &X, const MachineBasicBlock &Y) {
return X.getSectionID().Type < Y.getSectionID().Type;
};
llvm::sortBasicBlocksAndUpdateBranches(MF, Comparator);
llvm::avoidZeroOffsetLandingPad(MF);

finishAdjustingBasicBlocksAndLandingPads(MF);
return true;
}

Expand Down
68 changes: 68 additions & 0 deletions llvm/test/CodeGen/X86/machine-function-splitter.ll
Expand Up @@ -2,6 +2,10 @@
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -split-machine-functions -mfs-psi-cutoff=0 -mfs-count-threshold=2000 | FileCheck %s --dump-input=always -check-prefix=MFS-OPTS1
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -split-machine-functions -mfs-psi-cutoff=950000 | FileCheck %s -check-prefix=MFS-OPTS2
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -split-machine-functions -mfs-split-ehcode | FileCheck %s -check-prefix=MFS-EH-SPLIT
; RUN: sed 's/InstrProf/SampleProfile/g' %s > %t.ll
; RUN: llc < %t.ll -mtriple=x86_64-unknown-linux-gnu -split-machine-functions | FileCheck %s --check-prefix=FSAFDO-MFS
; RUN: llc < %t.ll -mtriple=x86_64-unknown-linux-gnu -split-machine-functions | FileCheck %s --check-prefix=FSAFDO-MFS2

define void @foo1(i1 zeroext %0) nounwind !prof !14 !section_prefix !15 {
;; Check that cold block is moved to .text.split.
; MFS-DEFAULTS-LABEL: foo1
Expand Down Expand Up @@ -368,6 +372,66 @@ try.cont: ; preds = %entry
ret i32 %8
}

define void @foo14(i1 zeroext %0, i1 zeroext %1) nounwind !prof !24 {
; FSAFDO-MFS: .section .text.split.foo14,"ax"
; FSAFDO-MFS: foo14.cold:
br i1 %0, label %3, label %7, !prof !25

3:
%4 = call i32 @bar()
br label %7

5:
%6 = call i32 @baz()
br label %7

7:
br i1 %1, label %8, label %10, !prof !26

8:
%9 = call i32 @bam()
br label %12

10:
%11 = call i32 @baz()
br label %12

12:
%13 = tail call i32 @qux()
ret void
}

define void @foo15(i1 zeroext %0, i1 zeroext %1) nounwind !prof !27 {
;; HasAccurateProfile is false, foo15 is hot, but no profile data for
;; blocks, no split should happen.
; FSAFDO-MFS2-NOT: .section .text.split.foo15,"ax"
; FSAFDO-MFS2-NOT: foo15.cold:
br i1 %0, label %3, label %7

3:
%4 = call i32 @bar()
br label %7

5:
%6 = call i32 @baz()
br label %7

7:
br i1 %1, label %8, label %10

8:
%9 = call i32 @bam()
br label %12

10:
%11 = call i32 @baz()
br label %12

12:
%13 = tail call i32 @qux()
ret void
}

declare i32 @bar()
declare i32 @baz()
declare i32 @bam()
Expand Down Expand Up @@ -404,3 +468,7 @@ attributes #0 = { "implicit-section-name"="nosplit" }
!21 = !{!"branch_weights", i32 6000, i32 4000}
!22 = !{!"branch_weights", i32 80, i32 9920}
!23 = !{!"function_entry_count", i64 7}
!24 = !{!"function_entry_count", i64 10000}
!25 = !{!"branch_weights", i32 0, i32 7000}
!26 = !{!"branch_weights", i32 1000, i32 6000}
!27 = !{!"function_entry_count", i64 10000}

0 comments on commit 8df7596

Please sign in to comment.