diff --git a/llvm/lib/CodeGen/ExpandMemCmp.cpp b/llvm/lib/CodeGen/ExpandMemCmp.cpp index 74f93e1979532..61ab0cb869904 100644 --- a/llvm/lib/CodeGen/ExpandMemCmp.cpp +++ b/llvm/lib/CodeGen/ExpandMemCmp.cpp @@ -25,6 +25,7 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/IR/ProfDataUtils.h" #include "llvm/InitializePasses.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" @@ -62,7 +63,6 @@ static cl::opt MaxLoadsPerMemcmpOptSize( namespace { - // This class provides helper functions to expand a memcmp library call into an // inline expansion. class MemCmpExpansion { @@ -92,8 +92,7 @@ class MemCmpExpansion { // 1x1-byte load, which would be represented as [{16, 0}, {16, 16}, {1, 32}. struct LoadEntry { LoadEntry(unsigned LoadSize, uint64_t Offset) - : LoadSize(LoadSize), Offset(Offset) { - } + : LoadSize(LoadSize), Offset(Offset) {} // The size of the load for this block, in bytes. unsigned LoadSize; @@ -488,6 +487,8 @@ void MemCmpExpansion::emitLoadCompareBlockMultipleLoads(unsigned BlockIndex, // continue to next LoadCmpBlock or EndBlock. BasicBlock *BB = Builder.GetInsertBlock(); BranchInst *CmpBr = BranchInst::Create(ResBlock.BB, NextBB, Cmp); + setExplicitlyUnknownBranchWeightsIfProfiled(*CmpBr, DEBUG_TYPE, + CI->getFunction()); Builder.Insert(CmpBr); if (DTU) DTU->applyUpdates({{DominatorTree::Insert, BB, ResBlock.BB}, @@ -552,6 +553,8 @@ void MemCmpExpansion::emitLoadCompareBlock(unsigned BlockIndex) { // to next LoadCmpBlock or EndBlock. BasicBlock *BB = Builder.GetInsertBlock(); BranchInst *CmpBr = BranchInst::Create(NextBB, ResBlock.BB, Cmp); + setExplicitlyUnknownBranchWeightsIfProfiled(*CmpBr, DEBUG_TYPE, + CI->getFunction()); Builder.Insert(CmpBr); if (DTU) DTU->applyUpdates({{DominatorTree::Insert, BB, NextBB}, @@ -592,6 +595,8 @@ void MemCmpExpansion::emitMemCmpResultBlock() { Value *Res = Builder.CreateSelect(Cmp, Constant::getAllOnesValue(Builder.getInt32Ty()), ConstantInt::get(Builder.getInt32Ty(), 1)); + setExplicitlyUnknownBranchWeightsIfProfiled(*cast(Res), + DEBUG_TYPE, CI->getFunction()); PhiRes->addIncoming(Res, ResBlock.BB); BranchInst *NewBr = BranchInst::Create(EndBlock); @@ -724,7 +729,8 @@ Value *MemCmpExpansion::getMemCmpExpansion() { // calculate which source was larger. The calculation requires the // two loaded source values of each load compare block. // These will be saved in the phi nodes created by setupResultBlockPHINodes. - if (!IsUsedForZeroCmp) setupResultBlockPHINodes(); + if (!IsUsedForZeroCmp) + setupResultBlockPHINodes(); // Create the number of required load compare basic blocks. createLoadCmpBlocks(); @@ -853,15 +859,14 @@ static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI, const bool IsUsedForZeroCmp = IsBCmp || isOnlyUsedInZeroEqualityComparison(CI); bool OptForSize = llvm::shouldOptimizeForSize(CI->getParent(), PSI, BFI); - auto Options = TTI->enableMemCmpExpansion(OptForSize, - IsUsedForZeroCmp); - if (!Options) return false; + auto Options = TTI->enableMemCmpExpansion(OptForSize, IsUsedForZeroCmp); + if (!Options) + return false; if (MemCmpEqZeroNumLoadsPerBlock.getNumOccurrences()) Options.NumLoadsPerBlock = MemCmpEqZeroNumLoadsPerBlock; - if (OptForSize && - MaxLoadsPerMemcmpOptSize.getNumOccurrences()) + if (OptForSize && MaxLoadsPerMemcmpOptSize.getNumOccurrences()) Options.MaxNumLoads = MaxLoadsPerMemcmpOptSize; if (!OptForSize && MaxLoadsPerMemcmp.getNumOccurrences()) @@ -907,13 +912,14 @@ class ExpandMemCmpLegacyPass : public FunctionPass { } bool runOnFunction(Function &F) override { - if (skipFunction(F)) return false; + if (skipFunction(F)) + return false; auto *TPC = getAnalysisIfAvailable(); if (!TPC) { return false; } - const TargetLowering* TL = + const TargetLowering *TL = TPC->getTM().getSubtargetImpl(F)->getTargetLowering(); const TargetLibraryInfo *TLI = @@ -921,9 +927,9 @@ class ExpandMemCmpLegacyPass : public FunctionPass { const TargetTransformInfo *TTI = &getAnalysis().getTTI(F); auto *PSI = &getAnalysis().getPSI(); - auto *BFI = (PSI && PSI->hasProfileSummary()) ? - &getAnalysis().getBFI() : - nullptr; + auto *BFI = (PSI && PSI->hasProfileSummary()) + ? &getAnalysis().getBFI() + : nullptr; DominatorTree *DT = nullptr; if (auto *DTWP = getAnalysisIfAvailable()) DT = &DTWP->getDomTree(); @@ -969,7 +975,7 @@ PreservedAnalyses runImpl(Function &F, const TargetLibraryInfo *TLI, if (DT) DTU.emplace(DT, DomTreeUpdater::UpdateStrategy::Lazy); - const DataLayout& DL = F.getDataLayout(); + const DataLayout &DL = F.getDataLayout(); bool MadeChanges = false; for (auto BBIt = F.begin(); BBIt != F.end();) { if (runOnBlock(*BBIt, TLI, TTI, TL, DL, PSI, BFI, DTU ? &*DTU : nullptr)) { diff --git a/llvm/test/Transforms/ExpandMemCmp/AArch64/memcmp.ll b/llvm/test/Transforms/ExpandMemCmp/AArch64/memcmp.ll index 7df48d878bd0b..0ed3af535ac9c 100644 --- a/llvm/test/Transforms/ExpandMemCmp/AArch64/memcmp.ll +++ b/llvm/test/Transforms/ExpandMemCmp/AArch64/memcmp.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals --version 3 ; RUN: opt -S -expand-memcmp -memcmp-num-loads-per-block=1 -mtriple=aarch64-unknown-unknown < %s | FileCheck %s ; RUN: opt -S -passes=expand-memcmp -memcmp-num-loads-per-block=1 -mtriple=aarch64-unknown-unknown < %s | FileCheck %s @@ -98,15 +98,15 @@ define i32 @cmp6(ptr nocapture readonly %x, ptr nocapture readonly %y) { ret i32 %call } -define i32 @cmp7(ptr nocapture readonly %x, ptr nocapture readonly %y) { +define i32 @cmp7(ptr nocapture readonly %x, ptr nocapture readonly %y) !prof !0 { ; CHECK-LABEL: define i32 @cmp7( -; CHECK-SAME: ptr readonly captures(none) [[X:%.*]], ptr readonly captures(none) [[Y:%.*]]) { +; CHECK-SAME: ptr readonly captures(none) [[X:%.*]], ptr readonly captures(none) [[Y:%.*]]) !prof [[PROF0:![0-9]+]] { ; CHECK-NEXT: br label [[LOADBB:%.*]] ; CHECK: res_block: ; CHECK-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ] ; CHECK-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ] ; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] -; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1, !prof [[PROF1:![0-9]+]] ; CHECK-NEXT: br label [[ENDBLOCK:%.*]] ; CHECK: loadbb: ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[X]], align 1 @@ -114,7 +114,7 @@ define i32 @cmp7(ptr nocapture readonly %x, ptr nocapture readonly %y) { ; CHECK-NEXT: [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) ; CHECK-NEXT: [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]]) ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]] -; CHECK-NEXT: br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]] +; CHECK-NEXT: br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]], !prof [[PROF1]] ; CHECK: loadbb1: ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3 @@ -123,7 +123,7 @@ define i32 @cmp7(ptr nocapture readonly %x, ptr nocapture readonly %y) { ; CHECK-NEXT: [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]]) ; CHECK-NEXT: [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]]) ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]] -; CHECK-NEXT: br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; CHECK-NEXT: br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]], !prof [[PROF1]] ; CHECK: endblock: ; CHECK-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] ; CHECK-NEXT: ret i32 [[PHI_RES]] @@ -860,3 +860,11 @@ define i32 @cmp_eq16(ptr nocapture readonly %x, ptr nocapture readonly %y) { %conv = zext i1 %cmp to i32 ret i32 %conv } + +!0 = !{!"function_entry_count", i64 1000} +;. +; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } +;. +; CHECK: [[PROF0]] = !{!"function_entry_count", i64 1000} +; CHECK: [[PROF1]] = !{!"unknown", !"expand-memcmp"} +;. diff --git a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll index 0507ec9de542e..30da8b391f55d 100644 --- a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll +++ b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals ; RUN: opt -S -expand-memcmp -mtriple=i686-unknown-unknown -data-layout=e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128 < %s | FileCheck %s --check-prefix=X32 ; RUN: opt -S -passes=expand-memcmp -mtriple=i686-unknown-unknown -data-layout=e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128 < %s | FileCheck %s --check-prefix=X32 @@ -34,12 +34,12 @@ define i32 @cmp2_align2(ptr nocapture readonly align 2 %x, ptr nocapture readonl ret i32 %call } -define i32 @cmp3(ptr nocapture readonly %x, ptr nocapture readonly %y) { +define i32 @cmp3(ptr nocapture readonly %x, ptr nocapture readonly %y) !prof !0 { ; X32-LABEL: @cmp3( ; X32-NEXT: br label [[LOADBB:%.*]] ; X32: res_block: ; X32-NEXT: [[TMP1:%.*]] = icmp ult i16 [[TMP7:%.*]], [[TMP8:%.*]] -; X32-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 +; X32-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1, !prof [[PROF1:![0-9]+]] ; X32-NEXT: br label [[ENDBLOCK:%.*]] ; X32: loadbb: ; X32-NEXT: [[TMP5:%.*]] = load i16, ptr [[X:%.*]], align 1 @@ -47,7 +47,7 @@ define i32 @cmp3(ptr nocapture readonly %x, ptr nocapture readonly %y) { ; X32-NEXT: [[TMP7]] = call i16 @llvm.bswap.i16(i16 [[TMP5]]) ; X32-NEXT: [[TMP8]] = call i16 @llvm.bswap.i16(i16 [[TMP6]]) ; X32-NEXT: [[TMP9:%.*]] = icmp eq i16 [[TMP7]], [[TMP8]] -; X32-NEXT: br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32-NEXT: br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]], !prof [[PROF1]] ; X32: loadbb1: ; X32-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 2 ; X32-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 2 @@ -564,3 +564,10 @@ define i32 @cmp_eq16(ptr nocapture readonly %x, ptr nocapture readonly %y) { ret i32 %conv } +!0 = !{!"function_entry_count", i64 1000} +;. +; X32: attributes #[[ATTR0:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } +;. +; X32: [[META0:![0-9]+]] = !{!"function_entry_count", i64 1000} +; X32: [[PROF1]] = !{!"unknown", !"expand-memcmp"} +;. diff --git a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll index 86dc3e5245f24..b5075a2fcff0b 100644 --- a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll +++ b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals ; RUN: opt -S -expand-memcmp -memcmp-num-loads-per-block=1 -mtriple=x86_64-unknown-unknown -data-layout=e-m:o-i64:64-f80:128-n8:16:32:64-S128 < %s | FileCheck %s --check-prefix=X64 --check-prefix=X64_1LD ; RUN: opt -S -expand-memcmp -memcmp-num-loads-per-block=2 -mtriple=x86_64-unknown-unknown -data-layout=e-m:o-i64:64-f80:128-n8:16:32:64-S128 < %s | FileCheck %s --check-prefix=X64 --check-prefix=X64_2LD ; RUN: opt -S -passes=expand-memcmp -memcmp-num-loads-per-block=1 -mtriple=x86_64-unknown-unknown -data-layout=e-m:o-i64:64-f80:128-n8:16:32:64-S128 < %s | FileCheck %s --check-prefix=X64 --check-prefix=X64_1LD @@ -36,12 +36,12 @@ define i32 @cmp2_align2(ptr nocapture readonly align 2 %x, ptr nocapture readonl ret i32 %call } -define i32 @cmp3(ptr nocapture readonly %x, ptr nocapture readonly %y) { +define i32 @cmp3(ptr nocapture readonly %x, ptr nocapture readonly %y) !prof !0 { ; X64-LABEL: @cmp3( ; X64-NEXT: br label [[LOADBB:%.*]] ; X64: res_block: ; X64-NEXT: [[TMP1:%.*]] = icmp ult i16 [[TMP7:%.*]], [[TMP8:%.*]] -; X64-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 +; X64-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1, !prof [[PROF1:![0-9]+]] ; X64-NEXT: br label [[ENDBLOCK:%.*]] ; X64: loadbb: ; X64-NEXT: [[TMP5:%.*]] = load i16, ptr [[X:%.*]], align 1 @@ -49,7 +49,7 @@ define i32 @cmp3(ptr nocapture readonly %x, ptr nocapture readonly %y) { ; X64-NEXT: [[TMP7]] = call i16 @llvm.bswap.i16(i16 [[TMP5]]) ; X64-NEXT: [[TMP8]] = call i16 @llvm.bswap.i16(i16 [[TMP6]]) ; X64-NEXT: [[TMP9:%.*]] = icmp eq i16 [[TMP7]], [[TMP8]] -; X64-NEXT: br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64-NEXT: br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]], !prof [[PROF1]] ; X64: loadbb1: ; X64-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 2 ; X64-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 2 @@ -474,7 +474,7 @@ define i32 @cmp_eq2(ptr nocapture readonly %x, ptr nocapture readonly %y) { ret i32 %conv } -define i32 @cmp_eq3(ptr nocapture readonly %x, ptr nocapture readonly %y) { +define i32 @cmp_eq3(ptr nocapture readonly %x, ptr nocapture readonly %y) !prof !0 { ; X64_1LD-LABEL: @cmp_eq3( ; X64_1LD-NEXT: br label [[LOADBB:%.*]] ; X64_1LD: res_block: @@ -483,14 +483,14 @@ define i32 @cmp_eq3(ptr nocapture readonly %x, ptr nocapture readonly %y) { ; X64_1LD-NEXT: [[TMP3:%.*]] = load i16, ptr [[X:%.*]], align 1 ; X64_1LD-NEXT: [[TMP4:%.*]] = load i16, ptr [[Y:%.*]], align 1 ; X64_1LD-NEXT: [[TMP5:%.*]] = icmp ne i16 [[TMP3]], [[TMP4]] -; X64_1LD-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64_1LD-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]], !prof [[PROF1]] ; X64_1LD: loadbb1: ; X64_1LD-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 2 ; X64_1LD-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 2 ; X64_1LD-NEXT: [[TMP8:%.*]] = load i8, ptr [[TMP6]], align 1 ; X64_1LD-NEXT: [[TMP9:%.*]] = load i8, ptr [[TMP7]], align 1 ; X64_1LD-NEXT: [[TMP10:%.*]] = icmp ne i8 [[TMP8]], [[TMP9]] -; X64_1LD-NEXT: br i1 [[TMP10]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64_1LD-NEXT: br i1 [[TMP10]], label [[RES_BLOCK]], label [[ENDBLOCK]], !prof [[PROF1]] ; X64_1LD: endblock: ; X64_1LD-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] ; X64_1LD-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 @@ -1076,3 +1076,15 @@ define i32 @cmp_eq16(ptr nocapture readonly %x, ptr nocapture readonly %y) { ret i32 %conv } +!0 = !{!"function_entry_count", i64 1000} +;. +; X64_1LD: attributes #[[ATTR0:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } +;. +; X64_2LD: attributes #[[ATTR0:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } +;. +; X64_1LD: [[META0:![0-9]+]] = !{!"function_entry_count", i64 1000} +; X64_1LD: [[PROF1]] = !{!"unknown", !"expand-memcmp"} +;. +; X64_2LD: [[META0:![0-9]+]] = !{!"function_entry_count", i64 1000} +; X64_2LD: [[PROF1]] = !{!"unknown", !"expand-memcmp"} +;.