[DSE] Split memory intrinsics if they are dead in the middle #75478
Conversation
@llvm/pr-subscribers-llvm-transforms
@llvm/pr-subscribers-debuginfo

Author: Nabeel Omer (omern1)

Changes: This patch enables Dead Store Elimination to split memory intrinsics that are dead in the middle into `Front` and `Rear`.

Resolves #72113. Pre-committing the tests to show diff in a moment.

Patch is 38.20 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/75478.diff

3 Files Affected:
diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 008dcc53fd44fc..7e9f1b6350a014 100644
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -48,6 +48,7 @@
#include "llvm/Analysis/MustExecute.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/BasicBlock.h"
@@ -554,6 +555,78 @@ static void shortenAssignment(Instruction *Inst, Value *OriginalDest,
}
}
+static bool tryToSplitMiddle(Instruction *DeadI, OverlapIntervalsTy &IntervalMap,
+ int64_t &DeadStart, uint64_t &DeadSize, const TargetTransformInfo& TTI) {
+ if (IntervalMap.empty() || !isShortenableAtTheEnd(DeadI))
+ return false;
+
+ OverlapIntervalsTy::iterator OII = IntervalMap.begin();
+ int64_t KillingStart = OII->second;
+ uint64_t KillingSize = OII->first - KillingStart;
+
+ assert(OII->first - KillingStart >= 0 && "Size expected to be positive");
+
+ uint64_t Threshold = TTI.getMaxMemIntrinsicInlineSizeThreshold();
+
+ // __Front__ ___Rear___
+ // | ------------- Dead ------------- |
+ // | --- Killing --- |
+
+ if (KillingStart < DeadStart ||
+ uint64_t(KillingStart + KillingSize) > uint64_t(DeadStart + DeadSize))
+ return false;
+
+ auto *DeadIntrinsic = cast<AnyMemIntrinsic>(DeadI);
+ Align PrefAlign = DeadIntrinsic->getDestAlign().valueOrOne();
+
+ // Assume Front is already correctly aligned.
+ uint64_t FrontSize = KillingStart - DeadStart;
+
+ int64_t RearStart = alignDown(
+ uint64_t(KillingStart + KillingSize), PrefAlign.value());
+ uint64_t RearSize = (DeadStart + DeadSize) - RearStart;
+
+ // If Front and Rear are both bigger than the threshold they won't be inlined
+ // so this seems like a bad idea. If Dead is smaller than the threshold it
+ // will be inlined so this isn't a good idea.
+ if ((FrontSize > Threshold && RearSize > Threshold) || DeadSize < Threshold)
+ return false;
+
+ Value *DeadWriteLength = DeadIntrinsic->getLength();
+ Value *DeadDest = DeadIntrinsic->getRawDest();
+
+ LLVM_DEBUG(dbgs() << "DSE: Split and shortened partially dead store: ["
+ << DeadStart << ", " << DeadSize + DeadStart
+ << "]\nInto: Front: [" << DeadStart << ", "
+ << DeadStart + FrontSize << "], Rear: [" << RearStart
+ << ", " << RearStart + RearSize << "]\n"
+ << "Killer: [" << KillingStart << ", "
+ << KillingSize + KillingStart << "]\n");
+
+ // Dead is now Front.
+ DeadIntrinsic->setLength(
+ ConstantInt::get(DeadWriteLength->getType(), FrontSize));
+ DeadIntrinsic->addDereferenceableParamAttr(0, FrontSize);
+
+ Value *Indices[1] = {ConstantInt::get(DeadWriteLength->getType(), RearStart)};
+ Instruction *RearDestGEP = GetElementPtrInst::CreateInBounds(
+ Type::getInt8Ty(DeadIntrinsic->getContext()), DeadDest, Indices, "",
+ DeadI);
+ auto *Rear = cast<AnyMemIntrinsic>(DeadIntrinsic->clone());
+ Rear->setDest(RearDestGEP);
+ Rear->setLength(ConstantInt::get(DeadWriteLength->getType(), RearSize));
+ Rear->insertAfter(RearDestGEP);
+ Rear->setDestAlignment(PrefAlign);
+ Rear->addDereferenceableParamAttr(0, RearSize);
+
+ shortenAssignment(DeadI, DeadDest, DeadStart * 8, DeadSize * 8, FrontSize * 8,
+ true);
+
+ IntervalMap.erase(OII);
+ DeadSize = FrontSize;
+ return true;
+}
+
static bool tryToShorten(Instruction *DeadI, int64_t &DeadStart,
uint64_t &DeadSize, int64_t KillingStart,
uint64_t KillingSize, bool IsOverwriteEnd) {
@@ -821,6 +894,7 @@ struct DSEState {
const TargetLibraryInfo &TLI;
const DataLayout &DL;
const LoopInfo &LI;
+ const TargetTransformInfo &TTI;
// Whether the function contains any irreducible control flow, useful for
// being accurately able to detect loops.
@@ -860,9 +934,9 @@ struct DSEState {
DSEState(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, DominatorTree &DT,
PostDominatorTree &PDT, const TargetLibraryInfo &TLI,
- const LoopInfo &LI)
+ const LoopInfo &LI, const TargetTransformInfo& TTI)
: F(F), AA(AA), EI(DT, &LI), BatchAA(AA, &EI), MSSA(MSSA), DT(DT),
- PDT(PDT), TLI(TLI), DL(F.getParent()->getDataLayout()), LI(LI) {
+ PDT(PDT), TLI(TLI), DL(F.getParent()->getDataLayout()), LI(LI), TTI(TTI) {
// Collect blocks with throwing instructions not modeled in MemorySSA and
// alloc-like objects.
unsigned PO = 0;
@@ -1978,7 +2052,7 @@ struct DSEState {
return false;
}
- bool removePartiallyOverlappedStores(InstOverlapIntervalsTy &IOL) {
+ bool removePartiallyOverlappedIntrinsicStores(InstOverlapIntervalsTy &IOL) {
bool Changed = false;
for (auto OI : IOL) {
Instruction *DeadI = OI.first;
@@ -1994,6 +2068,9 @@ struct DSEState {
if (IntervalMap.empty())
continue;
Changed |= tryToShortenBegin(DeadI, IntervalMap, DeadStart, DeadSize);
+ if (IntervalMap.empty())
+ continue;
+ Changed |= tryToSplitMiddle(DeadI, IntervalMap, DeadStart, DeadSize, TTI);
}
return Changed;
}
@@ -2059,10 +2136,10 @@ struct DSEState {
static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
DominatorTree &DT, PostDominatorTree &PDT,
const TargetLibraryInfo &TLI,
- const LoopInfo &LI) {
+ const LoopInfo &LI, const TargetTransformInfo& TTI) {
bool MadeChange = false;
- DSEState State(F, AA, MSSA, DT, PDT, TLI, LI);
+ DSEState State(F, AA, MSSA, DT, PDT, TLI, LI, TTI);
// For each store:
for (unsigned I = 0; I < State.MemDefs.size(); I++) {
MemoryDef *KillingDef = State.MemDefs[I];
@@ -2226,7 +2303,7 @@ static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
if (EnablePartialOverwriteTracking)
for (auto &KV : State.IOLs)
- MadeChange |= State.removePartiallyOverlappedStores(KV.second);
+ MadeChange |= State.removePartiallyOverlappedIntrinsicStores(KV.second);
MadeChange |= State.eliminateRedundantStoresOfExistingValues();
MadeChange |= State.eliminateDeadWritesAtEndOfFunction();
@@ -2244,8 +2321,9 @@ PreservedAnalyses DSEPass::run(Function &F, FunctionAnalysisManager &AM) {
MemorySSA &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
PostDominatorTree &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
LoopInfo &LI = AM.getResult<LoopAnalysis>(F);
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
- bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI, LI);
+ bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI, LI, TTI);
#ifdef LLVM_ENABLE_STATS
if (AreStatisticsEnabled())
diff --git a/llvm/test/DebugInfo/dse-split-memintrinsics.ll b/llvm/test/DebugInfo/dse-split-memintrinsics.ll
new file mode 100644
index 00000000000000..f7c083a2f0969c
--- /dev/null
+++ b/llvm/test/DebugInfo/dse-split-memintrinsics.ll
@@ -0,0 +1,305 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=dse -S | FileCheck %s
+
+; Check a dbg.assign is inserted that sets the dead middle bits to no-location (see tryToSplitMiddle).
+
+define dso_local void @_Z22overwrite_middle_localv() local_unnamed_addr #0 !dbg !103 {
+; CHECK-LABEL: define dso_local void @_Z22overwrite_middle_localv(
+; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !dbg [[DBG103:![0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[BLOB:%.*]] = alloca [1000 x i8], align 16, !DIAssignID [[DIASSIGNID112:![0-9]+]]
+; CHECK-NEXT: call void @llvm.dbg.assign(metadata i1 undef, metadata [[META108:![0-9]+]], metadata !DIExpression(), metadata [[DIASSIGNID112]], metadata ptr [[BLOB]], metadata !DIExpression()), !dbg [[DBG113:![0-9]+]]
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 1000, ptr nonnull [[BLOB]]) #[[ATTR5:[0-9]+]], !dbg [[DBG114:![0-9]+]]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[BLOB]], i64 976
+; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr noundef nonnull align 16 dereferenceable(24) [[TMP0]], i8 5, i64 24, i1 false), !dbg [[DBG115:![0-9]+]], !DIAssignID [[DIASSIGNID116:![0-9]+]]
+; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr noundef nonnull align 16 dereferenceable(10) [[BLOB]], i8 5, i64 10, i1 false), !dbg [[DBG115]], !DIAssignID [[DIASSIGNID116]]
+; CHECK-NEXT: call void @llvm.dbg.assign(metadata i1 undef, metadata [[META108]], metadata !DIExpression(), metadata [[DIASSIGNID116]], metadata ptr [[BLOB]], metadata !DIExpression()), !dbg [[DBG113]]
+; CHECK-NEXT: call void @llvm.dbg.assign(metadata i1 undef, metadata [[META108]], metadata !DIExpression(DW_OP_LLVM_fragment, 80, 7920), metadata [[META117:![0-9]+]], metadata ptr undef, metadata !DIExpression()), !dbg [[DBG113]]
+; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[BLOB]], i64 10, !dbg [[DBG118:![0-9]+]]
+; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr noundef nonnull align 2 dereferenceable(980) [[ADD_PTR]], i8 3, i64 980, i1 false), !dbg [[DBG119:![0-9]+]], !DIAssignID [[DIASSIGNID120:![0-9]+]]
+; CHECK-NEXT: call void @llvm.dbg.assign(metadata i1 undef, metadata [[META108]], metadata !DIExpression(DW_OP_LLVM_fragment, 80, 7840), metadata [[DIASSIGNID120]], metadata ptr [[ADD_PTR]], metadata !DIExpression()), !dbg [[DBG113]]
+; CHECK-NEXT: call void @_Z3escPc(ptr noundef nonnull [[BLOB]]), !dbg [[DBG121:![0-9]+]]
+; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 1000, ptr nonnull [[BLOB]]) #[[ATTR5]], !dbg [[DBG122:![0-9]+]]
+; CHECK-NEXT: ret void, !dbg [[DBG122]]
+;
+entry:
+ %blob = alloca [1000 x i8], align 16, !DIAssignID !112
+ call void @llvm.dbg.assign(metadata i1 undef, metadata !108, metadata !DIExpression(), metadata !112, metadata ptr %blob, metadata !DIExpression()), !dbg !113
+ call void @llvm.lifetime.start.p0(i64 1000, ptr nonnull %blob) #5, !dbg !114
+ call void @llvm.memset.p0.i64(ptr noundef nonnull align 16 dereferenceable(1000) %blob, i8 5, i64 1000, i1 false), !dbg !115, !DIAssignID !116
+ call void @llvm.dbg.assign(metadata i1 undef, metadata !108, metadata !DIExpression(), metadata !116, metadata ptr %blob, metadata !DIExpression()), !dbg !113
+ %add.ptr = getelementptr inbounds i8, ptr %blob, i64 10, !dbg !117
+ call void @llvm.memset.p0.i64(ptr noundef nonnull align 2 dereferenceable(980) %add.ptr, i8 3, i64 980, i1 false), !dbg !118, !DIAssignID !119
+ call void @llvm.dbg.assign(metadata i1 undef, metadata !108, metadata !DIExpression(DW_OP_LLVM_fragment, 80, 7840), metadata !119, metadata ptr %add.ptr, metadata !DIExpression()), !dbg !113
+ call void @_Z3escPc(ptr noundef nonnull %blob), !dbg !120
+ call void @llvm.lifetime.end.p0(i64 1000, ptr nonnull %blob) #5, !dbg !121
+ ret void, !dbg !121
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+
+; Function Attrs: mustprogress nocallback nofree nounwind willreturn memory(argmem: write)
+declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #2
+
+declare !dbg !122 void @_Z3escPc(ptr noundef) local_unnamed_addr #3
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata) #4
+
+attributes #0 = { mustprogress uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
+attributes #2 = { mustprogress nocallback nofree nounwind willreturn memory(argmem: write) }
+attributes #3 = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #4 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #5 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!95, !96, !97, !98, !99, !100, !101}
+!llvm.ident = !{!102}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 18.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, imports: !2, splitDebugInlining: false, nameTableKind: None)
+!1 = !DIFile(filename: "debuginfo.cpp", directory: "/home", checksumkind: CSK_MD5, checksum: "3dc84462c14a3d86dd372d0473fa13aa")
+!2 = !{!3, !16, !20, !27, !31, !35, !45, !49, !51, !53, !57, !61, !65, !69, !73, !75, !77, !79, !83, !87, !91, !93}
+!3 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !5, file: !15, line: 77)
+!4 = !DINamespace(name: "std", scope: null)
+!5 = !DISubprogram(name: "memchr", scope: !6, file: !6, line: 100, type: !7, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!6 = !DIFile(filename: "/usr/include/string.h", directory: "", checksumkind: CSK_MD5, checksum: "3fc3efdf2e52b973f380a6e7608374ff")
+!7 = !DISubroutineType(types: !8)
+!8 = !{!9, !9, !11, !12}
+!9 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !10, size: 64)
+!10 = !DIDerivedType(tag: DW_TAG_const_type, baseType: null)
+!11 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!12 = !DIDerivedType(tag: DW_TAG_typedef, name: "size_t", file: !13, line: 13, baseType: !14)
+!13 = !DIFile(filename: "build_upstream/lib/clang/18/include/__stddef_size_t.h", directory: "/home", checksumkind: CSK_MD5, checksum: "405db6ea5fb824de326715f26fa9fab5")
+!14 = !DIBasicType(name: "unsigned long", size: 64, encoding: DW_ATE_unsigned)
+!15 = !DIFile(filename: "/usr/lib64/gcc/x86_64-suse-linux/13/../../../../include/c++/13/cstring", directory: "")
+!16 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !17, file: !15, line: 78)
+!17 = !DISubprogram(name: "memcmp", scope: !6, file: !6, line: 64, type: !18, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!18 = !DISubroutineType(types: !19)
+!19 = !{!11, !9, !9, !12}
+!20 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !21, file: !15, line: 79)
+!21 = !DISubprogram(name: "memcpy", scope: !6, file: !6, line: 43, type: !22, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!22 = !DISubroutineType(types: !23)
+!23 = !{!24, !25, !26, !12}
+!24 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64)
+!25 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !24)
+!26 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !9)
+!27 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !28, file: !15, line: 80)
+!28 = !DISubprogram(name: "memmove", scope: !6, file: !6, line: 47, type: !29, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!29 = !DISubroutineType(types: !30)
+!30 = !{!24, !24, !9, !12}
+!31 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !32, file: !15, line: 81)
+!32 = !DISubprogram(name: "memset", scope: !6, file: !6, line: 61, type: !33, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!33 = !DISubroutineType(types: !34)
+!34 = !{!24, !24, !11, !12}
+!35 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !36, file: !15, line: 82)
+!36 = !DISubprogram(name: "strcat", scope: !6, file: !6, line: 149, type: !37, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!37 = !DISubroutineType(types: !38)
+!38 = !{!39, !41, !42}
+!39 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !40, size: 64)
+!40 = !DIBasicType(name: "char", size: 8, encoding: DW_ATE_signed_char)
+!41 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !39)
+!42 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !43)
+!43 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !44, size: 64)
+!44 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !40)
+!45 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !46, file: !15, line: 83)
+!46 = !DISubprogram(name: "strcmp", scope: !6, file: !6, line: 156, type: !47, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!47 = !DISubroutineType(types: !48)
+!48 = !{!11, !43, !43}
+!49 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !50, file: !15, line: 84)
+!50 = !DISubprogram(name: "strcoll", scope: !6, file: !6, line: 163, type: !47, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!51 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !52, file: !15, line: 85)
+!52 = !DISubprogram(name: "strcpy", scope: !6, file: !6, line: 141, type: !37, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!53 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !54, file: !15, line: 86)
+!54 = !DISubprogram(name: "strcspn", scope: !6, file: !6, line: 293, type: !55, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!55 = !DISubroutineType(types: !56)
+!56 = !{!12, !43, !43}
+!57 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !58, file: !15, line: 87)
+!58 = !DISubprogram(name: "strerror", scope: !6, file: !6, line: 419, type: !59, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!59 = !DISubroutineType(types: !60)
+!60 = !{!39, !11}
+!61 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !62, file: !15, line: 88)
+!62 = !DISubprogram(name: "strlen", scope: !6, file: !6, line: 407, type: !63, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!63 = !DISubroutineType(types: !64)
+!64 = !{!12, !43}
+!65 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !66, file: !15, line: 89)
+!66 = !DISubprogram(name: "strncat", scope: !6, file: !6, line: 152, type: !67, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!67 = !DISubroutineType(types: !68)
+!68 = !{!39, !41, !42, !12}
+!69 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !70, file: !15, line: 90)
+!70 = !DISubprogram(name: "strncmp", scope: !6, file: !6, line: 159, type: !71, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!71 = !DISubroutineType(types: !72)
+!72 = !{!11, !43, !43, !12}
+!73 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !74, file: !15, line: 91)
+!74 = !DISubprogram(name: "strncpy", scope: !6, file: !6, line: 144, type: !67, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!75 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !76, file: !15, line: 92)
+!76 = !DISubprogram(name: "strspn", scope: !6, file: !6, line: 297, type: !55, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!77 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !78, file: !15, line: 93)
+!78 = !DISubprogram(name: "strtok", scope: !6, file: !6, line: 356, type: !37, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!79 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !80, file: !15, line: 94)
+!80 = !DISubprogram(name: "strxfrm", scope: !6, file: !6, line: 166, type: !81, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!81 = !DISubroutineType(types: !82)
+!82 = !{!12, !41, !42, !12}
+!83 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !84, file: !15, line: 95)
+!84 = !DISubprogram(name: "strchr", scope: !6, file: !6, line: 239, type: !85, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!85 = !DISubroutineType(types: !86)
+!86 = !{!43, !43, !11}
+!87 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !88, file: !15, line: 96)
+!88 = !DISubprogram(name: "strpbrk", scope: !6, file: !6, line: 316, type: !89, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!89 = !DISubroutineType(types: !90)
+!90 = !{!43, !43, !43}
+!91 = !DIImpo...
[truncated]
✅ With the latest revision this PR passed the C/C++ code formatter.
Please can you reorder the commits so the new tests are added earlier, so you can show the codegen diff?
The commit message should explain what is being changed here.
; CHECK: [[DBG119]] = !DILocation(line: 6, column: 5, scope: [[DBG103]])
; CHECK: [[DIASSIGNID120]] = distinct !DIAssignID()
; CHECK: [[DBG121]] = !DILocation(line: 7, column: 5, scope: [[DBG103]])
; CHECK: [[DBG122]] = !DILocation(line: 8, column: 1, scope: [[DBG103]])
are all of these actually necessary?
I've simplified the test, thanks.
ConstantInt::get(DeadWriteLength->getType(), FrontSize));
DeadIntrinsic->addDereferenceableParamAttr(0, FrontSize);

Value *Indices[1] = {ConstantInt::get(DeadWriteLength->getType(), RearStart)};
IIUC we have an array here for the conversion to ArrayRef later? There is an implicit conversion from T to ArrayRef<T>, so we probably don't need this.
DSE can already shorten intrinsics which have dead fronts or rears. This patch enables DSE to split memory intrinsics that are dead in the middle into `Front` and `Rear`:

```
//   __Front__                    ___Rear___
//  | ------------- Dead ------------- |
//       | --- Killing --- |
```

Resolves llvm#72113
;
entry:
  %blob = alloca [1000 x i8], align 16, !DIAssignID !112
  call void @llvm.dbg.assign(metadata i1 undef, metadata !108, metadata !DIExpression(), metadata !112, metadata ptr %blob, metadata !DIExpression()), !dbg !113
Please simplify the test by dropping the debug info and unneeded lifetimes.
I've simplified the test and removed the lifetimes, but I've not removed the debuginfo entirely because that specific test is for dbg.assign intrinsic updates.
// If Front and Rear are both bigger than the threshold they won't be inlined
// so this seems like a bad idea. If Dead is smaller than the threshold it
// will be inlined so this isn't a good idea.
if ((FrontSize > Threshold && RearSize > Threshold) || DeadSize < Threshold)
are all those combos covered by tests?
I've added tests to cover all scenarios, thanks.
- Remove unnecessary array
- Simplify debuginfo test
- Add tests to cover the profitability check
Sorry for the force push; the diff from the pre-committed tests is here: 6f26953
Thanks a lot for looking into this! Not sure how much time I have to look at this before the end of the year. But I've downloaded the current patch to run some early testing. A few things that I noticed already:
I understand that it could be hard to find a heuristic and TTI hooks that work well for any target (and especially for out-of-tree targets like mine). And I don't think that needs to be a goal with the first implementation either. Adding more hooks and conditions could be something that evolves over time, when tuning this for different targets. So some of the above comments are just to highlight some different scenarios that sprang to mind when considering how to tune this for my target.
Hi @bjope, Thanks for looking at this.
I've looked at this now and I think the logic for trimming atomic operations is required here as well. I'll add it shortly.
This is a good point, I'm inclined to add a check to let the transform run when
Yup, I imagine some of these considerations also exist for the heuristics in the tryToShortenBegin and tryToShortenEnd functions; they can be addressed in future iterations.
Apologies for the delay (I've been away). I've added the check for atomic intrinsics and an associated test, as I mentioned earlier.
(Polite ping!)
Hi @bjope / @felipepiovezan / @fhahn, are there any other changes that you'd like me to make to this patch?
I've now also removed the
@@ -531,7 +533,9 @@ define void @test12_memset_other_store_in_between_partial_overlap(ptr %ptr) {

define void @test12_memset_later_store_exceeds_memset(ptr %ptr) {
; CHECK-LABEL: @test12_memset_later_store_exceeds_memset(
; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[PTR:%.*]], i8 0, i64 8, i1 false)
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 5
; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 1 dereferenceable(5) [[TMP1]], i8 0, i64 5, i1 false)
This looks wrong? We used to have a single memset writing 8 bytes, and now we get two memsets writing 5+4=9 bytes.
Sorry for the delay, fixed in bc32bcb.
@@ -246,7 +246,9 @@ define i32 @test5(i32 %x) nounwind ssp {
; CHECK-LABEL: @test5(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[Y:%.*]] = alloca [[STRUCT_S:%.*]], align 16
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[Y]], ptr align 16 @sS, i64 32, i1 false)
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[Y]], i64 16
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 dereferenceable(16) [[TMP0]], ptr align 16 @sS, i64 16, i1 false)
This also looks weird. Splitting up the memset in two, but we still write 16+16=32 bytes.
Fixed in 1805544.
Ping! I think I've addressed all the review comments; are we happy to go ahead with this now?
Pinging again!
@@ -2054,7 +2147,7 @@ struct DSEState {
     return false;
   }

-  bool removePartiallyOverlappedStores(InstOverlapIntervalsTy &IOL) {
+  bool removePartiallyOverlappedIntrinsicStores(InstOverlapIntervalsTy &IOL) {
I don't think we need to change this function name.
I'm not sure about that; I thought the name doesn't accurately convey what the function does, since it works on store-like intrinsics rather than on store instructions. But I'm happy to make the change if you feel strongly enough about it.
; CHECK-SAME: ptr nocapture noundef writeonly [[X:%.*]]) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[X]], i64 976
; CHECK-NEXT: tail call void @llvm.memset.p0.i64(ptr noundef nonnull align 16 dereferenceable(24) [[TMP0]], i8 5, i64 24, i1 false)
We also need to use a smaller align value; can you add a test case for that?
Have you handled the case where the alignments are different?
For example,
define void @overwrite_middle(ptr %X) {
entry:
tail call void @llvm.memset.p0.i64(ptr align 16 dereferenceable(1000) %X, i8 5, i64 1000, i1 false)
%arrayidx = getelementptr inbounds i8, ptr %X, i64 15
tail call void @llvm.memset.p0.i64(ptr align 1 dereferenceable(980) %arrayidx, i8 3, i64 980, i1 false)
ret void
}
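(For reference, a sketch of what the patch's arithmetic would produce for that input, assuming the profitability check passes; the rear start is aligned down to the 16-byte preferred destination alignment, and the offsets below are my own computation rather than actual pass output.)

```llvm
define void @overwrite_middle_split_sketch(ptr %X) {
entry:
  ; Rear: bytes [992, 1000) of the original memset (995 aligned down to 16 is 992).
  %rear = getelementptr inbounds i8, ptr %X, i64 992
  tail call void @llvm.memset.p0.i64(ptr align 16 dereferenceable(8) %rear, i8 5, i64 8, i1 false)
  ; Front: bytes [0, 15) of the original memset, keeping the original destination.
  tail call void @llvm.memset.p0.i64(ptr align 16 dereferenceable(15) %X, i8 5, i64 15, i1 false)
  ; Killing store, unchanged: bytes [15, 995).
  %arrayidx = getelementptr inbounds i8, ptr %X, i64 15
  tail call void @llvm.memset.p0.i64(ptr align 1 dereferenceable(980) %arrayidx, i8 3, i64 980, i1 false)
  ret void
}

declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg)
```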
; RUN: opt < %s -passes=dse -S | FileCheck %s

define void @overwrite_middle(ptr %X) {
define void @overwrite_middle(ptr %X) {
Nits. Elsewhere, too.
; CHECK-NEXT: ret void
;
entry:
  call void @llvm.memset.element.unordered.atomic.p0.i64(ptr align 16 %X, i8 5, i64 1000, i32 4)
Can we split atomic?
Yeah, the only requirement appears to be that the size of front and rear should be a multiple of the element size.
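(To illustrate the constraint, a minimal hypothetical sketch, not taken from the patch's tests: with a 4-byte element size and a killing store over [16, 976), the 16-byte Front and 24-byte Rear both remain multiples of the element size, so splitting keeps the element-wise atomicity guarantees.)

```llvm
define void @atomic_split_candidate(ptr %X) {
entry:
  ; Dead element-wise atomic memset over [0, 1000), element size 4.
  call void @llvm.memset.element.unordered.atomic.p0.i64(ptr align 16 %X, i8 5, i64 1000, i32 4)
  ; Killing atomic memset over [16, 976).
  %mid = getelementptr inbounds i8, ptr %X, i64 16
  call void @llvm.memset.element.unordered.atomic.p0.i64(ptr align 16 %mid, i8 3, i64 960, i32 4)
  ret void
}

declare void @llvm.memset.element.unordered.atomic.p0.i64(ptr nocapture writeonly, i8, i64, i32 immarg)
```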
LGTM. Thanks! But I cannot review the atomic part. :)
I meant using a smaller align to shrink the dead memory intrinsic further: https://alive2.llvm.org/ce/z/LiGZUd. Maybe this could be another PR.
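(A hypothetical sketch of that idea, reusing the 1000-byte memset test from the patch, where the killing store covers [10, 990): today the Rear is re-aligned down to 16 and so rewrites bytes [976, 990) that are already dead, whereas with a relaxed rear alignment it could cover only the 10 live bytes.)

```llvm
; Possible output if the rear alignment were relaxed (illustration only).
define void @after_with_relaxed_rear_align(ptr %X) {
entry:
  ; Rear: only the live bytes [990, 1000), at a smaller alignment.
  %rear = getelementptr inbounds i8, ptr %X, i64 990
  call void @llvm.memset.p0.i64(ptr align 2 dereferenceable(10) %rear, i8 5, i64 10, i1 false)
  ; Front: bytes [0, 10).
  call void @llvm.memset.p0.i64(ptr align 16 dereferenceable(10) %X, i8 5, i64 10, i1 false)
  ; Killing store, unchanged: bytes [10, 990).
  %mid = getelementptr inbounds i8, ptr %X, i64 10
  call void @llvm.memset.p0.i64(ptr align 2 dereferenceable(980) %mid, i8 3, i64 980, i1 false)
  ret void
}

declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg)
```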
; CHECK-NEXT: tail call void @llvm.memset.p0.i64(ptr align 1 dereferenceable(15) [[TMP0]], i8 5, i64 15, i1 false)
; CHECK-NEXT: tail call void @llvm.memset.p0.i64(ptr align 1 dereferenceable(10) [[X]], i8 5, i64 10, i1 false)
Could you swap the two memset intrinsics? This will probably make the CPU cache happy.
entry:
  tail call void @llvm.memset.p0.i64(ptr align 1 dereferenceable(50) %X, i8 5, i64 50, i1 false)
  %arrayidx = getelementptr inbounds i8, ptr %X, i64 10
  tail call void @llvm.memset.p0.i64(ptr align 1 dereferenceable(45) %arrayidx, i8 3, i64 25, i1 false)
tail call void @llvm.memset.p0.i64(ptr align 1 dereferenceable(45) %arrayidx, i8 3, i64 25, i1 false)
Maybe you can run opt -S to get a cleaner IR.
Please also remove the tail call.
I'm still a bit worried that this will result in lots of regressions, at least for my target. I wonder if there perhaps should be some kind of TTI hook or a command-line option to easily disable the whole feature (the latter could make it easier to benchmark by comparing results with/without the optimization enabled). I can of course add that downstream, so unless it would be used by in-tree targets it might seem weird to have such a TTI hook. But to make sure that this lands softly, maybe the feature should be disabled completely in the first version, limiting any potential revert to flipping some big switch rather than having to revert all the code changes.

Things I'm concerned about when doing some test compiles with this enabled downstream are things like this: the example below would copy 29 bytes instead of 31, but for our target I think it will cost more to do two cross-addrspace memcpys than to copy 2 more bytes in a single copy (both latency-wise and in terms of memory bus traffic, etc.).
The 144-byte memcpy in the example below is split into 123+20, so we only save a single byte. At least our target has an optimized memcpy version that copies several bytes per iteration in the inner loop (regardless of alignment), so splitting it up will just add overhead.
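(The reviewer's actual snippet is not shown above; the following is a hypothetical reconstruction of that kind of case, assuming a 4-byte destination alignment and a single-byte killing store at offset 123, which would give a 123-byte front and a 20-byte rear, modulo the profitability heuristics.)

```llvm
; Hypothetical IR, not the reviewer's original example.
define void @split_144_byte_memcpy(ptr %dst, ptr %src, i8 %v) {
entry:
  ; 144-byte copy; the store below kills only byte 123 of the destination.
  call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dst, ptr align 4 %src, i64 144, i1 false)
  %p = getelementptr inbounds i8, ptr %dst, i64 123
  store i8 %v, ptr %p
  ret void
}

declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg)
```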
For the example below, with the memset being split, we used to get three 8-byte stores. Now we get two 8-byte stores + one 4-byte store + one 1-byte store, plus extra pointer fiddling and increased register pressure. So even if we memset three bytes less, the emitted asm looks much worse.
It is hard for a middle-end pass to know details about the target architecture, and in general we want it to canonicalize to something target-independent. But I think it also might be hard to fuse these memcpys back together in the backend. So I'm a bit worried that even if, at the C level, the developer has decided to use a single memcpy (as a tuning), DSE might now split it in two even when that isn't feasible. In the end I think we need more than the already existing getMaxMemIntrinsicInlineSizeThreshold (which we currently have set to UINT64_MAX for our target). So that might be another reason to land this "softly", by letting backend people add more tweaks to tune this and enable it in a more controlled way. Maybe even this first implementation should include some more heuristics, such as not splitting a 144-byte memcpy into two "uneven" 123+20 byte pieces just to save copying a single byte? (The goal with #72113 was kind of to split up the memset/memcpy when we can avoid touching a larger chunk of memory twice.)
Thanks for the reviews and the explanation. Given the current situation I'm going to close this PR, as I do not have the bandwidth to work on this further at the moment. Hopefully someone else will be able to pick it up in the future.
This patch enables Dead Store Elimination to split memory intrinsics that are dead in the middle into `Front` and `Rear`.

Resolves #72113
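For illustration, a minimal before/after sketch of the transform, following the shape of the memset test in the patch (preferred destination alignment 16, killing store over bytes [10, 990)); attribute details are simplified:

```llvm
; Before: the first memset is dead in the middle because the second one
; overwrites bytes [10, 990).
define void @before(ptr %X) {
entry:
  call void @llvm.memset.p0.i64(ptr align 16 %X, i8 5, i64 1000, i1 false)
  %mid = getelementptr inbounds i8, ptr %X, i64 10
  call void @llvm.memset.p0.i64(ptr align 2 %mid, i8 3, i64 980, i1 false)
  ret void
}

; After: the dead memset becomes a 24-byte Rear (its start aligned down to 16)
; and a 10-byte Front; the killing memset is untouched.
define void @after(ptr %X) {
entry:
  %rear = getelementptr inbounds i8, ptr %X, i64 976
  call void @llvm.memset.p0.i64(ptr align 16 dereferenceable(24) %rear, i8 5, i64 24, i1 false)
  call void @llvm.memset.p0.i64(ptr align 16 dereferenceable(10) %X, i8 5, i64 10, i1 false)
  %mid = getelementptr inbounds i8, ptr %X, i64 10
  call void @llvm.memset.p0.i64(ptr align 2 %mid, i8 3, i64 980, i1 false)
  ret void
}

declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg)
```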