Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DSE] Split memory intrinsics if they are dead in the middle #75478

Open
wants to merge 11 commits into
base: main
Choose a base branch
from

Conversation

omern1
Copy link
Member

@omern1 omern1 commented Dec 14, 2023

This patch enables Dead Store Elimination to split memory intrinsics that are dead in the middle into Front and Rear:

  // __Front__                 ___Rear___
  // | ------------- Dead ------------- |
  //         | --- Killing --- |

Resolves #72113

@llvmbot
Copy link
Collaborator

llvmbot commented Dec 14, 2023

@llvm/pr-subscribers-llvm-transforms

Author: Nabeel Omer (omern1)

Changes

This patch enables Dead Store Elimination to split memory intrinsics that are dead in the middle into Front and Rear:

  // __Front__                 ___Rear___
  // | ------------- Dead ------------- |
  //         | --- Killing --- |

Resolves #72113

Pre-committing the tests to show diff in a moment.


Patch is 38.20 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/75478.diff

3 Files Affected:

  • (modified) llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp (+85-7)
  • (added) llvm/test/DebugInfo/dse-split-memintrinsics.ll (+305)
  • (added) llvm/test/Transforms/DeadStoreElimination/dead-middle-split.ll (+43)
diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 008dcc53fd44fc..7e9f1b6350a014 100644
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -48,6 +48,7 @@
 #include "llvm/Analysis/MustExecute.h"
 #include "llvm/Analysis/PostDominators.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Argument.h"
 #include "llvm/IR/BasicBlock.h"
@@ -554,6 +555,78 @@ static void shortenAssignment(Instruction *Inst, Value *OriginalDest,
   }
 }
 
+static bool tryToSplitMiddle(Instruction *DeadI, OverlapIntervalsTy &IntervalMap,
+                      int64_t &DeadStart, uint64_t &DeadSize, const TargetTransformInfo& TTI) {
+  if (IntervalMap.empty() || !isShortenableAtTheEnd(DeadI))
+    return false;
+
+  OverlapIntervalsTy::iterator OII = IntervalMap.begin();
+  int64_t KillingStart = OII->second;
+  uint64_t KillingSize = OII->first - KillingStart;
+
+  assert(OII->first - KillingStart >= 0 && "Size expected to be positive");
+
+  uint64_t Threshold = TTI.getMaxMemIntrinsicInlineSizeThreshold();
+
+  // __Front__                 ___Rear___
+  // | ------------- Dead ------------- |
+  //         | --- Killing --- |
+
+  if (KillingStart < DeadStart ||
+      uint64_t(KillingStart + KillingSize) > uint64_t(DeadStart + DeadSize))
+    return false;
+
+  auto *DeadIntrinsic = cast<AnyMemIntrinsic>(DeadI);
+  Align PrefAlign = DeadIntrinsic->getDestAlign().valueOrOne();
+
+  // Assume Front is already correctly aligned.
+  uint64_t FrontSize = KillingStart - DeadStart;
+
+  int64_t RearStart = alignDown(
+      uint64_t(KillingStart + KillingSize), PrefAlign.value());
+  uint64_t RearSize = (DeadStart + DeadSize) - RearStart;
+
+  // If Front and Rear are both bigger than the threshold they won't be inlined
+  // so this seems like a bad idea. If Dead is smaller than the threshold it
+  // will be inlined so this isn't a good idea.
+  if ((FrontSize > Threshold && RearSize > Threshold) || DeadSize < Threshold)
+    return false;
+
+  Value *DeadWriteLength = DeadIntrinsic->getLength();
+  Value *DeadDest = DeadIntrinsic->getRawDest();
+
+  LLVM_DEBUG(dbgs() << "DSE: Split and shortened partially dead store: ["
+                    << DeadStart << ", " << DeadSize + DeadStart
+                    << "]\nInto: Front: [" << DeadStart << ", "
+                    << DeadStart + FrontSize << "], Rear: [" << RearStart
+                    << ", " << RearStart + RearSize << "]\n"
+                    << "Killer: [" << KillingStart << ", "
+                    << KillingSize + KillingStart << "]\n");
+
+  // Dead is now Front.
+  DeadIntrinsic->setLength(
+      ConstantInt::get(DeadWriteLength->getType(), FrontSize));
+  DeadIntrinsic->addDereferenceableParamAttr(0, FrontSize);
+
+  Value *Indices[1] = {ConstantInt::get(DeadWriteLength->getType(), RearStart)};
+  Instruction *RearDestGEP = GetElementPtrInst::CreateInBounds(
+      Type::getInt8Ty(DeadIntrinsic->getContext()), DeadDest, Indices, "",
+      DeadI);
+  auto *Rear = cast<AnyMemIntrinsic>(DeadIntrinsic->clone());
+  Rear->setDest(RearDestGEP);
+  Rear->setLength(ConstantInt::get(DeadWriteLength->getType(), RearSize));
+  Rear->insertAfter(RearDestGEP);
+  Rear->setDestAlignment(PrefAlign);
+  Rear->addDereferenceableParamAttr(0, RearSize);
+
+  shortenAssignment(DeadI, DeadDest, DeadStart * 8, DeadSize * 8, FrontSize * 8,
+                    true);
+
+  IntervalMap.erase(OII);
+  DeadSize = FrontSize;
+  return true;
+}
+
 static bool tryToShorten(Instruction *DeadI, int64_t &DeadStart,
                          uint64_t &DeadSize, int64_t KillingStart,
                          uint64_t KillingSize, bool IsOverwriteEnd) {
@@ -821,6 +894,7 @@ struct DSEState {
   const TargetLibraryInfo &TLI;
   const DataLayout &DL;
   const LoopInfo &LI;
+  const TargetTransformInfo &TTI;
 
   // Whether the function contains any irreducible control flow, useful for
   // being accurately able to detect loops.
@@ -860,9 +934,9 @@ struct DSEState {
 
   DSEState(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, DominatorTree &DT,
            PostDominatorTree &PDT, const TargetLibraryInfo &TLI,
-           const LoopInfo &LI)
+           const LoopInfo &LI, const TargetTransformInfo& TTI)
       : F(F), AA(AA), EI(DT, &LI), BatchAA(AA, &EI), MSSA(MSSA), DT(DT),
-        PDT(PDT), TLI(TLI), DL(F.getParent()->getDataLayout()), LI(LI) {
+        PDT(PDT), TLI(TLI), DL(F.getParent()->getDataLayout()), LI(LI), TTI(TTI) {
     // Collect blocks with throwing instructions not modeled in MemorySSA and
     // alloc-like objects.
     unsigned PO = 0;
@@ -1978,7 +2052,7 @@ struct DSEState {
     return false;
   }
 
-  bool removePartiallyOverlappedStores(InstOverlapIntervalsTy &IOL) {
+  bool removePartiallyOverlappedIntrinsicStores(InstOverlapIntervalsTy &IOL) {
     bool Changed = false;
     for (auto OI : IOL) {
       Instruction *DeadI = OI.first;
@@ -1994,6 +2068,9 @@ struct DSEState {
       if (IntervalMap.empty())
         continue;
       Changed |= tryToShortenBegin(DeadI, IntervalMap, DeadStart, DeadSize);
+      if (IntervalMap.empty())
+        continue;
+      Changed |= tryToSplitMiddle(DeadI, IntervalMap, DeadStart, DeadSize, TTI);
     }
     return Changed;
   }
@@ -2059,10 +2136,10 @@ struct DSEState {
 static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
                                 DominatorTree &DT, PostDominatorTree &PDT,
                                 const TargetLibraryInfo &TLI,
-                                const LoopInfo &LI) {
+                                const LoopInfo &LI, const TargetTransformInfo& TTI) {
   bool MadeChange = false;
 
-  DSEState State(F, AA, MSSA, DT, PDT, TLI, LI);
+  DSEState State(F, AA, MSSA, DT, PDT, TLI, LI, TTI);
   // For each store:
   for (unsigned I = 0; I < State.MemDefs.size(); I++) {
     MemoryDef *KillingDef = State.MemDefs[I];
@@ -2226,7 +2303,7 @@ static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
 
   if (EnablePartialOverwriteTracking)
     for (auto &KV : State.IOLs)
-      MadeChange |= State.removePartiallyOverlappedStores(KV.second);
+      MadeChange |= State.removePartiallyOverlappedIntrinsicStores(KV.second);
 
   MadeChange |= State.eliminateRedundantStoresOfExistingValues();
   MadeChange |= State.eliminateDeadWritesAtEndOfFunction();
@@ -2244,8 +2321,9 @@ PreservedAnalyses DSEPass::run(Function &F, FunctionAnalysisManager &AM) {
   MemorySSA &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
   PostDominatorTree &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
   LoopInfo &LI = AM.getResult<LoopAnalysis>(F);
+  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
 
-  bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI, LI);
+  bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI, LI, TTI);
 
 #ifdef LLVM_ENABLE_STATS
   if (AreStatisticsEnabled())
diff --git a/llvm/test/DebugInfo/dse-split-memintrinsics.ll b/llvm/test/DebugInfo/dse-split-memintrinsics.ll
new file mode 100644
index 00000000000000..f7c083a2f0969c
--- /dev/null
+++ b/llvm/test/DebugInfo/dse-split-memintrinsics.ll
@@ -0,0 +1,305 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=dse -S | FileCheck %s
+
+; Check a dbg.assign is inserted that sets the dead middle bits to no-location (see tryToSplitMiddle).
+
+define dso_local void @_Z22overwrite_middle_localv() local_unnamed_addr #0 !dbg !103 {
+; CHECK-LABEL: define dso_local void @_Z22overwrite_middle_localv(
+; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !dbg [[DBG103:![0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[BLOB:%.*]] = alloca [1000 x i8], align 16, !DIAssignID [[DIASSIGNID112:![0-9]+]]
+; CHECK-NEXT:    call void @llvm.dbg.assign(metadata i1 undef, metadata [[META108:![0-9]+]], metadata !DIExpression(), metadata [[DIASSIGNID112]], metadata ptr [[BLOB]], metadata !DIExpression()), !dbg [[DBG113:![0-9]+]]
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 1000, ptr nonnull [[BLOB]]) #[[ATTR5:[0-9]+]], !dbg [[DBG114:![0-9]+]]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[BLOB]], i64 976
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr noundef nonnull align 16 dereferenceable(24) [[TMP0]], i8 5, i64 24, i1 false), !dbg [[DBG115:![0-9]+]], !DIAssignID [[DIASSIGNID116:![0-9]+]]
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr noundef nonnull align 16 dereferenceable(10) [[BLOB]], i8 5, i64 10, i1 false), !dbg [[DBG115]], !DIAssignID [[DIASSIGNID116]]
+; CHECK-NEXT:    call void @llvm.dbg.assign(metadata i1 undef, metadata [[META108]], metadata !DIExpression(), metadata [[DIASSIGNID116]], metadata ptr [[BLOB]], metadata !DIExpression()), !dbg [[DBG113]]
+; CHECK-NEXT:    call void @llvm.dbg.assign(metadata i1 undef, metadata [[META108]], metadata !DIExpression(DW_OP_LLVM_fragment, 80, 7920), metadata [[META117:![0-9]+]], metadata ptr undef, metadata !DIExpression()), !dbg [[DBG113]]
+; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[BLOB]], i64 10, !dbg [[DBG118:![0-9]+]]
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr noundef nonnull align 2 dereferenceable(980) [[ADD_PTR]], i8 3, i64 980, i1 false), !dbg [[DBG119:![0-9]+]], !DIAssignID [[DIASSIGNID120:![0-9]+]]
+; CHECK-NEXT:    call void @llvm.dbg.assign(metadata i1 undef, metadata [[META108]], metadata !DIExpression(DW_OP_LLVM_fragment, 80, 7840), metadata [[DIASSIGNID120]], metadata ptr [[ADD_PTR]], metadata !DIExpression()), !dbg [[DBG113]]
+; CHECK-NEXT:    call void @_Z3escPc(ptr noundef nonnull [[BLOB]]), !dbg [[DBG121:![0-9]+]]
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 1000, ptr nonnull [[BLOB]]) #[[ATTR5]], !dbg [[DBG122:![0-9]+]]
+; CHECK-NEXT:    ret void, !dbg [[DBG122]]
+;
+entry:
+  %blob = alloca [1000 x i8], align 16, !DIAssignID !112
+  call void @llvm.dbg.assign(metadata i1 undef, metadata !108, metadata !DIExpression(), metadata !112, metadata ptr %blob, metadata !DIExpression()), !dbg !113
+  call void @llvm.lifetime.start.p0(i64 1000, ptr nonnull %blob) #5, !dbg !114
+  call void @llvm.memset.p0.i64(ptr noundef nonnull align 16 dereferenceable(1000) %blob, i8 5, i64 1000, i1 false), !dbg !115, !DIAssignID !116
+  call void @llvm.dbg.assign(metadata i1 undef, metadata !108, metadata !DIExpression(), metadata !116, metadata ptr %blob, metadata !DIExpression()), !dbg !113
+  %add.ptr = getelementptr inbounds i8, ptr %blob, i64 10, !dbg !117
+  call void @llvm.memset.p0.i64(ptr noundef nonnull align 2 dereferenceable(980) %add.ptr, i8 3, i64 980, i1 false), !dbg !118, !DIAssignID !119
+  call void @llvm.dbg.assign(metadata i1 undef, metadata !108, metadata !DIExpression(DW_OP_LLVM_fragment, 80, 7840), metadata !119, metadata ptr %add.ptr, metadata !DIExpression()), !dbg !113
+  call void @_Z3escPc(ptr noundef nonnull %blob), !dbg !120
+  call void @llvm.lifetime.end.p0(i64 1000, ptr nonnull %blob) #5, !dbg !121
+  ret void, !dbg !121
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+
+; Function Attrs: mustprogress nocallback nofree nounwind willreturn memory(argmem: write)
+declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #2
+
+declare !dbg !122 void @_Z3escPc(ptr noundef) local_unnamed_addr #3
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata) #4
+
+attributes #0 = { mustprogress uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
+attributes #2 = { mustprogress nocallback nofree nounwind willreturn memory(argmem: write) }
+attributes #3 = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #4 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #5 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!95, !96, !97, !98, !99, !100, !101}
+!llvm.ident = !{!102}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 18.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, imports: !2, splitDebugInlining: false, nameTableKind: None)
+!1 = !DIFile(filename: "debuginfo.cpp", directory: "/home", checksumkind: CSK_MD5, checksum: "3dc84462c14a3d86dd372d0473fa13aa")
+!2 = !{!3, !16, !20, !27, !31, !35, !45, !49, !51, !53, !57, !61, !65, !69, !73, !75, !77, !79, !83, !87, !91, !93}
+!3 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !5, file: !15, line: 77)
+!4 = !DINamespace(name: "std", scope: null)
+!5 = !DISubprogram(name: "memchr", scope: !6, file: !6, line: 100, type: !7, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!6 = !DIFile(filename: "/usr/include/string.h", directory: "", checksumkind: CSK_MD5, checksum: "3fc3efdf2e52b973f380a6e7608374ff")
+!7 = !DISubroutineType(types: !8)
+!8 = !{!9, !9, !11, !12}
+!9 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !10, size: 64)
+!10 = !DIDerivedType(tag: DW_TAG_const_type, baseType: null)
+!11 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!12 = !DIDerivedType(tag: DW_TAG_typedef, name: "size_t", file: !13, line: 13, baseType: !14)
+!13 = !DIFile(filename: "build_upstream/lib/clang/18/include/__stddef_size_t.h", directory: "/home", checksumkind: CSK_MD5, checksum: "405db6ea5fb824de326715f26fa9fab5")
+!14 = !DIBasicType(name: "unsigned long", size: 64, encoding: DW_ATE_unsigned)
+!15 = !DIFile(filename: "/usr/lib64/gcc/x86_64-suse-linux/13/../../../../include/c++/13/cstring", directory: "")
+!16 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !17, file: !15, line: 78)
+!17 = !DISubprogram(name: "memcmp", scope: !6, file: !6, line: 64, type: !18, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!18 = !DISubroutineType(types: !19)
+!19 = !{!11, !9, !9, !12}
+!20 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !21, file: !15, line: 79)
+!21 = !DISubprogram(name: "memcpy", scope: !6, file: !6, line: 43, type: !22, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!22 = !DISubroutineType(types: !23)
+!23 = !{!24, !25, !26, !12}
+!24 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64)
+!25 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !24)
+!26 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !9)
+!27 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !28, file: !15, line: 80)
+!28 = !DISubprogram(name: "memmove", scope: !6, file: !6, line: 47, type: !29, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!29 = !DISubroutineType(types: !30)
+!30 = !{!24, !24, !9, !12}
+!31 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !32, file: !15, line: 81)
+!32 = !DISubprogram(name: "memset", scope: !6, file: !6, line: 61, type: !33, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!33 = !DISubroutineType(types: !34)
+!34 = !{!24, !24, !11, !12}
+!35 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !36, file: !15, line: 82)
+!36 = !DISubprogram(name: "strcat", scope: !6, file: !6, line: 149, type: !37, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!37 = !DISubroutineType(types: !38)
+!38 = !{!39, !41, !42}
+!39 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !40, size: 64)
+!40 = !DIBasicType(name: "char", size: 8, encoding: DW_ATE_signed_char)
+!41 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !39)
+!42 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !43)
+!43 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !44, size: 64)
+!44 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !40)
+!45 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !46, file: !15, line: 83)
+!46 = !DISubprogram(name: "strcmp", scope: !6, file: !6, line: 156, type: !47, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!47 = !DISubroutineType(types: !48)
+!48 = !{!11, !43, !43}
+!49 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !50, file: !15, line: 84)
+!50 = !DISubprogram(name: "strcoll", scope: !6, file: !6, line: 163, type: !47, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!51 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !52, file: !15, line: 85)
+!52 = !DISubprogram(name: "strcpy", scope: !6, file: !6, line: 141, type: !37, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!53 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !54, file: !15, line: 86)
+!54 = !DISubprogram(name: "strcspn", scope: !6, file: !6, line: 293, type: !55, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!55 = !DISubroutineType(types: !56)
+!56 = !{!12, !43, !43}
+!57 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !58, file: !15, line: 87)
+!58 = !DISubprogram(name: "strerror", scope: !6, file: !6, line: 419, type: !59, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!59 = !DISubroutineType(types: !60)
+!60 = !{!39, !11}
+!61 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !62, file: !15, line: 88)
+!62 = !DISubprogram(name: "strlen", scope: !6, file: !6, line: 407, type: !63, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!63 = !DISubroutineType(types: !64)
+!64 = !{!12, !43}
+!65 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !66, file: !15, line: 89)
+!66 = !DISubprogram(name: "strncat", scope: !6, file: !6, line: 152, type: !67, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!67 = !DISubroutineType(types: !68)
+!68 = !{!39, !41, !42, !12}
+!69 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !70, file: !15, line: 90)
+!70 = !DISubprogram(name: "strncmp", scope: !6, file: !6, line: 159, type: !71, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!71 = !DISubroutineType(types: !72)
+!72 = !{!11, !43, !43, !12}
+!73 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !74, file: !15, line: 91)
+!74 = !DISubprogram(name: "strncpy", scope: !6, file: !6, line: 144, type: !67, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!75 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !76, file: !15, line: 92)
+!76 = !DISubprogram(name: "strspn", scope: !6, file: !6, line: 297, type: !55, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!77 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !78, file: !15, line: 93)
+!78 = !DISubprogram(name: "strtok", scope: !6, file: !6, line: 356, type: !37, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!79 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !80, file: !15, line: 94)
+!80 = !DISubprogram(name: "strxfrm", scope: !6, file: !6, line: 166, type: !81, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!81 = !DISubroutineType(types: !82)
+!82 = !{!12, !41, !42, !12}
+!83 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !84, file: !15, line: 95)
+!84 = !DISubprogram(name: "strchr", scope: !6, file: !6, line: 239, type: !85, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!85 = !DISubroutineType(types: !86)
+!86 = !{!43, !43, !11}
+!87 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !88, file: !15, line: 96)
+!88 = !DISubprogram(name: "strpbrk", scope: !6, file: !6, line: 316, type: !89, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!89 = !DISubroutineType(types: !90)
+!90 = !{!43, !43, !43}
+!91 = !DIImpo...
[truncated]

@llvmbot
Copy link
Collaborator

llvmbot commented Dec 14, 2023

@llvm/pr-subscribers-debuginfo

Author: Nabeel Omer (omern1)

Changes

This patch enables Dead Store Elimination to split memory intrinsics that are dead in the middle into Front and Rear:

  // __Front__                 ___Rear___
  // | ------------- Dead ------------- |
  //         | --- Killing --- |

Resolves #72113

Pre-committing the tests to show diff in a moment.


Patch is 38.20 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/75478.diff

3 Files Affected:

  • (modified) llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp (+85-7)
  • (added) llvm/test/DebugInfo/dse-split-memintrinsics.ll (+305)
  • (added) llvm/test/Transforms/DeadStoreElimination/dead-middle-split.ll (+43)
diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 008dcc53fd44fc..7e9f1b6350a014 100644
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -48,6 +48,7 @@
 #include "llvm/Analysis/MustExecute.h"
 #include "llvm/Analysis/PostDominators.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Argument.h"
 #include "llvm/IR/BasicBlock.h"
@@ -554,6 +555,78 @@ static void shortenAssignment(Instruction *Inst, Value *OriginalDest,
   }
 }
 
+static bool tryToSplitMiddle(Instruction *DeadI, OverlapIntervalsTy &IntervalMap,
+                      int64_t &DeadStart, uint64_t &DeadSize, const TargetTransformInfo& TTI) {
+  if (IntervalMap.empty() || !isShortenableAtTheEnd(DeadI))
+    return false;
+
+  OverlapIntervalsTy::iterator OII = IntervalMap.begin();
+  int64_t KillingStart = OII->second;
+  uint64_t KillingSize = OII->first - KillingStart;
+
+  assert(OII->first - KillingStart >= 0 && "Size expected to be positive");
+
+  uint64_t Threshold = TTI.getMaxMemIntrinsicInlineSizeThreshold();
+
+  // __Front__                 ___Rear___
+  // | ------------- Dead ------------- |
+  //         | --- Killing --- |
+
+  if (KillingStart < DeadStart ||
+      uint64_t(KillingStart + KillingSize) > uint64_t(DeadStart + DeadSize))
+    return false;
+
+  auto *DeadIntrinsic = cast<AnyMemIntrinsic>(DeadI);
+  Align PrefAlign = DeadIntrinsic->getDestAlign().valueOrOne();
+
+  // Assume Front is already correctly aligned.
+  uint64_t FrontSize = KillingStart - DeadStart;
+
+  int64_t RearStart = alignDown(
+      uint64_t(KillingStart + KillingSize), PrefAlign.value());
+  uint64_t RearSize = (DeadStart + DeadSize) - RearStart;
+
+  // If Front and Rear are both bigger than the threshold they won't be inlined
+  // so this seems like a bad idea. If Dead is smaller than the threshold it
+  // will be inlined so this isn't a good idea.
+  if ((FrontSize > Threshold && RearSize > Threshold) || DeadSize < Threshold)
+    return false;
+
+  Value *DeadWriteLength = DeadIntrinsic->getLength();
+  Value *DeadDest = DeadIntrinsic->getRawDest();
+
+  LLVM_DEBUG(dbgs() << "DSE: Split and shortened partially dead store: ["
+                    << DeadStart << ", " << DeadSize + DeadStart
+                    << "]\nInto: Front: [" << DeadStart << ", "
+                    << DeadStart + FrontSize << "], Rear: [" << RearStart
+                    << ", " << RearStart + RearSize << "]\n"
+                    << "Killer: [" << KillingStart << ", "
+                    << KillingSize + KillingStart << "]\n");
+
+  // Dead is now Front.
+  DeadIntrinsic->setLength(
+      ConstantInt::get(DeadWriteLength->getType(), FrontSize));
+  DeadIntrinsic->addDereferenceableParamAttr(0, FrontSize);
+
+  Value *Indices[1] = {ConstantInt::get(DeadWriteLength->getType(), RearStart)};
+  Instruction *RearDestGEP = GetElementPtrInst::CreateInBounds(
+      Type::getInt8Ty(DeadIntrinsic->getContext()), DeadDest, Indices, "",
+      DeadI);
+  auto *Rear = cast<AnyMemIntrinsic>(DeadIntrinsic->clone());
+  Rear->setDest(RearDestGEP);
+  Rear->setLength(ConstantInt::get(DeadWriteLength->getType(), RearSize));
+  Rear->insertAfter(RearDestGEP);
+  Rear->setDestAlignment(PrefAlign);
+  Rear->addDereferenceableParamAttr(0, RearSize);
+
+  shortenAssignment(DeadI, DeadDest, DeadStart * 8, DeadSize * 8, FrontSize * 8,
+                    true);
+
+  IntervalMap.erase(OII);
+  DeadSize = FrontSize;
+  return true;
+}
+
 static bool tryToShorten(Instruction *DeadI, int64_t &DeadStart,
                          uint64_t &DeadSize, int64_t KillingStart,
                          uint64_t KillingSize, bool IsOverwriteEnd) {
@@ -821,6 +894,7 @@ struct DSEState {
   const TargetLibraryInfo &TLI;
   const DataLayout &DL;
   const LoopInfo &LI;
+  const TargetTransformInfo &TTI;
 
   // Whether the function contains any irreducible control flow, useful for
   // being accurately able to detect loops.
@@ -860,9 +934,9 @@ struct DSEState {
 
   DSEState(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, DominatorTree &DT,
            PostDominatorTree &PDT, const TargetLibraryInfo &TLI,
-           const LoopInfo &LI)
+           const LoopInfo &LI, const TargetTransformInfo& TTI)
       : F(F), AA(AA), EI(DT, &LI), BatchAA(AA, &EI), MSSA(MSSA), DT(DT),
-        PDT(PDT), TLI(TLI), DL(F.getParent()->getDataLayout()), LI(LI) {
+        PDT(PDT), TLI(TLI), DL(F.getParent()->getDataLayout()), LI(LI), TTI(TTI) {
     // Collect blocks with throwing instructions not modeled in MemorySSA and
     // alloc-like objects.
     unsigned PO = 0;
@@ -1978,7 +2052,7 @@ struct DSEState {
     return false;
   }
 
-  bool removePartiallyOverlappedStores(InstOverlapIntervalsTy &IOL) {
+  bool removePartiallyOverlappedIntrinsicStores(InstOverlapIntervalsTy &IOL) {
     bool Changed = false;
     for (auto OI : IOL) {
       Instruction *DeadI = OI.first;
@@ -1994,6 +2068,9 @@ struct DSEState {
       if (IntervalMap.empty())
         continue;
       Changed |= tryToShortenBegin(DeadI, IntervalMap, DeadStart, DeadSize);
+      if (IntervalMap.empty())
+        continue;
+      Changed |= tryToSplitMiddle(DeadI, IntervalMap, DeadStart, DeadSize, TTI);
     }
     return Changed;
   }
@@ -2059,10 +2136,10 @@ struct DSEState {
 static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
                                 DominatorTree &DT, PostDominatorTree &PDT,
                                 const TargetLibraryInfo &TLI,
-                                const LoopInfo &LI) {
+                                const LoopInfo &LI, const TargetTransformInfo& TTI) {
   bool MadeChange = false;
 
-  DSEState State(F, AA, MSSA, DT, PDT, TLI, LI);
+  DSEState State(F, AA, MSSA, DT, PDT, TLI, LI, TTI);
   // For each store:
   for (unsigned I = 0; I < State.MemDefs.size(); I++) {
     MemoryDef *KillingDef = State.MemDefs[I];
@@ -2226,7 +2303,7 @@ static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
 
   if (EnablePartialOverwriteTracking)
     for (auto &KV : State.IOLs)
-      MadeChange |= State.removePartiallyOverlappedStores(KV.second);
+      MadeChange |= State.removePartiallyOverlappedIntrinsicStores(KV.second);
 
   MadeChange |= State.eliminateRedundantStoresOfExistingValues();
   MadeChange |= State.eliminateDeadWritesAtEndOfFunction();
@@ -2244,8 +2321,9 @@ PreservedAnalyses DSEPass::run(Function &F, FunctionAnalysisManager &AM) {
   MemorySSA &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
   PostDominatorTree &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
   LoopInfo &LI = AM.getResult<LoopAnalysis>(F);
+  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
 
-  bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI, LI);
+  bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI, LI, TTI);
 
 #ifdef LLVM_ENABLE_STATS
   if (AreStatisticsEnabled())
diff --git a/llvm/test/DebugInfo/dse-split-memintrinsics.ll b/llvm/test/DebugInfo/dse-split-memintrinsics.ll
new file mode 100644
index 00000000000000..f7c083a2f0969c
--- /dev/null
+++ b/llvm/test/DebugInfo/dse-split-memintrinsics.ll
@@ -0,0 +1,305 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=dse -S | FileCheck %s
+
+; Check a dbg.assign is inserted that sets the dead middle bits to no-location (see tryToSplitMiddle).
+
+define dso_local void @_Z22overwrite_middle_localv() local_unnamed_addr #0 !dbg !103 {
+; CHECK-LABEL: define dso_local void @_Z22overwrite_middle_localv(
+; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !dbg [[DBG103:![0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[BLOB:%.*]] = alloca [1000 x i8], align 16, !DIAssignID [[DIASSIGNID112:![0-9]+]]
+; CHECK-NEXT:    call void @llvm.dbg.assign(metadata i1 undef, metadata [[META108:![0-9]+]], metadata !DIExpression(), metadata [[DIASSIGNID112]], metadata ptr [[BLOB]], metadata !DIExpression()), !dbg [[DBG113:![0-9]+]]
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 1000, ptr nonnull [[BLOB]]) #[[ATTR5:[0-9]+]], !dbg [[DBG114:![0-9]+]]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[BLOB]], i64 976
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr noundef nonnull align 16 dereferenceable(24) [[TMP0]], i8 5, i64 24, i1 false), !dbg [[DBG115:![0-9]+]], !DIAssignID [[DIASSIGNID116:![0-9]+]]
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr noundef nonnull align 16 dereferenceable(10) [[BLOB]], i8 5, i64 10, i1 false), !dbg [[DBG115]], !DIAssignID [[DIASSIGNID116]]
+; CHECK-NEXT:    call void @llvm.dbg.assign(metadata i1 undef, metadata [[META108]], metadata !DIExpression(), metadata [[DIASSIGNID116]], metadata ptr [[BLOB]], metadata !DIExpression()), !dbg [[DBG113]]
+; CHECK-NEXT:    call void @llvm.dbg.assign(metadata i1 undef, metadata [[META108]], metadata !DIExpression(DW_OP_LLVM_fragment, 80, 7920), metadata [[META117:![0-9]+]], metadata ptr undef, metadata !DIExpression()), !dbg [[DBG113]]
+; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[BLOB]], i64 10, !dbg [[DBG118:![0-9]+]]
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr noundef nonnull align 2 dereferenceable(980) [[ADD_PTR]], i8 3, i64 980, i1 false), !dbg [[DBG119:![0-9]+]], !DIAssignID [[DIASSIGNID120:![0-9]+]]
+; CHECK-NEXT:    call void @llvm.dbg.assign(metadata i1 undef, metadata [[META108]], metadata !DIExpression(DW_OP_LLVM_fragment, 80, 7840), metadata [[DIASSIGNID120]], metadata ptr [[ADD_PTR]], metadata !DIExpression()), !dbg [[DBG113]]
+; CHECK-NEXT:    call void @_Z3escPc(ptr noundef nonnull [[BLOB]]), !dbg [[DBG121:![0-9]+]]
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 1000, ptr nonnull [[BLOB]]) #[[ATTR5]], !dbg [[DBG122:![0-9]+]]
+; CHECK-NEXT:    ret void, !dbg [[DBG122]]
+;
+entry:
+  %blob = alloca [1000 x i8], align 16, !DIAssignID !112
+  call void @llvm.dbg.assign(metadata i1 undef, metadata !108, metadata !DIExpression(), metadata !112, metadata ptr %blob, metadata !DIExpression()), !dbg !113
+  call void @llvm.lifetime.start.p0(i64 1000, ptr nonnull %blob) #5, !dbg !114
+  call void @llvm.memset.p0.i64(ptr noundef nonnull align 16 dereferenceable(1000) %blob, i8 5, i64 1000, i1 false), !dbg !115, !DIAssignID !116
+  call void @llvm.dbg.assign(metadata i1 undef, metadata !108, metadata !DIExpression(), metadata !116, metadata ptr %blob, metadata !DIExpression()), !dbg !113
+  %add.ptr = getelementptr inbounds i8, ptr %blob, i64 10, !dbg !117
+  call void @llvm.memset.p0.i64(ptr noundef nonnull align 2 dereferenceable(980) %add.ptr, i8 3, i64 980, i1 false), !dbg !118, !DIAssignID !119
+  call void @llvm.dbg.assign(metadata i1 undef, metadata !108, metadata !DIExpression(DW_OP_LLVM_fragment, 80, 7840), metadata !119, metadata ptr %add.ptr, metadata !DIExpression()), !dbg !113
+  call void @_Z3escPc(ptr noundef nonnull %blob), !dbg !120
+  call void @llvm.lifetime.end.p0(i64 1000, ptr nonnull %blob) #5, !dbg !121
+  ret void, !dbg !121
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+
+; Function Attrs: mustprogress nocallback nofree nounwind willreturn memory(argmem: write)
+declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #2
+
+declare !dbg !122 void @_Z3escPc(ptr noundef) local_unnamed_addr #3
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata) #4
+
+attributes #0 = { mustprogress uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
+attributes #2 = { mustprogress nocallback nofree nounwind willreturn memory(argmem: write) }
+attributes #3 = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #4 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #5 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!95, !96, !97, !98, !99, !100, !101}
+!llvm.ident = !{!102}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 18.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, imports: !2, splitDebugInlining: false, nameTableKind: None)
+!1 = !DIFile(filename: "debuginfo.cpp", directory: "/home", checksumkind: CSK_MD5, checksum: "3dc84462c14a3d86dd372d0473fa13aa")
+!2 = !{!3, !16, !20, !27, !31, !35, !45, !49, !51, !53, !57, !61, !65, !69, !73, !75, !77, !79, !83, !87, !91, !93}
+!3 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !5, file: !15, line: 77)
+!4 = !DINamespace(name: "std", scope: null)
+!5 = !DISubprogram(name: "memchr", scope: !6, file: !6, line: 100, type: !7, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!6 = !DIFile(filename: "/usr/include/string.h", directory: "", checksumkind: CSK_MD5, checksum: "3fc3efdf2e52b973f380a6e7608374ff")
+!7 = !DISubroutineType(types: !8)
+!8 = !{!9, !9, !11, !12}
+!9 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !10, size: 64)
+!10 = !DIDerivedType(tag: DW_TAG_const_type, baseType: null)
+!11 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!12 = !DIDerivedType(tag: DW_TAG_typedef, name: "size_t", file: !13, line: 13, baseType: !14)
+!13 = !DIFile(filename: "build_upstream/lib/clang/18/include/__stddef_size_t.h", directory: "/home", checksumkind: CSK_MD5, checksum: "405db6ea5fb824de326715f26fa9fab5")
+!14 = !DIBasicType(name: "unsigned long", size: 64, encoding: DW_ATE_unsigned)
+!15 = !DIFile(filename: "/usr/lib64/gcc/x86_64-suse-linux/13/../../../../include/c++/13/cstring", directory: "")
+!16 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !17, file: !15, line: 78)
+!17 = !DISubprogram(name: "memcmp", scope: !6, file: !6, line: 64, type: !18, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!18 = !DISubroutineType(types: !19)
+!19 = !{!11, !9, !9, !12}
+!20 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !21, file: !15, line: 79)
+!21 = !DISubprogram(name: "memcpy", scope: !6, file: !6, line: 43, type: !22, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!22 = !DISubroutineType(types: !23)
+!23 = !{!24, !25, !26, !12}
+!24 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64)
+!25 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !24)
+!26 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !9)
+!27 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !28, file: !15, line: 80)
+!28 = !DISubprogram(name: "memmove", scope: !6, file: !6, line: 47, type: !29, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!29 = !DISubroutineType(types: !30)
+!30 = !{!24, !24, !9, !12}
+!31 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !32, file: !15, line: 81)
+!32 = !DISubprogram(name: "memset", scope: !6, file: !6, line: 61, type: !33, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!33 = !DISubroutineType(types: !34)
+!34 = !{!24, !24, !11, !12}
+!35 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !36, file: !15, line: 82)
+!36 = !DISubprogram(name: "strcat", scope: !6, file: !6, line: 149, type: !37, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!37 = !DISubroutineType(types: !38)
+!38 = !{!39, !41, !42}
+!39 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !40, size: 64)
+!40 = !DIBasicType(name: "char", size: 8, encoding: DW_ATE_signed_char)
+!41 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !39)
+!42 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !43)
+!43 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !44, size: 64)
+!44 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !40)
+!45 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !46, file: !15, line: 83)
+!46 = !DISubprogram(name: "strcmp", scope: !6, file: !6, line: 156, type: !47, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!47 = !DISubroutineType(types: !48)
+!48 = !{!11, !43, !43}
+!49 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !50, file: !15, line: 84)
+!50 = !DISubprogram(name: "strcoll", scope: !6, file: !6, line: 163, type: !47, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!51 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !52, file: !15, line: 85)
+!52 = !DISubprogram(name: "strcpy", scope: !6, file: !6, line: 141, type: !37, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!53 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !54, file: !15, line: 86)
+!54 = !DISubprogram(name: "strcspn", scope: !6, file: !6, line: 293, type: !55, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!55 = !DISubroutineType(types: !56)
+!56 = !{!12, !43, !43}
+!57 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !58, file: !15, line: 87)
+!58 = !DISubprogram(name: "strerror", scope: !6, file: !6, line: 419, type: !59, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!59 = !DISubroutineType(types: !60)
+!60 = !{!39, !11}
+!61 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !62, file: !15, line: 88)
+!62 = !DISubprogram(name: "strlen", scope: !6, file: !6, line: 407, type: !63, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!63 = !DISubroutineType(types: !64)
+!64 = !{!12, !43}
+!65 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !66, file: !15, line: 89)
+!66 = !DISubprogram(name: "strncat", scope: !6, file: !6, line: 152, type: !67, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!67 = !DISubroutineType(types: !68)
+!68 = !{!39, !41, !42, !12}
+!69 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !70, file: !15, line: 90)
+!70 = !DISubprogram(name: "strncmp", scope: !6, file: !6, line: 159, type: !71, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!71 = !DISubroutineType(types: !72)
+!72 = !{!11, !43, !43, !12}
+!73 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !74, file: !15, line: 91)
+!74 = !DISubprogram(name: "strncpy", scope: !6, file: !6, line: 144, type: !67, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!75 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !76, file: !15, line: 92)
+!76 = !DISubprogram(name: "strspn", scope: !6, file: !6, line: 297, type: !55, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!77 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !78, file: !15, line: 93)
+!78 = !DISubprogram(name: "strtok", scope: !6, file: !6, line: 356, type: !37, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!79 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !80, file: !15, line: 94)
+!80 = !DISubprogram(name: "strxfrm", scope: !6, file: !6, line: 166, type: !81, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!81 = !DISubroutineType(types: !82)
+!82 = !{!12, !41, !42, !12}
+!83 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !84, file: !15, line: 95)
+!84 = !DISubprogram(name: "strchr", scope: !6, file: !6, line: 239, type: !85, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!85 = !DISubroutineType(types: !86)
+!86 = !{!43, !43, !11}
+!87 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !88, file: !15, line: 96)
+!88 = !DISubprogram(name: "strpbrk", scope: !6, file: !6, line: 316, type: !89, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!89 = !DISubroutineType(types: !90)
+!90 = !{!43, !43, !43}
+!91 = !DIImpo...
[truncated]

Copy link

github-actions bot commented Dec 14, 2023

✅ With the latest revision this PR passed the C/C++ code formatter.

@RKSimon
Copy link
Collaborator

RKSimon commented Dec 14, 2023

Please can you reorder the commits so the new tests are added earlier so you can you show the codegen diff?

@felipepiovezan
Copy link
Contributor

The commit message should explain what is being changed here

; CHECK: [[DBG119]] = !DILocation(line: 6, column: 5, scope: [[DBG103]])
; CHECK: [[DIASSIGNID120]] = distinct !DIAssignID()
; CHECK: [[DBG121]] = !DILocation(line: 7, column: 5, scope: [[DBG103]])
; CHECK: [[DBG122]] = !DILocation(line: 8, column: 1, scope: [[DBG103]])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

are all of these actually necessary?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've simplified the test, thanks.

ConstantInt::get(DeadWriteLength->getType(), FrontSize));
DeadIntrinsic->addDereferenceableParamAttr(0, FrontSize);

Value *Indices[1] = {ConstantInt::get(DeadWriteLength->getType(), RearStart)};
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIUC we have an array here for the conversion to ArrayRef later? There is an implicit conversion from T to ArrayRef<T>, so we probably don't need this

DSE can already shorten intrinsics which have dead fronts or rears.
This patch enables DSE to split memory intrinsics that
are dead in the middle into `Front` and `Rear`:

```
  // __Front__                 ___Rear___
  // | ------------- Dead ------------- |
  //         | --- Killing --- |
```

Resolves llvm#72113
;
entry:
%blob = alloca [1000 x i8], align 16, !DIAssignID !112
call void @llvm.dbg.assign(metadata i1 undef, metadata !108, metadata !DIExpression(), metadata !112, metadata ptr %blob, metadata !DIExpression()), !dbg !113
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please simplify the test, by dropping the debug info and unneeded lifetime s

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've simplified the test and removed the lifetimes, but I've not removed the debuginfo entirely because that specific test is for dbg.assign intrinsic updates.

// If Front and Rear are both bigger than the threshold they won't be inlined
// so this seems like a bad idea. If Dead is smaller than the threshold it
// will be inlined so this isn't a good idea.
if ((FrontSize > Threshold && RearSize > Threshold) || DeadSize < Threshold)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

are all those combos covered by tests?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've added tests to cover all scenarios, thanks.

- Remove unnecessary array
- Simplify debuginfo test
- Add tests to cover the profitability check
-
@omern1
Copy link
Member Author

omern1 commented Dec 15, 2023

Sorry for the force push, diff from pre-committed tests is here: 6f26953

@bjope
Copy link
Collaborator

bjope commented Dec 20, 2023

Thanks a lot for looking into this!

Not sure how much time I have to look at this before the end of the year. But I've downloaded the current patch to run some early testing.

A few things that I noticed already:

  • The already existing helpers for trimming the start (tryToShorten) is involving some logic for atomic operations. I wonder if that is needed here as well (a bit unsure since the helper for trimming the end (tryToShortenEnd) isn't having any extra logic for atomics. So maybe it isn't needed, but perhaps something to look into unless you've figured out that it isn't needed.
  • The heuristics for when to apply the transform definitely needs to be adjusted for my target (I don't think that getMaxMemIntrinsicInlineSizeThreshold() will be enough). We can for example easily memset/memcpy 1/2/4/8 bytes cheaply (a single store). While 7 bytes would require three stores. So even if for example the "frontsize" can be reduced to something that isn't a power of two, and smaller than eight, it would be profitable for our target to still memset/memcpy 8 bytes. And similarly for the "rearsize". So depending on the target it might be important to keep the start addresses of the two parts aligned, but sometimes also the number of bytes to write.
  • I find the (DeadSize >= Threshold) requirement a bit restrictive (at least for us). One example would be if we have a memset writing X+Y+X bytes, with the Y bytes in the middle being dead. If for example X=8 and Y=1, then doing the split would result in two 8 byte stores, while a memset of 17 bytes would require 3 stores (in this example I assume that the target can do stores with align=1). So even if we normally would inline such a memset, it would be profitable to split it up and skip writing the middle Y bytes as the front/rear would be cheaper as the sizes are well-aligned.

I understand that it could be hard to find a heuristic and TTI hooks that works well for any target (and specially for out-of-targets like mine). And I don't think that needs to be a goal with the first implementation either. Adding more hooks and conditions could be something that evolves over time, when tuning this for different targets. So some of the above comments are just to highlight some different scenarios that sprung to mind when considering how to tune this for my target.

@omern1
Copy link
Member Author

omern1 commented Jan 6, 2024

Hi @bjope,

Thanks for looking at this.

The already existing helpers for trimming the start (tryToShorten) is involving some logic for atomic operations. I wonder if that is needed here as well (a bit unsure since the helper for trimming the end (tryToShortenEnd) isn't having any extra logic for atomics. So maybe it isn't needed, but perhaps something to look into unless you've figured out that it isn't needed.

I've looked at this now and I think the logic for trimming atomic operations is required here as well. I'll add it shortly.

find the (DeadSize >= Threshold) requirement a bit restrictive (at least for us). One example would be if we have a memset writing X+Y+X bytes, with the Y bytes in the middle being dead. If for example X=8 and Y=1, then doing the split would result in two 8 byte stores, while a memset of 17 bytes would require 3 stores (in this example I assume that the target can do stores with align=1). So even if we normally would inline such a memset, it would be profitable to split it up and skip writing the middle Y bytes as the front/rear would be cheaper as the sizes are well-aligned.

This is a good point, I'm inclined to add a check to let the transform run when Front and Rear are aligned.

I understand that it could be hard to find a heuristic and TTI hooks that works well for any target (and specially for out-of-targets like mine). And I don't think that needs to be a goal with the first implementation either. Adding more hooks and conditions could be something that evolves over time, when tuning this for different targets. So some of the above comments are just to highlight some different scenarios that sprung to mind when considering how to tune this for my target.

Yup, I imagine some of these considerations also exist for the heuristics in the tryToShortenBegin and tryToShortenEnd functions, they can be addressed in future iterations.

@omern1
Copy link
Member Author

omern1 commented Jan 30, 2024

Apologies for the delay (I've been away), I've added the check for atomic intrinsics and an associated test like I mentioned earlier.

@omern1
Copy link
Member Author

omern1 commented Feb 14, 2024

(Polite ping!)

@omern1
Copy link
Member Author

omern1 commented Apr 29, 2024

Hi @bjope / @felipepiovezan / @fhahn, are there any other changes that you'd like me to make to this patch?

@omern1
Copy link
Member Author

omern1 commented Apr 29, 2024

I've now also removed the DeadSize >= Threshold condition.

@@ -531,7 +533,9 @@ define void @test12_memset_other_store_in_between_partial_overlap(ptr %ptr) {

define void @test12_memset_later_store_exceeds_memset(ptr %ptr) {
; CHECK-LABEL: @test12_memset_later_store_exceeds_memset(
; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[PTR:%.*]], i8 0, i64 8, i1 false)
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 5
; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 1 dereferenceable(5) [[TMP1]], i8 0, i64 5, i1 false)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks wrong? We used to have a single memset writing 8 bytes, and now we got two memset writing 5+4=9 bytes.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry for the delay, fixed in bc32bcb.

@@ -246,7 +246,9 @@ define i32 @test5(i32 %x) nounwind ssp {
; CHECK-LABEL: @test5(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[Y:%.*]] = alloca [[STRUCT_S:%.*]], align 16
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[Y]], ptr align 16 @sS, i64 32, i1 false)
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[Y]], i64 16
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 dereferenceable(16) [[TMP0]], ptr align 16 @sS, i64 16, i1 false)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This also looks weird. Splitting up the memset in two, but we still write 16+16=32 bytes.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in 1805544.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

Lack of dead store elimination when only middle part of a memset is overwritten
6 participants