[LLVM][DWARF] Change .debug_names abbrev to be an index #81200

ayermolo · 2024-02-08T22:30:29Z

Based on the discussion in #80229
changed implementation to align with how .debug_abbrev is handled. So that
.debug_names abbrev tag is a monotonically increasing index. This allows for
tools like LLDB to access it in constant time using array like data structure.

clang-19 debug build
before change 
[41] .debug_names PROGBITS 0000000000000000 8f9e0350 137fdbe0 00 0 0 4
after change
[41] .debug_names PROGBITS 0000000000000000 8f9e0350 125bfdec 00 0 0 4

Reduction ~19.1MB

llvmbot · 2024-02-08T22:30:57Z

@llvm/pr-subscribers-debuginfo

Author: Alexander Yermolovich (ayermolo)

Changes

Based on the discussion in #80229
changed implementation to align with how .debug_abbrev is handled. So that
.debug_names abbrev tag is a monotonically increasing index. This allows for
tools like LLDB to access it in constant time.

Full diff: https://github.com/llvm/llvm-project/pull/81200.diff

4 Files Affected:

(modified) llvm/include/llvm/CodeGen/AccelTable.h (+43)
(modified) llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp (+46-58)
(modified) llvm/test/DebugInfo/X86/debug-names-dwarf64.ll (+5-5)
(modified) llvm/test/DebugInfo/X86/debug-names-types.ll (+21-21)

diff --git a/llvm/include/llvm/CodeGen/AccelTable.h b/llvm/include/llvm/CodeGen/AccelTable.h
index e6a661696354b..38da67636bae8 100644
--- a/llvm/include/llvm/CodeGen/AccelTable.h
+++ b/llvm/include/llvm/CodeGen/AccelTable.h
@@ -360,6 +360,49 @@ class DWARF5AccelTable : public AccelTable<DWARF5AccelTableData> {
     unsigned Index;
     DWARF5AccelTableData::AttributeEncoding Encoding;
   };
+  union AbbrevDescriptor {
+    struct {
+      uint32_t CompUnit : 1;
+      uint32_t TypeUnit : 1;
+      uint32_t DieOffset : 1;
+      uint32_t Parent : 2;
+      uint32_t TypeHash : 1;
+      uint32_t Tag : 26;
+    } Bits;
+    uint32_t Value = 0;
+  };
+  struct TagIndex {
+    uint32_t DieTag;
+    uint32_t Index;
+  };
+  struct cmpByTagIndex {
+    bool operator()(const TagIndex &LHS, const TagIndex &RHS) const {
+      return LHS.Index < RHS.Index;
+    }
+  };
+  enum IdxParentEncoding : uint8_t {
+    NoIndexedParent =
+        0,        /// Parent information present but parent isn't indexed.
+    Ref4 = 1,     /// Parent information present and parent is indexed.
+    NoParent = 2, /// Parent information missing.
+  };
+
+  /// Returns DW_IDX_parent abbrev encoding for the given form.
+  static uint8_t
+  encodeIdxParent(const std::optional<dwarf::Form> MaybeParentForm) {
+    if (!MaybeParentForm)
+      return NoParent;
+    switch (*MaybeParentForm) {
+    case dwarf::Form::DW_FORM_flag_present:
+      return NoIndexedParent;
+    case dwarf::Form::DW_FORM_ref4:
+      return Ref4;
+    default:
+      // This is not crashing on bad input: we should only reach this if the
+      // internal compiler logic is faulty; see getFormForIdxParent.
+      llvm_unreachable("Bad form for IDX_parent");
+    }
+  }
   /// Returns type units that were constructed.
   const TUVectorTy &getTypeUnitsSymbols() { return TUSymbolsOrHashes; }
   /// Add a type unit start symbol.
diff --git a/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp b/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp
index 1024aabf2ab0f..a30a1393ba58f 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp
@@ -208,7 +208,9 @@ class Dwarf5AccelTableWriter : public AccelTableWriter {
   };
 
   Header Header;
-  DenseMap<uint32_t, SmallVector<DWARF5AccelTableData::AttributeEncoding, 3>>
+  std::map<DWARF5AccelTable::TagIndex,
+           SmallVector<DWARF5AccelTableData::AttributeEncoding, 3>,
+           DWARF5AccelTable::cmpByTagIndex>
       Abbreviations;
   ArrayRef<std::variant<MCSymbol *, uint64_t>> CompUnits;
   ArrayRef<std::variant<MCSymbol *, uint64_t>> TypeUnits;
@@ -223,6 +225,15 @@ class Dwarf5AccelTableWriter : public AccelTableWriter {
   bool IsSplitDwarf = false;
   /// Stores the DIE offsets which are indexed by this table.
   DenseSet<OffsetAndUnitID> IndexedOffsets;
+  /// Mapping between AbbrevTag and Index.
+  std::unordered_map<uint32_t, uint32_t> AbbrevTagToIndexMap;
+
+  /// Constructs and returns a unique AbbrevTag that captures what a DIE
+  /// accesses.
+  DWARF5AccelTable::TagIndex getAbbrevIndex(
+      const unsigned DieTag,
+      const std::optional<DWARF5AccelTable::UnitIndexAndEncoding> &EntryRet,
+      const std::optional<dwarf::Form> &MaybeParentForm);
 
   void populateAbbrevsMap();
 
@@ -234,7 +245,7 @@ class Dwarf5AccelTableWriter : public AccelTableWriter {
   void emitEntry(
       const DWARF5AccelTableData &Entry,
       const DenseMap<OffsetAndUnitID, MCSymbol *> &DIEOffsetToAccelEntryLabel,
-      DenseSet<MCSymbol *> &EmittedAccelEntrySymbols) const;
+      DenseSet<MCSymbol *> &EmittedAccelEntrySymbols);
   void emitData();
 
 public:
@@ -409,49 +420,30 @@ DWARF5AccelTableData::getDefiningParentDieOffset(const DIE &Die) {
   return {};
 }
 
-enum IdxParentEncoding : uint8_t {
-  NoIndexedParent = 0, /// Parent information present but parent isn't indexed.
-  Ref4 = 1,            /// Parent information present and parent is indexed.
-  NoParent = 2,        /// Parent information missing.
-};
-
-static uint32_t constexpr NumBitsIdxParent = 2;
-
-uint8_t encodeIdxParent(const std::optional<dwarf::Form> MaybeParentForm) {
-  if (!MaybeParentForm)
-    return NoParent;
-  switch (*MaybeParentForm) {
-  case dwarf::Form::DW_FORM_flag_present:
-    return NoIndexedParent;
-  case dwarf::Form::DW_FORM_ref4:
-    return Ref4;
-  default:
-    // This is not crashing on bad input: we should only reach this if the
-    // internal compiler logic is faulty; see getFormForIdxParent.
-    llvm_unreachable("Bad form for IDX_parent");
-  }
-}
-
-static uint32_t constexpr ParentBitOffset = dwarf::DW_IDX_type_hash;
-static uint32_t constexpr TagBitOffset = ParentBitOffset + NumBitsIdxParent;
-static uint32_t getTagFromAbbreviationTag(const uint32_t AbbrvTag) {
-  return AbbrvTag >> TagBitOffset;
-}
-
-/// Constructs a unique AbbrevTag that captures what a DIE accesses.
-/// Using this tag we can emit a unique abbreviation for each DIE.
-static uint32_t constructAbbreviationTag(
-    const unsigned Tag,
+DWARF5AccelTable::TagIndex Dwarf5AccelTableWriter::getAbbrevIndex(
+    const unsigned DieTag,
     const std::optional<DWARF5AccelTable::UnitIndexAndEncoding> &EntryRet,
-    std::optional<dwarf::Form> MaybeParentForm) {
-  uint32_t AbbrvTag = 0;
-  if (EntryRet)
-    AbbrvTag |= 1 << EntryRet->Encoding.Index;
-  AbbrvTag |= 1 << dwarf::DW_IDX_die_offset;
-  AbbrvTag |= 1 << dwarf::DW_IDX_parent;
-  AbbrvTag |= encodeIdxParent(MaybeParentForm) << ParentBitOffset;
-  AbbrvTag |= Tag << TagBitOffset;
-  return AbbrvTag;
+    const std::optional<dwarf::Form> &MaybeParentForm) {
+  DWARF5AccelTable::AbbrevDescriptor AbbrvDesc;
+  if (EntryRet) {
+    switch (EntryRet->Encoding.Index) {
+    case dwarf::DW_IDX_compile_unit:
+      AbbrvDesc.Bits.CompUnit = true;
+      break;
+    case dwarf::DW_IDX_type_unit:
+      AbbrvDesc.Bits.TypeUnit = true;
+      break;
+    default:
+      llvm_unreachable("Invalid encoding index");
+      break;
+    }
+  }
+  AbbrvDesc.Bits.Parent = DWARF5AccelTable::encodeIdxParent(MaybeParentForm);
+  AbbrvDesc.Bits.DieOffset = true;
+  AbbrvDesc.Bits.Tag = DieTag;
+  auto Iter = AbbrevTagToIndexMap.insert(
+      {AbbrvDesc.Value, static_cast<uint32_t>(AbbrevTagToIndexMap.size() + 1)});
+  return {DieTag, Iter.first->second};
 }
 
 static std::optional<dwarf::Form>
@@ -476,8 +468,8 @@ void Dwarf5AccelTableWriter::populateAbbrevsMap() {
         unsigned Tag = Value->getDieTag();
         std::optional<dwarf::Form> MaybeParentForm = getFormForIdxParent(
             IndexedOffsets, Value->getParentDieOffsetAndUnitID());
-        uint32_t AbbrvTag =
-            constructAbbreviationTag(Tag, EntryRet, MaybeParentForm);
+        const DWARF5AccelTable::TagIndex AbbrvTag =
+            getAbbrevIndex(Tag, EntryRet, MaybeParentForm);
         if (Abbreviations.count(AbbrvTag) == 0) {
           SmallVector<DWARF5AccelTableData::AttributeEncoding, 3> UA;
           if (EntryRet)
@@ -538,11 +530,9 @@ void Dwarf5AccelTableWriter::emitAbbrevs() const {
   Asm->OutStreamer->emitLabel(AbbrevStart);
   for (const auto &Abbrev : Abbreviations) {
     Asm->OutStreamer->AddComment("Abbrev code");
-    uint32_t Tag = getTagFromAbbreviationTag(Abbrev.first);
-    assert(Tag != 0);
-    Asm->emitULEB128(Abbrev.first);
-    Asm->OutStreamer->AddComment(dwarf::TagString(Tag));
-    Asm->emitULEB128(Tag);
+    Asm->emitULEB128(Abbrev.first.Index);
+    Asm->OutStreamer->AddComment(dwarf::TagString(Abbrev.first.DieTag));
+    Asm->emitULEB128(Abbrev.first.DieTag);
     for (const auto &AttrEnc : Abbrev.second) {
       Asm->emitULEB128(AttrEnc.Index, dwarf::IndexString(AttrEnc.Index).data());
       Asm->emitULEB128(AttrEnc.Form,
@@ -558,20 +548,18 @@ void Dwarf5AccelTableWriter::emitAbbrevs() const {
 void Dwarf5AccelTableWriter::emitEntry(
     const DWARF5AccelTableData &Entry,
     const DenseMap<OffsetAndUnitID, MCSymbol *> &DIEOffsetToAccelEntryLabel,
-    DenseSet<MCSymbol *> &EmittedAccelEntrySymbols) const {
+    DenseSet<MCSymbol *> &EmittedAccelEntrySymbols) {
   std::optional<DWARF5AccelTable::UnitIndexAndEncoding> EntryRet =
       getIndexForEntry(Entry);
   std::optional<OffsetAndUnitID> MaybeParentOffset =
       Entry.getParentDieOffsetAndUnitID();
   std::optional<dwarf::Form> MaybeParentForm =
       getFormForIdxParent(IndexedOffsets, MaybeParentOffset);
-  uint32_t AbbrvTag =
-      constructAbbreviationTag(Entry.getDieTag(), EntryRet, MaybeParentForm);
-  auto AbbrevIt = Abbreviations.find(AbbrvTag);
+  const DWARF5AccelTable::TagIndex TagIndexVal =
+      getAbbrevIndex(Entry.getDieTag(), EntryRet, MaybeParentForm);
+  auto AbbrevIt = Abbreviations.find(TagIndexVal);
   assert(AbbrevIt != Abbreviations.end() &&
          "Why wasn't this abbrev generated?");
-  assert(getTagFromAbbreviationTag(AbbrevIt->first) == Entry.getDieTag() &&
-         "Invalid Tag");
 
   auto EntrySymbolIt =
       DIEOffsetToAccelEntryLabel.find(Entry.getDieOffsetAndUnitID());
@@ -584,7 +572,7 @@ void Dwarf5AccelTableWriter::emitEntry(
   if (EmittedAccelEntrySymbols.insert(EntrySymbol).second)
     Asm->OutStreamer->emitLabel(EntrySymbol);
 
-  Asm->emitULEB128(AbbrevIt->first, "Abbreviation code");
+  Asm->emitULEB128(TagIndexVal.Index, "Abbreviation code");
 
   for (const auto &AttrEnc : AbbrevIt->second) {
     Asm->OutStreamer->AddComment(dwarf::IndexString(AttrEnc.Index));
diff --git a/llvm/test/DebugInfo/X86/debug-names-dwarf64.ll b/llvm/test/DebugInfo/X86/debug-names-dwarf64.ll
index c15e2ad1d56b0..9a5fd07335873 100644
--- a/llvm/test/DebugInfo/X86/debug-names-dwarf64.ll
+++ b/llvm/test/DebugInfo/X86/debug-names-dwarf64.ll
@@ -30,11 +30,6 @@
 ; CHECK-NEXT:     CU[0]: 0x00000000
 ; CHECK-NEXT:   ]
 ; CHECK-NEXT:   Abbreviations [
-; CHECK-NEXT:     Abbreviation [[ABBREV_LABEL:0x[0-9a-f]*]] {
-; CHECK-NEXT:       Tag: DW_TAG_label
-; CHECK-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
-; CHECK-NEXT:       DW_IDX_parent: DW_FORM_ref4
-; CHECK-NEXT:     }
 ; CHECK-NEXT:     Abbreviation [[ABBREV:0x[0-9a-f]*]] {
 ; CHECK-NEXT:       Tag: DW_TAG_base_type
 ; CHECK-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
@@ -50,6 +45,11 @@
 ; CHECK-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
 ; CHECK-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; CHECK-NEXT:     }
+; CHECK-NEXT:     Abbreviation [[ABBREV_LABEL:0x[0-9a-f]*]] {
+; CHECK-NEXT:       Tag: DW_TAG_label
+; CHECK-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; CHECK-NEXT:       DW_IDX_parent: DW_FORM_ref4
+; CHECK-NEXT:     }
 ; CHECK-NEXT:   ]
 ; CHECK-NEXT:   Bucket 0 [
 ; CHECK-NEXT:     Name 1 {
diff --git a/llvm/test/DebugInfo/X86/debug-names-types.ll b/llvm/test/DebugInfo/X86/debug-names-types.ll
index f41bb5524b9c3..ff0d4d52c1f07 100644
--- a/llvm/test/DebugInfo/X86/debug-names-types.ll
+++ b/llvm/test/DebugInfo/X86/debug-names-types.ll
@@ -37,20 +37,14 @@
 ; CHECK-NEXT:        LocalTU[0]: 0x00000000
 ; CHECK-NEXT:      ]
 ; CHECK:        Abbreviations [
-; CHECK-NEXT:     Abbreviation [[ABBREV3:0x[0-9a-f]*]] {
-; CHECK-NEXT:       Tag: DW_TAG_structure_type
-; CHECK-NEXT:       DW_IDX_type_unit: DW_FORM_data1
-; CHECK-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
-; CHECK-NEXT:       DW_IDX_parent: DW_FORM_flag_present
-; CHECK-NEXT:     }
-; CHECK-NEXT:     Abbreviation [[ABBREV4:0x[0-9a-f]*]] {
+; CHECK-NEXT:     Abbreviation [[ABBREV:0x[0-9a-f]*]] {
 ; CHECK-NEXT:       Tag: DW_TAG_base_type
-; CHECK-NEXT:       DW_IDX_type_unit: DW_FORM_data1
 ; CHECK-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
 ; CHECK-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; CHECK-NEXT:     }
-; CHECK-NEXT:     Abbreviation [[ABBREV:0x[0-9a-f]*]] {
-; CHECK-NEXT:       Tag: DW_TAG_base_type
+; CHECK-NEXT:     Abbreviation [[ABBREV3:0x[0-9a-f]*]] {
+; CHECK-NEXT:       Tag: DW_TAG_structure_type
+; CHECK-NEXT:       DW_IDX_type_unit: DW_FORM_data1
 ; CHECK-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
 ; CHECK-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; CHECK-NEXT:     }
@@ -64,6 +58,12 @@
 ; CHECK-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
 ; CHECK-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; CHECK-NEXT:     }
+; CHECK-NEXT:     Abbreviation [[ABBREV4:0x[0-9a-f]*]] {
+; CHECK-NEXT:       Tag: DW_TAG_base_type
+; CHECK-NEXT:       DW_IDX_type_unit: DW_FORM_data1
+; CHECK-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; CHECK-NEXT:       DW_IDX_parent: DW_FORM_flag_present
+; CHECK-NEXT:     }
 ; CHECK-NEXT:   ]
 ; CHECK-NEXT:   Bucket 0 [
 ; CHECK-NEXT:     Name 1 {
@@ -130,7 +130,7 @@
 ; CHECK-SPLIT:          Foreign TU count: 1
 ; CHECK-SPLIT-NEXT:     Bucket count: 4
 ; CHECK-SPLIT-NEXT:     Name count: 4
-; CHECK-SPLIT-NEXT:     Abbreviations table size: 0x32
+; CHECK-SPLIT-NEXT:     Abbreviations table size: 0x2D
 ; CHECK-SPLIT-NEXT:     Augmentation: 'LLVM0700'
 ; CHECK-SPLIT-NEXT:   }
 ; CHECK-SPLIT-NEXT:   Compilation Unit offsets [
@@ -140,20 +140,14 @@
 ; CHECK-SPLIT-NEXT:     ForeignTU[0]: 0x675d23e4f33235f2
 ; CHECK-SPLIT-NEXT:   ]
 ; CHECK-SPLIT-NEXT:   Abbreviations [
-; CHECK-SPLIT-NEXT:     Abbreviation [[ABBREV1:0x[0-9a-f]*]] {
-; CHECK-SPLIT-NEXT:       Tag: DW_TAG_structure_type
-; CHECK-SPLIT-NEXT:       DW_IDX_type_unit: DW_FORM_data1
-; CHECK-SPLIT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
-; CHECK-SPLIT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
-; CHECK-SPLIT-NEXT:     }
-; CHECK-SPLIT-NEXT:     Abbreviation [[ABBREV4:0x[0-9a-f]*]] {
+; CHECK-SPLIT-NEXT:     Abbreviation [[ABBREV2:0x[0-9a-f]*]] {
 ; CHECK-SPLIT-NEXT:       Tag: DW_TAG_base_type
-; CHECK-SPLIT-NEXT:       DW_IDX_type_unit: DW_FORM_data1
 ; CHECK-SPLIT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
 ; CHECK-SPLIT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; CHECK-SPLIT-NEXT:     }
-; CHECK-SPLIT-NEXT:     Abbreviation [[ABBREV2:0x[0-9a-f]*]] {
-; CHECK-SPLIT-NEXT:       Tag: DW_TAG_base_type
+; CHECK-SPLIT-NEXT:     Abbreviation [[ABBREV1:0x[0-9a-f]*]] {
+; CHECK-SPLIT-NEXT:       Tag: DW_TAG_structure_type
+; CHECK-SPLIT-NEXT:       DW_IDX_type_unit: DW_FORM_data1
 ; CHECK-SPLIT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
 ; CHECK-SPLIT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; CHECK-SPLIT-NEXT:     }
@@ -167,6 +161,12 @@
 ; CHECK-SPLIT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
 ; CHECK-SPLIT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; CHECK-SPLIT-NEXT:     }
+; CHECK-SPLIT-NEXT:     Abbreviation [[ABBREV4:0x[0-9a-f]*]] {
+; CHECK-SPLIT-NEXT:       Tag: DW_TAG_base_type
+; CHECK-SPLIT-NEXT:       DW_IDX_type_unit: DW_FORM_data1
+; CHECK-SPLIT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; CHECK-SPLIT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
+; CHECK-SPLIT-NEXT:     }
 ; CHECK-SPLIT-NEXT:   ]
 ; CHECK-SPLIT-NEXT:   Bucket 0 [
 ; CHECK-SPLIT-NEXT:     Name 1 {

felipepiovezan

This allows for tools like LLDB to access it in constant time.

I don't think this is true: the spec does not require the table to be monotonically increasing. So LLDB either needs to first check if all entries are neatly organized, or just build a map anyway.

The big motivation for this patch are the space savings: the debug_names section will be much smaller as most abbreviations now fit in 1 byte

llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp

dwblaikie · 2024-02-08T22:54:08Z

This allows for tools like LLDB to access it in constant time.

I don't think this is true: the spec does not require the table to be monotonically increasing. So LLDB either needs to first check if all entries are neatly organized, or just build a map anyway.

At least the way libDebugInfoDWARF does it is that it does check - it parses all the abbrevs, but rather than creating a map - it keeps track of whether the abbrev numbers are monotonically increasing and records the first one - and if they are it does the lookup in O(1), and if they aren't it does a linear search.

Yes, a map would be O(1) in both cases, with higher constant factors and memory overhead - but given this approach has worked for some time for abbrevs, I'd like to use it for debug_name abbrevs too. And the sooner we switch to it for both producer and consumers we control, the more likely we are to encourage this as a fairly simple way to do these lookups quickly. (not that the performance of abbrev lookups is likely to be critical, especially in .debug_names when there are probably only a handful of them anyway)

Looks like LLDB maybe uses the llvm::DWARFAbbreviationDeclarationSet which uses this strategy implemented here:

llvm-project/llvm/lib/DebugInfo/DWARF/DWARFDebugAbbrev.cpp

Line 62 in f720150

DWARFAbbreviationDeclarationSet::getAbbreviationDeclaration(

(FirstAbbrevCode is set to the sentinel UINT32_MAX if the abbrev numbers are not monotonically increasing, and a linear search is used - otherwise a direct index is used, after bounds checking)

The big motivation for this patch are the space savings: the debug_names section will be much smaller as most abbreviations now fit in 1 byte

That's nice too, for sure.

dwblaikie · 2024-02-08T22:58:02Z

llvm/include/llvm/CodeGen/AccelTable.h

+      uint32_t TypeHash : 1;
+      uint32_t Tag : 26;
+    } Bits;
+    uint32_t Value = 0;


Could we avoid the union entirely, and use the structure with the bitfield members directly instead?

Changed it struct. Although it requires bit_cast and memset. Unless there is some c++ I am not aware of.

ayermolo · 2024-02-08T23:03:51Z

I put structs and helper method into 8ui9

This allows for tools like LLDB to access it in constant time.

I don't think this is true: the spec does not require the table to be monotonically increasing. So LLDB either needs to first check if all entries are neatly organized, or just build a map anyway.

At least the way libDebugInfoDWARF does it is that it does check - it parses all the abbrevs, but rather than creating a map - it keeps track of whether the abbrev numbers are monotonically increasing and records the first one - and if they are it does the lookup in O(1), and if they aren't it does a linear search.

Yes, a map would be O(1) in both cases, with higher constant factors and memory overhead - but given this approach has worked for some time for abbrevs, I'd like to use it for debug_name abbrevs too. And the sooner we switch to it for both producer and consumers we control, the more likely we are to encourage this as a fairly simple way to do these lookups quickly. (not that the performance of abbrev lookups is likely to be critical, especially in .debug_names when there are probably only a handful of them anyway)

Looks like LLDB maybe uses the llvm::DWARFAbbreviationDeclarationSet which uses this strategy implemented here:

llvm-project/llvm/lib/DebugInfo/DWARF/DWARFDebugAbbrev.cpp

Line 62 in f720150

DWARFAbbreviationDeclarationSet::getAbbreviationDeclaration(

(FirstAbbrevCode is set to the sentinel UINT32_MAX if the abbrev numbers are not monotonically increasing, and a linear search is used - otherwise a direct index is used, after bounds checking)

The big motivation for this patch are the space savings: the debug_names section will be much smaller as most abbreviations now fit in 1 byte

That's nice too, for sure.

Right this. Phrased it poorly.

ayermolo · 2024-02-08T23:04:50Z

Also I put the structs into the header so that I can use them on BOLT side also. To reduce duplication of code.

felipepiovezan · 2024-02-09T00:22:34Z

Looks like LLDB maybe uses the llvm::DWARFAbbreviationDeclarationSet which uses this strategy implemented here:

Ohh nice! I wasn't aware of that, I'm glad we do it!

clayborg · 2024-02-09T00:34:18Z

llvm/include/llvm/CodeGen/AccelTable.h

+      uint32_t DieOffset : 1;
+      uint32_t Parent : 2;
+      uint32_t TypeHash : 1;
+      uint32_t Tag : 26;


the tag only requires a uint16_t. Can we make this 16 bits instead of 26?

Or maybe this can be:

union AbbrevDescriptor { struct { uint16_t CompUnit : 1; uint16_t TypeUnit : 1; uint16_t DieOffset : 1; uint16_t Parent : 2; uint16_t TypeHash : 1; uint16_t Tag; } Bits; uint32_t Value = 0; };

Does it matter?

github-actions · 2024-02-09T02:48:15Z

✅ With the latest revision this PR passed the C/C++ code formatter.

dwblaikie · 2024-02-09T15:52:04Z

llvm/include/llvm/CodeGen/AccelTable.h

+
+  /// Returns DW_IDX_parent abbrev encoding for the given form.
+  static uint8_t
+  encodeIdxParent(const std::optional<dwarf::Form> MaybeParentForm) {


LLVM code, by and large, does not use const on values - please don't do this. It's confusing to read in a codebase that doesn't usually do it as it may be mistaken for a missing & even when that's not the intent.

dwblaikie · 2024-02-09T15:52:39Z

llvm/include/llvm/CodeGen/AccelTable.h

+  struct cmpByTagIndex {
+    bool operator()(const TagIndex &LHS, const TagIndex &RHS) const {
+      return LHS.Index < RHS.Index;
+    }
+  };


This is probably better as inline bool operator<(const TagIndex &LHS, const TagIndex &RHS) then it doesn't need to be passed as a parameter to the map later, it'll be the default.

dwblaikie · 2024-02-09T15:53:28Z

llvm/include/llvm/CodeGen/AccelTable.h

+    uint32_t DieTag;
+    uint32_t Index;


Why does the index need to be keyed on both tag and index? The index is a unique identifier for an abbreviation (by definition).

dwblaikie · 2024-02-09T16:05:13Z

llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp

@@ -223,6 +225,15 @@ class Dwarf5AccelTableWriter : public AccelTableWriter {
  bool IsSplitDwarf = false;
  /// Stores the DIE offsets which are indexed by this table.
  DenseSet<OffsetAndUnitID> IndexedOffsets;
+  /// Mapping between AbbrevTag and Index.
+  DenseMap<uint32_t, uint32_t> AbbrevTagToIndexMap;


It's awkward/brittle to maintain two different maps - I think the Abbreviations map should be the only one.

It should be keyed on the AbbrevDescriptor (it shouldn't need bitcasts to/from - define a hash and equality function for AbbrevDescriptor and use that type itself as the key). Actually, perhaps it should be a DenseSet instead? With the attributes as part of the key/hash - we are currentyl duplicately encoding the attributes in the AbbrevDescirptor bitfields, and then in the SmallVector of AttributeEncodings. That redundancy seems unfortunate/unnecessary. The index can be populated when it's inserted into the abbrev table.

Ultimately, I think it'd be suitable for this code to be shared with the DIE abbreviation handling, which looks like this:

llvm-project/llvm/lib/CodeGen/AsmPrinter/DIE.cpp

Line 140 in b081e9d

DIEAbbrev &DIEAbbrevSet::uniqueAbbreviation(DIE &Die) {

(the DIE abbrev stuff could maybe be cleaned up a bit - it's gone pretty much untouched since the early days (which is great - means it's been a robust implementation/not a major source of issues - but equally, could use some love if we're looking at it anyway now))

So some path that goes towards that seems good to me.

Changed to folding set.

llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp

dwblaikie

Thanks! This looks like the right direction to me (ideally we'd actually share some of the code, but making it more similar at least goes part way to easing such a merge & at least making the differences/reasons for differences clearer, against the similarities)

llvm/include/llvm/CodeGen/AccelTable.h

dwblaikie · 2024-02-13T15:50:59Z

llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp

+    if (Enc.Index == dwarf::DW_IDX_parent)
+      ID.AddInteger(Enc.Form);


This special case is here because we use the same form for every other DW_IDX attribute except parent? Could you add an assert that checks the form for non-parent IDX is the expected form? (so if we tweak the code to use other novel forms in the future, we know to update this code)

For DW_IDX_parent it can be DW_FORM_flag_present or DW_FORM_ref4. It depends on a DIE. For all others form is unique per module.
For DW_IDX_die_offset it's always DW_IDX_die_offset.
For DW_IDX_compile_unit/DW_IDX_type_unit it can change based on size of CU/TU lists in the module. So DW_FORM_data{1,2,4}.

Oh, but it's the same form for all entries, even the smaller ones - if the list is large enough?
Might be good to just include the form in the hash here, as we might optimize it later to use smaller forms for smaller entries? (at least we should assert the expected form so we can catch this later - but some amount of generality might just be easier to do now rather than assert and fix later)

OK. Changed to always include form.

llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp

Based on the discussion in llvm#80229 changed implementation to align with how .debug_abbrev is handled. So that .debug_names abbrev tag is a monotonically increasing index. This allows for tools like LLDB to access it in constant time.

ayermolo · 2024-02-14T19:49:15Z

Rebased this branch by mistake. Thus force push.

dwblaikie

Thanks!

ayermolo · 2024-02-14T20:22:41Z

Thanks for reviewing, and valuable feedback.

vitalybuka · 2024-02-15T00:33:27Z

There is a memory leak https://lab.llvm.org/buildbot/#/builders/5/builds/40985

ayermolo · 2024-02-15T00:42:37Z

There is a memory leak https://lab.llvm.org/buildbot/#/builders/5/builds/40985

hmm, let me take a look.

ayermolo · 2024-02-15T07:11:06Z

Fix: #81828

llvmbot added the debuginfo label Feb 8, 2024

ayermolo requested review from felipepiovezan, dwblaikie and clayborg February 8, 2024 22:31

felipepiovezan approved these changes Feb 8, 2024

View reviewed changes

llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp Outdated Show resolved Hide resolved

dwblaikie reviewed Feb 8, 2024

View reviewed changes

clayborg reviewed Feb 9, 2024

View reviewed changes

dwblaikie reviewed Feb 9, 2024

View reviewed changes

felipepiovezan reviewed Feb 12, 2024

View reviewed changes

llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp Outdated Show resolved Hide resolved

dwblaikie reviewed Feb 13, 2024

View reviewed changes

ayermolo added 9 commits February 13, 2024 16:13

removed union, changed to DenseMap

4d248a2

format

3f6f8d2

updated test

13e9648

updated test

d4b3a59

Changed to set

b006fe5

changed back to auto

ac7ec22

changed naming, changed index

3354eb4

changed to always hash form

36cb2b6

ayermolo force-pushed the abbrevIndex branch from 66d2e95 to 36cb2b6 Compare February 14, 2024 19:47

dwblaikie approved these changes Feb 14, 2024

View reviewed changes

ayermolo merged commit a78d13d into llvm:main Feb 14, 2024
4 checks passed

ayermolo deleted the abbrevIndex branch February 14, 2024 20:22

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[LLVM][DWARF] Change .debug_names abbrev to be an index #81200

[LLVM][DWARF] Change .debug_names abbrev to be an index #81200

ayermolo commented Feb 8, 2024 •

edited

llvmbot commented Feb 8, 2024

felipepiovezan left a comment •

edited

dwblaikie commented Feb 8, 2024

dwblaikie Feb 8, 2024

ayermolo Feb 9, 2024

ayermolo commented Feb 8, 2024

ayermolo commented Feb 8, 2024

felipepiovezan commented Feb 9, 2024

clayborg Feb 9, 2024

ayermolo Feb 9, 2024

github-actions bot commented Feb 9, 2024 •

edited

dwblaikie Feb 9, 2024

dwblaikie Feb 9, 2024

dwblaikie Feb 9, 2024

dwblaikie Feb 9, 2024

ayermolo Feb 12, 2024

dwblaikie left a comment

dwblaikie Feb 13, 2024

ayermolo Feb 13, 2024

dwblaikie Feb 14, 2024

ayermolo Feb 14, 2024

ayermolo commented Feb 14, 2024

dwblaikie left a comment

ayermolo commented Feb 14, 2024

vitalybuka commented Feb 15, 2024

ayermolo commented Feb 15, 2024

ayermolo commented Feb 15, 2024

		if (Enc.Index == dwarf::DW_IDX_parent)
		ID.AddInteger(Enc.Form);

[LLVM][DWARF] Change .debug_names abbrev to be an index #81200

[LLVM][DWARF] Change .debug_names abbrev to be an index #81200

Conversation

ayermolo commented Feb 8, 2024 • edited

llvmbot commented Feb 8, 2024

felipepiovezan left a comment • edited

Choose a reason for hiding this comment

dwblaikie commented Feb 8, 2024

Choose a reason for hiding this comment

Choose a reason for hiding this comment

ayermolo commented Feb 8, 2024

ayermolo commented Feb 8, 2024

felipepiovezan commented Feb 9, 2024

Choose a reason for hiding this comment

Choose a reason for hiding this comment

github-actions bot commented Feb 9, 2024 • edited

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

dwblaikie left a comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

ayermolo commented Feb 14, 2024

dwblaikie left a comment

Choose a reason for hiding this comment

ayermolo commented Feb 14, 2024

vitalybuka commented Feb 15, 2024

ayermolo commented Feb 15, 2024

ayermolo commented Feb 15, 2024

ayermolo commented Feb 8, 2024 •

edited

felipepiovezan left a comment •

edited

github-actions bot commented Feb 9, 2024 •

edited