[llvm-symbolizer] nfc, use map instead of vector #69552

chenzheng1030 · 2023-10-19T02:46:33Z

Use map instead of vector for the symbols, so that:

don't need to sort after collecting all symbols
easily handle of symbols with same addresses.
efficiently find the symbol which has same address when adding a new symbol. (A follow up patch [XCOFF] make related SD symbols as isFunction #69553 needs this functionality.)

llvmbot · 2023-10-19T02:47:41Z

@llvm/pr-subscribers-llvm-binary-utilities

@llvm/pr-subscribers-debuginfo

Author: Chen Zheng (chenzheng1030)

Changes

Use map instead of vector for the symbols, so that:

don't need to sort after collecting all symbols
easily handle of symbols with same addresses.
efficiently find the symbol which has same address when adding a new symbol. (A follow up patch needs this functionality.)

Full diff: https://github.com/llvm/llvm-project/pull/69552.diff

2 Files Affected:

(modified) llvm/include/llvm/DebugInfo/Symbolize/SymbolizableObjectFile.h (+2-6)
(modified) llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp (+22-27)

diff --git a/llvm/include/llvm/DebugInfo/Symbolize/SymbolizableObjectFile.h b/llvm/include/llvm/DebugInfo/Symbolize/SymbolizableObjectFile.h
index 075dbe3e0e372ed..8a881acb010ac3c 100644
--- a/llvm/include/llvm/DebugInfo/Symbolize/SymbolizableObjectFile.h
+++ b/llvm/include/llvm/DebugInfo/Symbolize/SymbolizableObjectFile.h
@@ -17,6 +17,7 @@
 #include "llvm/DebugInfo/Symbolize/SymbolizableModule.h"
 #include "llvm/Support/Error.h"
 #include <cstdint>
+#include <map>
 #include <memory>
 #include <string>
 #include <utility>
@@ -73,7 +74,6 @@ class SymbolizableObjectFile : public SymbolizableModule {
   bool UntagAddresses;
 
   struct SymbolDesc {
-    uint64_t Addr;
     // If size is 0, assume that symbol occupies the whole memory range up to
     // the following symbol.
     uint64_t Size;
@@ -82,12 +82,8 @@ class SymbolizableObjectFile : public SymbolizableModule {
     // Non-zero if this is an ELF local symbol. See the comment in
     // getNameFromSymbolTable.
     uint32_t ELFLocalSymIdx;
-
-    bool operator<(const SymbolDesc &RHS) const {
-      return Addr != RHS.Addr ? Addr < RHS.Addr : Size < RHS.Size;
-    }
   };
-  std::vector<SymbolDesc> Symbols;
+  std::map<uint64_t, SymbolDesc> Symbols;
   // (index, filename) pairs of ELF STT_FILE symbols.
   std::vector<std::pair<uint32_t, StringRef>> FileSymbols;
 
diff --git a/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp b/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp
index 6b8068a531c05fa..3b5a0ce709f5c08 100644
--- a/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp
+++ b/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp
@@ -70,20 +70,6 @@ SymbolizableObjectFile::create(const object::ObjectFile *Obj,
         return std::move(E);
   }
 
-  std::vector<SymbolDesc> &SS = res->Symbols;
-  // Sort by (Addr,Size,Name). If several SymbolDescs share the same Addr,
-  // pick the one with the largest Size. This helps us avoid symbols with no
-  // size information (Size=0).
-  llvm::stable_sort(SS);
-  auto I = SS.begin(), E = SS.end(), J = SS.begin();
-  while (I != E) {
-    auto OI = I;
-    while (++I != E && OI->Addr == I->Addr) {
-    }
-    *J++ = I[-1];
-  }
-  SS.erase(J, SS.end());
-
   return std::move(res);
 }
 
@@ -134,7 +120,10 @@ Error SymbolizableObjectFile::addCoffExportSymbols(
     uint32_t NextOffset = I != E ? I->Offset : Export.Offset + 1;
     uint64_t SymbolStart = ImageBase + Export.Offset;
     uint64_t SymbolSize = NextOffset - Export.Offset;
-    Symbols.push_back({SymbolStart, SymbolSize, Export.Name, 0});
+    // If the SymbolStart exists, use the one with the large Size.
+    // This helps us avoid symbols with no size information (Size = 0).
+    if (!Symbols.count(SymbolStart) || SymbolSize >= Symbols[SymbolStart].Size)
+      Symbols[SymbolStart] = {SymbolSize, Export.Name, 0};
   }
   return Error::success();
 }
@@ -215,7 +204,14 @@ Error SymbolizableObjectFile::addSymbol(const SymbolRef &Symbol,
 
   if (Obj.isELF() && ELFSymbolRef(Symbol).getBinding() != ELF::STB_LOCAL)
     ELFSymIdx = 0;
-  Symbols.push_back({SymbolAddress, SymbolSize, SymbolName, ELFSymIdx});
+
+  // If the SymbolAddress exists, use the one with the large Size.
+  // This helps us avoid symbols with no size information (Size = 0).
+  if (!Symbols.count(SymbolAddress) ||
+      SymbolSize >= Symbols[SymbolAddress].Size) {
+    Symbols[SymbolAddress] = {SymbolSize, SymbolName, ELFSymIdx};
+  }
+
   return Error::success();
 }
 
@@ -234,26 +230,25 @@ uint64_t SymbolizableObjectFile::getModulePreferredBase() const {
 bool SymbolizableObjectFile::getNameFromSymbolTable(
     uint64_t Address, std::string &Name, uint64_t &Addr, uint64_t &Size,
     std::string &FileName) const {
-  SymbolDesc SD{Address, UINT64_C(-1), StringRef(), 0};
-  auto SymbolIterator = llvm::upper_bound(Symbols, SD);
+  auto SymbolIterator = Symbols.upper_bound(Address);
   if (SymbolIterator == Symbols.begin())
     return false;
   --SymbolIterator;
-  if (SymbolIterator->Size != 0 &&
-      SymbolIterator->Addr + SymbolIterator->Size <= Address)
+
+  auto &SD = SymbolIterator->second;
+  if (SD.Size != 0 && SymbolIterator->first + SD.Size <= Address)
     return false;
-  Name = SymbolIterator->Name.str();
-  Addr = SymbolIterator->Addr;
-  Size = SymbolIterator->Size;
+  Name = SD.Name.str();
+  Addr = SymbolIterator->first;
+  Size = SD.Size;
 
-  if (SymbolIterator->ELFLocalSymIdx != 0) {
+  if (SD.ELFLocalSymIdx != 0) {
     // If this is an ELF local symbol, find the STT_FILE symbol preceding
     // SymbolIterator to get the filename. The ELF spec requires the STT_FILE
     // symbol (if present) precedes the other STB_LOCAL symbols for the file.
     assert(Module->isELF());
-    auto It = llvm::upper_bound(
-        FileSymbols,
-        std::make_pair(SymbolIterator->ELFLocalSymIdx, StringRef()));
+    auto It = llvm::upper_bound(FileSymbols,
+                                std::make_pair(SD.ELFLocalSymIdx, StringRef()));
     if (It != FileSymbols.begin())
       FileName = It[-1].second.str();
   }

jh7370 · 2023-10-19T07:18:42Z

llvm/include/llvm/DebugInfo/Symbolize/SymbolizableObjectFile.h

  };
-  std::vector<SymbolDesc> Symbols;
+  std::map<uint64_t, SymbolDesc> Symbols;


Are you sure you want a std::map and not a sorted vector. https://llvm.org/docs/ProgrammersManual.html#map-like-containers-std-map-densemap-etc gives a lot of thoughts on this and provides various other alternatives too that might be more efficient.

I think map is better than sorted vector for my case. In #69553, when add a new symbol, I need to know if there is any existing symbol that has same address. If I am understanding sorted vector right, each time when a new symbol is inserted to a sorted vector, it has to be sorted manually and then for a new insertion, use a log(n) query(std::lower_bound) to check if the address already exists. I don't see the benefit compared with map.

The main benefit is the performance. As explained in the programming guide, std::map is not a very efficient data structure. Please do some runtime comparisons of the tool with and without this change, where you have for both a large number of elements and a small number. In particular, how does the runtime and memory usage compare before and after the change?

If I am understanding sorted vector right, each time when a new symbol is inserted to a sorted vector, it has to be sorted manually

The programming guide seems pretty clear in this regards: a sorted vector is the right approach if you want to insert lots of elements but don't care about the order during insertion. In other words, you haven't understood "sorted vector" correctly. The existing usage looks precisely like is described in the programmer's manual: lots of insertions, followed by a sort, and only then is the vector looked in:

This approach works really well if your usage pattern has these two distinct phases (insert then query), and can be coupled with a good choice of sequential container.

when add a new symbol, I need to know if there is any existing symbol that has same address.

Why do you need to know this at insertion time rather than just sorting things differently?

Why do you need to know this at insertion time rather than just sorting things differently?

Because of the data we stored to Symbols. For now, The symbol is stored as:

struct SymbolDesc { uint64_t Addr; // If size is 0, assume that symbol occupies the whole memory range up to // the following symbol. uint64_t Size; StringRef Name; // Non-zero if this is an ELF local symbol. See the comment in // getNameFromSymbolTable. uint32_t ELFLocalSymIdx; bool operator<(const SymbolDesc &RHS) const { return Addr != RHS.Addr ? Addr < RHS.Addr : Size < RHS.Size; } };

When we do the sorting, we need to know some symbol flags besides address and size for XCOFF. See the comments here https://github.com/llvm/llvm-project/pull/69553/files#diff-795c0eb20ded2ff47056aa629fc0e183c20225b8fd40b607627ab3106995779dR222-R225

Is it acceptable to you that we add a new element (maybe a SymbolRef type and can eliminate current field ELFLocalSymIdx) in struct SymbolDesc? If using this way, we don't need to change the container, can still use vector and do the sort once.

I don't think there's an issue expanding SymbolDesc with more fields as necessary, though for the sake of memory usage, I'd try to keep those elements as small as possible. That would be the preferred approach for adding more conditions to sort on.

chenzheng1030 · 2023-10-24T00:49:59Z

gentle ping. (Sorry for ping this in less than a week. I am going to move forward for #69553. That patch solves some internal regressions.)

spavloff · 2023-10-24T15:55:07Z

llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp

+    if (!Symbols.count(SymbolStart) || SymbolSize >= Symbols[SymbolStart].Size)
+      Symbols[SymbolStart] = {SymbolSize, Export.Name, 0};


What will happen if two symbols have the same address? For example:

.data .globl aaa .globl bbb .p2align 2 aaa: bbb: .long 0 .size aaa, 4 .size bbb, 1

If two symbols have same address, it will follow legacy logic, i.e. use the one with the large size. So for your case, aaa will be kept in the Symbols while bbb will be dropped.

jh7370

As an additional note, I see your subsequent change is to support some XCOFF functionality. I'd be strongly opposed to making performance worse for all output types, just to support some additional XCOFF functionality, so I'm specifically marking with "Request changes" until you've provided some profile information for ELF and perhaps other output formats.

chenzheng1030 · 2023-10-26T05:04:05Z

Thanks for review @jh7370 @spavloff .

To address latest comment from @jh7370 , there is no need to refactor the container for symbols. So I directly fix this in XCOFF related patch #69553

Abandon this patch

nfc, use map instead of vector

1e39f56

chenzheng1030 requested review from MaskRay, spavloff and jh7370 October 19, 2023 02:46

chenzheng1030 self-assigned this Oct 19, 2023

llvmbot added debuginfo llvm:binary-utilities labels Oct 19, 2023

chenzheng1030 mentioned this pull request Oct 19, 2023

[XCOFF] make related SD symbols as isFunction #69553

Merged

jh7370 reviewed Oct 19, 2023

View reviewed changes

chenzheng1030 requested review from mysterymath and dwblaikie October 24, 2023 00:47

chenzheng1030 requested a review from jh7370 October 24, 2023 00:50

spavloff reviewed Oct 24, 2023

View reviewed changes

chenzheng1030 requested a review from spavloff October 25, 2023 00:45

jh7370 requested changes Oct 25, 2023

View reviewed changes

chenzheng1030 closed this Oct 26, 2023

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[llvm-symbolizer] nfc, use map instead of vector #69552

[llvm-symbolizer] nfc, use map instead of vector #69552

chenzheng1030 commented Oct 19, 2023 •

edited

llvmbot commented Oct 19, 2023 •

edited

jh7370 Oct 19, 2023

chenzheng1030 Oct 19, 2023

jh7370 Oct 25, 2023

chenzheng1030 Oct 25, 2023

jh7370 Oct 25, 2023

chenzheng1030 commented Oct 24, 2023

spavloff Oct 24, 2023

chenzheng1030 Oct 25, 2023

jh7370 left a comment

chenzheng1030 commented Oct 26, 2023

		if (!Symbols.count(SymbolStart) \|\| SymbolSize >= Symbols[SymbolStart].Size)
		Symbols[SymbolStart] = {SymbolSize, Export.Name, 0};

[llvm-symbolizer] nfc, use map instead of vector #69552

[llvm-symbolizer] nfc, use map instead of vector #69552

Conversation

chenzheng1030 commented Oct 19, 2023 • edited

llvmbot commented Oct 19, 2023 • edited

jh7370 Oct 19, 2023

Choose a reason for hiding this comment

chenzheng1030 Oct 19, 2023

Choose a reason for hiding this comment

jh7370 Oct 25, 2023

Choose a reason for hiding this comment

chenzheng1030 Oct 25, 2023

Choose a reason for hiding this comment

jh7370 Oct 25, 2023

Choose a reason for hiding this comment

chenzheng1030 commented Oct 24, 2023

spavloff Oct 24, 2023

Choose a reason for hiding this comment

chenzheng1030 Oct 25, 2023

Choose a reason for hiding this comment

jh7370 left a comment

Choose a reason for hiding this comment

chenzheng1030 commented Oct 26, 2023

chenzheng1030 commented Oct 19, 2023 •

edited

llvmbot commented Oct 19, 2023 •

edited