[Symbolizer] Compute Nearest Line Info for Address. #71032

ampandey-AMD · 2023-11-02T07:33:21Z

Addresses which has line number info zero for O1,O2,O3 with debug info optimized binaries will try to find the nearest non-zero line in a bidirectional fashion starting from input address.

Addresses which has line number info for O1,O2,O3 with debug info optimized binaries will try to find the nearest non-zero line in a bidirectional fashion starting from input address.

llvmbot · 2023-11-02T07:33:57Z

@llvm/pr-subscribers-debuginfo

@llvm/pr-subscribers-llvm-binary-utilities

Author: Amit Kumar Pandey (ampandey-AMD)

Changes

Addresses which has line number info for O1,O2,O3 with debug info optimized binaries will try to find the nearest non-zero line in a bidirectional fashion starting from input address.

Full diff: https://github.com/llvm/llvm-project/pull/71032.diff

2 Files Affected:

(modified) llvm/include/llvm/DebugInfo/Symbolize/Symbolize.h (+8-4)
(modified) llvm/lib/DebugInfo/Symbolize/Symbolize.cpp (+35-10)

diff --git a/llvm/include/llvm/DebugInfo/Symbolize/Symbolize.h b/llvm/include/llvm/DebugInfo/Symbolize/Symbolize.h
index bc4aa74073a6557..d517abc9808c1f4 100644
--- a/llvm/include/llvm/DebugInfo/Symbolize/Symbolize.h
+++ b/llvm/include/llvm/DebugInfo/Symbolize/Symbolize.h
@@ -75,11 +75,14 @@ class LLVMSymbolizer {
 
   // Overloads accepting ObjectFile does not support COFF currently
   Expected<DILineInfo> symbolizeCode(const ObjectFile &Obj,
-                                     object::SectionedAddress ModuleOffset);
+                                     object::SectionedAddress ModuleOffset,
+                                     bool nearest = false);
   Expected<DILineInfo> symbolizeCode(const std::string &ModuleName,
-                                     object::SectionedAddress ModuleOffset);
+                                     object::SectionedAddress ModuleOffset,
+                                     bool nearest = false);
   Expected<DILineInfo> symbolizeCode(ArrayRef<uint8_t> BuildID,
-                                     object::SectionedAddress ModuleOffset);
+                                     object::SectionedAddress ModuleOffset,
+                                     bool nearest = false);
   Expected<DIInliningInfo>
   symbolizeInlinedCode(const ObjectFile &Obj,
                        object::SectionedAddress ModuleOffset);
@@ -142,7 +145,8 @@ class LLVMSymbolizer {
   template <typename T>
   Expected<DILineInfo>
   symbolizeCodeCommon(const T &ModuleSpecifier,
-                      object::SectionedAddress ModuleOffset);
+                      object::SectionedAddress ModuleOffset,
+                      bool nearest = false);
   template <typename T>
   Expected<DIInliningInfo>
   symbolizeInlinedCodeCommon(const T &ModuleSpecifier,
diff --git a/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp b/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp
index 36d112a5f3fb299..07309d0e687570b 100644
--- a/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp
+++ b/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp
@@ -14,6 +14,7 @@
 
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/DebugInfo/BTF/BTFContext.h"
+#include "llvm/DebugInfo/DIContext.h"
 #include "llvm/DebugInfo/DWARF/DWARFContext.h"
 #include "llvm/DebugInfo/PDB/PDB.h"
 #include "llvm/DebugInfo/PDB/PDBContext.h"
@@ -24,6 +25,7 @@
 #include "llvm/Object/ELFObjectFile.h"
 #include "llvm/Object/MachO.h"
 #include "llvm/Object/MachOUniversal.h"
+#include "llvm/Object/ObjectFile.h"
 #include "llvm/Support/CRC.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/DataExtractor.h"
@@ -52,7 +54,8 @@ LLVMSymbolizer::~LLVMSymbolizer() = default;
 template <typename T>
 Expected<DILineInfo>
 LLVMSymbolizer::symbolizeCodeCommon(const T &ModuleSpecifier,
-                                    object::SectionedAddress ModuleOffset) {
+                                    object::SectionedAddress ModuleOffset,
+                                    bool nearest) {
 
   auto InfoOrErr = getOrCreateModuleInfo(ModuleSpecifier);
   if (!InfoOrErr)
@@ -70,9 +73,28 @@ LLVMSymbolizer::symbolizeCodeCommon(const T &ModuleSpecifier,
   if (Opts.RelativeAddresses)
     ModuleOffset.Address += Info->getModulePreferredBase();
 
-  DILineInfo LineInfo = Info->symbolizeCode(
-      ModuleOffset, DILineInfoSpecifier(Opts.PathStyle, Opts.PrintFunctions),
-      Opts.UseSymbolTable);
+  DILineInfo LineInfo;
+  if (!nearest)
+    LineInfo = Info->symbolizeCode(
+        ModuleOffset, DILineInfoSpecifier(Opts.PathStyle, Opts.PrintFunctions),
+        Opts.UseSymbolTable);
+  else {
+    object::SectionedAddress PrevModuleOffset = ModuleOffset;
+    while (LineInfo.Line = 0) {
+      LineInfo = Info->symbolizeCode(
+          ModuleOffset,
+          DILineInfoSpecifier(Opts.PathStyle, Opts.PrintFunctions),
+          Opts.UseSymbolTable);
+      if (LineInfo.Line != 0)
+        break;
+      ModuleOffset.Address++;
+      --PrevModuleOffset.Address;
+      LineInfo = Info->symbolizeCode(
+          PrevModuleOffset,
+          DILineInfoSpecifier(Opts.PathStyle, Opts.PrintFunctions),
+          Opts.UseSymbolTable);
+    }
+  }
   if (Opts.Demangle)
     LineInfo.FunctionName = DemangleName(LineInfo.FunctionName, Info);
   return LineInfo;
@@ -80,20 +102,23 @@ LLVMSymbolizer::symbolizeCodeCommon(const T &ModuleSpecifier,
 
 Expected<DILineInfo>
 LLVMSymbolizer::symbolizeCode(const ObjectFile &Obj,
-                              object::SectionedAddress ModuleOffset) {
-  return symbolizeCodeCommon(Obj, ModuleOffset);
+                              object::SectionedAddress ModuleOffset,
+                              bool nearest) {
+  return symbolizeCodeCommon(Obj, ModuleOffset, nearest);
 }
 
 Expected<DILineInfo>
 LLVMSymbolizer::symbolizeCode(const std::string &ModuleName,
-                              object::SectionedAddress ModuleOffset) {
-  return symbolizeCodeCommon(ModuleName, ModuleOffset);
+                              object::SectionedAddress ModuleOffset,
+                              bool nearest) {
+  return symbolizeCodeCommon(ModuleName, ModuleOffset, nearest);
 }
 
 Expected<DILineInfo>
 LLVMSymbolizer::symbolizeCode(ArrayRef<uint8_t> BuildID,
-                              object::SectionedAddress ModuleOffset) {
-  return symbolizeCodeCommon(BuildID, ModuleOffset);
+                              object::SectionedAddress ModuleOffset,
+                              bool nearest) {
+  return symbolizeCodeCommon(BuildID, ModuleOffset, nearest);
 }
 
 template <typename T>

jh7370

Please make sure your code follows the LLVM coding standards, for example, variable names need to be UpperCase.

This code doesn't appear to be being used by anything?

pogo59

Has no test.

I'm curious why the bidirectional approach would be better than linearly searching backwards, which I think would be more usual.

What keeps a module address from running off the end of valid addresses?

pogo59 · 2023-11-02T16:04:30Z

llvm/lib/DebugInfo/Symbolize/Symbolize.cpp

-      ModuleOffset, DILineInfoSpecifier(Opts.PathStyle, Opts.PrintFunctions),
-      Opts.UseSymbolTable);
+  DILineInfo LineInfo;
+  if (!Nearest)


Please use braces around a multi-line "then" clause.

gbreynoo · 2023-11-02T16:59:55Z

Thanks for looking into this, we have a customer use-case in which they would find a feature like this useful for binutils. Did you have particular tools in mind to make use of this change? I also lean towards just moving backwards unless there is a case in which this is unhelpful.

gbreynoo · 2023-11-02T17:03:59Z

llvm/lib/DebugInfo/Symbolize/Symbolize.cpp

+        Opts.UseSymbolTable);
+  else {
+    object::SectionedAddress PrevModuleOffset = ModuleOffset;
+    while (LineInfo.Line = 0) {


Should this be while (LineInfo.Line == 0) { ?

dwblaikie · 2023-11-02T21:44:29Z

Not sure if we want this in llvm-symbolizer, and if we do, probably not by default (due to all the reasons the compiler's producing line zero in the first place - ambiguity, mis-conclusions about reachable code, etc)

gbreynoo · 2023-11-06T16:56:53Z

I agree, this should not be default behavior. I think having a switch that approximates a line number in these cases could still be useful though, as long as it's very clear it is only an estimate.

dwblaikie · 2023-11-06T18:38:21Z

I'd be inclined not to accept this, but I tend to be a bit overly conservative. @jh7370 what're your thoughts on the functionality in general (patch/code details notwithstanding, and enabling the functionality only behind a flag)?

jh7370 · 2023-11-07T08:18:51Z

@jh7370 what're your thoughts on the functionality in general (patch/code details notwithstanding, and enabling the functionality only behind a flag)?

Honestly, I don't have much to add beyond what @gbreynoo has said (for clarity, @gbreynoo is on our company's team that is responsible for maintaining our downstream versions of these tools, I no longer am, but keep monitoring upstream anyway). As I understand it, line 0 is used in places where it's not clear what the real line is. Picking 0 avoids confusion and misleading information. However, as long as the proposed functionality is optional, I think it makes a degree of sense to provide users with a method to try to guess the real line. Exactly the form this takes probably deserves a wider discussion. In particular, I doubt there'll be a single right answer, and it might be necessary to parameterise the option to provide different methods. In particular, I'm thinking:

The last address before the current one with a non-zero line value.
Same as 1) but the next address rather than the previous one.
The last line table entry before the one for the specified address, with a non-zero line value.
Same as 3) but using the next entry instead of the previous one.

If this implemented however, it needs to be clear in the command guide and help text that the option is not going to give 100% accurate results, possibly with some good examples in the docs.

dwblaikie · 2023-11-07T16:27:11Z

for clarity, @gbreynoo is on our company's team that is responsible for maintaining our downstream versions of these tools,

Ah, good to know!

I no longer am, but keep monitoring upstream anyway

Ah - somewhat sorry to hear (hope you're working on other good things) - really appreciated your reviews/ownership (& been meaning to suggest maybe formalizing that - putting you down as code owner of llvm-symbolizer, if you think that's suitable) in this area - appreciate your continued investment here, even/especially when it's not your primary work anymore.

The last address before the current one with a non-zero line value.

The last line table entry before the one for the specified address, with a non-zero line value.

Hmm, not sure I understand the difference between these two cases - could you show a small example/describe this in more detail?

jh7370 · 2023-11-08T08:23:12Z

I no longer am, but keep monitoring upstream anyway

Ah - somewhat sorry to hear (hope you're working on other good things) - really appreciated your reviews/ownership (& been meaning to suggest maybe formalizing that - putting you down as code owner of llvm-symbolizer, if you think that's suitable) in this area - appreciate your continued investment here, even/especially when it's not your primary work anymore.

Thanks for the apppreciation. It's actually been that way for 2 and a half years, so I expect my activity levels to continue that way for the foreseeable future. Regarding code ownership, I appreciate the suggestion. However, I don't think I'd be comfortable adopting that role formally, because there will be times due to work priorities when I might not be able to invest the necessary time.

The last address before the current one with a non-zero line value.

The last line table entry before the one for the specified address, with a non-zero line value.

Hmm, not sure I understand the difference between these two cases - could you show a small example/describe this in more detail?

It's subtle, I admit. It's based on my memory of the line table structure. IIRC, a line table sequence is a series of increasing address values, and could be roughly equated to a function. Clearly going forward or backward within the sequence would be expected to yield the next/previous address where some usable line info may reside. However, the distinction comes at the edge of these sequences: if you go back/forward within the same table to the previous/next sequence, you'd likely jump to the function adjacent to the "real" one in the source file, which might or might not be correct, depending on what the optimizer has done. On the other hand, jumping to the previous/next address might result in a jump to a completely different part of the line table, because the function has somehow been merged with a different function in the source file.

dwblaikie · 2023-11-08T18:06:50Z

ah, think I'm with you @jh7370 - maybe if we just say "the nearest non-zero previous address within this line table sequence" (line tables have a concept of a sequence - essentially from a set_address operation to an end_sequence operation) - we shouldn't cross into another sequence when doing this sort of thing.

ampandey-1995 · 2024-02-19T12:31:50Z

Closing in favour of #82240

[Symbolizer] Compute Nearest Line Info for Address.

fa798a7

Addresses which has line number info for O1,O2,O3 with debug info optimized binaries will try to find the nearest non-zero line in a bidirectional fashion starting from input address.

llvmbot added debuginfo llvm:binary-utilities labels Nov 2, 2023

jh7370 reviewed Nov 2, 2023

View reviewed changes

ampandey-AMD added 2 commits November 2, 2023 15:18

Fix variable names to follow llvm style of coding.

8f18a61

Fix Buildbot error.

f80faa3

pogo59 reviewed Nov 2, 2023

View reviewed changes

gbreynoo reviewed Nov 2, 2023

View reviewed changes

lamb-j self-requested a review November 16, 2023 04:47

jh7370 mentioned this pull request Feb 19, 2024

[Symbolizer] Support for Missing Line Numbers. #82240

Merged

ampandey-1995 closed this Feb 19, 2024

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[Symbolizer] Compute Nearest Line Info for Address. #71032

[Symbolizer] Compute Nearest Line Info for Address. #71032

ampandey-AMD commented Nov 2, 2023 •

edited

Loading

llvmbot commented Nov 2, 2023 •

edited

Loading

jh7370 left a comment

pogo59 left a comment

pogo59 Nov 2, 2023

gbreynoo commented Nov 2, 2023

gbreynoo Nov 2, 2023

dwblaikie commented Nov 2, 2023

gbreynoo commented Nov 6, 2023

dwblaikie commented Nov 6, 2023

jh7370 commented Nov 7, 2023

dwblaikie commented Nov 7, 2023

jh7370 commented Nov 8, 2023

dwblaikie commented Nov 8, 2023

ampandey-1995 commented Feb 19, 2024

[Symbolizer] Compute Nearest Line Info for Address. #71032

[Symbolizer] Compute Nearest Line Info for Address. #71032

Conversation

ampandey-AMD commented Nov 2, 2023 • edited Loading

llvmbot commented Nov 2, 2023 • edited Loading

jh7370 left a comment

Choose a reason for hiding this comment

pogo59 left a comment

Choose a reason for hiding this comment

pogo59 Nov 2, 2023

Choose a reason for hiding this comment

gbreynoo commented Nov 2, 2023

gbreynoo Nov 2, 2023

Choose a reason for hiding this comment

dwblaikie commented Nov 2, 2023

gbreynoo commented Nov 6, 2023

dwblaikie commented Nov 6, 2023

jh7370 commented Nov 7, 2023

dwblaikie commented Nov 7, 2023

jh7370 commented Nov 8, 2023

dwblaikie commented Nov 8, 2023

ampandey-1995 commented Feb 19, 2024

ampandey-AMD commented Nov 2, 2023 •

edited

Loading

llvmbot commented Nov 2, 2023 •

edited

Loading